In [None]:
from nltk.corpus import wordnet
from collections import Counter
from collections import defaultdict
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords, words
from nltk.tokenize import word_tokenize
from nltk.metrics import edit_distance
nltk.download('wordnet')
nltk.download("words")
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

**Simple Thesaurus based**

In [None]:
def expand_query_with_synonyms(query):
    query_tokens = query.lower().split()
    expansion = set(query_tokens)

    for term in query_tokens:
        for syn in wordnet.synsets(term):
            for lemma in syn.lemmas():
                expansion.add(lemma.name().replace('_', ' '))
    return list(expansion)

In [None]:
query = "car"
expanded_query = expand_query_with_synonyms(query)
print("Original Query:", query)
print("Expanded Query:", expanded_query)

Original Query: car
Expanded Query: ['motorcar', 'railway car', 'automobile', 'elevator car', 'cable car', 'machine', 'railroad car', 'railcar', 'car', 'gondola', 'auto']


**Pseudo Relevance based expansion**

In [None]:
text = [
    "John likes soccer. John plays soccer every afternoon after school.",
    "Mary reads books. Mary reads books in the library every evening.",
    "The cat chased the cat around the yard until the cat tired.",
    "He likes football and she likes football more than any other sport.",
    "Soccer is a popular sport around the world. Many people play soccer.",
    "Books and reading help Mary improve her knowledge and vocabulary.",
    "Cats are playful and sometimes chase each other in the yard."
]
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    return [word for word in words if word.lower() not in stop_words and word not in string.punctuation]

tokenized_doc = [remove_stopwords(t) for t in text]

tokenized_doc

[['John',
  'likes',
  'soccer',
  'John',
  'plays',
  'soccer',
  'every',
  'afternoon',
  'school'],
 ['Mary',
  'reads',
  'books',
  'Mary',
  'reads',
  'books',
  'library',
  'every',
  'evening'],
 ['cat', 'chased', 'cat', 'around', 'yard', 'cat', 'tired'],
 ['likes', 'football', 'likes', 'football', 'sport'],
 ['Soccer',
  'popular',
  'sport',
  'around',
  'world',
  'Many',
  'people',
  'play',
  'soccer'],
 ['Books', 'reading', 'help', 'Mary', 'improve', 'knowledge', 'vocabulary'],
 ['Cats', 'playful', 'sometimes', 'chase', 'yard']]

In [None]:
# In real IR, we would rank docs. But,here we simulate by matching query terms.
def retrieve_top_docs(query_tokens, tokenized_docs, top_k=3):
    scores = []
    for i, doc in enumerate(tokenized_docs):
        score = sum(doc.count(term) for term in query_tokens)  # simple frequency score
        scores.append((i, score))
    # Sort by score descending
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    top_docs = [tokenized_docs[i] for i, s in scores[:top_k] if s > 0]
    return top_docs

In [None]:
def pseudo_relevance_expansion(query_tokens, top_docs_tokens, top_n=5):
    counter = Counter()
    for doc in top_docs_tokens:
        counter.update(doc)

    # Remove original query terms
    for term in query_tokens:
        if term in counter:
            del counter[term]

    # Select top-n frequent terms
    expansion_terms = [t for t, _ in counter.most_common(top_n)]
    return query_tokens + expansion_terms

In [None]:
query_tokens = ["soccer", "sport"]
top_docs = retrieve_top_docs(query_tokens, tokenized_doc, top_k=3)

expanded_query = pseudo_relevance_expansion(query_tokens, top_docs, top_n=5)

In [None]:
print("Original Query:", query_tokens)
print("Top-K Retrieved Docs (Tokens):")
for i, doc in enumerate(top_docs):
    print(f"Doc {i+1}:", doc)
print("Expanded Query:", expanded_query)

Original Query: ['soccer', 'sport']
Top-K Retrieved Docs (Tokens):
Doc 1: ['John', 'likes', 'soccer', 'John', 'plays', 'soccer', 'every', 'afternoon', 'school']
Doc 2: ['Soccer', 'popular', 'sport', 'around', 'world', 'Many', 'people', 'play', 'soccer']
Doc 3: ['likes', 'football', 'likes', 'football', 'sport']
Expanded Query: ['soccer', 'sport', 'likes', 'John', 'football', 'plays', 'every']


**Spelling correction(Edit distance)**

In [None]:
word_list = set(words.words())

def correct_spelling(query):
    corrected = []
    for term in query.split():
        if term in word_list:
            corrected.append(term)
        else:
            # find closest word by edit distance
            closest_word = min(word_list, key=lambda w: edit_distance(term, w))
            corrected.append(closest_word)
    return " ".join(corrected)

In [None]:
query = "socer plaing in fied"
corrected_query = correct_spelling(query)

print("Original Query:", query)
print("Corrected Query:", corrected_query)

Original Query: socer plaing in fied
Corrected Query: socker plating in field


**K-gram indexes**

In [None]:
# Sample Vocabulary
vocab = ["soccer", "football", "tennis", "library", "playing", "john", "mary"]

# Build Simple K-Gram Index
def build_kgram_index(vocab, k=2):
    kgram_index = defaultdict(set)
    for word in vocab:
        word_padded = f"${word}$"  # add start/end markers
        for i in range(len(word_padded)-k+1):
            kgram = word_padded[i:i+k]
            kgram_index[kgram].add(word)
    return kgram_index

kgram_index = build_kgram_index(vocab, k=2)


In [None]:
#Display K-Gram Index
for kgram, words in kgram_index.items():
    print(f"{kgram}: {words}")

$s: {'soccer'}
so: {'soccer'}
oc: {'soccer'}
cc: {'soccer'}
ce: {'soccer'}
er: {'soccer'}
r$: {'soccer'}
$f: {'football'}
fo: {'football'}
oo: {'football'}
ot: {'football'}
tb: {'football'}
ba: {'football'}
al: {'football'}
ll: {'football'}
l$: {'football'}
$t: {'tennis'}
te: {'tennis'}
en: {'tennis'}
nn: {'tennis'}
ni: {'tennis'}
is: {'tennis'}
s$: {'tennis'}
$l: {'library'}
li: {'library'}
ib: {'library'}
br: {'library'}
ra: {'library'}
ar: {'mary', 'library'}
ry: {'mary', 'library'}
y$: {'mary', 'library'}
$p: {'playing'}
pl: {'playing'}
la: {'playing'}
ay: {'playing'}
yi: {'playing'}
in: {'playing'}
ng: {'playing'}
g$: {'playing'}
$j: {'john'}
jo: {'john'}
oh: {'john'}
hn: {'john'}
n$: {'john'}
$m: {'mary'}
ma: {'mary'}


In [None]:
# Simple Candidate Search
def get_candidates(word, kgram_index, k=2):
    word_padded = f"${word}$"
    word_kgrams = [word_padded[i:i+k] for i in range(len(word_padded)-k+1)]

    candidates = set()
    for kg in word_kgrams:
        if kg in kgram_index:
            candidates.update(kgram_index[kg])
    return candidates

In [None]:
misspelled = "socer"
candidates = get_candidates(misspelled, kgram_index)
print("\nMisspelled word:", misspelled)
print("Candidate corrections:", candidates)


Misspelled word: socer
Candidate corrections: {'soccer'}
