<a href="https://colab.research.google.com/github/imtithal/Information-Retrieval-/blob/main/Query_expansion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Pseudo-relevance-feedback query expansion demo:
#   1. rank the corpus against the initial query with TF-IDF + cosine similarity,
#   2. take the TOP_K best documents and harvest their terms,
#   3. append the new terms to the query and re-score every document.

# Step 1: Define the document corpus
documents = [
    "The quick brown fox jumps over the lazy dog",
    "A fast brown fox leaps over sleepy dogs in the park",
    "Foxes are quick and brown animals",
    "Dogs are loyal animals and love to play in the park",
    "Lazy dogs enjoy sleeping and relaxing under trees",
]

# Step 2: Define the initial query
initial_query = "quick brown fox"

# Number of top-ranked documents used as the feedback set for expansion.
TOP_K = 2

# Step 3: Compute TF-IDF weights
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Step 4: Vectorize the query
query_vector = vectorizer.transform([initial_query])

# Step 5: Compute cosine similarity between query and documents
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

# Step 6: Retrieve top K documents.
# Sorting the negated scores with a stable sort yields descending order and
# breaks ties by the lower document index, so the ranking is reproducible.
top_k_indices = np.argsort(-cosine_similarities, kind="stable")[:TOP_K]
print("Top-K Documents:")
for idx in top_k_indices:
    print(f"Document {idx+1}: {documents[idx]} (Score: {cosine_similarities[idx]:.4f})")

# Step 7: Extract terms from the top K documents for query expansion.
# Tokenize with the vectorizer's own analyzer so that documents and query are
# split and case-folded exactly the way the TF-IDF model sees them. Comparing
# against a *set of query tokens* (not the raw query string) avoids the
# substring trap where e.g. "ox" or "row" would count as "already in the query".
top_k_docs = [documents[i] for i in top_k_indices]
analyzer = vectorizer.build_analyzer()
query_terms = set(analyzer(initial_query))

expanded_terms = set()   # membership lookup for de-duplication
ordered_terms = []       # first-seen order, so the expanded query is deterministic

for doc in top_k_docs:
    for term in analyzer(doc):
        if term in query_terms or term in expanded_terms:
            continue  # already part of the query or already collected
        expanded_terms.add(term)
        ordered_terms.append(term)

# Step 8: Expand the query (initial query first, then new terms in first-seen order)
expanded_query = " ".join([initial_query, *ordered_terms])
print("\nExpanded Query:")
print(expanded_query)

# Optional: Recompute similarity using the expanded query
expanded_query_vector = vectorizer.transform([expanded_query])
new_cosine_similarities = cosine_similarity(expanded_query_vector, tfidf_matrix).flatten()

print("\nUpdated Document Scores with Expanded Query:")
for idx, score in enumerate(new_cosine_similarities):
    print(f"Document {idx+1}: Score = {score:.4f}")


Top-K Documents:
Document 1: The quick brown fox jumps over the lazy dog (Score: 0.5056)
Document 3: Foxes are quick and brown animals (Score: 0.4235)

Expanded Query:
quick brown fox and over the jumps are foxes lazy animals dog

Updated Document Scores with Expanded Query:
Document 1: Score = 0.7916
Document 2: Score = 0.2904
Document 3: Score = 0.6831
Document 4: Score = 0.2715
Document 5: Score = 0.1496
