In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: The corpus -- a small in-memory collection of sentences to search
documents = [
    "The quick brown fox jumps over the lazy dog",
    "Never jump over the lazy dog quickly",
    "A fox is quick and it is jumping over a lazy dog",
    "The lazy dog does not jump over anything",
    "Foxes are very quick and agile animals"
]

# Step 2: Build the index -- fit the TF-IDF vectorizer on the corpus first,
# then encode the same corpus with the fitted vectorizer
vectorizer = TfidfVectorizer().fit(documents)
tfidf_matrix = vectorizer.transform(documents)

# Step 3: Define a function to retrieve relevant documents based on a query
def search(query, top_n=3):
    """Rank the indexed documents by TF-IDF cosine similarity to a query.

    Reads the module-level ``documents``, ``vectorizer`` and ``tfidf_matrix``.

    Args:
        query: Query text; it is vectorized with the already-fitted
            ``vectorizer``.
        top_n: Maximum number of results to return. Zero or a negative
            value yields an empty list.

    Returns:
        A list of ``(document, score)`` tuples ordered from the highest
        score to the lowest, containing at most ``top_n`` entries. Documents
        with equal scores keep their order from ``documents``. No score
        threshold is applied, so low-scoring documents may be included.
    """
    # Guard first: a slice such as [-0:] selects *every* element, so without
    # this check top_n=0 would return the whole corpus instead of nothing.
    if top_n <= 0:
        return []

    # Convert the query into a TF-IDF vector
    query_vec = vectorizer.transform([query])

    # Compute cosine similarity between the query and all documents
    similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Sort descending by negating the scores; a stable sort makes the order
    # of tied documents deterministic (earlier documents come first).
    top_indices = (-similarity_scores).argsort(kind="stable")[:top_n]

    # Return the top N documents and their scores
    return [(documents[i], similarity_scores[i]) for i in top_indices]

# Step 4: Try the search engine on a sample query and show the ranked hits
query = "quick fox"
results = search(query)
print("Search Results for query:", query)
# Ranks are 1-based for display
for rank, hit in enumerate(results, 1):
    doc, score = hit
    print(f"{rank}. {doc} (Score: {score:.4f})")


Search Results for query: quick fox
1. The quick brown fox jumps over the lazy dog (Score: 0.4337)
2. A fox is quick and it is jumping over a lazy dog (Score: 0.3554)
3. Foxes are very quick and agile animals (Score: 0.1732)
