<a href="https://colab.research.google.com/github/Bishu-21/FastSearch50K/blob/main/FastSearch50K.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 🛠 Install necessary packages
!pip install -q scikit-learn nltk

# 📚 Import libraries
import numpy as np
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import os

# 📁 Generate or load 50k documents
if not os.path.exists('docs.txt'):
    with open('docs.txt', 'w') as f:
        for i in range(50000):
            f.write(f"Document number {i} about machine learning, AI, and data science.\n")

# 📥 Read the documents
with open('docs.txt', 'r') as f:
    documents = [line.strip() for line in f.readlines()]

print(f"Loaded {len(documents)} documents.")

# 🧹 Preprocessing function
stop_words = set(stopwords.words('english'))
def preprocess(text):
    tokens = text.lower().split()
    return ' '.join([word for word in tokens if word not in stop_words])

cleaned_docs = [preprocess(doc) for doc in documents]

# 🧠 Build TF-IDF vector space
vectorizer = TfidfVectorizer(max_features=50000)
doc_vectors = vectorizer.fit_transform(cleaned_docs)

# 🔍 Fast search function
def search(query, top_k=5):
    start = time.time()
    query_vec = vectorizer.transform([preprocess(query)])
    similarities = cosine_similarity(query_vec, doc_vectors).flatten()
    ranked_indices = np.argsort(similarities)[::-1][:top_k]
    results = [(i, documents[i], round(similarities[i], 3)) for i in ranked_indices]
    end = time.time()
    print(f"Query Time: {(end - start) * 1000:.2f} ms")
    return results

# 🚀 Run a sample query
results = search("machine learning and AI")
for idx, doc, score in results:
    print(f"[{idx}] ({score}) → {doc}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Loaded 50000 documents.
Query Time: 47.29 ms
[1] (0.655) → Document number 1 about machine learning, AI, and data science.
[2] (0.655) → Document number 2 about machine learning, AI, and data science.
[3] (0.655) → Document number 3 about machine learning, AI, and data science.
[4] (0.655) → Document number 4 about machine learning, AI, and data science.
[5] (0.655) → Document number 5 about machine learning, AI, and data science.
