# NLP501 - NATURAL LANGUAGE PROCESSING

# LAB 04: Document Search System with TF-IDF and Locality Sensitive Hashing

- **Dataset:** AG News (120,000 articles)


### Dataset: AG News

AG News is a corpus containing over 1 million news articles from over 2000 sources. The most popular version contains 120,000 training samples and 7,600 test samples, categorized into 4 classes: World, Sports, Business, Sci/Tech


In [None]:
# Install dependencies
# !pip install datasets nltk scikit-learn numpy matplotlib tqdm

# Download NLTK data
import nltk
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Import Libraries


In [None]:
import numpy as np
import pandas as pd
import re
import time
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Set

from datasets import load_dataset
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from tqdm import tqdm

np.random.seed(42)


ModuleNotFoundError: No module named 'datasets'

## Task 1: Data preprocessing

### 1.1. Load dataset

We use Hugging Face's `datasets` library to load AG News but just use the first 10,000 documents.


In [None]:
# Load first 10,000 samples for faster processing
dataset = load_dataset('ag_news', split='train[:10000]')

df = pd.DataFrame({
    'text': dataset['text'],
    'label': dataset['label']
})

print(f"Dataset size: {len(df)} documents")
print(f"\nLabel distribution:")
print(df['label'].value_counts())
print(f"\nSample document:")
print(df['text'].iloc[0][:200] + "...")


Generating train split: 100%|██████████| 120000/120000 [00:00<00:00, 2046001.95 examples/s]
Generating test split: 100%|██████████| 7600/7600 [00:00<00:00, 951884.57 examples/s]

Dataset size: 10000 documents

Label distribution:
label
3    2662
0    2523
2    2477
1    2338
Name: count, dtype: int64

Sample document:
Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again....





### 1.2. Text processing


In [None]:
STOP_WORDS = set(stopwords.words('english'))


def preprocess_text(text: str) -> List[str]:
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [w for w in tokens if w not in STOP_WORDS and len(w) > 2]
    return tokens


sample_text = "The quick brown fox jumps over a lazy dogs!"
print(f"Original: {sample_text}")
print(f"Processed: {preprocess_text(sample_text)}")


Original: The quick brown fox jumps over a lazy dogs!
Processed: ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dogs']


### 1.3. Preprocess documents


In [None]:
# Process all documents
df['tokens'] = df['text'].apply(preprocess_text)
df['processed_text'] = df['tokens'].apply(lambda x: ' '.join(x))

# Statistics
avg_tokens = df['tokens'].apply(len).mean()
print(f"Average tokens per document: {avg_tokens:.1f}")
print(f"Sample processed text: {df['processed_text'].iloc[0][:100]}...")


Average tokens per document: 24.6
Sample processed text: wall bears claw back black reuters reuters shortsellers wall streets dwindlingband ultracynics seein...


## Task 2: TF-IDF Vectorization


In [None]:
vectorizer = TfidfVectorizer(
    max_features=5000,      # Limit vocabulary size
    min_df=2,               # Ignore rare terms
    max_df=0.95,            # Ignore very common terms
    sublinear_tf=True       # Use log(1 + tf) instead of tf
)

print("Creating TF-IDF matrix...")
tfidf_matrix = vectorizer.fit_transform(df['processed_text'])

tfidf_dense = tfidf_matrix.toarray()

# Normalize vectors for cosine similarity
tfidf_normalized = normalize(tfidf_dense, norm='l2')

print(f"TF-IDF matrix shape: {tfidf_normalized.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
print(f"Sample feature names: {vectorizer.get_feature_names_out()[:10]}")


Creating TF-IDF matrix...
TF-IDF matrix shape: (10000, 5000)
Vocabulary size: 5000
Sample feature names: ['aaron' 'abandon' 'abandoned' 'abbey' 'abducted' 'ability' 'able'
 'abroad' 'absence' 'abu']


## Task 3: LSH implementation


In [None]:
def cosine_sim(a, b):
    """Compute cosine similarity"""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10)


class LSHIndex:
    """
    LSH with random hyperplanes.
    """

    def __init__(self, n_planes: int, dim: int):
        """
        Initialize LSH index.
        Args:
            n_planes: Number of random hyperplanes
            dim: Dimension of input vectors
        """
        self.n_planes = n_planes
        self.dim = dim

        # Generate random hyperplanes (normal vectors)
        self.planes = np.random.randn(n_planes, dim)

        # Hash table: bucket_id -> list of vector indices
        self.hash_table = defaultdict(list)

        # Store indexed vectors for similarity computation
        self.vectors = None

    def _hash(self, vector: np.ndarray) -> int:
        """
        Compute hash (bucket ID) for a vector.

        Args:
            vector: Input vector of shape (dim,)
        Returns:
            bucket_id: Integer hash value
        """
        # Compute dot product with all planes
        projections = self.planes @ vector

        # Convert to binary: 1 if positive, 0 otherwise
        binary_hash = (projections >= 0).astype(int)

        # Convert binary to integer (bucket ID)
        bucket_id = sum(2**i * b for i, b in enumerate(binary_hash))

        return bucket_id

    def index(self, vectors: np.ndarray):
        """
        Index all vectors into hash table.

        Args:
            vectors: Matrix of shape (n_vectors, dim)
        """
        self.vectors = vectors

        for idx in range(len(vectors)):
            bucket_id = self._hash(vectors[idx])
            self.hash_table[bucket_id].append(idx)

    def query(self, query_vector: np.ndarray, k: int) -> List[Tuple[int, float]]:
        """
        Find k approximate nearest neighbors.

        Args:
            query_vector: Query vector of shape (dim,)
            k: Number of neighbors to return
        Returns:
            List of (index, similarity)
        """
        # Find bucket for query
        bucket_id = self._hash(query_vector)

        # Get candidates from bucket
        candidates = self.hash_table[bucket_id]

        if len(candidates) == 0:
            return []

        # Compute exact similarity for candidates
        similarities = []
        for idx in candidates:
            sim = cosine_sim(query_vector, self.vectors[idx])
            similarities.append((idx, sim))

        # Sort by similarity and return top k
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:k]


### 3.3. Multiple tables LSH


In [None]:
class MultiTableLSH:
    """
    LSH with multiple hash tables for improved recall.
    """

    def __init__(self, n_planes: int, n_tables: int, dim: int):
        """
        Args:
            n_planes: Number of planes per table
            n_tables: Number of hash tables
            dim: Vector dimension
        """
        self.n_tables = n_tables
        self.tables = [LSHIndex(n_planes, dim) for _ in range(n_tables)]
        self.vectors = None

    def index(self, vectors: np.ndarray):
        """Index vectors into all tables."""
        self.vectors = vectors
        for table in self.tables:
            table.index(vectors)

    def query(self, query_vector: np.ndarray, k: int) -> List[Tuple[int, float]]:
        """
        Query all tables and merge candidates.
        """
        # Collect unique candidates from all tables
        all_candidates = set()

        for table in self.tables:
            bucket_id = table._hash(query_vector)
            candidates = table.hash_table[bucket_id]
            all_candidates.update(candidates)

        if len(all_candidates) == 0:
            return []

        # Compute exact similarity for all candidates
        similarities = []
        for idx in all_candidates:
            sim = cosine_sim(query_vector, self.vectors[idx])
            similarities.append((idx, sim))

        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:k]

    def get_stats(self):
        """Get statistics about the index."""
        total_buckets = sum(len(t.hash_table) for t in self.tables)
        avg_bucket_size = np.mean(
            [len(b) for t in self.tables for b in t.hash_table.values()])
        return {
            'n_tables': self.n_tables,
            'total_buckets': total_buckets,
            'avg_bucket_size': avg_bucket_size
        }


### 3.4. Build LSH index


In [None]:
# Parameters
N_PLANES = 10      # Number of hyperplanes per table
N_TABLES = 5       # Number of hash tables
DIM = tfidf_normalized.shape[1]  # Vector dimension

print(f"Building LSH index with {N_PLANES} planes and {N_TABLES} tables...")
print(f"Vector dimension: {DIM}")
print(f"Number of documents: {len(tfidf_normalized)}")

# Create and build index
start_time = time.time()
lsh_index = MultiTableLSH(n_planes=N_PLANES, n_tables=N_TABLES, dim=DIM)
lsh_index.index(tfidf_normalized)
build_time = time.time() - start_time

print(f"\nIndex built in {build_time:.2f} seconds")
print(f"Index stats: {lsh_index.get_stats()}")


Building LSH index with 10 planes and 5 tables...
Vector dimension: 5000
Number of documents: 10000

Index built in 1.20 seconds
Index stats: {'n_tables': 5, 'total_buckets': 5119, 'avg_bucket_size': np.float64(9.767532721234616)}


## Task 4: Search & Evaluation

### 4.1. Brute-Force search baseline


In [None]:
def brute_force_knn(query_vector: np.ndarray, vectors: np.ndarray, k: int) -> List[Tuple[int, float]]:
    """
    Exact k-NN search by computing similarity with all vectors.

    Returns:
        List of (index, similarity)
    """
    similarities = []
    for idx in range(len(vectors)):
        sim = cosine_sim(query_vector, vectors[idx])
        similarities.append((idx, sim))

    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:k]


### 4.2. Evaluation metrics

Metrics:

- **Recall@K:** Proportion of true neighbors found
- **Query Time:** Average time per query (ms)
- **Speedup:** Brute_force_time / LSH_time


In [None]:
def evaluate_search(lsh_index, vectors, n_queries=100, k=10):
    """
    Evaluate LSH search quality and speed.

    Returns:
        Dictionary with evaluation metrics
    """
    # Randomly sample query indices
    query_indices = np.random.choice(len(vectors), n_queries, replace=False)

    recalls = []
    lsh_times = []
    bf_times = []

    for query_idx in tqdm(query_indices, desc="Evaluating"):
        query_vec = vectors[query_idx]

        # Brute-force search
        start = time.time()
        bf_results = brute_force_knn(query_vec, vectors, k)
        bf_times.append(time.time() - start)
        bf_indices = set([idx for idx, _ in bf_results])

        # LSH search
        start = time.time()
        lsh_results = lsh_index.query(query_vec, k)
        lsh_times.append(time.time() - start)
        lsh_indices = set([idx for idx, _ in lsh_results])

        # Compute recall
        if len(bf_indices) > 0:
            recall = len(lsh_indices & bf_indices) / len(bf_indices)
            recalls.append(recall)

    return {
        'avg_recall': np.mean(recalls) * 100,
        'avg_lsh_time_ms': np.mean(lsh_times) * 1000,
        'avg_bf_time_ms': np.mean(bf_times) * 1000,
        'speedup': np.mean(bf_times) / np.mean(lsh_times)
    }


### 4.3. Evaluate:


In [None]:
print("Running evaluation on 100 random queries...\n")

results = evaluate_search(lsh_index, tfidf_normalized, n_queries=100, k=10)

print("RESULTS:")

print(f"Average Recall@10: {results['avg_recall']:.1f}%")
print(f"Average LSH Query Time: {results['avg_lsh_time_ms']:.2f} ms")
print(f"Average Brute-Force Time: {results['avg_bf_time_ms']:.2f} ms")
print(f"Speedup: {results['speedup']:.1f}x")


Running evaluation on 100 random queries...



Evaluating: 100%|██████████| 100/100 [00:04<00:00, 20.06it/s]

RESULTS:
Average Recall@10: 17.4%
Average LSH Query Time: 0.45 ms
Average Brute-Force Time: 49.26 ms
Speedup: 110.7x





### 4.4. Build a function


In [None]:
def search_documents(query_text: str, k: int = 5):
    """
    Search for documents similar to query text.
    """
    # Vectorize query
    query_vec = vectorizer.transform([query_text]).toarray()[0]
    query_vec = normalize(query_vec.reshape(1, -1))[0]

    # Search
    results = lsh_index.query(query_vec, k)

    # Display results
    print(f"Query: '{query_text}'\n")
    print(f"Top {k} results:")

    categories = ['World', 'Sports', 'Business', 'Sci/Tech']
    for i, (idx, sim) in enumerate(results):
        cat = categories[df['label'].iloc[idx]]
        text = df['text'].iloc[idx][:100]
        print(f"{i+1}. [{cat}] (sim={sim:.3f})")
        print(f"   {text}...")
        print()


search_documents("Apple iPhone new technology")
search_documents("stock market investment")


Query: 'Apple iPhone new technology'

Top 5 results:
1. [Sci/Tech] (sim=0.129)
   Bankrupt Commerce One Patents Fetch \$15.5M Bankrupt Internet software maker Commerce One Inc. aucti...

2. [World] (sim=0.061)
   China Tests New Guided Missile Amid Taiwan Tensions  BEIJING (Reuters) - China has successfully test...

3. [Sports] (sim=0.057)
   Yao #39;s 39 carries China past New Zealand ATHENS, Greece - Yao Ming bounced back from a rough open...

4. [Business] (sim=0.049)
   Before-the-Bell: Biocryst Shares Higher  NEW YORK (Reuters) - Shares of Biocryst Pharmaceuticals  In...

5. [Sports] (sim=0.039)
   Northern N.E. reaches peak With some late heroics, Northern New England knocked off undefeated North...

Query: 'stock market investment'

Top 5 results:
1. [Business] (sim=0.156)
   Nortel to Cut 3,500 Jobs to Boost Profits  OTTAWA (Reuters) - Nortel Networks Corp. &lt;A HREF="http...

2. [Business] (sim=0.107)
   UPDATE 1-SEC, NASD probing Jefferies trader #39;s gifts -WSJ US market r