In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle, json
from scipy import sparse

with open("cleaned_docs.pkl", "rb") as f:
    cleaned_docs = pickle.load(f)

with open("file_ids.json", "r") as f:
    file_ids = json.load(f)

with open("vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

X = sparse.load_npz("tfidf_matrix.npz")

print("Loaded all resources successfully.")
print("TF‑IDF shape:", X.shape)


Loaded all resources successfully.
TF‑IDF shape: (10788, 8798)


In [None]:
from nltk.corpus import reuters

# list of categories
doc_categories = {fid: reuters.categories(fid) for fid in file_ids}


Example: test/14826 → ['trade']


In [None]:
queries = {
    "oil prices": ["oil"],
    "foreign exchange": ["money-fx"],
    "company earnings": ["earn"],
    "grain exports": ["grain"],
    "interest rates": ["interest"],
    "trade balance": ["trade"],
    "inflation report": ["inflation"],
    "market acquisition": ["acq"]
}
 # A doc is relevant if any doc category matches one of the target categories.
def is_relevant(doc_id, target_categories):
    doc_cats = doc_categories[doc_id]
    return any(c in doc_cats for c in target_categories)


In [67]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_query(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = text.split()
    cleaned = [stemmer.stem(w) for w in tokens if w not in stop_words]
    return " ".join(cleaned)


In [68]:
def search(query, k=20):
    cleaned = preprocess_query(query)
    qv = vectorizer.transform([cleaned])
    sims = cosine_similarity(qv, X).ravel()
    top_idx = sims.argsort()[::-1][:k]
    return top_idx, sims[top_idx]


In [None]:
def precision_at_k(ranked_ids, target_cats, k):
    top_k = ranked_ids[:k]                      # Take top‑k ranked documents
    relevant = sum(is_relevant(file_ids[i], target_cats) for i in top_k)
    return relevant / k                         # Precision = relevant / k


def recall_at_k(ranked_ids, target_cats, k):
    total_rel = sum(is_relevant(fid, target_cats) for fid in file_ids)
    if total_rel == 0:
        return 0                                # Avoid division by zero

    top_k = ranked_ids[:k]
    rel_found = sum(is_relevant(file_ids[i], target_cats) for i in top_k)
    return rel_found / total_rel                # Recall = found / total relevant


def average_precision(ranked_ids, target_cats):
    score = 0
    rel_seen = 0
    for i, doc_idx in enumerate(ranked_ids):
        if is_relevant(file_ids[doc_idx], target_cats):
            rel_seen += 1                       # Count relevant retrieved so far
            score += rel_seen / (i + 1)         # Precision at this rank

    return score / rel_seen if rel_seen > 0 else 0   


In [70]:
results = []

for query, cats in queries.items():
    ranked, _ = search(query, k=200)

    p5 = precision_at_k(ranked, cats, 5)
    p10 = precision_at_k(ranked, cats, 10)
    r10 = recall_at_k(ranked, cats, 10)
    ap = average_precision(ranked, cats)

    results.append([query, p5, p10, r10, ap])

import pandas as pd
df = pd.DataFrame(results, columns=["Query", "P@5", "P@10", "Recall@10", "AP"])
df


Unnamed: 0,Query,P@5,P@10,Recall@10,AP
0,oil prices,0.0,0.0,0.0,0.0
1,foreign exchange,0.8,0.7,0.009763,0.553835
2,company earnings,1.0,0.9,0.00227,0.939349
3,grain exports,1.0,1.0,0.017182,0.89548
4,interest rates,1.0,1.0,0.020921,0.903442
5,trade balance,0.6,0.3,0.006186,0.719973
6,inflation report,0.0,0.0,0.0,0.0
7,market acquisition,1.0,1.0,0.004221,0.851396


In [71]:
MAP = df["AP"].mean()
MAP

np.float64(0.6079342445976452)

In [None]:
# rocchio algorithme 
def rocchio(query_vector, Dpos, Dneg, a=1, b=0.75, g=0.15):
    new_q = a * query_vector
    if Dpos.shape[0] > 0:
        new_q = new_q + b * Dpos.mean(axis=0)
    if Dneg.shape[0] > 0:
        new_q = new_q - g * Dneg.mean(axis=0)
    return new_q


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

rocchio_results = []

for query, cats in queries.items():

    ranked, _ = search(query, k=200)

    # Build positive + negative sets
    Dpos_idx = [i for i in ranked if is_relevant(file_ids[i], cats)][:5]   # top 5 relevant
    Dneg_idx = ranked[-5:]                                                # bottom 5 docs

    Dpos = X[Dpos_idx]
    Dneg = X[Dneg_idx]

    # Original vector to Rocchio update
    q_vec = vectorizer.transform([preprocess_query(query)])
    q_new = rocchio(q_vec, Dpos, Dneg)
    q_new = np.asarray(q_new)

    # Rerank documents
    sims = cosine_similarity(q_new, X).ravel()
    reranked = sims.argsort()[::-1][:200]

    # Results AFTER Rocchio
    p5  = precision_at_k(reranked, cats, 5)
    p10 = precision_at_k(reranked, cats, 10)
    r10 = recall_at_k(reranked, cats, 10)
    ap  = average_precision(reranked, cats)

    rocchio_results.append([query, p5, p10, r10, ap])

df_rocchio = pd.DataFrame(
    rocchio_results, 
    columns=["Query", "P@5", "P@10", "Recall@10", "AP"]
)

df_rocchio



Unnamed: 0,Query,P@5,P@10,Recall@10,AP
0,oil prices,0.0,0.0,0.0,0.0
1,foreign exchange,0.8,0.7,0.009763,0.555027
2,company earnings,1.0,0.9,0.00227,0.954306
3,grain exports,1.0,1.0,0.017182,0.95276
4,interest rates,1.0,1.0,0.020921,0.937144
5,trade balance,1.0,0.5,0.010309,0.788158
6,inflation report,0.0,0.0,0.0,0.0
7,market acquisition,1.0,1.0,0.004221,0.953182


In [85]:
MAP = df_rocchio["AP"].mean()
MAP

np.float64(0.6425721959853996)

In [86]:
def build_pipeline(stopwords_on=True, stemming_on=True):
    # Modify preprocessing
    def pre(text):
        text = text.lower()
        text = re.sub(r"[^a-z\s]", "", text)
        tokens = text.split()

        if not stopwords_on:
            cleaned = tokens
        else:
            cleaned = [w for w in tokens if w not in stop_words]

        if stemming_on:
            cleaned = [stemmer.stem(w) for w in cleaned]

        return " ".join(cleaned)

    return pre


In [87]:
pipelines = {
    "stopwords_ON_stemming_ON": build_pipeline(True, True),
    "stopwords_ON_stemming_OFF": build_pipeline(True, False),
    "stopwords_OFF_stemming_ON": build_pipeline(False, True),
    "stopwords_OFF_stemming_OFF": build_pipeline(False, False)
}


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def evaluate_pipeline(preprocess_fn):
    import nltk                        

    from nltk.corpus import reuters   

    docs = [reuters.raw(fid) for fid in reuters.fileids()]
    #  Re-clean all documents
    cleaned = [preprocess_fn(d) for d in docs]

    # Re-vectorize
    vec = TfidfVectorizer(min_df=3)
    X_local = vec.fit_transform(cleaned)

    # Evaluate the 8 queries
    AP_scores = []

    for q, cats in queries.items():
        q_clean = preprocess_fn(q)
        q_vec = vec.transform([q_clean])
        sims = cosine_similarity(q_vec, X_local).ravel()
        ranked = sims.argsort()[::-1][:50]
        ap = average_precision(ranked, cats)
        AP_scores.append(ap)

    return np.mean(AP_scores)

In [None]:
results = {}

for name, fn in pipelines.items():
    MAP_value = evaluate_pipeline(fn)
    results[name] = MAP_value

results

Evaluating: stopwords_ON_stemming_ON
Evaluating: stopwords_ON_stemming_OFF
Evaluating: stopwords_OFF_stemming_ON
Evaluating: stopwords_OFF_stemming_OFF


{'stopwords_ON_stemming_ON': np.float64(0.6482021430646938),
 'stopwords_ON_stemming_OFF': np.float64(0.6734876686903137),
 'stopwords_OFF_stemming_ON': np.float64(0.6469642132941626),
 'stopwords_OFF_stemming_OFF': np.float64(0.6679797345133085)}

In [90]:
import pandas as pd

df_ablation = pd.DataFrame.from_dict(results, orient='index', columns=["MAP"])
df_ablation

Unnamed: 0,MAP
stopwords_ON_stemming_ON,0.648202
stopwords_ON_stemming_OFF,0.673488
stopwords_OFF_stemming_ON,0.646964
stopwords_OFF_stemming_OFF,0.66798
