In [13]:
# ---------------------------------------------------------
# 1) IMPORT LIBRARIES + LOAD DATA
# ---------------------------------------------------------

import pickle
import numpy as np
from scipy.sparse import load_npz
from sklearn.metrics.pairwise import cosine_similarity

# Load cleaned docs (optional, but useful for display)
with open("cleaned_docs.pkl", "rb") as f:
    cleaned_docs = pickle.load(f)

# Load original file IDs
import json
with open("file_ids.json", "r") as f:
    file_ids = json.load(f)


# Load fitted TF-IDF vectorizer
with open("vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

# Load TF-IDF matrix (10788 documents × ~5000 features)
from scipy import sparse

X = sparse.load_npz("tfidf_matrix.npz")



In [14]:
# ---------------------------------------------------------
# 2) FUNCTION TO PREPROCESS A QUERY
# ---------------------------------------------------------

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_query(text):
    """
    Clean and tokenize a user query so it matches the cleaned docs.
    """
    
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()

    cleaned = [stemmer.stem(w) for w in tokens if w not in stop_words]
    return " ".join(cleaned)   # MUST return string for TF-IDF vectorizer


In [None]:
# ---------------------------------------------------------
# 3) RANK DOCUMENTS USING COSINE SIMILARITY
# ---------------------------------------------------------

def search(query, k=10):
    """
    Run a query on the TF-IDF matrix and get top-k results.
    """
    
    # Preprocess
    cleaned_q = preprocess_query(query)

    # Convert query to TF-IDF vector
    q_vec = vectorizer.transform([cleaned_q])

    # Compute cosine similarities
    sims = cosine_similarity(q_vec, X).ravel()

    # Get top-k document indices
    top_idx = sims.argsort()[::-1][:k]
    return top_idx, sims[top_idx]



[0.08814744 0.         0.         ... 0.         0.         0.        ]
[ 4231  3956  4021  4243  1257  3504  6267  4181 10304  1208]


(array([ 4231,  3956,  4021,  4243,  1257,  3504,  6267,  4181, 10304,
         1208]),
 array([0.42434699, 0.349468  , 0.34057511, 0.31831363, 0.19643211,
        0.19217876, 0.18875948, 0.16738737, 0.16139287, 0.14684295]))

In [24]:
# ---------------------------------------------------------
# 4) TEST SEARCH ENGINE
# ---------------------------------------------------------

query = "oil prices"
top_docs, scores = search(query, k=5)

for rank, (doc_id, score) in enumerate(zip(top_docs, scores)):
    print(f"Rank {rank+1} — Doc: {file_ids[doc_id]} — Score: {score:.4f}")
    
    # preview first 40 tokens
    print("Preview:", " ".join(cleaned_docs[doc_id][:40]))
    print("-" * 60)


[0.         0.         0.06921612 ... 0.         0.         0.        ]
[6126 4717 8784 1740 3750]
Rank 1 — Doc: training/2775 — Score: 0.4978
Preview: c r u d e   o i l   p r i c e   s t o c k   o u t p u t   f a l l   u s   c r u
------------------------------------------------------------
Rank 2 — Doc: training/127 — Score: 0.4869
Preview: d i a m o n d   s h a m r o c k   d i a   c u t   c r u d e   p r i c e   d i a
------------------------------------------------------------
Rank 3 — Doc: training/6876 — Score: 0.4832
Preview: d i v i s   s e e n   h e l p   u s   o i l   i n d u s t r i   u s   c o n g r
------------------------------------------------------------
Rank 4 — Doc: test/18746 — Score: 0.4800
Preview: u n i o n   p a c i f   l t u n p   r a i s   c r u d e   o i l   p r i c e   u
------------------------------------------------------------
Rank 5 — Doc: training/11149 — Score: 0.4423
Preview: u s   r e a s s e s s   m i d e a s t   p o l i c i   a n a l y s t   u s  

In [25]:
queries = [
    "oil prices",
    "foreign exchange",
    "company earnings",
    "grain exports",
    "interest rates",
    "trade balance",
    "inflation report",
    "market acquisition"
]

for q in queries:
    print("\n==============================")
    print("QUERY:", q.upper())
    print("==============================")

    top_docs, scores = search(q, 5)

    for rank, (doc_id, score) in enumerate(zip(top_docs, scores)):
        print(f"{rank+1}. {file_ids[doc_id]} — score: {score:.4f}")



QUERY: OIL PRICES
[0.         0.         0.06921612 ... 0.         0.         0.        ]
[6126 4717 8784 1740 3750]
1. training/2775 — score: 0.4978
2. training/127 — score: 0.4869
3. training/6876 — score: 0.4832
4. test/18746 — score: 0.4800
5. training/11149 — score: 0.4423

QUERY: FOREIGN EXCHANGE
[0.03416077 0.         0.         ... 0.         0.         0.        ]
[7741 1447 7672 4636 8133]
1. training/5279 — score: 0.5358
2. test/17930 — score: 0.4785
3. training/5181 — score: 0.4740
4. training/12480 — score: 0.4421
5. training/5841 — score: 0.3899

QUERY: COMPANY EARNINGS
[0. 0. 0. ... 0. 0. 0.]
[8559 4351 4119 7965 4076]
1. training/6539 — score: 0.5500
2. training/12050 — score: 0.5034
3. training/11708 — score: 0.4918
4. training/5579 — score: 0.4814
5. training/11637 — score: 0.4614

QUERY: GRAIN EXPORTS
[0.14591123 0.15462384 0.         ... 0.         0.         0.        ]
[4976 4162 8111 9606 9463]
1. training/13173 — score: 0.4949
2. training/11769 — score: 0.4827


In [26]:
# ---------------------------------------------------------
# 6) SAVE RANKING RESULTS FOR EVALUATION NOTEBOOK
# ---------------------------------------------------------

import csv

with open("search_results.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["query", "rank", "doc_id", "score"])

    for q in queries:
        top_docs, scores = search(q, 10)
        for rank, (doc_id, score) in enumerate(zip(top_docs, scores)):
            writer.writerow([q, rank+1, file_ids[doc_id], score])

print("Saved search_results.csv")


[0.         0.         0.06921612 ... 0.         0.         0.        ]
[6126 4717 8784 1740 3750 6795 9794 4127 8293 8233]
[0.03416077 0.         0.         ... 0.         0.         0.        ]
[7741 1447 7672 4636 8133 4393 6425 1705  510 6430]
[0. 0. 0. ... 0. 0. 0.]
[8559 4351 4119 7965 4076 6779 7596 5053 3401 4122]
[0.14591123 0.15462384 0.         ... 0.         0.         0.        ]
[4976 4162 8111 9606 9463  584 3126  493 3695 8206]
[0. 0. 0. ... 0. 0. 0.]
[1643 4376 4749 4145 9202 1632 1662 4497 5637 1295]
[0.18275445 0.         0.02281293 ... 0.         0.         0.        ]
[ 6678  5142  6508  5143  5299 10150  1036  3691  6875  6179]
[0. 0. 0. ... 0. 0. 0.]
[9580 7818   44 2996 5400 7821 1479 4884 2065 5484]
[0.03452247 0.         0.         ... 0.         0.         0.        ]
[  678   675  9019  1411  5829  1712  3342 10541  2937  1120]
Saved search_results.csv
