In [1]:
import pickle
import numpy as np
from scipy.sparse import load_npz
from sklearn.metrics.pairwise import cosine_similarity

# Load the saved artifacts that the search cells below rely on.
# All paths are relative to the notebook's working directory.
#
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling.
# Only load .pkl files that come from a trusted source.

# Load cleaned docs
with open("cleaned_docs.pkl", "rb") as f:
    cleaned_docs = pickle.load(f)

# Load file IDs
import json
# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open("file_ids.json", "r", encoding="utf-8") as f:
    file_ids = json.load(f)


# Load vectorizer
with open("vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

# Load matrix
from scipy import sparse

X = sparse.load_npz("tfidf_matrix.npz")

# Search results are row indices of X that are looked up directly in file_ids,
# so the two must describe the same documents. Fail early on a mismatch
# instead of printing wrong ids or hitting an IndexError later.
if X.shape[0] != len(file_ids):
    raise ValueError(
        f"tfidf_matrix.npz has {X.shape[0]} rows but file_ids.json has "
        f"{len(file_ids)} entries; the artifacts are out of sync"
    )



In [2]:
# PREPROCESS FUNCTION
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# English stop-word list from NLTK, held as a set for fast membership tests
# in preprocess_query.
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance reused for every query token.
stemmer = PorterStemmer()

def preprocess_query(text):
    """Turn a raw query into the space-joined, stemmed string fed to the vectorizer.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not a-z or whitespace (digits and
         punctuation are removed, not replaced by a space, so "u.s." -> "us");
      3. split on whitespace;
      4. drop tokens found in ``stop_words`` (checked before stemming);
      5. stem the remaining tokens with ``stemmer``.

    NOTE(review): presumably this mirrors the preprocessing applied to the
    indexed documents -- confirm before changing any step.

    Args:
        text: the raw query string.

    Returns:
        A single string of stemmed tokens separated by one space (empty if no
        token survives). A string, not a list, is what the vectorizer expects.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    stems = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))

    return " ".join(stems)


In [3]:
# RANK DOCUMENTS USING COSINE SIMILARITY

def search(query, k=10):
    """Rank the rows of ``X`` against ``query`` by cosine similarity.

    The query is cleaned with ``preprocess_query``, vectorized with the loaded
    ``vectorizer`` and compared against every row of ``X``.

    Only rows with a similarity strictly greater than zero are returned, so a
    query that shares no term with the vocabulary (or an empty query) yields
    empty results instead of k arbitrary zero-score documents. Fewer than
    ``k`` results may therefore come back.
    NOTE(review): this assumes non-negative weights (e.g. TF-IDF), where a
    score of 0 means "no shared term" -- confirm for the loaded vectorizer.

    Args:
        query: raw query string.
        k: maximum number of results; ``None`` means no limit.

    Returns:
        ``(top_idx, scores)``: row indices into ``X`` ordered from most to
        least similar, and the matching similarity scores.

    Raises:
        ValueError: if ``k`` is negative (a negative slice bound would
            silently drop results from the end of the ranking instead).
    """
    if k is not None and k < 0:
        raise ValueError(f"k must be non-negative or None, got {k!r}")

    # Preprocess
    cleaned_q = preprocess_query(query)

    # Convert query to vector
    q_vec = vectorizer.transform([cleaned_q])

    # Compute cosine similarities (one score per row of X)
    sims = cosine_similarity(q_vec, X).ravel()

    # Best-first ordering, truncated to k, then stripped of non-matches.
    top_idx = sims.argsort()[::-1][:k]
    top_idx = top_idx[sims[top_idx] > 0]
    return top_idx, sims[top_idx]



In [4]:
# TESTING QUERIES
# Sample queries used to spot-check the ranking produced by search().
queries = [
    "oil prices",
    "foreign exchange",
    "company earnings",
    "grain exports",
    "interest rates",
    "trade balance",
    "inflation report",
    "market acquisition"
]

for q in queries:
    # Blank lines and an upper-cased banner separate each query's results.
    print("\n")
    print("QUERY:", q.upper())
    print(" ")

    # Top 5 hits for this query.
    hit_rows, hit_scores = search(q, 5)

    # Ranks are shown 1-based; each matrix row index is mapped to its file id.
    for position, (row, sim) in enumerate(zip(hit_rows, hit_scores), start=1):
        print(f"{position}. {file_ids[row]} , score: {sim:.4f}")




QUERY: OIL PRICES
 
1. training/2775 , score: 0.4978
2. training/127 , score: 0.4869
3. training/6876 , score: 0.4832
4. test/18746 , score: 0.4800
5. training/11149 , score: 0.4423


QUERY: FOREIGN EXCHANGE
 
1. training/5279 , score: 0.5358
2. test/17930 , score: 0.4785
3. training/5181 , score: 0.4740
4. training/12480 , score: 0.4421
5. training/5841 , score: 0.3899


QUERY: COMPANY EARNINGS
 
1. training/6539 , score: 0.5500
2. training/12050 , score: 0.5034
3. training/11708 , score: 0.4918
4. training/5579 , score: 0.4814
5. training/11637 , score: 0.4614


QUERY: GRAIN EXPORTS
 
1. training/13173 , score: 0.4949
2. training/11769 , score: 0.4827
3. training/5800 , score: 0.4637
4. training/8161 , score: 0.4441
5. training/7934 , score: 0.4222


QUERY: INTEREST RATES
 
1. test/18520 , score: 0.6046
2. training/12091 , score: 0.5564
3. training/12774 , score: 0.5536
4. training/11746 , score: 0.5431
5. training/7538 , score: 0.5185


QUERY: TRADE BALANCE
 
1. training/3610 , sc

In [5]:
# SAVE RESULTS CSV
# Writes one row per (query, hit) to search_results.csv: the query text, the
# 1-based rank, the file id of the hit and its similarity score.
import csv

with open("search_results.csv", "w", newline="", encoding="utf-8") as out_file:
    csv_out = csv.writer(out_file)
    csv_out.writerow(["query", "rank", "doc_id", "score"])

    for q in queries:
        # Top 10 hits per query.
        hit_rows, hit_scores = search(q, 10)
        csv_out.writerows(
            [q, position, file_ids[row], sim]
            for position, (row, sim) in enumerate(zip(hit_rows, hit_scores), start=1)
        )
