# LSA Models
Done by: Baga

## Import cleaned data

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


documents = pd.read_csv('data/documents_subset.csv', sep='\t')
documents['doc_title'] = documents['doc_title'].replace(r'[^\w\s]',' ',regex=True).replace(r'\s+',' ',regex=True).str.lower()
queries = pd.read_csv('data/queries.csv', sep='\t')
queries['query_text_rus'] = queries['query_text_rus'].replace(r'[^\w\s]',' ',regex=True).replace(r'\s+',' ',regex=True).str.lower()

import spacy
from spacy import load
from spacy.lang.ru.examples import sentences
from spacy.lang.ru import Russian


nlp = Russian()
load_model = load("ru_core_news_sm")

lemma = []

for doc in load_model.pipe(documents["doc_title"].values.astype(str).tolist()):
    lemma.append([n.lemma_ for n in doc])


# lemma = [' '.join(i) for i in lemma]
documents['doc_title_clean'] = lemma
lemma = []

for doc in load_model.pipe(queries["query_text_rus"].values.astype(str).tolist()):
    lemma.append([n.lemma_ for n in doc])


# lemma = [' '.join(i) for i in lemma]
queries['query_text_clean'] = lemma

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_ru = stopwords.words("russian")
documents['doc_title_clean'] = documents['doc_title_clean'].apply(lambda x: [item for item in x if item not in stopwords_ru])
documents['doc_title_clean_as_str'] = [' '.join(map(str, l)) for l in documents['doc_title_clean']]
# documents['doc_title_clean_as_str']


queries['query_text_clean'] = queries['query_text_clean'].apply(lambda x: [item for item in x if item not in stopwords_ru])
queries['query_text_clean_as_str'] = [' '.join(map(str, l)) for l in queries['query_text_clean']]
# queries['query_text_clean_as_str']



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bagautdinnukhkadiev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Vectorize Using TF-IDF

In [18]:
# Step 1: Create TF-IDF Term-Document Matrix
tfidf_vectorizer = TfidfVectorizer()
term_document_matrix = tfidf_vectorizer.fit_transform(documents['doc_title_clean_as_str'].values.astype('U'))
# term_document_matrix

## SVD

In [19]:
from sklearn.decomposition import TruncatedSVD
import numpy as np 

# Step 2: Apply SVD
n_components = 500  # Set number of latent dimensions
svd = TruncatedSVD(n_components=n_components)
svd_matrix = svd.fit_transform(term_document_matrix)

# Extract U, Sigma, V^T
U = svd.components_.T  # Term-topic matrix
Sigma = np.diag(svd.singular_values_)  # Diagonal matrix of singular values
VT = svd.components_  # Document-topic matrix

In [20]:
# Function to project query into topic space
def compute_query_vector_projection(query, tfidf_vectorizer, U):
    # Transform the query into a sparse vector using the same TF-IDF vectorizer
    sparse_query_vector = tfidf_vectorizer.transform([query]).toarray()[0]  # q is mx1

    # Project sparse query vector into the topic space
    query_vector_topic_space = np.dot(U.T, sparse_query_vector)  # q' = U^T q
    return query_vector_topic_space

# Function to compute cosine similarity
def compute_cosine_similarity(query_vector, document_vectors):
    # Normalize query and document vectors
    query_norm = query_vector / np.linalg.norm(query_vector)
    doc_norms = document_vectors / np.linalg.norm(document_vectors, axis=1, keepdims=True)

    # Compute cosine similarity
    cosine_similarities = np.dot(doc_norms, query_norm)
    return cosine_similarities

# Compute rankings for all queries
rankings = {}
for _, row in queries.iterrows():
    query_id = str(row['query_id'])  # Convert query_id to string for consistency
    query_text = row['query_text_clean_as_str']
    
    # Project query into topic space
    query_vector_projected = compute_query_vector_projection(query_text, tfidf_vectorizer, U)
    
    # Compute cosine similarity
    cosine_similarities = compute_cosine_similarity(query_vector_projected, VT.T)  # VT.T is NxK
    
    # Rank documents and get scores
    ranked_docs_indices = np.argsort(-cosine_similarities)  # Sort descending
    ranked_docs = [
        (documents.iloc[i]['doc_id'], cosine_similarities[i]) for i in ranked_docs_indices
    ]
    
    # Store the results in the rankings dictionary
    rankings[query_id] = ranked_docs

# Print the rankings dictionary
# print("Rankings Dictionary:")
# for query_id, results in rankings.items():
#     print(f"Query ID {query_id}: {results[:5]}")  # Display the top 5 results for brevity


## Evaluation

In [13]:
import ir_measures
from ir_measures import nDCG, MAP, RBP, Recall, Qrel, ScoredDoc


qrels_pd = pd.read_csv('data/qrels.csv', sep='\t')

qrels = [
    ir_measures.Qrel(query_id=str(row['query_id']), doc_id=row['doc_id'], relevance=row['relevance_class'])
    for _, row in qrels_pd.iterrows()
]

In [14]:
# Flatten rankings into a DataFrame
flattened_rankings = []
for query_id, docs in rankings.items():
    for doc_id, score in docs:
        flattened_rankings.append({'query_id': str(query_id), 'doc_id': doc_id, 'score': score})

# Convert to a DataFrame
flattened_rankings_df = pd.DataFrame(flattened_rankings)
# Ensure documents are sorted by score for each query
flattened_rankings_df = flattened_rankings_df.sort_values(by=['query_id', 'score'], ascending=[True, False])

# Display the flattened rankings
flattened_rankings_df[:10]


Unnamed: 0,query_id,doc_id,score
0,200,ecd7c3e5-b990-4ae5-bf62-a65964f7d7ca,0.602216
1,200,91840fdf-31a7-40ca-86db-7e9c9ee26d24,0.577852
2,200,3c9b2cbb-a4b2-491c-94f9-e47099c0f75e,0.547494
3,200,ca3c625d-a4ff-4915-89a1-87e892b8e310,0.537686
4,200,ba7080d3-d795-47f6-85b0-4ceb5799738e,0.537438
5,200,463bdca0-dd6c-40f4-8c56-cef44ab58cc4,0.527697
6,200,6343b421-e1ce-45cb-85bf-4b7900c49fdf,0.527697
7,200,784757d8-4410-4b4a-83e3-7f94bc284a50,0.522468
8,200,b77c3fa1-7879-427b-b482-d80afad036e7,0.522257
9,200,9b5f1c6e-aa46-4a0d-b403-fb29f26fa820,0.461142


In [15]:
def evaluate(qrels, result):
    runs = [
        ScoredDoc(query_id=row['query_id'], doc_id=row['doc_id'], score=row['score'])
        for _, row in result.iterrows()
    ]

    metrics = [
        ir_measures.nDCG @ 20,   # nDCG@20
        ir_measures.AP,          # Average Precision
        ir_measures.RBP(rel=1),  # Relevance Based Precision
        ir_measures.R @ 100,     # Recall@100
        ir_measures.R @ 1000     # Recall@1000
    ]

    scores = ir_measures.calc_aggregate([nDCG@20, MAP, RBP(rel=1), Recall@100, Recall@1000], qrels, runs)
    # scores = ir_measures.calc_aggregate([nDCG@20, MAP, Recall@100, Recall@1000], qrels, runs)

    return scores

In [16]:
performance_tfidf = evaluate(qrels, flattened_rankings_df)
print("Evaluation Metrics SVD:")
for metric, value in performance_tfidf.items():
    print(f"{metric}: {value}")

Evaluation Metrics SVD:
nDCG@20: 0.0009344739449582717
AP: 0.0028777968463369347
R@100: 0.004798560642855103
R@1000: 0.051033122420217104
RBP(rel=1): 0.002689839650097832


## Results

In [None]:
# 1024 components
AP: 0.002972477146510521
nDCG@20: 0.0009344739449582717
RBP(rel=1): 0.0026801942405148675
R@100: 0.0028890962043401553
R@1000: 0.05221304466258818


