# TF-IDF for CLIR: en/ru
done by: Baga

## Preprocessing

In [1]:
def count_terms(corpus):
    """
    Pass in corpus a list of strings to count overall number of terms in corpus.
    """
    from collections import Counter

    all_terms = [term for text in corpus for term in text.split()]

    # Count occurrences of each term
    term_counts = Counter(all_terms)
    return sum(term_counts.values())



In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


documents = pd.read_csv('data/documents_subset.csv', sep='\t')
documents['doc_title'] = documents['doc_title'].replace(r'[^\w\s]',' ',regex=True).replace(r'\s+',' ',regex=True).str.lower()
queries = pd.read_csv('data/queries.csv', sep='\t')
queries['query_text_rus'] = queries['query_text_rus'].replace(r'[^\w\s]',' ',regex=True).replace(r'\s+',' ',regex=True).str.lower()


In [3]:
print('# of tokens in raw text:', count_terms(documents['doc_title'].astype(str).tolist()))

# of tokens in raw text: 228997


In [4]:
import spacy
from spacy import load
from spacy.lang.ru.examples import sentences
from spacy.lang.ru import Russian


nlp = Russian()
load_model = load("ru_core_news_sm")

In [5]:
lemma = []

for doc in load_model.pipe(documents["doc_title"].values.astype(str).tolist()):
    lemma.append([n.lemma_ for n in doc])


# lemma = [' '.join(i) for i in lemma]
documents['doc_title_clean'] = lemma
documents[['doc_title','doc_title_clean']].head()

Unnamed: 0,doc_title,doc_title_clean
0,путин одобрил суровые наказания россиян за свя...,"[путин, одобрить, суровый, наказание, россияни..."
1,в 2020 году доля оборотной воды в производстве...,"[в, 2020, год, доля, оборотный, вода, в, произ..."
2,ответственное производство и потребление первы...,"[ответственный, производство, и, потребление, ..."
3,правительство раздаст гранты на 4 миллиарда на...,"[правительство, раздать, грант, на, 4, миллиар..."
4,щербакова мишина и галлямов станут специальным...,"[щербаков, мишин, и, галлям, стать, специальны..."


In [6]:
lemma = []

for doc in load_model.pipe(queries["query_text_rus"].values.astype(str).tolist()):
    lemma.append([n.lemma_ for n in doc])


# lemma = [' '.join(i) for i in lemma]
queries['query_text_clean'] = lemma
queries[['query_text_rus','query_text_clean']].head()

Unnamed: 0,query_text_rus,query_text_clean
0,коррупция взяточничество олимпийские игры спор...,"[коррупция, взяточничество, олимпийский, игра,..."
1,инвестиции китая в иран,"[инвестиция, китай, в, иран]"
2,новые технологии точное земледелие интеллектуа...,"[новый, технология, точный, земледелие, интелл..."
3,когда либо предоставленные данные остаются в силе,"[когда, либо, предоставить, данные, оставаться..."
4,наказание за применение допинга в спорте,"[наказание, за, применение, допинг, в, спорт]"


In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_ru = stopwords.words("russian")
documents['doc_title_clean'] = documents['doc_title_clean'].apply(lambda x: [item for item in x if item not in stopwords_ru])
documents['doc_title_clean_as_str'] = [' '.join(map(str, l)) for l in documents['doc_title_clean']]
# documents['doc_title_clean_as_str']



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bagautdinnukhkadiev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
print('# of tokens in preprocessed text:', count_terms(documents['doc_title_clean_as_str'].astype(str).tolist()))

# of tokens in preprocessed text: 184279


In [9]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_ru = stopwords.words("russian")
queries['query_text_clean'] = queries['query_text_clean'].apply(lambda x: [item for item in x if item not in stopwords_ru])
queries['query_text_clean_as_str'] = [' '.join(map(str, l)) for l in queries['query_text_clean']]
# queries['query_text_clean_as_str']



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bagautdinnukhkadiev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Count TF-IDF

In [10]:
# Step 1: Create TF-IDF Term-Document Matrix
tfidf_vectorizer = TfidfVectorizer()
term_document_matrix = tfidf_vectorizer.fit_transform(documents['doc_title_clean_as_str'].values.astype('U'))
# term_document_matrix

In [11]:
def rank_documents(query, tfidf_vectorizer, term_document_matrix, documents):
    # Transform the query into the TF-IDF space
    query_vector = tfidf_vectorizer.transform([query])  # Shape: (1, num_terms)
    
    # Compute cosine similarity between query vector and document vectors
    similarity_scores = cosine_similarity(query_vector, term_document_matrix).flatten()
    
    # Rank documents by similarity score
    ranked_indices = similarity_scores.argsort()[::-1]  # Descending order
    ranked_results = [(documents.iloc[i]['doc_id'], similarity_scores[i]) for i in ranked_indices]
    
    return ranked_results


In [12]:
from sklearn.metrics.pairwise import cosine_similarity


rankings = {}
for _, query_row in queries.iterrows():
    query_id = query_row['query_id']
    query_text = query_row['query_text_clean_as_str']
    rankings[query_id] = rank_documents(query_text, tfidf_vectorizer, term_document_matrix, documents)

# # Step 5: Display Results
# for query_id, results in rankings.items():
#     print(f"Query ID {query_id}:")
#     for doc_id, score in results:
#         print(f"  Document ID: {doc_id}, Similarity Score: {score:.4f}")

In [18]:
rankings[200]

[('5920d6b9-3006-413c-9711-29183cb47d53', 0.38312778064343045),
 ('c8cf62b1-d540-4524-995b-f01445ed351d', 0.3801034155455418),
 ('f2a00c50-c67d-4163-b86c-57bc0be7f461', 0.35960314728238385),
 ('dbc0d493-44a5-4f5f-84ce-066d341a44a6', 0.3324065301399746),
 ('25a172c7-8b5d-4931-a8c8-cf7cdb5533ef', 0.32813917693798783),
 ('ee13d769-66e1-459b-9293-85417adc2118', 0.32307000741833675),
 ('98e3fb10-fec7-4a26-899a-f6957fab2956', 0.3133831669848835),
 ('cb7a72cb-ce14-4016-93af-af682d6b001c', 0.30791885669055036),
 ('5515c4a6-a4f5-4ae5-aacf-815f7c99a252', 0.28871853604837355),
 ('53a6e769-6955-4c3a-a9c6-cbeca597c7c0', 0.283989833643827),
 ('4f1b38f4-6300-4db9-b3de-9908dad115b0', 0.2835755582970311),
 ('c97d5baa-c04e-40a6-8896-2625fd883b07', 0.27731761460687127),
 ('d8a9d957-6bd5-4449-97ca-c9cb14d4bf75', 0.27721258656714226),
 ('f998e872-7cd3-494a-8135-af3a5020f7e1', 0.2750875442132492),
 ('dac65912-735f-4ec3-b8bc-559cb963a9c4', 0.2749696501602974),
 ('6a4f28b6-e923-44dc-b22e-9695b6d4c35d', 0.2746

## Evaluation

In [13]:
import ir_measures
from ir_measures import nDCG, MAP, RBP, Recall, Qrel, ScoredDoc


qrels_pd = pd.read_csv('data/qrels.csv', sep='\t')

qrels = [
    ir_measures.Qrel(query_id=str(row['query_id']), doc_id=row['doc_id'], relevance=row['relevance_class'])
    for _, row in qrels_pd.iterrows()
]

In [14]:
# Flatten rankings into a DataFrame
flattened_rankings = []
for query_id, docs in rankings.items():
    for doc_id, score in docs:
        flattened_rankings.append({'query_id': str(query_id), 'doc_id': doc_id, 'score': score})

# Convert to a DataFrame
flattened_rankings_df = pd.DataFrame(flattened_rankings)
# Ensure documents are sorted by score for each query
flattened_rankings_df = flattened_rankings_df.sort_values(by=['query_id', 'score'], ascending=[True, False])

# Display the flattened rankings
flattened_rankings_df[:10]


Unnamed: 0,query_id,doc_id,score
0,200,5920d6b9-3006-413c-9711-29183cb47d53,0.383128
1,200,c8cf62b1-d540-4524-995b-f01445ed351d,0.380103
2,200,f2a00c50-c67d-4163-b86c-57bc0be7f461,0.359603
3,200,dbc0d493-44a5-4f5f-84ce-066d341a44a6,0.332407
4,200,25a172c7-8b5d-4931-a8c8-cf7cdb5533ef,0.328139
5,200,ee13d769-66e1-459b-9293-85417adc2118,0.32307
6,200,98e3fb10-fec7-4a26-899a-f6957fab2956,0.313383
7,200,cb7a72cb-ce14-4016-93af-af682d6b001c,0.307919
8,200,5515c4a6-a4f5-4ae5-aacf-815f7c99a252,0.288719
9,200,53a6e769-6955-4c3a-a9c6-cbeca597c7c0,0.28399


In [15]:
def evaluate(qrels, result):
    runs = [
        ScoredDoc(query_id=row['query_id'], doc_id=row['doc_id'], score=row['score'])
        for _, row in result.iterrows()
    ]

    metrics = [
        ir_measures.nDCG @ 20,   # nDCG@20
        ir_measures.AP,          # Average Precision
        ir_measures.RBP(rel=1),  # Relevance Based Precision
        ir_measures.R @ 100,     # Recall@100
        ir_measures.R @ 1000     # Recall@1000
    ]

    scores = ir_measures.calc_aggregate([nDCG@20, MAP, RBP(rel=1), Recall@100, Recall@1000], qrels, runs)
    # scores = ir_measures.calc_aggregate([nDCG@20, MAP, Recall@100, Recall@1000], qrels, runs)

    return scores

In [16]:
performance_tfidf = evaluate(qrels, flattened_rankings_df)
print("Evaluation Metrics (TF-IDF):")
for metric, value in performance_tfidf.items():
    print(f"{metric}: {value}")

Evaluation Metrics (TF-IDF):
RBP(rel=1): 0.241736092329987
R@1000: 0.46652286021817835
AP: 0.10099110188958242
nDCG@20: 0.20449876607983064
R@100: 0.24911998059212967


In [None]:
qrels.rename(columns={'relevance_class': 'relevance'}, inplace=True)