# Cross-lingual Fasttext

## Preprocessing

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


def _normalise_text(column):
    """Return *column* with punctuation blanked, whitespace collapsed and text lowercased.

    Every character that is neither a word character nor whitespace becomes a
    space, runs of whitespace are squeezed to a single space, and the result
    is lowercased.
    """
    without_punctuation = column.replace(r'[^\w\s]', ' ', regex=True)
    single_spaced = without_punctuation.replace(r'\s+', ' ', regex=True)
    return single_spaced.str.lower()


# Both inputs are tab-separated files; only the text column of each is normalised.
documents = pd.read_csv('data/documents_subset.csv', sep='\t')
documents['doc_title'] = _normalise_text(documents['doc_title'])
queries = pd.read_csv('data/queries.csv', sep='\t')
queries['query_text_rus'] = _normalise_text(queries['query_text_rus'])

import spacy
from spacy import load
from spacy.lang.ru.examples import sentences
from spacy.lang.ru import Russian


# Blank Russian pipeline object; nothing in this cell reads it afterwards.
nlp = Russian()
# The `ru_core_news_sm` pipeline is the one that actually produces the lemmas.
load_model = load("ru_core_news_sm")

# Document titles -> one list of token lemmas per title.
title_texts = documents["doc_title"].values.astype(str).tolist()
lemma = [[token.lemma_ for token in parsed] for parsed in load_model.pipe(title_texts)]
documents['doc_title_clean'] = lemma

# Russian query texts -> one list of token lemmas per query.
query_texts = queries["query_text_rus"].values.astype(str).tolist()
lemma = [[token.lemma_ for token in parsed] for parsed in load_model.pipe(query_texts)]
queries['query_text_clean'] = lemma

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_ru = stopwords.words("russian")
# `stopwords_ru` stays a list for any other consumer; membership tests below go
# through a frozenset so each token check is a hash lookup instead of a scan
# of the whole stopword list.
_stopwords_ru_lookup = frozenset(stopwords_ru)


def _drop_stopwords(tokens):
    """Return the items of *tokens* that are not Russian stopwords, order preserved."""
    return [token for token in tokens if token not in _stopwords_ru_lookup]


# Filter the lemma lists, then keep a space-joined string version of each.
documents['doc_title_clean'] = documents['doc_title_clean'].apply(_drop_stopwords)
documents['doc_title_clean_as_str'] = [
    ' '.join(map(str, tokens)) for tokens in documents['doc_title_clean']
]

queries['query_text_clean'] = queries['query_text_clean'].apply(_drop_stopwords)
queries['query_text_clean_as_str'] = [
    ' '.join(map(str, tokens)) for tokens in queries['query_text_clean']
]



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bagautdinnukhkadiev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Fasttext embeddings

In [2]:
import fasttext
import fasttext.util

# Fetch the English and Russian models (an existing download is left alone),
# then load both 300-dimensional binaries.
for language_code in ('en', 'ru'):
    fasttext.util.download_model(language_code, if_exists='ignore')

en_model = fasttext.load_model('cc.en.300.bin')
ru_model = fasttext.load_model('cc.ru.300.bin')


ModuleNotFoundError: No module named 'fasttext'

In [None]:

# Map each English query term to the word of its top-ranked neighbour returned
# by the Russian model, then join the terms back into one query string.
english_terms = english_query.split()
russian_terms = [ru_model.get_nearest_neighbors(en_model[term])[0][1] for term in english_terms]
aligned_query = " ".join(russian_terms)

In [None]:
# Step 2: Align English query to Russian terms
def align_query_to_russian(english_query, en_model, ru_model):
    """Translate a whitespace-separated English query into Russian terms.

    Each word is looked up in ``en_model`` and the result is handed to
    ``ru_model.get_nearest_neighbors``; the ``[0][1]`` element of the answer
    (the word of the top-ranked neighbour, assuming ``(score, word)`` pairs)
    becomes the Russian term.

    Words whose lookup raises ``KeyError`` and words with an empty neighbour
    list are skipped, so the result never contains doubled spaces.

    NOTE(review): the official fastText binding documents
    ``get_nearest_neighbors`` as taking a word string, and the lookup is only
    meaningful if both models share one embedding space -- confirm both.

    Args:
        english_query: Query text; split on whitespace.
        en_model: Object supporting ``en_model[word]``.
        ru_model: Object exposing ``get_nearest_neighbors(...)``.

    Returns:
        The Russian terms joined by single spaces ("" if none were found).
    """
    russian_query_terms = []
    for word in english_query.split():
        try:
            # Only the lookup can hit a missing vocabulary entry.
            nearest_neighbors = ru_model.get_nearest_neighbors(en_model[word])
        except KeyError:
            continue
        if nearest_neighbors:
            russian_query_terms.append(nearest_neighbors[0][1])  # closest word
    return " ".join(russian_query_terms)

# Turn the English query into Russian terms with the helper defined above.
aligned_query = align_query_to_russian(
    english_query=english_query,
    en_model=en_model,
    ru_model=ru_model,
)

# Step 3: Compute document similarities
def _mean_word_vector(text, model):
    """Return the mean vector of the words of *text* found in *model*, or None if there are none."""
    import numpy as np

    vectors = [model[word] for word in text.split() if word in model]
    if not vectors:
        return None
    return np.mean(vectors, axis=0)


def compute_query_document_similarities(query, documents, ru_model):
    """Score every document against the query by cosine similarity of mean word vectors.

    A text is represented by the mean of the vectors of those of its
    whitespace-separated words for which ``word in ru_model`` holds.

    Args:
        query: Query text.
        documents: Object whose ``.items()`` yields ``(doc_id, doc_text)``
            pairs with string texts, e.g. a dict.
        ru_model: Object supporting ``word in ru_model`` and ``ru_model[word]``.

    Returns:
        Dict mapping each ``doc_id`` to a float similarity. A pair scores 0.0
        when either side has no in-vocabulary word or a zero-length vector.
    """
    # Local import: the visible file never imports numpy at module level.
    import numpy as np

    query_vector = _mean_word_vector(query, ru_model)
    query_norm = 0.0 if query_vector is None else float(np.linalg.norm(query_vector))

    similarities = {}
    for doc_id, doc_text in documents.items():
        doc_vector = _mean_word_vector(doc_text, ru_model)
        doc_norm = 0.0 if doc_vector is None else float(np.linalg.norm(doc_vector))
        if query_norm == 0.0 or doc_norm == 0.0:
            # The cosine is undefined here; score 0.0 instead of propagating NaN.
            similarities[doc_id] = 0.0
            continue
        similarities[doc_id] = float(np.dot(query_vector, doc_vector) / (query_norm * doc_norm))
    return similarities

# The similarity helper iterates `documents.items()` and calls `.split()` on
# each value. On the whole DataFrame that would yield (column name, Series)
# pairs, so pass the column of cleaned, space-joined titles instead; the
# resulting doc ids are the DataFrame's index labels.
# NOTE(review): confirm that the cleaned titles (rather than the raw
# `doc_title` column) are the intended text to match against.
similarities = compute_query_document_similarities(
    aligned_query,
    documents['doc_title_clean_as_str'],
    ru_model,
)

# Step 4: Save results into rankings dictionary
# (doc_id, similarity) pairs, best match first.
ranked_documents = sorted(similarities.items(), key=lambda pair: pair[1], reverse=True)
rankings = {"query_1": ranked_documents}

# Print rankings
print("Rankings Dictionary:")
print(rankings)
