In [202]:
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
from operator import itemgetter
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from numba import jit
from concurrent.futures import ProcessPoolExecutor
from sklearn.feature_extraction.text import TfidfVectorizer
from annoy import AnnoyIndex
import random
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.random_projection import SparseRandomProjection
from annoy import AnnoyIndex
import time
from collections import defaultdict
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
from tfidf import *

# Preprocessing
## Data Retrieval

In [178]:
corpus = pd.read_json('data/corpus.jsonl', lines=True).sort_values(by=["_id"]).rename(columns={"_id": "corpus-id"})
corpus

Unnamed: 0,corpus-id,text
1000000,0,The presence of communication amid scientific ...
966376,8,"In June 1942, the United States Army Corps of ..."
468831,12,Tutorial: Introduction to Restorative Justice....
1000001,16,The approach is based on a theory of justice t...
306952,23,Phloem is a conductive (or vascular) tissue fo...
...,...,...
950989,8841780,Wolves don't hide. They don't even live in cav...
395590,8841787,The UNHCR Country Representative in Kenya. Str...
93101,8841790,2. Describe the misery at Kakuma. 3. Compariso...
669122,8841800,Following the death of his employer and mentor...


In [179]:
queries = pd.read_json(path_or_buf='data/queries.jsonl', lines=True).sort_values(by=["_id"])
queries['text'] = queries['text'].str.strip()#.apply(tokenize)
queries = queries.drop(columns=["metadata"]).rename(columns={"_id": "query-id"})
queries

Unnamed: 0,query-id,text
506217,2,Androgen receptor define
65864,3,Another name for the primary visual cortex is
372466,4,Defining alcoholism as a disease is associated...
326447,5,ECT is a treatment that is used for
117580,6,"Ebolavirus is an enveloped virus, which means"
...,...,...
158901,1185863,why did rachel carson die
83120,1185864,definition of ramen
7634,1185865,amex india customer care number
1,1185868,_________ justice is designed to repair the ha...


In [180]:
query_corpus_train_map = pd.read_csv("data/task1_train.tsv", sep="\t")
query_corpus_train_map.sort_values(by="query-id")

Unnamed: 0,query-id,corpus-id,score
70257,3,1142680,1
395137,4,5613529,1
346352,5,4956428,1
125307,6,1931409,1
66896,8,1094214,1
...,...,...,...
169115,1185863,2545716,1
88577,1185864,1408016,1
8141,1185865,229186,1
1,1185868,16,1


In [181]:
queries_train = pd.merge(queries, query_corpus_train_map, left_on='query-id', right_on='query-id', how='inner').drop(columns=[ "score","corpus-id"])
queries_train_subset = queries_train.iloc[:7437, :]
queries_train_subset

Unnamed: 0,query-id,text
0,3,Another name for the primary visual cortex is
1,4,Defining alcoholism as a disease is associated...
2,5,ECT is a treatment that is used for
3,6,"Ebolavirus is an enveloped virus, which means"
4,8,"In humans, the normal set point for body tempe..."
...,...,...
7432,18204,anger is fear
7433,18205,anger management definition
7434,18208,angie baby meaning
7435,18209,angie lindvall


In [182]:
df_test = pd.read_csv("data/task1_test.tsv", sep="\t")
queries_test = pd.merge(queries, df_test, left_on='query-id', right_on='query-id', how='inner').drop(columns=["id"])
queries_test

Unnamed: 0,query-id,text
0,2,Androgen receptor define
1,1215,3 levels of government in canada and their res...
2,1288,3/5 of 60
3,1576,60x40 slab cost
4,2235,Bethel University was founded in what year
...,...,...
7432,1102335,why do people buy cars
7433,1102351,why do jefferson and stanton include these sim...
7434,1102390,why do children get aggressive
7435,1102393,why do celebrate st patrick's day


## Tools preparation & usage

### New Test 2

In [188]:
correct_corpus = corpus[corpus["corpus-id"]== 1142680]
correct_query = queries_train_subset[queries_train_subset["query-id"] == 3]
print(correct_corpus)
correct_query

         corpus-id                                               text
1061928    1142680  The primary (parts of the cortex that receive ...


Unnamed: 0,query-id,text
0,3,Another name for the primary visual cortex is


In [253]:
def preprocess_text(text):
    """Optimized text preprocessing function."""
    
    # Cleaning
    text = HTML_PATTERN.sub("", text)
    text = NON_ASCII_DIGITS_PATTERN.sub(" ", text)
    text = NON_ASCII_CHARS_PATTERN.sub('', text)
    text = "".join([ch for ch in text if ch not in string.punctuation])
    
    # Tokenization
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords, and then perform Stemming and Lemmatization
    preprocessed_tokens = [
        STEMMER.stem(LEMMATIZER.lemmatize(word))
        for word in tokens
        if word.lower() not in STOPWORDS_SET
    ]
    
    return preprocessed_tokens
preprocess_text(correct_corpus.iloc[0]["text"])

['primari',
 'part',
 'cortex',
 'receiv',
 'sensori',
 'input',
 'thalamu',
 'visual',
 'cortex',
 'also',
 'known',
 'v',
 'v',
 'isual',
 'area',
 'one',
 'striat',
 'cortex',
 'extrastri',
 'area',
 'consist',
 'visual',
 'area',
 'two',
 'v',
 'three',
 'v',
 'four',
 'v',
 'five',
 'v',
 'primari',
 'visual',
 'cortex',
 'best',
 'studi',
 'visual',
 'area',
 'brain',
 'mammal',
 'studi',
 'locat',
 'posterior',
 'pole',
 'occipit',
 'cortex',
 'occipit',
 'cortex',
 'respons',
 'process',
 'visual',
 'stimulu']

In [254]:
subset_docs = corpus.iloc[:5]
subset_docs = pd.concat([subset_docs, correct_corpus], ignore_index=True)
subset_docs

Unnamed: 0,corpus-id,text
0,0,The presence of communication amid scientific ...
1,8,"In June 1942, the United States Army Corps of ..."
2,12,Tutorial: Introduction to Restorative Justice....
3,16,The approach is based on a theory of justice t...
4,23,Phloem is a conductive (or vascular) tissue fo...
5,1142680,The primary (parts of the cortex that receive ...


In [255]:
def populate_tfidf_dataframe(documents, vocabulary):
    # Create a list of dictionaries with term frequencies
    list_of_dicts = [Counter(doc) for doc in documents]
    
    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(list_of_dicts).fillna(0)
    
    # Reorder columns according to the vocabulary and fill missing columns with 0
    df = df.reindex(columns=vocabulary, fill_value=0)
    
    return df

In [272]:
def tfidf_with_pandas(corpus):
    # Parallel tokenization and preprocessing
    print("Process docs ...")
    documents = parallel_preprocess_texts(corpus)

    print("Create vocab ...")
    # Create the vocabulary
    vocabulary = list(set(word for doc in documents for word in doc))
    vocabulary.sort()
    
    # Use the helper function to create and populate the DataFrame for term frequencies
    print("Compute tf ...")
    df = populate_tfidf_dataframe(documents, vocabulary)

    # Compute IDF values
    print("Compute idf ...")
    doc_count = len(documents)
    idf = df[df > 0].count().apply(lambda x: log(doc_count / x))

        
    # Compute TF-IDF values
    print("Compute tf-idf ...")
    tfidf_df = df.apply(lambda x: x / x.sum(), axis=1).multiply(idf)
    
    return original_documents, documents, tfidf_df, vocabulary, idf
original_documents, documents, tfidf_df, vocabulary, idf = tfidf_with_pandas(subset_docs["text"])
#tfidf_with_pandas(subset_docs["text"])["cortex"]

Process docs ...
Create vocab ...
Compute tf ...
Compute idf ...
Compute tf-idf ...


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package punkt is already up-to-da

In [273]:
tfidf_df["cortex"]

0    0.000000
1    0.000000
2    0.000000
3    0.000000
4    0.000000
5    0.206741
Name: cortex, dtype: float64

In [281]:
def vectorize_query(query, vocabulary, idf):
    """Convert the query into its TF-IDF vector."""
    query_tf = Counter(preprocess_text(query))
    print(query_tf)
    query_vector = [query_tf.get(term, 0) * idf[term] for term in vocabulary]
    return query_vector

vector_query = vectorize_query(correct_query["text"].iloc[0], vocabulary, idf)
#query_vector = {k: v for k, v in vector_query.items() if v != 0}

#rint(query_vector)
print(vocabulary)

Counter({'anoth': 1, 'name': 1, 'primari': 1, 'visual': 1, 'cortex': 1})
['accomplish', 'account', 'achiev', 'also', 'amid', 'approach', 'area', 'armi', 'atom', 'base', 'behaviour', 'best', 'bomb', 'brain', 'carri', 'caus', 'cloud', 'commun', 'conduct', 'consid', 'consist', 'cooper', 'corp', 'cortex', 'crime', 'crimin', 'dialogu', 'emphas', 'engin', 'engineersbegan', 'equal', 'extrastri', 'five', 'foster', 'found', 'four', 'glucos', 'govern', 'hang', 'harm', 'highest', 'hundr', 'identifi', 'import', 'impress', 'includ', 'individu', 'innoc', 'input', 'intellect', 'introduct', 'involv', 'isual', 'june', 'justic', 'known', 'lead', 'leaf', 'life', 'locat', 'mammal', 'manhattan', 'meant', 'mind', 'name', 'obliter', 'occipit', 'offend', 'offens', 'one', 'part', 'peopl', 'phloem', 'photosynthesi', 'plant', 'pole', 'posterior', 'practic', 'presenc', 'primari', 'process', 'product', 'program', 'project', 'purpos', 'rate', 'rather', 'receiv', 'reflect', 'relationship', 'repair', 'research', 'res

In [282]:
def batch_query(tfidf_matrix_normalized, query_vectors):
    """Process multiple queries and return ranked document indices for each query."""
    # Compute cosine similarities using matrix operations
    similarity_matrix = np.dot(query_vectors, tfidf_matrix_normalized.T)
    print(similarity_matrix)
    # Get document indices ranked by relevance for each query
    ranked_doc_indices = np.argsort(-similarity_matrix)
    return ranked_doc_indices
print(tfidf_df.values)
batch_query(np.array(tfidf_df.values), vector_query)

[[0.         0.         0.06636146 0.         0.06636146 0.
  0.         0.         0.04068934 0.         0.         0.
  0.         0.         0.         0.         0.06636146 0.02567212
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.06636146 0.
  0.06636146 0.         0.         0.         0.         0.
  0.         0.         0.06636146 0.         0.         0.06636146
  0.         0.06636146 0.06636146 0.         0.         0.06636146
  0.         0.06636146 0.         0.         0.         0.
  0.         0.         0.         0.         0.06636146 0.
  0.         0.04068934 0.06636146 0.06636146 0.         0.06636146
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.06636146 0.         0.         0.         0.         0.04068934
  0.         0.         0.         0.         0.         0.
  0.         0.06636146 0.         0.         0.         0.


array([5, 1, 0, 2, 3, 4])

### New Test 1


In [64]:
subset_docs = corpus.iloc[:7000]
mini_sub = corpus.iloc[:5]
#subset_docs["list"] = subset_docs['text'].apply(lambda x: preprocess_text(x))
#subset_docs["list"]
subset_docs["list"] = parallel_preprocess_texts(subset_docs['text'])
mini_sub["list"] = parallel_preprocess_texts(mini_sub['text'])
subset_docs

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package punkt is already up-to-da

Unnamed: 0,corpus-id,text,list
1000000,0,The presence of communication amid scientific ...,"[presenc, commun, amid, scientif, mind, equal,..."
966376,8,"In June 1942, the United States Army Corps of ...","[june, unit, state, armi, corp, engineersbegan..."
468831,12,Tutorial: Introduction to Restorative Justice....,"[tutori, introduct, restor, justic, restor, ju..."
1000001,16,The approach is based on a theory of justice t...,"[approach, base, theori, justic, consid, crime..."
306952,23,Phloem is a conductive (or vascular) tissue fo...,"[phloem, conduct, vascular, tissu, found, plan..."
...,...,...,...
605287,51256,Common mergansers in the Palearctic region typ...,"[common, mergans, palearct, region, typic, bre..."
206126,51261,How long do they live? The record for the olde...,"[long, live, record, oldest, common, mergans, ..."
263763,51266,If you are using desktop you can easily find t...,"[use, desktop, easili, find, serial, number, b..."
205026,51272,Use the following steps find your model number...,"[use, follow, step, find, model, number, use, ..."


In [65]:
mini_sub

Unnamed: 0,corpus-id,text,list
1000000,0,The presence of communication amid scientific ...,"[presenc, commun, amid, scientif, mind, equal,..."
966376,8,"In June 1942, the United States Army Corps of ...","[june, unit, state, armi, corp, engineersbegan..."
468831,12,Tutorial: Introduction to Restorative Justice....,"[tutori, introduct, restor, justic, restor, ju..."
1000001,16,The approach is based on a theory of justice t...,"[approach, base, theori, justic, consid, crime..."
306952,23,Phloem is a conductive (or vascular) tissue fo...,"[phloem, conduct, vascular, tissu, found, plan..."


In [103]:
vocabulary = list(set(word for doc in mini_sub["list"] for word in doc))
vocabulary.sort()
vocabulary#[0]
#print(subset_docs[subset_docs["list"].apply(lambda x: 'aa' in x)].iloc[0]["text"])


['accomplish',
 'account',
 'achiev',
 'amid',
 'approach',
 'armi',
 'atom',
 'base',
 'behaviour',
 'best',
 'bomb',
 'carri',
 'caus',
 'cloud',
 'commun',
 'conduct',
 'consid',
 'cooper',
 'corp',
 'crime',
 'crimin',
 'dialogu',
 'emphas',
 'engin',
 'engineersbegan',
 'equal',
 'foster',
 'found',
 'glucos',
 'govern',
 'hang',
 'harm',
 'highest',
 'hundr',
 'identifi',
 'import',
 'impress',
 'includ',
 'individu',
 'innoc',
 'intellect',
 'introduct',
 'involv',
 'june',
 'justic',
 'lead',
 'leaf',
 'life',
 'manhattan',
 'meant',
 'mind',
 'name',
 'obliter',
 'offend',
 'offens',
 'part',
 'peopl',
 'phloem',
 'photosynthesi',
 'plant',
 'practic',
 'presenc',
 'process',
 'product',
 'program',
 'project',
 'purpos',
 'rate',
 'rather',
 'reflect',
 'relationship',
 'repair',
 'research',
 'respond',
 'restor',
 'satisfact',
 'scientif',
 'secret',
 'shown',
 'stakehold',
 'state',
 'step',
 'success',
 'sucros',
 'take',
 'theori',
 'thousand',
 'tissu',
 'tradit',
 'tra

In [104]:
def pop_tfidf_dataframe(documents, vocabulary):
    # Create a list of dictionaries with term frequencies
    list_of_dicts = [Counter(doc) for doc in documents]
    #print(list_of_dicts)
    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(list_of_dicts).fillna(0)
    
    # Reorder columns according to the vocabulary and fill missing columns with 0
    df = df.reindex(columns=vocabulary, fill_value=0)
    
    return df

pop = pop_tfidf_dataframe(mini_sub['list'], vocabulary)
pop.loc[:, (pop != 0).any(axis=0)]


Unnamed: 0,accomplish,account,achiev,amid,approach,armi,atom,base,behaviour,best,...,thousand,tissu,tradit,transform,truli,tutori,unit,vascular,victim,wrongdo
0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [105]:
doc_count = len(mini_sub)
doc_count

5

In [152]:
#gamma = 1/100.0 #PARAMETER TO MAKE THE idf more visible
alpha = 0  # Smoothing parameter, you can experiment with this
idf = pop.sum().apply(lambda x: log(doc_count / (x + alpha)))

print(idf.describe())
#non_one_columns = pop.columns[pop.sum() != 1]
#print(pop.sum()[non_one_columns])
#print(idf[non_one_columns])
#log(5/pop[pop > 0].count()["theori"])
#log(5/2)

count    96.000000
mean      1.429603
std       0.365684
min       0.000000
25%       1.609438
50%       1.609438
75%       1.609438
max       1.609438
dtype: float64


In [153]:
tfidf_df = pop.apply(lambda x: x / x.sum(), axis=1).multiply(idf)
tfidf_df

Unnamed: 0,accomplish,account,achiev,amid,approach,armi,atom,base,behaviour,best,...,thousand,tissu,tradit,transform,truli,tutori,unit,vascular,victim,wrongdo
0,0.0,0.0,0.059609,0.059609,0.0,0.0,0.033937,0.0,0.0,0.0,...,0.059609,0.0,0.0,0.0,0.059609,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.13412,0.076358,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.13412,0.0,0.0,0.0
2,0.034988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034988,0.034988,...,0.0,0.0,0.034988,0.039839,0.0,0.034988,0.0,0.0,0.0,0.0
3,0.0,0.064378,0.0,0.0,0.064378,0.0,0.0,0.064378,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073303,0.064378
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.107296,0.0,0.0,0.0,0.0,0.0,0.107296,0.0,0.0


In [154]:
min_query = queries_train_subset.iloc[:5]
min_query['list'] = parallel_preprocess_texts(min_query['text'])
min_query

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ericsaikali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  min_query['list'] = parallel_preprocess_texts(min_query['text'])


Unnamed: 0,query-id,text,list
0,3,Another name for the primary visual cortex is,"[anoth, name, primari, visual, cortex]"
1,4,Defining alcoholism as a disease is associated...,"[defin, alcohol, diseas, associ, jellinek]"
2,5,ECT is a treatment that is used for,"[ect, treatment, use]"
3,6,"Ebolavirus is an enveloped virus, which means","[ebolaviru, envelop, viru, mean]"
4,8,"In humans, the normal set point for body tempe...","[human, normal, set, point, bodi, temperatur]"


In [158]:
def vectorize_query(query, vocabulary, idf):
    """Convert the query into its TF-IDF vector."""
    query_tf = Counter(query)
    query_vector = [query_tf.get(term, 0) * idf[term] for term in vocabulary]
    return np.array(query_vector)
vects = min_query["list"].apply(lambda x: vectorize_query(x, vocabulary, idf))
vects

0    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: list, dtype: object

In [175]:

def batch_query(tfidf_matrix_normalized, query_vectors):
    """Process multiple queries and return ranked document indices for each query."""
    # Compute cosine similarities using matrix operations
    print(tfidf_matrix_normalized.shape)
    print(query_vectors.shape)
    similarity_matrix = np.dot(query_vectors, tfidf_matrix_normalized.T)
    print(similarity_matrix.shape)
    # Get document indices ranked by relevance for each query
    ranked_doc_indices = np.argsort(-similarity_matrix)
    
    return ranked_doc_indices
array_2d = np.vstack(vects)
batch_query(np.array(tfidf_df), np.array(array_2d))



(5, 96)
(5, 96)
(5, 5)


array([[1, 0, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4]])

### TEST 1

In [None]:
@jit(nopython=True)
def cosine_distance(u:np.ndarray, v:np.ndarray):
    dot_products = v.dot(u.T)

    # Compute norms
    query_norm = np.linalg.norm(u)
    corpus_norms = np.linalg.norm(v.toarray(), axis=1)

    # Compute cosine similarities
    cosine_similarities = dot_products.flatten() / (query_norm * corpus_norms + 1e-10)  # small value to avoid division by zero

    return cosine_similarities

In [None]:
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 1, stop_words = 'english')
vectorizer.fit(corpus["text"])

In [None]:
n = 100  # maximum size of each chunk

sub_corpus_list = [df for _, df in corpus.groupby(np.arange(len(corpus)) // n)]
sub_corpus_list[1]

In [None]:
def find_top_k_matches(query):
    query_vector = vectorizer.transform([query['text']])

    # Compute cosine similarities
    cosine_similarities = np.array([1,2,3,4,5,6,7,8,9,10,11]) #linear_kernel(query_vector, sub_corpus_matrix).flatten()
    # Get top k corpus indices from this sub-corpus
    top_k_indices = cosine_similarities.argsort()[-10:][::-1]  # Here, k=10
    
    local_results = []
    for index in top_k_indices:
        local_results.append({
            'query_id': query['query-id'],
            'corpus_id': sub_corpus_df.iloc[index]['_id']
        })
    return local_results

In [None]:
results = []
for sub_corpus_df in sub_corpus_list:
    print("Transforming")
    sub_corpus_matrix = vectorizer.fit_transform(sub_corpus_df["text"])
    print("Parallelizing")
    # Use ProcessPoolExecutor to parallelize the inner loop
    count = 0
    for _, query in queries_test.iterrows():
        top_k_indices = find_top_k_matches(query)
        results.extend(top_k_indices)
        if count%100==0:
            print(count)
        count +=1
        
        

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('query_corpus_mapping.csv', index=False)

In [None]:
# Assuming you have a list of documents and queries
documents = corpus
qs = queries_test 

# 1. Vectorize the documents using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
doc_vectors = vectorizer.fit_transform(documents["text"]).toarray()

# 2. Build Annoy Index
f = doc_vectors.shape[1]  # Number of dimensions of the vector
t = AnnoyIndex(f, 'angular')  # Use 'angular' for cosine similarity

for i in range(len(documents)):
    t.add_item(i, doc_vectors[i])

t.build(50)  # 50 trees. Increase if needed.

# 3. Query the index
results = {}

for query in qs:
    query_vector = vectorizer.transform([query]).toarray()[0]
    top10_indices = t.get_nns_by_vector(query_vector, 10)  # Find the top 10 document indices
    results[query] = [documents[i] for i in top10_indices]

print(results)


### TEST 2

In [None]:
def map_qrs_to_docs(qrs, dcs):
    
    # 1. Hashing: Convert documents to hashed vectors
    start = time.time()
    hash_vectorizer = HashingVectorizer(n_features=2**20, stop_words='english', norm=None)
    hashed_docs = hash_vectorizer.transform(dcs["text"])
    end = time.time()
    print(f"Hashing documents took {end - start} seconds.")
    
    # 2. Dimensionality Reduction
    start = time.time()
    transformer = SparseRandomProjection(n_components=100)  # Reducing to 100 dimensions
    reduced_docs = transformer.fit_transform(hashed_docs).toarray()
    end = time.time()
    print(f"Dimensionality reduction took {end - start} seconds.")
    
    # Build Annoy Index
    start = time.time()
    f = reduced_docs.shape[1]
    t = AnnoyIndex(f, 'angular')
    for i, vector in enumerate(reduced_docs):
        t.add_item(i, vector)
    t.build(50)
    end = time.time()
    print(f"Building Annoy index took {end - start} seconds.")
    
    # Query the index
    results = {}
    start = time.time()
    for index, row in qrs.iterrows():
        query_text = row["text"]
        hashed_query = hash_vectorizer.transform([query_text])
        reduced_query = transformer.transform(hashed_query).toarray()[0]
        top10_indices = t.get_nns_by_vector(reduced_query, 10)
        
        # Storing the _id from documents
        results[row["query-id"]] = [dcs.iloc[i]["corpus-id"] for i in top10_indices]
    end = time.time()
    print(f"Querying the index took {end - start} seconds.")
    
    return results


In [None]:
results = map_qrs_to_docs(queries_train_subset, corpus)
pd.DataFrame(results)

### TEST 3

In [None]:
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def build_inverted_index(docs):
    index = defaultdict(set)
    for doc_id, row in docs.iterrows():
        # Remove punctuation and convert to lowercase
        clean_text = remove_punctuation(row["text"].lower())
        for word in clean_text.split():
            index[word].add(doc_id)
    return index
    
def filter_docs(query, index):
    relevant_doc_ids = set()
    for word in query.split():
        relevant_doc_ids.update(index.get(word, set()))
    return relevant_doc_ids

start = time.time()
inverted_index = build_inverted_index(corpus)
end = time.time()
print(f"Indexing took {end - start} seconds.")
inverted_index

In [None]:
def remove_stopwords_from_index(inverted_index):
    # Use a predefined list of stopwords (ENGLISH_STOP_WORDS from sklearn here)
    for stopword in ENGLISH_STOP_WORDS:
        if stopword in inverted_index:
            del inverted_index[stopword]
    return inverted_index
inverted_index = remove_stopwords_from_index(inverted_index)
inverted_index

## JUNK & OTHERS

In [None]:
"""for idx, df in enumerate(sub_corpus_list):
    # Transform the text in the dataframe using the vectorizer
    tfidf_matrix = vectorizer.transform(df["text"])
    
    # Convert the sparse matrix to a dense DataFrame
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    
    # Save the DataFrame to a CSV file
    tfidf_df.to_csv(f'tfidf_matrix_{idx}.csv', index=False)"""
"""
for idx, df in enumerate(sub_corpus_list):
    print("Transforming")
    tfidf_matrix = vectorizer.transform(df["text"])
    #print("framing")
    #tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    #print("storing")
    #tfidf_df.to_parquet(f'tfidf_matrix_{idx}.parquet')
    for idx, q_text  in queries:
        query_feature = tf.transform([query])
        cosine_similarities = linear_kernel(query_feature,corpus_feature).flatten()
        top_10 = np.argpartition(cosine_similarities,-5)[-5:]"""  
# Placeholder for the results

"""
results = []
# Assuming list_of_dfs is the list of your sub-corpuses created earlier
for sub_corpus_df in sub_corpus_list:
    print("Transform sub")
    sub_corpus_matrix = vectorizer.transform(sub_corpus_df["text"])
    
    for _, query in queries.iterrows():
        print("Treat query")
        query_vector = vectorizer.transform([query['text']])
        
        # Compute cosine similarities
        cosine_similarities = linear_kernel(query_vector, sub_corpus_matrix).flatten()
        
        # Get top k corpus indices from this sub-corpus
        top_k_indices = cosine_similarities.argsort()[-10:][::-1]  # Here, k=10
        
        for index in top_k_indices:
            results.append({
                'query_id': query['_id'],
                'corpus_id': sub_corpus_df.iloc[index]['_id']
            })

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('query_corpus_mapping.csv', index=False)
"""
def find_top_k_matches(query):
    print("Transform query")
    query_vector = vectorizer.transform([query['text']])

    print("Cosine Sim")
    # Compute cosine similarities
    cosine_similarities = linear_kernel(query_vector, sub_corpus_matrix).flatten()

    print("K vals")
    # Get top k corpus indices from this sub-corpus
    top_k_indices = cosine_similarities.argsort()[-10:][::-1]  # Here, k=10
    
    local_results = []
    for index in top_k_indices:
        local_results.append({
            'query_id': query['query_id'],
            'corpus_id': sub_corpus_df.iloc[index]['corpus_id']
        })
    print("Finished query !")
    return local_results
"""   
def find_top_k_matches(query):
    query_vector = vectorizer.transform([query['text']])
    cosine_similarities = []

    for idx in range(sub_corpus_matrix.shape[0]):
        corpus_vector = sub_corpus_matrix[idx].toarray().flatten()
        similarity = cosine_distance(query_vector.toarray().flatten(), corpus_vector)
        cosine_similarities.append(similarity)

    # Get top k corpus indices from this sub-corpus
    top_k_indices = np.argsort(cosine_similarities)[-10:][::-1]  # Here, k=10
    
    local_results = []
    for index in top_k_indices:
        local_results.append({
            'query_id': query['query_id'],
            'corpus_id': sub_corpus_df.iloc[index]['corpus_id']
        })
    
    return local_results
"""
results = []

# Assuming list_of_dfs is the list of your sub-corpuses created earlier
for sub_corpus_df in sub_corpus_list:
    print("Transforming")
    sub_corpus_matrix = vectorizer.transform(sub_corpus_df["text"])
    print("Parallelizing")
    # Use ProcessPoolExecutor to parallelize the inner loop
    with ProcessPoolExecutor() as executor:
        results.extend(executor.map(find_top_k_matches, queries_test.iterrows()))

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('query_corpus_mapping.csv', index=False)
        
        
