In [111]:
import numpy as np
import pandas as pd
import string
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import re
from collections import Counter
from math import log
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
tqdm.pandas()

# Data Preparation

In [81]:
corpus = pd.read_json('data/corpus.jsonl', lines=True).sort_values(by=["_id"]).rename(columns={"_id": "corpus-id"})
corpus

Unnamed: 0,corpus-id,text
1000000,0,The presence of communication amid scientific ...
966376,8,"In June 1942, the United States Army Corps of ..."
468831,12,Tutorial: Introduction to Restorative Justice....
1000001,16,The approach is based on a theory of justice t...
306952,23,Phloem is a conductive (or vascular) tissue fo...
...,...,...
950989,8841780,Wolves don't hide. They don't even live in cav...
395590,8841787,The UNHCR Country Representative in Kenya. Str...
93101,8841790,2. Describe the misery at Kakuma. 3. Compariso...
669122,8841800,Following the death of his employer and mentor...


In [82]:
queries = pd.read_json(path_or_buf='data/queries.jsonl', lines=True).sort_values(by=["_id"])
queries['text'] = queries['text'].str.strip()#.apply(tokenize)
queries = queries.drop(columns=["metadata"]).rename(columns={"_id": "query-id"})
queries

Unnamed: 0,query-id,text
506217,2,Androgen receptor define
65864,3,Another name for the primary visual cortex is
372466,4,Defining alcoholism as a disease is associated...
326447,5,ECT is a treatment that is used for
117580,6,"Ebolavirus is an enveloped virus, which means"
...,...,...
158901,1185863,why did rachel carson die
83120,1185864,definition of ramen
7634,1185865,amex india customer care number
1,1185868,_________ justice is designed to repair the ha...


In [83]:
query_corpus_train_map = pd.read_csv("data/task1_train.tsv", sep="\t")
query_corpus_train_map.sort_values(by="query-id")

Unnamed: 0,query-id,corpus-id,score
70257,3,1142680,1
395137,4,5613529,1
346352,5,4956428,1
125307,6,1931409,1
66896,8,1094214,1
...,...,...,...
169115,1185863,2545716,1
88577,1185864,1408016,1
8141,1185865,229186,1
1,1185868,16,1


In [84]:
queries_train = pd.merge(queries, query_corpus_train_map, left_on='query-id', right_on='query-id', how='inner').drop(columns=[ "score","corpus-id"])
queries_train_subset = queries_train.iloc[:7437, :]
queries_train_subset

Unnamed: 0,query-id,text
0,3,Another name for the primary visual cortex is
1,4,Defining alcoholism as a disease is associated...
2,5,ECT is a treatment that is used for
3,6,"Ebolavirus is an enveloped virus, which means"
4,8,"In humans, the normal set point for body tempe..."
...,...,...
7432,18204,anger is fear
7433,18205,anger management definition
7434,18208,angie baby meaning
7435,18209,angie lindvall


In [85]:
df_test = pd.read_csv("data/task1_test.tsv", sep="\t")
queries_test = pd.merge(queries, df_test, left_on='query-id', right_on='query-id', how='inner').drop(columns=["id"])
queries_test

Unnamed: 0,query-id,text
0,2,Androgen receptor define
1,1215,3 levels of government in canada and their res...
2,1288,3/5 of 60
3,1576,60x40 slab cost
4,2235,Bethel University was founded in what year
...,...,...
7432,1102335,why do people buy cars
7433,1102351,why do jefferson and stanton include these sim...
7434,1102390,why do children get aggressive
7435,1102393,why do celebrate st patrick's day


# WATCH OUT HERE IS CRITIAL CHANGING CODE ...

In [118]:
# queries2 = queries_test
queries2 = queries_train_subset
corpus2 = corpus
print(f"Used corpus length : {len(corpus2)}")
print(f"Used queries length : {len(queries2)}")

Used corpus length : 1471406
Used queries length : 7437


# TF-IDF Functions

In [87]:
# Download necessary resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

STEMMER = PorterStemmer()
LEMMATIZER = WordNetLemmatizer()

# Precompile regex patterns for efficiency
HTML_PATTERN = re.compile("(<.*?>)")
NON_ASCII_DIGITS_PATTERN = re.compile("(\\W|\\d)")
NON_ASCII_CHARS_PATTERN = re.compile(r'[^\x00-\x7F]+')
PUNCTUATION_PATTERN = re.compile(f"[{re.escape(string.punctuation)}]")

# Convert stopwords list to set for faster lookup
STOPWORDS_SET = set(stopwords.words("english"))

In [88]:
from gensim.models import KeyedVectors
from gensim import downloader as api

try:
    model = KeyedVectors.load('data/glove.model.d2v')
except:
    print("model not found, loading from api")
    model = api.load("glove-wiki-gigaword-50")
    model.save('data/glove.model.d2v')

In [89]:
def preprocess_text(text):
    """Optimized text preprocessing function."""
    
    # Convert to lowercase early on
    text = text.lower()
    
    # Cleaning using compiled regex patterns
    text = HTML_PATTERN.sub("", text)
    text = NON_ASCII_DIGITS_PATTERN.sub("", text)
    text = NON_ASCII_CHARS_PATTERN.sub('', text)
    text = PUNCTUATION_PATTERN.sub('', text)
    
    # Tokenization
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords, and then perform Stemming and Lemmatization
    preprocessed_tokens = [
        # STEMMER.stem(LEMMATIZER.lemmatize(word))
        STEMMER.stem(word)
        for word in tokens
        if word not in STOPWORDS_SET
    ]
    
    return preprocessed_tokens

stemmer = PorterStemmer()

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    def is_bad_word(s):
        return len(s) <= 1 or any(i not in string.printable or i in string.digits for i in s)

    words = text.split()
    cleaned_words = [word for word in words if not is_bad_word(word)]
    text = " ".join(cleaned_words)

    # the text contains many unidentified character, we decide to keep only ASCII characters
    text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)

    #remove punctuation and digits
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    text = text.translate(str.maketrans('', '', string.digits))

    # Tokenize the text (split it into words)
    words = nltk.word_tokenize(text)

    # Remove stopwords
    stopwords_ = stopwords.words('english')
    words = [stemmer.stem(word) for word in words if word not in stopwords_ and word in model]

    # Lemmatize words
    #words = [lemmatizer.lemmatize(word) for word in words]
    return words

In [91]:
def populate_tfidf_dataframe(documents, vocabulary):
    # Create a list of dictionaries with term frequencies
    list_of_dicts = [Counter(doc) for doc in documents]
    
    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(list_of_dicts).fillna(0)
    
    # Reorder columns according to the vocabulary and fill missing columns with 0
    df = df.reindex(columns=vocabulary, fill_value=0)
    
    return df

def populate_tfidf_dataframe_sparse(documents, vocabulary):
    # Create a sparse matrix to hold the term frequencies
    tf_matrix = lil_matrix((len(documents), len(vocabulary)), dtype=int)

    # Map each word in the vocabulary to its column index for faster lookup
    vocab_index_map = {word: idx for idx, word in enumerate(vocabulary)}

    for i, doc in enumerate(documents):
        for word in doc:
            if word in vocab_index_map:
                tf_matrix[i, vocab_index_map[word]] += 1

    return tf_matrix

In [92]:
def tfidf(corpus_text):
    print("Process docs ... 2")
    documents = corpus_text.progress_apply(lambda x: preprocess_text(x))

    print("Create vocab ...")
    vocabulary = list(set(word for doc in documents for word in doc))
    vocabulary.sort()

    print("Compute tf ...")
    tf_matrix = populate_tfidf_dataframe_sparse(documents, vocabulary)

    print("Compute idf ...")
    doc_count = len(documents)
    df = (tf_matrix > 0).sum(axis=0)
    idf = np.log(doc_count / df)

    print("Compute tf-idf ...")
    tf_matrix = tf_matrix.tocsr()
    tf_matrix = tf_matrix.multiply(1 / tf_matrix.sum(axis=1))
    tfidf_matrix = tf_matrix.multiply(idf)

    print("Done!")
    return documents, tfidf_matrix, vocabulary, idf

In [93]:
def preprocess_query(query):
    """Tokenize, stem, and remove stopwords from the query."""
    return preprocess_text(query)  

# Corpus Processing

In [97]:
%%time
_, tfidf, vocabulary, idf = tfidf(corpus2["text"])
len(corpus2["text"])

Process docs ... 2


100%|██████████| 1471406/1471406 [17:40<00:00, 1388.11it/s]


Create vocab ...
Compute tf ...
Compute idf ...
Compute tf-idf ...


  tf_matrix = tf_matrix.multiply(1 / tf_matrix.sum(axis=1))


Done!
CPU times: total: 6min 16s
Wall time: 19min 46s


1471406

In [98]:
import pickle

with open('output/tfidf-ay.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

with open('output/idf-ay.pkl', 'wb') as f:
    pickle.dump(idf, f)
    
with open('output/vocabulary-ay.pkl', 'wb') as f:
    pickle.dump(vocabulary, f)

In [108]:
len(vocabulary)

135442

In [39]:
tfidf = pd.read_pickle('output/tfidf-stem.pkl')
idf = pd.read_pickle('output/idf-stem.pkl')
vocabulary = pd.read_pickle('output/vocabulary-stem.pkl')

In [107]:
len(vocabulary)

135442

# Queries Processing

In [None]:
def top_k_indices_sparse(matrix: csr_matrix, k: int):
    """Get top k indices for each row of a sparse matrix."""
    
    # Placeholder list for top k indices for each row
    top_indices = []
    
    # Iterate over each row
    print('Iterate over each row ...')
    for i in range(matrix.shape[0]):
        row_data = matrix.data[matrix.indptr[i]:matrix.indptr[i+1]]
        row_indices = matrix.indices[matrix.indptr[i]:matrix.indptr[i+1]]
        
        # If the row has less than k values, take them all
        if len(row_data) < k:
            top_indices.append(row_indices)
        else:
            # Sort the row data and get top k indices
            sorted_indices = np.argsort(-row_data)
            top_indices.append(row_indices[sorted_indices[:k]])
    
    return top_indices

In [None]:
def predict_documents(tfidf_matrix_normalized, query_vectors, k):
    """Process multiple queries and return ranked document indices for each query."""
    
    # Compute cosine similarities using matrix operations
    print("Compute cosine similarities ...")
    similarity_matrix = cosine_similarity(query_vectors, tfidf_matrix_normalized, dense_output=False)
    
    # Get document indices ranked by relevance for each query
    print("Rank documents ...")
    # print(similarity_matrix.shape)
    # ranked_doc_indices = np.argsort(-similarity_matrix)[:, :k]
    ranked_doc_indices = top_k_indices_sparse(similarity_matrix, k)
    
    return ranked_doc_indices

In [127]:
def predictions_to_ids_ranking(corpus, queries, prediction):
    
    print("Mapping results ...")
    mapped_results = [corpus.iloc[row]['corpus-id'].values for row in prediction]
    df = pd.DataFrame(mapped_results)
    df.insert(0, 'query-id', queries['query-id'].iloc[:len(df)])
    df.columns = ['query-id'] + [f'rank-{i}' for i in range(1, df.shape[1])]
    return df

# def predictions_to_ids_ranking(corpus, queries, prediction):
#     # Map the prediction rows to the corresponding 'corpus-id' values from the corpus
#     mapped_results = [corpus.iloc[row]['corpus-id'].values.tolist() for row in prediction]
#     
#     # Create a DataFrame with 'id', 'corpus-id', and 'score' columns
#     df = pd.DataFrame({
#         'id': queries['query-id'].iloc[:len(mapped_results)],
#         'corpus-id': mapped_results,
#         'score': [-1 for _ in range(len(mapped_results))]
#     })
#     
#     return df
    
#predictions_to_ids_ranking(corpus2, queries2, prediction)

In [128]:
%%time

def vectorize_queries(queries_df, vocabulary, idf):
    """Convert each query in the DataFrame into its TF-IDF vector."""
    
    print("Process queries ...")
    # Preprocess all queries
    queries_df.loc[:,'processed'] = queries_df['text'].apply(preprocess_query)

    print("Prepare dataframe ...")
    # Flatten for efficient computation
    flattened = queries_df.explode('processed')

    print("Compute  tf ...")
    # Get dummy variables for each term
    dummies = pd.get_dummies(flattened['processed']).groupby(flattened.index).sum()

    print("Rename tf dataframe ...")
    # Create a DataFrame for term frequencies, using only the columns in our vocabulary
    tf_df = dummies.reindex(columns=vocabulary, fill_value=0)

    print("Multiply by idf ...")
    # Convert idf dictionary to a Series for easier multiplication
    idf_series = pd.Series(idf)
    
    # Compute TF-IDF
    tfidf_df = tf_df.multiply(idf_series, axis=1)
    print("Done !")
    return tfidf_df

def vectorize_queries(queries_df, vocabulary, idf):
    """Convert each query in the DataFrame into its TF-IDF vector."""
    
    print("Process queries 2 ...")
    # Preprocess all queries
    queries_df['processed'] = queries_df['text'].apply(preprocess_query)

    print("Initialize sparse matrix ...")
    num_queries = len(queries_df)
    num_terms = len(vocabulary)
    
    # Using a dictionary for term index lookup
    vocab_dict = {term: index for index, term in enumerate(vocabulary)}
    tf_matrix = lil_matrix((num_queries, num_terms))

    print("Compute  tf ...")
    # Populate the sparse matrix
    for idx, row in queries_df.iterrows():
        for term in row['processed']:
            if term in vocab_dict:
                tf_matrix[idx, vocab_dict[term]] += 1

    print("Multiply by idf ...")
    # Convert to CSR format for efficient multiplication and transform TFs to TF-IDF
    tfidf_matrix = (tf_matrix.tocsr()).multiply(idf)

    print("Done !")
    return tfidf_matrix

vectors = vectorize_queries(queries2, vocabulary, idf)
vectors

Process queries 2 ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Initialize sparse matrix ...
Compute  tf ...
Multiply by idf ...
Done !
CPU times: total: 672 ms
Wall time: 1.95 s


<7437x135442 sparse matrix of type '<class 'numpy.float64'>'
	with 28643 stored elements in COOrdinate format>

In [121]:
prediction = predict_documents(tfidf, vectors, 10)

Compute cosine similarities ...
Rank documents ...
Iterate over each row ...


In [129]:
%%time
map_ = predictions_to_ids_ranking(corpus,queries2,prediction)

Mapping results ...
CPU times: total: 234 ms
Wall time: 473 ms


In [130]:
map_

Unnamed: 0,query-id,rank-1,rank-2,rank-3,rank-4,rank-5,rank-6,rank-7,rank-8,rank-9,rank-10
0,3,3060840.0,4017713.0,6810084.0,3924145.0,4054972.0,1142680.0,4614442.0,6979298.0,1907989.0,4884133.0
1,4,3483260.0,7882593.0,5613530.0,1897001.0,6161435.0,6207070.0,6640812.0,1062386.0,5745282.0,1006351.0
2,5,3823109.0,1474051.0,3090587.0,4956433.0,4783951.0,7296715.0,7841695.0,5034673.0,7296711.0,4997248.0
3,6,1931415.0,607642.0,8147639.0,5799740.0,5751818.0,7640836.0,5441940.0,5371373.0,6391108.0,7746209.0
4,8,3232970.0,2380588.0,3852443.0,294627.0,142019.0,6977529.0,993853.0,7926255.0,3740282.0,2832726.0
...,...,...,...,...,...,...,...,...,...,...,...
7432,18204,622811.0,3699536.0,4027354.0,2310510.0,366533.0,5801430.0,4913273.0,2128451.0,5801434.0,912451.0
7433,18205,912451.0,1235451.0,4027354.0,7876668.0,5801430.0,912448.0,2310510.0,5801434.0,366533.0,4913273.0
7434,18208,869926.0,4975091.0,2824382.0,4396909.0,4975089.0,6912140.0,2566514.0,5287460.0,4975090.0,1090706.0
7435,18209,828552.0,869926.0,4975091.0,2824382.0,4396909.0,6912140.0,2566514.0,4975089.0,4975090.0,1090706.0


In [None]:
def map_to_csv(map_):
    map_ = map_.drop(columns=['id'])
    map_.to_csv("output/predictions-ay.csv", index=True, header=True)

In [117]:
map_to_csv(map_)

In [124]:
print(queries2[queries2["query-id"] == map_.iloc[0]['query-id']]["text"].values[0])

KeyError: 'query-id'

In [46]:
query_corpus_train_map[query_corpus_train_map["query-id"] == map_.iloc[0]['query-id']]

Unnamed: 0,query-id,corpus-id,score


In [36]:
corpus[corpus["corpus-id"] == 1142680]["text"].values[0]

'The primary (parts of the cortex that receive sensory inputs from the thalamus) visual cortex is also known as V1, V isual area one, and the striate cortex.The extrastriate areas consist of visual areas two (V2), three (V3), four (V4), and five (V5).he primary visual cortex is the best-studied visual area in the brain. In all mammals studied, it is located in the posterior pole of the occipital cortex (the occipital cortex is responsible for processing visual stimuli).'

In [47]:
for idx in map_.iloc[0][1:]:
    print(corpus[corpus["corpus-id"] == idx]["text"].values[0])
    print('__________________________________________________________')

The term SARMS stands for âSelective Androgen Receptor Modulators .â Androgens are a class of hormones that serve as ligands that bind to cellular androgen receptors. The androgen receptor is involved in a complex signal transduction pathway that ultimately results in greater expression of specific genes.
__________________________________________________________
Hormone therapy for prostate cancerâalso called androgen suppression therapy or androgen deprivation therapyâcan block the production and use of androgens (3). Currently available treatments can: Reduce androgen production by the testicles. Block the action of androgens in the body.
__________________________________________________________
Antiandrogens, are compounds either natural or synthetic, that have the ability to lower the levels of testosterone and DHT (the androgens) and/or prevent the natural androgen pathways and their functions by blocking the androgen receptors in tissue.
________________________________

In [131]:
def accuracy(true_map, given_map):
    merged_map = pd.merge(true_map, given_map, on='query-id', how='inner')

    counter = 0

    # Iterate over each row in the merged dataframe
    for _, row in merged_map.iterrows():
        # Check if corpus-id exists in any of the rank columns
        if row['corpus-id'] in [row[f'rank-{i}'] for i in range(1, 11)]:
            counter += 1

    # If you want the accuracy as a fraction of total queries, you can return counter/len(merged_map)
    return counter / len(merged_map) * 100
    
accuracy(query_corpus_train_map, map_)

36.71960237604558