In [1]:
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
import random
import time
from collections import defaultdict
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import re
from collections import Counter
from math import log


# Data Preparation

In [2]:
corpus = pd.read_json('data/corpus.jsonl', lines=True).sort_values(by=["_id"]).rename(columns={"_id": "corpus-id"})
corpus

Unnamed: 0,corpus-id,text
1000000,0,The presence of communication amid scientific ...
966376,8,"In June 1942, the United States Army Corps of ..."
468831,12,Tutorial: Introduction to Restorative Justice....
1000001,16,The approach is based on a theory of justice t...
306952,23,Phloem is a conductive (or vascular) tissue fo...
...,...,...
950989,8841780,Wolves don't hide. They don't even live in cav...
395590,8841787,The UNHCR Country Representative in Kenya. Str...
93101,8841790,2. Describe the misery at Kakuma. 3. Compariso...
669122,8841800,Following the death of his employer and mentor...


In [3]:
queries = pd.read_json(path_or_buf='data/queries.jsonl', lines=True).sort_values(by=["_id"])
queries['text'] = queries['text'].str.strip()#.apply(tokenize)
queries = queries.drop(columns=["metadata"]).rename(columns={"_id": "query-id"})
queries

Unnamed: 0,query-id,text
506217,2,Androgen receptor define
65864,3,Another name for the primary visual cortex is
372466,4,Defining alcoholism as a disease is associated...
326447,5,ECT is a treatment that is used for
117580,6,"Ebolavirus is an enveloped virus, which means"
...,...,...
158901,1185863,why did rachel carson die
83120,1185864,definition of ramen
7634,1185865,amex india customer care number
1,1185868,_________ justice is designed to repair the ha...


In [4]:
query_corpus_train_map = pd.read_csv("data/task1_train.tsv", sep="\t")
query_corpus_train_map.sort_values(by="query-id")

Unnamed: 0,query-id,corpus-id,score
70257,3,1142680,1
395137,4,5613529,1
346352,5,4956428,1
125307,6,1931409,1
66896,8,1094214,1
...,...,...,...
169115,1185863,2545716,1
88577,1185864,1408016,1
8141,1185865,229186,1
1,1185868,16,1


In [5]:
queries_train = pd.merge(queries, query_corpus_train_map, left_on='query-id', right_on='query-id', how='inner').drop(columns=[ "score","corpus-id"])
queries_train_subset = queries_train.iloc[:7437, :]
queries_train_subset

Unnamed: 0,query-id,text
0,3,Another name for the primary visual cortex is
1,4,Defining alcoholism as a disease is associated...
2,5,ECT is a treatment that is used for
3,6,"Ebolavirus is an enveloped virus, which means"
4,8,"In humans, the normal set point for body tempe..."
...,...,...
7432,18204,anger is fear
7433,18205,anger management definition
7434,18208,angie baby meaning
7435,18209,angie lindvall


In [6]:
df_test = pd.read_csv("data/task1_test.tsv", sep="\t")
queries_test = pd.merge(queries, df_test, left_on='query-id', right_on='query-id', how='inner').drop(columns=["id"])
queries_test

Unnamed: 0,query-id,text
0,2,Androgen receptor define
1,1215,3 levels of government in canada and their res...
2,1288,3/5 of 60
3,1576,60x40 slab cost
4,2235,Bethel University was founded in what year
...,...,...
7432,1102335,why do people buy cars
7433,1102351,why do jefferson and stanton include these sim...
7434,1102390,why do children get aggressive
7435,1102393,why do celebrate st patrick's day


# WATCH OUT HERE IS CRITIAL CHANGING CODE ...

In [27]:
queries2 = queries_train_subset.iloc[: 100]
corpus2 = corpus.iloc[:10000]
corpus2 = pd.concat([corpus2, corpus[corpus["corpus-id"] == 1142680]])
print(f"Used corpus length : {len(corpus2)}")
print(f"Used queries length : {len(queries2)}")

Used corpus length : 10001
Used queries length : 100


# TF-IDF Functions

In [28]:
# Download necessary resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

STEMMER = PorterStemmer()
LEMMATIZER = WordNetLemmatizer()

# Precompile regex patterns for efficiency
HTML_PATTERN = re.compile("(<.*?>)")
NON_ASCII_DIGITS_PATTERN = re.compile("(\\W|\\d)")
NON_ASCII_CHARS_PATTERN = re.compile(r'[^\x00-\x7F]+')

# Convert stopwords list to set for faster lookup
STOPWORDS_SET = set(stopwords.words("english"))

In [9]:
def preprocess_text(text):
    """Optimized text preprocessing function."""
    
    # Cleaning
    text = HTML_PATTERN.sub("", text)
    text = NON_ASCII_DIGITS_PATTERN.sub(" ", text)
    text = NON_ASCII_CHARS_PATTERN.sub('', text)
    text = "".join([ch for ch in text if ch not in string.punctuation])
    
    # Tokenization
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords, and then perform Stemming and Lemmatization
    preprocessed_tokens = [
        STEMMER.stem(LEMMATIZER.lemmatize(word))
        for word in tokens
        if word.lower() not in STOPWORDS_SET
    ]
    
    return preprocessed_tokens

In [10]:
def parallel_preprocess_texts(texts):
    with Pool() as pool:
        preprocessed_batches = pool.map(preprocess_text, texts)
    return preprocessed_batches

In [11]:
def populate_tfidf_dataframe(documents, vocabulary):
    # Create a list of dictionaries with term frequencies
    list_of_dicts = [Counter(doc) for doc in documents]
    
    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(list_of_dicts).fillna(0)
    
    # Reorder columns according to the vocabulary and fill missing columns with 0
    df = df.reindex(columns=vocabulary, fill_value=0)
    
    return df

In [12]:
def tfidf(corpus_text):
    # Parallel tokenization and preprocessing
    print("Process docs ...")
    documents = corpus_text.apply(lambda x: preprocess_text(x))

    print("Create vocab ...")
    # Create the vocabulary
    vocabulary = list(set(word for doc in documents for word in doc))
    vocabulary.sort()
    
    # Use the helper function to create and populate the DataFrame for term frequencies
    print("Compute tf ...")
    df = populate_tfidf_dataframe(documents, vocabulary)
            
    # Compute IDF values
    print("Compute idf ...")
    doc_count = len(documents)
    idf = df[df > 0].count().apply(lambda x: log(doc_count / x))
    
    # Compute TF-IDF values
    print("Compute tf-idf ...")
    tfidf_df = df.apply(lambda x: x / x.sum(), axis=1).multiply(idf)
    print("Done !")
    return documents, tfidf_df, vocabulary, idf

In [13]:
def preprocess_query(query):
    """Tokenize, stem, and remove stopwords from the query."""
    return preprocess_text(query)  

In [14]:
def vectorize_query(query, vocabulary, idf):
    """Convert the query into its TF-IDF vector."""
    query_tf = Counter(preprocess_query(query))
    query_vector = [query_tf.get(term, 0) * idf[term] for term in vocabulary]
    return np.array(query_vector)

In [15]:
def predict_documents(tfidf_matrix_normalized, query_vectors, k):
    """Process multiple queries and return ranked document indices for each query."""
    # Compute cosine similarities using matrix operations
    similarity_matrix = np.dot(query_vectors, tfidf_matrix_normalized.T)
    
    # Get document indices ranked by relevance for each query
    ranked_doc_indices = np.argsort(-similarity_matrix)[:, :k]
    return ranked_doc_indices

# Corpus Processing

In [37]:
%%time
_, tfidf_df, vocabulary, idf = tfidf(corpus2["text"])
len(corpus2["text"])

Process docs ...
Create vocab ...
Compute tf ...
Compute idf ...
Compute tf-idf ...
Done !
CPU times: user 58.9 s, sys: 16.1 s, total: 1min 15s
Wall time: 1min 19s


10001

In [30]:
l2_norms = np.linalg.norm(tfidf_df.values, axis=1)
df_normalized = tfidf_df.divide(l2_norms, axis=0)
df_normalized

Unnamed: 0,aa,aaa,aac,aacn,aadvantag,aafcu,aafp,aaicama,aalsmeer,aamc,...,zugspitz,zulay,zune,zurich,zwick,zwitterion,zygnematacea,zygot,zz,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
tfidf_np = np.array(df_normalized.values)
tfidf_np.shape

(10001, 22700)

# Queries Processing

## Way 1

In [32]:
print("Vectorizing...")
queries2.loc[:, "list"] = queries2["text"].apply(lambda x: vectorize_query(x, vocabulary, idf))
print("Stacking...")
query_vectors = np.vstack(queries2["list"])
query_vectors.shape

Vectorizing...
Stacking...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  queries2.loc[:, "list"] = queries2["text"].apply(lambda x: vectorize_query(x, vocabulary, idf))


(100, 22700)

In [33]:
prediction = predict_documents(tfidf_np, query_vectors, 10)
prediction

array([[10000,  5238,  2273,  9770,  8353,  8531,  6516,  3475,  9389,
         3656],
       [ 4257,  5972,   702,  4258,  4975,  5531,  5780,   212,  4161,
         7689],
       [ 5297,  8655,  6353,   268,  8656,  1994,   938,  7358,  6031,
          939],
       [ 9642,  8167,  5016,  5802,  6052,  5801,  6728,  3988,  7364,
         7225],
       [ 6327,  1659,  4757,  8241,  6325,  8240,  1658,  1660,  8804,
         6326],
       [ 8907,  1678,  8910,  8908,  6978,  8909,  8911,  5280,   932,
         9600],
       [ 4051,  9245,  5931,  6535,  3321,   475,  1163,  4386,  6348,
         1462],
       [ 2832,  5264,  1796,  8660,  7287,  2950,  2082,  3116,  2833,
         6968],
       [ 3775,  8317,  2386,  3716,  9616,  3774,  8645,  3776,   454,
         3715],
       [ 8365,  4715,  4714,   708,  8240,  6327,  8804,  1659,  8241,
         6792],
       [ 5213,  3671,  9203,  8530,  9845,  3084,  7911,  1464,  3800,
          247],
       [ 8308,  5195,  6801,  9207,  7185, 

In [34]:
def predictions_to_ids_ranking(corpus, queries, prediction):
    
    mapped_results = [corpus.iloc[row]['corpus-id'].values for row in prediction]
    df = pd.DataFrame(mapped_results)
    df.insert(0, 'query-id', queries['query-id'].iloc[:len(df)])
    df.columns = ['query-id'] + [f'rank-{i}' for i in range(1, df.shape[1])]
    return df
    
predictions_to_ids_ranking(corpus2, queries2, prediction)

Unnamed: 0,query-id,rank-1,rank-2,rank-3,rank-4,rank-5,rank-6,rank-7,rank-8,rank-9,rank-10
0,3,1142680,38226,17077,71221,61031,62371,47722,25388,68448,26772
1,4,31090,43629,5359,31095,36295,40499,42198,1511,30450,56230
2,5,38677,63262,46549,1944,63267,15047,7010,53956,44068,7012
3,6,70257,59694,36602,42322,44232,42318,49395,29277,53986,53001
4,8,46284,12742,34756,60260,46276,60258,12737,12743,64341,46283
...,...,...,...,...,...,...,...,...,...,...,...
95,266,62053,63693,39597,47317,7091,62046,45772,35333,11605,35885
96,271,42603,41719,71460,42604,19146,28528,59641,1911,27691,32929
97,275,13048,13300,64172,56340,32937,62478,58869,41825,24045,25105
98,279,33367,23615,53312,14397,8514,32130,14401,71242,10359,64798


## Way 2

In [35]:
def vectorize_queries(queries_df, vocabulary, idf):
    """Convert each query in the DataFrame into its TF-IDF vector."""
    
    print("Process queries ...")
    # Preprocess all queries
    queries_df.loc[:,'processed'] = queries_df['text'].apply(preprocess_query)

    print("Prepare dataframe ...")
    # Flatten for efficient computation
    flattened = queries_df.explode('processed')

    print("Compute  tf ...")
    # Get dummy variables for each term
    dummies = pd.get_dummies(flattened['processed']).groupby(flattened.index).sum()

    print("Rename tf dataframe ...")
    # Create a DataFrame for term frequencies, using only the columns in our vocabulary
    tf_df = dummies.reindex(columns=vocabulary, fill_value=0)

    print("Multiply by idf ...")
    # Convert idf dictionary to a Series for easier multiplication
    idf_series = pd.Series(idf)
    
    # Compute TF-IDF
    tfidf_df = tf_df.multiply(idf_series, axis=1)
    print("Done !")
    return tfidf_df

vectors = vectorize_queries(queries2, vocabulary, idf)

Process queries ...
Prepare dataframe ...
Compute  tf ...
Rename tf dataframe ...
Multiply by idf ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  queries_df.loc[:,'processed'] = queries_df['text'].apply(preprocess_query)


Done !


In [36]:
predictions_to_ids_ranking(corpus2,queries2,predict_documents(tfidf_np, vectors, 10))

Unnamed: 0,query-id,rank-1,rank-2,rank-3,rank-4,rank-5,rank-6,rank-7,rank-8,rank-9,rank-10
0,3,1142680,38226,17077,71221,61031,62371,47722,25388,68448,26772
1,4,31090,43629,5359,31095,36295,40499,42198,1511,30450,56230
2,5,38677,63262,46549,1944,63267,15047,7010,53956,44068,7012
3,6,70257,59694,36602,42322,44232,42318,49395,29277,53986,53001
4,8,46284,12742,34756,60260,46276,60258,12737,12743,64341,46283
...,...,...,...,...,...,...,...,...,...,...,...
95,266,62053,63693,39597,47317,7091,62046,45772,35333,11605,35885
96,271,42603,41719,71460,42604,19146,28528,59641,1911,27691,32929
97,275,13048,13300,64172,56340,32937,62478,58869,41825,24045,25105
98,279,33367,23615,53312,14397,8514,32130,14401,71242,10359,64798
