In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import string
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import re
import pickle
from collections import Counter
from gensim.models import KeyedVectors
from gensim import downloader as api
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
tqdm.pandas()

## Data Preprocessing

In [3]:
# Load the corpus (one JSON object per line), order it by document id, and
# expose that id under the name "corpus-id" with a fresh 0..n-1 row index.
corpus = (
    pd.read_json('data/corpus.jsonl', lines=True)
    .sort_values(by=["_id"])
    .rename(columns={"_id": "corpus-id"})
    .reset_index(drop=True)
)
corpus

Unnamed: 0,corpus-id,text
1000000,0,The presence of communication amid scientific ...
966376,8,"In June 1942, the United States Army Corps of ..."
468831,12,Tutorial: Introduction to Restorative Justice....
1000001,16,The approach is based on a theory of justice t...
306952,23,Phloem is a conductive (or vascular) tissue fo...
...,...,...
950989,8841780,Wolves don't hide. They don't even live in cav...
395590,8841787,The UNHCR Country Representative in Kenya. Str...
93101,8841790,2. Describe the misery at Kakuma. 3. Compariso...
669122,8841800,Following the death of his employer and mentor...


In [4]:
# Load every query, ordered by id, with surrounding whitespace stripped from
# the text; the "metadata" column is dropped and "_id" becomes "query-id".
queries = (
    pd.read_json(path_or_buf='data/queries.jsonl', lines=True)
    .sort_values(by=["_id"])
    .assign(text=lambda frame: frame['text'].str.strip())
    .drop(columns=["metadata"])
    .rename(columns={"_id": "query-id"})
)

# Keep only the queries listed in the test split; the split's own "id" column
# is discarded after the join.
df_test = pd.read_csv("data/task1_test.tsv", sep="\t")
queries_test = queries.merge(df_test, on='query-id', how='inner').drop(columns=["id"])

## Free queries
queries = None

queries_test

Unnamed: 0,query-id,text
0,2,Androgen receptor define
1,1215,3 levels of government in canada and their res...
2,1288,3/5 of 60
3,1576,60x40 slab cost
4,2235,Bethel University was founded in what year
...,...,...
7432,1102335,why do people buy cars
7433,1102351,why do jefferson and stanton include these sim...
7434,1102390,why do children get aggressive
7435,1102393,why do celebrate st patrick's day


### Importing Model

In [11]:
# Load the GloVe vectors from the local cache; download and cache them on a miss.
# Only cache-miss style failures are caught so that Ctrl-C and genuine bugs
# (e.g. a NameError) are no longer silently turned into a re-download.
try:
    model = KeyedVectors.load('data/glove.model.d2v')
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    print("404, Now Fetching Model ...")
    model = api.load("glove-wiki-gigaword-50")
    model.save('data/glove.model.d2v')

### Prepare text processing constants

In [37]:
# Shared Porter stemmer instance, reused for every token.
STEMMER = PorterStemmer()
# NOTE(review): despite its name, this pattern matches the literal six-character
# text "backslash, u, four hex digits" (an escaped code point left in the raw
# text), not decoded non-ASCII characters. Confirm this is the intent.
NON_ASCII_PATTERN = re.compile(r'\\u[0-9a-fA-F]{4}')
# English stopwords as a set, so membership tests are O(1).
STOPWORDS_SET = set(stopwords.words("english"))

In [38]:
# One translation table for both clean-up steps: every punctuation character
# becomes a space and every digit is deleted. Built once instead of per call.
_PUNCT_DIGIT_TABLE = str.maketrans(
    string.punctuation, ' ' * len(string.punctuation), string.digits
)


def preprocess_text(text):
    """
    Turn a raw text into a list of stemmed tokens.

    Steps, in order:
    1. Lowercase the text.
    2. Remove everything matched by NON_ASCII_PATTERN.
    3. Replace punctuation with spaces and delete digits.
    4. Tokenize with NLTK's word_tokenize.
    5. Drop tokens that are stopwords or are missing from the embedding
       model's vocabulary (the check is made on the unstemmed token).
    6. Stem the surviving tokens with the shared PorterStemmer.

    Args:
    - text (str): The input text to preprocess.

    Returns:
    - list of str: The stemmed tokens that survived filtering.
    """
    text = NON_ASCII_PATTERN.sub('', text.lower())
    text = text.translate(_PUNCT_DIGIT_TABLE)

    # The stopword list is not reloaded here: STOPWORDS_SET already holds it,
    # and re-reading it per document was pure overhead.
    return [
        STEMMER.stem(word)
        for word in nltk.word_tokenize(text)
        if word not in STOPWORDS_SET and word in model
    ]

## TF-IDF Implementation

In [14]:
def populate_tfidf_dataframe_sparse(documents, vocabulary):
    """
    Build the raw term-count matrix for a collection of tokenized documents.

    Args:
    - documents (iterable of list of str): Tokenized documents, one list of words each.
    - vocabulary (list of str): Words that get a column; column j belongs to vocabulary[j].

    Returns:
    - lil_matrix: Integer matrix of shape (len(documents), len(vocabulary)) where
      entry (i, j) is the number of times vocabulary[j] occurs in document i.
      Words that are not in the vocabulary are ignored.
    """
    column_of = {term: col for col, term in enumerate(vocabulary)}
    counts = lil_matrix((len(documents), len(vocabulary)), dtype=int)

    for row, tokens in enumerate(documents):
        # Count each distinct token once, then write its total in a single assignment.
        for term, occurrences in Counter(tokens).items():
            col = column_of.get(term)
            if col is not None:
                counts[row, col] = occurrences

    return counts

In [15]:
def tfidf(corpus_text):
    """
    Computes the Term Frequency-Inverse Document Frequency (TF-IDF) matrix for the given corpus.

    Term frequencies are divided by the document's token count, and the IDF is
    the smoothed form log((N + 0.5) / (df + 0.5)).

    Args:
    - corpus_text (pandas.Series): Raw text documents. `progress_apply` is used,
      so `tqdm.pandas()` must have been called beforehand.

    Returns:
    - tuple: A tuple containing the following:
        1. documents (pandas.Series of list of str): Preprocessed documents.
        2. tfidf_matrix (csr_matrix): The computed TF-IDF matrix, one row per document.
        3. vocabulary (list of str): Sorted vocabulary extracted from the corpus.
        4. idf: Inverse document frequencies, one per vocabulary entry, as the
           1 x len(vocabulary) result of the sparse column sum.
    """
    print("Process docs ...")
    documents = corpus_text.progress_apply(preprocess_text)

    print("Create vocab ...")
    vocabulary = sorted({word for doc in documents for word in doc})

    print("Compute tf ...")
    tf_matrix = populate_tfidf_dataframe_sparse(documents, vocabulary).tocsr()

    print("Compute idf ...")
    doc_count = len(documents)
    df = (tf_matrix > 0).sum(axis=0)
    idf = np.log((doc_count + 0.5) / (df + 0.5))

    print("Compute tf-idf ...")
    # Documents whose tokens were all filtered out have length 0; dividing by 1
    # instead leaves their (empty) rows untouched and avoids a divide-by-zero.
    doc_lengths = np.asarray(tf_matrix.sum(axis=1), dtype=float)
    doc_lengths[doc_lengths == 0] = 1.0
    tf_normalized = tf_matrix.multiply(1.0 / doc_lengths).tocsr()
    # Convert back to CSR so the result matches the documented return type.
    tfidf_matrix = tf_normalized.multiply(idf).tocsr()

    print("Done!")
    return documents, tfidf_matrix, vocabulary, idf

### TF-IDF Corpus Processing

In [35]:
def tfidf_process_corpus():
    """
    Load the corpus TF-IDF artefacts from the pickle cache, computing and caching them on a miss.

    Reads the module-level `corpus` DataFrame and calls the module-level
    `tfidf` function when the cache has to be built.

    Returns:
    - tuple: (documents, tf_idf, vocabulary, idf), exactly as returned by `tfidf`.
    """
    TFIDF_FOLDER = "data/"
    FILE_NAME = "submission"

    # All four artefacts live in the same folder. The separate *_FOLDER names
    # used previously were never defined, so the cache could be neither read
    # nor written.
    tfidf_path = f'{TFIDF_FOLDER}tfidf-{FILE_NAME}.pkl'
    idf_path = f'{TFIDF_FOLDER}idf-{FILE_NAME}.pkl'
    vocabulary_path = f'{TFIDF_FOLDER}vocabulary-{FILE_NAME}.pkl'
    document_path = f'{TFIDF_FOLDER}document-{FILE_NAME}.pkl'

    try:
        # NOTE: pickle can execute arbitrary code; only load caches this notebook wrote.
        tf_idf = pd.read_pickle(tfidf_path)
        idf = pd.read_pickle(idf_path)
        vocabulary = pd.read_pickle(vocabulary_path)
        documents = pd.read_pickle(document_path)
        return documents, tf_idf, vocabulary, idf
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # Missing or truncated cache: fall through and rebuild. Any other
        # error is a real bug and is allowed to propagate.
        print("404, creating required metadata ...")

    documents, tf_idf, vocabulary, idf = tfidf(corpus["text"])

    artefacts = (
        (tfidf_path, tf_idf),
        (idf_path, idf),
        (vocabulary_path, vocabulary),
        (document_path, documents),
    )
    for path, artefact in artefacts:
        with open(path, 'wb') as f:
            pickle.dump(artefact, f)

    return documents, tf_idf, vocabulary, idf

In [None]:
%%time
# Build (or load) the corpus TF-IDF artefacts once and report how long it took.
# The returned tuple is the cell's last expression, so it is also displayed.
tfidf_process_corpus()

### TF-IDF Query Processing & Prediction

In [17]:
def tfidf_vectorize_queries(queries_df, vocabulary, idf):
    """
    Convert each query in the DataFrame into its TF-IDF vector.

    Queries go through the same `preprocess_text` pipeline as the corpus.
    Term counts are not length-normalized before being scaled by `idf`.

    Args:
    - queries_df (pandas.DataFrame): Must have a 'text' column. A 'processed'
      column holding the token lists is added to it in place.
    - vocabulary (list of str): Corpus vocabulary; position j is column j.
    - idf: Per-term inverse document frequencies, broadcastable over the columns.

    Returns:
    - csr_matrix: Shape (len(queries_df), len(vocabulary)); row i corresponds to
      the i-th row of `queries_df` by position, whatever its index labels are.
    """
    print("Process queries ...")
    # Preprocess all queries
    queries_df['processed'] = queries_df['text'].apply(preprocess_text)

    print("Initialize sparse matrix ...")
    num_queries = len(queries_df)
    num_terms = len(vocabulary)

    # Using a dictionary for term index lookup
    vocab_dict = {term: index for index, term in enumerate(vocabulary)}
    tf_matrix = lil_matrix((num_queries, num_terms))

    print("Compute  tf ...")
    # Rows are addressed by position, not by DataFrame label, so a non-default
    # index can neither misplace rows nor run out of bounds.
    for row_pos, terms in enumerate(queries_df['processed']):
        for term, count in Counter(terms).items():
            col = vocab_dict.get(term)
            if col is not None:
                tf_matrix[row_pos, col] = count

    print("Multiply by idf ...")
    # Convert to CSR format for efficient multiplication and transform TFs to TF-IDF
    tfidf_matrix = tf_matrix.tocsr().multiply(idf).tocsr()

    print("Done !")
    return tfidf_matrix

In [18]:
def top_k_indices_sparse(matrix: csr_matrix, k: int):
    """
    Get the column indices of the k largest stored values in each row of a CSR matrix.

    Args:
    - matrix (csr_matrix): Input matrix; only explicitly stored entries are considered.
    - k (int): Maximum number of indices to return per row.

    Returns:
    - list of numpy arrays: One array per row, holding at most k column indices
      ordered from the largest value to the smallest. Rows with fewer than k
      stored entries yield all of them, also in descending-value order.
    """
    top_indices = []

    print('Iterate over each row ...')
    for i in range(matrix.shape[0]):
        start, end = matrix.indptr[i], matrix.indptr[i + 1]
        row_data = matrix.data[start:end]
        row_indices = matrix.indices[start:end]

        # Always sort, including short rows, so the result is a ranking in
        # every case. The stable sort keeps ties in their stored order.
        order = np.argsort(-row_data, kind="stable")[:k]
        top_indices.append(row_indices[order])

    return top_indices

In [19]:
def tfidf_predict_documents(tfidf_matrix_normalized, query_vectors, k):
    """
    Rank corpus documents for every query by cosine similarity.

    Args:
    - tfidf_matrix_normalized: Corpus TF-IDF matrix, one row per document.
    - query_vectors: Query TF-IDF matrix, one row per query.
    - k (int): Maximum number of documents to keep per query.

    Returns:
    - list: One entry per query with the document row indices selected by
      `top_k_indices_sparse` from that query's similarity row.
    """
    print("Compute cosine similarities ...")
    # dense_output=False asks for a sparse similarity matrix rather than a dense one.
    similarities = cosine_similarity(
        query_vectors,
        tfidf_matrix_normalized,
        dense_output=False,
    )

    print("Rank documents ...")
    return top_k_indices_sparse(similarities, k)

In [20]:
def predictions_to_ids_ranking(corpus, queries, prediction):
    """
    Translate predicted corpus row positions into 'corpus-id' values, one row per query.

    Args:
    - corpus (pandas.DataFrame): Must have a 'corpus-id' column; `prediction`
      entries are positional row indices into it.
    - queries (pandas.DataFrame): Must have a 'query-id' column, ordered like `prediction`.
    - prediction (iterable): For each query, the positional indices of its documents.

    Returns:
    - pandas.DataFrame: Columns 'id' (query id), 'corpus-id' (list of document
      ids for that query) and 'score' (constant -1).
    """
    # Look up the document id behind every predicted row position.
    ids_per_query = [
        corpus['corpus-id'].iloc[positions].values.tolist()
        for positions in prediction
    ]
    n_results = len(ids_per_query)

    return pd.DataFrame({
        'id': queries['query-id'].iloc[:n_results],
        'corpus-id': ids_per_query,
        'score': [-1] * n_results,
    })

### Deep Embedder Corpus Processing 

In [25]:
def load_pretrained_embedder():
    """
    Return the sentence embedder, loading it from 'DeepEmbedder.pkl' when available.

    On a cache miss the 'all-MiniLM-L6-v2' SentenceTransformer is instantiated
    and pickled to 'DeepEmbedder.pkl' for the next run.

    Returns:
    - The unpickled or freshly created embedder object.
    """
    try:
        # NOTE: pickle can execute arbitrary code; only load a cache this notebook wrote.
        with open('DeepEmbedder.pkl', 'rb') as f:
            return pickle.load(f)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # Missing or truncated cache only; Ctrl-C and real bugs propagate.
        print('404, Fetching DeepEmbedder')

    deep_embedder = SentenceTransformer('all-MiniLM-L6-v2')
    with open('DeepEmbedder.pkl', 'wb') as f:
        pickle.dump(deep_embedder, f)
    return deep_embedder

In [27]:
# Module-level embedder instance: loaded from the pickle cache, or created and
# cached on a miss, by load_pretrained_embedder().
DEEP_EMBEDDER = load_pretrained_embedder()

In [28]:
def deep_embedder_process_corpus():
    """
    Return the embeddings of every corpus text, using 'EmbededCorpus.pkl' as a cache.

    On a cache miss the module-level `corpus["text"]` column is encoded with
    `DEEP_EMBEDDER` and the result is pickled for the next run.

    Returns:
    - The cached or freshly computed output of `DEEP_EMBEDDER.encode`, one
      embedding per corpus row in corpus order.
    """
    try:
        # NOTE: pickle can execute arbitrary code; only load a cache this notebook wrote.
        with open('EmbededCorpus.pkl', 'rb') as f:
            return pickle.load(f)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # Missing or truncated cache only; Ctrl-C and real bugs propagate.
        print('404, Computing Embeded Corpus ...')

    embedded_corpus = DEEP_EMBEDDER.encode(
        sentences=corpus["text"].tolist(),
        batch_size=128,  # TO BE CHANGED
        show_progress_bar=True,
        device='cpu',  # TO BE CHANGED -- 'cpu', 'cuda', automatic if None
    )

    with open('EmbededCorpus.pkl', 'wb') as f:
        pickle.dump(embedded_corpus, f)
    return embedded_corpus

### Deep Embedder Query Processing & Prediction

In [30]:
def deep_vectorize_queries(queries):
    """
    Embed every query text with the module-level `DEEP_EMBEDDER`.

    Args:
    - queries (pandas.DataFrame): Must have a 'text' column.

    Returns:
    - The output of `DEEP_EMBEDDER.encode`, one embedding per query in row order.
    """
    # The embedder is the module-level DEEP_EMBEDDER; the lowercase name used
    # before was never defined at module scope.
    return DEEP_EMBEDDER.encode(
        queries.text.tolist(),
        batch_size=128,  # TO BE CHANGED
        show_progress_bar=True,
        device='cpu',  # TO BE CHANGED -- 'cpu', 'cuda', automatic if None
    )


In [33]:
def deep_predict_documents(top_large_k, vectorized_queries, vectorized_corpus, k=10):
    """
    Re-rank each query's candidate documents by embedding dot product and keep the best k.

    Args:
    - top_large_k: Indexable by query position; entry i holds the row indices
      into `vectorized_corpus` of the candidate documents for query i.
    - vectorized_queries (numpy array): One embedding per query.
    - vectorized_corpus (numpy array): One embedding per corpus row.
    - k (int): Number of documents to keep per query (default 10).

    Returns:
    - numpy array of int64, shape (number of queries, k): For each query, the
      corpus row indices of its best candidates, most similar first. Unused
      slots hold -1 when a query has fewer than k candidates.
    """
    n_queries = len(vectorized_queries)
    # Integer array pre-filled with -1 so missing ranks are distinguishable
    # from a real index 0.
    ranked = np.full((n_queries, k), -1, dtype=np.int64)

    for idx, vector_query in enumerate(vectorized_queries):
        candidates = np.asarray(top_large_k[idx], dtype=np.int64)
        if candidates.size == 0:
            continue

        # Dot product between every candidate embedding and the query embedding.
        similarity = vectorized_corpus[candidates] @ vector_query

        # Positions of the k highest similarities, best first, mapped back from
        # "position in the candidate list" to the candidate's corpus row index.
        best = np.argsort(-similarity, kind="stable")[:k]
        ranked[idx, :best.size] = candidates[best]

    return ranked

In [39]:
%%time

## TF-IDF PREDICTION 
k = 1000
documents, tfidf, vocabulary, idf = tfidf_process_corpus()
tfidf_vectors = vectorize_queries(queries_test, vocabulary, idf)
prediction = predict_documents(tfidf, vectors, tfidf_vectors)
map_ = predictions_to_ids_ranking(corpus, queries_test, prediction)

## DEEP EMBEDDING PREDICTION
VECTORIZED_CORPUS = deep_embedder_process_corpus()
top_large_k = map_["corpus-id"].apply(lambda x: list(x))
deep_vectors =  deep_vectorize_queries(queries)

top10 = deep_predict_documents(top_large_k, deep_vectors,  VECTORIZED_CORPUS)
top10






404, creating required metadata ...
Process docs ...


  1%|▎                                | 11546/1471406 [00:11<23:16, 1045.27it/s]


KeyboardInterrupt: 