In [1]:
pip install sentence_transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import string
from nltk.stem import PorterStemmer
import nltk
from nltk.corpus import stopwords
import re
import pickle
from gensim.models import KeyedVectors
from gensim import downloader as api
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

tqdm.pandas()

## Data Pre-Process

In [3]:
# Load the corpus, order it by document id, expose that id as 'corpus-id'
# and renumber the rows 0..N-1 so that row position == index label.
corpus = (
    pd.read_json('data/corpus.jsonl', lines=True)
    .sort_values(by=["_id"])
    .rename(columns={"_id": "corpus-id"})
    .reset_index(drop=True)
)
corpus

Unnamed: 0,corpus-id,text
0,0,The presence of communication amid scientific ...
1,8,"In June 1942, the United States Army Corps of ..."
2,12,Tutorial: Introduction to Restorative Justice....
3,16,The approach is based on a theory of justice t...
4,23,Phloem is a conductive (or vascular) tissue fo...
...,...,...
1471401,8841780,Wolves don't hide. They don't even live in cav...
1471402,8841787,The UNHCR Country Representative in Kenya. Str...
1471403,8841790,2. Describe the misery at Kakuma. 3. Compariso...
1471404,8841800,Following the death of his employer and mentor...


In [4]:
# Load the queries, trim surrounding whitespace from the text, discard the
# 'metadata' column and expose the id as 'query-id'.
queries = (
    pd.read_json(path_or_buf='data/queries.jsonl', lines=True)
    .assign(text=lambda frame: frame['text'].str.strip())
    .drop(columns=["metadata"])
    .rename(columns={"_id": "query-id"})
)
queries

Unnamed: 0,query-id,text
0,1185869,)what was the immediate impact of the success ...
1,1185868,_________ justice is designed to repair the ha...
2,597651,what color is amber urine
3,403613,is autoimmune hepatitis a bile acid synthesis ...
4,1183785,elegxo meaning
...,...,...
509957,147073,difference between discrete and process manufa...
509958,243761,how long did abraham lincoln serve
509959,162662,does adult acne rosacea give you blepharitis
509960,247194,how long do you bake muffins


Map the query ids of the task 1 and task 2 test sets to their query texts.

In [5]:
# Task 1: attach the query text to every test query id.
# NOTE(review): `queries` is the left frame of this inner merge, so the rows
# presumably follow the order of `queries`, not of task1_test.tsv, and 'id' is
# dropped here -- confirm the positional ids written at submission time line up.
df_task_1 = pd.read_csv("data/task1_test.tsv", sep="\t")
queries_test = pd.merge(queries, df_task_1, left_on='query-id', right_on='query-id', how='inner').drop(columns=["id"])

# Task 2: attach the query text to every (query, candidate list) row.
df_task_2 = pd.read_csv("data/task2_test.tsv", sep="\t")
df_task_2 = pd.merge(df_task_2, queries, left_on='query-id', right_on='query-id')
df_task_2 = df_task_2.drop(columns=['query-id'])

# Turn the textual list "[1, 2, 3]" into ['1', '2', '3'].
# regex=False makes the brackets literal characters regardless of the pandas
# version's default for `regex` (a lone "[" is not a valid regular expression).
df_task_2['corpus-id'] = (
    df_task_2['corpus-id']
    .str.replace("[", '', regex=False)
    .str.replace("]", '', regex=False)
    .str.split(", ")
)

## Free queries (the full query table is no longer needed once the test sets are built)
queries = None

### Importing GloVe Model
Loading pre-trained word-vectors from [gensim-data](https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html).
We are using the [glove-wiki-gigaword-50](https://nlp.stanford.edu/projects/glove/) model ([PDF](https://nlp.stanford.edu/pubs/glove.pdf)).
[Models available to download in gensim](https://github.com/piskvorky/gensim-data#models).

In [6]:
# Load the locally cached GloVe vectors; if that fails (e.g. first run, no
# cache file yet) download them through gensim's downloader and cache them.
try:
    model = KeyedVectors.load('data/glove.model.d2v')
except Exception:  # not a bare `except:` so Ctrl-C / SystemExit are not mistaken for a cache miss
    print("404, Now Fetching Model ...")
    model = api.load("glove-wiki-gigaword-50")
    model.save('data/glove.model.d2v')

### Prepare text processing constants
For text processing, we will use the following:
- [PorterStemmer](https://www.nltk.org/api/nltk.stem.html#module-nltk.stem.porter) from NLTK for stemming.
- [stopwords](https://www.nltk.org/api/nltk.corpus.html#module-nltk.corpus.stopwords) from NLTK for removing stopwords.
- [string.punctuation](https://docs.python.org/3/library/string.html#string.punctuation) from Python for removing punctuation.
- [string.digits](https://docs.python.org/3/library/string.html#string.digits) from Python for removing digits.
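- [re](https://docs.python.org/3/library/re.html) from Python for removing escaped Unicode sequences of the form `\uXXXX` (a literal backslash, `u`, and four hexadecimal digits) that remain in the text.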
- [nltk.word_tokenize](https://www.nltk.org/api/nltk.tokenize.html#nltk.tokenize.word_tokenize) from NLTK for tokenizing text.

In [7]:
# Single stemmer instance shared by every preprocess_text call.
STEMMER = PorterStemmer()
# Matches a literal backslash followed by 'u' and four hex digits, i.e. an
# escaped "\uXXXX" sequence present as plain characters in the text.
# NOTE(review): despite the name, this does not match real non-ASCII characters.
NON_ASCII_PATTERN = re.compile(r'\\u[0-9a-fA-F]{4}')
# English stopwords held in a set for constant-time membership tests.
STOPWORDS_SET = set(stopwords.words("english"))

In [8]:
# Built once instead of on every call: maps each punctuation character to a
# space and deletes every digit. Punctuation and digits are disjoint, so this
# single pass is equivalent to translating punctuation first and digits second.
_PUNCT_AND_DIGIT_TABLE = str.maketrans(
    string.punctuation, ' ' * len(string.punctuation), string.digits
)


def preprocess_text(text):
    """
    Turn a raw text into a list of stemmed tokens.

    Steps, in order:
    1. Lowercase the text.
    2. Remove escaped "\\uXXXX" sequences (the matches of NON_ASCII_PATTERN).
    3. Replace punctuation with spaces and remove digits.
    4. Tokenize with NLTK's word_tokenize.
    5. Keep only tokens that are not stopwords and that are present in the
       word-vector model's vocabulary (checked on the unstemmed token).
    6. Stem the surviving tokens with PorterStemmer.

    Args:
    - text (str): The input text to preprocess.

    Returns:
    - words: A list of preprocessed, stemmed tokens.
    """
    text = NON_ASCII_PATTERN.sub('', text.lower())
    text = text.translate(_PUNCT_AND_DIGIT_TABLE)

    return [
        STEMMER.stem(word)
        for word in nltk.word_tokenize(text)
        if word not in STOPWORDS_SET and word in model
    ]

##  TF-IDF Implementation

In [9]:
def populate_tfidf_dataframe(documents, vocabulary):
    """
    Count, for every document, how often each vocabulary word occurs.

    Args:
    - documents: Iterable of documents, each one a list of (preprocessed) words.
    - vocabulary: Sequence of words; the position of a word is its column.

    Returns:
    - tf_matrix: Integer lil_matrix of shape (len(documents), len(vocabulary))
      holding raw term counts. Words outside the vocabulary are ignored.
    """
    # word -> column position, so each lookup is a dictionary access
    column_of = dict(zip(vocabulary, range(len(vocabulary))))

    counts = lil_matrix((len(documents), len(vocabulary)), dtype=int)

    for row, tokens in enumerate(documents):
        for token in tokens:
            column = column_of.get(token)
            if column is None:
                continue
            counts[row, column] += 1

    return counts

In [10]:
def tfidf(corpus_text):
    """
    Computes the Term Frequency-Inverse Document Frequency (TF-IDF) matrix for the given corpus.

    TF is the term count divided by the number of counted terms in the
    document; IDF is log((N + 0.5) / (df + 0.5)) with N the number of
    documents and df the number of documents containing the term.

    Args:
    - corpus_text: pandas Series where each item is a raw text document
      (`progress_apply` is used, so `tqdm.pandas()` must have been called).

    Returns:
    - tuple: A tuple containing the following:
        1. documents: Preprocessed documents (lists of tokens).
        2. tfidf_matrix: The computed sparse TF-IDF matrix (documents x vocabulary).
        3. vocabulary: The sorted list of unique tokens found in the corpus.
        4. idf: The inverse document frequency of each vocabulary term,
           as a (1, len(vocabulary)) row.
    """

    print("Process docs ...")
    documents = corpus_text.progress_apply(preprocess_text)

    print("Create vocab ...")
    vocabulary = sorted({word for doc in documents for word in doc})

    print("Compute tf ...")
    tf_matrix = populate_tfidf_dataframe(documents, vocabulary)

    print("Compute idf ...")
    doc_count = len(documents)
    df = (tf_matrix > 0).sum(axis=0)
    idf = np.log((doc_count + 0.5) / (df + 0.5))

    print("Compute tf-idf ...")
    tf_matrix = tf_matrix.tocsr()
    # Documents left empty by preprocessing have a term total of 0; use 1
    # there so the normalisation does not divide by zero. Their rows hold no
    # stored values, so the result for them is unchanged (all zeros).
    doc_lengths = np.asarray(tf_matrix.sum(axis=1), dtype=float)
    doc_lengths[doc_lengths == 0] = 1.0
    tf_matrix = tf_matrix.multiply(1.0 / doc_lengths)
    tfidf_matrix = tf_matrix.multiply(idf)

    print("Done!")
    return documents, tfidf_matrix, vocabulary, idf

### TF-IDF Corpus Processing
To avoid re-computing the TF-IDF matrix every time, we will save the computed matrix to disk using [pickle](https://docs.python.org/3/library/pickle.html).

In [11]:
def tfidf_process_corpus():
    """
    Load the cached TF-IDF artefacts from disk, or compute and cache them.

    Takes no arguments: when the cache cannot be loaded, the TF-IDF data is
    computed from the 'text' column of the notebook-level `corpus` DataFrame
    and the four artefacts are pickled under "data/".

    NOTE: the cache is read with pickle, which can execute arbitrary code;
    only load cache files that this notebook produced.

    Returns:
        tuple: A tuple containing four elements -
            1. documents: Processed documents from the corpus.
            2. tf_idf: TF-IDF matrix for the documents.
            3. vocabulary: Vocabulary list used for TF-IDF vectorization.
            4. idf: Inverse Document Frequencies (IDF) for the terms in the vocabulary.
    """
    DATA_FOLDER = "data/"
    FILE_NAME = "submission"
    tfidf_path = f'{DATA_FOLDER}tfidf-{FILE_NAME}.pkl'
    idf_path = f'{DATA_FOLDER}idf-{FILE_NAME}.pkl'
    vocabulary_path = f'{DATA_FOLDER}vocabulary-{FILE_NAME}.pkl'
    documents_path = f'{DATA_FOLDER}document-{FILE_NAME}.pkl'

    try:
        tf_idf = pd.read_pickle(tfidf_path)
        idf = pd.read_pickle(idf_path)
        vocabulary = pd.read_pickle(vocabulary_path)
        documents = pd.read_pickle(documents_path)
        return documents, tf_idf, vocabulary, idf
    except Exception:
        # Any load failure (missing or unreadable cache) falls back to a full
        # recomputation; unlike a bare `except:`, Ctrl-C still interrupts.
        print("404, creating required metadata ...")

    documents, tf_idf, vocabulary, idf = tfidf(corpus["text"])

    for path, artefact in (
        (tfidf_path, tf_idf),
        (idf_path, idf),
        (vocabulary_path, vocabulary),
        (documents_path, documents),
    ):
        with open(path, 'wb') as f:
            pickle.dump(artefact, f)

    return documents, tf_idf, vocabulary, idf

Execution cell for TF-IDF corpus processing.

In [12]:
%%time
# Load the TF-IDF artefacts (computing and caching them if the cache cannot be
# read). The returned tuple is the cell's last expression.
tfidf_process_corpus()

CPU times: total: 9.14 s
Wall time: 19 s


(0          [presenc, commun, amid, scientif, mind, equal,...
 1          [june, unit, state, armi, corp, manhattan, pro...
 2          [tutori, introduct, restor, justic, restor, ju...
 3          [approach, base, theori, justic, consid, crime...
 4          [phloem, conduct, vascular, tissu, found, plan...
                                  ...                        
 1471401    [wolv, hide, even, live, cave, live, open, for...
 1471402    [unhcr, countri, repres, kenya, street, addres...
 1471403    [describ, miseri, kakuma, comparison, popul, k...
 1471404    [follow, death, employ, mentor, bumpi, johnson...
 1471405    [present, puerto, rico, hold, titl, miss, univ...
 Name: text, Length: 1471406, dtype: object,
 <1471406x135442 sparse matrix of type '<class 'numpy.float64'>'
 	with 34858541 stored elements in COOrdinate format>,
 ['a',
  'aa',
  'aaa',
  'aaaa',
  'aaaaa',
  'aaah',
  'aaahh',
  'aab',
  'aaba',
  'aabb',
  'aac',
  'aacc',
  'aach',
  'aachen',
  'aacm',
  'aacn

### TF-IDF Query Processing & Prediction

In [13]:
def tfidf_vectorize_queries(queries_df, vocabulary, idf):
    """
    Convert each query in the DataFrame into its TF-IDF vector.

    Side effect: adds (or overwrites) a 'processed' column on `queries_df`
    holding the preprocessed tokens of each query.

    Args:
        queries_df: DataFrame containing 'text' column with raw queries.
        vocabulary: List of unique terms used for vectorization.
        idf: Array containing Inverse Document Frequencies for each term.

    Returns:
        tfidf_matrix: Sparse TF-IDF matrix of shape (num_queries, num_terms);
        row i corresponds to the i-th row of `queries_df` by position.
        Term counts are not length-normalised.
    """

    print("Process queries ...")
    # Preprocess all queries
    queries_df['processed'] = queries_df['text'].apply(preprocess_text)

    print("Initialize sparse matrix ...")
    num_queries = len(queries_df)
    num_terms = len(vocabulary)

    # Using a dictionary for term index lookup
    vocab_dict = {term: index for index, term in enumerate(vocabulary)}
    tf_matrix = lil_matrix((num_queries, num_terms))

    print("Compute  tf ...")
    # Rows are addressed by position, not by index label, so the result is
    # correct even when queries_df does not carry a 0..n-1 index.
    for row_position, terms in enumerate(queries_df['processed']):
        for term in terms:
            column = vocab_dict.get(term)
            if column is not None:
                tf_matrix[row_position, column] += 1

    print("Multiply by idf ...")
    # Convert to CSR format for efficient multiplication and transform TFs to TF-IDF
    tfidf_matrix = (tf_matrix.tocsr()).multiply(idf)

    print("Done !")
    return tfidf_matrix

In [14]:
def top_k_indices(matrix: csr_matrix, k: int):
    """
    Get the column indices of the k largest stored values of each row.

    Only explicitly stored entries of the sparse matrix are considered, so a
    row with fewer than k stored values yields fewer than k indices. Every
    row is returned ranked from the largest value to the smallest, including
    rows with fewer than k stored values.

    Args:
        matrix (scipy.sparse.csr_matrix): Input sparse matrix.
        k (int): Number of top indices to retrieve for each row.

    Returns:
        top_indices: List with one array per row holding at most k column
        indices ordered by decreasing value.
    """

    top_indices = []

    print('Iterate over each row ...')
    for i in range(matrix.shape[0]):
        start, stop = matrix.indptr[i], matrix.indptr[i + 1]
        row_data = matrix.data[start:stop]
        row_indices = matrix.indices[start:stop]

        # Stable sort keeps ties in storage order, making the ranking deterministic.
        order = np.argsort(-row_data, kind="stable")[:k]
        top_indices.append(row_indices[order])

    return top_indices

In [15]:
def tfidf_predict_documents(tfidf_matrix_normalized, query_vectors, k):
    """
    Retrieve, for every query, the documents most similar to it under TF-IDF.

    Args:
        tfidf_matrix_normalized: Sparse TF-IDF matrix of the documents
            (one row per document).
        query_vectors: Sparse TF-IDF matrix of the queries (one row per query).
        k (int): Maximum number of documents to retrieve for each query.

    Returns:
        ranked_doc_indices: One array of document row indices per query, as
        produced by top_k_indices on the query-by-document similarity matrix.
    """

    print("Compute cosine similarities ...")
    # dense_output=False asks for a sparse result instead of a dense
    # (num_queries x num_documents) array.
    scores = cosine_similarity(query_vectors, tfidf_matrix_normalized, dense_output=False)

    print("Rank documents ...")
    return top_k_indices(scores, k)

In [16]:
def predictions_to_ids_ranking(corpus, queries, prediction):
    """
    Translate predicted corpus row positions into 'corpus-id' values and wrap
    them in a ranking DataFrame.

    Args:
        corpus (pandas.DataFrame): DataFrame with a 'corpus-id' column.
        queries (pandas.DataFrame): DataFrame with a 'query-id' column, in the
            same order as `prediction`.
        prediction (list): One sequence of corpus row positions per query.

    Returns:
        ids_ranking: DataFrame with columns 'id' (the query id), 'corpus-id'
        (list of corpus ids for that query) and 'score' (constant -1).
    """

    # Look the id column up once; each prediction row then indexes into it.
    corpus_ids = corpus['corpus-id'].values
    ranked_ids = []
    for positions in prediction:
        ranked_ids.append(corpus_ids[positions].tolist())

    n_rows = len(ranked_ids)
    return pd.DataFrame({
        'id': queries['query-id'].iloc[:n_rows],
        'corpus-id': ranked_ids,
        'score': [-1] * n_rows,
    })

### Deep Embedder Corpus Processing
We will use the [SentenceTransformer](https://www.sbert.net/) library to generate embeddings for the corpus.
For this, we will use the [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) model.

In [17]:
# Folder (with trailing slash) where the deep-embedder helpers read and write their pickle caches.
DATA_FOLDER = "data/"

To avoid re-fetching the pre-trained model every time, we save the loaded model to disk using [pickle](https://docs.python.org/3/library/pickle.html).

In [18]:
def load_pretrained_embedder():
    """
    Return the sentence embedder, using a pickle cache when available.

    Tries to unpickle the model from `{DATA_FOLDER}deep_embedder.pkl`; if
    that fails, instantiates SentenceTransformer('all-MiniLM-L6-v2') and
    pickles it to that path for the next run.

    NOTE: unpickling can execute arbitrary code; only load a cache file that
    this notebook produced.

    Returns:
        deep_embedder: The SentenceTransformer model.
    """
    cache_path = f'{DATA_FOLDER}deep_embedder.pkl'
    try:
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    except Exception:
        # Any load failure is treated as a cache miss; unlike a bare
        # `except:`, Ctrl-C is not swallowed.
        print('404, Fetching deep embedder')

    deep_embedder = SentenceTransformer('all-MiniLM-L6-v2')
    with open(cache_path, 'wb') as f:
        pickle.dump(deep_embedder, f)
    return deep_embedder

In [19]:
# Notebook-wide embedder instance used to encode both the corpus and the queries.
DEEP_EMBEDDER = load_pretrained_embedder()

We will use the [encode](https://www.sbert.net/docs/package_reference/encoders.html#sentence-transformers-encode) method of the SentenceTransformer class to generate embeddings for the corpus.
To avoid running out of memory, we will process the corpus in batches.
To avoid re-computing the embeddings every time, we will save the computed embeddings to disk using [pickle](https://docs.python.org/3/library/pickle.html).

In [20]:
def deep_embedder_process_corpus():
    """
    Return the embeddings of every corpus document, using a pickle cache.

    Tries to unpickle `{DATA_FOLDER}vectorized_corpus.pkl`; if that fails,
    encodes the 'text' column of the notebook-level `corpus` DataFrame with
    DEEP_EMBEDDER and pickles the result to that path.

    NOTE: unpickling can execute arbitrary code; only load a cache file that
    this notebook produced.

    Returns:
        embedded_corpus: The embeddings returned by DEEP_EMBEDDER.encode,
        one per corpus row, in corpus row order.
    """
    cache_path = f'{DATA_FOLDER}vectorized_corpus.pkl'
    try:
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    except Exception:
        # Any load failure is treated as a cache miss; unlike a bare
        # `except:`, Ctrl-C is not swallowed.
        print('404, Computing Embeded Corpus ...')

    embedded_corpus = DEEP_EMBEDDER.encode(sentences=corpus["text"].tolist(),
                                           batch_size=500,  # TO BE CHANGED
                                           show_progress_bar=True,
                                           device='cpu',  # TO BE CHANGED -- 'cpu', 'cuda', automatic if None
                                           )

    with open(cache_path, 'wb') as f:
        pickle.dump(embedded_corpus, f)
    return embedded_corpus

### Deep Embedder Query Processing & Prediction
We will use the [encode](https://www.sbert.net/docs/package_reference/encoders.html#sentence-transformers-encode) method of the SentenceTransformer class to generate embeddings for the queries.

In [21]:
def deep_vectorize_queries(queries):
    """
    Encode the text of every query with DEEP_EMBEDDER.

    Args:
        queries: Object exposing a `text` column/attribute with the raw
            query strings (a pandas DataFrame in this notebook).

    Returns:
        The embeddings returned by DEEP_EMBEDDER.encode, one per query,
        in the order of `queries.text`.
    """
    query_texts = queries.text.tolist()
    query_embeddings = DEEP_EMBEDDER.encode(
        query_texts,
        batch_size=500,  # TO BE CHANGED
        show_progress_bar=True,
        device='cpu',  # TO BE CHANGED -- 'cpu', 'cuda', automatic if None
    )
    return query_embeddings


In [22]:
def deep_predict_documents(top_large_k, vectorized_queries, vectorized_corpus, k=10):
    """
    Rerank each query's candidate documents using deep embeddings.

    Args:
        top_large_k: 2D array; row i holds the corpus ids (labels of
            `vectorized_corpus`) of the candidates for query i.
        vectorized_queries: 2D array of query embeddings, one row per query.
        vectorized_corpus: DataFrame of corpus embeddings indexed by corpus id.
        k (int, optional): Number of top-ranked documents to retrieve for each query. Default is 10.

    Returns:
        top_k: Integer array of shape (number of queries, k). Entry [i, r] is
        the position, within row i of `top_large_k`, of the candidate ranked
        r-th for query i (r = 0 is the most similar). These are positions in
        the candidate row, not corpus ids.

    Raises:
        ValueError: If a query has fewer than k candidates.
    """

    # Shape (number of queries, k)
    top_k = np.zeros((vectorized_queries.shape[0], k), dtype=int)

    for idx, vector_query in enumerate(vectorized_queries):
        # Embeddings of this query's candidates, shape (large_k, embedding_dim)
        candidate_embeddings = vectorized_corpus.loc[top_large_k[idx]].to_numpy()

        # Dot product (numerator of cosine similarity), similar to linear_kernel
        similarity = candidate_embeddings @ vector_query

        # Best first: sort by decreasing similarity (stable for deterministic
        # ties) and keep the first k positions.
        top_k[idx] = np.argsort(-similarity, kind="stable")[:k]
    return top_k

# Task 1
Execution cell for task-1 document retrieval: TF-IDF selects candidate documents, which are then reranked with deep embeddings.
We also track the time taken for document retrieval.

In [23]:
%%time

## TF-IDF PREDICTION 
# Number of TF-IDF candidates kept per query before deep reranking.
k = 5000
documents, tf_idf, vocabulary, idf = tfidf_process_corpus()
tfidf_query_vectors = tfidf_vectorize_queries(queries_test, vocabulary, idf)
prediction = tfidf_predict_documents(tf_idf, tfidf_query_vectors, k)
# One row per query: 'corpus-id' holds the list of candidate corpus ids.
map_ = predictions_to_ids_ranking(corpus, queries_test, prediction)

## DEEP EMBEDDING PREDICTION
VECTORIZED_CORPUS = deep_embedder_process_corpus()
# Index the embeddings by corpus id so candidates can be looked up with .loc.
VECTORIZED_CORPUS = pd.DataFrame(VECTORIZED_CORPUS, index=corpus['corpus-id'])

# Copy the (possibly shorter) candidate lists into a rectangular (queries x k) array.
# NOTE(review): rows with fewer than k candidates keep trailing zeros, which are
# later looked up as corpus-id 0 by deep_predict_documents -- confirm this
# padding cannot leak document 0 into the final top 10.
top_large_k = np.zeros(shape=(map_.shape[0], k))
for i in range(map_.shape[0]):
    new_line = np.array(map_.iloc[i]['corpus-id'])
    for j in range(len(new_line)):
        top_large_k[i][j] = new_line[j]

top_large_k = top_large_k.astype(int)
deep_vectors = deep_vectorize_queries(queries_test)

# top10 holds positions inside each query's candidate row, not corpus ids.
top10 = deep_predict_documents(top_large_k, deep_vectors, VECTORIZED_CORPUS, 10)
top10 = pd.DataFrame(top10)

task_1_result = pd.DataFrame(columns=['corpus-id', 'score'])

# Map the selected positions back to corpus ids; -1 is the placeholder score for task 1.
for i in range(top10.shape[0]):
    new_top = [top_large_k[i][top10.iloc[i]].tolist()]
    task_1_result.loc[i] = new_top + [-1]
    
# task_1_result.head()

Process queries ...
Initialize sparse matrix ...
Compute  tf ...
Multiply by idf ...
Done !
Compute cosine similarities ...
Rank documents ...
Iterate over each row ...


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

CPU times: total: 3min 36s
Wall time: 6min 7s


# Task 2
Execution cell for task-2 document reranking, using the previously computed and stored TF-IDF results.
We also track the time taken for document reranking.

In [24]:
%%time

# Get a list of mappings from corpus-id to index in the TF-IDF matrix
corpus_ids_mapping = []
for row in df_task_2.iloc:
    corpus_ids_indices = []
    for corpus_id in row['corpus-id']:
        corpus_ids_indices.append(corpus.index[corpus['corpus-id'] == int(corpus_id)][0])
    corpus_ids_mapping.append(corpus_ids_indices)

# Get the TF-IDF matrix for queries       
vectorized = tfidf_vectorize_queries(df_task_2, vocabulary, idf).tocsr()
relevant_scores = []

# Compute relevance scores as cosine similarity between each query and its relevant documents
for idx, vector_query in enumerate(vectorized):
    docc = tf_idf.tocsr()[corpus_ids_mapping[idx]]
    similarity = cosine_similarity(vector_query, docc).flatten()

    relevant_scores.append(similarity.tolist())

Process queries ...
Initialize sparse matrix ...
Compute  tf ...
Multiply by idf ...
Done !
CPU times: total: 16.9 s
Wall time: 38.7 s


In [25]:
# Build the task-2 part of the submission: 'score' holds the list of
# similarities per query and 'corpus-id' is the -1 placeholder.
# `assign` works on a copy, so df_task_2 keeps its candidate id lists and the
# task-2 scoring cell above can be re-run without reloading the data.
task_2_result = (
    df_task_2
    .assign(**{'score': relevant_scores, 'corpus-id': -1})
    .drop(columns=['processed', 'text', 'id'])
)

# Submission
Here we concatenate the results of task 1 and task 2 and save the final submission file.

In [29]:
def submission_to_csv(submission, data_folder="data/", file_name="submission"):
    """
    Save the submission DataFrame to `<data_folder><file_name>.csv`.

    The DataFrame index is written as the first column under the header
    'id'. The DataFrame passed in is not modified.

    Args:
        submission (pandas.DataFrame): DataFrame containing submission data.
        data_folder (str, optional): Output folder, including the trailing
            path separator. Default is "data/".
        file_name (str, optional): File name without extension. Default is
            "submission".

    Returns:
        None
    """
    # index_label names the index column in the file without renaming the
    # caller's index.
    submission.to_csv(f'{data_folder}{file_name}.csv', index=True, index_label='id', header=True)

In [30]:
# Stack the task-1 rows on top of the task-2 rows and renumber the index
# 0..N-1; submission_to_csv writes that index out as the 'id' column.
result = pd.concat([task_1_result, task_2_result], axis=0).reset_index(drop=True)

In [31]:
# Write the combined results to data/submission.csv.
submission_to_csv(result)