In [6]:
import os

import numpy as np
import pandas as pd
# if not installed, run command: pip install sentence_transformers
from sentence_transformers import SentenceTransformer

# Data Preparation

**(Same as in the other notebook. Skip this section if the data has already been loaded.)**

In [7]:
# Load the corpus from a JSON Lines file, order the documents by their id,
# and expose that id under the column name "corpus-id".
corpus = (
    pd.read_json('data/corpus.jsonl', lines=True)
    .sort_values(by=["_id"])
    .rename(columns={"_id": "corpus-id"})
)
corpus

Unnamed: 0,corpus-id,text
1000000,0,The presence of communication amid scientific ...
966376,8,"In June 1942, the United States Army Corps of ..."
468831,12,Tutorial: Introduction to Restorative Justice....
1000001,16,The approach is based on a theory of justice t...
306952,23,Phloem is a conductive (or vascular) tissue fo...
...,...,...
950989,8841780,Wolves don't hide. They don't even live in cav...
395590,8841787,The UNHCR Country Representative in Kenya. Str...
93101,8841790,2. Describe the misery at Kakuma. 3. Compariso...
669122,8841800,Following the death of his employer and mentor...


In [8]:
# Load the queries from a JSON Lines file and tidy them in one chain:
# sort by id, trim surrounding whitespace from the text, drop the unused
# "metadata" column and expose the id under the column name "query-id".
queries = (
    pd.read_json(path_or_buf='data/queries.jsonl', lines=True)
    .sort_values(by=["_id"])
    .assign(text=lambda frame: frame['text'].str.strip())
    .drop(columns=["metadata"])
    .rename(columns={"_id": "query-id"})
)
queries

Unnamed: 0,query-id,text
506217,2,Androgen receptor define
65864,3,Another name for the primary visual cortex is
372466,4,Defining alcoholism as a disease is associated...
326447,5,ECT is a treatment that is used for
117580,6,"Ebolavirus is an enveloped virus, which means"
...,...,...
158901,1185863,why did rachel carson die
83120,1185864,definition of ramen
7634,1185865,amex india customer care number
1,1185868,_________ justice is designed to repair the ha...


In [9]:
# Training relevance labels read from a tab-separated file
# (columns shown in the output: query-id, corpus-id, score).
query_corpus_train_map = pd.read_csv("data/task1_train.tsv", delimiter="\t")
# Display only: the sorted view is not assigned back, so the variable keeps
# the row order of the file.
query_corpus_train_map.sort_values("query-id")

Unnamed: 0,query-id,corpus-id,score
70257,3,1142680,1
395137,4,5613529,1
346352,5,4956428,1
125307,6,1931409,1
66896,8,1094214,1
...,...,...,...
169115,1185863,2545716,1
88577,1185864,1408016,1
8141,1185865,229186,1
1,1185868,16,1


In [10]:
# Keep only the queries that have a training label, then discard the label
# columns so the frame holds just the query id and text.
queries_train = (
    queries
    .merge(query_corpus_train_map, on='query-id', how='inner')
    .drop(columns=["score", "corpus-id"])
)
# Work on the first 7437 training queries only.
# NOTE(review): 7437 presumably mirrors the size of the test set -- confirm.
queries_train_subset = queries_train.head(7437)
queries_train_subset

Unnamed: 0,query-id,text
0,3,Another name for the primary visual cortex is
1,4,Defining alcoholism as a disease is associated...
2,5,ECT is a treatment that is used for
3,6,"Ebolavirus is an enveloped virus, which means"
4,8,"In humans, the normal set point for body tempe..."
...,...,...
7432,18204,anger is fear
7433,18205,anger management definition
7434,18208,angie baby meaning
7435,18209,angie lindvall


In [11]:
# Test split: a tab-separated file listing the query ids to predict for.
df_test = pd.read_csv("data/task1_test.tsv", delimiter="\t")
# Attach the query text to each test query id; the test file's own "id"
# column is not needed afterwards.
queries_test = (
    queries
    .merge(df_test, on='query-id', how='inner')
    .drop(columns=["id"])
)
queries_test

Unnamed: 0,query-id,text
0,2,Androgen receptor define
1,1215,3 levels of government in canada and their res...
2,1288,3/5 of 60
3,1576,60x40 slab cost
4,2235,Bethel University was founded in what year
...,...,...
7432,1102335,why do people buy cars
7433,1102351,why do jefferson and stanton include these sim...
7434,1102390,why do children get aggressive
7435,1102393,why do celebrate st patrick's day


# Download pre-trained model

Model: all-MiniLM-L6-v2

Description: All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs. 

Link: https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models

In [3]:
# Load the pre-trained sentence encoder by name (fetched on first use if it
# is not already cached locally).
deep_embedder = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

# Encode the whole corpus in batches

## **Important**: Reset index of the corpus dataframe

In [13]:
# Replace the index left over from the pre-sort row order with a fresh
# 0..N-1 index. After this, row position i in `corpus` lines up with row i of
# the embedding matrix computed from corpus["text"], so positional indices
# into the embeddings can be used to look documents up in `corpus`.
corpus = corpus.reset_index(drop=True)
corpus

Unnamed: 0,corpus-id,text
0,0,The presence of communication amid scientific ...
1,8,"In June 1942, the United States Army Corps of ..."
2,12,Tutorial: Introduction to Restorative Justice....
3,16,The approach is based on a theory of justice t...
4,23,Phloem is a conductive (or vascular) tissue fo...
...,...,...
1471401,8841780,Wolves don't hide. They don't even live in cav...
1471402,8841787,The UNHCR Country Representative in Kenya. Str...
1471403,8841790,2. Describe the misery at Kakuma. 3. Compariso...
1471404,8841800,Following the death of his employer and mentor...


In [None]:
# Embed every corpus document with the sentence encoder.
# `encode` expects a plain list of strings, hence the `.tolist()` on the
# "text" column; the texts are processed `batch_size` at a time.
embedded_corpus = deep_embedder.encode(
    sentences=corpus["text"].tolist(),
    batch_size=128,  # TO BE CHANGED
    show_progress_bar=True,
    device=None,  # TO BE CHANGED -- 'cpu', 'cuda', automatic if None
)


## Save corpus embedding for reuse

In [None]:
# Persist the corpus embeddings as an uncompressed .npy array so the expensive
# encoding step does not have to be repeated after a kernel restart.
# np.save does not create missing parent directories, so make sure the target
# directory exists first; otherwise the save fails only after the long
# encoding run has finished.
os.makedirs('output', exist_ok=True)
# np.save appends the ".npy" suffix, producing output/vectorized_corpus.npy.
np.save('output/vectorized_corpus', embedded_corpus)

## Load saved corpus embedding

In [None]:
# Reload the corpus embeddings written by the save cell (np.save adds the
# ".npy" suffix), so the corpus does not need to be re-encoded.
embedded_corpus = np.load('output/vectorized_corpus.npy')


# Inferring relevant documents for queries

## TODO: Update variables

In [None]:
## !!!! UPDATE CORRECT VARIABLES
# NOTE: this rebinds `queries`, which until now held the full query table.
# Re-running the data-preparation merge cells after this cell will therefore
# operate on the subset unless the queries loading cell is re-run first.
queries = queries_train_subset # TO BE CHANGED

# Placeholder for the candidate documents of each query: one row per query,
# 1000 corpus row indices per row. Replace with the real candidate indices.
# The shape must be passed as a single tuple (the second positional argument
# of np.zeros is the dtype), and the dtype must be an integer type because the
# values are later used to index rows of the corpus embedding matrix.
top_large_k = np.zeros((queries.shape[0], 1000), dtype=int) # TO BE CHANGED

## Calculate embeddings for all queries

In [None]:
# Embed every query text with the same encoder used for the corpus.
# Result: 2D array with one row per query,
# shape (Number of queries, 384).
vectorized_queries = deep_embedder.encode(
    sentences=queries["text"].tolist(),
    batch_size=128,  # TO BE CHANGED
    show_progress_bar=True,
    device=None,  # TO BE CHANGED -- 'cpu', 'cuda', automatic if None
)

## Calculate similarities to good candidates

In [None]:
k = 10

num_queries = vectorized_queries.shape[0]

# top_10[i] holds, for query i, the positions *within its candidate list*
# (top_large_k[i]) of the k most similar candidates, ordered by ascending
# similarity (the best match is last).
# Shape (Number of queries, k); integer dtype so the values can be used
# directly as indices.
top_10 = np.zeros((num_queries, k), dtype=np.intp)

# top_10_corpus_rows[i] holds the same k candidates expressed as row indices
# into `embedded_corpus`, in the same order as top_10[i].
# Shape (Number of queries, k).
top_10_corpus_rows = np.zeros((num_queries, k), dtype=np.intp)

# Iterate through each query embedding
for idx, vector_query in enumerate(vectorized_queries):

    # Candidate row indices for this query; cast to an integer dtype because
    # NumPy rejects float arrays as indices.
    candidate_rows = np.asarray(top_large_k[idx], dtype=np.intp)

    # Index the embedding of relevant candidates
    # Shape of sentence_feature: (large_k, 384)
    sentence_feature = embedded_corpus[candidate_rows]

    # Dot product (numerator of cosine similarity), similar to linear_kernel
    similarity = sentence_feature @ vector_query

    # Positions (within the candidate list) of the k highest similarities
    best_positions = np.argsort(similarity)[-k:]

    top_10[idx] = best_positions
    # Map the candidate-list positions back to corpus row indices
    top_10_corpus_rows[idx] = candidate_rows[best_positions]
