In [None]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# One-slot cache so the NLTK stop-word list is loaded and converted only once,
# instead of on every call to preprocess_single_passage.
_ENGLISH_STOP_WORDS_CACHE = []


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if not _ENGLISH_STOP_WORDS_CACHE:
        _ENGLISH_STOP_WORDS_CACHE.append(frozenset(stopwords.words('english')))
    return _ENGLISH_STOP_WORDS_CACHE[0]


def preprocess_single_passage(passage,stop_words=True):
    """Turn a raw text string into a list of lower-cased alphabetic tokens.

    Steps: lower-case the text, split it into runs of word characters with
    NLTK's RegexpTokenizer, keep only tokens for which ``str.isalpha()`` is
    true, and optionally remove NLTK's English stop words.

    Parameters
    ----------
    passage : str
        Text to tokenize (a passage or a query).
    stop_words : bool, default True
        When truthy, English stop words are removed from the result.

    Returns
    -------
    list of str
        Tokens in their original order.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    # isalpha() drops tokens that contain digits or underscores, which the
    # word-character pattern would otherwise let through.
    tok_pass = [tok for tok in tokenizer.tokenize(passage.lower()) if tok.isalpha()]
    if not stop_words:
        return tok_pass
    # Set membership is O(1); the flag parameter is no longer rebound to the
    # word list itself.
    excluded = _english_stop_words()
    return [tok for tok in tok_pass if tok not in excluded]

In [None]:
# Load the header-less, tab-separated candidate file and name its four columns.
candidate_passages_all = pd.read_csv(
    'candidate_passages_top1000.tsv',
    sep='\t',
    names=['qid', 'pid', 'query', 'passage'],
)
# Keep the first row seen for each passage id.
candidate_passages_unique = candidate_passages_all.drop_duplicates(subset='pid')
# Number of distinct candidate passages.
N = len(candidate_passages_unique)

In [None]:
# Preview the first rows of the candidate (qid, pid, query, passage) table.
candidate_passages_all.head()

Unnamed: 0,qid,pid,query,passage
0,494835,7130104,"sensibilities, definition",This is the definition of RNA along with examp...
1,1128373,7130104,iur definition,This is the definition of RNA along with examp...
2,131843,7130104,definition of a sigmet,This is the definition of RNA along with examp...
3,20455,7130335,ar glasses definition,Best Answer: The AR designation comes from the...
4,719381,7130335,what is ar balance,Best Answer: The AR designation comes from the...


In [None]:
# Load the header-less, tab-separated test queries as (qid, query) rows.
test_queries = pd.read_csv(
    'test-queries.tsv',
    sep='\t',
    names=['qid', 'query'],
)
test_queries.head()

Unnamed: 0,qid,query
0,1108939,what slows down the flow of blood
1,1112389,"what is the county for grand rapids, mn"
2,792752,what is ruclip
3,1119729,what do you do when you have a nosebleed from ...
4,1105095,where is sugar lake lodge located


In [None]:
# Load the validation set; this file carries its own header row.
validation_data = pd.read_csv('validation_data.tsv', sep='\t')
# One row per distinct passage id (first occurrence kept).
validation_data_unique = validation_data.drop_duplicates(subset='pid')
# Report the shape before and after de-duplication.
print(validation_data.shape)
print(validation_data_unique.shape)

(1103039, 5)
(955211, 5)


In [None]:
# Preview the de-duplicated validation rows.
validation_data_unique.head()

Unnamed: 0,qid,pid,queries,passage,relevancy
0,1082792,1000084,what does the golgi apparatus do to the protei...,"Start studying Bonding, Carbs, Proteins, Lipid...",0.0
1,995825,1000492,where is the graphic card located in the cpu,"For example, a “PC Expansion Card” maybe the j...",0.0
2,995825,1000494,where is the graphic card located in the cpu,The Common Cards & Buses. The most common type...,0.0
3,1091246,1000522,property premises meaning,The occurrence of since tells us that the firs...,0.0
4,1047854,1000585,what is printing mechanism,Windows desktop applications Develop Desktop t...,0.0


In [None]:
# Inverted index over the de-duplicated validation passages:
#   token -> list of (pid, frequency of token in that passage, passage length in tokens)
inverted_index = {}

# Iterate the two needed columns directly; iterrows() would build a full
# Series object for every row, which is needlessly slow on a large frame.
for pid, passage in zip(validation_data_unique['pid'], validation_data_unique['passage']):
    tokens = preprocess_single_passage(passage, stop_words=True)
    words_passage = len(tokens)
    for token, freq in nltk.FreqDist(tokens).items():
        # setdefault returns the posting list, so a single lookup is enough.
        inverted_index.setdefault(token, []).append((pid, freq, words_passage))

In [None]:
# Vocabulary = every token that has a posting list, in insertion order.
vocab = [*inverted_index]
total_length_vocab = len(vocab)

In [None]:
# The inverted index and the token total below are both computed over
# validation_data_unique, so the collection size must be the number of
# passages in that same frame. N is rebound here for that reason: BM25_model
# reads N as a global and compares it with posting-list lengths taken from
# this corpus.
N = len(validation_data_unique)

# Total number of tokens (after preprocessing) across the corpus.
word_occur_corpus = sum(
    len(preprocess_single_passage(passage))
    for passage in validation_data_unique['passage']
)
# Mean passage length in tokens, used for BM25 length normalisation.
avg_passage_len = word_occur_corpus/N

In [None]:
# Constants read as globals by BM25_model.
# k1 and b enter the length-normalisation factor K; k1 also scales the
# passage term-frequency component.
k1 = 1.2
# k2 scales the query term-frequency component.
k2 = 100
b = 0.75
# R and r appear in the log (relevance-weight) term of the score; both are
# zero here, i.e. no relevance information is supplied.
R = 0
r = 0

In [None]:
def BM25_model(query, passage):
    """Score one passage against one query with the BM25 formula.

    Both texts are tokenized with ``preprocess_single_passage``. The score is
    the sum, over each distinct query term, of the product of three factors:
    a log relevance/IDF weight, a passage term-frequency component and a query
    term-frequency component.

    Reads these globals: ``k1``, ``k2``, ``b``, ``r``, ``R``, ``N``,
    ``avg_passage_len`` and ``inverted_index``.

    Parameters
    ----------
    query : str
        Raw query text.
    passage : str
        Raw passage text.

    Returns
    -------
    float or int
        The BM25 score; ``0`` when the query yields no tokens.
    """
    query_freq_dist = nltk.FreqDist(preprocess_single_passage(query))
    p_tokens = preprocess_single_passage(passage)
    passage_freq_dist = nltk.FreqDist(p_tokens)
    doclen = len(p_tokens)
    # Length normalisation: longer-than-average passages get a larger K.
    K = k1*((1-b) + b *(float(doclen)/float(avg_passage_len)))
    score = 0
    # Iterate over distinct query terms. Repetition inside the query is
    # already captured by qf, so looping over every occurrence would count a
    # repeated term several times.
    for token, qf in query_freq_dist.items():
        # Number of indexed passages containing the token; 0 if unseen.
        n = len(inverted_index.get(token, ()))
        f = passage_freq_dist[token]
        one = np.log(((r + 0.5)/(R - r + 0.5))/((n-r+0.5)/(N-n-R+r+0.5)))
        two = ((k1 + 1) * f)/(K+f)
        three = ((k2+1) * qf)/(k2+qf)
        score += one * two * three
    return score

In [None]:
# qid -> list of BM25 scores, one per validation row of that query, in the
# rows' original order.
bm25_dict = {}
# A single groupby pass replaces one full-frame boolean filter per query.
# Groups come out in ascending qid order and keep the original row order.
for qid, query_rows in validation_data.groupby('qid'):
    bm25_dict[qid] = [
        BM25_model(query, passage)
        for query, passage in zip(query_rows['queries'], query_rows['passage'])
    ]

In [None]:
def sim_rank(cosine_sim_results, top_k=None):
    """Return the positions of the scores ordered from highest to lowest.

    Parameters
    ----------
    cosine_sim_results : sequence of float
        Scores to rank.
    top_k : int or None, default None
        If given, only the first ``top_k`` positions are returned; by default
        every position is returned. This replaces a hard-coded cut-off equal
        to the size of one particular dataset.

    Returns
    -------
    numpy.ndarray
        Integer positions into ``cosine_sim_results``, best score first.
    """
    descending = np.array(cosine_sim_results).argsort()[::-1]
    return descending[:top_k]

In [None]:
# qid -> positions of that query's scores, best first (as returned by sim_rank).
results_bm25 = {}
for qid, scores in bm25_dict.items():
    results_bm25[qid] = sim_rank(scores)

In [None]:
# results_bm25: keys are qids; each value is an array of row positions (within that qid's rows of validation_data), ordered from highest to lowest BM25 score.

In [None]:
def average_precision_calc(df):
    """Average precision of a ranked result list.

    ``df`` is read top to bottom as the ranking; a row counts as relevant when
    its ``relevancy`` value is truthy. At every relevant row the precision so
    far (relevant rows seen / rank) is accumulated, and the total is divided
    by the number of relevant rows.

    Returns 0 when no row is relevant.
    """
    hits = 0
    precision_sum = 0
    for rank, relevancy in enumerate(df['relevancy'].tolist(), start=1):
        if not relevancy:
            continue
        hits += 1
        precision_sum += hits / rank
    if hits:
        return precision_sum / hits
    return 0

In [None]:
def _discounted_cumulative_gain(relevancies):
    """Sum (2**rel - 1) / log2(position + 2) over a 0-based ranked sequence."""
    total = 0
    for position, relevance in enumerate(relevancies):
        # position + 2 because positions start at 0 while the formula uses rank + 1.
        total += (2 ** relevance - 1) / np.log2(position + 2)
    return total


def NDCG_calc(df):
    """Normalised discounted cumulative gain of a ranked result list.

    ``df`` is read top to bottom as the ranking. Its DCG is divided by the DCG
    of the same ``relevancy`` values sorted in descending order (the best
    achievable ordering).

    Returns 0 when the ideal DCG is 0, i.e. nothing in ``df`` is relevant.
    """
    relevancies = df['relevancy']
    ideal_order = relevancies.sort_values(ascending=False).tolist()
    ideal_dcg = _discounted_cumulative_gain(ideal_order)
    if ideal_dcg == 0:
        return 0
    actual_dcg = _discounted_cumulative_gain(relevancies.tolist())
    return actual_dcg / ideal_dcg

In [None]:
# Accumulate per-query average precision and NDCG, then average over queries.
ap_total = 0
ndcg_total = 0
for qid, indices in results_bm25.items():
    # Re-order this query's validation rows by their BM25 ranking.
    ranked_df = validation_data[validation_data['qid'] == qid].iloc[indices]
    ap_total += average_precision_calc(ranked_df)
    ndcg_total += NDCG_calc(ranked_df)

n_queries = len(results_bm25)
mean_average_precision = ap_total / n_queries
mean_NDCG = ndcg_total / n_queries

In [None]:
# Report the two metrics computed as averages over all ranked queries.
print(f'Average Precision is {mean_average_precision}')
print(f'Normalized Discounted Cumulative Gain(NDCG) is {mean_NDCG}')

Average Precision is 0.2434577587023214
Normalized Discounted Cumulative Gain(NDCG) is 0.3814350153021302


In [None]:
# Download the GloVe 6B archive into /root/input/ (-P sets the target
# directory, -c resumes a partially downloaded file).
!wget -P /root/input/ -c "https://nlp.stanford.edu/data/glove.6B.zip"

--2022-04-10 17:04:26--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-04-10 17:04:26--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘/root/input/glove.6B.zip’


2022-04-10 17:07:07 (5.13 MB/s) - ‘/root/input/glove.6B.zip’ saved [862182613/862182613]



In [None]:
# Extract the archive; no -d option is given, so the .txt files are written
# to the current working directory.
!unzip /root/input/glove.6B.zip

Archive:  /root/input/glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# The archive is extracted into the current working directory, so refer to the
# vectors file relative to it rather than through a hard-coded absolute path
# that only exists on one particular machine layout.
glove_file = 'glove.6B.50d.txt'
# Convert the GloVe text file to word2vec text format in a temporary file,
# then load that file as KeyedVectors.
tmp_file = get_tmpfile("test_word2vec.txt")
_ = glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)

In [None]:
# Sanity check: look up the embedding vector stored for the ',' token.
model[',']

array([ 0.013441,  0.23682 , -0.16899 ,  0.40951 ,  0.63812 ,  0.47709 ,
       -0.42852 , -0.55641 , -0.364   , -0.23938 ,  0.13001 , -0.063734,
       -0.39575 , -0.48162 ,  0.23291 ,  0.090201, -0.13324 ,  0.078639,
       -0.41634 , -0.15428 ,  0.10068 ,  0.48891 ,  0.31226 , -0.1252  ,
       -0.037512, -1.5179  ,  0.12612 , -0.02442 , -0.042961, -0.28351 ,
        3.5416  , -0.11956 , -0.014533, -0.1499  ,  0.21864 , -0.33412 ,
       -0.13872 ,  0.31806 ,  0.70358 ,  0.44858 , -0.080262,  0.63003 ,
        0.32111 , -0.46765 ,  0.22786 ,  0.36034 , -0.37818 , -0.56657 ,
        0.044691,  0.30392 ], dtype=float32)

In [None]:
# Preview the full (not de-duplicated) validation table.
validation_data.head()

Unnamed: 0,qid,pid,queries,passage,relevancy
0,1082792,1000084,what does the golgi apparatus do to the protei...,"Start studying Bonding, Carbs, Proteins, Lipid...",0.0
1,995825,1000492,where is the graphic card located in the cpu,"For example, a “PC Expansion Card” maybe the j...",0.0
2,995825,1000494,where is the graphic card located in the cpu,The Common Cards & Buses. The most common type...,0.0
3,1091246,1000522,property premises meaning,The occurrence of since tells us that the firs...,0.0
4,1047854,1000585,what is printing mechanism,Windows desktop applications Develop Desktop t...,0.0
