In [1]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
pd.options.mode.chained_assignment = None
from tqdm import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
validation_data = pd.read_csv('/content/drive/MyDrive/IRDM Coursework 2/validation_data_cosine.csv',index_col=0)

In [4]:
train_data = pd.read_csv('/content/drive/MyDrive/IRDM Coursework 2/train_data.tsv',sep='\t')
print(train_data.shape)

(4364339, 5)


In [5]:
train_data['relevancy'].value_counts(normalize=True)

0.0    0.998901
1.0    0.001099
Name: relevancy, dtype: float64

In [6]:
def negative_sampling(data,r,k):
  qid_list = np.unique(np.asarray(data['qid']))
  samples = []
  for qid in qid_list:
    pos_temp = data[(data['qid'] == qid) & (data['relevancy'] == 1)]
    neg_temp = data[(data['qid'] == qid) & (data['relevancy'] == 0)]
    if len(pos_temp)<r:
      samples.append(pos_temp)
    else:
      samples.append(pos_temp.sample(n=r, random_state=1))
    if len(neg_temp) < k:
      samples.append(neg_temp)
    else:
      samples.append(neg_temp.sample(n=k, random_state=1))
  new_data = pd.concat(samples)
  return new_data.reset_index(drop=True)

In [7]:
final_train = negative_sampling(train_data,10,20)
final_train = final_train.reset_index(drop=True)

In [8]:
# from imblearn.over_sampling import RandomOverSampler

In [9]:
final_train['relevancy'].value_counts(normalize=True)

0.0    0.949966
1.0    0.050034
Name: relevancy, dtype: float64

In [10]:
def preprocess_single_passage(passage,stop_words=True):
    tokenizer = RegexpTokenizer(r'\w+')
    tok_pass = tokenizer.tokenize(passage)
    tok_pass = [tok for tok in tok_pass if tok.isalpha()]
    tok_pass = [tok.lower() for tok in tok_pass]
    
    if stop_words == True:
        stop_words = stopwords.words('english')
        tokens = [tok for tok in tok_pass if tok not in stop_words]
    else:
        tokens = tok_pass
    return tokens

In [11]:
query_tokens_dict = {}
qid_list, ind_list = np.unique(np.asarray(final_train['qid']),return_index=True)
for qid, ind in tqdm(zip(qid_list,ind_list)):
  query_tokens_dict[qid] = preprocess_single_passage(final_train.loc[ind,'queries'])
final_train.loc[:,'query_tokens'] = final_train['qid'].map(query_tokens_dict)

passage_tokens_dict = {}
pid_list, ind_list = np.unique(np.asarray(final_train['pid']),return_index=True)
for pid, ind in tqdm(zip(pid_list,ind_list)):
  passage_tokens_dict[pid] = preprocess_single_passage(final_train.loc[ind,'passage'])
final_train['passage_tokens'] = final_train['pid'].map(passage_tokens_dict)

4590it [00:00, 5099.32it/s]
91708it [00:31, 2871.35it/s]


In [12]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
glove_file = '/content/drive/MyDrive/IRDM Coursework 2/glove.6B.50d.txt'
tmp_file = get_tmpfile("test_word2vec.txt")
_ = glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file,binary=False)

In [13]:
def get_embedding(tokens):
  '''
  INPUT
  tokens: a list of tokens
  OUTPUT
  average embedding: a vectorb represents average embedding of the input list of tokens
  '''
  embedding = 0
  nom = len(tokens)
  for token in tokens:
    if token not in model:
      nom -= 1
    else:
      embedding += model[token]
  try:
    return embedding/nom
  except:
    return 0

In [14]:
passage_embedding_dict = {}
pid_list, ind_list = np.unique(np.asarray(final_train['pid']),return_index=True)
for pid,ind in tqdm(zip(pid_list,ind_list)):
  passage_embedding_dict[pid] = get_embedding(final_train.loc[ind,'passage_tokens']) 
final_train['passage_embedding'] = final_train['pid'].map(passage_embedding_dict)

91708it [00:12, 7632.45it/s]


In [15]:
query_embedding_dict = {}
qid_list, ind_list = np.unique(np.asarray(final_train['qid']),return_index=True)
for qid,ind in tqdm(zip(qid_list,ind_list)):
  query_embedding_dict[qid] = get_embedding(final_train.loc[ind,'query_tokens']) 
final_train['query_embedding'] = final_train['qid'].map(query_embedding_dict)

4590it [00:00, 34662.23it/s]


In [16]:
def cosine_similarity(data):
  temp = []
  for i in tqdm(range(len(data))):
    denom = np.dot(data.loc[i,'query_embedding'],data.loc[i,'passage_embedding'])
    nom = np.sqrt(np.square(data.loc[i,'query_embedding']).sum())*np.sqrt(np.square(data.loc[i,'passage_embedding']).sum())
    if nom == 0:
      temp.append(0)
    else:
      temp.append(denom/nom)
  data['cosine_similarity'] = temp
  return data

In [17]:
cosine_similarity(final_train)

100%|██████████| 95874/95874 [00:09<00:00, 10238.70it/s]


Unnamed: 0,qid,pid,queries,passage,relevancy,query_tokens,passage_tokens,passage_embedding,query_embedding,cosine_similarity
0,2,4339068,Androgen receptor define,"The androgen receptor (AR), also known as NR3C...",1.0,"[androgen, receptor, define]","[androgen, receptor, ar, also, known, nuclear,...","[0.49091795, 0.29157883, 0.11163568, 0.1308564...","[0.65137, 0.6691633, -0.05296333, 0.198797, 0....",0.874866
1,2,7279219,Androgen receptor define,: ligand binding to a G protein-coupled recept...,0.0,"[androgen, receptor, define]","[ligand, binding, g, protein, coupled, recepto...","[0.70091456, 0.38167945, 0.17395169, 0.4451514...","[0.65137, 0.6691633, -0.05296333, 0.198797, 0....",0.804005
2,2,6229171,Androgen receptor define,When insulin binds to the receptor on the cell...,0.0,"[androgen, receptor, define]","[insulin, binds, receptor, cell, surface, rece...","[0.8574829, 0.10805729, 0.43395844, 0.2213999,...","[0.65137, 0.6691633, -0.05296333, 0.198797, 0....",0.752639
3,2,2946560,Androgen receptor define,1. exteroceptor-any receptor that responds to ...,0.0,"[androgen, receptor, define]","[exteroceptor, receptor, responds, stimuli, ou...","[0.7432729, 0.17395918, -0.2440114, -0.0124907...","[0.65137, 0.6691633, -0.05296333, 0.198797, 0....",0.660131
4,2,4803916,Androgen receptor define,a device that measures the quantity of radiati...,0.0,"[androgen, receptor, define]","[device, measures, quantity, radiation, reache...","[0.484962, 0.23540024, 0.50870895, -0.0826678,...","[0.65137, 0.6691633, -0.05296333, 0.198797, 0....",0.642420
...,...,...,...,...,...,...,...,...,...,...
95869,1102400,2894724,why do bears hibernate,"To put your computer into hibernate mode, clic...",0.0,"[bears, hibernate]","[put, computer, hibernate, mode, click, start,...","[0.04930706, -0.24188125, 0.5842177, 0.0156327...","[0.015780002, 0.16692501, -0.128705, -0.283115...",0.406505
95870,1102400,8835271,why do bears hibernate,"Bears also eat other animals, from rodents to ...",0.0,"[bears, hibernate]","[bears, also, eat, animals, rodents, moose, wo...","[0.25202996, 0.24495722, 0.08694486, 0.0634161...","[0.015780002, 0.16692501, -0.128705, -0.283115...",0.400412
95871,1102400,1481892,why do bears hibernate,Hershey Bears Staff | Special to PennLive Bear...,0.0,"[bears, hibernate]","[hershey, bears, staff, special, pennlive, bea...","[-0.024949748, 0.3823909, 0.06031613, 0.056671...","[0.015780002, 0.16692501, -0.128705, -0.283115...",0.420733
95872,1102400,5275866,why do bears hibernate,"However, when female pandas are pregnant (or p...",0.0,"[bears, hibernate]","[however, female, pandas, pregnant, pseudopreg...","[0.21554077, 0.35769, -0.18675022, -0.19184287...","[0.015780002, 0.16692501, -0.128705, -0.283115...",0.561383


In [18]:
final_train['DocLen'] = 0
final_train['queryLen'] = 0
for i, row in final_train.iterrows():
  final_train['DocLen'][i] = len(final_train['passage'][i])
  final_train['queryLen'][i] = len(final_train['queries'][i])

In [19]:
validation_data['DocLen'] = 0
validation_data['queryLen'] = 0
for i, row in validation_data.iterrows():
  validation_data['DocLen'][i] = len(validation_data['passage'][i])
  validation_data['queryLen'][i] = len(validation_data['queries'][i])

In [20]:
final_train.shape

(95874, 12)

In [21]:
X = final_train[['cosine_similarity', 'DocLen', 'queryLen','qid','relevancy']]

In [22]:
from sklearn.model_selection import train_test_split
Tr,V = train_test_split(X,test_size = 0.2,stratify = final_train['relevancy'])

In [23]:
from xgboost import DMatrix,train,cv
num_of_features = 3
training_data = Tr.sort_values(by=['qid'], ascending=False)

In [24]:
xTr = training_data[['cosine_similarity', 'DocLen', 'queryLen']].values
yTr = training_data['relevancy'].values
QueryCount_Tr = training_data['qid'].value_counts().to_dict() #qid as key and counts as value

In [25]:
Dgroup_Tr = []
qid_list_Tr = []
for idx, row in training_data.iterrows():
  query = row['qid']
  if query in qid_list_Tr:
    continue
  else:
    qid_list_Tr.append(query)
    Dgroup_Tr.append(QueryCount_Tr[query])

In [26]:
V = V.sort_values(by=['qid'], ascending=False)

In [27]:
xV = V[['cosine_similarity', 'DocLen', 'queryLen']].values
yV = V['relevancy'].values
QueryCount_V = V['qid'].value_counts().to_dict() #qid as key and counts as value

In [28]:
Dgroup_V = []
qid_list_V = []
for idx, row in V.iterrows():
  query = row['qid']
  if query in qid_list_V:
    continue
  else:
    qid_list_V.append(query)
    Dgroup_V.append(QueryCount_V[query])

In [29]:
validation_data = validation_data.sort_values(by=['qid'], ascending=False)
xTe = validation_data[['cosine_similarity', 'DocLen', 'queryLen']].values
yTe = validation_data['relevancy'].values
QueryCount_Te = validation_data['qid'].value_counts().to_dict() #qid as key and counts as value

In [30]:
Dgroup_Te = []
qid_list_Te = []
for idx, row in validation_data.iterrows():
  query = row['qid']
  if query in qid_list_Te:
    continue
  else:
    qid_list_Te.append(query)
    Dgroup_Te.append(QueryCount_Te[query])

In [31]:
Tr_dmatrix = DMatrix(data = xTr, label = yTr)
Te_dmatrix = DMatrix(data = xTe, label = yTe)
V_dmatrix = DMatrix(data = xV, label = yV)
Tr_dmatrix.set_group(Dgroup_Tr)
Te_dmatrix.set_group(Dgroup_Te)
V_dmatrix.set_group(Dgroup_V)

In [32]:
params1 = {'objective': 'rank:ndcg', 'eval_metric': 'ndcg', 'eta': 0.0001, 'max_depth': 5}
#eta is learning rate

In [33]:
lambdaRank_model1 = train(params1, Tr_dmatrix, num_boost_round=500,evals=[(V_dmatrix, 'validation')],verbose_eval =100)

[0]	validation-ndcg:0.9377
[100]	validation-ndcg:0.94004
[200]	validation-ndcg:0.940097
[300]	validation-ndcg:0.939922
[400]	validation-ndcg:0.940003
[499]	validation-ndcg:0.940044


In [34]:
params2 = {'objective': 'rank:ndcg', 'eval_metric': 'ndcg', 'eta': 0.001, 'max_depth': 5}
lambdaRank_model2 = train(params2, Tr_dmatrix, num_boost_round=500,evals=[(V_dmatrix, 'validation')],verbose_eval =100)

[0]	validation-ndcg:0.9377
[100]	validation-ndcg:0.940084
[200]	validation-ndcg:0.941274
[300]	validation-ndcg:0.941309
[400]	validation-ndcg:0.941489
[499]	validation-ndcg:0.94141


In [35]:
params3 = {'objective': 'rank:ndcg', 'eval_metric': 'ndcg', 'eta': 0.01, 'max_depth': 5}
lambdaRank_model3 = train(params3, Tr_dmatrix, num_boost_round=500,evals=[(V_dmatrix, 'validation')],verbose_eval =100)

[0]	validation-ndcg:0.9377
[100]	validation-ndcg:0.945426
[200]	validation-ndcg:0.946998
[300]	validation-ndcg:0.947598
[400]	validation-ndcg:0.948475
[499]	validation-ndcg:0.94855


In [36]:
params4 = {'objective': 'rank:ndcg', 'eval_metric': 'ndcg', 'eta': 0.1, 'max_depth': 5}
lambdaRank_model4 = train(params4, Tr_dmatrix, num_boost_round=500,evals=[(V_dmatrix, 'validation')],verbose_eval =100)

[0]	validation-ndcg:0.9377
[100]	validation-ndcg:0.950241
[200]	validation-ndcg:0.950601
[300]	validation-ndcg:0.950257
[400]	validation-ndcg:0.950551
[499]	validation-ndcg:0.950344


In [37]:
params5 = {'objective': 'rank:ndcg', 'eval_metric': 'ndcg', 'eta': 1, 'max_depth': 5}
lambdaRank_model5 = train(params5, Tr_dmatrix, num_boost_round=500,evals=[(V_dmatrix, 'validation')],verbose_eval =100)

[0]	validation-ndcg:0.9377
[100]	validation-ndcg:0.947339
[200]	validation-ndcg:0.944399
[300]	validation-ndcg:0.942288
[400]	validation-ndcg:0.941448
[499]	validation-ndcg:0.941885


In [38]:
params6 = {'objective': 'rank:ndcg', 'eval_metric': 'ndcg', 'eta': 0, 'max_depth': 5}
lambdaRank_model6 = train(params6, Tr_dmatrix, num_boost_round=500,evals=[(V_dmatrix, 'validation')],verbose_eval =100)

[0]	validation-ndcg:0.916415
[100]	validation-ndcg:0.916415
[200]	validation-ndcg:0.916415
[300]	validation-ndcg:0.916415
[400]	validation-ndcg:0.916415
[499]	validation-ndcg:0.916415


In [39]:
params7 = {'objective': 'rank:ndcg', 'eval_metric': 'ndcg', 'eta': 0.01, 'max_depth': 8}
lambdaRank_model7 = train(params7, Tr_dmatrix, num_boost_round=500,evals=[(V_dmatrix, 'validation')],verbose_eval =100)

[0]	validation-ndcg:0.938624
[100]	validation-ndcg:0.939208
[200]	validation-ndcg:0.940778
[300]	validation-ndcg:0.942221
[400]	validation-ndcg:0.943607
[499]	validation-ndcg:0.943828


In [40]:
params8 = {'objective': 'rank:ndcg', 'eval_metric': 'ndcg', 'eta': 0.01, 'max_depth': 12}
lambdaRank_model8 = train(params8, Tr_dmatrix, num_boost_round=500,evals=[(V_dmatrix, 'validation')],verbose_eval =100)

[0]	validation-ndcg:0.931387
[100]	validation-ndcg:0.93462
[200]	validation-ndcg:0.936569
[300]	validation-ndcg:0.938087
[400]	validation-ndcg:0.939243
[499]	validation-ndcg:0.938835


In [41]:
params9 = {'objective': 'rank:ndcg', 'eval_metric': 'ndcg', 'eta': 0.01}
lambdaRank_model9 = train(params9, Tr_dmatrix, num_boost_round=500,evals=[(V_dmatrix, 'validation')],verbose_eval =100)

[0]	validation-ndcg:0.938293
[100]	validation-ndcg:0.942206
[200]	validation-ndcg:0.944259
[300]	validation-ndcg:0.946183
[400]	validation-ndcg:0.946085
[499]	validation-ndcg:0.946429


In [42]:
# import xgboost as xgb
# model = xgb.XGBRanker(
#     tree_method='exact',
#     booster='gbtree',
#     objective='rank:ndcg',
#     random_state=42,
#     learning_rate=0.06,
#     max_depth=5,
#     n_estimators=700,
#     subsample=0.75,
#     min_child_weight=0.06
#     )

# model.fit(xTr, yTr, group=Dgroup_Tr, verbose=True,eval_set= xV,eval_group =Dgroup_V )

In [43]:
# import scipy.stats as stats
# param_dist = {'n_estimators': stats.randint(40, 1000),
#               'learning_rate': stats.uniform(0.01, 0.59),
#               'subsample': stats.uniform(0.3, 0.6),
#               'max_depth': [3, 4, 5, 6, 7, 8, 9],
#               'colsample_bytree': stats.uniform(0.5, 0.4),
#               'min_child_weight': [0.05, 0.1, 0.02]
#               }

In [44]:
# from sklearn.model_selection import RandomizedSearchCV
# import sklearn
# scoring = sklearn.metrics.make_scorer(sklearn.metrics.ndcg_score, greater_is_better=True)
# clf = RandomizedSearchCV(model,
#                          param_distributions=param_dist,
#                          cv=2,
#                          n_iter=5,  
#                          scoring= scoring,
#                          error_score=0,
#                          verbose=3,
#                          n_jobs=-1)
# xyz = clf.fit(xTr,yTr,fit_params={"model__groups": Dgroup_Tr})

In [45]:
y_pred = lambdaRank_model3.predict(Te_dmatrix)
y_pred.shape
#validation_data.shape

(1103039,)

In [46]:
validation_data['score'] = y_pred
validation_data.head()

Unnamed: 0,qid,pid,queries,passage,relevancy,passage_tokens,query_tokens,passage_embedding,query_embedding,cosine_similarity,DocLen,queryLen,score
918703,1102335,4998265,why do people buy cars,"I would like to buy a clean, very dependable u...",0.0,"['would', 'like', 'buy', 'clean', 'dependable'...","['people', 'buy', 'cars']",[ 0.24620189 0.05969301 0.5865029 -0.039355...,[ 0.74518996 -0.19672668 1.14016 -0.288984...,0.916904,311,22,2.052936
750686,1102335,1979444,why do people buy cars,1 Buy a straight scope if you will be watching...,0.0,"['buy', 'straight', 'scope', 'watching', 'bird...","['people', 'buy', 'cars']",[ 0.32140464 0.2716451 0.2731784 -0.312907...,[ 0.74518996 -0.19672668 1.14016 -0.288984...,0.778136,278,22,0.806581
922978,1102335,5281080,why do people buy cars,"The truth is, contrary to popular mythology, d...",0.0,"['truth', 'contrary', 'popular', 'mythology', ...","['people', 'buy', 'cars']",[ 2.90012389e-01 -2.57377932e-03 4.05004352e-...,[ 0.74518996 -0.19672668 1.14016 -0.288984...,0.91853,268,22,2.113142
783197,1102335,4124116,why do people buy cars,When to buy a used car. Seasonal trends in sal...,0.0,"['buy', 'used', 'car', 'seasonal', 'trends', '...","['people', 'buy', 'cars']",[ 2.75420636e-01 -6.14087358e-02 4.32080090e-...,[ 0.74518996 -0.19672668 1.14016 -0.288984...,0.882939,309,22,1.700156
614272,1102335,1139141,why do people buy cars,People buy a swim spa for a lot of different r...,0.0,"['people', 'buy', 'swim', 'spa', 'lot', 'diffe...","['people', 'buy', 'cars']",[ 0.3271537 0.20768057 -0.05867239 -0.132127...,[ 0.74518996 -0.19672668 1.14016 -0.288984...,0.825189,310,22,1.420088


In [47]:
validation_data['LM_rank'] = validation_data.groupby('qid')['score'].rank(method='first',ascending=False).astype('int')

In [48]:
trial_data = validation_data[['qid','pid','LM_rank','score']]

In [49]:
trial_data = trial_data.reset_index(drop=True)

In [50]:
LM_dict = {}
qid_list = trial_data['qid'].unique()
for qid in qid_list:
    top_ones = trial_data[trial_data['qid'] == qid]
    top_ones = top_ones.reset_index(drop=True)
    top_ones = top_ones.sort_values(by=['LM_rank'])
    LM_dict[qid] = top_ones[:100]

In [58]:
f = open("LM.txt", "w")
for lr_df in LM_dict.values():
    for i, data in lr_df.iterrows():
        qid = str(data['qid'].astype(int))
        pid = str(data['pid'].astype(int))
        score = str(data['score'])
        rank = str(data['LM_rank'].astype(int))
        f.write(qid + "," + "A2" + "," + pid + "," + rank + "," + score + "," + "LM" + "\n")
f.close()

In [52]:
def average_precision_calc(df,retrieved,score,rank):
    average_precision = 0
    qid_list = np.unique(np.asarray(df['qid']))
    ranked_passages = df[df[rank] <= retrieved]

    relevant_passage = ranked_passages[ranked_passages['relevancy'] != 0]
    relevant_passage['rank'] = relevant_passage.groupby('qid')[score].rank(method = 'first',ascending=False)

    for qid in qid_list:
        temp = relevant_passage[relevant_passage['qid'] == qid]
        temp['rank'] = temp['rank']/temp[rank]
        if len(temp) == 0:
            average_precision += 0
        else:
            average_precision += sum(temp['rank'])/len(temp)

    average_precision = average_precision/len(qid_list)
    return average_precision

In [53]:
average_precision_LM = average_precision_calc(validation_data,100,'score','LM_rank')

In [54]:
average_precision_LM

0.06723106384538753

In [55]:
def NDCG_calc(df,retrieved, rank):

    all_DCG = 0
    relevant_passage = df[df['relevancy'] != 0]
    relevant_passage_retrived = relevant_passage[relevant_passage[rank] <= retrieved]

    qid_list = np.unique(np.asarray(df['qid']))

    for qid in qid_list:
        temp = relevant_passage[relevant_passage['qid'] == qid]
        DCG = sum(1/np.log2(np.asarray(temp[rank])+1))
        optDCG = sum(1/np.log2(np.arange(1,len(temp)+1)+1))
        all_DCG += DCG/optDCG
    all_DCG = all_DCG/len(qid_list)

    return all_DCG

In [56]:
NDCG_LM = NDCG_calc(validation_data,100,'LM_rank')

In [57]:
NDCG_LM

0.2019752552793564