In [1]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
pd.options.mode.chained_assignment = None
from tqdm import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
glove_file = '/content/drive/MyDrive/IRDM Coursework 2/glove.6B.50d.txt'
tmp_file = get_tmpfile("test_word2vec.txt")
_ = glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file,binary=False)

In [4]:
def preprocess_single_passage(passage,stop_words=True):
    tokenizer = RegexpTokenizer(r'\w+')
    tok_pass = tokenizer.tokenize(passage)
    tok_pass = [tok for tok in tok_pass if tok.isalpha()]
    tok_pass = [tok.lower() for tok in tok_pass]
    
    if stop_words == True:
        stop_words = stopwords.words('english')
        tokens = [tok for tok in tok_pass if tok not in stop_words]
    else:
        tokens = tok_pass
    return tokens

In [5]:
validation_data = pd.read_csv('/content/drive/MyDrive/IRDM Coursework 2/validation_data.tsv',sep='\t')
print(validation_data.shape)

(1103039, 5)


In [6]:
train_data = pd.read_csv('/content/drive/MyDrive/IRDM Coursework 2/train_data.tsv',sep='\t')
print(train_data.shape)

(4364339, 5)


In [7]:
def negative_sampling(data,k):
  qid_list = np.unique(np.asarray(data['qid']))
  samples = []
  for qid in qid_list:
    pos_temp = data[(data['qid'] == qid) & (data['relevancy'] == 1)]
    neg_temp = data[(data['qid'] == qid) & (data['relevancy'] == 0)]
    samples.append(pos_temp.sample(n=1, random_state=1))
    if len(neg_temp) < k:
      samples.append(neg_temp)
    else:
      samples.append(neg_temp.sample(n=k, random_state=1))
  new_data = pd.concat(samples)
  return new_data.reset_index(drop=True)

In [8]:
final_train = negative_sampling(train_data,10)
final_train = final_train.reset_index(drop=True)

In [9]:
#preprocess queries and turn queries into list of tokens

# query_tokens_dict = {}
# qid_list, ind_list = np.unique(np.asarray(validation_data['qid']),return_index=True)
# for qid, ind in tqdm(zip(qid_list,ind_list)):
#   query_tokens_dict[qid] = preprocess_single_passage(validation_data.loc[ind,'queries'])
# validation_data.loc[:,'query_tokens'] = validation_data['qid'].map(query_tokens_dict)

# passage_tokens_dict = {}
# pid_list, ind_list = np.unique(np.asarray(validation_data['pid']),return_index=True)
# for pid, ind in tqdm(zip(pid_list,ind_list)):
#   passage_tokens_dict[pid] = preprocess_single_passage(validation_data.loc[ind,'passage'])
# validation_data['passage_tokens'] = validation_data['pid'].map(passage_tokens_dict)

In [10]:
#preprocess queries and turn queries into list of tokens

# query_tokens_dict = {}
# qid_list, ind_list = np.unique(np.asarray(final_train['qid']),return_index=True)
# for qid, ind in tqdm(zip(qid_list,ind_list)):
#   query_tokens_dict[qid] = preprocess_single_passage(final_train.loc[ind,'queries'])
# final_train.loc[:,'query_tokens'] = final_train['qid'].map(query_tokens_dict)

# passage_tokens_dict = {}
# pid_list, ind_list = np.unique(np.asarray(final_train['pid']),return_index=True)
# for pid, ind in tqdm(zip(pid_list,ind_list)):
#   passage_tokens_dict[pid] = preprocess_single_passage(final_train.loc[ind,'passage'])
# final_train['passage_tokens'] = final_train['pid'].map(passage_tokens_dict)

In [11]:
final_train.iloc[9952]

qid                                                     393881
pid                                                    2358751
queries                        in what county in ga is canton'
passage      Leave a comment. Filed under --TALBOT COUNTY G...
relevancy                                                  0.0
Name: 9952, dtype: object

In [12]:
final_train['passage_tokens'] = 0
for i,passage in enumerate(tqdm(final_train['passage'])):
  final_train['passage_tokens'][i] = preprocess_single_passage(passage)

100%|██████████| 50341/50341 [00:18<00:00, 2795.21it/s]


In [13]:
final_train['query_tokens'] = 0
for i,query in enumerate(tqdm(final_train['queries'])):
  final_train['query_tokens'][i] = preprocess_single_passage(query)

100%|██████████| 50341/50341 [00:09<00:00, 5363.89it/s]


In [14]:
validation_data['passage_tokens'] = 0
for i,passage in enumerate(tqdm(validation_data['passage'])):
  validation_data['passage_tokens'][i] = preprocess_single_passage(passage)

100%|██████████| 1103039/1103039 [06:55<00:00, 2655.36it/s]


In [15]:
validation_data['query_tokens'] = 0
for i,query in enumerate(tqdm(validation_data['queries'])):
  validation_data['query_tokens'][i] = preprocess_single_passage(query)

100%|██████████| 1103039/1103039 [03:43<00:00, 4935.08it/s]


In [16]:
def get_embedding(tokens):
  '''
  INPUT
  tokens: a list of tokens
  OUTPUT
  average embedding: a vectorb represents average embedding of the input list of tokens
  '''
  embedding = 0
  nom = len(tokens)
  for token in tokens:
    if token not in model:
      nom -= 1
    else:
      embedding += model[token]
  try:
    return embedding/nom
  except:
    return 0

In [17]:
passage_embedding_dict = {}
pid_list, ind_list = np.unique(np.asarray(final_train['pid']),return_index=True)
for pid,ind in tqdm(zip(pid_list,ind_list)):
  passage_embedding_dict[pid] = get_embedding(final_train.loc[ind,'passage_tokens']) 
final_train['passage_embedding'] = final_train['pid'].map(passage_embedding_dict)

48723it [00:06, 6979.30it/s]


In [18]:
query_embedding_dict = {}
qid_list, ind_list = np.unique(np.asarray(final_train['qid']),return_index=True)
for qid,ind in tqdm(zip(qid_list,ind_list)):
  query_embedding_dict[qid] = get_embedding(final_train.loc[ind,'query_tokens']) 
final_train['query_embedding'] = final_train['qid'].map(query_embedding_dict)

4590it [00:00, 26241.84it/s]


In [19]:
passage_embedding_dict = {}
pid_list, ind_list = np.unique(np.asarray(validation_data['pid']),return_index=True)
for pid,ind in tqdm(zip(pid_list,ind_list)):
  passage_embedding_dict[pid] = get_embedding(validation_data.loc[ind,'passage_tokens']) 
validation_data['passage_embedding'] = validation_data['pid'].map(passage_embedding_dict)

955211it [02:11, 7271.50it/s]


In [20]:
query_embedding_dict = {}
qid_list, ind_list = np.unique(np.asarray(validation_data['qid']),return_index=True)
for qid,ind in tqdm(zip(qid_list,ind_list)):
  query_embedding_dict[qid] = get_embedding(validation_data.loc[ind,'query_tokens']) 
validation_data['query_embedding'] = validation_data['qid'].map(query_embedding_dict)

1148it [00:00, 30260.94it/s]


In [21]:
# validation_data['passage_embedding'] = np.empty((len(validation_data), 0)).tolist()
# sum_em = 0
# for i,tkn in enumerate(tqdm(validation_data['passage_tokens'])):
#   for t in tkn:
#     v = 0
#     if t not in model:
#       v+=1
#       continue
#     else:
#       sum_em += model[t]
#   validation_data['passage_embedding'][i] = [x/(len(tkn)-v) for x in sum_em]

In [22]:
# def pid_emb(pid):
#   em=0
#   sum_em = 0
#   tkns = validation_data['passage_tokens'][pid]
#   for tkn in tkns:
#     v = 0
#     if tkn not in model:
#       v+=1
#       continue
#     else:
#       sum_em += model[tkn]
#     em = [x/(len(tkn)-v) for x in sum_em]
#   return em

# key_list = np.unique(validation_data['pid'])
# passage_emb_dict = {}
# for i,key in enumerate(tqdm(key_list)):
#   passage_emb_dict[key] = pid_emb(key)

In [23]:
# new_unique_qid = validation_data.drop_duplicates(subset=['qid'])

In [24]:
# new_unique_qid = new_unique_qid.set_index('qid')

In [25]:
# def qid_emb(qid):
#   em=0
#   sum_em = 0
#   tkns = validation_data['query_tokens'][qid]
#   for tkn in tkns:
#     v = 0
#     if tkn not in model:
#       v+=1
#       continue
#     else:
#       sum_em += model[tkn]
#     em = [x/(len(tkn)-v) for x in sum_em]
#   return em

In [26]:
# key_list = np.unique(validation_data['qid'])
# query_emb_dict = {}
# for i,key in enumerate(tqdm(key_list)):
#   query_emb_dict[key] = qid_emb(key)

In [27]:
# validation_data['query_embedding'] = validation_data['qid'].map(query_emb_dict)

In [28]:
def cosine_similarity(data):
  temp = []
  for i in tqdm(range(len(data))):
    denom = np.dot(data.loc[i,'query_embedding'],data.loc[i,'passage_embedding'])
    nom = np.sqrt(np.square(data.loc[i,'query_embedding']).sum())*np.sqrt(np.square(data.loc[i,'passage_embedding']).sum())
    if nom == 0:
      temp.append(0)
    else:
      temp.append(denom/nom)
  data['cosine_similarity'] = temp
  return data

In [29]:
cosine_similarity(validation_data)

100%|██████████| 1103039/1103039 [02:18<00:00, 7948.42it/s]


Unnamed: 0,qid,pid,queries,passage,relevancy,passage_tokens,query_tokens,passage_embedding,query_embedding,cosine_similarity
0,1082792,1000084,what does the golgi apparatus do to the protei...,"Start studying Bonding, Carbs, Proteins, Lipid...",0.0,"[start, studying, bonding, carbs, proteins, li...","[golgi, apparatus, proteins, lipids, arrive]","[0.2690694, 0.07032246, -0.22095296, -0.134478...","[0.832084, -0.16500281, 0.050816, 0.16417, 0.4...",0.617340
1,995825,1000492,where is the graphic card located in the cpu,"For example, a “PC Expansion Card” maybe the j...",0.0,"[example, pc, expansion, card, maybe, jargon, ...","[graphic, card, located, cpu]","[0.118339844, 0.1926797, 0.47564104, 0.2192803...","[-0.017310008, 0.32846758, 0.68064654, 0.68913...",0.831845
2,995825,1000494,where is the graphic card located in the cpu,The Common Cards & Buses. The most common type...,0.0,"[common, cards, buses, common, types, expansio...","[graphic, card, located, cpu]","[0.020571431, 0.11528355, 0.52425003, 0.421483...","[-0.017310008, 0.32846758, 0.68064654, 0.68913...",0.847569
3,1091246,1000522,property premises meaning,The occurrence of since tells us that the firs...,0.0,"[occurrence, since, tells, us, first, statemen...","[property, premises, meaning]","[0.24477586, 0.12094858, -0.1262933, -0.035436...","[0.5320867, 0.13899668, -0.18879335, -0.206136...",0.751343
4,1047854,1000585,what is printing mechanism,Windows desktop applications Develop Desktop t...,0.0,"[windows, desktop, applications, develop, desk...","[printing, mechanism]","[0.21993366, -0.27575365, 0.6672145, 0.2243792...","[0.17438498, -0.67538, -0.050255, -0.1327915, ...",0.813332
...,...,...,...,...,...,...,...,...,...,...
1103034,176994,999706,dynamic link library meaning,Internet Public Library – The Internet Public ...,0.0,"[internet, public, library, internet, public, ...","[dynamic, link, library, meaning]","[0.14674158, 0.34966046, 0.017511245, 0.040230...","[0.3431825, 0.6234838, 0.048773758, 0.18839249...",0.858977
1103035,1089177,999765,united home life insurance phone number,Geico Customer Service Phone Number: 1-800-861...,0.0,"[geico, customer, service, phone, number, call...","[united, home, life, insurance, phone, number]","[0.45963958, 0.10225543, 0.6842066, 0.4092991,...","[0.28253, 0.2855925, 0.43787614, 0.13951649, 0...",0.894578
1103036,42555,999824,average salary for primary care sports medicin...,The current average NBA salary of $6.2 million...,0.0,"[current, average, nba, salary, million, seaso...","[average, salary, primary, care, sports, medic...","[-0.47118738, 0.523908, 0.34010562, 0.17925736...","[-0.16498211, 0.5072343, 0.055729706, -0.42456...",0.851662
1103037,1044249,999824,what is the average salary of an nba player,The current average NBA salary of $6.2 million...,0.0,"[current, average, nba, salary, million, seaso...","[average, salary, nba, player]","[-0.47118738, 0.523908, 0.34010562, 0.17925736...","[-1.3559026, 0.6446125, 0.39901552, 0.1891925,...",0.905307


In [30]:
validation_data.isnull().sum()

qid                  0
pid                  0
queries              0
passage              0
relevancy            0
passage_tokens       0
query_tokens         0
passage_embedding    0
query_embedding      0
cosine_similarity    0
dtype: int64

In [31]:
cosine_similarity(final_train)

100%|██████████| 50341/50341 [00:04<00:00, 10448.85it/s]


Unnamed: 0,qid,pid,queries,passage,relevancy,passage_tokens,query_tokens,passage_embedding,query_embedding,cosine_similarity
0,2,4339068,Androgen receptor define,"The androgen receptor (AR), also known as NR3C...",1.0,"[androgen, receptor, ar, also, known, nuclear,...","[androgen, receptor, define]","[0.49091795, 0.29157883, 0.11163568, 0.1308564...","[0.65137, 0.6691633, -0.05296333, 0.198797, 0....",0.874866
1,2,7279219,Androgen receptor define,: ligand binding to a G protein-coupled recept...,0.0,"[ligand, binding, g, protein, coupled, recepto...","[androgen, receptor, define]","[0.70091456, 0.38167945, 0.17395169, 0.4451514...","[0.65137, 0.6691633, -0.05296333, 0.198797, 0....",0.804005
2,2,6229171,Androgen receptor define,When insulin binds to the receptor on the cell...,0.0,"[insulin, binds, receptor, cell, surface, rece...","[androgen, receptor, define]","[0.8574829, 0.10805729, 0.43395844, 0.2213999,...","[0.65137, 0.6691633, -0.05296333, 0.198797, 0....",0.752639
3,2,2946560,Androgen receptor define,1. exteroceptor-any receptor that responds to ...,0.0,"[exteroceptor, receptor, responds, stimuli, ou...","[androgen, receptor, define]","[0.7432729, 0.17395918, -0.2440114, -0.0124907...","[0.65137, 0.6691633, -0.05296333, 0.198797, 0....",0.660131
4,2,4803916,Androgen receptor define,a device that measures the quantity of radiati...,0.0,"[device, measures, quantity, radiation, reache...","[androgen, receptor, define]","[0.484962, 0.23540024, 0.50870895, -0.0826678,...","[0.65137, 0.6691633, -0.05296333, 0.198797, 0....",0.642420
...,...,...,...,...,...,...,...,...,...,...
50336,1102400,4236937,why do bears hibernate,although pikas live in colonies they are very ...,0.0,"[although, pikas, live, colonies, territorial,...","[bears, hibernate]","[0.21388185, 0.02184949, -0.23158751, -0.12833...","[0.015780002, 0.16692501, -0.128705, -0.283115...",0.521109
50337,1102400,5703282,why do bears hibernate,Systemd based method to suspend or hibernate y...,0.0,"[systemd, based, method, suspend, hibernate, l...","[bears, hibernate]","[0.29989046, -0.2653506, 0.249474, 0.13818963,...","[0.015780002, 0.16692501, -0.128705, -0.283115...",0.422469
50338,1102400,3762253,why do bears hibernate,"A symbol of the Arctic, polar bears are the wo...",0.0,"[symbol, arctic, polar, bears, world, largest,...","[bears, hibernate]","[0.15273103, 0.44464138, -0.051849578, 0.00197...","[0.015780002, 0.16692501, -0.128705, -0.283115...",0.472612
50339,1102400,239157,why do bears hibernate,"Grizzly bears, like black bears, eat a lot of ...",0.0,"[grizzly, bears, like, black, bears, eat, lot,...","[bears, hibernate]","[0.111869894, 0.037523024, -0.355963, -0.26498...","[0.015780002, 0.16692501, -0.128705, -0.283115...",0.644838


In [32]:
final_train['cosine_similarity'].isnull().sum()

0

In [33]:
validation_data['cosine_similarity'].isnull().sum()

0

In [34]:
validation_data.to_csv('validation_data_cosine.csv')
!cp validation_data_cosine.csv "/content/drive/MyDrive/IRDM Coursework 2"

In [35]:
final_train.to_csv('train_data_cosine.csv')
!cp train_data_cosine.csv "/content/drive/MyDrive/IRDM Coursework 2"

LOGISTIC

In [36]:
final_train = pd.read_csv('/content/drive/MyDrive/IRDM Coursework 2/train_data_cosine.csv',index_col=0)

In [37]:
final_train.head()

Unnamed: 0,qid,pid,queries,passage,relevancy,passage_tokens,query_tokens,passage_embedding,query_embedding,cosine_similarity
0,2,4339068,Androgen receptor define,"The androgen receptor (AR), also known as NR3C...",1.0,"['androgen', 'receptor', 'ar', 'also', 'known'...","['androgen', 'receptor', 'define']",[ 4.9091795e-01 2.9157883e-01 1.1163568e-01 ...,[ 0.65137 0.6691633 -0.05296333 0.198797...,0.874866
1,2,7279219,Androgen receptor define,: ligand binding to a G protein-coupled recept...,0.0,"['ligand', 'binding', 'g', 'protein', 'coupled...","['androgen', 'receptor', 'define']",[ 0.70091456 0.38167945 0.17395169 0.445151...,[ 0.65137 0.6691633 -0.05296333 0.198797...,0.804005
2,2,6229171,Androgen receptor define,When insulin binds to the receptor on the cell...,0.0,"['insulin', 'binds', 'receptor', 'cell', 'surf...","['androgen', 'receptor', 'define']",[ 0.8574829 0.10805729 0.43395844 0.221399...,[ 0.65137 0.6691633 -0.05296333 0.198797...,0.752639
3,2,2946560,Androgen receptor define,1. exteroceptor-any receptor that responds to ...,0.0,"['exteroceptor', 'receptor', 'responds', 'stim...","['androgen', 'receptor', 'define']",[ 0.7432729 0.17395918 -0.2440114 -0.012490...,[ 0.65137 0.6691633 -0.05296333 0.198797...,0.660131
4,2,4803916,Androgen receptor define,a device that measures the quantity of radiati...,0.0,"['device', 'measures', 'quantity', 'radiation'...","['androgen', 'receptor', 'define']",[ 0.484962 0.23540024 0.50870895 -0.082667...,[ 0.65137 0.6691633 -0.05296333 0.198797...,0.64242


In [38]:
validation_data = pd.read_csv('/content/drive/MyDrive/IRDM Coursework 2/validation_data_cosine.csv',index_col=0)

In [39]:
validation_data.head()

Unnamed: 0,qid,pid,queries,passage,relevancy,passage_tokens,query_tokens,passage_embedding,query_embedding,cosine_similarity
0,1082792,1000084,what does the golgi apparatus do to the protei...,"Start studying Bonding, Carbs, Proteins, Lipid...",0.0,"['start', 'studying', 'bonding', 'carbs', 'pro...","['golgi', 'apparatus', 'proteins', 'lipids', '...",[ 0.2690694 0.07032246 -0.22095296 -0.134478...,[ 0.832084 -0.16500281 0.050816 0.16417 ...,0.61734
1,995825,1000492,where is the graphic card located in the cpu,"For example, a “PC Expansion Card” maybe the j...",0.0,"['example', 'pc', 'expansion', 'card', 'maybe'...","['graphic', 'card', 'located', 'cpu']",[ 1.18339844e-01 1.92679703e-01 4.75641042e-...,[-0.01731001 0.32846758 0.68064654 0.689135...,0.831845
2,995825,1000494,where is the graphic card located in the cpu,The Common Cards & Buses. The most common type...,0.0,"['common', 'cards', 'buses', 'common', 'types'...","['graphic', 'card', 'located', 'cpu']",[ 0.02057143 0.11528355 0.52425003 0.421483...,[-0.01731001 0.32846758 0.68064654 0.689135...,0.847569
3,1091246,1000522,property premises meaning,The occurrence of since tells us that the firs...,0.0,"['occurrence', 'since', 'tells', 'us', 'first'...","['property', 'premises', 'meaning']",[ 0.24477586 0.12094858 -0.1262933 -0.035436...,[ 0.5320867 0.13899668 -0.18879335 -0.206136...,0.751343
4,1047854,1000585,what is printing mechanism,Windows desktop applications Develop Desktop t...,0.0,"['windows', 'desktop', 'applications', 'develo...","['printing', 'mechanism']",[ 0.21993366 -0.27575365 0.6672145 0.224379...,[ 0.17438498 -0.67538 -0.050255 -0.132791...,0.813332


In [40]:
final_train['DocLen'] = 0
final_train['queryLen'] = 0
for i, row in final_train.iterrows():
  final_train['DocLen'][i] = len(final_train['passage'][i])
  final_train['queryLen'][i] = len(final_train['queries'][i])

In [41]:
validation_data['DocLen'] = 0
validation_data['queryLen'] = 0
for i, row in validation_data.iterrows():
  validation_data['DocLen'][i] = len(validation_data['passage'][i])
  validation_data['queryLen'][i] = len(validation_data['queries'][i])

In [42]:
xTr = final_train[['cosine_similarity','DocLen','queryLen']]
# xTr = final_train[['cosine_similarity']]
yTr = final_train['relevancy']
xTe = validation_data[['cosine_similarity','DocLen','queryLen']]
# xTe = validation_data[['cosine_similarity']]
yTe = validation_data['relevancy']

In [43]:
class LogisticReg:
  def __init__(self,lr,n_iters):
    self.lr = lr
    self.n_iters = n_iters
    self.weights = None
    self.bias = None
  
  def fit(self,X,y):
    n,d = X.shape
    self.weights = np.ones(d)
    self.bias = 0 #np.zeros(n)

    #gradient descent 
    for _ in range(self.n_iters):
      linear = np.dot(X,self.weights) + self.bias #xw+b
      y_predicted = self.sigmoid(linear) #sigmoid
      #caluclate  gradients for bias and weights
      dw = (1/n) * np.dot(X.T,(y_predicted-y)) #2 is scaling
      db = (1/n) * np.sum(y_predicted-y)
      #update them
      self.weights -= self.lr * dw
      self.bias -= self.lr * db

  def predict(self,X):
    linear = np.dot(X,self.weights) + self.bias
    y_predicted = self.sigmoid(linear)
    class_pred = [1 if i>=0.5 else 0 for i in y_predicted]
    return class_pred, y_predicted

  def sigmoid(self,z):
    return 1/(1+np.exp(-z))

In [44]:
regressor = LogisticReg(lr = 0.1,n_iters = 1000)

In [45]:
regressor.fit(xTr,yTr)



In [46]:
y_pred,prob = regressor.predict(xTe)



In [47]:
train_pred,prob2 = regressor.predict(xTr)



In [49]:
from sklearn.metrics import accuracy_score

In [50]:
accuracy_score(yTr, train_pred)

0.9088218350847221

In [51]:
regressor.weights

array([  1.53959932, -23.7192423 ,  -1.28640197])

In [52]:
from sklearn.metrics import accuracy_score
accuracy_score(yTe, y_pred)

0.9989048437997206

In [53]:
regressor1 = LogisticReg(lr = 0.01,n_iters = 1000)
regressor1.fit(xTr,yTr)
y_pred1,prob1 = regressor1.predict(xTe)
accuracy_score(yTe, y_pred1)



0.9963627759308601

In [54]:
regressor2 = LogisticReg(lr = 0.001,n_iters = 1000)
regressor2.fit(xTr,yTr)
y_pred2,prob2 = regressor2.predict(xTe)
accuracy_score(yTe, y_pred2)

0.9977317211812093

In [55]:
regressor3 = LogisticReg(lr = 0,n_iters = 1000)
regressor3.fit(xTr,yTr)
y_pred3,prob3 = regressor3.predict(xTe)
accuracy_score(yTe, y_pred3)

0.00109515620027941

In [56]:
regressor4 = LogisticReg(lr = 1,n_iters = 1000)
regressor4.fit(xTr,yTr)
y_pred4,prob4 = regressor4.predict(xTe)
accuracy_score(yTe, y_pred4)



0.9989048437997206

In [57]:
validation_data['LogitRegr_prob'] = prob2
validation_data['LogitRegr_label'] = y_pred2

In [58]:
#For every qid rank results. So each rank comes almost 1148 times as its unique query length
validation_data['Logit_rank'] = validation_data.groupby('qid')['LogitRegr_prob'].rank(method='first',ascending=False).astype('int')

In [59]:
trial_data = validation_data[['qid','pid','Logit_rank','LogitRegr_prob']]

In [60]:
trial_data.head()

Unnamed: 0,qid,pid,Logit_rank,LogitRegr_prob
0,1082792,1000084,18,0.370778
1,995825,1000492,632,0.02517
2,995825,1000494,642,0.023438
3,1091246,1000522,482,0.028967
4,1047854,1000585,997,1.6e-05


In [61]:
LR_dict = {}
qid_list = trial_data['qid'].unique()
for qid in qid_list:
    top_ones = trial_data[trial_data['qid'] == qid]
    top_ones = top_ones.reset_index(drop=True)
    top_ones = top_ones.sort_values(by=['Logit_rank'])
    LR_dict[qid] = top_ones[:100]
    # top_100 = top_ones.sort_values(by=top_ones['Logit_rank'])
    # LR_dict[qid] = top_100

In [62]:
f = open("LR.txt", "w")
for lr_df in LR_dict.values():
    for i, data in lr_df.iterrows():
        qid = str(data['qid'].astype(int))
        pid = str(data['pid'].astype(int))
        score = str(data['LogitRegr_prob'])
        rank = str(data['Logit_rank'].astype(int))
        f.write(qid + "," + "A2" + "," + pid + "," + rank + "," + score + "," + "LR" + "\n")
f.close()

In [63]:
def average_precision_calc(df,retrieved,score,rank):
    average_precision = 0
    qid_list = np.unique(np.asarray(df['qid']))
    ranked_passages = df[df[rank] <= retrieved]

    relevant_passage = ranked_passages[ranked_passages['relevancy'] != 0]
    relevant_passage['rank'] = relevant_passage.groupby('qid')[score].rank(method = 'first',ascending=False)

    for qid in qid_list:
        temp = relevant_passage[relevant_passage['qid'] == qid]
        temp['rank'] = temp['rank']/temp[rank]
        if len(temp) == 0:
            average_precision += 0
        else:
            average_precision += sum(temp['rank'])/len(temp)

    average_precision = average_precision/len(qid_list)
    return average_precision

In [64]:
average_precision_LR = average_precision_calc(validation_data,100,'LogitRegr_prob','Logit_rank')

In [65]:
average_precision_LR

0.015994317626481853

In [66]:
def NDCG_calc(df,retrieved, rank):

    all_DCG = 0
    relevant_passage = df[df['relevancy'] != 0]
    relevant_passage_retrived = relevant_passage[relevant_passage[rank] <= retrieved]

    qid_list = np.unique(np.asarray(df['qid']))

    for qid in qid_list:
        temp = relevant_passage[relevant_passage['qid'] == qid]
        DCG = sum(1/np.log2(np.asarray(temp[rank])+1))
        optDCG = sum(1/np.log2(np.arange(1,len(temp)+1)+1))
        all_DCG += DCG/optDCG
    all_DCG = all_DCG/len(qid_list)

    return all_DCG

In [67]:
NDCG_LR = NDCG_calc(validation_data,100,'Logit_rank')

In [68]:
NDCG_LR

0.13809445313492075