# Data Preprocessing

In [None]:
!pip install transformers
import transformers
from transformers import BertTokenizerFast, BertModel

In [4]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm 

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

col=['qid','pid','query','passage','relevancy']

train_data=pd.read_csv("part2/train_data.tsv", sep='\t', header=None, names=col)
train_data=pd.DataFrame(train_data)
train_data = train_data.iloc[1:]

train_passages = train_data['passage'].values
train_queries = train_data['query'].values
train_pids = train_data['pid'].values.astype(np.int64)
train_qids = train_data['qid'].values.astype(np.int64)
train_labels = train_data['relevancy'].values.astype(np.float64).astype(np.int64)



test_data=pd.read_csv("part2/validation_data.tsv", sep='\t', header=None, names=col)
test_data=pd.DataFrame(test_data)
test_data = test_data.iloc[1:]

test_passages = test_data['passage'].values
test_queries = test_data['query'].values
test_pids = test_data['pid'].values.astype(np.int64)
test_qids = test_data['qid'].values.astype(np.int64)
test_labels = test_data['relevancy'].values.astype(np.float64).astype(np.int64)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertModel.from_pretrained("prajjwal1/bert-tiny").to(device)
model.eval()

In [4]:
class DataSeq(Dataset):
    '''
        Dataset for generating tokens of sequences.
    '''
    def __init__(self, queries, passages, labels):
        self.queries = queries
        self.passages = passages
        self.labels = labels
    
    def __getitem__(self, index):
        query, passage, label = self.queries[index], self.passages[index], self.labels[index]
        
        ids_query = tokenizer.batch_encode_plus([query], add_special_tokens=False, padding='max_length', max_length=50, truncation=True)
        ids_passage = tokenizer.batch_encode_plus([passage], add_special_tokens=False, padding='max_length', max_length=300, truncation=True)

        return np.array([ids_query['input_ids'], ids_query['attention_mask']]), np.array([ids_passage['input_ids'], ids_passage['attention_mask']]), label
        # return np.array([ids_query['input_ids'], ids_query['attention_mask']]), np.array([ids_passage['input_ids'], ids_passage['attention_mask']])
    def __len__(self):
        return len(self.labels)

def generate_embedding(queries, passages, labels, section):
    '''
        This function is used for generating embeddings using pre-processed data sequences.

        ***Note that if GPU memory runs out, we should decrease batchsize

        Input:
            -sequences: list, processed sequences
            -labels: list/np.array, corresponding labels for sequences
            -section: str, "train", "test", "dev", indicating which section we are loading.

        Output:
            -embedding-repre: np.array, embedding representation of selected data set.
    '''
    bs = 128
    dataset = DataSeq(queries=queries, passages=passages, labels=labels)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=bs, shuffle=False, num_workers=2)

    progress_bar = tqdm(dataloader)
    all_embedding_q = []
    all_embedding_p = []
    with torch.no_grad():
        for i, (ids_q, ids_p, label) in enumerate(progress_bar):
            input_ids_q = torch.squeeze(ids_q[:,0,:,:]).to(device)
            attention_mask_q = torch.squeeze(ids_q[:,1,:,:]).to(device)   
            query_length = torch.count_nonzero(attention_mask_q, 1).unsqueeze(1)                 
            embedding_q = model(input_ids=input_ids_q,attention_mask=attention_mask_q)[0]
            attention_mask_q = attention_mask_q.unsqueeze(2).repeat(1, 1, 128) 
            embedding_q = torch.sum(embedding_q * attention_mask_q, 1) / query_length #bs, embedding_size

            input_ids_p = torch.squeeze(ids_p[:,0,:,:]).to(device)
            attention_mask_p = torch.squeeze(ids_p[:,1,:,:]).to(device)   
            passage_length = torch.count_nonzero(attention_mask_p, 1).unsqueeze(1)                                
            embedding_p = model(input_ids=input_ids_p,attention_mask=attention_mask_p)[0]
            attention_mask_p = attention_mask_p.unsqueeze(2).repeat(1, 1, 128) 
            embedding_p = torch.sum(embedding_p * attention_mask_p, 1) / passage_length #bs, embedding_size

            all_embedding_q.append(embedding_q.cpu().numpy())
            all_embedding_p.append(embedding_p.cpu().numpy())

    all_embedding_q = np.concatenate(all_embedding_q[:])
    all_embedding_p = np.concatenate(all_embedding_p[:])
    np.save('embedding_queries_{0:}.npy'.format(section), all_embedding_q)
    np.save('embedding_passages_{0:}.npy'.format(section), all_embedding_p)

    return all_embedding_q, all_embedding_p

# Task1

In [None]:
import numpy as np
import re
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from scipy import sparse

def Average_Precision(true_label):
    '''
        input: true_label, list, this list contains the truth label of retrieved passage with order. 
        For example, the score for 5 retrieved passages are S = [0.2, 0.3, 0.4, 0.5, 0.6], the truth labels of these passages are T = [1, 1, 0, 0, 0]
        Then the input true_label is sorted T, which is sorted by S, as T' = [0, 0, 0, 1, 1]
        This means that L' contains both order information as well as truth label information.
    '''
    # AP for single query
    rela_idx = np.where(true_label == 1)[0]
    n_rela_passage = len(rela_idx)
    denom = rela_idx + 1
    numerator = np.arange(1, n_rela_passage+1)
    return (numerator/denom/n_rela_passage).sum()

def NDCG(true_label):
    '''
        Input: true_label, same as the input of Average_Precision.
    '''
    # NDCG for single query
    DCG = np.sum((2**true_label - 1) / np.log2(np.arange(1, len(true_label)+1) + 1))
    n_rela_passage = int(np.sum(true_label))
    opt_rela_score = np.zeros(len(true_label))
    opt_rela_score[:n_rela_passage] = 1
    optDCG = np.sum((2**opt_rela_score - 1) / np.log2(np.arange(1, len(true_label)+1) + 1))
    return DCG/optDCG if optDCG != 0 else 0




In [None]:

def text_preprocessing(f):
    vocab = dict()
    lines = [re.sub(u"([^\u0061-\u007a\u0030-\u0039\u0020])", "", line.strip('\n').lower()) for line in f]
    for line in lines:
        line = line.split(' ')
        line.remove('') if '' in line else line
        for word in line:
            if word in vocab:
                vocab[word] += 1
            else:
                vocab[word] = 1
    del vocab['']
    vocab = sorted(vocab.items(), key = lambda item: item[1], reverse=True)
    return vocab


In [None]:
# read passage
whole_passage = []
with open('part2/validation_data.tsv', 'r') as f: 
    i = 0
    for line in f.readlines():
        if i == 0:
            pass
            i += 1
        else:
            line = line.strip('\n').lower().split('\t')
            line[0] = int(line[0]) #qid
            line[1] = int(line[1]) #pid
            line[2] = re.sub(u"([^/u0061-\u007a\u0030-\u0039\u0020])", "", line[2]) #query  
            line[3] = re.sub(u"([^\u0061-\u007a\u0030-\u0039\u0020])", "", line[3]) #passage
            line[4] = int(float(line[4])) #relavance score
            whole_passage.append(line)

    


In [None]:
whole_passage = np.array(whole_passage)

pid = whole_passage[:, 1].astype(np.int64)

rank_pid, idx_p = np.unique(pid, return_index=True)

whole_passage_p = whole_passage[idx_p]

uni_passage = whole_passage_p[:, -2]

vocab = text_preprocessing(uni_passage)

In [None]:
# remove stop words in vocab
num_stop_words = 20
new_vocab = np.array(np.array(vocab)[num_stop_words: ])[:, 0]
new_vocab_dict = {}

for i, word in enumerate(new_vocab):
    new_vocab_dict[word] = i

In [None]:
#construct inverted_idx matrix
row,col,data = [],[],[]
Ld = []
for i, line in enumerate(tqdm(uni_passage)):
    line = line.split(' ')
    line.remove('') if '' in line else line
    Ld.append(len(line))

    line_vocab = {}
        
    for word in line:
        #column is in order with the frequency of word in vocabulary
        if word in new_vocab_dict:
            row.append(i)
            col.append(new_vocab_dict[word])
            data.append(1)

inverted_idx = sparse.csr_matrix((data, (row, col)), shape=(len(uni_passage), len(new_vocab_dict)))


In [None]:
# #calculate idf_bm25 of new_vocab 
num_passage = inverted_idx.shape[0]
num_vocab = inverted_idx.shape[1]
idf_bm25 = np.zeros(num_vocab)
block = 5000

for i in tqdm(range(0, (num_vocab // block)*block, block)):
    temp = inverted_idx[:, i:i+block].toarray()  
    count_zero_temp = np.count_nonzero(temp, axis=0) 
    idf_bm25[i:i+block] = np.log(((num_passage - count_zero_temp) + 0.5)/ (count_zero_temp + 0.5))
    

temp = inverted_idx[:, i+block:].toarray()
idf_bm25[i+block:] =  np.log((num_passage - np.count_nonzero(temp, axis=0) + 0.5)/ (np.count_nonzero(temp, axis=0) + 0.5))
 

In [None]:
idf_bm25 = np.load('idf_bm25.npy')

In [None]:
qid = whole_passage[:, 0].astype(np.int64)

_, query_idx = np.unique(qid, return_index=True)

query = whole_passage[:, 2][query_idx] #ranked query

In [None]:
ranked_qid = qid[np.argsort(qid)]
last = ranked_qid[0]
counter = []
cur_counter = []
for i, id in enumerate(tqdm(ranked_qid)):
    cur = id
    if cur != last:
        counter.append(cur_counter)
        cur_counter = [i]
    else:
        cur_counter.append(i)
    last = cur
counter.append(cur_counter)

ranked_qid_whole_passage = whole_passage[np.argsort(qid)]

qid_top1000 = []
qid_top1000_relavance = []

pid_top1000 = ranked_qid_whole_passage[:, 1].astype(np.int64) #ranked qid corresponding pid
pid_top1000_relavance = ranked_qid_whole_passage[:, -1].astype(np.int64) #ranked qid core

for count in tqdm(counter):
    qid_top1000.append(pid_top1000[count]) #takeout corresponding 1000 candidate
    qid_top1000_relavance.append(pid_top1000_relavance[count]) #take out corresponding 1000 candidate relavance score

    

In [None]:
def cal_score_bm25(query, idf_bm25, inverted_idx, new_vocab_dict, rank_pid, Ld):
    #hyperparameters for BM25
    k1 = 1.2
    k2 = 100
    b = 0.75

    #initialize store box
    res_score = []
    res_pid = []
    res_qid = []
    res_relavance = []

    m_ap = 0
    m_ndcg = 0

    # deal with one query at a time
    for i, line in enumerate(tqdm(query)):
        line = line.split(' ')
        line.remove('') if '' in line else line

        temp = np.zeros(inverted_idx.shape[1])

        line_vocab = {}

        #construct small dictionary for one single query
        for word in line:
            if word in line_vocab:
                line_vocab[word] += 1
            else:
                line_vocab[word] = 1

        #record the frequency 
        for word in line:
            if word in new_vocab_dict:
                temp[new_vocab_dict[word]] = line_vocab[word]

        #calculate the tf for one single query
        tf_q = temp
        if np.sum(temp) == 0:
            print('Too many stop words have been deleted, please change the number of it.')

        #take out words in query (those elements have non-zero value in tf_q)
        nonzero_idx = np.nonzero(tf_q)[0]
        tf_q = tf_q[nonzero_idx]

        #take out passages-sub-matrix in corresponding position
        p_idf = idf_bm25[nonzero_idx].reshape(1, -1)
        
        p_tf_doc = inverted_idx[:, nonzero_idx].toarray()

        #take out corresponding 1000 passages
        candidate_idx = np.zeros(qid_top1000[i].shape[0])
        for p, candidate in enumerate(qid_top1000[i]):
            cur_idx = np.where(rank_pid == candidate)[0]
            candidate_idx[p] = cur_idx
        candidate_idx = candidate_idx.astype(np.int64)
        p_tf_doc = p_tf_doc[candidate_idx]

        temp_rank_pid = qid_top1000[i]
        temp_rank_relevance = qid_top1000_relavance[i]
        
        L = (Ld/np.mean(Ld))[candidate_idx].reshape(-1, 1)

        
        #calculate score, take out corresponding top 100 pid
        nonzero_p_idx = np.nonzero(np.sum(p_tf_doc, axis=1))[0]

        temp_score = np.zeros(len(qid_top1000[i]))
        temp_score[nonzero_p_idx] = np.sum(p_idf * (k1 + 1) * p_tf_doc[nonzero_p_idx] * (k2 + 1) * tf_q / ((k1*((1-b) + b*(L[nonzero_p_idx])) + p_tf_doc[nonzero_p_idx]) * (k2 + tf_q)), axis=1)
        temp_res_pid = temp_rank_pid[np.argsort(temp_score)[::-1][:100]]
        temp_res_score = temp_score[np.argsort(temp_score)[::-1][:100]]
        temp_res_relavance = temp_rank_relevance[np.argsort(temp_score)[::-1][:100]]


        ap = Average_Precision(temp_res_relavance)
        m_ap += ap

        ndcg = NDCG(temp_res_relavance)
        m_ndcg += ndcg
        
        
    return m_ap / len(query), m_ndcg / len(query)







In [None]:
m_ap, m_ndcg = cal_score_bm25(query, idf_bm25, inverted_idx, new_vocab_dict, rank_pid, Ld)

In [None]:
m_ap, m_ndcg

# Task2

In [5]:
import os
if not os.path.exists('embedding_passages_train.npy'):
    train_all_embedding_q, train_all_embedding_p = generate_embedding(train_queries, train_passages, train_labels, 'train')
    test_all_embedding_q, test_all_embedding_p = generate_embedding(test_queries, test_passages, test_labels, 'test')
else:
    train_all_embedding_p = np.load('embedding_passages_train.npy')
    train_all_embedding_q = np.load('embedding_queries_train.npy')

    test_all_embedding_p = np.load('embedding_passages_test.npy')
    test_all_embedding_q = np.load('embedding_queries_test.npy')

#concatenate passage and query to input into logistic regression model
xTr = np.concatenate((train_all_embedding_q, train_all_embedding_p), 1)
#add bias term
xTr = np.concatenate((xTr, np.ones((xTr.shape[0], 1))), 1)
yTr = train_labels

del train_all_embedding_p, train_all_embedding_q

#concatenate passage and query to input into logistic regression model
xTe = np.concatenate((test_all_embedding_q, test_all_embedding_p), 1)
#add bias term
xTe = np.concatenate((xTe, np.ones((xTe.shape[0], 1))), 1)
yTe = test_labels

del test_all_embedding_p, test_all_embedding_q




In [6]:
import gc
gc.collect()

441

In [16]:
class LogisticRegression:
    def __init__(self, xTr, yTr, lr, epochs=200):
        np.random.seed(42)
        self.xTr = xTr
        self.yTr = yTr
        self.w = np.random.rand(xTr.shape[1])
        self.lr = lr
        self.epochs = epochs
        self.mini_batch_size = 256
        

    def forward(self, x):
        return 1 / (1 + np.exp(x@-self.w))
    
    def criterion(self, pred, target):
        # return 1/len(target) * -(target*(np.log(pred)) + (1-target)*np.log(1-pred)).sum()
        return  -(target*(np.log(pred)) + (1-target)*np.log(1-pred)).sum()



    def train(self):
        all_loss = []
        for epoch in tqdm(range(self.epochs)):
            total_loss = 0
            #Obtain data
            pred = self.forward(self.xTr)
            loss = self.criterion(pred, self.yTr)
            grad = -self.lr / len(self.xTr) * np.sum((self.yTr - pred) * self.xTr.T, 1)
            self.w -= grad
            total_loss += loss

            print("Epoch:\t", epoch, "Averaged loss of the epoch:", total_loss.item())
            all_loss.append(loss)

        return all_loss





In [17]:
lr1 = LogisticRegression(xTr, yTr, lr=0.5, epochs=500)

In [None]:
loss_1 = lr1.train()

In [19]:
lr2 = LogisticRegression(xTr, yTr, lr=5e-2, epochs=500)

In [None]:
loss_2 = lr2.train()

In [None]:
lr3 = LogisticRegression(xTr, yTr, lr=10, epochs=500)

In [None]:
loss_3 = lr3.train()

In [None]:
lr4 = LogisticRegression(xTr, yTr, lr=50, epochs=500)

In [None]:
loss_4 = lr4.train()

In [None]:
import matplotlib.pyplot as plt
plt.plot(np.arange(1, len(loss_1)+1), loss_1, label='lr = 0.5')
plt.plot(np.arange(1, len(loss_2)+1), loss_2, label='lr = 0.05')
plt.plot(np.arange(1, len(loss_3)+1), loss_3, label='lr = 10')
plt.plot(np.arange(1, len(loss_4)+1), loss_4, label='lr = 50')
plt.xlabel('Epochs')
plt.ylabel('Epoch Loss')
plt.yscale('log')
plt.title('Comparsion on training loss of using different learning rate')
plt.legend()
plt.savefig('Comparsion_lr.png', dpi=300)

In [10]:
import numpy as np
import re
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from scipy import sparse

def Average_Precision(true_label):
    # AP for single query
    rela_idx = np.where(true_label == 1)[0]
    n_rela_passage = len(rela_idx)
    denom = rela_idx + 1
    numerator = np.arange(1, n_rela_passage+1)
    return (numerator/denom/n_rela_passage).sum()

def NDCG(true_label):
    # NDCG for single query
    DCG = np.sum((2**true_label - 1) / np.log2(np.arange(1, len(true_label)+1) + 1))
    n_rela_passage = int(np.sum(true_label))
    opt_rela_score = np.zeros(len(true_label))
    opt_rela_score[:n_rela_passage] = 1
    optDCG = np.sum((2**opt_rela_score - 1) / np.log2(np.arange(1, len(true_label)+1) + 1))
    return DCG/optDCG if optDCG != 0 else 0

def mean_metric_lr(test_qids, xTe, yTe, test_pids, lr, write=False):
    ranked_qid = test_qids[np.argsort(test_qids)]
    last = ranked_qid[0]
    counter = []
    cur_counter = []
    for i, id in enumerate(tqdm(ranked_qid)):
        cur = id
        if cur != last:
            counter.append(cur_counter)
            cur_counter = [i]
        else:
            cur_counter.append(i)
        last = cur
    counter.append(cur_counter)

    xTe = xTe[np.argsort(test_qids)]
    yTe = yTe[np.argsort(test_qids)]
    test_pids = test_pids[np.argsort(test_qids)]
    uni_qids = np.unique(test_qids)
    
    m_ap = 0
    m_ndcg = 0

    res_qid = []
    res_pid = []
    res_score = []
    res_rank = []
    res_A1 = []
    res_algoname = []

    for i, count in enumerate(tqdm(counter)):
        sub_xTe = xTe[count]
        pred_yTe = lr.forward(sub_xTe)
        sort_idx = np.argsort(pred_yTe)[::-1][:]
        sub_yTe = yTe[count]
        true_label = sub_yTe[sort_idx]
        ap = Average_Precision(true_label)
        ndcg = NDCG(true_label)
        m_ap += ap
        m_ndcg += ndcg

        sub_pids = test_pids[count]

        res_qid.extend([uni_qids[i] for _ in range(len(sort_idx))])
        res_pid.extend(sub_pids[sort_idx])
        res_score.extend(pred_yTe[np.argsort(pred_yTe)[::-1][:]])
        res_rank.extend(np.arange(1, len(sort_idx)+1))
        res_A1.extend(['A1' for _ in range(len(sort_idx))])
        res_algoname.extend(['LR' for _ in range(len(sort_idx))])
    
    if write:
        data = {'qid': res_qid, 'A1': res_A1, 'pid': res_pid, 'rank': res_rank, 'score': res_score, 'algoname': res_algoname}
        data_df = pd.DataFrame(data)
        data_df.to_csv('LR.txt',index=False,header=False, sep=' ')

    
    return m_ap / len(counter), m_ndcg / len(counter)


In [11]:
m_ap_1, m_ndcg_1 = mean_metric_lr(test_qids, xTe, yTe, test_pids, lr1, write=False)

100%|██████████| 1103039/1103039 [00:00<00:00, 1847848.67it/s]
100%|██████████| 1148/1148 [00:04<00:00, 279.86it/s]


In [18]:
del xTr, yTr, xTe, yTe

# Task3 make sure to restart kernel!!!

In [3]:
!pip install bayesian-optimization
from bayes_opt import BayesianOptimization

In [4]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm 

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

col=['qid','pid','query','passage','relevancy']

train_data=pd.read_csv("part2/train_data.tsv", sep='\t', header=None, names=col)
train_data=pd.DataFrame(train_data)
train_data = train_data.iloc[1:]

train_passages = train_data['passage'].values
train_queries = train_data['query'].values
train_pids = train_data['pid'].values.astype(np.int64)
train_qids = train_data['qid'].values.astype(np.int64)
train_labels = train_data['relevancy'].values.astype(np.float64).astype(np.int64)



test_data=pd.read_csv("part2/validation_data.tsv", sep='\t', header=None, names=col)
test_data=pd.DataFrame(test_data)
test_data = test_data.iloc[1:]

test_passages = test_data['passage'].values
test_queries = test_data['query'].values
test_pids = test_data['pid'].values.astype(np.int64)
test_qids = test_data['qid'].values.astype(np.int64)
test_labels = test_data['relevancy'].values.astype(np.float64).astype(np.int64)


train_all_embedding_p = np.load('embedding_passages_train.npy')
train_all_embedding_q = np.load('embedding_queries_train.npy')

test_all_embedding_p = np.load('embedding_passages_test.npy')
test_all_embedding_q = np.load('embedding_queries_test.npy')

#concatenate passage and query to input into logistic regression model
xTr = np.concatenate((train_all_embedding_q, train_all_embedding_p), 1)
yTr = train_labels

#concatenate passage and query to input into logistic regression model
xTe = np.concatenate((test_all_embedding_q, test_all_embedding_p), 1)
yTe = test_labels

del train_all_embedding_p, train_all_embedding_q, test_all_embedding_p, test_all_embedding_q


In [5]:
#Negative sampling
import numpy as np
def neg_sampling(qids, xTr, yTr, ratio=5):
    ranked_qid = qids[np.argsort(qids)]
    yTr = yTr[np.argsort(qids)]
    xTr = xTr[np.argsort(qids)]

    last = ranked_qid[0]
    counter = []
    cur_counter = []
    for i, id in enumerate(tqdm(ranked_qid)):
        cur = id
        if cur != last:
            counter.append(cur_counter)
            cur_counter = [i]
        else:
            cur_counter.append(i)
        last = cur
    counter.append(cur_counter)
    

    sample_xTr = []
    sample_yTr = []
    sample_qids = []

    for i, qid in enumerate(tqdm(np.unique(qids))):
        idx1 = np.where(yTr[counter[i]] == 1)
        num_pos = idx1[0].shape[0]

        #positive sample
        sample_xTr.extend(xTr[counter[i]][idx1])
        sample_yTr.extend(yTr[counter[i]][idx1])
        sample_qids.extend([qid]*(len(xTr[counter[i]][idx1])))

        #negative sample
        idx0 = np.delete(np.arange(len(counter[i])), idx1)
        np.random.shuffle(idx0)
        sample_xTr.extend(xTr[counter[i]][idx0[:ratio*num_pos]])
        sample_yTr.extend(yTr[counter[i]][idx0[:ratio*num_pos]])
        sample_qids.extend([qid]*len(xTr[counter[i]][idx0[:ratio*num_pos]]))

    return sample_xTr, sample_yTr, sample_qids
        


In [6]:
sample_xTr, sample_yTr, sample_qids = neg_sampling(train_qids, xTr, yTr, ratio=5)

100%|██████████| 4364339/4364339 [00:01<00:00, 2203554.09it/s]
100%|██████████| 4590/4590 [00:06<00:00, 694.95it/s]


In [7]:
del xTr, yTr

In [7]:
#Negative sampling
import numpy as np
def Average_Precision(true_label):
    # AP for single query
    rela_idx = np.where(true_label == 1)[0]
    n_rela_passage = len(rela_idx)
    denom = rela_idx + 1
    numerator = np.arange(1, n_rela_passage+1)
    return (numerator/denom/n_rela_passage).sum()

def NDCG(true_label):
    # NDCG for single query
    DCG = np.sum((2**true_label - 1) / np.log2(np.arange(1, len(true_label)+1) + 1))
    n_rela_passage = int(np.sum(true_label))
    opt_rela_score = np.zeros(len(true_label))
    opt_rela_score[:n_rela_passage] = 1
    optDCG = np.sum((2**opt_rela_score - 1) / np.log2(np.arange(1, len(true_label)+1) + 1))
    return DCG/optDCG.sum() if optDCG != 0 else 0

def make_group(qids, yTr, xTr):
    xTr = np.array(xTr)
    yTr = np.array(yTr)
    qids = np.array(qids)

    ranked_qid = qids

    # ranked_qid = qids[np.argsort(qids)]
    # yTr = yTr[np.argsort(qids)]
    # xTr = xTr[np.argsort(qids)]


    last = ranked_qid[0]
    counter = []
    cur_counter = []
    for i, id in enumerate(tqdm(ranked_qid)):
        cur = id
        if cur != last:
            counter.append(cur_counter)
            cur_counter = [i]
        else:
            cur_counter.append(i)
        last = cur
    counter.append(cur_counter)

    group = [len(counter[i]) for i in range(len(counter))]

    return xTr, yTr, group
        


In [9]:
xTr, yTr, groups = make_group(sample_qids, sample_yTr, sample_xTr)
# xTr, yTr, groups = make_group(train_qids, yTr, xTr)

100%|██████████| 28761/28761 [00:00<00:00, 1747814.04it/s]


In [101]:
def xgb_cv(lr, colsample_bytree, max_depth, n_estimators, subsample):
    qids = sample_qids
    folds = 5
    n = len(groups)
    c_len = n // folds
    group_chunks = [groups[i*c_len:(i+1)*c_len] for i in range(folds-1)]
    group_chunks.append(groups[(folds-1)*c_len:])

    xTr_chunks = []
    yTr_chunks = []
    qid_chunks = []
    for i in range(folds):
        start_idx = 0
        chunk_length = sum(group_chunks[i])
        xTr_chunks.append(xTr[start_idx:start_idx+chunk_length])
        yTr_chunks.append(yTr[start_idx:start_idx+chunk_length])
        qid_chunks.append(qids[start_idx:start_idx+chunk_length])
        start_idx += chunk_length
    
    ndcg_list = []
    ap_list = []
    for i in range(folds):
        idx_list = np.arange(folds)
        idx_list = np.delete(idx_list, i)

        xVal_cv = xTr_chunks[i]
        yVal_cv = yTr_chunks[i]
        gVal_cv = group_chunks[i]
        qids_cv = qid_chunks[i]

        xTr_cv = np.concatenate([xTr_chunks[j] for j in idx_list][:])
        yTr_cv = np.concatenate([yTr_chunks[j] for j in idx_list][:])
        group_cv = np.concatenate([group_chunks[j] for j in idx_list][:])

        rank_model = xgb.XGBRanker(  
            tree_method='gpu_hist',
            booster='gbtree',
            objective='rank:pairwise',
            random_state=42, 
            learning_rate=lr,
            colsample_bytree=colsample_bytree, 
            eta=0.05, 
            max_depth=int(max_depth), 
            n_estimators=int(n_estimators), 
            subsample=subsample,
            )
        rank_model.fit(xTr_cv, yTr_cv, group_cv, verbose=True)


        ranked_qid = qids_cv

        last = ranked_qid[0]
        counter = []
        cur_counter = []
        for i, id in enumerate(tqdm(ranked_qid)):
            cur = id
            if cur != last:
                counter.append(cur_counter)
                cur_counter = [i]
            else:
                cur_counter.append(i)
            last = cur
        counter.append(cur_counter)

        m_ap = 0
        m_ndcg = 0


        for count in tqdm(counter):
            sub_xTe = xVal_cv[count]
            pred_yTe = rank_model.predict(sub_xTe)
            sort_idx = np.argsort(pred_yTe)[::-1]
            sub_yTe = yVal_cv[count]
            true_label = sub_yTe[sort_idx]
            ap = Average_Precision(true_label)
            ndcg = NDCG(true_label)
            m_ap += ap
            m_ndcg += ndcg


        
        ndcg_list.append(m_ndcg/len(counter))
        ap_list.append(m_ap/len(counter))

    return np.mean(ap_list)




In [102]:
xgb_bo = BayesianOptimization(
    xgb_cv,
    {'lr': (0.001, 0.1),
    'colsample_bytree': (0.5, 0.9),
    'max_depth': (4, 10),
    'n_estimators': (300, 800),
     'subsample': (0.5, 0.9)})

In [23]:
import xgboost as xgb
xgb_bo.maximize(init_points=0,n_iter=10,)

|   iter    |  target   | colsam... |    lr     | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------


100%|██████████| 5784/5784 [00:00<00:00, 1462053.54it/s]
100%|██████████| 918/918 [00:24<00:00, 37.64it/s]
100%|██████████| 5860/5860 [00:00<00:00, 1521519.22it/s]
100%|██████████| 930/930 [00:24<00:00, 37.68it/s]
100%|██████████| 5691/5691 [00:00<00:00, 1532472.01it/s]
100%|██████████| 903/903 [00:24<00:00, 37.61it/s]
100%|██████████| 5696/5696 [00:00<00:00, 1757319.28it/s]
100%|██████████| 904/904 [00:24<00:00, 37.60it/s]
100%|██████████| 5730/5730 [00:00<00:00, 1672118.69it/s]
100%|██████████| 909/909 [00:24<00:00, 37.65it/s]


| [0m 1       [0m | [0m 0.9353  [0m | [0m 0.5045  [0m | [0m 0.001323[0m | [0m 5.907   [0m | [0m 740.4   [0m | [0m 0.8127  [0m |


100%|██████████| 5784/5784 [00:00<00:00, 1538159.67it/s]
100%|██████████| 918/918 [00:10<00:00, 89.64it/s]
100%|██████████| 5860/5860 [00:00<00:00, 1580415.47it/s]
100%|██████████| 930/930 [00:10<00:00, 89.42it/s]
100%|██████████| 5691/5691 [00:00<00:00, 1686791.33it/s]
100%|██████████| 903/903 [00:10<00:00, 89.30it/s]
100%|██████████| 5696/5696 [00:00<00:00, 1587531.10it/s]
100%|██████████| 904/904 [00:10<00:00, 89.52it/s]
100%|██████████| 5730/5730 [00:00<00:00, 1747499.59it/s]
100%|██████████| 909/909 [00:10<00:00, 89.45it/s]


| [95m 2       [0m | [95m 0.9974  [0m | [95m 0.7481  [0m | [95m 0.05633 [0m | [95m 5.295   [0m | [95m 310.0   [0m | [95m 0.5534  [0m |


100%|██████████| 5784/5784 [00:00<00:00, 1481246.45it/s]
100%|██████████| 918/918 [00:12<00:00, 74.08it/s]
100%|██████████| 5860/5860 [00:00<00:00, 1744278.01it/s]
100%|██████████| 930/930 [00:12<00:00, 74.18it/s]
100%|██████████| 5691/5691 [00:00<00:00, 1700490.42it/s]
100%|██████████| 903/903 [00:12<00:00, 74.22it/s]
100%|██████████| 5696/5696 [00:00<00:00, 1764848.61it/s]
100%|██████████| 904/904 [00:12<00:00, 73.91it/s]
100%|██████████| 5730/5730 [00:00<00:00, 1534991.50it/s]
100%|██████████| 909/909 [00:12<00:00, 73.90it/s]


| [95m 3       [0m | [95m 0.9985  [0m | [95m 0.6235  [0m | [95m 0.05591 [0m | [95m 6.155   [0m | [95m 310.9   [0m | [95m 0.8496  [0m |


100%|██████████| 5784/5784 [00:00<00:00, 1424369.09it/s]
100%|██████████| 918/918 [00:25<00:00, 35.71it/s]
100%|██████████| 5860/5860 [00:00<00:00, 1843166.21it/s]
100%|██████████| 930/930 [00:26<00:00, 35.72it/s]
100%|██████████| 5691/5691 [00:00<00:00, 1612169.67it/s]
100%|██████████| 903/903 [00:25<00:00, 35.60it/s]
100%|██████████| 5696/5696 [00:00<00:00, 1666486.86it/s]
100%|██████████| 904/904 [00:25<00:00, 35.59it/s]
100%|██████████| 5730/5730 [00:00<00:00, 1660565.32it/s]
100%|██████████| 909/909 [00:25<00:00, 35.68it/s]


| [95m 4       [0m | [95m 0.9986  [0m | [95m 0.6878  [0m | [95m 0.03383 [0m | [95m 10.0    [0m | [95m 377.6   [0m | [95m 0.9     [0m |


100%|██████████| 5784/5784 [00:00<00:00, 1340101.33it/s]
100%|██████████| 918/918 [00:27<00:00, 33.10it/s]
100%|██████████| 5860/5860 [00:00<00:00, 1675777.01it/s]
100%|██████████| 930/930 [00:28<00:00, 32.86it/s]
100%|██████████| 5691/5691 [00:00<00:00, 1739526.60it/s]
100%|██████████| 903/903 [00:27<00:00, 32.85it/s]
100%|██████████| 5696/5696 [00:00<00:00, 1750623.26it/s]
100%|██████████| 904/904 [00:27<00:00, 32.95it/s]
100%|██████████| 5730/5730 [00:00<00:00, 1675032.19it/s]
100%|██████████| 909/909 [00:27<00:00, 32.89it/s]


| [0m 5       [0m | [0m 0.9985  [0m | [0m 0.5301  [0m | [0m 0.08357 [0m | [0m 9.936   [0m | [0m 492.1   [0m | [0m 0.7412  [0m |


100%|██████████| 5784/5784 [00:00<00:00, 1540699.50it/s]
100%|██████████| 918/918 [00:14<00:00, 62.90it/s]
100%|██████████| 5860/5860 [00:00<00:00, 1485831.30it/s]
100%|██████████| 930/930 [00:14<00:00, 62.88it/s]
100%|██████████| 5691/5691 [00:00<00:00, 1734848.76it/s]
100%|██████████| 903/903 [00:14<00:00, 62.89it/s]
100%|██████████| 5696/5696 [00:00<00:00, 1745507.09it/s]
100%|██████████| 904/904 [00:14<00:00, 62.76it/s]
100%|██████████| 5730/5730 [00:00<00:00, 1622450.68it/s]
100%|██████████| 909/909 [00:14<00:00, 62.67it/s]


| [0m 6       [0m | [0m 0.9982  [0m | [0m 0.7969  [0m | [0m 0.09722 [0m | [0m 4.047   [0m | [0m 571.2   [0m | [0m 0.679   [0m |


100%|██████████| 5784/5784 [00:00<00:00, 1381540.68it/s]
100%|██████████| 918/918 [00:11<00:00, 79.12it/s]
100%|██████████| 5860/5860 [00:00<00:00, 1697653.09it/s]
100%|██████████| 930/930 [00:11<00:00, 78.93it/s]
100%|██████████| 5691/5691 [00:00<00:00, 1625674.87it/s]
100%|██████████| 903/903 [00:11<00:00, 79.11it/s]
100%|██████████| 5696/5696 [00:00<00:00, 1640961.30it/s]
100%|██████████| 904/904 [00:11<00:00, 79.10it/s]
100%|██████████| 5730/5730 [00:00<00:00, 1665859.98it/s]
100%|██████████| 909/909 [00:11<00:00, 79.12it/s]


| [0m 7       [0m | [0m 0.7851  [0m | [0m 0.5     [0m | [0m 0.001   [0m | [0m 4.0     [0m | [0m 436.7   [0m | [0m 0.9     [0m |


100%|██████████| 5784/5784 [00:00<00:00, 1311981.74it/s]
100%|██████████| 918/918 [00:28<00:00, 32.11it/s]
100%|██████████| 5860/5860 [00:00<00:00, 1716983.68it/s]
100%|██████████| 930/930 [00:29<00:00, 31.92it/s]
100%|██████████| 5691/5691 [00:00<00:00, 1332688.52it/s]
100%|██████████| 903/903 [00:28<00:00, 32.10it/s]
100%|██████████| 5696/5696 [00:00<00:00, 1741435.64it/s]
100%|██████████| 904/904 [00:28<00:00, 31.71it/s]
100%|██████████| 5730/5730 [00:00<00:00, 1231469.66it/s]
100%|██████████| 909/909 [00:28<00:00, 31.85it/s]


| [0m 8       [0m | [0m 0.9986  [0m | [0m 0.7535  [0m | [0m 0.1     [0m | [0m 10.0    [0m | [0m 530.1   [0m | [0m 0.5     [0m |


100%|██████████| 5784/5784 [00:00<00:00, 1390249.53it/s]
100%|██████████| 918/918 [00:36<00:00, 25.11it/s]
100%|██████████| 5860/5860 [00:00<00:00, 1822258.41it/s]
100%|██████████| 930/930 [00:37<00:00, 25.12it/s]
100%|██████████| 5691/5691 [00:00<00:00, 1776818.82it/s]
100%|██████████| 903/903 [00:35<00:00, 25.10it/s]
100%|██████████| 5696/5696 [00:00<00:00, 1374849.26it/s]
100%|██████████| 904/904 [00:35<00:00, 25.14it/s]
100%|██████████| 5730/5730 [00:00<00:00, 1802141.72it/s]
100%|██████████| 909/909 [00:36<00:00, 25.11it/s]


| [0m 9       [0m | [0m 0.9985  [0m | [0m 0.514   [0m | [0m 0.04823 [0m | [0m 9.832   [0m | [0m 624.0   [0m | [0m 0.5077  [0m |


100%|██████████| 5784/5784 [00:00<00:00, 1006967.22it/s]
100%|██████████| 918/918 [00:16<00:00, 54.01it/s]
100%|██████████| 5860/5860 [00:00<00:00, 1397703.81it/s]
100%|██████████| 930/930 [00:17<00:00, 53.85it/s]
100%|██████████| 5691/5691 [00:00<00:00, 1886492.06it/s]
100%|██████████| 903/903 [00:16<00:00, 53.76it/s]
100%|██████████| 5696/5696 [00:00<00:00, 1771391.38it/s]
100%|██████████| 904/904 [00:16<00:00, 53.90it/s]
100%|██████████| 5730/5730 [00:00<00:00, 1706187.84it/s]
100%|██████████| 909/909 [00:16<00:00, 53.75it/s]


| [0m 10      [0m | [0m 0.9984  [0m | [0m 0.7659  [0m | [0m 0.07296 [0m | [0m 4.062   [0m | [0m 670.2   [0m | [0m 0.7748  [0m |


100%|██████████| 5784/5784 [00:00<00:00, 1500300.21it/s]
100%|██████████| 918/918 [00:09<00:00, 101.25it/s]
100%|██████████| 5860/5860 [00:00<00:00, 1711841.58it/s]
100%|██████████| 930/930 [00:09<00:00, 101.26it/s]
100%|██████████| 5691/5691 [00:00<00:00, 1460461.58it/s]
100%|██████████| 903/903 [00:08<00:00, 100.94it/s]
100%|██████████| 5696/5696 [00:00<00:00, 1694019.40it/s]
100%|██████████| 904/904 [00:08<00:00, 101.19it/s]
100%|██████████| 5730/5730 [00:00<00:00, 1759654.56it/s]
100%|██████████| 909/909 [00:08<00:00, 101.01it/s]

| [0m 11      [0m | [0m 0.9973  [0m | [0m 0.9     [0m | [0m 0.1     [0m | [0m 4.0     [0m | [0m 347.0   [0m | [0m 0.5     [0m |





In [24]:
xgb_bo.max

{'params': {'colsample_bytree': 0.6878421461159999,
  'lr': 0.033832164874053106,
  'max_depth': 10.0,
  'n_estimators': 377.64193463828013,
  'subsample': 0.9},
 'target': 0.9986408090117767}

In [8]:
#Negative sampling
import numpy as np
def Average_Precision(true_label):
    # AP for single query
    rela_idx = np.where(true_label == 1)[0]
    n_rela_passage = len(rela_idx)
    denom = rela_idx + 1
    numerator = np.arange(1, n_rela_passage+1)
    return (numerator/denom/n_rela_passage).sum()

def NDCG(true_label):
    # NDCG for single query
    DCG = np.sum((2**true_label - 1) / np.log2(np.arange(1, len(true_label)+1) + 1))
    n_rela_passage = int(np.sum(true_label))
    opt_rela_score = np.zeros(len(true_label))
    opt_rela_score[:n_rela_passage] = 1
    optDCG = np.sum((2**opt_rela_score - 1) / np.log2(np.arange(1, len(true_label)+1) + 1))
    return DCG/optDCG.sum() if optDCG != 0 else 0

def make_group(qids, yTr, xTr):
    xTr = np.array(xTr)
    yTr = np.array(yTr)
    qids = np.array(qids)

    # ranked_qid = qids

    ranked_qid = qids[np.argsort(qids)]
    yTr = yTr[np.argsort(qids)]
    xTr = xTr[np.argsort(qids)]


    last = ranked_qid[0]
    counter = []
    cur_counter = []
    for i, id in enumerate(tqdm(ranked_qid)):
        cur = id
        if cur != last:
            counter.append(cur_counter)
            cur_counter = [i]
        else:
            cur_counter.append(i)
        last = cur
    counter.append(cur_counter)

    group = [len(counter[i]) for i in range(len(counter))]

    return xTr, yTr, group
        


In [9]:
xTr, yTr, groups = make_group(train_qids, yTr, xTr)

100%|██████████| 4364339/4364339 [00:01<00:00, 2214036.59it/s]


In [10]:
import xgboost as xgb
rank_model = xgb.XGBRanker(  
    tree_method='gpu_hist',
    booster='gbtree',
    objective='rank:pairwise',
    random_state=42, 
    learning_rate=0.033832164874053106,
    colsample_bytree=0.6878421461159999, 
    eta=0.05, 
    max_depth=10, 
    n_estimators=378, 
    subsample=0.9,
    )

rank_model.fit(xTr, yTr, group=groups, verbose=True)

def mean_metric_xgb(rank_model, test_qids, test_pids, xTe, yTe, write=True):
    ranked_qid = test_qids[np.argsort(test_qids)]
    last = ranked_qid[0]
    counter = []
    cur_counter = []
    for i, id in enumerate(tqdm(ranked_qid)):
        cur = id
        if cur != last:
            counter.append(cur_counter)
            cur_counter = [i]
        else:
            cur_counter.append(i)
        last = cur
    counter.append(cur_counter)

    xTe = xTe[np.argsort(test_qids)]
    yTe = yTe[np.argsort(test_qids)]
    test_pids = test_pids[np.argsort(test_qids)]
    uni_qids = np.unique(test_qids)


    m_ap = 0
    m_ndcg = 0

    res_qid = []
    res_pid = []
    res_score = []
    res_rank = []
    res_A1 = []
    res_algoname = []

    for i, count in enumerate(tqdm(counter)):
        sub_xTe = xTe[count]
        pred_yTe = rank_model.predict(sub_xTe)
        sort_idx = np.argsort(pred_yTe)[::-1][:]
        sub_yTe = yTe[count]
        true_label = sub_yTe[sort_idx]
        ap = Average_Precision(true_label)
        ndcg = NDCG(true_label)
        m_ap += ap
        m_ndcg += ndcg

        sub_pids = test_pids[count]

        res_qid.extend([uni_qids[i] for _ in range(len(sort_idx))])
        res_pid.extend(sub_pids[sort_idx])
        res_score.extend(pred_yTe[np.argsort(pred_yTe)[::-1][:]])
        res_rank.extend(np.arange(1, len(sort_idx)+1))
        res_A1.extend(['A1' for _ in range(len(sort_idx))])
        res_algoname.extend(['LM' for _ in range(len(sort_idx))])

    if write:
        data = {'qid': res_qid, 'A1': res_A1, 'pid': res_pid, 'rank': res_rank, 'score': res_score, 'algoname': res_algoname}
        data_df = pd.DataFrame(data)
        data_df.to_csv('/content/gdrive/MyDrive/LM.txt',index=False,header=False, sep=' ')
    
    return m_ap / len(counter), m_ndcg / len(counter)
    
mean_metric_xgb(rank_model, test_qids, test_pids, xTe, yTe, write='False')

100%|██████████| 1103039/1103039 [00:00<00:00, 2156566.94it/s]
100%|██████████| 1148/1148 [00:59<00:00, 19.16it/s]


(0.03581239563558507, 0.16844698828579643)

# Task 4 Make sure to restart kernel!!!

In [4]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm 

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

col=['qid','pid','query','passage','relevancy']

train_data=pd.read_csv("part2/train_data.tsv", sep='\t', header=None, names=col)
train_data=pd.DataFrame(train_data)
train_data = train_data.iloc[1:]

train_passages = train_data['passage'].values
train_queries = train_data['query'].values
train_pids = train_data['pid'].values.astype(np.int64)
train_qids = train_data['qid'].values.astype(np.int64)
train_labels = train_data['relevancy'].values.astype(np.float64).astype(np.int64)



test_data=pd.read_csv("part2/validation_data.tsv", sep='\t', header=None, names=col)
test_data=pd.DataFrame(test_data)
test_data = test_data.iloc[1:]

test_passages = test_data['passage'].values
test_queries = test_data['query'].values
test_pids = test_data['pid'].values.astype(np.int64)
test_qids = test_data['qid'].values.astype(np.int64)
test_labels = test_data['relevancy'].values.astype(np.float64).astype(np.int64)

class DataSeq(Dataset):
    '''
        Dataset for generating tokens of sequences.
    '''
    def __init__(self, queries, passages, labels):
        self.queries = queries
        self.passages = passages
        self.labels = labels
    
    def __getitem__(self, index):
        query, passage, label = self.queries[index], self.passages[index], self.labels[index]
        
        # ids_query = tokenizer.batch_encode_plus([query], add_special_tokens=False, padding='max_length', max_length=50, truncation=True, return_tensors='pt').to(device)
        # ids_passage = tokenizer.batch_encode_plus([passage], add_special_tokens=False, padding='max_length', max_length=300, truncation=True, return_tensors='pt').to(device)
        # return model(**ids_query), model(**ids_passage), ids_query.attention_mask, ids_passage.attention_mask, label

        ids_query = tokenizer.batch_encode_plus([query], add_special_tokens=False, padding='max_length', max_length=50, truncation=True)
        ids_passage = tokenizer.batch_encode_plus([passage], add_special_tokens=False, padding='max_length', max_length=300, truncation=True)
        return np.array([ids_query['input_ids']]), np.array([ids_passage['input_ids']]), np.array([ids_query['attention_mask']]), np.array([ids_passage['attention_mask']]), label


    def __len__(self):
        return len(self.labels)




In [5]:
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--hidden_size', type=int, default=128,
                    choices=[64, 128, 256])
parser.add_argument('--vocab_size', type=int, default=len(tokenizer))
parser.add_argument('--device', default=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
parser.add_argument('--lr', type=float, default=1e-4)
parser.add_argument('--batch_size', type=int, default=128,
                    choices=[64, 128, 256])
parser.add_argument('--epochs', type=int, default=100)
parser.add_argument('--feed_forward', type=float, default=64)
parser.add_argument('--dropout', type=float, default=0.1) 
parser.add_argument('--num_head', type=int, default=4)
parser.add_argument('--num_transformer_layer', default=2)
args = parser.parse_args(args=[])

In [6]:
import torch
from torch import nn, optim
import torch.nn.functional as F
import random
import math


class PositionalEncoding(nn.Module):
    def __init__(self, args, max_len=350):
        super().__init__()
        self.args = args
        self.embeddings = nn.Embedding(max_len, self.args.hidden_size)
        self.register_buffer('position_ids', torch.arange(max_len))

    def forward(self, x):
        """
        return (l b d)
        """
        position_ids = self.position_ids[:x.size(0)] #1, seq_len
        position_ids = position_ids.repeat(x.shape[1], 1) #bs, seq_len
        position_ids = self.embeddings(position_ids) #bs, seq_len, embedding_size
        position_ids = position_ids.transpose(0, 1)

        return x + position_ids


class BinaryClassifier(torch.nn.Module):
    def __init__(self, args):
        super(BinaryClassifier, self).__init__()
        self.args = args
        ##positional encoder
        self.pos_encoder = PositionalEncoding(self.args) 
        ##Multi-atten
        encoder_layer = nn.TransformerEncoderLayer(d_model=self.args.hidden_size, nhead=self.args.num_head, dim_feedforward=self.args.feed_forward, dropout=self.args.dropout)
        self.multi_atten = nn.TransformerEncoder(encoder_layer, num_layers=self.args.num_transformer_layer)
        
        self.fc = nn.Linear(self.args.hidden_size, self.args.hidden_size) 
        self.dropout = nn.Dropout(self.args.dropout)

        self.down1 = nn.Linear(self.args.hidden_size, self.args.hidden_size//4)
        self.down2 = nn.Linear(self.args.hidden_size//4, self.args.hidden_size//4)
        self.down3 = nn.Linear(self.args.hidden_size//4, self.args.hidden_size//8)
        self.out = nn.Linear(self.args.hidden_size//8, 1)

        # self.out = nn.Linear(self.args.hidden_size, 1)
        
        self.relu = nn.ReLU()
        


    def forward(self, query, passage, mask_q, mask_p):
        #query: bs, seq_len_q, embedding_size
        #passage: bs, seq_len_p, embedding_size


        x = torch.cat((query, passage), 1) #bs, seq_len_all
        mask = torch.cat((mask_q, mask_p), 1) #bs, seq_len_all

        #embedding layer
        x = x * math.sqrt(self.args.hidden_size) #bs, seq_len_all, embedding_size
        x = self.dropout(x)

        #multi-atten
        x = torch.transpose(x, 0, 1) # seq_len, bs, embedding_size
        x = self.pos_encoder(x)
        x = self.multi_atten(x, src_key_padding_mask=(mask==0)) # seq_len, bs, embedding_size
        x = torch.transpose(x, 0, 1) #bs, seq_len, embedding_size
        
        #pooling
        x = torch.mean(x, 1)
        x = self.relu(self.fc(x))

        #down
        x = self.relu(self.down1(x))
        x = self.relu(self.down2(x))
        x = self.relu(self.down3(x))
        x = self.out(x)

        # x = self.out(x)

        x = nn.Sigmoid()(x)

        return x

In [7]:
'''
    Parallel encoding for query and passage, respectively, with m_ndcg = 0.14 and m_ap = 0.02.
    Model is not performing well because query and passage are not computing attention with each other, thus model is hard to obtain interactive info
    between passage and query, thus harder to predict if they are relevant of not.
'''
# import torch
# from torch import nn, optim
# import torch.nn.functional as F
# import random
# import math


# class PositionalEncoding(nn.Module):
#     def __init__(self, args, max_len=350):
#         super().__init__()
#         self.args = args
#         self.embeddings = nn.Embedding(max_len, self.args.hidden_size)
#         self.register_buffer('position_ids', torch.arange(max_len))

#     def forward(self, x):
#         """
#         return (l b d)
#         """
#         position_ids = self.position_ids[:x.size(0)] #1, seq_len
#         position_ids = position_ids.repeat(x.shape[1], 1) #bs, seq_len
#         position_ids = self.embeddings(position_ids) #bs, seq_len, embedding_size
#         position_ids = position_ids.transpose(0, 1)

#         return x + position_ids


# class BinaryClassifier(torch.nn.Module):
#     def __init__(self, args):
#         super(BinaryClassifier, self).__init__()
#         self.args = args
#         ##positional encoder
#         self.pos_encoder_q = PositionalEncoding(self.args, max_len=50)
#         self.pos_encoder_p = PositionalEncoding(self.args, max_len=300)
#         ##Multi-atten
#         encoder_layer_q = nn.TransformerEncoderLayer(d_model=self.args.hidden_size, nhead=self.args.num_head, dim_feedforward=self.args.feed_forward, dropout=self.args.dropout)
#         self.multi_atten_q = nn.TransformerEncoder(encoder_layer_q, num_layers=self.args.num_transformer_layer)

#         encoder_layer_p = nn.TransformerEncoderLayer(d_model=self.args.hidden_size, nhead=self.args.num_head, dim_feedforward=self.args.feed_forward, dropout=self.args.dropout)
#         self.multi_atten_p = nn.TransformerEncoder(encoder_layer_p, num_layers=self.args.num_transformer_layer)

        
#         self.fc_p = nn.Linear(self.args.hidden_size, self.args.hidden_size) 
#         self.fc_q = nn.Linear(self.args.hidden_size, self.args.hidden_size) 

#         self.dropout = nn.Dropout(self.args.dropout)

#         self.down1 = nn.Linear(self.args.hidden_size, self.args.hidden_size//4)
#         self.down2 = nn.Linear(self.args.hidden_size//4, self.args.hidden_size//4)
#         self.down3 = nn.Linear(self.args.hidden_size//4, self.args.hidden_size//8)
#         self.out = nn.Linear(self.args.hidden_size//8, 1)

#         # self.out = nn.Linear(self.args.hidden_size, 1)
        
#         self.relu = nn.ReLU()
        


#     def forward(self, query, passage, mask_q, mask_p):
#         #query: bs, seq_len_q, embedding_size
#         #passage: bs, seq_len_p, embedding_size


#         #embedding layer
#         x_q = query * math.sqrt(self.args.hidden_size) #bs, seq_len_all, embedding_size
#         x_q = self.dropout(x_q)

#         #multi-atten
#         x_q = torch.transpose(x_q, 0, 1) # seq_len, bs, embedding_size
#         x_q = self.pos_encoder_q(x_q)
#         x_q = self.multi_atten_q(x_q, src_key_padding_mask=(mask_q==0)) # seq_len, bs, embedding_size
#         x_q = torch.transpose(x_q, 0, 1) #bs, seq_len, embedding_size
        
#         #pooling
#         x_q = torch.mean(x_q, 1)
#         x_q = self.relu(self.fc_q(x_q))

#         #embedding layer
#         x_p = passage * math.sqrt(self.args.hidden_size) #bs, seq_len_all, embedding_size
#         x_p = self.dropout(x_p)

#         #multi-atten
#         x_p = torch.transpose(x_p, 0, 1) # seq_len, bs, embedding_size
#         x = self.pos_encoder_p(x_p)
#         x_p = self.multi_atten_p(x_p, src_key_padding_mask=(mask_p==0)) # seq_len, bs, embedding_size
#         x_p = torch.transpose(x_p, 0, 1) #bs, seq_len, embedding_size
        
#         #pooling
#         x_p = torch.mean(x_p, 1)
#         x_p = self.relu(self.fc_p(x_p))

#         x = x_q + x_p

#         #down
#         x = self.relu(self.down1(x))
#         x = self.relu(self.down2(x))
#         x = self.relu(self.down3(x))
#         x = self.out(x)

#         # x = self.out(x)

#         x = nn.Sigmoid()(x)

#         return x

'\n    Parallel encoding for query and passage, respectively, with m_ndcg = 0.14 and m_ap = 0.02.\n    Model is not performing well because query and passage are not computing attention with each other, thus model is hard to obtain interactive info\n    between passage and query, thus harder to predict if they are relevant of not.\n'

In [7]:
trainset = DataSeq(queries=train_queries, passages=train_passages, labels=train_labels)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=2)

valset = DataSeq(queries=test_queries, passages=test_passages, labels=test_labels)
valloader = torch.utils.data.DataLoader(valset, batch_size=args.batch_size, shuffle=False, num_workers=0)

In [8]:
pos_ratio = np.where(train_labels==1)[0].shape[0] / train_labels.shape[0]
print('Postive/Negative ratio in trainset is {0:.4f}'.format(pos_ratio))

Postive/Negative ratio in trainset is 0.0011


In [9]:
#Negative sampling
import numpy as np
def neg_sampling(qids, pids, labels, passages, queries, ratio=10):
    ranked_qid = qids[np.argsort(qids)]
    ranked_labels = labels[np.argsort(qids)]
    ranked_passages = passages[np.argsort(qids)]
    ranked_queries = queries[np.argsort(qids)]

    last = ranked_qid[0]
    counter = []
    cur_counter = []
    for i, id in enumerate(tqdm(ranked_qid)):
        cur = id
        if cur != last:
            counter.append(cur_counter)
            cur_counter = [i]
        else:
            cur_counter.append(i)
        last = cur
    counter.append(cur_counter)

    sample_passage = []
    sample_query = []
    sample_label = []
    
    for i, qid in enumerate(tqdm(np.unique(qids))):
        idx1 = np.where(ranked_labels[counter[i]] == 1)
        num_pos = idx1[0].shape[0]

        #positive sample
        sample_passage.extend(ranked_passages[counter[i]][idx1])
        sample_query.extend(ranked_queries[counter[i]][idx1])
        sample_label.extend(ranked_labels[counter[i]][idx1])

        #negative sample
        idx0 = np.delete(np.arange(len(counter[i])), idx1)
        np.random.shuffle(idx0)
        sample_passage.extend(ranked_passages[counter[i]][idx0[:ratio*num_pos]])
        sample_query.extend(ranked_queries[counter[i]][idx0[:ratio*num_pos]])
        sample_label.extend(ranked_labels[counter[i]][idx0[:ratio*num_pos]])
    
    return sample_query, sample_passage, sample_label
        


In [10]:
sample_query, sample_passage, sample_label = neg_sampling(train_qids, train_pids, train_labels, train_passages, train_queries, ratio=5)

100%|██████████| 4364339/4364339 [00:02<00:00, 1936827.41it/s]
100%|██████████| 4590/4590 [00:03<00:00, 1516.80it/s]


In [11]:
pos_ratio = sum(sample_label) / len(sample_label)
print('Postive/Negative ratio in trainset is {0:.4f}'.format(pos_ratio))

Postive/Negative ratio in trainset is 0.1668


In [12]:
trainset = DataSeq(queries=sample_query, passages=sample_passage, labels=sample_label)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=2)

In [13]:
# for i in trainloader:
#     break

# i[0].last_hidden_state.squeeze().shape, i[1].last_hidden_state.squeeze().shape, i[2].shape, i[3].squeeze().shape, i[4].squeeze().shape

for i in trainloader:
    break

i[0].squeeze().shape, i[1].squeeze().shape, i[2].squeeze().shape, i[3].squeeze().shape,  i[4].shape

(torch.Size([128, 50]),
 torch.Size([128, 300]),
 torch.Size([128, 50]),
 torch.Size([128, 300]),
 torch.Size([128]))

In [20]:
binary_model = BinaryClassifier(args).to(args.device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(binary_model.parameters(), lr=args.lr)
# optimizer = torch.optim.SGD(binary_model.parameters(), lr=args.lr, momentum=0.9)


for epoch in range(150):
    progress_bar = tqdm(trainloader)
    epoch_loss = 0.
    for i, data in enumerate(progress_bar):
        progress_bar.set_description('Epoch: ' + str(epoch))
        # query, passage, query_mask, passage_mask, label = data[0].last_hidden_state.squeeze(), data[1].last_hidden_state.squeeze(), data[2].squeeze(), data[3].squeeze(), data[4].to(device)
        query, passage, query_mask, passage_mask, label= data[0].squeeze().to(device), data[1].squeeze().to(device), data[2].squeeze().to(device), data[3].squeeze().to(device), data[4].to(device)

        with torch.no_grad():
            query = model(input_ids=query, attention_mask=query_mask).last_hidden_state
            passage = model(input_ids=passage, attention_mask=passage_mask).last_hidden_state
        
        output = binary_model(query, passage, query_mask, passage_mask)
        optimizer.zero_grad()
        loss = criterion(output.squeeze(), label.to(torch.float32))
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        progress_bar.set_postfix(epoch_loss='%.3f' % (epoch_loss /(i+1)))
    # scheduler.step()





Epoch: 0: 100%|██████████| 225/225 [00:20<00:00, 10.76it/s, epoch_loss=0.500]
Epoch: 1: 100%|██████████| 225/225 [00:20<00:00, 10.72it/s, epoch_loss=0.449]
Epoch: 2: 100%|██████████| 225/225 [00:21<00:00, 10.49it/s, epoch_loss=0.442]
Epoch: 3: 100%|██████████| 225/225 [00:21<00:00, 10.55it/s, epoch_loss=0.438]
Epoch: 4: 100%|██████████| 225/225 [00:20<00:00, 10.73it/s, epoch_loss=0.434]
Epoch: 5: 100%|██████████| 225/225 [00:21<00:00, 10.70it/s, epoch_loss=0.430]
Epoch: 6: 100%|██████████| 225/225 [00:21<00:00, 10.68it/s, epoch_loss=0.428]
Epoch: 7: 100%|██████████| 225/225 [00:20<00:00, 10.74it/s, epoch_loss=0.424]
Epoch: 8: 100%|██████████| 225/225 [00:20<00:00, 10.72it/s, epoch_loss=0.422]
Epoch: 9: 100%|██████████| 225/225 [00:20<00:00, 10.74it/s, epoch_loss=0.419]
Epoch: 10: 100%|██████████| 225/225 [00:21<00:00, 10.69it/s, epoch_loss=0.415]
Epoch: 11: 100%|██████████| 225/225 [00:21<00:00, 10.71it/s, epoch_loss=0.413]
Epoch: 12: 100%|██████████| 225/225 [00:20<00:00, 10.74it/s, e

In [14]:
# binary_model.eval()
# torch.save(binary_model, '/content/gdrive/MyDrive/model.pth')
binary_model = torch.load('model.pth')
binary_model.eval()

BinaryClassifier(
  (pos_encoder): PositionalEncoding(
    (embeddings): Embedding(350, 128)
  )
  (multi_atten): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=64, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=64, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear

In [15]:
test_queries = test_queries[np.argsort(test_qids)]
test_passages = test_passages[np.argsort(test_qids)]
test_labels = test_labels[np.argsort(test_qids)]

In [16]:
valset = DataSeq(queries=test_queries, passages=test_passages, labels=test_labels)
valloader = torch.utils.data.DataLoader(valset, batch_size=512, shuffle=False, num_workers=2)

In [77]:
# model.eval()
# torch.save(model, '/content/gdrive/MyDrive/model.pth')

In [17]:
yTe_pred = []
with torch.no_grad():
    binary_model.eval()
    progress_bar = tqdm(valloader)
    for i, data in enumerate(progress_bar):
        progress_bar.set_description('Epoch: ' + str(0))
        query, passage, query_mask, passage_mask, label= data[0].squeeze().to(device), data[1].squeeze().to(device), data[2].squeeze().to(device), data[3].squeeze().to(device), data[4].to(device)
        query = model(input_ids=query, attention_mask=query_mask).last_hidden_state
        passage = model(input_ids=passage, attention_mask=passage_mask).last_hidden_state
        output = binary_model(query, passage, query_mask, passage_mask)
        yTe_pred.extend((output.squeeze().cpu().numpy().tolist()))

Epoch: 0: 100%|██████████| 2155/2155 [06:31<00:00,  5.51it/s]


In [18]:
import numpy as np
def Average_Precision(true_label):
    # AP for single query
    rela_idx = np.where(true_label == 1)[0]
    n_rela_passage = len(rela_idx)
    denom = rela_idx + 1
    numerator = np.arange(1, n_rela_passage+1)
    return (numerator/denom/n_rela_passage).sum()

def NDCG(true_label):
    # NDCG for single query
    DCG = np.sum((2**true_label - 1) / np.log2(np.arange(1, len(true_label)+1) + 1))
    n_rela_passage = int(np.sum(true_label))
    opt_rela_score = np.zeros(len(true_label))
    opt_rela_score[:n_rela_passage] = 1
    optDCG = np.sum((2**opt_rela_score - 1) / np.log2(np.arange(1, len(true_label)+1) + 1))
    return DCG/optDCG.sum() if optDCG != 0 else 0
    
def mean_metric_nn(test_qids, test_pids, yTe, pred_yTe, write=True):
    ranked_qid = test_qids[np.argsort(test_qids)]
    last = ranked_qid[0]
    counter = []
    cur_counter = []
    for i, id in enumerate(tqdm(ranked_qid)):
        cur = id
        if cur != last:
            counter.append(cur_counter)
            cur_counter = [i]
        else:
            cur_counter.append(i)
        last = cur
    counter.append(cur_counter)


    m_ap = 0
    m_ndcg = 0
    test_pids = test_pids[np.argsort(test_qids)]
    uni_qids = np.unique(test_qids)

    res_qid = []
    res_pid = []
    res_score = []
    res_rank = []
    res_A1 = []
    res_algoname = []

    for i, count in enumerate(tqdm(counter)):
        sort_idx = np.argsort(pred_yTe[count])[::-1]
        sub_yTe = yTe[count]
        true_label = sub_yTe[sort_idx]
        ap = Average_Precision(true_label)
        ndcg = NDCG(true_label)
        m_ap += ap
        m_ndcg += ndcg

        sub_pids = test_pids[count]

        res_qid.extend([uni_qids[i] for _ in range(len(sort_idx))])
        res_pid.extend(sub_pids[sort_idx])
        res_score.extend(pred_yTe[count][np.argsort(pred_yTe[count])[::-1]])
        res_rank.extend(np.arange(1, len(sort_idx)+1))
        res_A1.extend(['A1' for _ in range(len(sort_idx))])
        res_algoname.extend(['NN' for _ in range(len(sort_idx))])


    if write:
        data = {'qid': res_qid, 'A1': res_A1, 'pid': res_pid, 'rank': res_rank, 'score': res_score, 'algoname': res_algoname}
        data_df = pd.DataFrame(data)
        data_df.to_csv('NN.txt',index=False,header=False, sep=' ')
    
        
    
    return m_ap / len(counter), m_ndcg / len(counter)

In [19]:
mean_metric_nn(test_qids, test_pids, test_labels, np.array(yTe_pred))

100%|██████████| 1103039/1103039 [00:00<00:00, 2094042.37it/s]
100%|██████████| 1148/1148 [00:01<00:00, 934.95it/s]


(0.03623747755962718, 0.16870018637126288)