In [1]:
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
from torch.autograd import Variable
import string
import matplotlib.pyplot as plt
import numpy as np
import csv
import pandas as pd
import re
import os
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer 
import collections
from collections import Counter
import random
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.keyedvectors import KeyedVectors
import xgboost as xgb
from tqdm import tqdm

In [2]:
# format : qid pid queries passage relevancy
# 1103039 rows × 5 columns
validation_data = pd.read_csv('validation_data.tsv', sep='\t',header=0,low_memory=False)

# 4364339 rows × 5 columns
train_data = pd.read_csv('train_data.tsv', sep='\t',header=0,low_memory=False)

In [4]:
def preprocessing(text, stopword_removal = False, lemma = False):
    """
        A text preprocessing function
        Inputs:
          text: input queries/passages
          stopword_removal: remove all stopwords if True
          lemma: do lemmatisation and stemming if True
        Outputs:
          passage: queries/passages after preprocessing
    """
    stop_words = set(stopwords.words('english')) 
    word_tokens = RegexpTokenizer(r'\s+', gaps=True)
    passage = []
    for i in range(len(text)):
        words = text[i].lower()
        # remove punctuation
        rm_punc =re.compile('[%s]' % re.escape(string.punctuation))
        words = rm_punc.sub('', words)
        # remove all the numbers
        words = re.sub(r'[^a-zA-Z\s]', u' ', words, flags=re.UNICODE)
        # tokenize
        token_words = word_tokens.tokenize(words)
        
        # stop word removal
        if (stopword_removal == True):
            token_words = [w for w in token_words if not w in stop_words]
        
        sentence = []
        # lemmatisation & stemming
        if (lemma == True):
            stemmer = SnowballStemmer('english')
            for i in token_words:      
                sentence.append(stemmer.stem(i))
        else:
            sentence = token_words
        passage.append(sentence) 
    return passage

In [5]:
def Relevant_dict(data):
    """
        A relevant and irrelevant passage function
        Inputs:
          data: input dataset
        Outputs:
          relevant_dict: relevant passage dictionary with a format of {qid: {pid, position}}
          irrelevant_dict: irrelevant passage dictionary with a format of {qid: {pid, position}}
    """
    qid_list = data.qid
    pid_list = data.pid
    relevancy_list = data.relevancy
    relevant_dict = {}
    irrelevant_dict = {}
    for ind,qid in enumerate(qid_list):
        pid = pid_list[ind]
        relevancy = relevancy_list[ind]
        if relevancy > 0:
            if qid not in relevant_dict.keys():
                relevant_dict[qid] = {pid:ind}
            elif qid in relevant_dict.keys():
                new_pid = {pid:ind}
                relevant_dict[qid].update(new_pid)
        else:
            if qid not in irrelevant_dict.keys():
                irrelevant_dict[qid] = {pid:ind}
            elif qid in irrelevant_dict.keys():
                new_pid = {pid:ind}
                irrelevant_dict[qid].update(new_pid)

    return relevant_dict,irrelevant_dict

In [6]:
valid_relevant_dict, valid_irrelevant_dict = Relevant_dict(validation_data)

In [7]:
train_relevant_dict, train_irrelevant_dict = Relevant_dict(train_data)

In [8]:
def subsampling(data):
    """
        A subsampling function
        Inputs:
          data: input dataset
        Outputs:
           dataset after negative down sampling
    """
    # a list store all subsamples' positions selected
    DF_list = []
    
    # for each query
    for qid in train_relevant_dict.keys():   
        
        # keep all relevant passage, record their positions
        rel_list = list(train_relevant_dict[qid].values())
        
        # random choose samples from irrelevant passage with a rate of 0.025, 
        # record their positions
        if qid not in train_irrelevant_dict.keys():
            irrel_list = []
            
        else:
            L = list(train_irrelevant_dict[qid].values())
            
            # if the number of irrelevant passages for this qid is samller than 25, 
            # keep all irrelevant passages
            if len(L) <= 5:
                irrel_list = L
                
            # if the number of irrelevant passages for this qid is larger than 25,
            # choose them by the rate of 0.025
            else:
                irrel_list = random.sample(L,5) 
                # choose 25 here, since most amount of irrelevant passages is around 1000
                # 1000*0.025 = 25
        
        sample_ind = rel_list + irrel_list
        DF_list += sample_ind  
    
    # convert positions to their corresponding rows
    NewData = []
    for i in DF_list:
        newdata = data[i:i+1]
        NewData.append(newdata)
    
    # merge all the subsamples and convert to a dataFrame
    return pd.concat(NewData,axis=0,ignore_index=True)

In [9]:
train_subdata = subsampling(train_data)
train_subdata # 27726 rows

Unnamed: 0,qid,pid,queries,passage,relevancy
0,709560,1050990,what is all in basic metabolic panel,Basic Metabolic Panel. The basic metabolic pan...,1.0
1,709560,8695294,what is all in basic metabolic panel,This gives you the basic instructions on how a...,0.0
2,709560,1050988,what is all in basic metabolic panel,"A Dr. Kathleen Handal, MD , Emergency Medicine...",0.0
3,709560,2901427,what is all in basic metabolic panel,Calories are the basic unit of energy found in...,0.0
4,709560,2415797,what is all in basic metabolic panel,(See also Diabetes Mellitus.) Diabetic ketoaci...,0.0
...,...,...,...,...,...
27721,969974,2569623,where did the the trail of tears end,The Cherokees presented their own memorial to ...,0.0
27722,969974,1054561,where did the the trail of tears end,The museum is located along the route traveled...,0.0
27723,969974,6724015,where did the the trail of tears end,This challenging 2.1 mile trail connects the u...,0.0
27724,969974,778287,where did the the trail of tears end,c. tears The act of weeping: criticism that le...,0.0


In [10]:
subtrain_relevant_dict, subtrain_irrelevant_dict = Relevant_dict(train_subdata)
#len(subtrain_relevant_dict.keys())   # 4590 qids
#len(subtrain_irrelevant_dict.keys())  # 4589 qids

In [19]:
txt = "glove.6B.100d.txt"
 
# Get vector_size
with open(txt, 'r') as f:
    line = f.readline().split(' ')
    vector_size = len(line) - 1
    
# Get vocab_size
vocab_size = -1
for vocab_size, line in enumerate(open(txt,'rU')):
    pass
vocab_size += 1
 
# Add them to the start of file
with open(txt, 'r+') as f:
    content = f.read()        
    f.seek(0, 0)
    f.write(('%d %d\n' % (vocab_size, vector_size)) + content)
    
word_model = KeyedVectors.load_word2vec_format('glove.6B.100d.txt', binary=False)
len(list(word_model.key_to_index))  # 400000 words, 100 dim for each word

  for vocab_size, line in enumerate(open(txt,'rU')):


400000

In [75]:
max_length = 0
for i in tqdm(range(len(qid_query_train.keys()))):
    length = len(list(qid_query_train.values())[i])
    if length > max_length:
        max_length = length 
for i in tqdm(range(len(pid_passage_train.keys()))):
    length = len(list(pid_passage_train.values())[i])
    if length > max_length:
        max_length = length
for i in tqdm(range(len(qid_query_valid.keys()))):
    length = len(list(qid_query_valid.values())[i])
    if length > max_length:
        max_length = length
for i in tqdm(range(len(pid_passage_valid.keys()))):
    length = len(L[i])
    if length > max_length:
        max_length = length 
max_length

137

In [76]:
train_passages = preprocessing(train_subdata.passage, stopword_removal = True, lemma = False)
train_queries = preprocessing(train_subdata.queries, stopword_removal = True, lemma = False)
test_passages = preprocessing(validation_data.passage, stopword_removal = True, lemma = False)
test_queries = preprocessing(validation_data.queries, stopword_removal = True, lemma = False)

In [104]:
def word_table(datasets,model):
    token_to_ind = {} # tokens to indexes
    ind_to_vec = {} # indexes to word vectors
    i = 0
    
    for dataset in tqdm(datasets):
        for sentence in dataset: # for each query/passage
            for token in sentence: # for each token of the sentence
                # if this word is not token_to_ind
                if(token_to_ind.get(token) == None):
                    if token in model:
                    # if this word exists is the word model
                        i += 1
                        token_to_ind[token] = i
                        ind_to_vec[i] = model[token]

    return token_to_ind, ind_to_vec

In [105]:
# 150119 words
token_ind_dict, ind_vec_dict = word_table([train_passages,train_queries,test_passages,test_queries], word_model)

100%|█████████████████████████████████████████████| 4/4 [00:12<00:00,  3.15s/it]


In [112]:
def new_sentence_embedding(token_ind_dict,ind_vec_dict,text):
    sentence_vec = []
  ## for every sentence
    for sentence in tqdm(text):
        sentence_vec_list = []
        for word in sentence:
            word_index = token_ind_dict.get(word)
        if(word_index!=None):
            word_embedding = ind_vec_dict.get(word_index)
            sentence_vec_list.append(np.array(word_embedding))
        else:
            sentence_vec_list.append(np.array(np.zeros(100)))
    if len(sentence_vec_list) == 0:
        sentence_vec_list.append(np.array(np.zeros(100)))
        sentence_vec_list = np.array(sentence_vec_list)

    sentence_vec.append(np.mean(sentence_vec_list,axis = 0))

    return np.array(sentence_vec)

In [114]:
train_passage_ind = new_sentence_embedding(token_ind_dict, ind_vec_dict, train_passages)
train_queries_ind= new_sentence_embedding(token_ind_dict, ind_vec_dict, train_queries)

test_passage_ind = new_sentence_embedding(token_ind_dict, ind_vec_dict, test_passages)
test_queries_ind = new_sentence_embedding(token_ind_dict, ind_vec_dict, test_queries)

100%|██████████████████████████████████| 27726/27726 [00:01<00:00, 13876.53it/s]
100%|██████████████████████████████████| 27726/27726 [00:00<00:00, 34015.40it/s]
100%|██████████████████████████████| 1103039/1103039 [01:26<00:00, 12734.22it/s]
100%|██████████████████████████████| 1103039/1103039 [00:20<00:00, 53817.00it/s]


In [122]:
train_labels = train_subdata['relevancy'].values
test_labels = validation_data['relevancy'].values

In [130]:
def look_up_table(ind_vec_dict):
    table = [np.zeros(100)]
    for key in sorted (ind_vec_dict.keys()) :  
        table.append(ind_vec_dict.get(key))
    return np.array(table)

new_table = look_up_table(ind_vec_dict)
print(new_table.shape) # (150120, 100)

(150120, 100)


In [138]:
def padding_zeros(max_length,vector):
    vector = np.array(vector)
    if(vector.shape[0] < max_length and vector.shape[0] != 0):
        padding_vector = np.zeros(max_length - vector.shape[0])
        return np.concatenate((vector, padding_vector), axis=0)
    elif(vector.shape[0] == max_length):
        return vector
    else:
        return np.zeros((0,0))

def word_to_index_func(text,max_length,labels,token_ind_dict):
    embedding_ind = []
    embedding_labels = []
    sentence_lengths = []
    i = -1

    for sentence in tqdm(text):
        i += 1
        embedding_sentence = []
        for word in sentence:
            if(token_ind_dict.get(word)!=None):
                embedding_sentence.append(token_ind_dict.get(word))

    sentence_lengths.append(len(embedding_sentence))

    embedding_sentence =  padding_zeros(max_length,embedding_sentence) 
    embedding_labels.append(labels[i])
    
    if(embedding_sentence.shape[0]!=0):
        embedding_ind.append(np.array(embedding_sentence))
    else:
        embedding_ind.append(np.zeros(max_length))
    Ind = np.array(embedding_ind)
    Labels = np.array(embedding_labels)
    Len = np.array(sentence_lengths)
    return Ind,Labels,Len


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_pInd = torch.from_numpy(train_pInd).int().to(device)
train_qInd = torch.from_numpy(train_qInd).int().to(device)
train_plen = torch.from_numpy(train_plen)
train_plen = torch.as_tensor(train_plen,dtype=torch.int64)
train_qlen = torch.from_numpy(train_qlen)
train_qlen = torch.as_tensor(train_qlen, dtype=torch.int64)
train_labels = torch.from_numpy(train_labels).to(device)

new_table = torch.from_numpy(new_table).float().to(device)

test_pInd = torch.from_numpy(test_pInd).int().to(device)
test_qInd = torch.from_numpy(test_qInd).int().to(device)
test_plen = torch.from_numpy(test_plen)
test_plen = torch.as_tensor(test_plen, dtype=torch.int64)
test_qlen = torch.from_numpy(test_qlen)
test_qlen = torch.as_tensor(test_qlen, dtype=torch.int64)
test_labels = torch.from_numpy(test_labels).to(device)

train_data = torch.utils.data.TensorDataset(train_pInd, train_plen, train_qInd, train_qlen, train_labels)
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=128, shuffle=True)

test_data = torch.utils.data.TensorDataset(test_pInd, test_plen, test_qInd, test_queries_lengths, test_qlen)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=128, shuffle=False)

100%|██████████████████████████████████| 27726/27726 [00:01<00:00, 18727.03it/s]
100%|██████████████████████████████████| 27726/27726 [00:00<00:00, 83024.69it/s]
 77%|███████████████████████▉       | 852886/1103039 [00:45<00:09, 25849.26it/s]

In [192]:
class RNN(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_dim):
        super(RNN,self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers = 2,
                          bidirectional = True, dropout = 0.5)
        self.fc = nn.Linear(hidden_dim*2,1)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self,x):
        embedding = self.dropout(self,embedding(x))
        output,(hidden,cell) = self.rnn(embedding)
        hidden - torch.cat(hidden[-2],hidden[-1],dim=1)
        hidden = self.dropout(hidden)
        out = self.fc(hidden)
        return out 

In [193]:
rnn = RNN(len(look_up_table),100,256)
pretrained_embedding = 


In [None]:
def train(rnn, iterator,optimizer,stop):
    