In [1]:
from passage_reader import get_passages_for_relations, get_passages_for_relations_selective_negative
from  transformers import DistilBertTokenizer, DistilBertModel, DistilBertForSequenceClassification, AdamW
from  transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertPreTrainedModel
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from rerank_passages import qid_relation, read_passages, get_query, tokenizer as nltk_tokenizer
from rank_bm25 import BM25Okapi
from nltk import ngrams
from nltk.corpus import stopwords
import regex as re
import pandas as pd
import numpy as np
import torch
import operator
import rank_bm25
import nltk
import pickle
import collections
from tqdm import tqdm
# from apex import amp
from random import random
from copy import deepcopy
import gc

  from ._conv import register_converters as _register_converters
I0226 17:38:34.418381 139661789845248 file_utils.py:39] PyTorch version 1.3.0 available.
Using TensorFlow backend.


In [None]:
instances = ['actedIn', 'created', 'diedIn', 'diedOnDate', 'directed', 'graduatedFrom', 'hasCapital', 'hasChild',
             'hasWonPrize', 'influences', 'isCitizenOf', 'isKnownFor', 'isLeaderOf', 'isLocatedIn', 'isMarriedTo',
             'isPoliticianOf', 'participatedIn', 'wasBornIn', 'wasBornOnDate']

instance_converter = {'actedIn':'acted in', 'created': 'created', 'diedIn': 'died in', 'diedOnDate': 'died on date', 'directed':'directed', 'graduatedFrom':'graduated from', 'hasCapital': 'has capital', 'hasChild': 'has child',
             'hasWonPrize': 'has won prize','influences': 'influences','isCitizenOf': 'is citizen of', 'isKnownFor': 'is known for', 'isLeaderOf': 'is leader of', 'isLocatedIn': 'is located in', 'isMarriedTo': 'is married to',
             'isPoliticianOf': 'is politician of', 'participatedIn': 'participated in', 'wasBornIn': 'was born in', 'wasBornOnDate': 'was born on date'}


In [None]:
PASSAGE_DIRECTORY = 'passages/fact_check_centroid_sorted'
SAMPLE_SIZE = 4000
SKIP_DUPLICATE = True
REGRESSION = True

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
def tag_entities(passage, source, target, label=None):
    
    passage = passage.replace('(', ' ( ')
    passage = passage.replace(')', ' ) ')
    passage = passage.replace('[', ' [ ')
    passage = passage.replace(']', ' ] ')
    passage = passage.replace('  ', ' ')
    
    passage = passage.replace(source, '<startSource> {} <endSource>'.format(source))
    passage = passage.replace(target, '<startTarget> {} <endTarget>'.format(target))
    list_stopwords = set(stopwords.words('english'))
    subjects = []
    objects = []
    for w in source.split(' '):
        if w.lower() not in list_stopwords:
            subjects.append(w)
    
    for w in target.split(' '):
        if w.lower() not in list_stopwords:
            objects.append(w)
    
    reg_split = re.compile(r'[\s\.\,\!\?\:\;/-]')
    source_start = False
    target_start = False  
    tagged_passage = ''

    for w in reg_split.split(passage):
        w = w.strip()
        if not source_start and w == '<startSource>':      
            source_start = True
            tagged_passage += ' {} '.format(w)
        elif source_start and w == '<endSource>':
            source_start = False
            tagged_passage += ' {} '.format(w)
        elif not target_start and w == '<startTarget>':
            target_start = True
            tagged_passage += ' {} '.format(w)
        elif target_start and w == '<endTarget>':
            target_start = False
            tagged_passage += ' {} '.format(w)
        elif not source_start and not target_start:
            if w in subjects:
                tagged_passage += ' <startSource> {} <endSource> '.format(w)
            elif w in objects:
                tagged_passage += ' <startTarget> {} <endTarget> '.format(w)
            else:
                tagged_passage += ' {} '.format(w)
        else:
            tagged_passage += ' {} '.format(w)
        
    return tagged_passage.replace('  ', ' ')

In [None]:
data, label = get_passages_for_relations_selective_negative(directory=PASSAGE_DIRECTORY, relations=instances,
                                                               sample_size=SAMPLE_SIZE,
                                                               # skip_mode='even',  # skip even line
                                                               skip_duplicate=SKIP_DUPLICATE,
                                                               regression=REGRESSION)

In [None]:
data

In [None]:
tag_entities(data['passage'][29] + 'Jane '+'Thomas', data['source'][29], data['target'][29])

In [None]:
data['converted_relation'] = data['relation'].apply(lambda x: instance_converter[x])

In [None]:
data['source_relation_target'] = data['source'] +' ' + data['converted_relation'] + ' ' + data['target']

In [None]:
data

In [None]:
label

In [3]:
def compute_bm25_sentences(paragraph, query):
    sentences = nltk.sent_tokenize(paragraph.replace('.', ' . '))
    tokenized_sentences = []
        
    for s in sentences:
        tokenized_sentences.append(nltk_tokenizer.tokenize(s))
            
    bm25 = BM25Okapi(tokenized_sentences)
    tokenized_query = nltk_tokenizer.tokenize(query)
    bm25_scores = bm25.get_scores(tokenized_query)
        
    merge_sentences_scores = list(zip(sentences, bm25_scores))
   
    merge_sentences_scores.sort(key=lambda tup: tup[1], reverse=True)
   
    sorted_sentences = [s for s, score in merge_sentences_scores[:3]]
        
    if len(sorted_sentences) < 3:
        sorted_sentences+=[' '] * (3-len(sorted_sentences))
    
    sorted_length_sentences = sorted_sentences.copy()
    
    length_sentences = [len(x.split(' ')) for x in sorted_sentences]
    sorted_length_sentences += length_sentences
    
    return tuple(sorted_length_sentences)
    
#paragraph 51120
# print(compute_bm25_sentences(data['passage'][37], data['source_relation_target'][37]))
# label[10]
# print(data['source_relation_target'][10])

In [4]:
def disjoint_ngrams(sentence, n):
    tokenized_sentence = sentence.split(' ')
    divide_sentence = int(len(tokenized_sentence)/n)
    ngram_sentences = []
    begin = 0
    end = n
    for _ in range(0, divide_sentence):
        print(tokenized_sentence[begin:end])
        ngram_sentences.append(tokenized_sentence[begin:end])
        begin = end
        end += n
    return ngram_sentences

In [5]:
def compute_bm25_sentences_with_n_grams(paragraph, query):
#     sentences = nltk_tokenizer.tokenize(paragraph)
    tokenized_sentences = disjoint_ngrams(paragraph,20)
#     for s in ngrams(sentences,6):
#         tokenized_sentences.append(s)
    
    bm25 = BM25Okapi(tokenized_sentences)
    tokenized_query = nltk_tokenizer.tokenize(query)
    bm25_scores = bm25.get_scores(tokenized_query)

In [6]:
def pairwise_loss(s_i, s_j, S_ij, sigma=1):
    C = torch.log1p(torch.exp(-sigma * (s_i - s_j)))
    if S_ij == -1:
        C += sigma * (s_i - s_j)
    elif S_ij == 0:
        C += 0.5 * sigma * (s_i - s_j)
    elif S_ij == 1:
        pass
    else:
        raise ValueError("S_ij: -1/0/1")
    return C

In [None]:
data['sentence_1'], data['sentence_2'], data['sentence_3'], data['length_1'], data['length_2'], data['length_3']  = zip(*data.apply(lambda x: compute_bm25_sentences(x['passage'], x['source_relation_target']), axis=1))

In [None]:
print(data['sentence_1'][10], data['sentence_2'][10], data['sentence_3'][10])
print(data['passage'][10])

In [None]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
special_tokens_dict = {'additional_special_tokens':['<startSource>', '<endSource>', 
                                                    '<startTarget>','<endTarget>']}
print(len(bert_tokenizer))
num_added_tokens = bert_tokenizer.add_special_tokens(special_tokens_dict)
print('{} special tokens were added'.format(num_added_tokens))
print(len(bert_tokenizer))

In [7]:
def transform_data_x_y_sentences(tokenizer, data, label):
    y = []
    
    x_1 = []
    x_1_tokens = []
    segment_id_1 = []
    attention_mask_1 = []
    
    x_2 = []
    x_2_tokens = []
    segment_id_2 = []
    attention_mask_2 = []
    
    x_3 = []
    x_3_tokens = []
    segment_id_3 = []
    attention_mask_3 = []

    for s_1, s_2, s_3, src, targ, r, l in zip(data['sentence_1'], data['sentence_2'], data['sentence_3'],
                       data['source'], data['target'], data['source_relation_target'], label):
        
        s_1 = tag_entities(passage=s_1, source=src, target=targ)
        s_2 = tag_entities(passage=s_2, source=src, target=targ)
        s_3 = tag_entities(passage=s_3, source=src, target=targ)
        
        temp_x_1, temp_x_1_tokens, temp_segment_id_1, temp_attention_mask_1 = transform_passage(tokenizer, s_1, r)
        temp_x_2, temp_x_2_tokens, temp_segment_id_2, temp_attention_mask_2 = transform_passage(tokenizer, s_2, r)
        temp_x_3, temp_x_3_tokens, temp_segment_id_3, temp_attention_mask_3 = transform_passage(tokenizer, s_3, r)
        
        if temp_x_1 is not None and temp_x_2 is not None and temp_x_3 is not None:
            x_1.append(temp_x_1)
            x_1_tokens.append(temp_x_1_tokens)
            segment_id_1.append(temp_segment_id_1)
            attention_mask_1.append(temp_attention_mask_1)
            
            x_2.append(temp_x_2)
            x_2_tokens.append(temp_x_2_tokens)
            segment_id_2.append(temp_segment_id_2)
            attention_mask_2.append(temp_attention_mask_2)
            
            x_3.append(temp_x_3)
            x_3_tokens.append(temp_x_3_tokens)
            segment_id_3.append(temp_segment_id_3)
            attention_mask_3.append(temp_attention_mask_3)
            
            y.append(l)
            
    return x_1, x_1_tokens, segment_id_1, attention_mask_1,\
        x_2, x_2_tokens, segment_id_2, attention_mask_2,\
        x_3, x_3_tokens, segment_id_3, attention_mask_3,\
        y
        
def transform_data_x_sentences(tokenizer, passages, relations, sources, targets):
    x_1 = []
    x_1_tokens = []
    segment_id_1 = []
    attention_mask_1 = []
    
    x_2 = []
    x_2_tokens = []
    segment_id_2 = []
    attention_mask_2 = []
    
    x_3 = []
    x_3_tokens = []
    segment_id_3 = []
    attention_mask_3 = []

    for p, rel, src, targ in zip(passages, relations, sources, targets):
        
        source_relation_target = src + ' ' + rel + ' ' + targ
        
        s_1, s_2, s_3, _, _, _ =  compute_bm25_sentences(p, source_relation_target)
        
        s_1 = tag_entities(passage=s_1, source=src, target=targ)
        s_2 = tag_entities(passage=s_2, source=src, target=targ)
        s_3 = tag_entities(passage=s_3, source=src, target=targ)
        
        temp_x_1, temp_x_1_tokens, temp_segment_id_1, temp_attention_mask_1 = transform_passage(tokenizer, s_1,\
                                                                                                source_relation_target)
        temp_x_2, temp_x_2_tokens, temp_segment_id_2, temp_attention_mask_2 = transform_passage(tokenizer, s_2,\
                                                                                                source_relation_target)
        temp_x_3, temp_x_3_tokens, temp_segment_id_3, temp_attention_mask_3 = transform_passage(tokenizer, s_3,\
                                                                                                source_relation_target)
        
        x_1.append(temp_x_1)
        x_1_tokens.append(temp_x_1_tokens)
        segment_id_1.append(temp_segment_id_1)
        attention_mask_1.append(temp_attention_mask_1)
            
        x_2.append(temp_x_2)
        x_2_tokens.append(temp_x_2_tokens)
        segment_id_2.append(temp_segment_id_2)
        attention_mask_2.append(temp_attention_mask_2)
            
        x_3.append(temp_x_3)
        x_3_tokens.append(temp_x_3_tokens)
        segment_id_3.append(temp_segment_id_3)
        attention_mask_3.append(temp_attention_mask_3)

    return x_1, x_1_tokens, segment_id_1, attention_mask_1,\
        x_2, x_2_tokens, segment_id_2, attention_mask_2,\
        x_3, x_3_tokens, segment_id_3, attention_mask_3

def transform_passage(tokenizer, passage, relation):
    max_length = 300
    tokenized_r = tokenizer.tokenize(relation)
    tokenized_p = tokenizer.tokenize(passage)[:max_length - len(tokenized_r) - 3]
    
    if len(tokenized_p) + len(tokenized_r) < max_length: #509
        temp_x = ['[CLS]'] + tokenized_p + ['[SEP]'] + tokenized_r + ['[SEP]']
        temp_segment_id = (len(tokenized_p)+2) * [0] + (len(tokenized_r)+1) * [1] + (max_length - len(temp_x)) * [0]
        temp_attention_mask = (len(tokenized_p)+2) * [1] + (len(tokenized_r)+1) * [1] + (max_length - len(temp_x)) * [0]
        x_tokens = np.asarray(tokenizer.convert_tokens_to_ids(temp_x) + (max_length - len(temp_x)) * [0])
        return temp_x, x_tokens, temp_segment_id, temp_attention_mask
    
    return (None,) * 4


In [None]:
# x, x_tokens, segment_id, attention_mask, y = transform_data_x_y(bert_tokenizer, data, label)
x_1, x_1_tokens, segment_id_1, attention_mask_1,\
    x_2, x_2_tokens, segment_id_2, attention_mask_2,\
    x_3, x_3_tokens, segment_id_3, attention_mask_3,\
    y = transform_data_x_y_sentences(bert_tokenizer, data, label)

In [None]:
x_3[70]

In [8]:
def split_data(x, x_tokens, segment_id, attention_mask, y):
    x_tokens_train, x_tokens_val, y_train, y_val = train_test_split(x_tokens, y, test_size=0.1, random_state=42)
    _, _, segment_id_train, segment_id_val = train_test_split(x_tokens, segment_id, test_size=0.1, random_state=42)
    _, _, attention_mask_train, attention_mask_val = train_test_split(x_tokens, attention_mask, test_size=0.1, random_state=42)
    _, _, x_train, x_val = train_test_split(x_tokens, x, test_size=0.1, random_state=42)
    
    return x_train, x_tokens_train, segment_id_train, attention_mask_train, y_train,\
        x_val, x_tokens_val, segment_id_val, attention_mask_val, y_val 


In [None]:
x_1_train, x_1_tokens_train, segment_id_1_train,\
    attention_mask_1_train, y_1_train,\
    x_1_val, x_1_tokens_val, segment_id_1_val,\
    attention_mask_1_val, y_1_val = split_data(x_1, x_1_tokens, segment_id_1, attention_mask_1, y)
    

x_2_train, x_2_tokens_train, segment_id_2_train,\
    attention_mask_2_train, y_2_train,\
    x_2_val, x_2_tokens_val, segment_id_2_val,\
    attention_mask_2_val, y_2_val = split_data(x_2, x_2_tokens, segment_id_2, attention_mask_2, y)
    
x_3_train, x_3_tokens_train, segment_id_3_train,\
    attention_mask_3_train, y_3_train,\
    x_3_val, x_3_tokens_val, segment_id_3_val,\
    attention_mask_3_val, y_3_val = split_data(x_3, x_3_tokens, segment_id_3, attention_mask_3, y)
    
print(x_1_train[10])
print(x_2_train[10])
print(x_3_train[10])

print('')
print(x_1_val[10])
print(x_2_val[10])
print(x_3_val[10])

In [9]:
def convert_to_tensor_train_val(x_tokens_train, segment_id_train, attention_mask_train, y_train,\
                      x_tokens_val, segment_id_val, attention_mask_val, y_val):

    x_tokens_train_tensor, segment_id_train_tensor, attention_mask_train_tensor = convert_to_tensor_x(x_tokens_train,\
                                                                                                      segment_id_train, attention_mask_train)
    y_train_tensor = torch.tensor(y_train)

    x_tokens_val_tensor, segment_id_val_tensor, attention_mask_val_tensor = convert_to_tensor_x(x_tokens_val,\
                                                                                                segment_id_val, attention_mask_val)
    y_val_tensor = torch.tensor(y_val)
    
    return x_tokens_train_tensor, segment_id_train_tensor,\
        attention_mask_train_tensor, y_train_tensor,\
        x_tokens_val_tensor, segment_id_val_tensor,\
        attention_mask_val_tensor, y_val_tensor

def convert_to_tensor_x(x_tokens, segment_id, attention_mask):
    x_tokens_tensor = torch.tensor(x_tokens)
    segment_id_tensor = torch.tensor(segment_id)
    attention_mask_tensor = torch.tensor(attention_mask)
    
    return x_tokens_tensor, segment_id_tensor, attention_mask_tensor

In [None]:
x_1_tokens_train_tensor, segment_id_1_train_tensor,\
    attention_mask_1_train_tensor, y_1_train_tensor,\
    x_1_tokens_val_tensor, segment_id_1_val_tensor, \
    attention_mask_val_1_tensor, y_val_1_tensor = convert_to_tensor_train_val(x_1_tokens_train, segment_id_1_train,\
                                                                    attention_mask_1_train, y_1_train,\
                                                                    x_1_tokens_val, segment_id_1_val,\
                                                                    attention_mask_1_val, y_1_val)
    
x_2_tokens_train_tensor, segment_id_2_train_tensor,\
    attention_mask_2_train_tensor, y_2_train_tensor,\
    x_2_tokens_val_tensor, segment_id_2_val_tensor, \
    attention_mask_val_2_tensor, y_val_2_tensor = convert_to_tensor_train_val(x_2_tokens_train, segment_id_2_train,\
                                                                    attention_mask_2_train, y_2_train,\
                                                                    x_2_tokens_val, segment_id_2_val,\
                                                                    attention_mask_2_val, y_2_val)
    
x_3_tokens_train_tensor, segment_id_3_train_tensor,\
    attention_mask_3_train_tensor, y_3_train_tensor,\
    x_3_tokens_val_tensor, segment_id_3_val_tensor, \
    attention_mask_val_3_tensor, y_val_3_tensor = convert_to_tensor_train_val(x_3_tokens_train, segment_id_3_train,\
                                                                    attention_mask_3_train, y_3_train,\
                                                                    x_3_tokens_val, segment_id_3_val,\
                                                                    attention_mask_3_val, y_3_val)

In [10]:
class SentenceEntityBertModel(BertPreTrainedModel):
    def __init__(self, config):
        super(SentenceEntityBertModel, self).__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.last_hidden_layer = torch.nn.Linear(config.hidden_size, 150)
        self.last_hidden_layer_1 = torch.nn.Linear(150*3, 150)
        self.classifier = torch.nn.Linear(150, self.config.num_labels)

        self.init_weights()
    
    def forward_once(self, input_ids, token_type_ids=None, attention_mask=None,
                position_ids=None, head_mask=None, labels=None):

        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids, 
                            head_mask=head_mask)
        
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        last_hidden_layer = self.last_hidden_layer(pooled_output)
        
        return last_hidden_layer
    
    def forward(self, input_ids_1, token_type_ids_1, attention_mask_1,\
                input_ids_2, token_type_ids_2, attention_mask_2,\
                input_ids_3, token_type_ids_3, attention_mask_3,\
                labels=None):
        global device
        
        last_hidden_layer_1  = self.forward_once(input_ids=input_ids_1.to(device), token_type_ids=token_type_ids_1.to(device), attention_mask=attention_mask_1.to(device))
        last_hidden_layer_2  = self.forward_once(input_ids=input_ids_2.to(device), token_type_ids=token_type_ids_2.to(device), attention_mask=attention_mask_2.to(device))
        last_hidden_layer_3  = self.forward_once(input_ids=input_ids_3.to(device), token_type_ids=token_type_ids_3.to(device), attention_mask=attention_mask_3.to(device))
        
        my_tensor = torch.cat((last_hidden_layer_1, last_hidden_layer_2, last_hidden_layer_3), dim=1)

        output_hidden_layer_1 = self.last_hidden_layer_1(my_tensor)
        logits = self.classifier(output_hidden_layer_1)
        
        outputs = (logits,) # + outputs[2:]   add hidden states and attention if they are here
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

In [None]:
batch_size = 2

training_tensors = TensorDataset(x_1_tokens_train_tensor, segment_id_1_train_tensor, attention_mask_1_train_tensor,\
                                     x_2_tokens_train_tensor, segment_id_2_train_tensor, attention_mask_2_train_tensor,\
                                     x_3_tokens_train_tensor, segment_id_3_train_tensor, attention_mask_3_train_tensor,\
                                     y_3_train_tensor)
    
val_tensors = TensorDataset(x_1_tokens_val_tensor, segment_id_1_val_tensor, attention_mask_val_1_tensor,\
                                x_2_tokens_val_tensor, segment_id_2_val_tensor, attention_mask_val_2_tensor,\
                                x_3_tokens_val_tensor, segment_id_3_val_tensor, attention_mask_val_3_tensor,\
                                y_val_3_tensor)
    
training_set = DataLoader(training_tensors, batch_size=batch_size, pin_memory=True, shuffle=True)
val_set = DataLoader(val_tensors, batch_size=128, pin_memory=True)


In [11]:
def rank_passages(bert_tokenizer, nltk_tokenizer, model, passages, relations, sources, targets, query_text):
    global device
    model.to(device)
    
    rank_scores = np.zeros(shape=(len(passages), 2))
    bm25_scores = np.zeros(shape=(len(passages), 2))

    for relation, source, target in zip(relations, sources, targets):

        relation = [relation] * len(passages)
        source = [source] * len(passages)
        target = [target] * len(passages)
        
        passtext = [passage.text for passage in passages]

        x_1_test, x_1_tokens_test, segment_id_1_test, attention_mask_1_test,\
        x_2_test, x_2_tokens_test, segment_id_2_test, attention_mask_2_test,\
        x_3_test, x_3_tokens_test, segment_id_3_test, attention_mask_3_test = transform_data_x_sentences(bert_tokenizer,\
                                                                                                         passtext, relation, source, target)
        
        x_1_tokens_test_tensor, segment_id_1_test_tensor, attention_mask_1_test_tensor = convert_to_tensor_x(x_1_tokens_test,\
                                                                                                             segment_id_1_test, attention_mask_1_test)
        x_2_tokens_test_tensor, segment_id_2_test_tensor, attention_mask_2_test_tensor = convert_to_tensor_x(x_2_tokens_test,\
                                                                                                             segment_id_2_test, attention_mask_2_test)
        x_3_tokens_test_tensor, segment_id_3_test_tensor, attention_mask_3_test_tensor = convert_to_tensor_x(x_3_tokens_test,\
                                                                                                             segment_id_3_test, attention_mask_3_test) 
        temp_rank_scores = []
#         print(x_1_test[0])
#         print(x_1_tokens_test[0])
#         print(segment_id_1_test[0])
#         print(attention_mask_1_test[0])
        test_tensors = TensorDataset(x_1_tokens_test_tensor, segment_id_1_test_tensor, attention_mask_1_test_tensor,\
                                     x_2_tokens_test_tensor, segment_id_2_test_tensor, attention_mask_2_test_tensor,\
                                     x_3_tokens_test_tensor, segment_id_3_test_tensor, attention_mask_3_test_tensor)
        test_set = DataLoader(test_tensors, batch_size=1)

        with torch.no_grad():
            for i, (in_1, seg_1, attn_1, in_2, seg_2, attn_2, in_3, seg_3, attn_3) in enumerate(test_set):
#                 print((in_1, seg_1, attn_1, in_2, seg_2, attn_2, in_3, seg_3, attn_3))
                results = model(in_1, seg_1, attn_1, in_2, seg_2, attn_2, in_3, seg_3, attn_3)
                temp_rank_scores.append(results[0].cpu().detach().numpy()[0])
            rank_scores += np.asarray(temp_rank_scores)
        
#         tokenized_corpus = []

#         for p in passtext:
#             tokenized_corpus.append(nltk_tokenizer.tokenize(p))
# #         # bm25
# #         # tokenized_corpus = tokenizer.tokenize(passtext)
#         bm25 = BM25Okapi(tokenized_corpus)
#         tokenized_query = nltk_tokenizer.tokenize(query_text)
#         bm25_scores[:, 0] = bm25.get_scores(tokenized_query)
#         rank_scores += bm25_scores
    rank_scores /= (len(relations) * 2)

    for i, score in enumerate(rank_scores):
       
        passages[i].score = score[0]

    passages.sort(key=operator.attrgetter('score'), reverse=True)
    return passages

In [None]:
num_labels = 2

model = SentenceEntityBertModel.from_pretrained('bert-base-cased', num_labels=num_labels)
model.resize_token_embeddings(len(bert_tokenizer))
model.to(device)

FULL_FINETUNE = True

optimizer_grouped_parameters = None
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']

if FULL_FINETUNE:
    print('ALL FINETUNE')
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    print('NO ALL FINETUNE')
    #         {'params': model.last_hidden_layer.parameters(),
#          'weight_decay_rate': 0.01},

    optimizer_grouped_parameters = [
        {'params': model.classifier.parameters(),
         'weight_decay_rate': 0.01}
    ]

for n, p in param_optimizer:
    print(n)

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
# model, optimizer = amp.initialize(model, optimizer, opt_level='O0')


In [12]:
def write_results(model, name='bert', epoch=None):
    global bert_tokenizer
    global nltk_tokenizer
    global instance_converter
    result_file_name='ranking.txt'
    query_pass = read_passages(result_file_name)
    queries = pd.read_csv("test.csv", delimiter=",")
    
    if epoch is not None:
        name += '_{}'.format(epoch)
    with open('fact_checks/' + name + '_entity.tab', 'w', encoding='utf8') as output:
            for qid in range(1, len(queries) + 1):
                passage_id = queries['qid'].values[qid-1]
                try:
                    print('Query:', str(qid)+'/'+str(len(queries)))
                    temp_relations, sources, targets, query_text = get_query(queries, qid)
                    relations = []
                    for rl in temp_relations:
                        temp_rl = rl
                        for ci in instance_converter:
                            if ci in rl:
                                temp_rl = temp_rl.replace(ci, instance_converter[ci])
                        relations.append(temp_rl)
                    
                    rerank = rank_passages(bert_tokenizer, nltk_tokenizer, model, query_pass[passage_id], relations, sources, targets, query_text)
                    for passage in rerank:
                        print(str(passage_id) + '\t' + passage.entity + '\t' + passage.text + '\t' + str(passage.score),
                              file=output)
                except FileNotFoundError as file_error:
                    for i in range(len(query_pass[qid])):
                        print(str(qid) + '\t-\t-\t0', file=output)

                    print(file_error)
                    pass

In [None]:
epochs = 4
accumulation_grad = 32
min_loss = 1000

In [None]:
print('size of training set {}'.format(len(training_set)))
print('size of val set {}'.format(len(val_set)))

for ep in range(0, epochs):
    gc.collect()
    print('epoch: {}'.format(ep+1))
    model.train()
    train_loss = 0
    val_loss = 0
    cnt_step = 0
    cnt_acc = 0
    model.zero_grad()
    for i, (in_1, seg_1, attn_1, in_2, seg_2, attn_2, in_3, seg_3, attn_3, y_3) in tqdm(enumerate(training_set)):
        loss, logits = model(in_1, seg_1, attn_1, in_2, seg_2, attn_2, in_3, seg_3, attn_3, y_3.to(device))
        train_loss += loss.item()
        loss = loss / (accumulation_grad/batch_size)
        loss.backward()
        cnt_acc += batch_size

        if cnt_acc % accumulation_grad == 0:
            optimizer.step()
            model.zero_grad()
        cnt_step += 1
    print('training loss {}'.format(train_loss/cnt_step))
        
    model.zero_grad()
    model.eval()
    cnt_step = 0
    with torch.no_grad():
        for i, (in_1, seg_1, attn_1, in_2, seg_2, attn_2, in_3, seg_3, attn_3, y_3) in tqdm(enumerate(val_set)):
            loss, logits = model(in_1, seg_1, attn_1, in_2, seg_2, attn_2, in_3, seg_3, attn_3, y_3.to(device))
            val_loss += loss.item()
            cnt_step += 1
        print('val loss {}'.format(val_loss/cnt_step))
    
    val_loss = val_loss/cnt_step
    if min_loss > val_loss:
        print('We have a new val loss')
        min_loss = val_loss
        torch.save(model, 'models/bert_entity_sentences/bert_entity.pth')
        with open('models/bert_entity_sentences/bert_tokenizer_entity.pickle', 'wb') as handle:
            pickle.dump(bert_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
write_results(model)

In [None]:
del model
torch.cuda.empty_cache()

In [None]:
del optimizer
torch.cuda.empty_cache()

In [None]:
del loss
torch.cuda.empty_cache()

## Meta-learning

In [15]:
best_model = torch.load('models/bert_entity_sentences/bert_entity.pth', map_location=device)
best_tokenizer = pickle.load(open("models/bert_entity_sentences/bert_tokenizer_entity.pickle","rb"))
best_model.to(device)

bert_tokenizer = best_tokenizer


# write_results(best_model)

FULL_FINETUNE = True

optimizer_grouped_parameters = None
param_optimizer = list(best_model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']

if FULL_FINETUNE:
    print('ALL FINETUNE')
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    print('NO ALL FINETUNE')
    #         {'params': model.last_hidden_layer.parameters(),
#          'weight_decay_rate': 0.01},

    optimizer_grouped_parameters = [
        {'params': best_model.classifier.parameters(),
         'weight_decay_rate': 0.01}
    ]

for n, p in param_optimizer:
    print(n)

meta_learning_optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

ALL FINETUNE
bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.we

In [16]:
def transform_pairs(bert_tokenizer, model, passages, relations, sources, targets, query_text):
        
    pair_passages = []
    for p1, p2 in it.combinations(passages, 2):
        if p1.text != p2.text:
            pair_passages.append([p1.text, int(p1.score), p2.text, int(p2.score)])
            
    pair_passages = np.asarray(pair_passages)
    
    all_first_pair = []
    all_second_pair = []
    
    for relation, source, target in zip(relations, sources, targets):
                
        relation = [relation] * len(pair_passages)
        source = [source] * len(pair_passages)
        target = [target] * len(pair_passages)
        
        
        x_fp_1, x_fp_1_tokens, segment_id_fp_1, attention_mask_fp_1,\
        x_fp_2, x_fp_2_tokens, segment_id_fp_2, attention_mask_fp_2,\
        x_fp_3, x_fp_3_tokens, segment_id_fp_3, attention_mask_fp_3 = transform_data_x_sentences(bert_tokenizer,\
                                                                                                         pair_passages[:, 0], relation, source, target)
        
        x_sp_1, x_sp_1_tokens, segment_id_sp_1, attention_mask_sp_1,\
        x_sp_2, x_sp_2_tokens, segment_id_sp_2, attention_mask_sp_2,\
        x_sp_3, x_sp_3_tokens, segment_id_sp_3, attention_mask_sp_3 = transform_data_x_sentences(bert_tokenizer,\
                                                                                                         pair_passages[:, 2], relation, source, target)


        
        x_fp_1_tokens_tensor, segment_id_fp_1_tensor, attention_mask_fp_1_tensor = convert_to_tensor_x(x_fp_1_tokens,\
                                                                                                             segment_id_fp_1, attention_mask_fp_1)
        x_fp_2_tokens_tensor, segment_id_fp_2_tensor, attention_mask_fp_2_tensor = convert_to_tensor_x(x_fp_2_tokens,\
                                                                                                             segment_id_fp_2, attention_mask_fp_2)
        x_fp_3_tokens_tensor, segment_id_fp_3_tensor, attention_mask_fp_3_tensor = convert_to_tensor_x(x_fp_3_tokens,\
                                                                                                             segment_id_fp_3, attention_mask_fp_3) 
        
        x_sp_1_tokens_tensor, segment_id_sp_1_tensor, attention_mask_sp_1_tensor = convert_to_tensor_x(x_sp_1_tokens,\
                                                                                                             segment_id_sp_1, attention_mask_sp_1)
        x_sp_2_tokens_tensor, segment_id_sp_2_tensor, attention_mask_sp_2_tensor = convert_to_tensor_x(x_sp_2_tokens,\
                                                                                                             segment_id_sp_2, attention_mask_sp_2)
        x_sp_3_tokens_tensor, segment_id_sp_3_tensor, attention_mask_sp_3_tensor = convert_to_tensor_x(x_sp_3_tokens,\
                                                                                                             segment_id_sp_3, attention_mask_sp_3) 
        
        FIELDS = ('x', 'x_1_tokens_tensors', 'segment_id_1_tensors',\
                        'attention_mask_1_tensors', 'x_2_tokens_tensors', 'segment_id_2_tensors',\
                        'attention_mask_2_tensors', 'x_3_tokens_tensors', 'segment_id_3_tensors',\
                        'attention_mask_3_tensors', 'scores')
        
        Dataset = collections.namedtuple('Dataset', FIELDS)
        
        first_pair = Dataset(x=x_fp_1, x_1_tokens_tensors=x_fp_1_tokens_tensor, 
                             segment_id_1_tensors=segment_id_fp_1_tensor, 
                             attention_mask_1_tensors=attention_mask_fp_1_tensor,
                             x_2_tokens_tensors=x_fp_2_tokens_tensor, 
                             segment_id_2_tensors=segment_id_fp_2_tensor, 
                             attention_mask_2_tensors=attention_mask_fp_2_tensor,
                             x_3_tokens_tensors=x_fp_3_tokens_tensor, 
                             segment_id_3_tensors=segment_id_fp_3_tensor, 
                             attention_mask_3_tensors=attention_mask_fp_3_tensor,
                             scores=pair_passages[:, 1])
        
        second_pair = Dataset(x=x_sp_1, x_1_tokens_tensors=x_sp_1_tokens_tensor, 
                             segment_id_1_tensors=segment_id_sp_1_tensor, 
                             attention_mask_1_tensors=attention_mask_sp_1_tensor,
                             x_2_tokens_tensors=x_sp_2_tokens_tensor, 
                             segment_id_2_tensors=segment_id_sp_2_tensor, 
                             attention_mask_2_tensors=attention_mask_sp_2_tensor,
                             x_3_tokens_tensors=x_sp_3_tokens_tensor, 
                             segment_id_3_tensors=segment_id_sp_3_tensor, 
                             attention_mask_3_tensors=attention_mask_sp_3_tensor,
                             scores=pair_passages[:, 3])


        all_first_pair.append(first_pair)
        all_second_pair.append(second_pair)
        
    
    return all_first_pair, all_second_pair

In [None]:
# import itertools as it

# train_data = pd.read_csv('train.csv', delimiter=",")
# val_data = pd.read_csv('val.csv', delimiter=",")
# test_data = pd.read_csv('test.csv', delimiter=",")
# all_queries = pd.read_csv('queries.csv', delimiter=",")
# query_pass = read_passages('ranking.txt')

# torch.cuda.empty_cache()
# for _ in range(0, 1):
    
    
    
#     for qid_train, qid_val in zip(train_data['qid'], val_data['qid']):
#         print(qid_train, qid_val)
#         new_parameters = list(best_model.parameters()).copy()
        
#         temp_relations_train, sources_train, targets_train, query_text_train = get_query(all_queries, 
#                                                                  int(qid_train))
#         temp_relations_val, sources_val, targets_val, query_text_val = get_query(all_queries, 
#                                                                  int(qid_val))
        
#         passages = query_pass[int(qid_train)]
#         all_train_first_pair, all_train_second_pair = transform_pairs(best_tokenizer, best_model, passages, temp_relations_train, 
#                                                           sources_train, targets_train, query_text_train)
        
#         inner_loss = None
#         cnt_train = 0
#         for _ in range(0, 10):
#             for first_pair, second_pair in zip(all_train_first_pair, all_train_second_pair):

#                 for idx in range(0, len(first_pair.x)):
#                     pred_fp = best_model(first_pair.x_1_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_1_tensors[idx].unsqueeze(0),\
#                         first_pair.attention_mask_1_tensors[idx].unsqueeze(0), first_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
#                         first_pair.segment_id_2_tensors[idx].unsqueeze(0), first_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
#                         first_pair.x_3_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_3_tensors[idx].unsqueeze(0),\
#                         first_pair.attention_mask_3_tensors[idx].unsqueeze(0))

#                     pred_sp = best_model(second_pair.x_1_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_1_tensors[idx].unsqueeze(0),\
#                         second_pair.attention_mask_1_tensors[idx].unsqueeze(0), second_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
#                         second_pair.segment_id_2_tensors[idx].unsqueeze(0), second_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
#                         second_pair.x_3_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_3_tensors[idx].unsqueeze(0),\
#                         second_pair.attention_mask_3_tensors[idx].unsqueeze(0))

#                     if first_pair.scores[idx] > second_pair.scores[idx]:
#                         S_ij = 1
#                     elif first_pair.scores[idx] == second_pair.scores[idx]:
#                         S_ij = 0
#                     else:
#                         S_ij = -1 

# #                     if inner_loss is None:
# #                         inner_loss = pairwise_loss(pred_fp[0][0][0], pred_sp[0][0][0], S_ij)
# #                     else:
# #                         inner_loss += pairwise_loss(pred_fp[0][0][0], pred_sp[0][0][0], S_ij)
                    
#                     inner_loss = pairwise_loss(pred_fp[0][0][0], pred_sp[0][0][0], S_ij)
                    
#                     cnt_train += 1
               
#                 grads = torch.autograd.grad(inner_loss, list(best_model.parameters()),create_graph=True)
#                 new_parameters = [(new_parameters[i] - 1e-4 * grads[i]) for i in range(0, len(grads))]
                
#                 del grads
#                 torch.cuda.empty_cache()
        
# #         print("task inner loss", inner_loss.item()/cnt_train)
        
#         state_dict = best_model.state_dict()
#         for n_p, i in zip(new_parameters, state_dict):
#             state_dict[i] = n_p
#         best_model.load_state_dict(state_dict)
        
#         del new_parameters
#         torch.cuda.empty_cache()
#         all_val_first_pair, all_val_second_pair = transform_pairs(best_tokenizer, best_model, passages, temp_relations_val, 
#                                                           sources_val, targets_val, query_text_val)
        
#         outer_loss = None
#         task_val_loss = 0
#         cnt_val = 0
        
#         for first_pair, second_pair in zip(all_val_first_pair, all_val_second_pair):
            
#             for idx in range(0, len(first_pair.x)):
#                 pred_fp = best_model(first_pair.x_1_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_1_tensors[idx].unsqueeze(0),\
#                     first_pair.attention_mask_1_tensors[idx].unsqueeze(0), first_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
#                     first_pair.segment_id_2_tensors[idx].unsqueeze(0), first_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
#                     first_pair.x_3_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_3_tensors[idx].unsqueeze(0),\
#                     first_pair.attention_mask_3_tensors[idx].unsqueeze(0))

#                 pred_sp = best_model(second_pair.x_1_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_1_tensors[idx].unsqueeze(0),\
#                     second_pair.attention_mask_1_tensors[idx].unsqueeze(0), second_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
#                     second_pair.segment_id_2_tensors[idx].unsqueeze(0), second_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
#                     second_pair.x_3_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_3_tensors[idx].unsqueeze(0),\
#                     second_pair.attention_mask_3_tensors[idx].unsqueeze(0))
                
#                 if first_pair.scores[idx] > second_pair.scores[idx]:
#                     S_ij = 1
#                 elif first_pair.scores[idx] == second_pair.scores[idx]:
#                     S_ij = 0
#                 else:
#                     S_ij = -1 
                
#                 outer_loss = pairwise_loss(pred_fp[0][0][0], pred_sp[0][0][0], S_ij)
#                 task_val_loss += outer_loss.item()
#                 outer_loss.backward()
#                 meta_learning_optimizer.step()
#                 meta_learning_optimizer.zero_grad()
#                 cnt_val += 1
                
#         print("task inner loss", outer_loss / cnt_val)
                    

In [None]:
# import itertools as it

# train_data = pd.read_csv('train.csv', delimiter=",")
# val_data = pd.read_csv('val.csv', delimiter=",")
# test_data = pd.read_csv('test.csv', delimiter=",")
# all_queries = pd.read_csv('queries.csv', delimiter=",")
# query_pass = read_passages('ranking.txt')

# torch.cuda.empty_cache()

# for _ in range(0, 5):
    
    
#     num_task = 0
#     first_task = False
#     sum_gradients = []
#     for qid_train, qid_val in zip(train_data['qid'], val_data['qid']):
#         print(qid_train, qid_val)
#         fast_model = deepcopy(best_model)
#         fast_model.to(device)
#         inner_optimizer = AdamW(fast_model.parameters(), lr=2e-5)
#         temp_relations_train, sources_train, targets_train, query_text_train = get_query(all_queries, 
#                                                                  int(qid_train))
#         temp_relations_val, sources_val, targets_val, query_text_val = get_query(all_queries, 
#                                                                  int(qid_val))
        
#         passages = query_pass[int(qid_train)]
#         all_train_first_pair, all_train_second_pair = transform_pairs(best_tokenizer, best_model, passages, temp_relations_train, 
#                                                           sources_train, targets_train, query_text_train)
                                                                                                  
        
#         fast_model.train()
#         inner_loss = None
#         for _ in range(0, 15):
            
#             cnt_train_break = 0
#             cnt_train = 1
#             train_loss = 0
#             accumulation_batch = 32
#             for first_pair, second_pair in zip(all_train_first_pair, all_train_second_pair):

#                 for idx in range(0, len(first_pair.x)):
#                     pred_fp = fast_model(first_pair.x_1_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_1_tensors[idx].unsqueeze(0),\
#                         first_pair.attention_mask_1_tensors[idx].unsqueeze(0), first_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
#                         first_pair.segment_id_2_tensors[idx].unsqueeze(0), first_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
#                         first_pair.x_3_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_3_tensors[idx].unsqueeze(0),\
#                         first_pair.attention_mask_3_tensors[idx].unsqueeze(0))

#                     pred_sp = fast_model(second_pair.x_1_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_1_tensors[idx].unsqueeze(0),\
#                         second_pair.attention_mask_1_tensors[idx].unsqueeze(0), second_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
#                         second_pair.segment_id_2_tensors[idx].unsqueeze(0), second_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
#                         second_pair.x_3_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_3_tensors[idx].unsqueeze(0),\
#                         second_pair.attention_mask_3_tensors[idx].unsqueeze(0))

#                     if first_pair.scores[idx] > second_pair.scores[idx]:
#                         S_ij = 1
#                     elif first_pair.scores[idx] == second_pair.scores[idx]:
#                         S_ij = 0
#                     else:
#                         S_ij = -1 

#                     inner_loss = pairwise_loss(pred_fp[0][0][0], pred_sp[0][0][0], S_ij)
#                     train_loss += inner_loss.item()
#                     inner_loss = inner_loss / accumulation_batch
# #                     print(pred_fp[0][0][0], pred_sp[0][0][0])
                    
#                     inner_loss.backward()
#                     if cnt_train % accumulation_batch == 0:
# #                         print(cnt_train)
#                         inner_optimizer.step()
#                         fast_model.zero_grad()
                    
#                     if cnt_train_break == 128:
#                         break
#                     cnt_train_break += 1
#                     cnt_train += 1
#             print(train_loss/(cnt_train-1)) 
#         del inner_loss
#         torch.cuda.empty_cache()
#         print("finish Task")
                
#         all_val_first_pair, all_val_second_pair = transform_pairs(best_tokenizer, best_model, passages, temp_relations_val, 
#                                                           sources_val, targets_val, query_text_val)
        
#         q_loss = None
#         saved_loss = None
#         cnt_val = 0
#         val_loss = 0
#         cnt_break_val = 0
#         for first_pair, second_pair in zip(all_val_first_pair, all_val_second_pair):
            
#             for idx in range(0, len(first_pair.x)):
#                 pred_fp = fast_model(first_pair.x_1_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_1_tensors[idx].unsqueeze(0),\
#                     first_pair.attention_mask_1_tensors[idx].unsqueeze(0), first_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
#                     first_pair.segment_id_2_tensors[idx].unsqueeze(0), first_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
#                     first_pair.x_3_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_3_tensors[idx].unsqueeze(0),\
#                     first_pair.attention_mask_3_tensors[idx].unsqueeze(0))

#                 pred_sp = fast_model(second_pair.x_1_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_1_tensors[idx].unsqueeze(0),\
#                     second_pair.attention_mask_1_tensors[idx].unsqueeze(0), second_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
#                     second_pair.segment_id_2_tensors[idx].unsqueeze(0), second_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
#                     second_pair.x_3_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_3_tensors[idx].unsqueeze(0),\
#                     second_pair.attention_mask_3_tensors[idx].unsqueeze(0))
                
#                 if first_pair.scores[idx] > second_pair.scores[idx]:
#                     S_ij = 1
#                 elif first_pair.scores[idx] == second_pair.scores[idx]:
#                     S_ij = 0
#                 else:
#                     S_ij = -1 
                
#                 q_loss = pairwise_loss(pred_fp[0][0][0], pred_sp[0][0][0], S_ij)
#                 q_loss = q_loss / len(all_val_first_pair)
#                 val_loss += q_loss.item()
#                 q_loss.backward()
# #                 if cnt_val == 0:
# #                     saved_loss = q_loss.clone()
# #                 else:
# #                     saved_loss += q_loss.clone()
                
# #                 fast_model.zero_grad()
#                 if cnt_break_val == 128:
#                     break
#                 cnt_val += 1  
#                 cnt_break_val+=1
#             print(val_loss/cnt_val)
                
# #         saved_loss /= cnt_val
        
# #         saved_loss.backward()
        
#         fast_model.to(torch.device('cpu'))
#         for i, params in enumerate(fast_model.parameters()):
#             if first_task == False:
#                 sum_gradients.append(deepcopy(params.grad))
#             else:
#                 sum_gradients[i] += deepcopy(params.grad)
                        
#         del fast_model, inner_optimizer, saved_loss
#         torch.cuda.empty_cache()
#         first_task = True
#         num_task += 1
# #         print(sum_gradients)
        
#     for i in range(0, len(sum_gradients)):
#         print("before")
#         print(sum_gradients[i])
#         print("after")
#         print(sum_gradients[i] / float(num_task))
#         sum_gradients[i] = sum_gradients[i] / float(num_task)
#     print("average")
#     print(sum_gradients)
#     state_dict = best_model.state_dict()
#     for n_p, i in zip(sum_gradients, state_dict):
#         state_dict[i] = n_p
#     best_model.load_state_dict(state_dict)
        
#     meta_learning_optimizer.step()
#     best_model.zero_grad()
        
#     del sum_gradients
#     gc.collect()
                    

In [None]:
import itertools as it

train_data = pd.read_csv('train.csv', delimiter=",")
val_data = pd.read_csv('val.csv', delimiter=",")
test_data = pd.read_csv('test.csv', delimiter=",")
all_queries = pd.read_csv('queries.csv', delimiter=",")
query_pass = read_passages('ranking.txt')

torch.cuda.empty_cache()
innerstepsize = 0.001
outerstepsize0 = 0.1
outeriterations = 5
inneriterations = 30

for iteration in range(0, outeriterations):

    weights_before = deepcopy(best_model.state_dict())
    
    for qid_train, qid_val in zip(train_data['qid'], val_data['qid']):
        print(qid_train, qid_val)
        
        best_model.to(device)

        temp_relations_train, sources_train, targets_train, query_text_train = get_query(all_queries, 
                                                                 int(qid_train))
        temp_relations_val, sources_val, targets_val, query_text_val = get_query(all_queries, 
                                                                 int(qid_val))
        
        passages = query_pass[int(qid_train)]
        all_train_first_pair, all_train_second_pair = transform_pairs(best_tokenizer, best_model, passages, temp_relations_train, 
                                                          sources_train, targets_train, query_text_train)
                                                                                                  
        
        best_model.train()
        inner_loss = None
        for _ in range(0, inneriterations):
            cnt_train = 1
            train_loss = 0

            for first_pair, second_pair in zip(random.sample(all_train_first_pair, len(all_train_first_pair))[:100], 
                                               random.sample(all_train_second_pair, len(all_train_second_pair))[:100]:

                for idx in range(0, len(first_pair.x)):
                    pred_fp = best_model(first_pair.x_1_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_1_tensors[idx].unsqueeze(0),\
                        first_pair.attention_mask_1_tensors[idx].unsqueeze(0), first_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
                        first_pair.segment_id_2_tensors[idx].unsqueeze(0), first_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
                        first_pair.x_3_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_3_tensors[idx].unsqueeze(0),\
                        first_pair.attention_mask_3_tensors[idx].unsqueeze(0))

                    pred_sp = best_model(second_pair.x_1_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_1_tensors[idx].unsqueeze(0),\
                        second_pair.attention_mask_1_tensors[idx].unsqueeze(0), second_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
                        second_pair.segment_id_2_tensors[idx].unsqueeze(0), second_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
                        second_pair.x_3_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_3_tensors[idx].unsqueeze(0),\
                        second_pair.attention_mask_3_tensors[idx].unsqueeze(0))

                    if first_pair.scores[idx] > second_pair.scores[idx]:
                        S_ij = 1
                    elif first_pair.scores[idx] == second_pair.scores[idx]:
                        S_ij = 0
                    else:
                        S_ij = -1 

                    inner_loss = pairwise_loss(pred_fp[0][0][0], pred_sp[0][0][0], S_ij)
                    inner_loss.backward()
                    train_loss += inner_loss.item()              
                    for param in model.parameters():
                        param.data -= innerstepsize * param.grad.data
                                               
                    cnt_train += 1
            print(train_loss/(cnt_train-1)) 
        del inner_loss
        torch.cuda.empty_cache()
        print("finish Task")
                
        all_val_first_pair, all_val_second_pair = transform_pairs(best_tokenizer, best_model, passages, temp_relations_val, 
                                                          sources_val, targets_val, query_text_val)
        best_model.eval()
        q_loss = None
        cnt_val = 0
        val_loss = 0

    for first_pair, second_pair in zip(all_val_first_pair, all_val_second_pair):
            
            for idx in range(0, len(first_pair.x)):
                pred_fp = best_model(first_pair.x_1_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_1_tensors[idx].unsqueeze(0),\
                    first_pair.attention_mask_1_tensors[idx].unsqueeze(0), first_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
                    first_pair.segment_id_2_tensors[idx].unsqueeze(0), first_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
                    first_pair.x_3_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_3_tensors[idx].unsqueeze(0),\
                    first_pair.attention_mask_3_tensors[idx].unsqueeze(0))

                pred_sp = best_model(second_pair.x_1_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_1_tensors[idx].unsqueeze(0),\
                    second_pair.attention_mask_1_tensors[idx].unsqueeze(0), second_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
                    second_pair.segment_id_2_tensors[idx].unsqueeze(0), second_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
                    second_pair.x_3_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_3_tensors[idx].unsqueeze(0),\
                    second_pair.attention_mask_3_tensors[idx].unsqueeze(0))
                
                if first_pair.scores[idx] > second_pair.scores[idx]:
                    S_ij = 1
                elif first_pair.scores[idx] == second_pair.scores[idx]:
                    S_ij = 0
                else:
                    S_ij = -1 
                
                q_loss = pairwise_loss(pred_fp[0][0][0], pred_sp[0][0][0], S_ij)
                q_loss = q_loss / len(all_val_first_pair)
                val_loss += q_loss.item()
                cnt_val += 1  
            print(val_loss/cnt_val)
                
    weights_after = best_model.state_dict()
    outerstepsize = outerstepsize0 * (1 - iteration / outeriterations) # linear schedule
    model.load_state_dict({name : 
        weights_before[name] + (weights_after[name] - weights_before[name]) * outerstepsize 
        for name in weights_before})


In [None]:
import itertools as it

train_data = pd.read_csv('train.csv', delimiter=",")
val_data = pd.read_csv('val.csv', delimiter=",")
test_data = pd.read_csv('test.csv', delimiter=",")
all_queries = pd.read_csv('queries.csv', delimiter=",")
query_pass = read_passages('ranking.txt')

torch.cuda.empty_cache()

for _ in range(0, 5):
    
    
    num_task = 0
    first_task = False
    sum_gradients = []
    for qid_train, qid_val in zip(train_data['qid'], val_data['qid']):
        print(qid_train, qid_val)
        best_model.to(device)
        inner_optimizer = AdamW(best_model.parameters(), lr=2e-5)
        temp_relations_train, sources_train, targets_train, query_text_train = get_query(all_queries, 
                                                                 int(qid_train))
        temp_relations_val, sources_val, targets_val, query_text_val = get_query(all_queries, 
                                                                 int(qid_val))
        
        passages = query_pass[int(qid_train)]
        all_train_first_pair, all_train_second_pair = transform_pairs(best_tokenizer, best_model, passages, temp_relations_train, 
                                                          sources_train, targets_train, query_text_train)
                                                                                                  
        
        best_model.train()
        inner_loss = None
        for _ in range(0, 30):
            
            cnt_train_break = 0
            cnt_train = 1
            train_loss = 0
            accumulation_batch = 32
            for first_pair, second_pair in zip(all_train_first_pair, all_train_second_pair):

                for idx in range(0, len(first_pair.x)):
                    pred_fp = best_model(first_pair.x_1_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_1_tensors[idx].unsqueeze(0),\
                        first_pair.attention_mask_1_tensors[idx].unsqueeze(0), first_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
                        first_pair.segment_id_2_tensors[idx].unsqueeze(0), first_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
                        first_pair.x_3_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_3_tensors[idx].unsqueeze(0),\
                        first_pair.attention_mask_3_tensors[idx].unsqueeze(0))

                    pred_sp = best_model(second_pair.x_1_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_1_tensors[idx].unsqueeze(0),\
                        second_pair.attention_mask_1_tensors[idx].unsqueeze(0), second_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
                        second_pair.segment_id_2_tensors[idx].unsqueeze(0), second_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
                        second_pair.x_3_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_3_tensors[idx].unsqueeze(0),\
                        second_pair.attention_mask_3_tensors[idx].unsqueeze(0))

                    if first_pair.scores[idx] > second_pair.scores[idx]:
                        S_ij = 1
                    elif first_pair.scores[idx] == second_pair.scores[idx]:
                        S_ij = 0
                    else:
                        S_ij = -1 

                    inner_loss = pairwise_loss(pred_fp[0][0][0], pred_sp[0][0][0], S_ij)
                    train_loss += inner_loss.item()
                    inner_loss = inner_loss / accumulation_batch
#                     print(pred_fp[0][0][0], pred_sp[0][0][0])
                    
                    inner_loss.backward()
                    if cnt_train % accumulation_batch == 0:
#                         print(cnt_train)
                        inner_optimizer.step()
                        best_model.zero_grad()
                    
#                     if cnt_train_break == 128:
#                         break
#                     cnt_train_break += 1
                    cnt_train += 1
            print(train_loss/(cnt_train-1)) 
        del inner_loss
        torch.cuda.empty_cache()
        print("finish Task")
                
        all_val_first_pair, all_val_second_pair = transform_pairs(best_tokenizer, best_model, passages, temp_relations_val, 
                                                          sources_val, targets_val, query_text_val)
        best_model.eval()
        q_loss = None
        saved_loss = None
        cnt_val = 0
        val_loss = 0
        cnt_break_val = 0
        for first_pair, second_pair in zip(all_val_first_pair, all_val_second_pair):
            
            for idx in range(0, len(first_pair.x)):
                pred_fp = best_model(first_pair.x_1_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_1_tensors[idx].unsqueeze(0),\
                    first_pair.attention_mask_1_tensors[idx].unsqueeze(0), first_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
                    first_pair.segment_id_2_tensors[idx].unsqueeze(0), first_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
                    first_pair.x_3_tokens_tensors[idx].unsqueeze(0), first_pair.segment_id_3_tensors[idx].unsqueeze(0),\
                    first_pair.attention_mask_3_tensors[idx].unsqueeze(0))

                pred_sp = best_model(second_pair.x_1_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_1_tensors[idx].unsqueeze(0),\
                    second_pair.attention_mask_1_tensors[idx].unsqueeze(0), second_pair.x_2_tokens_tensors[idx].unsqueeze(0),\
                    second_pair.segment_id_2_tensors[idx].unsqueeze(0), second_pair.attention_mask_2_tensors[idx].unsqueeze(0),\
                    second_pair.x_3_tokens_tensors[idx].unsqueeze(0), second_pair.segment_id_3_tensors[idx].unsqueeze(0),\
                    second_pair.attention_mask_3_tensors[idx].unsqueeze(0))
                
                if first_pair.scores[idx] > second_pair.scores[idx]:
                    S_ij = 1
                elif first_pair.scores[idx] == second_pair.scores[idx]:
                    S_ij = 0
                else:
                    S_ij = -1 
                
                q_loss = pairwise_loss(pred_fp[0][0][0], pred_sp[0][0][0], S_ij)
                q_loss = q_loss / len(all_val_first_pair)
                val_loss += q_loss.item()
#                 q_loss.backward()
#                 if cnt_val == 0:
#                     saved_loss = q_loss.clone()
#                 else:
#                     saved_loss += q_loss.clone()
                
#                 fast_model.zero_grad()
#                 if cnt_break_val == 128:
#                     break
                cnt_val += 1  
#                 cnt_break_val+=1
            print(val_loss/cnt_val)
                
#         saved_loss /= cntcnt_train_val
        
#         saved_loss.backward()
        
#         fast_model.to(torch.device('cpu'))
#         for i, params in enumerate(fast_model.parameters()):
#             if first_task == False:
#                 sum_gradients.append(deepcopy(params.grad))
#             else:
#                 sum_gradients[i] += deepcopy(params.grad)
                        
#         del fast_model, inner_optimizer, saved_loss
#         torch.cuda.empty_cache()
#         first_task = True
#         num_task += 1
# #         print(sum_gradients)
        
#     for i in range(0, len(sum_gradients)):
#         print("before")
#         print(sum_gradients[i])
#         print("after")
#         print(sum_gradients[i] / float(num_task))
#         sum_gradients[i] = sum_gradients[i] / float(num_task)
#     print("average")
#     print(sum_gradients)
#     state_dict = best_model.state_dict()
#     for n_p, i in zip(sum_gradients, state_dict):
#         state_dict[i] = n_p
#     best_model.load_state_dict(state_dict)
        
#     meta_learning_optimizer.step()
#     best_model.zero_grad()
        
#     del sum_gradients
#     gc.collect()
                    

7 35
0.39797250899723063
0.317992667451425
0.2893703974382437
0.27733493439759577
0.2756504666556747
0.2736845723597901
0.26966798097430317
0.26641213700918065
0.2693179442569458
0.2665028119471031
0.26352726032440377
0.26387225234527
0.26329002965100967
0.26157419045603125
0.261898591185175
0.26102411341165227
0.26125291696036873
0.26138552641012197
0.26070264050762937
0.2609200755729765
0.2608264530280049
0.26054572840347034
0.2609022171633256
0.2608351590769914
0.2611195031548921
0.26115372676691867
0.2622838529022952
0.2608546538225485
0.2602459659710466
0.260033931102619
finish Task
0.2808420009190502
4 17
0.5403952299657326
0.4412925555491828
0.42837414997879425
0.4190454647013082
0.40320568254875705
0.3826213339486581
0.36114917239022026
0.3533344291250709
0.34745146695242396
0.34466432095622157
0.3431849891514095
0.3420814105440191
0.3420093847159229
0.341162464360453
0.3410142666816078
0.3404131688248813
0.34055871445946573
0.3404209507775948
0.3402904494562434
0.3402202447458

0.34487649642600227
0.34396091869842904
0.34346668523928636
0.34141022080306355
0.34137597143619786
0.33998742256599845
0.33967044234414456
0.33981445997268284
0.33986681203940183
0.3399660102108161
0.3395300371381644
0.339076728338718
0.3379382892468584
0.3376252383434045
0.3372541933467863
0.33743906290714837
0.33750448426553115
0.33746281018595786
0.3372556753974538
0.3372041034978506
0.33721802302629894
0.33699895345508735
0.3371892194142705
finish Task
0.10276560084703058
0.10291083489767407
0.10290427388341462
0.10288876700677722
23 52
0.6021922330602143
0.4535989159735021
0.4114043025833028
0.40295925900486246
0.3927554549764602
0.39012703619322553
0.38751759422646453
0.3852653804708317
0.3839091374805634
0.38492786836433524
0.38516787892517634
0.3868706831833281
0.3840655851410936
0.38368945054158754
0.3830241784390031
0.38312633634843324
0.3826247271566588
0.38308476893603355
0.38231094177112906
0.382767178467065
0.3821023689064345
0.38323664808718694
0.3825426167015467
0.3828

In [None]:
write_results(best_model, 'meta_learned_bert')

In [None]:
for i in best_model.parameters():
    print(i)