In [1]:
# increase the cell width 
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; } </style>"))   

# need to run this every time start this notebook, to add python3.7/site-packages to sys.pat, in order to import ipywidgets, which is used when RobertaTokenizer.from_pretrained('roberta-base') 
import sys
# sys.path.insert(-1, '/xdisk/msurdeanu/fanluo/miniconda3/lib/python3.7/site-packages')
sys.path.insert(-1, '/xdisk/msurdeanu/fanluo/miniconda3/lib/python3.8/site-packages')

### convert hotpotqa to squard format

According to Longformer: use the following input format with special tokens:  “[CLS] [q] question [/q] [p] sent1,1 [s] sent1,2 [s] ... [p] sent2,1 [s] sent2,2 [s] ...” 
where [s] and [p] are special tokens representing sentences and paragraphs. The special tokens were added to the RoBERTa vocabulary and randomly initialized before task finetuning.

In [2]:
# helper functions to convert hotpotqa to squard format modified from  https://github.com/chiayewken/bert-qa/blob/master/run_hotpot.py

import tqdm 
from datetime import datetime 
import pytz 
timeZ_Az = pytz.timezone('US/Mountain') 
import transformers 

QUESTION_START = '[question]'
QUESTION_END = '[/question]' 
TITLE_START = '<t>'  # indicating the start of the title of a paragraph (also used for loss over paragraphs)
TITLE_END = '</t>'   # indicating the end of the title of a paragraph
SENT_MARKER_END = '[/sent]'  # indicating the end of the title of a sentence (used for loss over sentences)
PAR = '[/par]'  # used for indicating end of the regular context and beginning of `yes/no/null` answers
EXTRA_ANSWERS = " yes no null"

 
def create_example_dict(context, answer, id, question, is_sup_fact, is_supporting_para):
    return {
        "context": context,
        "qas": [                        # each context corresponds to only one qa in hotpotqa
            {
                "answer": answer,
                "id": id,
                "question": question,
                "is_sup_fact": is_sup_fact,
                "is_supporting_para": is_supporting_para
            }
        ],
    }

def create_para_dict(example_dicts):
    if type(example_dicts) == dict:
        example_dicts = [example_dicts]   # each paragraph corresponds to only one [context, qas] in hotpotqa
    return {"paragraphs": example_dicts}   


In [3]:
# !python -m pip install tqdm 
# !python -m pip install git+https://github.com/allenai/longformer.git 
# !python -m pip install pytorch-lightning==0.6.0
# !python -m pip install jdc  
# !wget https://ai2-s2-research.s3-us-west-2.amazonaws.com/longformer/longformer-base-4096.tar.gz
# !tar -xf longformer-base-4096.tar.gz

In [4]:
import re
import string

def convert_hotpot_to_squad_format(json_dict, gold_paras_only=False):
    
    """function to convert hotpotqa to squard format.


    Note: A context corresponds to several qas in SQuard. In hotpotqa, one question corresponds to several paragraphs as context. 
          "paragraphs" means different: each paragraph in SQuard contains a context and a list of qas; while 10 paragraphs in hotpotqa concatenated into a context for one question.

    Args:
        json_dict: The original data load from hotpotqa file.
        gold_paras_only: when is true, only use the 2 paragraphs that contain the gold supporting facts; if false, use all the 10 paragraphs
 

    Returns:
        new_dict: The converted dict of hotpotqa dataset, use it as a dict would load from SQuAD json file
                  usage: input_data = new_dict["data"]   https://github.com/google-research/bert/blob/eedf5716ce1268e56f0a50264a88cafad334ac61/run_squad.py#L230

    """
 
    new_dict = {"data": []} 
    for example in json_dict: 

        support_para = set(
            para_title for para_title, _ in example["supporting_facts"]
        )
        sp_set = set(list(map(tuple, example['supporting_facts'])))
        
        raw_contexts = example["context"]
        if gold_paras_only: 
            raw_contexts = [lst for lst in raw_contexts if lst[0] in support_para]
            
        is_supporting_para = []  # a boolean list with 10 True/False elements, one for each paragraph
        is_sup_fact = []         # a boolean list with True/False elements, one for each context sentence
        for para_title, para_lines in raw_contexts:
            is_supporting_para.append(para_title in support_para)   
            for sent_id, sent in enumerate(para_lines):
                is_sup_fact.append( (para_title, sent_id) in sp_set )    
        
        for lst in raw_contexts:
            lst[0] = normalize_answer(lst[0])
            lst[1] = [normalize_answer(sent) for sent in lst[1]]
        
        contexts = [TITLE_START + ' ' + lst[0]  + ' ' + TITLE_END + ' ' + (' ' + SENT_MARKER_END +' ').join(lst[1]) + ' ' + SENT_MARKER_END for lst in raw_contexts]    
        # extra space is fine, which would be ignored latter. most sentences has already have heading space, there are several no heading space; call the normalize_answer() which is same as the one used during evaluation
   
        context = " ".join(contexts)
#         print(context)
        
#         exit(0)

        
        answer = normalize_answer(example["answer"]) 
#         print("answer: ", answer)
        if(len(answer) > 0):   # answer can be '' after normalize
            new_dict["data"].append(
                create_para_dict(
                    create_example_dict(
                        context=context,
                        answer=answer,
                        id = example["_id"],
                        question=normalize_answer(example["question"]),
                        is_sup_fact = is_sup_fact,
                        is_supporting_para = is_supporting_para 
                    )
                )
            ) 

    return new_dict

def normalize_answer(s):

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

In [5]:
# debug: check whether convert_hotpot_to_squad_format() works
import os
os.chdir('/xdisk/msurdeanu/fanluo/hotpotQA/Data')
# !cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/hotpot_train_v1.1.json | ../../helper/jq-linux64 -c '.[76200:76280]' > small.json
#!cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/hotpot_train_v1.1.json | ../../helper/jq-linux64 -c '.[37:50]' > small_dev.json
# !cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/hotpot_train_v1.1.json | ../../helper/jq-linux64 -c '.[31:50]' > sample.json

import json
with open("small.json", "r", encoding='utf-8') as f:  
    json_dict = convert_hotpot_to_squad_format(json.load(f))['data']
    print(json.dumps(json_dict[3], indent=2))

{
  "paragraphs": [
    {
      "context": "<t> dr seuss how grinch stole christmas </t> dr seuss how grinch stole christmas [/sent] is video game based on dr seuss book with same name but mostly based on film [/sent] game was released on november 8 2007 [/sent] <t> lorax film </t> lorax also known as dr seuss lorax is 2012 american 3d computeranimated musical fantasy\u2013comedy film produced by illumination entertainment and based on dr seusss childrens book of same name [/sent] film was released by universal pictures on march 2 2012 on 108th birthday of dr seuss [/sent] second film adaptation of book following 1972 animated television special film builds on book by expanding story of ted previously unnamed boy who visits onceler [/sent] cast includes danny devito as lorax ed helms as onceler and zac efron as ted [/sent] new characters introduced in film are audrey voiced by taylor swift aloysius ohare rob riggle mrs wiggins teds mother jenny slate and grammy norma betty white [/sent

### longfomer's fine-tuning


- For answer span extraction we use BERT’s QA model with addition of a question type (yes/no/span) classification head over the first special token ([CLS]).

- For evidence extraction we apply 2 layer feedforward networks on top of the representations corresponding to sentence and paragraph tokens to get the corresponding evidence prediction scores and use binary cross entropy loss to train the model.

- We combine span, question classification, sentence, and paragraphs losses and train the model in a multitask way using linear combination of losses.


In [6]:
### Section2: This is modified from longfomer's fine-tuning with triviaqa.py from https://github.com/allenai/longformer/blob/master/scripts/triviaqa.py

# !conda install transformers --yes
# !conda install cudatoolkit=10.0 --yes
# !python -m pip install git+https://github.com/allenai/longformer.git
####requirements.txt:torch>=1.2.0, transformers>=3.0.2, tensorboardX, pytorch-lightning==0.6.0, test-tube==0.7.5
# !conda install -c conda-forge regex --force-reinstall --yes
# !conda install pytorch-lightning -c conda-forge
#!python -m pip install jdc 
# !pip install test-tube 
#!python -m pip install ipywidgets 
# !conda update --force conda --yes  
# !jupyter nbextension enable --py widgetsnbextension 
# !conda install jupyter --yes


import os
import json
import string
import random
import numpy as np
import torch
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader, Dataset 

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel
from pytorch_lightning.logging import TestTubeLogger    # sometimes pytorch_lightning.loggers works instead

from longformer.longformer import Longformer, LongformerConfig
from longformer.sliding_chunks import pad_to_window_size
from transformers import RobertaTokenizer
import jdc
from more_itertools import locate
from collections import Counter
from collections import defaultdict

In [7]:
print(pl.__file__)

/xdisk/msurdeanu/fanluo/miniconda3/lib/python3.8/site-packages/pytorch_lightning/__init__.py


#### class hotpotqaDataset

##### \_\_init\_\_, \_\_getitem\_\_ and \_\_len\_\_ 

In [8]:
class hotpotqaDataset(Dataset):
    """
    Largely based on
    https://github.com/allenai/allennlp/blob/master/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py
    and
    https://github.com/huggingface/transformers/blob/master/examples/run_squad.py
    """
    
    
    def __init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride,
                 max_num_answers, ignore_seq_with_no_answers, max_question_len):
        assert os.path.isfile(file_path)
        self.file_path = file_path
        if("reduced_context" not in self.file_path):
            with open(self.file_path, "r", encoding='utf-8') as f:
                print(f'reading file: {self.file_path}')
                self.data_json = convert_hotpot_to_squad_format(json.load(f))['data']
                
        else:
            with open(self.file_path, "r", encoding='utf-8') as f:
                print(f'reading file: {self.file_path}')
                self.data_json = json.load(f)['data']            
                print(self.data_json[0])
            
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.max_doc_len = max_doc_len
        self.doc_stride = doc_stride
        self.max_num_answers = max_num_answers
        self.ignore_seq_with_no_answers = ignore_seq_with_no_answers
        self.max_question_len = max_question_len


#         print(tokenizer.all_special_tokens) 
    
        # A mapping from qid to an int, which can be synched across gpus using `torch.distributed`
        if 'train' not in self.file_path:  # only for the evaluation set 
            self.val_qid_string_to_int_map =                  {
                    entry["paragraphs"][0]['qas'][0]['id']: index
                    for index, entry in enumerate(self.data_json)
                }
        else:
            self.val_qid_string_to_int_map = None
            
            
    def __len__(self):
        return len(self.data_json)

    def __getitem__(self, idx):
        entry = self.data_json[idx]
        tensors_list = self.one_example_to_tensors(entry, idx)
        if(len(tensors_list) != 1):
            print("tensors_list: ", tensors_list)
        assert len(tensors_list) == 1
        return tensors_list[0]

##### one_example_to_tensors

In [9]:
    %%add_to hotpotqaDataset
    def one_example_to_tensors(self, example, idx):
        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
                return True
            return False
        
        def map_answer_positions(char_to_word_offset, orig_to_tok_index, answer_start, answer_end, slice_start, slice_end, doc_offset):
            
            # char offset to word offset
            start_word_position = char_to_word_offset[answer_start]
            end_word_position = char_to_word_offset[answer_end-1] 

#             print("start_word_position: ", start_word_position)
#             print("end_word_position: ", end_word_position)
            # sub_tokens postion reletive to context
            tok_start_position_in_doc = orig_to_tok_index[start_word_position]  
            not_end_of_doc = int(end_word_position + 1 < len(orig_to_tok_index))
            tok_end_position_in_doc = orig_to_tok_index[end_word_position + not_end_of_doc] - not_end_of_doc
            
            if tok_start_position_in_doc < slice_start or tok_end_position_in_doc > slice_end:
                return (-1, -1) # this answer is outside the current slice                     
            
            # sub_tokens postion reletive to begining of all the tokens, including query sub tokens  
            start_position = tok_start_position_in_doc + doc_offset  
            end_position = tok_end_position_in_doc + doc_offset
            
            return (start_position, end_position)
        
        tensors_list = []
        for paragraph in example["paragraphs"]:  # example["paragraphs"] only contains one paragraph in hotpotqa
            context = paragraph["context"]
            
#             print("self.tokenizer.sep_token: ", self.tokenizer.sep_token)
#             print("self.tokenizer.sep_token == '</s>': ", self.tokenizer.sep_token == '</s>')
            
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in context:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c) # add a new token
                    else:
                        doc_tokens[-1] += c  # append the character to the last token
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
            
#             print("len(char_to_word_offset): ", len(char_to_word_offset))
#             print("char_to_word_offset: ", char_to_word_offset)
            for qa in paragraph["qas"]:
                question_text = qa["question"]
#                 print("question text: ", question_text)  
                sp_sent = qa["is_sup_fact"]
                sp_para = qa["is_supporting_para"]
                start_position = None
                end_position = None
                orig_answer_text = None 

#                     print("len(sp_sent):", len(sp_sent))
#                     print("sp_sent", sp_sent) 
#                     print("doc_tokens", doc_tokens)
 
                # keep all answers in the document, not just the first matched answer. It also added the list of textual answers to make evaluation easy.
                
                   
                # ===== Given an example, convert it into tensors  =============
                 
                query_tokens = self.tokenizer.tokenize(question_text)
                query_tokens = query_tokens[:self.max_question_len]
                tok_to_orig_index = []
                orig_to_tok_index = []
                all_doc_tokens = []
                
                # each original token in the context is tokenized to multiple sub_tokens
                for (i, token) in enumerate(doc_tokens):
                    orig_to_tok_index.append(len(all_doc_tokens))
                    # hack: the line below should have been `self.tokenizer.tokenize(token')`
                    # but roberta tokenizer uses a different subword if the token is the beginning of the string
                    # or in the middle. So for all tokens other than the first, simulate that it is not the first
                    # token by prepending a period before tokenizing, then dropping the period afterwards
                    sub_tokens = self.tokenizer.tokenize(f'. {token}')[1:] if i > 0 else self.tokenizer.tokenize(token)
                    for sub_token in sub_tokens:
                        tok_to_orig_index.append(i)
                        all_doc_tokens.append(sub_token)
                
                # all sub tokens, truncate up to limit
                all_doc_tokens = all_doc_tokens[:self.max_doc_len-8] 

                # The -8 accounts for CLS, QUESTION_START, QUESTION_END， [/par]， yes， no， null， </s>   
                max_tokens_per_doc_slice = self.max_seq_len - len(query_tokens) - 8
                if(max_tokens_per_doc_slice <= 0):
                    print("(max_tokens_per_doc_slice <= 0)")
                assert max_tokens_per_doc_slice > 0
                if self.doc_stride < 0:                           # default
                    # negative doc_stride indicates no sliding window, but using first slice
                    self.doc_stride = -100 * len(all_doc_tokens)  # large -negtive value for the next loop to execute once
                
                # inputs to the model
                input_ids_list = []
                input_mask_list = []
                segment_ids_list = []
                start_positions_list = []
                end_positions_list = []
                q_type_list = []
                sp_sent_list =  [1 if ss else 0 for ss in sp_sent]
                sp_para_list = [1 if sp else 0 for sp in sp_para]
                
#                 print("before for")
                for slice_start in range(0, len(all_doc_tokens), max_tokens_per_doc_slice - self.doc_stride):    # execute once by default
                    slice_end = min(slice_start + max_tokens_per_doc_slice, len(all_doc_tokens))

                    doc_slice_tokens = all_doc_tokens[slice_start:slice_end]
                    tokens = [self.tokenizer.cls_token] + [QUESTION_START] + query_tokens + [QUESTION_END] + doc_slice_tokens + [PAR] + self.tokenizer.tokenize("yes") + self.tokenizer.tokenize("no") + self.tokenizer.tokenize("null") +  [self.tokenizer.eos_token]   
                    segment_ids = [0] * (len(query_tokens) + 3) + [1] * (len(doc_slice_tokens) + 5) 
#                     if(len(segment_ids) != len(tokens)):
#                         print("len(segment_ids): ", len(segment_ids))
#                         print("len(tokens): ", len(tokens))
                    assert len(segment_ids) == len(tokens)

                    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)   
                    input_mask = [1] * len(input_ids)

                    doc_offset = len(query_tokens) + 3 - slice_start  # where context starts
                    
                    # ===== answer positions tensors  ============
                    start_positions = []
                    end_positions = []
 
                    answer = qa["answer"] 
                    print("answer: ", answer)
                    if answer == 'yes':
                        q_type = 1
                        start_positions.append(len(tokens)-4)   
                        end_positions.append(len(tokens)-4) 
                    elif answer == 'no':
                        q_type = 2
                        start_positions.append(len(tokens)-3)   
                        end_positions.append(len(tokens)-3)  
                    else:
                        # keep all the occurences of answer in the context 
#                         for m in re.finditer("\s?".join(answer.split()), context):   # "\s?".join(answer.split()) in order to match even with extra space in answer or context
                        for m in re.finditer(normalize_answer(answer), context, re.IGNORECASE):
                            answer_start, answer_end = m.span() 
                            start_position, end_position = map_answer_positions(char_to_word_offset, orig_to_tok_index, answer_start, answer_end, slice_start, slice_end, doc_offset)
                            if(start_position != -1):
                                start_positions.append(start_position)   
                                end_positions.append(end_position)
                            
                        if(len(start_positions) > 0): 
                            q_type = 0
                        else: # answer not found in context
                            q_type = 3 
                            start_positions.append(len(tokens)-2)   
                            end_positions.append(len(tokens)-2)  


                    # answers from start_positions and end_positions if > self.max_num_answers
                    start_positions = start_positions[:self.max_num_answers]
                    end_positions = end_positions[:self.max_num_answers]

                    # -1 padding up to self.max_num_answers
                    padding_len = self.max_num_answers - len(start_positions)
                    start_positions.extend([-1] * padding_len)
                    end_positions.extend([-1] * padding_len)

                    # replace duplicate start/end positions with `-1` because duplicates can result into -ve loss values
                    found_start_positions = set()
                    found_end_positions = set()
                    for i, (start_position, end_position) in enumerate(zip(start_positions, end_positions)):
                        
                        if start_position in found_start_positions:
                            start_positions[i] = -1
                        if end_position in found_end_positions:
                            end_positions[i] = -1
                        found_start_positions.add(start_position)
                        found_end_positions.add(end_position)
                        
#                         # for debug
#                         if(start_position != -1):
#                             answer_token_ids = input_ids[start_position: end_position+1]
#                             answer_tokens = self.tokenizer.convert_ids_to_tokens(answer_token_ids)
#                             answer_text = self.tokenizer.convert_tokens_to_string(answer_tokens)
#                             print("answer_text: ", answer_text)
                        
                    if self.doc_stride >= 0:  # no need to pad if document is not strided
                        # Zero-pad up to the sequence length.
                        padding_len = self.max_seq_len - len(input_ids)
                        input_ids.extend([self.tokenizer.pad_token_id] * padding_len)
                        input_mask.extend([0] * padding_len)
                        segment_ids.extend([0] * padding_len)
                        
                        print("self.doc_stride >= 0")
                        assert len(input_ids) == self.max_seq_len
                        assert len(input_mask) == self.max_seq_len
                        assert len(segment_ids) == self.max_seq_len  
                        
                    input_ids_list.append(input_ids)
                    input_mask_list.append(input_mask)
                    segment_ids_list.append(segment_ids)
                    start_positions_list.append(start_positions)
                    end_positions_list.append(end_positions)
                    q_type_list.append(q_type)
                    
                tensors_list.append((torch.tensor(input_ids_list), torch.tensor(input_mask_list), torch.tensor(segment_ids_list),
                                     torch.tensor(start_positions_list), torch.tensor(end_positions_list), torch.tensor(q_type_list),
                                      torch.tensor([sp_sent_list]),  torch.tensor([sp_para_list]),
                                     qa['id'], answer))     
        return tensors_list


##### collate_one_doc_and_lists

In [10]:
    %%add_to hotpotqaDataset
    @staticmethod
    def collate_one_doc_and_lists(batch):
        num_metadata_fields = 2  # qid and answer  
        fields = [x for x in zip(*batch)]
        stacked_fields = [torch.stack(field) for field in fields[:-num_metadata_fields]]  # don't stack metadata fields
        stacked_fields.extend(fields[-num_metadata_fields:])  # add them as lists not torch tensors

        # always use batch_size=1 where each batch is one document
        # will use grad_accum to increase effective batch size
        assert len(batch) == 1
        fields_with_batch_size_one = [f[0] for f in stacked_fields]
        return fields_with_batch_size_one


##### class info

In [11]:
dir(hotpotqaDataset)

['__add__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'collate_one_doc_and_lists',
 'one_example_to_tensors']

In [12]:
from inspect import getmembers
getmembers(hotpotqaDataset)

[('__add__', <function torch.utils.data.dataset.Dataset.__add__(self, other)>),
 ('__class__', type),
 ('__delattr__', <slot wrapper '__delattr__' of 'object' objects>),
 ('__dict__',
  mappingproxy({'__module__': '__main__',
                '__doc__': '\n    Largely based on\n    https://github.com/allenai/allennlp/blob/master/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py\n    and\n    https://github.com/huggingface/transformers/blob/master/examples/run_squad.py\n    ',
                '__init__': <function __main__.hotpotqaDataset.__init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride, max_num_answers, ignore_seq_with_no_answers, max_question_len)>,
                '__len__': <function __main__.hotpotqaDataset.__len__(self)>,
                '__getitem__': <function __main__.hotpotqaDataset.__getitem__(self, idx)>,
                'one_example_to_tensors': <function __main__.one_example_to_tensors(self, example, idx)>,
                'collate_one

In [13]:
from inspect import isfunction
functions_list = [o for o in getmembers(hotpotqaDataset) if isfunction(o[1])]
functions_list

[('__add__', <function torch.utils.data.dataset.Dataset.__add__(self, other)>),
 ('__getitem__', <function __main__.hotpotqaDataset.__getitem__(self, idx)>),
 ('__init__',
  <function __main__.hotpotqaDataset.__init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride, max_num_answers, ignore_seq_with_no_answers, max_question_len)>),
 ('__len__', <function __main__.hotpotqaDataset.__len__(self)>),
 ('collate_one_doc_and_lists',
  <function __main__.collate_one_doc_and_lists(batch)>),
 ('one_example_to_tensors',
  <function __main__.one_example_to_tensors(self, example, idx)>)]

In [14]:
import inspect
inspect.getmro(hotpotqaDataset)  # a hierarchy of classes 

(__main__.hotpotqaDataset, torch.utils.data.dataset.Dataset, object)

In [15]:
inspect.getfullargspec(hotpotqaDataset.one_example_to_tensors)

FullArgSpec(args=['self', 'example', 'idx'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={})

In [16]:
help(hotpotqaDataset)

Help on class hotpotqaDataset in module __main__:

class hotpotqaDataset(torch.utils.data.dataset.Dataset)
 |  Largely based on
 |  https://github.com/allenai/allennlp/blob/master/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py
 |  and
 |  https://github.com/huggingface/transformers/blob/master/examples/run_squad.py
 |  
 |  Method resolution order:
 |      hotpotqaDataset
 |      torch.utils.data.dataset.Dataset
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(self, idx)
 |  
 |  __init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride, max_num_answers, ignore_seq_with_no_answers, max_question_len)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __len__(self)
 |  
 |  one_example_to_tensors(self, example, idx)
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  collate_one_doc_and_lists(batch)
 |  
 |  -----------------------

#### class hotpotqa

##### \_\_init\_\_,  forward, dataloaders

In [17]:
class hotpotqa(pl.LightningModule):
    def __init__(self, args):
        super(hotpotqa, self).__init__()
        self.args = args
        self.hparams = args
 
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        num_new_tokens = self.tokenizer.add_special_tokens({"additional_special_tokens": [TITLE_START, TITLE_END, SENT_MARKER_END, QUESTION_START , QUESTION_END, PAR]})
#         print(self.tokenizer.all_special_tokens)
        self.tokenizer.model_max_length = self.args.max_seq_len
        self.model = self.load_model()
        self.model.resize_token_embeddings(len(self.tokenizer))
        self.num_labels = 2
        self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels)
         
        self.linear_type = torch.nn.Linear(self.model.config.hidden_size, 4)   #  question type (yes/no/span/null) classification 
           
       
        self.fnn_sp_sent = torch.nn.Sequential(
          torch.nn.Linear(self.model.config.hidden_size, self.model.config.hidden_size), 
          torch.nn.GELU(),
          torch.nn.Linear(self.model.config.hidden_size, 1),      # score for 'yes', while 0 for 'no'
        )
        
        self.fnn_sp_para = torch.nn.Sequential(
          torch.nn.Linear(self.model.config.hidden_size, self.model.config.hidden_size), 
          torch.nn.GELU(),
          torch.nn.Linear(self.model.config.hidden_size, 1),      # score for 'yes', while 0 for 'no'
        )
         
        
        self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None
        
 
    def load_model(self):
        
        config = LongformerConfig.from_pretrained(self.args.model_path) 
        # choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
        # 'n2': for regular n2 attantion
        # 'tvm': a custom CUDA kernel implementation of our sliding window attention
        # 'sliding_chunks': a PyTorch implementation of our sliding window attention
        config.attention_mode = 'sliding_chunks'
        model = Longformer.from_pretrained(self.args.model_path, config=config)

        print("self.args.model_path: ", self.args.model_path)
        for layer in model.encoder.layer:
            layer.attention.self.attention_mode = self.args.attention_mode
            self.args.attention_window = layer.attention.self.attention_window

        print("Loaded model with config:")
        print(model.config)

        for p in model.parameters():
            p.requires_grad_(True)
        model.train()
        return model

#%%add_to hotpotqa    # does not seems to work for the @pl.data_loader decorator, missing which causes error "validation_step() takes 3 positional arguments but 4 were given"    
###################################################### dataloaders ########################################################### 
    @pl.data_loader
    def train_dataloader(self):
        if self.train_dataloader_object is not None:
            return self.train_dataloader_object
        dataset = hotpotqaDataset(file_path=self.args.train_dataset, tokenizer=self.tokenizer,
                                  max_seq_len=self.args.max_seq_len, max_doc_len=self.args.max_doc_len,
                                  doc_stride=self.args.doc_stride,
                                  max_num_answers=self.args.max_num_answers,
                                  max_question_len=self.args.max_question_len,
                                  ignore_seq_with_no_answers=self.args.ignore_seq_with_no_answers)
        
#         dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        dl = DataLoader(dataset, batch_size=1, shuffle=False,   # set shuffle=False, otherwise it will sample a different subset of data every epoch with train_percent_check
                        num_workers=self.args.num_workers,  
                        collate_fn=hotpotqaDataset.collate_one_doc_and_lists)

        self.train_dataloader_object = dl  
        return self.train_dataloader_object
    
 

    @pl.data_loader
    def val_dataloader(self):
        if self.val_dataloader_object is not None:
            return self.val_dataloader_object
        dataset = hotpotqaDataset(file_path=self.args.dev_dataset, tokenizer=self.tokenizer,
                                  max_seq_len=self.args.max_seq_len, max_doc_len=self.args.max_doc_len,
                                  doc_stride=self.args.doc_stride,
                                  max_num_answers=self.args.max_num_answers,
                                  max_question_len=self.args.max_question_len,
                                  ignore_seq_with_no_answers=False)  # evaluation data should keep all examples 

        
        
#         dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        dl = DataLoader(dataset, batch_size=1, shuffle=False,
                        num_workers=self.args.num_workers, 
                        collate_fn=hotpotqaDataset.collate_one_doc_and_lists)
        self.val_dataloader_object = dl
        return self.val_dataloader_object

    @pl.data_loader
    def test_dataloader(self):
        if self.test_dataloader_object is not None:
            return self.test_dataloader_object
        dataset = hotpotqaDataset(file_path=self.args.dev_dataset, tokenizer=self.tokenizer,
                                  max_seq_len=self.args.max_seq_len, max_doc_len=self.args.max_doc_len,
                                  doc_stride=self.args.doc_stride,
                                  max_num_answers=self.args.max_num_answers,
                                  max_question_len=self.args.max_question_len,
                                  ignore_seq_with_no_answers=False)  # evaluation data should keep all examples

#         dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        dl = DataLoader(dataset, batch_size=1, shuffle=False,
                        num_workers=self.args.num_workers, 
                        collate_fn=hotpotqaDataset.collate_one_doc_and_lists)
        self.test_dataloader_object = dl
        return self.test_dataloader_object

#%%add_to hotpotqa  
    def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_positions, q_type, sp_sent, sp_para):
 
        if(input_ids.size(0) > 1):
            assert("multi rows per document")
            
        input_ids = input_ids.cuda()
        attention_mask = attention_mask.cuda()
        segment_ids = segment_ids.cuda()
        start_positions = start_positions.cuda()
        end_positions = end_positions.cuda()
        q_type = q_type.cuda()
        sp_sent = sp_sent.cuda()
        sp_para = sp_para.cuda()
        
        # Each batch is one document, and each row of the batch is a chunck of the document.    ????
        # Make sure all rows have the same question length.
        
 
        # local attention everywhere
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
        
        # global attention for the cls and all question tokens
        question_end_index = self._get_special_index(input_ids, [QUESTION_END])
#         if(question_end_index.size(0) == 1):
#             attention_mask[:,:question_end_index.item()] = 2  
#         else:
        attention_mask[:,:question_end_index[0].item()+1] = 2  # from <cls> until </q>
#             print("more than 1 <q> in: ", self.tokenizer.convert_ids_to_tokens(input_ids[0].tolist()) )
        
        # global attention for the sentence and paragraph special tokens  
        sent_indexes = self._get_special_index(input_ids, [SENT_MARKER_END])
        attention_mask[:, sent_indexes] = 2
        
        para_indexes = self._get_special_index(input_ids, [TITLE_START])
        attention_mask[:, para_indexes] = 2       
         

        # sliding_chunks implemenation of selfattention requires that seqlen is multiple of window size
        input_ids, attention_mask = pad_to_window_size(
             input_ids, attention_mask, self.args.attention_window, self.tokenizer.pad_token_id)


#         print("size of input_ids: " + str(input_ids.size()))
        sequence_output = self.model(
                input_ids,
                attention_mask=attention_mask)[0]
#         print("size of sequence_output: " + str(sequence_output.size()))
#         print("sequence_output ", sequence_output)

        # The pretrained hotpotqa model wasn't trained with padding, so remove padding tokens
        # before computing loss and decoding.
        padding_len = input_ids[0].eq(self.tokenizer.pad_token_id).sum()
        if padding_len > 0:
            sequence_output = sequence_output[:, :-padding_len]
#         print("size of sequence_output after removing padding: " + str(sequence_output.size()))
              
        
        ###################################### layers on top of sequence_output ##################################
        

        ### 1. answer start and end positions classification ###   
        logits = self.qa_outputs(sequence_output) 
        start_logits, end_logits = logits.split(1, dim=-1) 
        start_logits = start_logits.squeeze(-1) 
        end_logits = end_logits.squeeze(-1)
 
        ### 2. type classification, similar as class LongformerClassificationHead(nn.Module) https://huggingface.co/transformers/_modules/transformers/modeling_longformer.html#LongformerForSequenceClassification.forward ### 
        type_logits = self.linear_type(sequence_output[:,0]) 
        
        ### 3. supporting paragraph classification ###  
        sp_para_output = sequence_output[:,para_indexes,:]  
        sp_para_output_t = self.fnn_sp_para(sp_para_output) 

         # linear_sp_sent generates a single score for each sentence, instead of 2 scores for yes and no.   
        # Argument the score with additional score=0. The same way did in the HOTPOTqa paper
        sp_para_output_aux = torch.zeros(sp_para_output_t.shape, dtype=torch.float, device=sp_para_output_t.device) 
        predict_support_para = torch.cat([sp_para_output_aux, sp_para_output_t], dim=-1).contiguous() 
 
        ### 4. supporting fact classification ###     
        # the first sentence in a paragraph is leading by <p>, other sentences are leading by <s>
 
        sp_sent_output = sequence_output[:,sent_indexes,:]  
        sp_sent_output_t = self.fnn_sp_sent(sp_sent_output)     
        sp_sent_output_aux = torch.zeros(sp_sent_output_t.shape, dtype=torch.float, device=sp_sent_output_t.device) 
        predict_support_sent = torch.cat([sp_sent_output_aux, sp_sent_output_t], dim=-1).contiguous() 
        
        outputs = (start_logits, end_logits, type_logits, sp_para_output_t, sp_sent_output_t)  
        answer_loss, type_loss, sp_para_loss, sp_sent_loss  = self.loss_computation(start_positions, end_positions, start_logits, end_logits, q_type, type_logits, sp_para, predict_support_para, sp_sent, predict_support_sent)
 
        outputs = (answer_loss, type_loss, sp_para_loss, sp_sent_loss,) + outputs    
    
    
#         explainer = shap.GradientExplainer( (logits, sequence_output), self.qa_outputs(sequence_output))
#         print(self.explainer)

        
        return outputs
    
    def loss_computation(self, start_positions, end_positions, start_logits, end_logits, q_type, type_logits, sp_para, predict_support_para, sp_sent, predict_support_sent):
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)

            if not self.args.regular_softmax_loss:
                # loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf
                # NOTE: this returns sum of losses, not mean, so loss won't be normalized across different batch sizes
                # but batch size is always 1, so this is not a problem
                start_loss = self.or_softmax_cross_entropy_loss_one_doc(start_logits, start_positions, ignore_index=-1)
 
                end_loss = self.or_softmax_cross_entropy_loss_one_doc(end_logits, end_positions, ignore_index=-1)
  

            else: 
                start_positions = start_positions[:, 0:1]   # only use the top1 start_position considering only one appearance of the answer string
                end_positions = end_positions[:, 0:1]
                start_loss = crossentropy(start_logits, start_positions[:, 0])
                end_loss = crossentropy(end_logits, end_positions[:, 0])
                
 
            crossentropy = torch.nn.CrossEntropyLoss()
            type_loss = crossentropy(type_logits, q_type)  
            
            crossentropy_average = torch.nn.CrossEntropyLoss(reduction = 'mean', ignore_index=-1)      
            sp_para_loss = crossentropy_average(predict_support_para.view(-1, 2), sp_para.view(-1))
            sp_sent_loss = crossentropy_average(predict_support_sent.view(-1, 2), sp_sent.view(-1))      
 
            answer_loss = (start_loss + end_loss) / 2 
        return answer_loss, type_loss, sp_para_loss, sp_sent_loss  


#     %%add_to hotpotqa    
    def _get_special_index(self, input_ids, special_tokens):
        assert(input_ids.size(0)==1) 
        mask = input_ids != input_ids # initilaize 
        for special_token in special_tokens:
            mask = torch.logical_or(mask, input_ids.eq(self.tokenizer.convert_tokens_to_ids(special_token))) 
 
        token_indices = torch.nonzero(mask)    
         
 
        return token_indices[:,1]    

    def or_softmax_cross_entropy_loss_one_doc(self, logits, target, ignore_index=-1, dim=-1):
        """loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf"""
        assert logits.ndim == 2
        assert target.ndim == 2
        assert logits.size(0) == target.size(0) 
        
        # with regular CrossEntropyLoss, the numerator is only one of the logits specified by the target, considing only one correct target 
        # here, the numerator is the sum of a few potential targets, where some of them is the correct answer, considing more correct targets

        # target are indexes of tokens, padded with ignore_index=-1
        # logits are scores (one for each label) for each token
 
        # compute a target mask
        target_mask = target == ignore_index
        # replaces ignore_index with 0, so `gather` will select logit at index 0 for the masked targets
        masked_target = target * (1 - target_mask.long())                 # replace all -1 in target with 0， tensor([[447,   0,   0,   0, ...]])
    
        # gather logits
        gathered_logits = logits.gather(dim=dim, index=masked_target)     # tensor([[0.4382, 0.2340, 0.2340, 0.2340 ... ]]), padding logits are all replaced by logits[0] 
 
        # Apply the mask to gathered_logits. Use a mask of -inf because exp(-inf) = 0
        gathered_logits[target_mask] = float('-inf')                      # padding logits are all replaced by -inf
 
        # each batch is one example
        gathered_logits = gathered_logits.view(1, -1)
        logits = logits.view(1, -1)
 
        # numerator = log(sum(exp(gathered logits)))
        log_score = torch.logsumexp(gathered_logits, dim=dim, keepdim=False)
 
        log_norm = torch.logsumexp(logits, dim=dim, keepdim=False)
        
        # compute the loss
        loss = -(log_score - log_norm) 
        
        # some of the examples might have a loss of `inf` when `target` is all `ignore_index`: when computing start_loss and end_loss for question with the gold answer of yes/no 
        # when `target` is all `ignore_index`, loss is 0 
        loss = loss[~torch.isinf(loss)].sum()
#         loss = torch.tanh(loss)
#         print("final loss: " + str(loss)) 
        return loss 

In [18]:
# debug
# input_ids = torch.tensor([[-1, 5, -1, 2]])
# input_ids.size(0)
# token_indices =  torch.nonzero(input_ids == torch.tensor(-1))[:,1]
# # token_indices
# # token_indices.item()
# # indices =  torch.LongTensor([[2],[0,2]])

# # torch.gather(input_ids, 1, token_indices.unsqueeze(0))
# # p_index = token_indices.view(input_ids.size(0), -1)[:,1::2]   
# # attention_mask = torch.ones(input_ids.shape, dtype=torch.long) 
# # attention_mask[:,token_indices] = 2
# # attention_mask
# p_index = torch.tensor([1, 3, 4])
# s_index = torch.tensor([1,3,6])
# torch.sort(torch.cat((s_index, p_index)))[0]
# attention_mask.view(-1)[ p_index.view(-1), :].view(attention_mask.size(0), -1)
# # for pi in p_index[0]:
# #     attention_mask[:, pi] = 2
# # attention_mask
# # s_index = torch.tensor([[1,3]])
# # torch.sort(torch.cat((p_index, s_index), -1), -1)

# sequence_output  = torch.tensor([[[-1, 5, -1, 2],
#                                  [-2, 27, 2, 9],
#                                  [3, 6, 1, 65],
#                                  [52, 36, 13, 2],
#                                  [73, 26, 1, 7]
#                                 ]])

# sp_para_output_t   = torch.tensor([[[-1],
#                                  [-2 ],
#                                  [3],
#                                  [52],
#                                  [73]
#                                 ]])
# torch.zeros(sp_para_output_t.shape, dtype=torch.float) 

# print("size of sequence_output: " + str(sequence_output.size()))
# # print("size of p_index.unsqueeze(0).unsqueeze(-1): " + str(p_index.unsqueeze(0).size()))
# sequence_output[:,p_index,:]
# b = torch.tensor([0, 1, 2, 3])
# p_index.unsqueeze(-1) * b

# input_ids = torch.tensor([[0.2, 0.0, 0.6, 0.6], [0.2, 0.6, 0.0, 0.0]]) 
# # input_ids.tolist()
# p_index =  torch.nonzero(input_ids == torch.tensor(0.2))
# print(p_index)
# s_index =  torch.nonzero(input_ids == torch.tensor(0.6))
# print(s_index)

# sp_sent = torch.tensor([[0, 1, 1, 0]])
# torch.nonzero(sp_sent, as_tuple=True)[1]
# cat_index = torch.tensor([])
# cat_index = torch.cat((cat_index, ids[0][1]))
# print(ids)
# print(cat_index)
# p_index[p_index[:,0] == 0]

# cat_index[cat_index[:,0].argsort()]

# sorted(torch.cat((p_index, s_index)), key = lambda x: x[0])
# torch.sort(torch.cat((p_index, s_index)), 0)[0]
# for cor in token_indices:
#     attention_mask[cor[0].item()][cor[1].item()] = 2
# attention_mask 
# input_ids = torch.tensor([[-1, 5, -6, 2]])
# print(input_ids.size())
# input_ids.topk(k=2, dim=-1).indices

# predict_type = torch.tensor([[-0.0925, -0.0999, -0.1671]])
# p_type = torch.argmax(predict_type, dim=1).item()
# p_type_score = torch.max(predict_type, dim=1)[0].item()
# print("predict_type: ", predict_type)
# print("p_type: ", p_type)
# print("p_type_score: ", p_type_score)
    
# a = torch.tensor([[0.9213,  1.0887, -0.8858, -1.7683]])
# a.view(-1).size() 
# print(torch.sigmoid(a))
# a = torch.tensor([ 9.213,  1.0887, -0.8858, 7683])
# print(torch.sigmoid(a))

# a = torch.tensor([[[1],[2],[4],[-1],[-1]]])
# a= a.squeeze(-1)
# a.size() 
# a[:, torch.where(a!=-1)[1]]
# m = torch.nn.Sigmoid()
# print("m: ", m)
# loss = torch.nn.BCELoss()
# # input = torch.randn(3, requires_grad=True)
# # print("input: ", input)
# # target = torch.empty(3).random_(2)
# # print("target: ", target)
# # output = loss(m(input), target)
# # print("output: ", output)

# input = torch.tensor([1.0293, -0.1585,  1.1408], requires_grad=True)
# print("input: ", input)
# print("Sigmoid(input): ", m(input))
# target = torch.tensor([0., 1., 0.])
# print("target: ", target)
# output = loss(m(input), target)
# print("output: ", output)

# input = torch.tensor([[1.0293, -0.1585,  1.1408]], requires_grad=True)
# print("input: ", input)
# target = torch.tensor([[0., 1., 0.]])
# print("target: ", target)
# output = loss(m(input), target)
# print("output: ", output)

# 1.1761 * 3
# soft_input = torch.nn.Softmax(dim=-1)
# log_soft_input = torch.log(soft_input(input))
# loss=torch.nn.NLLLoss() 
# loss(log_soft_input, target)
# input = torch.log(soft_input(input))
# loss=torch.nn.NLLLoss()
# loss(input,target)

# loss =torch.nn.CrossEntropyLoss()
# loss(input,target) 

# sp_sent_logits =torch.tensor([[[0.0988],
#          [0.0319],
#          [0.0314]]])
# sp_sent_logits.squeeze()

# input_ids = torch.tensor([[0.6, 0.0, 0.6, 0.0]]) 
# token_indices =  torch.nonzero(input_ids == torch.tensor(0.6))
# token_indices[:,1][0].item()

# def or_softmax_cross_entropy_loss_one_doc(logits, target, ignore_index=-1, dim=-1):
#     """loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf"""
#     assert logits.ndim == 2
#     assert target.ndim == 2
#     assert logits.size(0) == target.size(0) 

#     # with regular CrossEntropyLoss, the numerator is only one of the logits specified by the target, considing only one correct target 
#     # here, the numerator is the sum of a few potential targets, where some of them is the correct answer, considing more correct targets

#     # target are indexes of tokens, padded with ignore_index=-1
#     # logits are scores (one for each label) for each token
# #         print("or_softmax_cross_entropy_loss_one_doc" ) 
# #         print("size of logits: " + str(logits.size()))                    # torch.Size([1, 746]), 746 is number of all tokens 
# #         print("size of target: " + str(target.size()))                    # torch.Size([1, 64]),  -1 padded
#     print("target: " + str(target)) 

#     # compute a target mask
#     target_mask = target == ignore_index
#     # replaces ignore_index with 0, so `gather` will select logit at index 0 for the masked targets
#     masked_target = target * (1 - target_mask.long())                 # replace all -1 in target with 0， tensor([[447,   0,   0,   0, ...]])
#     print("masked_target: " + str(masked_target))     
#     # gather logits
#     gathered_logits = logits.gather(dim=dim, index=masked_target)     # tensor([[0.4382, 0.2340, 0.2340, 0.2340 ... ]]), padding logits are all replaced by logits[0] 
# #         print("size of gathered_logits: " + str(gathered_logits.size()))  # torch.Size([1, 64])
#     print("gathered_logits: " + str(gathered_logits)) 
#     # Apply the mask to gathered_logits. Use a mask of -inf because exp(-inf) = 0
#     gathered_logits[target_mask] = float('-inf')                      # padding logits are all replaced by -inf
#     print("gathered_logits after -inf: " + str(gathered_logits))      # tensor([[0.4382,   -inf,   -inf,   -inf,   -inf,...]])

#     # each batch is one example
#     gathered_logits = gathered_logits.view(1, -1)
#     logits = logits.view(1, -1)
# #         print("size of gathered_logits after view: " + str(gathered_logits.size()))  # torch.Size([1, 64])
# #         print("size of logits after view: " + str(logits.size()))                    # torch.Size([1, 746])　　

#     # numerator = log(sum(exp(gathered logits)))
#     log_score = torch.logsumexp(gathered_logits, dim=dim, keepdim=False)
#     print("log_score: " + str(log_score)) 
#     # denominator = log(sum(exp(logits)))
#     log_norm = torch.logsumexp(logits, dim=dim, keepdim=False)
#     print("log_norm: " + str(log_norm)) 

#     # compute the loss
#     loss = -(log_score - log_norm)
#     print("loss: " + str(loss))


#     # some of the examples might have a loss of `inf` when `target` is all `ignore_index`: when computing start_loss and end_loss for question with the gold answer of yes/no 
#     # replace -inf with 0
#     loss = loss[~torch.isinf(loss)].sum()
#     print("final loss: " + str(loss)) 
#     return loss 

# # input = torch.tensor([[ 0,  0.0780],
# #         [0, 0.9253 ],
# #         [0, 0.0987]])
# # target = torch.tensor([0,1,0])
# # target.size(0) < 1
# # input = torch.tensor([[ 1.1879,  1.0780,  0.5312],
# #         [-0.3499, -1.9253, -1.5725],
# #         [-0.6578, -0.0987,  1.1570]])
# # target=torch.tensor([0,1,2])
# # predict_support_para.view(-1, 2), sp_para.view(-1)
# # input = torch.tensor([[ 1.1879,  1.0780,  0.5312]])
# # target=torch.tensor([0])
# # or_softmax_cross_entropy_loss_one_doc(input, target.unsqueeze(-1))
# # target=torch.tensor([1])
# # or_softmax_cross_entropy_loss_one_doc(input, target.unsqueeze(-1))
# # target=torch.tensor([2])
# # or_softmax_cross_entropy_loss_one_doc(input, target.unsqueeze(-1))
# # target=torch.tensor([-1])
# # or_softmax_cross_entropy_loss_one_doc(input, target.unsqueeze(-1))
# a = torch.tensor([6.4062])    
# b = torch.tensor([2.23])
# torch.cat((a,b))
 
# for a in list_tensor
# from functools import reduce
# reduce(lambda x,y: torch.cat((x,y)), list_tensor[:-1])

# torch.tanh(a)
# # if(torch.isinf(a)):
# #     print("is inf")
# 5 * 1e-2


# import torch
# special_tokens = [1,2]
# input_ids = torch.tensor([[ 1, 0, 2, 1, 0, 2]])

# mask = input_ids != input_ids # initilaize 
# for special_token in special_tokens:
#     mask = torch.logical_or(mask, input_ids.eq(special_token)) 
#     print("mask: ", mask)
# torch.nonzero(mask)    
 

In [19]:
# # debug: check loaded dataset by DataLoader
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# num_new_tokens = tokenizer.add_special_tokens({"additional_special_tokens": ["<p>", "<q>", "</q>"]})
# # # # print(tokenizer.all_special_tokens)    
# # # # print(tokenizer.all_special_ids)     
# # # # tokenizer.convert_tokens_to_ids("<s>")
# # # # tokenizer.sep_token
# print(tokenizer.tokenize("yes"))
# print(tokenizer.tokenize("no"))
# print(tokenizer.tokenize("null"))
# # # all_doc_tokens = []
# # # orig_to_tok_index = []
# # # tok_to_orig_index = []
# # # for (i, token) in enumerate(["<s>", "da", "tell", "<p>", "say"]):
# # #     orig_to_tok_index.append(len(all_doc_tokens))
# # #     sub_tokens = tokenizer.tokenize(f'. {token}')[1:] if i > 0 else tokenizer.tokenize(token)
# # #     for sub_token in sub_tokens:
# # #         tok_to_orig_index.append(i)
# # #         all_doc_tokens.append(sub_token)
# # # all_doc_tokens


 

# dataset = hotpotqaDataset(file_path= args.train_dataset, tokenizer=tokenizer,
#                           max_seq_len= args.max_seq_len, max_doc_len= args.max_doc_len,
#                           doc_stride= args.doc_stride,
#                           max_num_answers= args.max_num_answers,
#                           max_question_len= args.max_question_len,
#                           ignore_seq_with_no_answers= args.ignore_seq_with_no_answers)
# print(len(dataset))

# # # dl = DataLoader(dataset, batch_size=1, shuffle=None,
# # #                     num_workers=args.num_workers, sampler=None,
# # #                     collate_fn=hotpotqaDataset.collate_one_doc_and_lists)

# example = dataset[3]  
# [input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qids] = example
 

# print(input_ids[0][:20].tolist())
# print(input_mask) 
# print(segment_ids) 
# print(subword_starts) 
# print(subword_ends)
# print(q_type)
# print(sp_sent) 
# print(sp_para) 
# print(qids)
# print(tokenizer.convert_ids_to_tokens(input_ids[0][667:669+1].tolist()))
# 0.0033 * 90447 
# 28*4
# torch.tensor(0.0)

##### configure_ddp

In [20]:
 %%add_to hotpotqa
 # A hook to overwrite to define your own DDP(DistributedDataParallel) implementation init. 
 # The only requirement is that: 
 # 1. On a validation batch the call goes to model.validation_step.
 # 2. On a training batch the call goes to model.training_step.
 # 3. On a testing batch, the call goes to model.test_step
 def configure_ddp(self, model, device_ids):
    model = LightningDistributedDataParallel(
        model,
        device_ids=device_ids,
        find_unused_parameters=True
    )
    return model

##### **configure_optimizers**

In [21]:
%%add_to hotpotqa
def configure_optimizers(self):
    # Set up optimizers and (optionally) learning rate schedulers
    def lr_lambda(current_step):
        if current_step < self.args.warmup:
            return float(current_step) / float(max(1, self.args.warmup))
        return max(0.0, float(self.args.steps - current_step) / float(max(1, self.args.steps - self.args.warmup)))

    optimizer = torch.optim.Adam(self.parameters(), lr=self.args.lr)

    self.scheduler = LambdaLR(optimizer, lr_lambda, last_epoch=-1)  # scheduler is not saved in the checkpoint, but global_step is, which is enough to restart
    self.scheduler.step(self.global_step)
    print("global step: ", self.global_step)
    return optimizer

##### optimizer_step

In [22]:
%%add_to hotpotqa
# A hook to do a lot of non-standard training tricks such as learning-rate warm-up
def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_i, second_order_closure=None,using_native_amp=None):
    optimizer.step()
    optimizer.zero_grad()
    self.scheduler.step(self.global_step)

##### **training_step**

In [23]:
%%add_to hotpotqa
def training_step(self, batch, batch_nb):
    # do the forward pass and calculate the loss for a batch 
    input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qid, answer = batch 
    # print("size of input_ids: " + str(input_ids.size())) 
    output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para)
    
#     print(self.explainer(input_ids))
    answer_loss, type_loss, sp_para_loss, sp_sent_loss  = output[:4]
    # print("answer_loss: ", answer_loss)
    # print("type_loss: ", type_loss)
    # print("sp_para_loss: ", sp_para_loss)
    # print("sp_sent_loss: ", sp_sent_loss)

#     loss  = answer_loss +  type_loss + sp_para_loss + sp_sent_loss
    loss = answer_loss + 5*type_loss + 10*sp_para_loss + 10*sp_sent_loss
#     print("weighted loss: ", loss)
#     print("self.trainer.optimizers[0].param_groups[0]['lr']: ", self.trainer.optimizers[0].param_groups[0]['lr'])
    lr = loss.new_zeros(1) + self.trainer.optimizers[0].param_groups[0]['lr']  # loss.new_zeros(1) is tensor([0.]), converting 'lr' to tensor' by adding it.  

    tensorboard_logs = {'loss': loss, 'train_answer_loss': answer_loss, 'train_type_loss': type_loss, 
                        'train_sp_para_loss': sp_para_loss, 'train_sp_sent_loss': sp_sent_loss, 
                        'lr': lr,
                        'mem': torch.tensor(torch.cuda.memory_allocated(input_ids.device) / 1024 ** 3).type_as(loss) }
    return tensorboard_logs


##### training_end

In [24]:
# %%add_to hotpotqa
    # # the function is called for each batch after every epoch is completed
    # def training_end(self, output): 
    #     # print("training_end at epoch: ", self.current_epoch)
    # #     print("len(outputs): ",len(outputs))
    # #     print("output: ",output)
    
    #     # one batch only has one example
    #     avg_loss = output['loss']    
    #     avg_answer_loss = output['train_answer_loss']  
    #     avg_type_loss = output['train_type_loss']    
    #     avg_sp_para_loss = output['train_sp_para_loss']   
    #     avg_sp_sent_loss = output['train_sp_sent_loss'] 
    #     avg_lr = output['lr']      
         
     
    #     if self.trainer.use_ddp:
    #         torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_answer_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_answer_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_type_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_type_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_sp_para_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_sp_para_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_sp_sent_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_sp_sent_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_lr, op=torch.distributed.ReduceOp.SUM)
    #         avg_lr /= self.trainer.world_size 
            
     
    #     tensorboard_logs = { #'avg_train_loss': avg_loss, 
    #             'avg_train_answer_loss': avg_answer_loss, 'avg_train_type_loss': avg_type_loss, 'avg_train_sp_para_loss': avg_sp_para_loss, 'avg_train_sp_sent_loss': avg_sp_sent_loss, 'lr': avg_lr
    #           }
    
    #     return {'loss': avg_loss, 'log': tensorboard_logs}


##### validation_step

In [25]:
%%add_to hotpotqa
# When the validation_step is called, the model has been put in eval mode and PyTorch gradients have been disabled. At the end of validation, model goes back to training mode and gradients are enabled.
def validation_step(self, batch, batch_nb):
    input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qid, answer = batch

    output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para)
    answer_loss, type_loss, sp_para_loss, sp_sent_loss, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output = output 
    loss = answer_loss + 5*type_loss + 10*sp_para_loss + 10*sp_sent_loss

    answers_pred, sp_sent_pred, sp_para_pred = self.decode(input_ids, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output)


    if(len(answers_pred) != 1):
        print("len(answers_pred) != 1")
        assert(len(answers_pred) == 1)

    pre_answer_score = answers_pred[0]['score']  # (start_logit + end_logit + p_type_score) / 3
    pre_answer = normalize_answer(answers_pred[0]['text'])
#         print("pred answer_score: " + str(pre_answer_score))
#         print("pred answer_text: " + str(pre_answer)) 

    gold_answer = normalize_answer(answer)
    f1, prec, recall = self.f1_score(pre_answer, gold_answer)
    em = self.exact_match_score(pre_answer, gold_answer) 
    f1 = torch.tensor(f1).type_as(loss)
    prec = torch.tensor(prec).type_as(loss)
    recall = torch.tensor(recall).type_as(loss)
    em = torch.tensor(em).type_as(loss)
#         print("f1: " + str(f1))
#         print("prec: " + str(prec))
#         print("recall: " + str(recall))
#         print("em: " + str(em))  

    if(len(sp_sent_pred) > 0):
        sp_sent_em, sp_sent_precision, sp_sent_recall, sp_sent_f1 = self.sp_metrics(sp_sent_pred, torch.where(sp_sent.squeeze())[0].tolist())
        sp_sent_em = torch.tensor(sp_sent_em).type_as(loss)
        sp_sent_precision = torch.tensor(sp_sent_precision).type_as(loss)
        sp_sent_recall = torch.tensor(sp_sent_recall).type_as(loss)
        sp_sent_f1 = torch.tensor(sp_sent_f1).type_as(loss)

#         print("sp_sent_em: " + str(sp_sent_em))
#         print("sp_sent_precision: " + str(sp_sent_precision))
#         print("sp_sent_recall: " + str(sp_sent_recall))    
#         print("sp_sent_f1: " + str(sp_sent_f1))    

        joint_prec = prec * sp_sent_precision
        joint_recall = recall * sp_sent_recall
        if joint_prec + joint_recall > 0:
            joint_f1 = 2 * joint_prec * joint_recall / (joint_prec + joint_recall)
        else:
            joint_f1 = torch.tensor(0.0).type_as(loss)
        joint_em = em * sp_sent_em 

    else:
        sp_sent_em, sp_sent_precision, sp_sent_recall, sp_sent_f1 = torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss)
        joint_em, joint_f1, joint_prec, joint_recall =  torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss)


    return { 'vloss': loss, 'answer_loss': answer_loss, 'type_loss': type_loss, 'sp_para_loss': sp_para_loss, 'sp_sent_loss': sp_sent_loss,
               'answer_score': pre_answer_score, 'f1': f1, 'prec':prec, 'recall':recall, 'em': em,
               'sp_em': sp_sent_em, 'sp_f1': sp_sent_f1, 'sp_prec': sp_sent_precision, 'sp_recall': sp_sent_recall,
               'joint_em': joint_em, 'joint_f1': joint_f1, 'joint_prec': joint_prec, 'joint_recall': joint_recall}


###### decode

In [26]:
%%add_to hotpotqa
def decode(self, input_ids, start_logits, end_logits, type_logits, sp_para_logits, sp_sent_logits):
#         print("decode")

    question_end_index = self._get_special_index(input_ids, [QUESTION_END])
#     print("question_end_index: ", question_end_index)

    # one example per batch
    start_logits = start_logits.squeeze()
    end_logits = end_logits.squeeze()
#     print("start_logits: ", start_logits)
#     print("end_logits: ", end_logits)
    start_logits_indices = start_logits.topk(k=self.args.n_best_size, dim=-1).indices
#     print("start_logits_indices: ", start_logits_indices)
    end_logits_indices = end_logits.topk(k=self.args.n_best_size, dim=-1).indices 
    if(len(start_logits_indices.size()) > 1):
        print("len(start_logits_indices.size()): ", len(start_logits_indices.size()))
        assert("len(start_logits_indices.size()) > 1")
    p_type = torch.argmax(type_logits, dim=1).item()
    p_type_score = torch.max(type_logits, dim=1)[0] 
#     print("type_logits: ", type_logits)
#         print("p_type: ", p_type)
#         print("p_type_score: ", p_type_score)

    answers = []
    if p_type == 0:
        potential_answers = []
        for start_logit_index in start_logits_indices: 
            for end_logit_index in end_logits_indices: 
                if start_logit_index <= question_end_index.item():
                    continue
                if end_logit_index <= question_end_index.item():
                    continue
                if start_logit_index > end_logit_index:
                    continue
                answer_len = end_logit_index - start_logit_index + 1
                if answer_len > self.args.max_answer_length:
                    continue
                potential_answers.append({'start': start_logit_index, 'end': end_logit_index,
                                          'start_logit': start_logits[start_logit_index],  # single logit score for start position at start_logit_index
                                          'end_logit': end_logits[end_logit_index]})    
        sorted_answers = sorted(potential_answers, key=lambda x: (x['start_logit'] + x['end_logit']), reverse=True) 
#             print("sorted_answers: " + str(sorted_answers))

        if len(sorted_answers) == 0:
            answers.append({'text': 'NoAnswerFound', 'score': -1000000})
        else:
            answer = sorted_answers[0]
            answer_token_ids = input_ids[0, answer['start']: answer['end'] + 1]

            answer_tokens = self.tokenizer.convert_ids_to_tokens(answer_token_ids.tolist())
            # remove [/sent], <t> and </t>
            for special_token in [SENT_MARKER_END, TITLE_START, TITLE_END]:
                try:
                    answer_tokens.remove(special_token)
                except:
                    pass

            text = self.tokenizer.convert_tokens_to_string(answer_tokens)
#             score = (answer['start_logit'] + answer['end_logit'] + p_type_score) / 3
            score = (torch.sigmoid(answer['start_logit']) + torch.sigmoid(answer['end_logit']) + torch.sigmoid(p_type_score)) / 3
            answers.append({'text': text, 'score': score})
#             print("answers: " + str(answers))
    elif p_type == 1: 
        answers.append({'text': 'yes', 'score': p_type_score})
    elif p_type == 2:
        answers.append({'text': 'no', 'score': p_type_score})
    elif p_type == 3:
        answers.append({'text': 'null', 'score': p_type_score})
    else:
        assert False 


    sent_indexes = self._get_special_index(input_ids, [SENT_MARKER_END])
    para_indexes = self._get_special_index(input_ids, [TITLE_START])

    s_to_p_map = []
    for s in sent_indexes:
        s_to_p = torch.where(torch.le(para_indexes, s))[0][-1]     # last para_index smaller or equal to s
        s_to_p_map.append(s_to_p.item()) 
#         print("s_to_p_map: " + str(s_to_p_map))

#         print("sp_para_logits", sp_para_logits)
#         print("sp_sent_logits", sp_sent_logits)

    sp_para_top2 = sp_para_logits.squeeze().topk(k=2).indices
    if(sp_sent_logits.squeeze().size(0) > 12):
        sp_sent_top12 = sp_sent_logits.squeeze().topk(k=12).indices
    else:
        sp_sent_top12 = sp_sent_logits.squeeze().topk(k=sp_sent_logits.squeeze().size(0)).indices
#         print("sp_para_top2", sp_para_top2)
#         print("sp_sent_top12", sp_sent_top12)

    sp_sent_pred = set()
    sp_para_pred = set(sp_para_top2.tolist())
    for sp_sent in sp_sent_top12:
        sp_sent_to_para = s_to_p_map[sp_sent.item()]
        if sp_sent_to_para in sp_para_top2:
            sp_sent_pred.add(sp_sent.item())
#             sp_para_pred.add(sp_sent_to_para) 
#         print("sp_sent_pred: " + str(sp_sent_pred))
#         print("sp_para_pred: " + str(sp_para_pred))
    return (answers, sp_sent_pred, sp_para_pred)

###### metrics

In [27]:
%%add_to hotpotqa
# def normalize_answer(self, s):

#     def remove_articles(text):
#         return re.sub(r'\b(a|an|the)\b', ' ', text)

#     def white_space_fix(text):
#         return ' '.join(text.split())

#     def remove_punc(text):
#         exclude = set(string.punctuation)
#         return ''.join(ch for ch in text if ch not in exclude)

#     def lower(text):
#         return text.lower()

#     return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(self, prediction, ground_truth):
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)
    ZERO_METRIC = (0, 0, 0)

    if normalized_prediction in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC
    if normalized_ground_truth in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return ZERO_METRIC
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1, precision, recall


def exact_match_score(self, prediction, ground_truth):
    return int(normalize_answer(prediction) == normalize_answer(ground_truth))


def sp_metrics(self, prediction, gold): 
    tp, fp, fn = 0, 0, 0
    for e in prediction:
        if e in gold:
            tp += 1
        else:
            fp += 1 
    for e in gold:
        if e not in prediction:
            fn += 1 
    prec = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * recall / (prec + recall) if prec + recall > 0 else 0.0
    em = 1.0 if fp + fn == 0 else 0.0 
    return em, prec, recall, f1 



##### validation_end

In [28]:
%%add_to hotpotqa
# If a validation_step is not defined, this won't be called. Called at the end of the validation loop with the outputs of validation_step.
def validation_end(self, outputs):
    print("validation_end")
    avg_loss = torch.stack([x['vloss'] for x in outputs]).mean()  
    avg_answer_loss = torch.stack([x['answer_loss'] for x in outputs]).mean()  
    avg_type_loss = torch.stack([x['type_loss'] for x in outputs]).mean()  
    avg_sp_para_loss = torch.stack([x['sp_para_loss'] for x in outputs]).mean()  
    avg_sp_sent_loss = torch.stack([x['sp_sent_loss'] for x in outputs]).mean()  


    answer_scores = [x['answer_score'] for x in outputs] 
    f1_scores = [x['f1'] for x in outputs]  
    em_scores = [x['em'] for x in outputs]  
    prec_scores =  [x['prec'] for x in outputs] 
    recall_scores = [x['recall'] for x in outputs]  
    sp_sent_f1_scores = [x['sp_f1'] for x in outputs]   
    sp_sent_em_scores = [x['sp_em'] for x in outputs]   
    sp_sent_prec_scores = [x['sp_prec'] for x in outputs]   
    sp_sent_recall_scores = [x['sp_recall'] for x in outputs]   
    joint_f1_scores = [x['joint_f1'] for x in outputs]  
    joint_em_scores = [x['joint_em'] for x in outputs]  
    joint_prec_scores = [x['joint_prec'] for x in outputs]  
    joint_recall_scores = [x['joint_recall'] for x in outputs]



    print(f'before sync --> sizes:  {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}')
    if self.trainer.use_ddp:
        torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM)
        avg_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_answer_loss, op=torch.distributed.ReduceOp.SUM)
        avg_answer_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_type_loss, op=torch.distributed.ReduceOp.SUM)
        avg_type_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_sp_para_loss, op=torch.distributed.ReduceOp.SUM)
        avg_sp_para_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_sp_sent_loss, op=torch.distributed.ReduceOp.SUM)
        avg_sp_sent_loss /= self.trainer.world_size 

        answer_scores = self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float)
        f1_scores = self.sync_list_across_gpus(f1_scores, avg_loss.device, torch.float)
        em_scores = self.sync_list_across_gpus(em_scores, avg_loss.device, torch.float)
        prec_scores = self.sync_list_across_gpus(prec_scores, avg_loss.device, torch.float)
        recall_scores = self.sync_list_across_gpus(recall_scores, avg_loss.device, torch.float)

        sp_sent_f1_scores = self.sync_list_across_gpus(sp_sent_f1_scores, avg_loss.device, torch.float)
        sp_sent_em_scores = self.sync_list_across_gpus(sp_sent_em_scores, avg_loss.device, torch.float)
        sp_sent_prec_scores = self.sync_list_across_gpus(sp_sent_prec_scores, avg_loss.device, torch.float)
        sp_sent_recall_scores = self.sync_list_across_gpus(sp_sent_recall_scores, avg_loss.device, torch.float)

        joint_f1_scores = self.sync_list_across_gpus(joint_f1_scores, avg_loss.device, torch.float)
        joint_em_scores = self.sync_list_across_gpus(joint_em_scores, avg_loss.device, torch.float)
        joint_prec_scores = self.sync_list_across_gpus(joint_prec_scores, avg_loss.device, torch.float)
        joint_recall_scores = self.sync_list_across_gpus(joint_recall_scores, avg_loss.device, torch.float)


    print(f'after sync --> sizes: {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}')

    avg_val_f1 = sum(f1_scores) / len(f1_scores)    
    avg_val_em = sum(em_scores) / len(em_scores)    
    avg_val_prec = sum(prec_scores) / len(prec_scores)  
    avg_val_recall = sum(recall_scores) / len(recall_scores)    
    avg_val_sp_sent_f1 = sum(sp_sent_f1_scores) / len(sp_sent_f1_scores)    
    avg_val_sp_sent_em = sum(sp_sent_em_scores) / len(sp_sent_em_scores)    
    avg_val_sp_sent_prec = sum(sp_sent_prec_scores) / len(sp_sent_prec_scores)  
    avg_val_sp_sent_recall = sum(sp_sent_recall_scores) / len(sp_sent_recall_scores)    
    avg_val_joint_f1 = sum(joint_f1_scores) / len(joint_f1_scores)  
    avg_val_joint_em = sum(joint_em_scores) / len(joint_em_scores)  
    avg_val_joint_prec = sum(joint_prec_scores) / len(joint_prec_scores)    
    avg_val_joint_recall = sum(joint_recall_scores) / len(joint_recall_scores)  

    print("avg_loss: ", avg_loss, end = '\t')   
    print("avg_answer_loss: ", avg_answer_loss, end = '\t') 
    print("avg_type_loss: ", avg_type_loss, end = '\t') 
    print("avg_sp_para_loss: ", avg_sp_para_loss, end = '\t')   
    print("avg_sp_sent_loss: ", avg_sp_sent_loss)   
    print("avg_val_f1: ", avg_val_f1, end = '\t')   
    print("avg_val_em: ", avg_val_em, end = '\t')   
    print("avg_val_prec: ", avg_val_prec, end = '\t')   
    print("avg_val_recall: ", avg_val_recall)   
    print("avg_val_sp_sent_f1: ", avg_val_sp_sent_f1, end = '\t')   
    print("avg_val_sp_sent_em: " , avg_val_sp_sent_em, end = '\t')  
    print("avg_val_sp_sent_prec: ", avg_val_sp_sent_prec, end = '\t')   
    print("avg_val_sp_sent_recall: ", avg_val_sp_sent_recall)   
    print("avg_val_joint_f1: " , avg_val_joint_f1, end = '\t')  
    print("avg_val_joint_em: ", avg_val_joint_em, end = '\t')   
    print("avg_val_joint_prec: ", avg_val_joint_prec, end = '\t')   
    print("avg_val_joint_recall: ", avg_val_joint_recall)   


    logs = {'avg_val_loss': avg_loss, 'avg_val_answer_loss': avg_answer_loss, 'avg_val_type_loss': avg_type_loss, 'avg_val_sp_para_loss': avg_sp_para_loss, 'avg_val_sp_sent_loss': avg_sp_sent_loss,   
    'avg_val_f1': avg_val_f1, 'avg_val_em': avg_val_em,  'avg_val_prec': avg_val_prec, 'avg_val_recall': avg_val_recall,    
    'avg_val_sp_sent_f1': avg_val_sp_sent_f1, 'avg_val_sp_sent_em': avg_val_sp_sent_em,  'avg_val_sp_sent_prec': avg_val_sp_sent_prec, 'avg_val_sp_sent_recall': avg_val_sp_sent_recall,    
    'avg_val_joint_f1': avg_val_joint_f1, 'avg_val_joint_em': avg_val_joint_em,  'avg_val_joint_prec': avg_val_joint_prec, 'avg_val_joint_recall': avg_val_joint_recall 
    }   

    return {'avg_val_loss': avg_loss, 'log': logs}

def sync_list_across_gpus(self, l, device, dtype):
    l_tensor = torch.tensor(l, device=device, dtype=dtype)
    gather_l_tensor = [torch.ones_like(l_tensor) for _ in range(self.trainer.world_size)]
    torch.distributed.all_gather(gather_l_tensor, l_tensor)
    return torch.cat(gather_l_tensor).tolist()

##### test_step

In [29]:
%%add_to hotpotqa
def test_step(self, batch, batch_nb):
    input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qid, answer = batch

    print("test_step of qid: ", qid, end="\t") 
    output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para)
    answer_loss, type_loss, sp_para_loss, sp_sent_loss, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output = output 
    loss = answer_loss + 5*type_loss + 10*sp_para_loss + 10*sp_sent_loss

    answers_pred, sp_sent_pred, sp_para_pred = self.decode(input_ids, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output)

    if(len(answers_pred) != 1):
        print("len(answers_pred) != 1")
        assert(len(answers_pred) == 1)

    pre_answer_score = answers_pred[0]['score']  # (start_logit + end_logit + p_type_score) / 3
    pre_answer = normalize_answer(answers_pred[0]['text'])
    # print("pred answer_score: " + str(pre_answer_score))
    # print("pred answer_text: " + str(pre_answer)) 

    gold_answer = normalize_answer(answer)

    print("pre_answer:\t", pre_answer, "\tgold_answer:\t", gold_answer)

    return { 'vloss': loss, 'answer_loss': answer_loss, 'type_loss': type_loss, 'sp_para_loss': sp_para_loss, 'sp_sent_loss': sp_sent_loss, 'answer_score': pre_answer_score}


##### test_end

In [30]:
%%add_to hotpotqa
def test_end(self, outputs):
    print("test_end")
    avg_loss = torch.stack([x['vloss'] for x in outputs]).mean()  
    avg_answer_loss = torch.stack([x['answer_loss'] for x in outputs]).mean()  
    avg_type_loss = torch.stack([x['type_loss'] for x in outputs]).mean()  
    avg_sp_para_loss = torch.stack([x['sp_para_loss'] for x in outputs]).mean()  
    avg_sp_sent_loss = torch.stack([x['sp_sent_loss'] for x in outputs]).mean()  

    answer_scores = [x['answer_score'] for x in outputs]  # [item for sublist in outputs for item in sublist['answer_score']] #torch.stack([x['answer_score'] for x in outputs]).mean() # 


    print(f'before sync --> sizes:  {len(answer_scores)}')
    if self.trainer.use_ddp:
        torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM)
        avg_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_answer_loss, op=torch.distributed.ReduceOp.SUM)
        avg_answer_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_type_loss, op=torch.distributed.ReduceOp.SUM)
        avg_type_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_sp_para_loss, op=torch.distributed.ReduceOp.SUM)
        avg_sp_para_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_sp_sent_loss, op=torch.distributed.ReduceOp.SUM)
        avg_sp_sent_loss /= self.trainer.world_size 

#         int_qids = self.sync_list_across_gpus(int_qids, avg_loss.device, torch.int)
        answer_scores = self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float)


    print(f'after sync --> sizes: {len(answer_scores)}')
    # print("answer_scores: ", answer_scores)

    # print("avg_loss: ", avg_loss, end = '\t') 
    # print("avg_answer_loss: ", avg_answer_loss, end = '\t') 
    # print("avg_type_loss: ", avg_type_loss, end = '\t') 
    # print("avg_sp_para_loss: ", avg_sp_para_loss, end = '\t') 
    # print("avg_sp_sent_loss: ", avg_sp_sent_loss, end = '\t')  

    logs = {'avg_val_loss': avg_loss, 'avg_val_answer_loss': avg_answer_loss, 'avg_val_type_loss': avg_type_loss, 'avg_val_sp_para_loss': avg_sp_para_loss, 'avg_val_sp_sent_loss': avg_sp_sent_loss
           }

    return {'avg_val_loss': avg_loss, 'log': logs} 


##### add_model_specific_args

In [31]:
%%add_to hotpotqa
@staticmethod
def add_model_specific_args(parser, root_dir):
    parser.add_argument("--save_dir", type=str, default='jupyter-hotpotqa')
    parser.add_argument("--save_prefix", type=str, required=True)
    parser.add_argument("--train_dataset", type=str, required=False, help="Path to the training squad-format")
    parser.add_argument("--dev_dataset", type=str, required=True, help="Path to the dev squad-format")
    parser.add_argument("--batch_size", type=int, default=2, help="Batch size")
    parser.add_argument("--gpus", type=str, default='0',
                        help="Comma separated list of gpus. Default is gpu 0. To use CPU, use --gpus "" ")
    parser.add_argument("--warmup", type=int, default=1000, help="Number of warmup steps")
    parser.add_argument("--lr", type=float, default=0.00005, help="Maximum learning rate")
    parser.add_argument("--val_every", type=float, default=1.0, help="How often within one training epoch to check the validation set.")
    parser.add_argument("--val_percent_check", default=1.00, type=float, help='Percent of validation data used')
    parser.add_argument("--num_workers", type=int, default=4, help="Number of data loader workers")
    parser.add_argument("--seed", type=int, default=1234, help="Seed")
    parser.add_argument("--epochs", type=int, default=6, help="Number of epochs")
    parser.add_argument("--max_seq_len", type=int, default=4096,
                        help="Maximum length of seq passed to the transformer model")
    parser.add_argument("--max_doc_len", type=int, default=4096,
                        help="Maximum number of wordpieces of the input document")
    parser.add_argument("--max_num_answers", type=int, default=64,
                        help="Maximum number of answer spans per document (64 => 94%)")
    parser.add_argument("--max_question_len", type=int, default=55,
                        help="Maximum length of the question")
    parser.add_argument("--doc_stride", type=int, default=-1,
                        help="Overlap between document chunks. Use -1 to only use the first chunk")
    parser.add_argument("--ignore_seq_with_no_answers", action='store_true',
                        help="each example should have at least one answer. Default is False")
    parser.add_argument("--disable_checkpointing", action='store_true', help="No logging or checkpointing")
    parser.add_argument("--n_best_size", type=int, default=20,
                        help="Number of answer candidates. Used at decoding time")
    parser.add_argument("--max_answer_length", type=int, default=30,
                        help="maximum num of wordpieces/answer. Used at decoding time")
    parser.add_argument("--regular_softmax_loss", action='store_true', help="IF true, use regular softmax. Default is using ORed softmax loss")
    parser.add_argument("--test", action='store_true', help="Test only, no training")
    parser.add_argument("--model_path", type=str,
                        help="Path to the checkpoint directory")
    parser.add_argument("--no_progress_bar", action='store_true', help="no progress bar. Good for printing")
    parser.add_argument("--attention_mode", type=str, choices=['tvm', 'sliding_chunks'],
                        default='sliding_chunks', help='Which implementation of selfattention to use')
    parser.add_argument("--fp32", action='store_true', help="default is fp16. Use --fp32 to switch to fp32")
    parser.add_argument('--train_percent', type=float, default=1.0)
    return parser

##### class info

In [32]:
dir(hotpotqa)

['CHECKPOINT_HYPER_PARAMS_KEY',
 'CHECKPOINT_HYPER_PARAMS_NAME',
 'CHECKPOINT_HYPER_PARAMS_TYPE',
 'T_destination',
 '_LightningModule__get_hparams_assignment_variable',
 '__abstractmethods__',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_apply',
 '_auto_collect_arguments',
 '_call_impl',
 '_forward_unimplemented',
 '_get_name',
 '_get_special_index',
 '_init_slurm_connection',
 '_load_from_state_dict',
 '_load_model_state',
 '_named_members',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',


In [33]:
from inspect import getmembers, isfunction
getmembers(hotpotqa)

[('CHECKPOINT_HYPER_PARAMS_KEY', 'hyper_parameters'),
 ('CHECKPOINT_HYPER_PARAMS_NAME', 'hparams_name'),
 ('CHECKPOINT_HYPER_PARAMS_TYPE', 'hparams_type'),
 ('T_destination', ~T_destination),
 ('_LightningModule__get_hparams_assignment_variable',
  <function pytorch_lightning.core.lightning.LightningModule.__get_hparams_assignment_variable(self)>),
 ('__abstractmethods__', frozenset()),
 ('__annotations__',
  {'_device': Ellipsis, '_dtype': typing.Union[str, torch.dtype]}),
 ('__call__',
  <function torch.nn.modules.module.Module._call_impl(self, *input, **kwargs)>),
 ('__class__', abc.ABCMeta),
 ('__delattr__',
  <function torch.nn.modules.module.Module.__delattr__(self, name)>),
 ('__dict__',
  mappingproxy({'__module__': '__main__',
                '__init__': <function __main__.hotpotqa.__init__(self, args)>,
                'load_model': <function __main__.hotpotqa.load_model(self)>,
                'train_dataloader': <function pytorch_lightning.core.decorators.data_loader.<local

In [34]:
functions_list = [o for o in getmembers(hotpotqa) if isfunction(o[1])]
functions_list

[('_LightningModule__get_hparams_assignment_variable',
  <function pytorch_lightning.core.lightning.LightningModule.__get_hparams_assignment_variable(self)>),
 ('__call__',
  <function torch.nn.modules.module.Module._call_impl(self, *input, **kwargs)>),
 ('__delattr__',
  <function torch.nn.modules.module.Module.__delattr__(self, name)>),
 ('__dir__', <function torch.nn.modules.module.Module.__dir__(self)>),
 ('__getattr__',
  <function torch.nn.modules.module.Module.__getattr__(self, name:str) -> Union[torch.Tensor, _ForwardRef('Module')]>),
 ('__init__', <function __main__.hotpotqa.__init__(self, args)>),
 ('__repr__', <function torch.nn.modules.module.Module.__repr__(self)>),
 ('__setattr__',
  <function torch.nn.modules.module.Module.__setattr__(self, name:str, value:Union[torch.Tensor, _ForwardRef('Module')]) -> None>),
 ('__setstate__',
  <function torch.nn.modules.module.Module.__setstate__(self, state)>),
 ('_apply', <function torch.nn.modules.module.Module._apply(self, fn)>),


In [35]:
import inspect
inspect.getmro(hotpotqa)  # a hierarchy of classes 

(__main__.hotpotqa,
 pytorch_lightning.core.lightning.LightningModule,
 abc.ABC,
 pytorch_lightning.utilities.device_dtype_mixin.DeviceDtypeModuleMixin,
 pytorch_lightning.core.grads.GradInformation,
 pytorch_lightning.core.saving.ModelIO,
 pytorch_lightning.core.hooks.ModelHooks,
 torch.nn.modules.module.Module,
 object)

In [36]:
help(hotpotqa.configure_optimizers)

Help on function configure_optimizers in module __main__:

configure_optimizers(self)



In [37]:
# import inspect
# code, line_no = inspect.getsourcelines(hotpotqa.training_step)
# print(''.join(code))

### main

In [38]:
def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
    if not args.test:     # if it needs to train, remove exsiting folder
        import shutil
        save_folder = os.path.join(args.save_dir, args.save_prefix)
        if os.path.exists(save_folder):
            shutil.rmtree(save_folder, ignore_errors=True)  #delete non-empty folder 
        
    import shutil
    save_folder = os.path.join(args.save_dir, args.save_prefix)
    if os.path.exists(save_folder):
        shutil.rmtree(save_folder, ignore_errors=True)  #delete non-empty folder


In [41]:
    hotpotqa.__abstractmethods__=set()   # without this, got an error "Can't instantiate abstract class hotpotqa with abstract methods" if these two abstract methods are not implemented in the same cell where class hotpotqa defined 
    model = hotpotqa(args)
#     model.to('cuda')    # this is necessary to use gpu
    

self.args.model_path:  /xdisk/msurdeanu/fanluo/hotpotQA/longformer-base-4096
Loaded model with config:
LongformerConfig {
  "attention_dilation": [
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1
  ],
  "attention_mode": "sliding_chunks",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256
  ],
  "autoregressive": false,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 50265
}



In [42]:
    logger = TestTubeLogger( # The TestTubeLogger adds a nicer folder structure to manage experiments and snapshots all hyperparameters you pass to a LightningModule.
        save_dir=args.save_dir,
        name=args.save_prefix,
        version=0  # always use version=0
    )

In [43]:
    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(args.save_dir, args.save_prefix, "checkpoints"),
        save_top_k=5,
        verbose=True,
        monitor='avg_val_f1',
        mode='max',
        prefix=''
    )

In [44]:
    train_set_size = 9 * args.train_percent # 90447 * args.train_percent   # hardcode dataset size. Needed to compute number of steps for the lr scheduler
    print("train_set_size: ", train_set_size) 

    args.gpus = [int(x) for x in args.gpus.split(',')] if args.gpus!='' else None
    num_devices = 1 or len(args.gpus)
    print("num_devices: ", num_devices)

    train_set_size = 90447 * args.train_percent    # hardcode dataset size. Needed to compute number of steps for the lr scheduler
    args.steps = args.epochs * train_set_size / (args.batch_size * num_devices)

    print(f'>>>>>>> #train_set_size: {train_set_size}, #steps: {args.steps},  #warmup steps: {args.warmup}, #epochs: {args.epochs}, batch_size: {args.batch_size * num_devices} <<<<<<<')

train_set_size:  9.0
num_devices:  1
>>>>>>> #train_set_size: 90447.0, #steps: 271341.0,  #warmup steps: 1000, #epochs: 6, batch_size: 2 <<<<<<<


In [45]:
    trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp', # if args.gpus and (len(args.gpus) > 1) else None,
                             track_grad_norm=-1, max_epochs=args.epochs, early_stop_callback=None,
                             accumulate_grad_batches=args.batch_size,
                             train_percent_check = args.train_percent,
        #                          val_check_interval=args.val_every,
                             val_percent_check=args.val_percent_check,
                             test_percent_check=args.val_percent_check,
                             logger=logger if not args.disable_checkpointing else False,
                             checkpoint_callback=checkpoint_callback if not args.disable_checkpointing else False,
                             show_progress_bar=args.no_progress_bar,
#                              use_amp=not args.fp32, 
                             amp_level='O2',
#                              check_val_every_n_epoch=1
                             )


GPU available: True, used: True
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
TPU available: False, using: 0 TPU cores
Multi-processing is handled by Slurm.
Multi-processing is handled by Slurm.
CUDA_VISIBLE_DEVICES: [0]
CUDA_VISIBLE_DEVICES: [0]


In [None]:
#     if not args.test: 
trainer.fit(model)


initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/1
initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/1
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
distributed_backend=ddp
distributed_backend=ddp
All DDP processes registered. Starting ddp with 1 processes
All DDP processes registered. Starting ddp with 1 processes
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------


global step:  0


Set SLURM handle signals.
Set SLURM handle signals.

  | Name        | Type       | Params
-------------------------------------------
0 | model       | Longformer | 148 M 
1 | qa_outputs  | Linear     | 1 K   
2 | linear_type | Linear     | 3 K   
3 | fnn_sp_sent | Sequential | 591 K 
4 | fnn_sp_para | Sequential | 591 K 

  | Name        | Type       | Params
-------------------------------------------
0 | model       | Longformer | 148 M 
1 | qa_outputs  | Linear     | 1 K   
2 | linear_type | Linear     | 3 K   
3 | fnn_sp_sent | Sequential | 591 K 
4 | fnn_sp_para | Sequential | 591 K 


reading file: small.json


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
answer:  danny leiner
answer:  24 october 1632
answer:  lorax
self.tokenizer.sep_token:  </s>
answer:  espn
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True


	nonzero(Tensor input, *, Tensor out)
Consider using one of the following signatures instead:
	nonzero(Tensor input, *, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)


size of input_ids: torch.Size([1, 1536])
answer:  1937
answer:  doug moench and don perlin
answer:  1960
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
answer:  transcendentalist
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[-0.0359,  0.0133,  0.0292,  ..., -0.2841, -0.0267,  0.0766],
         [ 0.0486, -0.0523, -0.0488,  ...,  0.6019, -0.0301,  0.1249],
         [ 0.1347,  0.1540, -0.0192,  ..., -0.6490,  0.0473, -0.2353],
         ...,
         [-0.0236,  0.0741, -0.0145,  ..., -0.0990, -0.0409, -0.0745],
         [-0.0236,  0.0741, -0.0145,  ..., -0.0990, -0.0409, -0.0745],
         [-0.0236,  0.0741, -0.0145,  ..., -0.0990, -0.0409, -0.0745]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1331, 768])
size of input_ids: torch.Size([1, 1536])
self.tokenizer.sep_token:  </s>
answer:  ash avildsen and matty becke

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
answer:  danny leiner
self.tokenizer.sep_token:  </s>
answer:  lorax
answer:  24 october 1632
self.tokenizer.sep_token == '</s>':  True
answer:  espn
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
answer:  1937
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  doug moench and don perlin
answer:  1960
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  transcendentalist
size of sequence_output: torch.Size([1, 1536, 768])


size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[-0.1106, -0.0103,  0.0366,  ..., -0.3671, -0.0898,  0.0655],
         [ 0.1210,  0.0428,  0.1013,  ..., -0.4207, -0.0730,  0.2479],
         [-0.0758,  0.0504, -0.1213,  ..., -0.5045, -0.2694,  0.1369],
         ...,
         [ 0.0189,  0.1749,  0.0750,  ..., -0.1533, -0.0350, -0.2362],
         [-0.0665,  0.2056, -0.0351,  ..., -0.0933, -0.0637, -0.1430],
         [-0.0550,  0.1716,  0.0951,  ..., -0.0538,  0.0068, -0.2469]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1080, 768])
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[-0.1997,  0.0716, -0.0086,  ..., -0.3736, -0.0572,  0.0385],
         [ 0.1203,  0.1857, -0.0060,  ..., -0.2135, -0.0619

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[-0.0726,  0.0640, -0.0117,  ..., -0.2106, -0.0494,  0.1057],
         [ 0.2421,  0.2590,  0.0826,  ..., -0.4265, -0.0518,  0.3259],
         [ 0.0857,  0.0300,  0.0105,  ..., -0.4446, -0.0461,  0.1669],
         ...,
         [-0.1097,  0.2487, -0.1712,  ..., -0.2676, -0.0820, -0.0329],
         [ 0.0153,  0.0648, -0.0239,  ..., -0.1014, -0.0475, -0.0636],
         [-0.0388,  0.0420, -0.0178,  ..., -0.1119, -0.0557, -0.0623]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1343, 768])
answer:  st johns
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[-0.0600,  0.0740,  0.0020,  

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[-0.0118,  0.0734,  0.0204,  ..., -0.2059, -0.0240,  0.0572],
         [ 0.1370,  0.2720, -0.1168,  ..., -0.2919,  0.0112,  0.0489],
         [ 0.0397,  0.0566,  0.0789,  ..., -0.4607,  0.0238,  0.0421],
         ...,
         [-0.0386,  0.1811,  0.0719,  ..., -0.0498, -0.0997,  0.0417],
         [-0.0260,  0.0817, -0.0184,  ..., -0.1089, -0.0177, -0.0698],
         [-0.0072,  0.0745, -0.0217,  ..., -0.1060, -0.0440, -0.0716]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1283, 768])
answer:  scottie pippen
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[-7.4364e-02,  3.1980e-

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[-5.4728e-02,  1.3913e-03,  2.3613e-02,  ..., -5.8015e-01,
           1.8563e-02,  1.4894e-01],
         [ 1.4574e-01, -8.0592e-02,  2.5095e-01,  ..., -1.4375e-01,
           2.8028e-02,  1.0362e-01],
         [ 1.0094e-01, -3.0564e-02,  9.6016e-02,  ..., -6.5910e-01,
          -3.7110e-02, -3.0453e-02],
         ...,
         [ 4.9037e-04,  7.0593e-02,  2.3534e-03,  ..., -9.6795e-02,
          -4.3533e-02, -6.2386e-02],
         [-1.3919e-02,  7.7417e-02,  2.4859e-03,  ..., -1.0999e-01,
          -3.5045e-02, -6.1303e-02],
         [-2.1549e-02,  5.1460e-02, -1.2580e-02,  ..., -1.1370e-01,
          -2.9996e-02, -7.5967e-02]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1296, 768])
answer:  shane meadows
self.tokenize

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[-0.0328, -0.0005,  0.0263,  ..., -0.1943, -0.0340,  0.1016],
         [ 0.1863, -0.0454,  0.1096,  ...,  0.1210,  0.0735,  0.2636],
         [ 0.1185, -0.0578, -0.0616,  ..., -0.2118, -0.0007,  0.2589],
         ...,
         [ 0.0238,  0.1159, -0.0207,  ..., -0.0548, -0.0799, -0.1437],
         [-0.0158,  0.0614, -0.0147,  ..., -0.0998, -0.0262, -0.0489],
         [ 0.0618,  0.0784, -0.0661,  ...,  0.0341,  0.1786, -0.2109]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1269, 768])
answer:  lost princess of oz
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[-0.0437,  0.0157,

answer:  szombathelyi haladás
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  ring
size of input_ids: torch.Size([1, 1024])
size of sequence_output: torch.Size([1, 1024, 768])
sequence_output  tensor([[[-0.0456, -0.0140,  0.0729,  ..., -0.2566,  0.0890,  0.2158],
         [-0.0042, -0.0358,  0.0755,  ...,  0.0407,  0.2466,  0.3184],
         [ 0.0532, -0.1721, -0.1456,  ..., -0.3422,  0.0148,  0.1230],
         ...,
         [-0.1986,  0.2130,  0.0959,  ...,  0.0592,  0.1021, -0.0737],
         [-0.0132,  0.0708, -0.0021,  ..., -0.1036, -0.0379, -0.0601],
         [-0.0477,  0.3014, -0.1526,  ...,  0.0124, -0.0507,  0.1879]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 656, 768])
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 2.

size of input_ids: torch.Size([1, 512])
size of sequence_output: torch.Size([1, 512, 768])
sequence_output  tensor([[[ 0.1306,  0.2392,  0.1031,  ..., -0.0563,  0.1528,  0.1085],
         [ 0.3735,  0.2653,  0.0256,  ..., -0.0169,  0.3229, -0.0989],
         [-0.0189, -0.0188,  0.0476,  ..., -0.0981,  0.2579, -0.0101],
         ...,
         [-0.0083,  0.0787, -0.0305,  ..., -0.0971, -0.0414, -0.0035],
         [-0.0248,  0.0570, -0.0142,  ..., -0.0953, -0.0377, -0.0783],
         [-0.0164,  0.0576, -0.0134,  ..., -0.1008, -0.0393, -0.0680]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 392, 768])
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.1394,  0.1195,  0.0992,  ..., -0.3077,  0.2107,  0.0531],
         [ 0.1315,  0.0929,  0.2702,  ..., -0.1794,  0.1408,  0.2202],
         [ 0.3658,  0.1445,  0.0653,  ..., -0.9938,  0.1383,  0.1

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  danny leiner
answer:  lorax
answer:  24 october 1632
answer:  espn
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
answer:  1937
answer:  doug moench and don perlin
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  1960
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.1223,  0.0387,  0.0784,  ..., -0.2250,  0.1310,  0.1307],
         [ 0.

answer:  4
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 2048])
size of sequence_output: torch.Size([1, 2048, 768])
sequence_output  tensor([[[ 0.0982, -0.0328,  0.1054,  ..., -0.3011,  0.1453,  0.0821],
         [ 0.3076,  0.1939,  0.1840,  ..., -0.0015,  0.1511,  0.5456],
         [ 0.1868,  0.0763, -0.0046,  ..., -0.4326,  0.1115,  0.0364],
         ...,
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800],
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800],
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1718, 768])
self.tokenizer.sep_token:  </s>
answer:  loop parkway
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.0370,  0.0941,  0.0419,  ..., -0.1607,  0.162

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  st johns
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.1827,  0.0812,  0.0347,  ..., -0.0913,  0.1811,  0.2213],
         [ 0.1827,  0.0629,  0.2533,  ...,  0.5935,  0.1622,  0.5787],
         [ 0.3404,  0.0319, -0.0062,  ..., -0.3868,  0.4086,  0.1963],
         ...,
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800],
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800],
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1456, 768])
answer:  square enix
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.0647,  

self.tokenizer.sep_token == '</s>':  True
answer:  wachovia securities
answer:  no
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.0287,  0.1064,  0.0264,  ...,  0.0372,  0.1099,  0.2248],
         [ 0.5491, -0.3313,  0.2603,  ...,  1.0237,  0.2083,  0.5710],
         [ 0.0413,  0.0328,  0.0115,  ..., -0.1087,  0.3684,  0.2890],
         ...,
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800],
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800],
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1163, 768])
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.0292, -0.0408,  0.0925,  ..., -0.1619,  0.2170,  0.0869],
         [ 0.1549,  0.0106,  0.0835,  ...,  0.2186, 

answer:  magnolia pictures
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.0959,  0.0826,  0.0702,  ..., -0.1023,  0.1978,  0.1525],
         [ 0.2697, -0.1184,  0.2008,  ...,  0.3531,  0.1955,  0.4013],
         [ 0.2552, -0.0669,  0.1455,  ..., -0.2931,  0.2671, -0.0212],
         ...,
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800],
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800],
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1192, 768])
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.1004,  0.1523,  0.0940,  ..., -0.0859,  0.0682,  0.0205],
         [ 0.2877,  0.0539,  0.1886,  ...,  1.0060,  0.1268,  0.3437],
         [ 0.1407, -0.0055,  0.0604, 

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  july 16 2012
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.0224,  0.2026,  0.0612,  ..., -0.1677,  0.1605,  0.1341],
         [-0.0480,  0.2772, -0.0492,  ...,  0.2297,  0.3704,  0.2182],
         [ 0.3580,  0.0625, -0.0508,  ..., -0.3476,  0.2088, -0.0123],
         ...,
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800],
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800],
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1205, 768])
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 5.3343e-02,  2.0309e-01, -4.9136e-04,  ...,  5.0670e-02,
           3.8593e-02,  5.6632e-02],
         [ 5.1553e-02,  3.2699e-02,  2.6760e-01,  ...,  5.9101e-01,
           

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  2001
size of sequence_output: torch.Size([1, 512, 768])
sequence_output  tensor([[[ 0.1685,  0.1343,  0.1509,  ...,  0.0583,  0.2045,  0.2934],
         [ 0.1506, -0.0906, -0.2234,  ...,  0.3318,  0.5585,  0.2301],
         [ 0.1322, -0.1848, -0.0191,  ..., -0.2357,  0.3554,  0.0475],
         ...,
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800],
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800],
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 457, 768])
size of input_ids: torch.Size([1, 1024])
size of sequence_output: torch.Size([1, 1024, 768])
sequence_output  tensor([[[-0.1518,  0.0867,  0.0146,  ..., -0.3602,  0.2441,  0.1222],
         [ 0.2164,  0.3344,  0.1650,  ...,  0.064

size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.1011,  0.0584, -0.0042,  ..., -0.0554,  0.1996,  0.0963],
         [ 0.1053,  0.2249,  0.0904,  ...,  0.7017,  0.1805,  0.4147],
         [ 0.2511,  0.2710, -0.1127,  ..., -0.6488,  0.3354,  0.0586],
         ...,
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800],
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800],
         [-0.0191,  0.0750, -0.0116,  ..., -0.0931, -0.0366, -0.0800]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1242, 768])



Epoch 00000: avg_val_f1 reached 0.00253 (best 0.00253), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_0.ckpt as top 5

Epoch 00000: avg_val_f1 reached 0.00253 (best 0.00253), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_0.ckpt as top 5


validation_end
before sync --> sizes:  79, 79, 79
after sync --> sizes: 79, 79, 79
avg_loss:  tensor(23.8353, device='cuda:0')	avg_answer_loss:  tensor(6.2247, device='cuda:0')	avg_type_loss:  tensor(1.2993, device='cuda:0')	avg_sp_para_loss:  tensor(0.5808, device='cuda:0')	avg_sp_sent_loss:  tensor(0.5306, device='cuda:0')
avg_val_f1:  0.0025316456073447118	avg_val_em:  0.0	avg_val_prec:  0.0021097047042243086	avg_val_recall:  0.0031645569620253164
avg_val_sp_sent_f1:  0.07073538439183295	avg_val_sp_sent_em:  0.0	avg_val_sp_sent_prec:  0.0793248948794377	avg_val_sp_sent_recall:  0.07489451539667347
avg_val_joint_f1:  0.0	avg_val_joint_em:  0.0	avg_val_joint_prec:  0.0	avg_val_joint_recall:  0.0
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  Tru

answer:  shane meadows
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.1852,  0.0785, -0.0208,  ..., -0.1188,  0.2572,  0.1128],
         [ 0.6080,  0.5488, -0.2958,  ...,  0.5940,  0.3633,  0.3780],
         [ 0.3858,  0.1679, -0.0526,  ..., -0.4889,  0.4121,  0.2248],
         ...,
         [-0.0247,  0.0774, -0.0116,  ..., -0.0998, -0.0363, -0.0712],
         [-0.0131,  0.0561, -0.0096,  ..., -0.0953, -0.0270, -0.0608],
         [ 0.0164,  0.0491, -0.0269,  ..., -0.1077, -0.0047, -0.1093]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1336, 768])
answer:  1970
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  hollywood madam
size of input_ids: torch.Size([1, 1024])
size of sequence_output: torch.Size([1, 1024, 768])
sequence_ou

answer:  2001
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1024])
size of sequence_output: torch.Size([1, 1024, 768])
sequence_output  tensor([[[ 0.0567,  0.1531, -0.0389,  ..., -0.0129,  0.4279,  0.0198],
         [-0.0821,  0.3271,  0.0822,  ...,  0.0952,  0.6946,  0.2540],
         [ 0.1467,  0.0836, -0.1490,  ...,  0.1843,  0.4235, -0.3423],
         ...,
         [-0.0249,  0.0686, -0.0139,  ..., -0.0877, -0.0451, -0.0631],
         [ 0.0271,  0.4480, -0.1728,  ..., -0.1872,  0.0622, -0.1031],
         [-0.0305,  0.1411,  0.1011,  ..., -0.0117, -0.0873, -0.2132]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 800, 768])
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  essex
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.2866,  0.0213, 

answer:  toledo
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1024])
size of sequence_output: torch.Size([1, 1024, 768])
sequence_output  tensor([[[ 5.5788e-01,  3.7048e-02, -5.4111e-04,  ...,  2.0729e-01,
           6.1752e-01,  1.8289e-01],
         [ 1.4218e-01, -3.0544e-01,  2.1430e-01,  ...,  1.3617e+00,
           6.6305e-01, -5.1045e-02],
         [ 5.0940e-01,  1.4443e-01,  1.1565e-01,  ...,  6.0137e-02,
           3.8755e-01,  1.6144e-01],
         ...,
         [ 1.4463e-02,  6.8108e-02, -2.8251e-03,  ..., -9.6176e-02,
          -3.4441e-02, -7.1044e-02],
         [-3.4059e-03,  8.1055e-02, -1.3818e-02,  ..., -1.0458e-01,
          -7.5498e-03, -9.1760e-02],
         [ 1.3256e-01,  4.1189e-01, -3.8825e-02,  ...,  5.6006e-02,
           1.4869e-01, -1.8081e-01]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 819, 768])
answer:  russia
self.

answer:  michael caine
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 7.4757e-01,  2.0323e-01, -2.2555e-01,  ...,  2.1729e-01,
           6.4262e-01,  1.2505e-01],
         [ 5.9372e-01, -2.6139e-01, -3.8291e-01,  ...,  7.9879e-01,
           1.1079e+00,  7.2382e-02],
         [ 4.7673e-01,  5.0992e-02, -1.0043e-01,  ...,  4.1361e-01,
           7.6761e-01, -2.2916e-01],
         ...,
         [-7.7187e-02,  2.3473e-01,  2.8660e-04,  ..., -2.2458e-01,
          -2.0154e-01, -4.2165e-01],
         [-9.5910e-03,  4.1451e-02, -1.0358e-02,  ..., -9.8377e-02,
          -1.2831e-02, -8.8326e-02],
         [-1.6740e-02,  6.8677e-02,  5.0730e-04,  ..., -8.6465e-02,
          -3.3139e-02, -8.4560e-02]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1064, 768])
answer:  step

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.7336,  0.0255, -0.3711,  ...,  1.1342,  1.4860, -0.0799],
         [ 0.0504, -0.2161, -0.2792,  ...,  1.1770,  0.8335,  0.1444],
         [ 0.7410,  0.1520, -0.6248,  ...,  1.1098,  1.0579, -0.1970],
         ...,
         [-0.0191,  0.0686, -0.0072,  ..., -0.0894, -0.0305, -0.0723],
         [ 0.1191,  0.1659,  0.0829,  ...,  0.0327, -0.0081, -0.5002],
         [ 0.0045,  0.0500,  0.0129,  ..., -0.1149, -0.0414, -0.0500]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1269, 768])
answer:  paul winters
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  buddleja
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.858

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 512])
size of sequence_output: torch.Size([1, 512, 768])
sequence_output  tensor([[[ 6.0253e-01,  8.9506e-02,  1.7209e-01,  ...,  9.8133e-01,
           1.4493e+00,  7.9466e-02],
         [ 4.5881e-01, -4.6948e-02,  1.5306e-01,  ...,  2.4290e+00,
           3.4726e-01, -4.7124e-01],
         [ 4.5605e-01, -1.7196e-01, -1.2411e-01,  ...,  1.4813e+00,
           1.2541e+00, -4.8189e-01],
         ...,
         [-1.5556e-02,  1.0262e-01, -2.1442e-03,  ..., -8.3349e-02,
          -3.6235e-02, -8.0120e-02],
         [-9.6662e-02,  2.9972e-01, -1.2477e-01,  ...,  8.3485e-02,
          -1.2110e-01, -1.7241e-01],
         [ 6.7860e-02,  2.2409e-01, -4.1338e-02,  ..., -6.7456e-02,
           4.9643e-03, -3.0883e-01]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 392, 768])
self.tokenizer.sep_token:  </s>
self.to

answer:  transcendentalist
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.6514,  0.1071, -0.2029,  ...,  2.3371,  1.3351,  0.1725],
         [ 0.8455,  0.1772,  0.0073,  ...,  2.5339,  0.9216, -0.4022],
         [ 0.4617,  0.1731, -0.0158,  ...,  1.6968,  1.1092, -0.3050],
         ...,
         [-0.0146,  0.0793, -0.0138,  ..., -0.1052, -0.0368,  0.0119],
         [ 0.0452,  0.2870, -0.3165,  ..., -0.0793, -0.1148, -0.6095],
         [-0.0074,  0.0735,  0.0137,  ..., -0.0838, -0.0191, -0.0843]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1326, 768])
size of input_ids: torch.Size([1, 2048])
size of sequence_output: torch.Size([1, 2048, 768])
sequence_output  tensor([[[ 1.0590,  0.1445, -0.5199,  ...,  1.7723,  1.6694, -0.7570],
         [ 1.1997,  0.1300, -0.3895,  ...,  1.7741,  0.5129, -0.7016],
         [ 1.0424,  0.1529, -0.2563

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
answer:  danny leiner
answer:  24 october 1632
answer:  lorax
self.tokenizer.sep_token:  </s>
answer:  espn
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
answer:  1937
answer:  doug moench and don perlin
answer:  1960
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.0157, -0.0073,  0.0069,  ...,  2.2420,  1.5617, -0.0585],
         [ 0.3150,  0.1235, -0.0756,  ...,  2.7772,  0.7563,  0.1382],
         [ 0.639

answer:  lashkaretaiba
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  4
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.6757,  0.1931, -0.1657,  ...,  2.1764,  1.5568, -0.2436],
         [ 0.4513,  0.1707,  0.0212,  ...,  2.3876,  0.7915,  0.1031],
         [ 0.3967,  0.2400, -0.2206,  ...,  2.1255,  0.7083, -0.0643],
         ...,
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902],
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902],
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1208, 768])
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 2048])
size of sequence_output: torch.Size([1, 2048, 768])
sequence_output  tensor([[[ 1.0636e+00,  1.4932e-01, -1.5304e-03,  ...,  2

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.9196,  0.1021, -0.0835,  ...,  2.1416,  1.6452, -0.2051],
         [ 0.5120,  0.2104, -0.0417,  ...,  2.2824,  0.8252,  0.1528],
         [ 0.5386,  0.2226,  0.1616,  ...,  2.2166,  1.0886,  0.1526],
         ...,
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902],
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902],
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1062, 768])
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  st johns
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.0702,  0.1342, -0.0244,  ...,  2.2328,  1.5934,  0.0660],
  

answer:  wachovia securities
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1024, 768])
sequence_output  tensor([[[ 0.7821,  0.1021,  0.0587,  ...,  1.9704,  1.5782, -0.1466],
         [ 0.6485,  0.1352,  0.0743,  ...,  2.2754,  0.8938, -0.0190],
         [ 0.4014,  0.0089,  0.3046,  ...,  2.0318,  1.2846, -0.3546],
         ...,
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902],
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902],
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1002, 768])
size of input_ids: torch.Size([1, 1536])
answer:  no
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.8781,  0.1520, -0.1496,  ...,  2.2083,  1.5936,  0.0176],
         [ 0.5536,  

answer:  university of mount union
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1024, 768])
sequence_output  tensor([[[ 0.8156,  0.4243, -0.0925,  ...,  2.0813,  1.5808,  0.1155],
         [ 0.1287,  0.5354,  0.0623,  ...,  2.2553,  0.7064,  0.0146],
         [ 0.8173,  0.3644, -0.0721,  ...,  2.4909,  1.0717, -0.1213],
         ...,
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902],
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902],
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 800, 768])
size of input_ids: torch.Size([1, 1536])
answer:  magnolia pictures
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.1057,  0.1029,  0.0704,  ...,  2.2452,  1.7565, -0.3242],


answer:  munster rugby
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 512, 768])
sequence_output  tensor([[[ 0.8607,  0.1265, -0.2412,  ...,  1.9834,  1.4797,  0.0600],
         [ 0.4065,  0.1023, -0.2261,  ...,  2.4859,  1.0102,  0.2601],
         [ 0.4032, -0.5485,  0.1151,  ...,  0.9464,  1.2342,  0.0808],
         ...,
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902],
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902],
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 508, 768])
size of input_ids: torch.Size([1, 1536])
answer:  july 16 2012
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.8094,  0.1453, -0.0745,  ...,  2.3739,  1.6560, -0.6074],
         [ 0.5785,

answer:  son of ulf jarl
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  buddleja
self.tokenizer.sep_token:  </s>
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.9873,  0.0263, -0.2387,  ...,  2.0156,  1.7477,  0.0788],
         [ 0.5957,  0.1607, -0.1279,  ...,  2.3916,  0.9533,  0.2603],
         [ 0.7437,  0.3292, -0.2433,  ...,  1.9643,  1.0655,  0.0024],
         ...,
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902],
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902],
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1064, 768])
size of input_ids: torch.Size([1, 512])
self.tokenizer.sep_token == '</s>':  True
answer:  2001
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch

size of input_ids: torch.Size([1, 2048])
size of sequence_output: torch.Size([1, 2048, 768])
sequence_output  tensor([[[ 1.0778,  0.2210, -0.0303,  ...,  2.0745,  1.4848, -0.1442],
         [ 0.6914,  0.1718,  0.0924,  ...,  2.5642,  0.7299,  0.0148],
         [ 0.5691,  0.0761,  0.0638,  ...,  1.7015,  0.7440, -0.2266],
         ...,
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902],
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902],
         [-0.0126,  0.0737, -0.0054,  ..., -0.0804, -0.0306, -0.0902]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1632, 768])
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.0363,  0.1034, -0.2046,  ...,  2.4116,  1.5541, -0.1260],
         [ 0.6534,  0.2434,  0.0439,  ...,  2.7582,  0.8016,  0.1517],
         [ 0.8340,  0.2788, -0.0332,  ...,  2.1187,  0.9513, -0.2231],
         ...,
         [-0


Epoch 00001: avg_val_f1 reached 0.02223 (best 0.02223), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_1.ckpt as top 5

Epoch 00001: avg_val_f1 reached 0.02223 (best 0.02223), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_1.ckpt as top 5


validation_end
before sync --> sizes:  79, 79, 79
after sync --> sizes: 79, 79, 79
avg_loss:  tensor(14.9075, device='cuda:0')	avg_answer_loss:  tensor(6.1241, device='cuda:0')	avg_type_loss:  tensor(0.1913, device='cuda:0')	avg_sp_para_loss:  tensor(0.5163, device='cuda:0')	avg_sp_sent_loss:  tensor(0.2664, device='cuda:0')
avg_val_f1:  0.02223092921172516	avg_val_em:  0.0	avg_val_prec:  0.016089558884312835	avg_val_recall:  0.0599156120155431
avg_val_sp_sent_f1:  0.07371910916099066	avg_val_sp_sent_em:  0.012658227848101266	avg_val_sp_sent_prec:  0.07278481088107146	avg_val_sp_sent_recall:  0.0801687768743008
avg_val_joint_f1:  0.0	avg_val_joint_em:  0.0	avg_val_joint_prec:  0.0	avg_val_joint_recall:  0.0
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '

answer:  dog
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  no
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.3680e+00, -1.7817e-01, -1.1386e-01,  ...,  1.9056e+00,
           1.3888e+00, -2.5035e-01],
         [ 9.6188e-01, -1.0420e-04,  4.0997e-02,  ...,  1.4362e+00,
           9.5479e-01, -2.5026e-01],
         [ 8.0660e-01,  2.7117e-02,  2.1798e-02,  ...,  1.5980e+00,
           6.8658e-01, -1.2696e-01],
         ...,
         [ 1.3035e-01,  2.0282e-01, -1.6195e-01,  ..., -1.6540e-02,
           1.8594e-01, -2.9500e-01],
         [ 4.5843e-03,  5.2002e-02, -1.5141e-03,  ..., -6.6352e-02,
          -2.6672e-02, -1.0612e-01],
         [-1.7001e-02,  7.0121e-02, -1.4106e-02,  ..., -9.0528e-02,
          -3.5649e-02, -8.0082e-02]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1471, 768])
self.tokeni

answer:  university of mount union
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.0706, -0.1740, -0.0070,  ...,  2.2278,  0.8494, -0.4564],
         [ 0.7876,  0.0718,  0.2668,  ...,  2.3060,  0.8998,  0.3426],
         [ 0.3656, -0.0694,  0.0945,  ...,  2.6680,  0.8572, -0.2772],
         ...,
         [-0.0143,  0.1441,  0.0181,  ..., -0.1038, -0.0542, -0.0568],
         [-0.0045,  0.0697, -0.0044,  ..., -0.0831, -0.0306, -0.0776],
         [-0.0067,  0.0944,  0.0094,  ..., -0.0763, -0.0290, -0.0814]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1205, 768])
answer:  stephen gyllenhaal and naomi achs
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  1976 to 2009
size of input_ids: torch.Size([1, 1024])
size of sequence_output: 

answer:  1905
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.8754,  0.2227,  0.1119,  ...,  0.5481,  1.6157, -0.1014],
         [ 0.6849, -0.1388,  0.0683,  ...,  2.1967,  1.0340,  0.0998],
         [ 0.7904, -0.3320, -0.0141,  ...,  0.6822,  0.7091, -0.0267],
         ...,
         [ 0.0039,  0.0505, -0.0053,  ..., -0.1148, -0.0150, -0.0631],
         [-0.0105,  0.0734, -0.0043,  ..., -0.0855, -0.0293, -0.0879],
         [-0.0037,  0.0669, -0.0053,  ..., -0.0770, -0.0239, -0.0773]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1074, 768])
answer:  lorax
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.8654, -0.4875,

answer:  1962
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.0576e+00,  1.4302e-01,  1.7623e-01,  ...,  1.9086e+00,
           1.4556e+00, -2.1073e-02],
         [ 3.2016e-01, -1.0528e-01,  2.0152e-01,  ...,  2.7372e+00,
           7.1541e-01,  7.5015e-02],
         [ 7.0881e-01, -2.6846e-01,  3.7322e-02,  ...,  1.9177e+00,
           8.6393e-01,  2.9953e-01],
         ...,
         [-5.5624e-02,  1.1231e-01,  7.0610e-02,  ..., -1.7859e-02,
          -6.1876e-03, -2.0681e-01],
         [-9.9059e-03,  9.7754e-02, -1.5034e-03,  ..., -8.0234e-02,
          -3.5907e-02, -6.6191e-02],
         [-1.1325e-02,  6.6563e-02, -3.0937e-04,  ..., -7.6861e-02,
          -3.1286e-02, -8.8759e-02]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1152, 768])
answer:  1960
self.tok

answer:  son of ulf jarl
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.4260,  0.0972, -0.1559,  ...,  2.0478,  1.3582,  0.1868],
         [ 1.1294, -0.2507,  0.2200,  ...,  2.1748,  0.7185,  0.2245],
         [ 0.6608, -0.0921,  0.6265,  ...,  1.3999,  0.7474,  0.0883],
         ...,
         [-0.0262,  0.1808,  0.0226,  ..., -0.0961, -0.0311, -0.2561],
         [ 0.0124,  0.0402, -0.0213,  ..., -0.0846, -0.0419, -0.0712],
         [-0.0137,  0.0685,  0.0031,  ..., -0.1113, -0.0217, -0.0781]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1388, 768])
answer:  1970
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 9.1081

answer:  neil burger
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.1284e+00, -2.3258e-01,  1.8596e-01,  ...,  1.7812e+00,
           1.5284e+00,  3.8164e-01],
         [ 7.6852e-01, -2.4725e-01,  4.4313e-01,  ...,  1.2679e+00,
           1.0015e+00,  3.0861e-01],
         [ 7.9880e-01, -3.1980e-01,  1.5150e-01,  ...,  1.5811e+00,
           8.9756e-01, -1.3436e-01],
         ...,
         [-3.2342e-02,  9.9560e-02,  9.5583e-03,  ..., -5.6335e-02,
          -7.3317e-03, -1.7354e-01],
         [-1.8586e-02,  6.3412e-02, -1.3983e-03,  ..., -9.1795e-02,
          -3.8664e-02, -9.5547e-02],
         [-3.0134e-03,  7.5137e-02,  5.9503e-03,  ..., -8.5936e-02,
          -2.7751e-02, -6.7026e-02]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1367, 768])
answer:  loop p

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  barry hearn
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.9016,  0.0903,  0.2185,  ...,  1.7364,  1.8080, -0.4237],
         [ 0.1163, -0.1187,  0.7803,  ...,  1.5854,  1.0317,  0.0911],
         [ 0.7084, -0.3242,  0.3903,  ...,  1.6051,  0.9287, -0.6134],
         ...,
         [-0.0107,  0.0636,  0.0107,  ..., -0.0551, -0.0188, -0.0765],
         [ 0.1758,  0.1183, -0.1643,  ..., -0.2854,  0.2769, -0.2860],
         [-0.0534,  0.2583, -0.0086,  ...,  0.0795, -0.0731,  0.0722]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1423, 768])
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  ghanaian
size of input_ids: torch.Size([1, 2048])
size of sequence_output: torch.Size([1, 2048, 768])
sequence_output  tensor([[[ 8.6283

size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.1510e+00, -1.6157e-01,  6.1685e-02,  ...,  1.8364e+00,
           1.0018e+00,  5.6397e-01],
         [ 8.8306e-01,  8.0997e-02,  1.8236e-01,  ...,  8.1769e-01,
           1.1007e+00,  8.1656e-02],
         [ 5.8640e-01,  2.0735e-01,  2.9730e-01,  ...,  1.3298e+00,
           6.7635e-01,  6.0020e-02],
         ...,
         [-1.2812e-02,  7.1682e-02, -8.6348e-05,  ..., -7.4096e-02,
          -3.3518e-02, -8.2559e-02],
         [-6.3446e-03,  7.0471e-02, -1.0337e-03,  ..., -1.0515e-01,
          -3.7025e-02, -8.5886e-02],
         [-3.8594e-02,  1.3551e-01,  8.9893e-04,  ..., -2.1769e-02,
          -5.3216e-02, -1.9003e-01]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1148, 768])
size of input_ids: torch.Size([1, 2048])
size of sequence_output: torch.Size([1, 2048, 768])
sequence_output  

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  danny leiner
answer:  lorax
answer:  24 october 1632
self.tokenizer.sep_token:  </s>
answer:  espn
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
answer:  1937
answer:  doug moench and don perlin
answer:  1960
self.tokenizer.sep_token:  </s>
answer:  transcendentalist
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.1392, -0.2462,  0.2018,  ...,  2.3754,  1.50

answer:  4
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.6672, -0.1084,  0.1910,  ...,  2.1101,  1.4428, -0.3145],
         [ 0.7757, -0.0919,  0.4318,  ...,  2.3396,  0.8848, -0.0932],
         [ 0.7349, -0.0382,  0.3293,  ...,  2.1724,  0.8469, -0.2106],
         ...,
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908],
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908],
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1208, 768])
size of input_ids: torch.Size([1, 2048])
answer:  loop parkway
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 2048, 768])
sequence_output  tensor([[[ 1.1120, -0.0534,  0.2133,  ...,  2.1414,  1.5563, -0.1262],
         [ 1.0199,  0.0835, 

self.tokenizer.sep_token:  </s>
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.8047, -0.1440,  0.1618,  ...,  2.1578,  1.5254, -0.0088],
         [ 0.2516,  0.1256,  0.3260,  ...,  2.0580,  0.6949,  0.4185],
         [ 0.3952,  0.0059,  0.3619,  ...,  2.0807,  1.0566,  0.4079],
         ...,
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908],
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908],
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1062, 768])
size of input_ids: torch.Size([1, 1536])
self.tokenizer.sep_token == '</s>':  True
answer:  st johns
answer:  square enix
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.0821, -0.0803,  0.1721,  ...,  2.2894,  1.5303,  0.1557],
         [ 0.5301, -0.

answer:  scottie pippen
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1024])
size of sequence_output: torch.Size([1, 1024, 768])
sequence_output  tensor([[[ 7.3843e-01, -1.2399e-01,  3.1318e-01,  ...,  2.0015e+00,
           1.4795e+00,  1.3839e-03],
         [ 5.3890e-01,  3.8759e-02,  5.0007e-01,  ...,  2.1446e+00,
           7.5191e-01,  1.2804e-01],
         [ 2.5849e-01, -7.1256e-02,  6.7330e-01,  ...,  1.7531e+00,
           1.0923e+00, -1.1377e-01],
         ...,
         [-1.1403e-02,  7.5863e-02, -3.1761e-03,  ..., -7.8063e-02,
          -3.0298e-02, -9.0846e-02],
         [-1.1403e-02,  7.5863e-02, -3.1761e-03,  ..., -7.8063e-02,
          -3.0298e-02, -9.0846e-02],
         [-1.1403e-02,  7.5863e-02, -3.1761e-03,  ..., -7.8063e-02,
          -3.0298e-02, -9.0846e-02]]], device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1002, 768])
answer:  wachovia securities
self.tokenizer.sep_token:

answer:  university of mount union
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1024, 768])
sequence_output  tensor([[[ 0.7737,  0.0992,  0.1637,  ...,  2.4227,  1.4891, -0.0151],
         [ 0.0809,  0.3450,  0.6197,  ...,  2.3629,  0.6056,  0.1472],
         [ 0.7193,  0.1031,  0.3256,  ...,  2.6545,  0.9730, -0.0129],
         ...,
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908],
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908],
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 800, 768])
size of input_ids: torch.Size([1, 1536])
answer:  magnolia pictures
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.1750, -0.1481,  0.3188,  ...,  2.2380,  1.6963, -0.2286],


answer:  munster rugby
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 512, 768])
sequence_output  tensor([[[ 0.9180,  0.0165, -0.0364,  ...,  2.0929,  1.3283,  0.1094],
         [ 0.3095, -0.0447,  0.1324,  ...,  2.4371,  0.8021,  0.3068],
         [ 0.2782, -0.7314,  0.4507,  ...,  0.8432,  1.1774,  0.1155],
         ...,
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908],
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908],
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 508, 768])
size of input_ids: torch.Size([1, 1536])
answer:  july 16 2012
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.8540, -0.0725,  0.2087,  ...,  2.2613,  1.5269, -0.4531],
         [ 0.6635,

answer:  son of ulf jarl
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  buddleja
self.tokenizer.sep_token:  </s>
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.0013e+00, -2.8179e-01,  1.5742e-03,  ...,  1.8705e+00,
           1.6708e+00,  2.4649e-01],
         [ 4.5178e-01,  6.5502e-02,  3.1349e-01,  ...,  2.0372e+00,
           7.7311e-01,  5.0448e-01],
         [ 7.6051e-01,  1.0501e-02,  1.1854e-01,  ...,  1.7489e+00,
           1.0016e+00,  1.2644e-01],
         ...,
         [-1.1403e-02,  7.5862e-02, -3.1760e-03,  ..., -7.8063e-02,
          -3.0298e-02, -9.0846e-02],
         [-1.1403e-02,  7.5862e-02, -3.1760e-03,  ..., -7.8063e-02,
          -3.0298e-02, -9.0846e-02],
         [-1.1403e-02,  7.5862e-02, -3.1760e-03,  ..., -7.8063e-02,
          -3.0298e-02, -9.0846e-02]]], device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1064, 768])
siz

size of input_ids: torch.Size([1, 2048])
size of sequence_output: torch.Size([1, 2048, 768])
sequence_output  tensor([[[ 1.0727,  0.0311,  0.1065,  ...,  2.0810,  1.4333, -0.2047],
         [ 0.5937,  0.0847,  0.4256,  ...,  2.4177,  0.6066,  0.1552],
         [ 0.5148, -0.0584,  0.2404,  ...,  1.4299,  0.6431, -0.0748],
         ...,
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908],
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908],
         [-0.0114,  0.0759, -0.0032,  ..., -0.0781, -0.0303, -0.0908]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1632, 768])
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.9862, -0.1301,  0.0402,  ...,  2.3467,  1.5396, -0.0376],
         [ 0.5199,  0.1110,  0.4470,  ...,  2.5400,  0.7412,  0.3317],
         [ 0.7203,  0.0058,  0.4051,  ...,  1.9690,  0.9473, -0.0246],
         ...,
         [-0


Epoch 00002: avg_val_f1 reached 0.04511 (best 0.04511), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_2.ckpt as top 5

Epoch 00002: avg_val_f1 reached 0.04511 (best 0.04511), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_2.ckpt as top 5


validation_end
before sync --> sizes:  79, 79, 79
after sync --> sizes: 79, 79, 79
avg_loss:  tensor(14.3244, device='cuda:0')	avg_answer_loss:  tensor(5.9333, device='cuda:0')	avg_type_loss:  tensor(0.1849, device='cuda:0')	avg_sp_para_loss:  tensor(0.5102, device='cuda:0')	avg_sp_sent_loss:  tensor(0.2365, device='cuda:0')
avg_val_f1:  0.04510631035023098	avg_val_em:  0.0	avg_val_prec:  0.03630362026676347	avg_val_recall:  0.09398734222956096
avg_val_sp_sent_f1:  0.07747639281840264	avg_val_sp_sent_em:  0.0	avg_val_sp_sent_prec:  0.08462929574749138	avg_val_sp_sent_recall:  0.08649789079835143
avg_val_joint_f1:  0.001100715579865854	avg_val_joint_em:  0.0	avg_val_joint_prec:  0.0006027728130545797	avg_val_joint_recall:  0.006329113924050633
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.7835,  0.0538,  0.4544,  ...,  2.4847,  1.1779, -0.1173],
         [ 0.3815,  0.1453,  0.5660,  ...,  1.8999,  0.9212,  0.5202],
         [ 0.0925, -0.1602,  0.3750,  ...,  1.7785,  0.8473,  0.2340],
         ...,
         [-0.0051,  0.0792,  0.0068,  ..., -0.0750, -0.0328, -0.0767],
         [-0.0053,  0.0782,  0.0072,  ..., -0.0914, -0.0156, -0.0897],
         [-0.0089,  0.0732,  0.0054,  ..., -0.0770, -0.0310, -0.0761]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1103, 768])
answer:  keri russell
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.3387, -0.6171, -0.047

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  inside men
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.9627, -0.5114,  0.0559,  ...,  2.4639,  1.6454, -0.1641],
         [ 0.7961, -0.3430,  0.4840,  ...,  2.3281,  0.9091,  0.3296],
         [ 0.8860, -0.7189,  0.1621,  ...,  2.0059,  0.7723, -0.3418],
         ...,
         [-0.0068,  0.0680, -0.0203,  ..., -0.0914, -0.0316, -0.0882],
         [ 0.0630,  0.2366, -0.1384,  ..., -0.3482,  0.0116, -0.0708],
         [ 0.1348,  0.3720, -0.2439,  ...,  0.2166,  0.1441, -0.1240]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1391, 768])
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  hollywood madam
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 

answer:  herman wouk
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1024])
size of sequence_output: torch.Size([1, 1024, 768])
sequence_output  tensor([[[ 1.1102, -0.1155,  0.0487,  ...,  2.6298,  0.9796,  0.2817],
         [ 0.9352, -0.2721,  0.0489,  ...,  2.3079,  0.9526, -0.2951],
         [ 1.2195, -0.2820,  0.2393,  ...,  2.4878,  1.0829, -0.5350],
         ...,
         [-0.0139,  0.0414,  0.0037,  ..., -0.0786, -0.0319, -0.0829],
         [-0.0380,  0.1244,  0.0220,  ..., -0.0235, -0.0152,  0.0233],
         [ 0.0504,  0.2371,  0.0231,  ..., -0.1012, -0.0427, -0.2153]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 800, 768])
answer:  stephen gyllenhaal and naomi achs
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_o

answer:  lord voldemort
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.2848, -0.1007,  0.2997,  ...,  2.2342,  1.0078, -0.2444],
         [ 0.2577, -0.0250,  0.5702,  ...,  2.1865,  0.9672,  0.0755],
         [ 1.1463, -0.2240,  0.1727,  ...,  2.1963,  0.5592, -0.2557],
         ...,
         [ 0.0098,  0.0422, -0.0109,  ..., -0.0784, -0.0279, -0.0777],
         [-0.0333,  0.3041,  0.0803,  ...,  0.0048, -0.1752, -0.2760],
         [-0.0040,  0.0759,  0.0087,  ..., -0.0732, -0.0248, -0.0858]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1192, 768])
answer:  2415
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  varazdat samuel varaz samuelian
size of input_ids: torch.Size([1, 2048])
size of sequence_output: torch.Size([1, 2048, 

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  lost princess of oz
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 8.1798e-01, -7.0563e-02, -9.3321e-02,  ...,  2.0050e+00,
           1.5802e+00, -5.5982e-01],
         [ 5.3664e-01, -9.5912e-02,  5.3912e-01,  ...,  2.0617e+00,
           6.9953e-01, -6.1381e-01],
         [ 9.2755e-01, -1.7186e-01,  6.7058e-01,  ...,  2.1453e+00,
           8.5316e-01, -5.8836e-01],
         ...,
         [-1.2621e-02,  7.3515e-02,  1.2855e-03,  ..., -8.3610e-02,
          -3.0459e-02, -7.7371e-02],
         [-1.3787e-03,  3.8802e-02, -8.9691e-03,  ..., -7.6498e-02,
          -2.8753e-02, -9.2702e-02],
         [-1.1851e-02,  7.4299e-02,  5.2470e-03,  ..., -7.9751e-02,
          -3.4014e-02, -8.6742e-02]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1229, 768])
self.to

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.8501, -0.0835,  0.0828,  ...,  1.4707,  1.2226, -0.1922],
         [ 0.7963,  0.1153,  0.6807,  ...,  2.0358,  0.1039,  0.0205],
         [ 0.9112,  0.1013,  0.4510,  ...,  0.4813,  0.9083, -0.1379],
         ...,
         [-0.0165,  0.0687, -0.0101,  ..., -0.0809, -0.0143, -0.0795],
         [ 0.0089,  0.0687, -0.0023,  ..., -0.0697, -0.0228,  0.0154],
         [-0.0195,  0.0795,  0.0029,  ..., -0.0781, -0.0278, -0.0891]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1167, 768])
answer:  austria
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  ring
size of input_ids: torch.Size([1, 512])
size of sequence_output: torch.Size([1, 512, 768])
sequence_output  tensor([[[ 4.3348e-01, -9.0

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.7232, -0.1725,  0.3088,  ...,  1.4753,  1.1887, -0.0648],
         [ 0.9198, -0.0712,  0.4070,  ...,  2.1655,  0.5514, -0.4164],
         [ 0.8761, -0.2196,  0.6577,  ...,  2.6381,  0.5352, -0.4927],
         ...,
         [-0.0134,  0.0270, -0.0070,  ..., -0.0645, -0.0418, -0.0748],
         [-0.0093,  0.0628, -0.0070,  ..., -0.0796, -0.0379,  0.0172],
         [ 0.1152,  0.2964, -0.0082,  ...,  0.1650, -0.0563, -0.0865]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1176, 768])
answer:  essex
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.0587,  0.0923,  0.2732,  ...

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  danny leiner
answer:  24 october 1632
answer:  lorax
answer:  espn
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
answer:  1937
answer:  doug moench and don perlin
answer:  1960
answer:  transcendentalist
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.0424, -0.4555,  0.2159,  ...,  2.3180,  1.05

answer:  lashkaretaiba
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  4
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.3107, -0.4209,  0.1681,  ...,  1.9905,  0.8832,  0.0231],
         [ 0.6628, -0.2960,  0.5830,  ...,  2.4835,  0.4851,  0.1762],
         [ 0.7028, -0.3000,  0.5619,  ...,  2.2943,  0.4959,  0.1146],
         ...,
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900],
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900],
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1208, 768])
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 2048])
size of sequence_output: torch.Size([1, 2048, 768])
sequence_output  tensor([[[ 1.0281, -0.2003,  0.2897,  ...,  2.2011,  1.24

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.8079, -0.2109,  0.2277,  ...,  2.2211,  1.1611, -0.0904],
         [ 0.2087, -0.1159,  0.3639,  ...,  2.1966,  0.5096,  0.3711],
         [ 0.2484,  0.0289,  0.3689,  ...,  1.8018,  0.7226,  0.4241],
         ...,
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900],
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900],
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1062, 768])
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  st johns
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.0508, -0.1467,  0.2384,  ...,  2.1588,  1.0387,  0.2158],
  

answer:  wachovia securities
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1024, 768])
sequence_output  tensor([[[ 0.6245, -0.3320,  0.3152,  ...,  1.9181,  1.1538,  0.1307],
         [ 0.4305, -0.2300,  0.4887,  ...,  1.9715,  0.4406,  0.2024],
         [ 0.1164, -0.0895,  0.4165,  ...,  1.2909,  0.7929, -0.0577],
         ...,
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900],
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900],
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1002, 768])
size of input_ids: torch.Size([1, 1536])
answer:  no
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.9043, -0.1352, -0.0208,  ...,  2.4337,  1.0503,  0.1580],
         [ 0.9500, -

answer:  university of mount union
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1024, 768])
sequence_output  tensor([[[ 0.6740, -0.0298,  0.1684,  ...,  2.1853,  1.1095, -0.0029],
         [-0.1306, -0.1003,  0.5517,  ...,  2.2884,  0.3328,  0.4419],
         [ 0.7054, -0.1909,  0.4920,  ...,  2.4610,  0.7534,  0.2496],
         ...,
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900],
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900],
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 800, 768])
size of input_ids: torch.Size([1, 1536])
answer:  magnolia pictures
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.9717, -0.3089,  0.3222,  ...,  2.2771,  1.3036,  0.0631],


answer:  munster rugby
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 512, 768])
sequence_output  tensor([[[ 0.6928, -0.1229,  0.2254,  ...,  1.8356,  0.9088,  0.1641],
         [ 0.1343, -0.2161,  0.0088,  ...,  2.2727,  0.4929,  0.2968],
         [ 0.1372, -0.9410,  0.4374,  ...,  0.6327,  0.9952,  0.1876],
         ...,
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900],
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900],
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 508, 768])
size of input_ids: torch.Size([1, 1536])
answer:  july 16 2012
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.6491, -0.1604,  0.2067,  ...,  2.2373,  1.1194, -0.3101],
         [ 0.6170,

answer:  buddleja
self.tokenizer.sep_token:  </s>
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.9761, -0.4242, -0.1696,  ...,  1.5258,  1.3208,  0.3078],
         [ 0.4122, -0.1861,  0.2440,  ...,  2.0414,  0.4689,  0.4487],
         [ 0.7863, -0.1874,  0.0362,  ...,  1.2699,  0.5501,  0.0619],
         ...,
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900],
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900],
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1064, 768])
size of input_ids: torch.Size([1, 512])
self.tokenizer.sep_token == '</s>':  True
answer:  2001
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 512, 768])
sequence_output  tensor([[[ 4.7080e-01, -2.7773e-01,  4.6903e-01,  ...,  1.902

size of sequence_output: torch.Size([1, 2048, 768])
sequence_output  tensor([[[ 0.9518,  0.1035,  0.1354,  ...,  2.0739,  0.9395, -0.1571],
         [ 0.3315, -0.1335,  0.4621,  ...,  2.5003,  0.4247,  0.2733],
         [ 0.2367, -0.0596,  0.1897,  ...,  1.6608,  0.4943, -0.0812],
         ...,
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900],
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900],
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -0.0310, -0.0900]]],
       device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1632, 768])
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.8949, -0.1426,  0.0260,  ...,  2.3668,  1.1350, -0.1116],
         [ 0.4382, -0.1369,  0.5060,  ...,  2.4666,  0.5425,  0.2740],
         [ 0.6299, -0.1839,  0.5972,  ...,  1.8346,  0.7656,  0.0629],
         ...,
         [-0.0121,  0.0792, -0.0035,  ..., -0.0778, -


Epoch 00003: avg_val_f1 reached 0.07498 (best 0.07498), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_3.ckpt as top 5

Epoch 00003: avg_val_f1 reached 0.07498 (best 0.07498), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_3.ckpt as top 5


validation_end
before sync --> sizes:  79, 79, 79
after sync --> sizes: 79, 79, 79
avg_loss:  tensor(13.6986, device='cuda:0')	avg_answer_loss:  tensor(5.5226, device='cuda:0')	avg_type_loss:  tensor(0.1473, device='cuda:0')	avg_sp_para_loss:  tensor(0.5094, device='cuda:0')	avg_sp_sent_loss:  tensor(0.2346, device='cuda:0')
avg_val_f1:  0.07498217939953261	avg_val_em:  0.012658227848101266	avg_val_prec:  0.07027059779325619	avg_val_recall:  0.14767932514601115
avg_val_sp_sent_f1:  0.07333735329440877	avg_val_sp_sent_em:  0.0	avg_val_sp_sent_prec:  0.0655213990543462	avg_val_sp_sent_recall:  0.09282700472240206
avg_val_joint_f1:  0.010678595144160186	avg_val_joint_em:  0.0	avg_val_joint_prec:  0.011644742439819288	avg_val_joint_recall:  0.0200421942185752
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_tok

answer:  square enix
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.0528, -0.2833,  0.2629,  ...,  2.0799,  1.2015, -0.5403],
         [ 0.9762, -0.1638,  0.3764,  ...,  2.2047,  0.3044, -0.3320],
         [ 0.6156, -0.1091,  0.1510,  ...,  1.6324,  0.4318, -0.4304],
         ...,
         [-0.0198,  0.0617, -0.0240,  ..., -0.0803, -0.0361,  0.0324],
         [-0.0211,  0.0727,  0.0058,  ..., -0.0776, -0.0328, -0.0915],
         [-0.0635,  0.1402, -0.0709,  ..., -0.0525,  0.0094, -0.2262]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1367, 768])
answer:  avengers
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1024])
size of sequence_output: torch.Size([1, 1024, 768])
sequence_output  tensor([[[ 1.3148

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  no
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.0598, -0.4133,  0.1589,  ...,  2.4581,  0.7843, -0.3982],
         [ 0.4673, -0.3501,  0.0547,  ...,  2.2454,  0.3914,  0.3923],
         [ 0.3709, -0.3611, -0.0476,  ...,  1.9552,  0.2966, -0.1052],
         ...,
         [-0.0075,  0.0973,  0.0042,  ..., -0.0865, -0.0309, -0.0917],
         [ 0.0041,  0.0477,  0.0052,  ..., -0.0912, -0.0296, -0.0887],
         [-0.0030,  0.0430, -0.0037,  ..., -0.0566, -0.0165, -0.0850]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1290, 768])
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  szombathelyi haladás
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.8

answer:  1905
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 8.4125e-01,  5.9316e-02, -1.6538e-01,  ...,  2.4689e+00,
           1.2399e+00, -1.4505e-01],
         [ 5.0705e-01, -2.3856e-01,  8.4411e-02,  ...,  2.5240e+00,
           5.3833e-01, -2.6543e-01],
         [ 4.6763e-01,  2.1246e-01,  1.2483e-01,  ..., -4.0701e-02,
          -1.0737e-01, -1.6721e-02],
         ...,
         [ 9.4352e-03,  5.5283e-02,  4.9242e-04,  ..., -1.1512e-01,
          -2.5555e-02, -6.4949e-02],
         [-9.1398e-03,  8.2522e-02, -6.0614e-03,  ..., -1.0596e-01,
          -1.9809e-02, -9.9806e-02],
         [ 1.7873e-01,  1.6766e-01, -4.9569e-02,  ..., -2.3220e-01,
           2.5180e-01, -6.5462e-01]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1167, 768])
answer:  ariana grande

answer:  4
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 3.0929e-01, -4.5250e-01, -5.8019e-01,  ...,  1.7013e+00,
           1.0348e+00, -1.0396e-01],
         [ 1.8184e-01, -1.9372e-01, -2.9682e-01,  ...,  2.0651e+00,
           4.8997e-01, -4.4175e-01],
         [ 7.4295e-01, -4.4075e-01,  2.3658e-01,  ...,  2.7709e+00,
           7.7568e-01, -7.7691e-01],
         ...,
         [ 1.3538e-02,  8.1496e-02,  2.5599e-03,  ..., -7.2057e-02,
          -1.9679e-02, -7.7635e-02],
         [-1.7517e-02,  6.3995e-02,  6.4496e-03,  ..., -7.0818e-02,
          -3.5833e-02, -8.4630e-02],
         [-2.5637e-02,  2.7828e-01,  1.3422e-02,  ..., -9.5473e-05,
          -1.0221e-01, -3.4008e-01]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
answer:  flowering plants
size of sequence_output after removing padding: torch.Size([1, 1343, 768])

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.2026, -0.6421,  0.2054,  ...,  1.7551,  1.3755,  0.0127],
         [ 0.3609, -0.4284,  0.0156,  ...,  2.0066,  0.7713, -0.1801],
         [ 0.0949, -0.1990, -0.0213,  ...,  1.4364,  1.0639, -0.2200],
         ...,
         [ 0.5033,  0.5974, -0.2477,  ..., -0.6534,  0.5006, -0.3042],
         [-0.0086,  0.0764,  0.0025,  ..., -0.0726, -0.0214,  0.0171],
         [ 0.1176,  0.3477, -0.0757,  ..., -0.0693,  0.0679, -0.3312]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1181, 768])
answer:  dog
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.1991, -0.2856,  0.1448,  ..., 

answer:  2415
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 9.7407e-01, -4.8335e-01,  1.2060e-01,  ...,  2.0547e+00,
           1.4301e+00,  5.1952e-01],
         [ 1.6624e-01,  9.2799e-02,  2.0663e-01,  ...,  2.1864e+00,
           7.1133e-01,  9.0885e-02],
         [ 5.3598e-01, -1.7446e-01, -6.0264e-03,  ...,  1.5009e+00,
           1.0471e+00, -3.9316e-01],
         ...,
         [-3.8265e-02,  1.1998e-01,  2.9921e-02,  ..., -9.5692e-03,
          -7.8848e-03,  1.4359e-02],
         [-4.0607e-03,  1.9540e-01,  4.0787e-02,  ..., -6.1933e-02,
          -5.3576e-02, -2.2539e-01],
         [ 2.1003e-02,  7.2559e-02,  1.0935e-04,  ..., -7.8986e-02,
          -2.2706e-02, -9.2420e-02]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1456, 768])
answer:  loop parkway


answer:  paul winters
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  farinelli
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 6.2747e-01, -1.2794e-01, -2.3343e-01,  ...,  1.8084e+00,
           8.4396e-01, -9.5572e-01],
         [ 4.0990e-01,  2.3155e-01,  5.0321e-01,  ...,  1.3082e+00,
           9.3999e-01, -7.6447e-01],
         [ 4.4826e-01,  1.1984e-01,  2.7947e-01,  ...,  9.2732e-01,
           6.4478e-01, -4.8353e-01],
         ...,
         [ 1.5968e-04,  7.6832e-02,  4.0715e-03,  ..., -8.3675e-02,
          -2.5557e-02, -8.6370e-02],
         [-4.1598e-03,  6.7217e-02,  6.4060e-03,  ..., -8.5320e-02,
          -2.5618e-02, -8.1694e-02],
         [-1.4951e-02,  7.1047e-02, -1.6048e-03,  ..., -7.1891e-02,
           7.1204e-04, -1.0027e-01]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1423, 7

size of sequence_output: torch.Size([1, 2048, 768])
sequence_output  tensor([[[ 3.5369e-01,  4.7943e-01, -1.7919e-01,  ...,  2.3155e-01,
           6.5502e-01, -6.1775e-01],
         [ 2.3969e-02,  8.0628e-02,  4.0490e-01,  ...,  2.9501e+00,
           4.4497e-01, -3.0574e-01],
         [ 5.4992e-01,  1.7431e-01,  6.4992e-02,  ...,  1.8837e+00,
           6.6524e-01, -7.0002e-01],
         ...,
         [-8.9777e-03,  7.1144e-02, -6.9282e-03,  ..., -7.7825e-02,
          -5.2779e-02, -7.4710e-02],
         [ 1.2854e-03,  1.0841e-01,  5.2166e-03,  ..., -7.2124e-02,
          -1.7918e-02, -8.2386e-02],
         [-1.9368e-02,  7.0620e-02, -1.0736e-03,  ..., -7.6831e-02,
          -3.6595e-02, -8.7421e-02]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward>)
size of sequence_output after removing padding: torch.Size([1, 1632, 768])
size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 0.0823,  0.0802, -0.0671,  ...

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
answer:  danny leiner
answer:  24 october 1632
answer:  lorax
self.tokenizer.sep_token:  </s>
answer:  espn
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of input_ids: torch.Size([1, 1536])
answer:  1937
answer:  doug moench and don perlin
answer:  1960
self.tokenizer.sep_token:  </s>
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 3.5204e-01, -1.4802e-01,  1.7774e-01,  ...,  1.1993e+00,
           7.3000e-01, -5.8875e-01],
         [ 4.2601e-02

answer:  flowering plants
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 2.9501e-02,  3.0134e-01,  1.2734e-01,  ...,  1.1234e+00,
           6.1933e-01, -1.1520e-01],
         [-3.9318e-01, -1.2880e-01,  5.9712e-01,  ...,  2.1821e+00,
           4.0817e-01,  4.1271e-02],
         [ 7.8656e-01,  8.5545e-02,  2.2403e-01,  ...,  5.8448e-01,
           6.7730e-01, -2.3229e-01],
         ...,
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02]]], device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1266, 768])
size of input_ids: torch.Size([1, 1536])
answer:  lashkaretaiba
self.tokenizer.sep_token:  </

answer:  herman wouk
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.5634e-01,  5.1630e-01, -1.0805e-01,  ...,  1.5184e+00,
           6.4464e-01, -2.3778e-01],
         [-4.1595e-01, -4.1691e-02,  6.9524e-01,  ...,  2.3077e+00,
           4.2728e-01, -4.9361e-01],
         [ 5.2433e-03, -2.6497e-01,  2.4335e-01,  ...,  1.9733e+00,
           3.8082e-01, -6.3942e-01],
         ...,
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02]]], device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1182, 768])
size of input_ids: torch.Size([1, 1024])
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token 

answer:  neil burger
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 2048, 768])
sequence_output  tensor([[[ 3.4599e-02,  3.3027e-01, -3.3168e-01,  ...,  1.3346e+00,
           4.5496e-01, -4.2043e-01],
         [ 7.2414e-01, -3.3482e-01,  3.1918e-01,  ...,  2.0978e+00,
           3.9661e-01, -4.5221e-01],
         [-2.0686e-01,  3.9170e-01,  1.0122e+00,  ...,  1.2127e+00,
           1.7118e-01, -9.2153e-02],
         ...,
         [-1.1996e-02,  7.7738e-02, -4.0490e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0490e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0490e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02]]], device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1791, 768])
size of input_ids: torch.Size([1, 1536])
answer:  amélie simone mauresmo born 5 july 1979 is frenc

answer:  hollywood madam
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1024, 768])
sequence_output  tensor([[[-2.4296e-02,  1.9469e-01, -9.2838e-02,  ...,  1.7975e-01,
           1.8925e-02,  3.5463e-01],
         [ 1.5016e-01, -2.5745e-01,  6.4209e-01,  ...,  1.2922e+00,
           2.5783e-01, -7.8626e-01],
         [-6.1323e-02,  3.4983e-01,  1.8891e-02,  ...,  7.3464e-01,
           5.8203e-01,  5.2032e-01],
         ...,
         [-1.1996e-02,  7.7738e-02, -4.0507e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0507e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0507e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02]]], device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 781, 768])
size of input_ids: torch.Size([1, 2048])
answer:  toledo
self.tokenizer.sep_token:  </s>
self.t

answer:  magnolia pictures
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 3.9023e-01, -2.1002e-01,  4.4520e-02,  ...,  1.7342e+00,
           1.3665e+00, -5.1715e-01],
         [ 4.5368e-01, -2.4704e-01,  2.3303e-01,  ...,  1.8428e+00,
           5.0527e-01, -1.1399e-01],
         [ 5.7933e-01, -2.3682e-01,  4.1701e-01,  ...,  7.8219e-01,
           8.2615e-01, -2.2113e-01],
         ...,
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02]]], device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1192, 768])
size of input_ids: torch.Size([1, 1536])
answer:  1970
self.tokenizer.sep_token:  </s>
self.

answer:  chihuahua
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 3.2743e-01, -2.2804e-03, -1.0871e-01,  ...,  1.9131e+00,
           1.0277e+00, -1.0201e+00],
         [ 5.3249e-01, -3.8220e-01,  3.9335e-01,  ...,  2.1007e+00,
           4.4766e-01, -3.5683e-01],
         [ 1.2716e-01, -1.0642e-01,  3.0682e-01,  ...,  1.4667e+00,
           3.7821e-01, -3.9003e-01],
         ...,
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02]]], device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1367, 768])
size of input_ids: torch.Size([1, 512])
answer:  munster rugby
self.tokenizer.sep_token:  </s>
self.

answer:  christmas
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
answer:  szombathelyi haladás
size of input_ids: torch.Size([1, 1024])
size of sequence_output: torch.Size([1, 1024, 768])
sequence_output  tensor([[[-2.7290e-01,  3.8575e-01,  1.5126e-02,  ...,  1.0338e+00,
           3.2483e-01,  1.0454e-01],
         [-7.2214e-01, -1.2168e-02,  5.6574e-01,  ...,  1.2897e+00,
           2.7939e-02, -6.6053e-01],
         [-1.5036e-01, -3.6920e-01,  4.2181e-01,  ...,  6.0600e-01,
           4.0917e-01, -3.4312e-01],
         ...,
         [-1.1996e-02,  7.7738e-02, -4.0507e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0507e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0507e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02]]], device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 656, 768])
self.tokenizer.sep_token:  </s

size of input_ids: torch.Size([1, 1536])
size of sequence_output: torch.Size([1, 1536, 768])
sequence_output  tensor([[[ 1.9977e-02,  2.2706e-01,  8.1088e-02,  ...,  1.3812e+00,
           5.6507e-01, -1.9139e-01],
         [-4.6589e-01,  3.1808e-01,  8.0837e-01,  ...,  1.2226e+00,
           2.8616e-01, -3.6811e-01],
         [ 3.3312e-01,  4.2245e-01,  5.6109e-02,  ...,  2.0793e-01,
           1.3103e-01, -3.0385e-01],
         ...,
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02],
         [-1.1996e-02,  7.7738e-02, -4.0489e-04,  ..., -7.5476e-02,
          -3.0758e-02, -8.7542e-02]]], device='cuda:0')
size of sequence_output after removing padding: torch.Size([1, 1457, 768])
size of input_ids: torch.Size([1, 512])
size of sequence_output: torch.Size([1, 512, 768])
sequence_output  tensor([[[ 1.5437e-01,  1.3683e-01,  1.2137e


Epoch 00004: avg_val_f1 reached 0.10303 (best 0.10303), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_4.ckpt as top 5

Epoch 00004: avg_val_f1 reached 0.10303 (best 0.10303), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_4.ckpt as top 5


validation_end
before sync --> sizes:  79, 79, 79
after sync --> sizes: 79, 79, 79
avg_loss:  tensor(13.1980, device='cuda:0')	avg_answer_loss:  tensor(5.0585, device='cuda:0')	avg_type_loss:  tensor(0.1420, device='cuda:0')	avg_sp_para_loss:  tensor(0.5090, device='cuda:0')	avg_sp_sent_loss:  tensor(0.2339, device='cuda:0')
avg_val_f1:  0.10302699894844732	avg_val_em:  0.05063291139240506	avg_val_prec:  0.0959462461686587	avg_val_recall:  0.16962025324000587
avg_val_sp_sent_f1:  0.0750251165296458	avg_val_sp_sent_em:  0.012658227848101266	avg_val_sp_sent_prec:  0.06962025373042384	avg_val_sp_sent_recall:  0.08438818628274941
avg_val_joint_f1:  0.024532501974815053	avg_val_joint_em:  0.012658227848101266	avg_val_joint_prec:  0.021057560026079794	avg_val_joint_recall:  0.04092827014908006
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_token:  </s>
self.tokenizer.sep_token == '</s>':  True
self.tokenizer.sep_to

In [None]:
#     trainer.test(model)

In [None]:
### To install apex ### 
#     !git clone https://github.com/NVIDIA/apex
#     os.chdir("/xdisk/msurdeanu/fanluo/hotpotQA/apex/")
#     !module load cuda101/neuralnet/7/7.6.4  
#     !module load cuda10.1/toolkit/10.1.243 
#     !conda install -c conda-forge cudatoolkit-dev --yes
#     !conda install -c conda-forge/label/cf201901 cudatoolkit-dev --yes
#     !conda install -c conda-forge/label/cf202003 cudatoolkit-dev --yes
#     !which nvcc
#     !python -m pip install -v --no-cache-dir ./
#     os.chdir("/xdisk/msurdeanu/fanluo/hotpotQA/")

In [40]:
# debug: check args
import shlex
argString ='--train_dataset small.json --dev_dataset small.json  \
    --gpus 0 --num_workers 4 \
    --max_seq_len 4096 --doc_stride -1  \
    --save_prefix hotpotqa-longformer_jupyter  --model_path /xdisk/msurdeanu/fanluo/hotpotQA/longformer-base-4096'
# hotpot_dev_distractor_v1.json

import argparse 
if __name__ == "__main__":
    main_arg_parser = argparse.ArgumentParser(description="hotpotqa")
    parser = hotpotqa.add_model_specific_args(main_arg_parser, os.getcwd())
    args = parser.parse_args(shlex.split(argString)) 
    for arg in vars(args):
        print((arg, getattr(args, arg)))
    main(args)

('save_dir', 'jupyter-hotpotqa')
('save_prefix', 'hotpotqa-longformer_jupyter')
('train_dataset', 'small.json')
('dev_dataset', 'small.json')
('batch_size', 2)
('gpus', '0')
('warmup', 1000)
('lr', 5e-05)
('val_every', 1.0)
('val_percent_check', 1.0)
('num_workers', 4)
('seed', 1234)
('epochs', 6)
('max_seq_len', 4096)
('max_doc_len', 4096)
('max_num_answers', 64)
('max_question_len', 55)
('doc_stride', -1)
('ignore_seq_with_no_answers', False)
('disable_checkpointing', False)
('n_best_size', 20)
('max_answer_length', 30)
('regular_softmax_loss', False)
('test', False)
('model_path', '/xdisk/msurdeanu/fanluo/hotpotQA/longformer-base-4096')
('no_progress_bar', False)
('attention_mode', 'sliding_chunks')
('fp32', False)
('train_percent', 1.0)


### Sandbox

In [None]:
import torch
from longformer.longformer import Longformer, LongformerConfig
from longformer.sliding_chunks import pad_to_window_size
from transformers import RobertaTokenizer

config = LongformerConfig.from_pretrained('/xdisk/msurdeanu/fanluo/hotpotQA/longformer-base-4096') 
# choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
# 'n2': for regular n2 attantion
# 'tvm': a custom CUDA kernel implementation of our sliding window attention
# 'sliding_chunks': a PyTorch implementation of our sliding window attention
config.attention_mode = 'sliding_chunks'

In [None]:
model = Longformer.from_pretrained('/xdisk/msurdeanu/fanluo/hotpotQA/longformer-base-4096', config=config)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer.model_max_length = model.config.max_position_embeddings

In [None]:
SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document

input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

# TVM code doesn't work on CPU. Uncomment this if `config.attention_mode = 'tvm'`
model = model.cuda() 
input_ids = input_ids.cuda()

In [None]:
# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
attention_mask[:, [1, 4, 21,]] =  2  # Set global attention based on the task. For example,
                                     # classification: the <s> token
                                     # QA: question tokens

# padding seqlen to the nearest multiple of 512. Needed for the 'sliding_chunks' attention
input_ids, attention_mask = pad_to_window_size(
        input_ids, attention_mask, config.attention_window[0], tokenizer.pad_token_id)

In [None]:
output = model(input_ids, attention_mask=attention_mask)[0]
output