In [1]:
# increase the cell width 
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; } </style>"))   

# need to run this every time start this notebook, to add python3.7/site-packages to sys.pat, in order to import ipywidgets, which is used when RobertaTokenizer.from_pretrained('roberta-base') 
import sys
# sys.path.insert(0, '/xdisk/msurdeanu/fanluo/miniconda3/envs/hotpotqa/lib/python3.7/site-packages') 

### convert hotpotqa to squard format

According to Longformer: use the following input format with special tokens:  “[CLS] [q] question [/q] [p] sent1,1 [s] sent1,2 [s] ... [p] sent2,1 [s] sent2,2 [s] ...” 
where [s] and [p] are special tokens representing sentences and paragraphs. The special tokens were added to the RoBERTa vocabulary and randomly initialized before task finetuning.

In [2]:
# helper functions to convert hotpotqa to squard format modified from  https://github.com/chiayewken/bert-qa/blob/master/run_hotpot.py

import tqdm 
from datetime import datetime 
import pytz 
timeZ_Az = pytz.timezone('US/Mountain') 
#!pip install -U transformers
#!pip install torch==1.6.0 torchvision==0.7.0
import transformers 

QUESTION_START = '[question]'
QUESTION_END = '[/question]' 
TITLE_START = '<t>'  # indicating the start of the title of a paragraph (also used for loss over paragraphs)
TITLE_END = '</t>'   # indicating the end of the title of a paragraph
SENT_MARKER_END = '[/sent]'  # indicating the end of the title of a sentence (used for loss over sentences)
PAR = '[/par]'  # used for indicating end of the regular context and beginning of `yes/no/null` answers
EXTRA_ANSWERS = " yes no null"

 
def create_example_dict(context, answer, id, question, is_sup_fact, is_supporting_para):
    return {
        "context": context,
        "qas": [                        # each context corresponds to only one qa in hotpotqa
            {
                "answer": answer,
                "id": id,
                "question": question,
                "is_sup_fact": is_sup_fact,
                "is_supporting_para": is_supporting_para
            }
        ],
    }

def create_para_dict(example_dicts):
    if type(example_dicts) == dict:
        example_dicts = [example_dicts]   # each paragraph corresponds to only one [context, qas] in hotpotqa
    return {"paragraphs": example_dicts}   


In [3]:
# !python -m pip install tqdm 
# !python -m pip install git+https://github.com/allenai/longformer.git 
# !python -m pip install pytorch-lightning==0.6.0
# !python -m pip install jdc  
# !wget https://ai2-s2-research.s3-us-west-2.amazonaws.com/longformer/longformer-base-4096.tar.gz
# !tar -xf longformer-base-4096.tar.gz

In [4]:
import re
import string

def convert_hotpot_to_squad_format(json_dict, gold_paras_only=False):
    
    """function to convert hotpotqa to squard format.


    Note: A context corresponds to several qas in SQuard. In hotpotqa, one question corresponds to several paragraphs as context. 
          "paragraphs" means different: each paragraph in SQuard contains a context and a list of qas; while 10 paragraphs in hotpotqa concatenated into a context for one question.

    Args:
        json_dict: The original data load from hotpotqa file.
        gold_paras_only: when is true, only use the 2 paragraphs that contain the gold supporting facts; if false, use all the 10 paragraphs
 

    Returns:
        new_dict: The converted dict of hotpotqa dataset, use it as a dict would load from SQuAD json file
                  usage: input_data = new_dict["data"]   https://github.com/google-research/bert/blob/eedf5716ce1268e56f0a50264a88cafad334ac61/run_squad.py#L230

    """
 
    new_dict = {"data": []} 
    for example in json_dict: 

        support_para = set(
            para_title for para_title, _ in example["supporting_facts"]
        )
        sp_set = set(list(map(tuple, example['supporting_facts'])))
        
        raw_contexts = example["context"]
        if gold_paras_only: 
            raw_contexts = [lst for lst in raw_contexts if lst[0] in support_para]
            
        is_supporting_para = []  # a boolean list with 10 True/False elements, one for each paragraph
        is_sup_fact = []         # a boolean list with True/False elements, one for each context sentence
        for para_title, para_lines in raw_contexts:
            is_supporting_para.append(para_title in support_para)   
            for sent_id, sent in enumerate(para_lines):
                is_sup_fact.append( (para_title, sent_id) in sp_set )    
        
        contexts = []   
        for para_id, para in enumerate(raw_contexts):   
            title = _normalize_text(para[0])    
            sents = [_normalize_text(sent) for sent in para[1]] 
            
            if("kept_para_sent" in example):    # reduceded context 
                sent_joint = '' 
                for sent_id, sent in enumerate(sents):  
                    if(sent_id > 0 and example["kept_para_sent"][para_id][sent_id] - example["kept_para_sent"][para_id][sent_id-1] > 1):    
                        sent_joint += (' </s> ' + sent + ' ' + SENT_MARKER_END )   # </s> indicates at least one sentence is removed    
                    else:   
                        sent_joint += (sent + ' ' + SENT_MARKER_END )   
            else:   
                sent_joint =  (' ' + SENT_MARKER_END +' ').join(sents) + ' ' + SENT_MARKER_END      
                
            contexts.append(TITLE_START + ' ' + title + ' ' + TITLE_END + ' ' +  sent_joint)    
        # extra space is fine, which would be ignored latter. most sentences has already have heading space, there are several no heading space; call the _normalize_text() which is same as the one used during evaluation
        
#         context = " </s> ".join(contexts)
#         print(context)
        
#         exit(0)

        
        answer = _normalize_text(example["answer"]) 
#         print("answer: ", answer)
        if(len(answer) > 0):   # answer can be '' after normalize
            new_dict["data"].append(
                create_para_dict(
                    create_example_dict(
                        context=contexts,
                        answer=answer,
                        id = example["_id"],
                        question=_normalize_text(example["question"]),
                        is_sup_fact = is_sup_fact,
                        is_supporting_para = is_supporting_para 
                    )
                )
            ) 

    return new_dict

def _normalize_text(s):

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

In [5]:
# debug: check whether convert_hotpot_to_squad_format() works
import os
os.chdir('/xdisk/msurdeanu/fanluo/hotpotQA/Data')
!cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/hotpot_train_v1.1.json | ../../helper/jq-linux64 -c '.[1:100]' > train100.json
#!cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/hotpot_train_v1.1.json | ../../helper/jq-linux64 -c '.[37:50]' > small_dev.json
# !cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/hotpot_train_v1.1.json | ../../helper/jq-linux64 -c '.[31:50]' > sample.json
# !cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/reduced_questions/hotpot_reduced_context_04-08-2021-01:12:53/hotpot_dev_reduced_context_coref_fuzzy.json | ../../helper/jq-linux64 -c '.[6666:7000]' > small_dev.json
# !cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/reduced_questions/hotpot_reduced_context_04-08-2021-01:12:53/hotpot_dev_reduced_context_coref_fuzzy.json | ../../helper/jq-linux64 -c '.[1515:1525]' > small_dev3.json
# !cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/reduced_questions/hotpot_reduced_context_04-08-2021-01:12:53/hotpot_dev_reduced_context_coref_fuzzy.json | ../../helper/jq-linux64 -c '.[6800:7405]' > small_end.json    
# !cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/hotpot_dev_distractor_v1.json | ../../helper/jq-linux64 -c '.[6800:7405]' > small_end.json  
    
    
import json
with open("small.json", "r", encoding='utf-8') as f:  
    json_dict = convert_hotpot_to_squad_format(json.load(f))['data']
    print(json.dumps(json_dict[3], indent=2))

{
  "paragraphs": [
    {
      "context": [
        "<t> cadmium chloride </t> cadmium chloride is white crystalline compound of cadmium and chlorine with formula cdcl [/sent] it is hygroscopic solid that is highly soluble in water and slightly soluble in alcohol [/sent] although it is considered to be ionic it has considerable covalent character to its bonding [/sent] crystal structure of cadmium chloride described below composed of twodimensional layers of ions is reference for describing other crystal structures [/sent] also known are cdcl\u2022ho and cdcl\u20225ho [/sent]",
        "<t> water blue </t> water blue also known as aniline blue acid blue 22 soluble blue 3m marine blue v or ci 42755 is chemical compound used as stain in histology [/sent] water blue stains collagen blue in tissue sections [/sent] it is soluble in water and slightly soluble in ethanol [/sent]",
        "<t> diflucortolone valerate </t> diflucortolone valerate also nerisone creamoily creamointment neriderm

### longfomer's fine-tuning


- For answer span extraction we use BERT’s QA model with addition of a question type (yes/no/span) classification head over the first special token ([CLS]).

- For evidence extraction we apply 2 layer feedforward networks on top of the representations corresponding to sentence and paragraph tokens to get the corresponding evidence prediction scores and use binary cross entropy loss to train the model.

- We combine span, question classification, sentence, and paragraphs losses and train the model in a multitask way using linear combination of losses.


In [6]:
### Section2: This is modified from longfomer's fine-tuning with triviaqa.py from https://github.com/allenai/longformer/blob/master/scripts/triviaqa.py

# !pip uninstall longformer -y
# !python -m pip uninstall longformer -y
# !pip install git+https://github.com/allenai/longformer.git 
# !python -m pip uninstall pytorch-lightning -y
# !pip uninstall pytorch-lightning -y
# !python -m pip install git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning
#!pip install torch==1.6.0 torchvision==0.7.0
 


####requirements.txt:torch>=1.2.0, transformers>=3.0.2, tensorboardX, pytorch-lightning==0.6.0, test-tube==0.7.5
# !conda install transformers --yes
# !conda install cudatoolkit=10.0 --yes
# !python -m pip install git+https://github.com/allenai/longformer.git
# !conda install -c conda-forge regex --force-reinstall --yes
# !conda install pytorch-lightning -c conda-forge
#!python -m pip install jdc 
# !pip install test-tube 
#!python -m pip install ipywidgets 
# !conda update --force conda --yes  
# !jupyter nbextension enable --py widgetsnbextension 
# !conda install jupyter --yes


import os
import json
import string
import random
import numpy as np
import torch
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader, Dataset 

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel
from pytorch_lightning.logging import TestTubeLogger    # sometimes pytorch_lightning.loggers works instead

from longformer.longformer import Longformer, LongformerConfig
from longformer.sliding_chunks import pad_to_window_size
from transformers import RobertaTokenizer
import jdc
from more_itertools import locate
from collections import Counter
from collections import defaultdict

In [7]:
print(pl.__file__)

/home/u32/fanluo/.local/lib/python3.6/site-packages/pytorch_lightning/__init__.py


#### class hotpotqaDataset

##### \_\_init\_\_, \_\_getitem\_\_ and \_\_len\_\_ 

In [8]:
class hotpotqaDataset(Dataset):
    """
    Largely based on
    https://github.com/allenai/allennlp/blob/master/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py
    and
    https://github.com/huggingface/transformers/blob/master/examples/run_squad.py
    """
    
    
    def __init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride,
                 max_num_answers, ignore_seq_with_no_answers, max_question_len):
        assert os.path.isfile(file_path)
        self.file_path = file_path
#         if("reduced_context" not in self.file_path):
        with open(self.file_path, "r", encoding='utf-8') as f:
            print(f'reading file: {self.file_path}')
            self.data_json = convert_hotpot_to_squad_format(json.load(f))['data']
                
#         else:
#             with open(self.file_path, "r", encoding='utf-8') as f:
#                 print(f'reading file: {self.file_path}')
#                 self.data_json = json.load(f)['data']            
#                 print(self.data_json[0])
            
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.max_doc_len = max_doc_len
        self.doc_stride = doc_stride
        self.max_num_answers = max_num_answers
        self.ignore_seq_with_no_answers = ignore_seq_with_no_answers
        self.max_question_len = max_question_len


#         print(tokenizer.all_special_tokens) 
    
        # A mapping from qid to an int, which can be synched across gpus using `torch.distributed`
        if 'train' not in self.file_path:  # only for the evaluation set 
            self.val_qid_string_to_int_map =                  {
                    entry["paragraphs"][0]['qas'][0]['id']: index
                    for index, entry in enumerate(self.data_json)
                }
        else:
            self.val_qid_string_to_int_map = None
            
            
    def __len__(self):
        return len(self.data_json)

    def __getitem__(self, idx):
        entry = self.data_json[idx]
        tensors_list = self.one_example_to_tensors(entry, idx)
        if(len(tensors_list) != 1):
            print("tensors_list: ", tensors_list)
        assert len(tensors_list) == 1
        return tensors_list[0]

##### one_example_to_tensors

In [9]:
    %%add_to hotpotqaDataset
    def one_example_to_tensors(self, example, idx):
        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
                return True
            return False
        
        def map_answer_positions(char_to_word_offset, orig_to_tok_index, answer_start, answer_end, slice_start, slice_end, doc_offset):
            # char offset to word offset
            if(answer_start >= len(char_to_word_offset)):
                print("answer_start: ", answer_start)
                print("len(char_to_word_offset): ", len(char_to_word_offset))
            # char offset to word offset
            start_word_position = char_to_word_offset[answer_start]
            end_word_position = char_to_word_offset[answer_end-1] 

#             print("start_word_position: ", start_word_position)
#             print("end_word_position: ", end_word_position)
            # sub_tokens postion reletive to context
            tok_start_position_in_doc = orig_to_tok_index[start_word_position]  
            not_end_of_doc = int(end_word_position + 1 < len(orig_to_tok_index))
            tok_end_position_in_doc = orig_to_tok_index[end_word_position + not_end_of_doc] - not_end_of_doc
            
            if tok_start_position_in_doc < slice_start or tok_end_position_in_doc > slice_end:
                return (-1, -1) # this answer is outside the current slice                     
            
            # sub_tokens postion reletive to begining of all the tokens, including query sub tokens  
            start_position = tok_start_position_in_doc + doc_offset  
            end_position = tok_end_position_in_doc + doc_offset
            
            return (start_position, end_position)
        
#         print("idx: ", idx)
#         print("len(example): ", "len(example)")
        if(len(example["paragraphs"])==0):
            print("idx: ", idx, "'s len(example[‘paragraphs’])==0")

        tensors_list = []
        for paragraph in example["paragraphs"]:  # example["paragraphs"] only contains one paragraph in hotpotqa
            # print("for paragraph in example['paragraphs']: ") 
            context = self.tokenizer.sep_token + ' ' + (' ' + self.tokenizer.sep_token + ' ').join(paragraph["context"] )   
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in context:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c) # add a new token
                    else:
                        doc_tokens[-1] += c  # append the character to the last token
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
            
#             print("len(char_to_word_offset): ", len(char_to_word_offset))
#             print("char_to_word_offset: ", char_to_word_offset)
            for qa in paragraph["qas"]:
                question_text = qa["question"]
                # print("question text: ", question_text)  
                sp_sent = qa["is_sup_fact"]
                sp_para = qa["is_supporting_para"]
                start_position = None
                end_position = None
                orig_answer_text = None 

#                     print("len(sp_sent):", len(sp_sent))
#                     print("sp_sent", sp_sent) 
#                     print("doc_tokens", doc_tokens)
 
                # keep all answers in the document, not just the first matched answer. It also added the list of textual answers to make evaluation easy.
                
                   
                # ===== Given an example, convert it into tensors  =============
                 
                query_tokens = self.tokenizer.tokenize(question_text)
                query_tokens = query_tokens[:self.max_question_len]
                tok_to_orig_index = []
                orig_to_tok_index = []
                all_doc_tokens = []
                
                # each original token in the context is tokenized to multiple sub_tokens
                for (i, token) in enumerate(doc_tokens):
                    orig_to_tok_index.append(len(all_doc_tokens))
                    # hack: the line below should have been `self.tokenizer.tokenize(token')`
                    # but roberta tokenizer uses a different subword if the token is the beginning of the string
                    # or in the middle. So for all tokens other than the first, simulate that it is not the first
                    # token by prepending a period before tokenizing, then dropping the period afterwards
                    sub_tokens = self.tokenizer.tokenize(f'. {token}')[1:] if i > 0 else self.tokenizer.tokenize(token)
                    for sub_token in sub_tokens:
                        tok_to_orig_index.append(i)
                        all_doc_tokens.append(sub_token)
                
                # all sub tokens, truncate up to limit
                all_doc_tokens = all_doc_tokens[:self.max_doc_len-7] 

                # The -7 accounts for CLS, QUESTION_START, QUESTION_END， [/par]， yes， no， </s>   
                max_tokens_per_doc_slice = self.max_seq_len - len(query_tokens) - 7
                if(max_tokens_per_doc_slice <= 0):
                    print("(max_tokens_per_doc_slice <= 0)")
                assert max_tokens_per_doc_slice > 0
                if self.doc_stride < 0:                           # default
                    # negative doc_stride indicates no sliding window, but using first slice
                    self.doc_stride = -100 * len(all_doc_tokens)  # large -negtive value for the next loop to execute once
                
                # inputs to the model
                input_ids_list = []
                input_mask_list = []
                segment_ids_list = []
                start_positions_list = []
                end_positions_list = []
                q_type_list = []
                sp_sent_list =  [1 if ss else 0 for ss in sp_sent]
                sp_para_list = [1 if sp else 0 for sp in sp_para]
                
                if(len(all_doc_tokens) == 0):
                    print("idx: ", idx, " len(all_doc_tokens) == 0")
#               
                
                for slice_start in range(0, len(all_doc_tokens), max_tokens_per_doc_slice - self.doc_stride):    # execute once by default
                
                    # print("slice_start in range") 
                    slice_end = min(slice_start + max_tokens_per_doc_slice, len(all_doc_tokens))

                    doc_slice_tokens = all_doc_tokens[slice_start:slice_end]
                    tokens = [self.tokenizer.cls_token] + [QUESTION_START] + query_tokens + [QUESTION_END] + doc_slice_tokens + [PAR] + self.tokenizer.tokenize("yes") + self.tokenizer.tokenize("no") + [self.tokenizer.eos_token]   
                    segment_ids = [0] * (len(query_tokens) + 3) + [1] * (len(doc_slice_tokens) + 4) 
#                     if(len(segment_ids) != len(tokens)):
#                         print("len(segment_ids): ", len(segment_ids))
#                         print("len(tokens): ", len(tokens))
                    assert len(segment_ids) == len(tokens)

                    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)   
                    input_mask = [1] * len(input_ids)

                    doc_offset = len(query_tokens) + 3 - slice_start  # where context starts
                    
                    # ===== answer positions tensors  ============
                    start_positions = []
                    end_positions = []
 
                    answer = qa["answer"] 
                    # print("idx: ", idx, " qa['id']: ", qa['id'], " answer: ", answer)
                    if answer == '':
                        q_type = -1
                        start_positions.append(-1)   
                        end_positions.append(-1)           
                    
                    elif answer == 'yes':
                        q_type = 1
                        start_positions.append(len(tokens)-3)   
                        end_positions.append(len(tokens)-3) 
                    elif answer == 'no':
                        q_type = 2
                        start_positions.append(len(tokens)-2)   
                        end_positions.append(len(tokens)-2)  
                    else:
                        # keep all the occurences of answer in the context 
#                         for m in re.finditer("\s?".join(answer.split()), context):   # "\s?".join(answer.split()) in order to match even with extra space in answer or context
                        for m in re.finditer(_normalize_text(answer), context, re.IGNORECASE):
                            answer_start, answer_end = m.span() 
                            start_position, end_position = map_answer_positions(char_to_word_offset, orig_to_tok_index, answer_start, answer_end, slice_start, slice_end, doc_offset)
                            if(start_position != -1):
                                start_positions.append(start_position)   
                                end_positions.append(end_position)
                            
                        if(len(start_positions) > 0): 
                            q_type = 0
                        else: # answer not found in context
                            q_type = -1
                            start_positions.append(-1)   
                            end_positions.append(-1) 


                    # answers from start_positions and end_positions if > self.max_num_answers
                    start_positions = start_positions[:self.max_num_answers]
                    end_positions = end_positions[:self.max_num_answers]

                    # -1 padding up to self.max_num_answers
                    padding_len = self.max_num_answers - len(start_positions)
                    start_positions.extend([-1] * padding_len)
                    end_positions.extend([-1] * padding_len)

                    # replace duplicate start/end positions with `-1` because duplicates can result into -ve loss values
                    found_start_positions = set()
                    found_end_positions = set()
                    for i, (start_position, end_position) in enumerate(zip(start_positions, end_positions)):
                        
                        if start_position in found_start_positions:
                            start_positions[i] = -1
                        if end_position in found_end_positions:
                            end_positions[i] = -1
                        found_start_positions.add(start_position)
                        found_end_positions.add(end_position)
                    
                                         
                    if self.doc_stride >= 0:  # no need to pad if document is not strided
                        # Zero-pad up to the sequence length.
                        padding_len = self.max_seq_len - len(input_ids)
                        input_ids.extend([self.tokenizer.pad_token_id] * padding_len)
                        input_mask.extend([0] * padding_len)
                        segment_ids.extend([0] * padding_len)
                        
                        print("self.doc_stride >= 0")
                        assert len(input_ids) == self.max_seq_len
                        assert len(input_mask) == self.max_seq_len
                        assert len(segment_ids) == self.max_seq_len  
                        
                    input_ids_list.append(input_ids)
                    input_mask_list.append(input_mask)
                    segment_ids_list.append(segment_ids)
                    start_positions_list.append(start_positions)
                    end_positions_list.append(end_positions)
                    q_type_list.append(q_type)
                    
                tensors_list.append((torch.tensor(input_ids_list), torch.tensor(input_mask_list), torch.tensor(segment_ids_list),
                                     torch.tensor(start_positions_list), torch.tensor(end_positions_list), torch.tensor(q_type_list),
                                     torch.tensor([sp_sent_list]),  torch.tensor([sp_para_list]),
                                     qa['id'], answer))     
        return tensors_list



##### collate_one_doc_and_lists

In [10]:
    %%add_to hotpotqaDataset
    @staticmethod
    def collate_one_doc_and_lists(batch):
        num_metadata_fields = 2  # qid and answer  
        fields = [x for x in zip(*batch)]
        stacked_fields = [torch.stack(field) for field in fields[:-num_metadata_fields]]  # don't stack metadata fields
        stacked_fields.extend(fields[-num_metadata_fields:])  # add them as lists not torch tensors

        # always use batch_size=1 where each batch is one document
        # will use grad_accum to increase effective batch size
        assert len(batch) == 1
        fields_with_batch_size_one = [f[0] for f in stacked_fields]
        return fields_with_batch_size_one


##### class info

In [11]:
dir(hotpotqaDataset)

['__add__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'collate_one_doc_and_lists',
 'one_example_to_tensors']

In [12]:
from inspect import getmembers
getmembers(hotpotqaDataset)

[('__add__', <function torch.utils.data.dataset.Dataset.__add__(self, other)>),
 ('__class__', type),
 ('__delattr__', <slot wrapper '__delattr__' of 'object' objects>),
 ('__dict__',
  mappingproxy({'__module__': '__main__',
                '__doc__': '\n    Largely based on\n    https://github.com/allenai/allennlp/blob/master/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py\n    and\n    https://github.com/huggingface/transformers/blob/master/examples/run_squad.py\n    ',
                '__init__': <function __main__.hotpotqaDataset.__init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride, max_num_answers, ignore_seq_with_no_answers, max_question_len)>,
                '__len__': <function __main__.hotpotqaDataset.__len__(self)>,
                '__getitem__': <function __main__.hotpotqaDataset.__getitem__(self, idx)>,
                'one_example_to_tensors': <function __main__.one_example_to_tensors(self, example, idx)>,
                'collate_one

In [13]:
from inspect import isfunction
functions_list = [o for o in getmembers(hotpotqaDataset) if isfunction(o[1])]
functions_list

[('__add__', <function torch.utils.data.dataset.Dataset.__add__(self, other)>),
 ('__getitem__', <function __main__.hotpotqaDataset.__getitem__(self, idx)>),
 ('__init__',
  <function __main__.hotpotqaDataset.__init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride, max_num_answers, ignore_seq_with_no_answers, max_question_len)>),
 ('__len__', <function __main__.hotpotqaDataset.__len__(self)>),
 ('collate_one_doc_and_lists',
  <function __main__.collate_one_doc_and_lists(batch)>),
 ('one_example_to_tensors',
  <function __main__.one_example_to_tensors(self, example, idx)>)]

In [14]:
import inspect
inspect.getmro(hotpotqaDataset)  # a hierarchy of classes 

(__main__.hotpotqaDataset, torch.utils.data.dataset.Dataset, object)

In [15]:
inspect.getfullargspec(hotpotqaDataset.one_example_to_tensors)

FullArgSpec(args=['self', 'example', 'idx'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={})

In [16]:
help(hotpotqaDataset)

Help on class hotpotqaDataset in module __main__:

class hotpotqaDataset(torch.utils.data.dataset.Dataset)
 |  Largely based on
 |  https://github.com/allenai/allennlp/blob/master/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py
 |  and
 |  https://github.com/huggingface/transformers/blob/master/examples/run_squad.py
 |  
 |  Method resolution order:
 |      hotpotqaDataset
 |      torch.utils.data.dataset.Dataset
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(self, idx)
 |  
 |  __init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride, max_num_answers, ignore_seq_with_no_answers, max_question_len)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __len__(self)
 |  
 |  one_example_to_tensors(self, example, idx)
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  collate_one_doc_and_lists(batch)
 |  
 |  -----------------------

#### class hotpotqa

##### \_\_init\_\_,  forward, dataloaders

In [17]:
class hotpotqa(pl.LightningModule):
    def __init__(self, args):
        super(hotpotqa, self).__init__()
        self.args = args
        self.hparams = args
 
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        num_new_tokens = self.tokenizer.add_special_tokens({"additional_special_tokens": [TITLE_START, TITLE_END, SENT_MARKER_END, QUESTION_START , QUESTION_END, PAR]})
#         print(self.tokenizer.all_special_tokens)
        self.tokenizer.model_max_length = self.args.max_seq_len
        self.model = self.load_model()
        self.model.resize_token_embeddings(len(self.tokenizer))
        self.num_labels = 2
        self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels)
         
        self.linear_type = torch.nn.Linear(self.model.config.hidden_size, 3)   #  question type (yes/no/span/null) classification 

        self.fnn_sp_sent = torch.nn.Sequential(
          torch.nn.Linear(self.model.config.hidden_size, self.model.config.hidden_size), 
          torch.nn.GELU(),
          torch.nn.Linear(self.model.config.hidden_size, 1),      # score for 'yes', while 0 for 'no'
        )
        
        self.fnn_sp_para = torch.nn.Sequential(
          torch.nn.Linear(self.model.config.hidden_size, self.model.config.hidden_size), 
          torch.nn.GELU(),
          torch.nn.Linear(self.model.config.hidden_size, 1),      # score for 'yes', while 0 for 'no'
        )
         
        
        self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None
        
 
    def load_model(self):
        
        config = LongformerConfig.from_pretrained(self.args.model_path) 
        # choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
        # 'n2': for regular n2 attantion
        # 'tvm': a custom CUDA kernel implementation of our sliding window attention
        # 'sliding_chunks': a PyTorch implementation of our sliding window attention
        config.attention_mode = 'sliding_chunks'
        model = Longformer.from_pretrained(self.args.model_path, config=config)

        print("self.args.model_path: ", self.args.model_path)
        for layer in model.encoder.layer:
            layer.attention.self.attention_mode = self.args.attention_mode
            self.args.attention_window = layer.attention.self.attention_window

        print("Loaded model with config:")
        print(model.config)

        for p in model.parameters():
            p.requires_grad_(True)
        model.train()
        return model

#%%add_to hotpotqa    # does not seems to work for the @pl.data_loader decorator, missing which causes error "validation_step() takes 3 positional arguments but 4 were given"    
###################################################### dataloaders ########################################################### 
    @pl.data_loader
    def train_dataloader(self):
        if self.train_dataloader_object is not None:
            return self.train_dataloader_object
        dataset = hotpotqaDataset(file_path=self.args.train_dataset, tokenizer=self.tokenizer,
                                  max_seq_len=self.args.max_seq_len, max_doc_len=self.args.max_doc_len,
                                  doc_stride=self.args.doc_stride,
                                  max_num_answers=self.args.max_num_answers,
                                  max_question_len=self.args.max_question_len,
                                  ignore_seq_with_no_answers=self.args.ignore_seq_with_no_answers)
        
#         dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        dl = DataLoader(dataset, batch_size=1, shuffle=False,   # set shuffle=False, otherwise it will sample a different subset of data every epoch with train_percent_check
                        num_workers=self.args.num_workers,  
                        collate_fn=hotpotqaDataset.collate_one_doc_and_lists)

        self.train_dataloader_object = dl  
        return self.train_dataloader_object
    
 

    @pl.data_loader
    def val_dataloader(self):
        if self.val_dataloader_object is not None:
            return self.val_dataloader_object
        dataset = hotpotqaDataset(file_path=self.args.dev_dataset, tokenizer=self.tokenizer,
                                  max_seq_len=self.args.max_seq_len, max_doc_len=self.args.max_doc_len,
                                  doc_stride=self.args.doc_stride,
                                  max_num_answers=self.args.max_num_answers,
                                  max_question_len=self.args.max_question_len,
                                  ignore_seq_with_no_answers=False)  # evaluation data should keep all examples 

        
        
#         dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        dl = DataLoader(dataset, batch_size=1, shuffle=False,
                        num_workers=self.args.num_workers, 
                        collate_fn=hotpotqaDataset.collate_one_doc_and_lists)
        self.val_dataloader_object = dl
        return self.val_dataloader_object

    @pl.data_loader
    def test_dataloader(self):
        if self.test_dataloader_object is not None:
            return self.test_dataloader_object
        dataset = hotpotqaDataset(file_path=self.args.dev_dataset, tokenizer=self.tokenizer,
                                  max_seq_len=self.args.max_seq_len, max_doc_len=self.args.max_doc_len,
                                  doc_stride=self.args.doc_stride,
                                  max_num_answers=self.args.max_num_answers,
                                  max_question_len=self.args.max_question_len,
                                  ignore_seq_with_no_answers=False)  # evaluation data should keep all examples

#         dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        dl = DataLoader(dataset, batch_size=1, shuffle=False,
                        num_workers=self.args.num_workers, 
                        collate_fn=hotpotqaDataset.collate_one_doc_and_lists)
        self.test_dataloader_object = dl
        return self.test_dataloader_object

#%%add_to hotpotqa  
    def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_positions, q_type, sp_sent, sp_para):
 
        print("input_ids: ", input_ids)
        if 'longformer' in self.args.model_path:
            
            if(input_ids.size(0) != 1):
                print("input_ids.size(0) != 1")
            assert(input_ids.size(0)==1)
            # Each batch is one document, and each row of the batch is a chunck of the document.    ????
            # Make sure all rows have the same question length.
            
#             print("start_positions: ", start_positions)
#             print("end_positions: ", end_positions)
            # local attention everywhere
            attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
            
            # global attention for the cls and all question tokens
            # print("question_end_index")
            question_end_index = self._get_special_index(input_ids, [QUESTION_END])
    #         if(question_end_index.size(0) == 1):
    #             attention_mask[:,:question_end_index.item()] = 2  
    #         else:
            attention_mask[:,:question_end_index[0].item()+1] = 2  # from <cls> until </q>
    #             print("more than 1 <q> in: ", self.tokenizer.convert_ids_to_tokens(input_ids[0].tolist()) )
            
            # global attention for the sentence and paragraph special tokens  
            # print("sent_indexes")
            sent_indexes = self._get_special_index(input_ids, [SENT_MARKER_END])
            attention_mask[:, sent_indexes] = 2
            
            # 
            para_indexes = self._get_special_index(input_ids, [TITLE_START])
            attention_mask[:, para_indexes] = 2       
            print("para_indexes: ", para_indexes) 
    
            # sliding_chunks implemenation of selfattention requires that seqlen is multiple of window size
            input_ids, attention_mask = pad_to_window_size(
                input_ids, attention_mask, self.args.attention_window, self.tokenizer.pad_token_id)
    
            sequence_output = self.model(
                    input_ids,
                    attention_mask=attention_mask)[0]
    #         print("size of sequence_output: " + str(sequence_output.size()))
#             print("sequence_output: " + str(sequence_output))
    
            # The pretrained hotpotqa model wasn't trained with padding, so remove padding tokens
            # before computing loss and decoding.
            padding_len = input_ids[0].eq(self.tokenizer.pad_token_id).sum()
            if padding_len > 0:
                sequence_output = sequence_output[:, :-padding_len]
    #         print("size of sequence_output after removing padding: " + str(sequence_output.size()))
        else:
            sequence_output = self.model(input_ids, attention_mask=attention_mask)[0]      
        
        ###################################### layers on top of sequence_output ##################################
        

        ### 1. answer start and end positions classification ###   
        logits = self.qa_outputs(sequence_output) 
        start_logits, end_logits = logits.split(1, dim=-1) 
        start_logits = start_logits.squeeze(-1) 
        end_logits = end_logits.squeeze(-1)
 
        ### 2. type classification, similar as class LongformerClassificationHead(nn.Module) https://huggingface.co/transformers/_modules/transformers/modeling_longformer.html#LongformerForSequenceClassification.forward ### 
        type_logits = self.linear_type(sequence_output[:,0]) 
        
        # ### 3. supporting paragraph classification ###  
        sp_para_output = sequence_output[:,para_indexes,:]  
        print("sp_para_output: ", sp_para_output)
        sp_para_output_t = self.fnn_sp_para(sp_para_output) 
        print("sp_para_output_t: ", sp_para_output_t)
         # linear_sp_sent generates a single score for each sentence, instead of 2 scores for yes and no.   
        # Argument the score with additional score=0. The same way did in the HOTPOTqa paper
        sp_para_output_aux = torch.zeros(sp_para_output_t.shape, dtype=torch.float, device=sp_para_output_t.device) 
        predict_support_para = torch.cat([sp_para_output_aux, sp_para_output_t], dim=-1).contiguous() 
#         print("predict_support_para: ", predict_support_para)
        ### 4. supporting fact classification ###     
        # the first sentence in a paragraph is leading by <p>, other sentences are leading by <s>
 
        sp_sent_output = sequence_output[:,sent_indexes,:]  
        sp_sent_output_t = self.fnn_sp_sent(sp_sent_output)     
        sp_sent_output_aux = torch.zeros(sp_sent_output_t.shape, dtype=torch.float, device=sp_sent_output_t.device) 
        predict_support_sent = torch.cat([sp_sent_output_aux, sp_sent_output_t], dim=-1).contiguous() 
        
#         answer_loss, type_loss = self.loss_computation(start_positions, end_positions, start_logits, end_logits, q_type, type_logits)
        outputs = (start_logits, end_logits, type_logits, sp_para_output_t, sp_sent_output_t)  
        answer_loss, type_loss, sp_para_loss, sp_sent_loss  = self.loss_computation(start_positions, end_positions, start_logits, end_logits, q_type, type_logits, sp_para, predict_support_para, sp_sent, predict_support_sent)
 
        outputs = (answer_loss, type_loss, sp_para_loss, sp_sent_loss,) + outputs    
        return outputs
    
    def loss_computation(self, start_positions, end_positions, start_logits, end_logits, q_type, type_logits, sp_para, predict_support_para, sp_sent, predict_support_sent):
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)

            if not self.args.regular_softmax_loss:
                # loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf
                # NOTE: this returns sum of losses, not mean, so loss won't be normalized across different batch sizes
                # but batch size is always 1, so this is not a problem
                print('start_logits.size(): ', start_logits.size())
                print('start_positions.size(): ', start_positions.size())
                print('end_logits.size(): ', end_logits.size())
                print('end_positions.size(): ', end_positions.size())                
                start_loss = self.or_softmax_cross_entropy_loss_one_doc(start_logits, start_positions, ignore_index=-1)
                end_loss = self.or_softmax_cross_entropy_loss_one_doc(end_logits, end_positions, ignore_index=-1)
            else: 
                start_positions = start_positions[:, 0:1]   # only use the top1 start_position considering only one appearance of the answer string
                end_positions = end_positions[:, 0:1]
                start_loss = crossentropy(start_logits, start_positions[:, 0])
                end_loss = crossentropy(end_logits, end_positions[:, 0])
                
 
            crossentropy = torch.nn.CrossEntropyLoss(ignore_index=-1)
            type_loss = crossentropy(type_logits, q_type)  
            
            crossentropy_average = torch.nn.CrossEntropyLoss(reduction = 'mean', ignore_index=-1)     
            if(sp_para.view(-1).size(0) > 0):
#                 print("predict_support_para.view(-1, 2): ", predict_support_para.view(-1, 2))
#                 print("sp_para.view(-1): ", sp_para.view(-1))
                sp_para_loss = crossentropy_average(predict_support_para.view(-1, 2), sp_para.view(-1))
#                 print("sp_para_loss: ", sp_para_loss)
            else:
                sp_para_loss = torch.tensor(0).type_as(type_loss)  # when raw_context is []
                
            if(sp_sent.view(-1).size(0) > 0):    
                sp_sent_loss = crossentropy_average(predict_support_sent.view(-1, 2), sp_sent.view(-1))      
            else:
                sp_sent_loss = torch.tensor(0).type_as(type_loss)  # when raw_context is []
 
            answer_loss = (start_loss + end_loss) / 2 
        return answer_loss, type_loss, sp_para_loss, sp_sent_loss  


#     %%add_to hotpotqa    
    def _get_special_index(self, input_ids, special_tokens):
        
        if(input_ids.size(0)!=1):
            print("input_ids.size(0): ", input_ids.size(0))
            print("input_ids: ", input_ids)
        
        assert(input_ids.size(0)==1) 
        mask = input_ids != input_ids # initilaize 
        for special_token in special_tokens:
            mask = torch.logical_or(mask, input_ids.eq(self.tokenizer.convert_tokens_to_ids(special_token))) 
 
        token_indices = torch.nonzero(mask, as_tuple=False)    
         
 
        return token_indices[:,1]    

    def or_softmax_cross_entropy_loss_one_doc(self, logits, target, ignore_index=-1, dim=-1):
        """loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf"""
        assert logits.ndim == 2
        assert target.ndim == 2
        assert logits.size(0) == target.size(0) 
        
        # with regular CrossEntropyLoss, the numerator is only one of the logits specified by the target, considing only one correct target 
        # here, the numerator is the sum of a few potential targets, where some of them is the correct answer, considing more correct targets

        # target are indexes of tokens, padded with ignore_index=-1
        # logits are scores (one for each label) for each token
 
        # compute a target mask
        target_mask = target == ignore_index
        # replaces ignore_index with 0, so `gather` will select logit at index 0 for the masked targets
        masked_target = target * (1 - target_mask.long())                 # replace all -1 in target with 0， tensor([[447,   0,   0,   0, ...]])
    
        # gather logits
        print('masked_target: ', masked_target)
        print('logits: ', logits)
        gathered_logits = logits.gather(dim=dim, index=masked_target)     # tensor([[0.4382, 0.2340, 0.2340, 0.2340 ... ]]), padding logits are all replaced by logits[0] 
 
        # Apply the mask to gathered_logits. Use a mask of -inf because exp(-inf) = 0
        gathered_logits[target_mask] = float('-inf')                      # padding logits are all replaced by -inf
 
        # each batch is one example
        gathered_logits = gathered_logits.view(1, -1)
        logits = logits.view(1, -1)
 
        # numerator = log(sum(exp(gathered logits)))
        log_score = torch.logsumexp(gathered_logits, dim=dim, keepdim=False)
 
        log_norm = torch.logsumexp(logits, dim=dim, keepdim=False)
        
        # compute the loss
        loss = -(log_score - log_norm) 
        
        # some of the examples might have a loss of `inf` when `target` is all `ignore_index`: when computing start_loss and end_loss for question with the gold answer of yes/no 
        # when `target` is all `ignore_index`, loss is 0 
        loss = loss[~torch.isinf(loss)].sum()
#         loss = torch.tanh(loss)
#         print("final loss: " + str(loss)) 
        return loss  


In [18]:
# debug
# input_ids = torch.tensor([[-1, 5, -1, 2]])
# input_ids.size(0)
# token_indices =  torch.nonzero(input_ids == torch.tensor(-1))[:,1]
# # token_indices
# # token_indices.item()
# # indices =  torch.LongTensor([[2],[0,2]])

# # torch.gather(input_ids, 1, token_indices.unsqueeze(0))
# # p_index = token_indices.view(input_ids.size(0), -1)[:,1::2]   
# # attention_mask = torch.ones(input_ids.shape, dtype=torch.long) 
# # attention_mask[:,token_indices] = 2
# # attention_mask
# p_index = torch.tensor([1, 3, 4])
# s_index = torch.tensor([1,3,6])
# torch.sort(torch.cat((s_index, p_index)))[0]
# attention_mask.view(-1)[ p_index.view(-1), :].view(attention_mask.size(0), -1)
# # for pi in p_index[0]:
# #     attention_mask[:, pi] = 2
# # attention_mask
# # s_index = torch.tensor([[1,3]])
# # torch.sort(torch.cat((p_index, s_index), -1), -1)

# sequence_output  = torch.tensor([[[-1, 5, -1, 2],
#                                  [-2, 27, 2, 9],
#                                  [3, 6, 1, 65],
#                                  [52, 36, 13, 2],
#                                  [73, 26, 1, 7]
#                                 ]])

# sp_para_output_t   = torch.tensor([[[-1],
#                                  [-2 ],
#                                  [3],
#                                  [52],
#                                  [73]
#                                 ]])
# torch.zeros(sp_para_output_t.shape, dtype=torch.float) 

# print("size of sequence_output: " + str(sequence_output.size()))
# # print("size of p_index.unsqueeze(0).unsqueeze(-1): " + str(p_index.unsqueeze(0).size()))
# sequence_output[:,p_index,:]
# b = torch.tensor([0, 1, 2, 3])
# p_index.unsqueeze(-1) * b

# input_ids = torch.tensor([[0.2, 0.0, 0.6, 0.6], [0.2, 0.6, 0.0, 0.0]]) 
# # input_ids.tolist()
# p_index =  torch.nonzero(input_ids == torch.tensor(0.2))
# print(p_index)
# s_index =  torch.nonzero(input_ids == torch.tensor(0.6))
# print(s_index)

# sp_sent = torch.tensor([[0, 1, 1, 0]])
# torch.nonzero(sp_sent, as_tuple=True)[1]
# cat_index = torch.tensor([])
# cat_index = torch.cat((cat_index, ids[0][1]))
# print(ids)
# print(cat_index)
# p_index[p_index[:,0] == 0]

# cat_index[cat_index[:,0].argsort()]

# sorted(torch.cat((p_index, s_index)), key = lambda x: x[0])
# torch.sort(torch.cat((p_index, s_index)), 0)[0]
# for cor in token_indices:
#     attention_mask[cor[0].item()][cor[1].item()] = 2
# attention_mask 
# input_ids = torch.tensor([[-1, 5, -6, 2]])
# print(input_ids.size())
# input_ids.topk(k=2, dim=-1).indices

# predict_type = torch.tensor([[-0.0925, -0.0999, -0.1671]])
# p_type = torch.argmax(predict_type, dim=1).item()
# p_type_score = torch.max(predict_type, dim=1)[0].item()
# print("predict_type: ", predict_type)
# print("p_type: ", p_type)
# print("p_type_score: ", p_type_score)
    
# a = torch.tensor([[0.9213,  1.0887, -0.8858, -1.7683]])
# a.view(-1).size() 
# print(torch.sigmoid(a))
# a = torch.tensor([ 9.213,  1.0887, -0.8858, 7683])
# print(torch.sigmoid(a))

# a = torch.tensor([[[1],[2],[4],[-1],[-1]]])
# a= a.squeeze(-1)
# a.size() 
# a[:, torch.where(a!=-1)[1]]
# m = torch.nn.Sigmoid()
# print("m: ", m)
# loss = torch.nn.BCELoss()
# # input = torch.randn(3, requires_grad=True)
# # print("input: ", input)
# # target = torch.empty(3).random_(2)
# # print("target: ", target)
# # output = loss(m(input), target)
# # print("output: ", output)

# input = torch.tensor([1.0293, -0.1585,  1.1408], requires_grad=True)
# print("input: ", input)
# print("Sigmoid(input): ", m(input))
# target = torch.tensor([0., 1., 0.])
# print("target: ", target)
# output = loss(m(input), target)
# print("output: ", output)

# input = torch.tensor([[1.0293, -0.1585,  1.1408]], requires_grad=True)
# print("input: ", input)
# target = torch.tensor([[0., 1., 0.]])
# print("target: ", target)
# output = loss(m(input), target)
# print("output: ", output)

# 1.1761 * 3
# soft_input = torch.nn.Softmax(dim=-1)
# log_soft_input = torch.log(soft_input(input))
# loss=torch.nn.NLLLoss() 
# loss(log_soft_input, target)
# input = torch.log(soft_input(input))
# loss=torch.nn.NLLLoss()
# loss(input,target)

# loss =torch.nn.CrossEntropyLoss()
# loss(input,target) 

# sp_sent_logits =torch.tensor([[[0.0988],
#          [0.0319],
#          [0.0314]]])
# sp_sent_logits.squeeze()

# input_ids = torch.tensor([[0.6, 0.0, 0.6, 0.0]]) 
# token_indices =  torch.nonzero(input_ids == torch.tensor(0.6))
# token_indices[:,1][0].item()

# def or_softmax_cross_entropy_loss_one_doc(logits, target, ignore_index=-1, dim=-1):
#     """loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf"""
#     assert logits.ndim == 2
#     assert target.ndim == 2
#     assert logits.size(0) == target.size(0) 

#     # with regular CrossEntropyLoss, the numerator is only one of the logits specified by the target, considing only one correct target 
#     # here, the numerator is the sum of a few potential targets, where some of them is the correct answer, considing more correct targets

#     # target are indexes of tokens, padded with ignore_index=-1
#     # logits are scores (one for each label) for each token
# #         print("or_softmax_cross_entropy_loss_one_doc" ) 
# #         print("size of logits: " + str(logits.size()))                    # torch.Size([1, 746]), 746 is number of all tokens 
# #         print("size of target: " + str(target.size()))                    # torch.Size([1, 64]),  -1 padded
#     print("target: " + str(target)) 

#     # compute a target mask
#     target_mask = target == ignore_index
#     # replaces ignore_index with 0, so `gather` will select logit at index 0 for the masked targets
#     masked_target = target * (1 - target_mask.long())                 # replace all -1 in target with 0， tensor([[447,   0,   0,   0, ...]])
#     print("masked_target: " + str(masked_target))     
#     # gather logits
#     gathered_logits = logits.gather(dim=dim, index=masked_target)     # tensor([[0.4382, 0.2340, 0.2340, 0.2340 ... ]]), padding logits are all replaced by logits[0] 
# #         print("size of gathered_logits: " + str(gathered_logits.size()))  # torch.Size([1, 64])
#     print("gathered_logits: " + str(gathered_logits)) 
#     # Apply the mask to gathered_logits. Use a mask of -inf because exp(-inf) = 0
#     gathered_logits[target_mask] = float('-inf')                      # padding logits are all replaced by -inf
#     print("gathered_logits after -inf: " + str(gathered_logits))      # tensor([[0.4382,   -inf,   -inf,   -inf,   -inf,...]])

#     # each batch is one example
#     gathered_logits = gathered_logits.view(1, -1)
#     logits = logits.view(1, -1)
# #         print("size of gathered_logits after view: " + str(gathered_logits.size()))  # torch.Size([1, 64])
# #         print("size of logits after view: " + str(logits.size()))                    # torch.Size([1, 746])　　

#     # numerator = log(sum(exp(gathered logits)))
#     log_score = torch.logsumexp(gathered_logits, dim=dim, keepdim=False)
#     print("log_score: " + str(log_score)) 
#     # denominator = log(sum(exp(logits)))
#     log_norm = torch.logsumexp(logits, dim=dim, keepdim=False)
#     print("log_norm: " + str(log_norm)) 

#     # compute the loss
#     loss = -(log_score - log_norm)
#     print("loss: " + str(loss))


#     # some of the examples might have a loss of `inf` when `target` is all `ignore_index`: when computing start_loss and end_loss for question with the gold answer of yes/no 
#     # replace -inf with 0
#     loss = loss[~torch.isinf(loss)].sum()
#     print("final loss: " + str(loss)) 
#     return loss 

# # input = torch.tensor([[ 0,  0.0780],
# #         [0, 0.9253 ],
# #         [0, 0.0987]])
# # target = torch.tensor([0,1,0])
# # target.size(0) < 1
# # input = torch.tensor([[ 1.1879,  1.0780,  0.5312],
# #         [-0.3499, -1.9253, -1.5725],
# #         [-0.6578, -0.0987,  1.1570]])
# # target=torch.tensor([0,1,2])
# # predict_support_para.view(-1, 2), sp_para.view(-1)
# # input = torch.tensor([[ 1.1879,  1.0780,  0.5312]])
# # target=torch.tensor([0])
# # or_softmax_cross_entropy_loss_one_doc(input, target.unsqueeze(-1))
# # target=torch.tensor([1])
# # or_softmax_cross_entropy_loss_one_doc(input, target.unsqueeze(-1))
# # target=torch.tensor([2])
# # or_softmax_cross_entropy_loss_one_doc(input, target.unsqueeze(-1))
# # target=torch.tensor([-1])
# # or_softmax_cross_entropy_loss_one_doc(input, target.unsqueeze(-1))
# a = torch.tensor([6.4062])    
# b = torch.tensor([2.23])
# torch.cat((a,b))
 
# for a in list_tensor
# from functools import reduce
# reduce(lambda x,y: torch.cat((x,y)), list_tensor[:-1])

# torch.tanh(a)
# # if(torch.isinf(a)):
# #     print("is inf")
# 5 * 1e-2


# import torch
# special_tokens = [1,2]
# input_ids = torch.tensor([[ 1, 0, 2, 1, 0, 2]])

# mask = input_ids != input_ids # initilaize 
# for special_token in special_tokens:
#     mask = torch.logical_or(mask, input_ids.eq(special_token)) 
#     print("mask: ", mask)
# torch.nonzero(mask)    
 

In [19]:
# # debug: check loaded dataset by DataLoader
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# num_new_tokens = tokenizer.add_special_tokens({"additional_special_tokens": ["<p>", "<q>", "</q>"]})
# # # # print(tokenizer.all_special_tokens)    
# # # # print(tokenizer.all_special_ids)     
# # # # tokenizer.convert_tokens_to_ids("<s>")
# # # # tokenizer.sep_token
# print(tokenizer.tokenize("yes"))
# print(tokenizer.tokenize("no"))
# print(tokenizer.tokenize("null"))
# # # all_doc_tokens = []
# # # orig_to_tok_index = []
# # # tok_to_orig_index = []
# # # for (i, token) in enumerate(["<s>", "da", "tell", "<p>", "say"]):
# # #     orig_to_tok_index.append(len(all_doc_tokens))
# # #     sub_tokens = tokenizer.tokenize(f'. {token}')[1:] if i > 0 else tokenizer.tokenize(token)
# # #     for sub_token in sub_tokens:
# # #         tok_to_orig_index.append(i)
# # #         all_doc_tokens.append(sub_token)
# # # all_doc_tokens


 

# dataset = hotpotqaDataset(file_path= args.train_dataset, tokenizer=tokenizer,
#                           max_seq_len= args.max_seq_len, max_doc_len= args.max_doc_len,
#                           doc_stride= args.doc_stride,
#                           max_num_answers= args.max_num_answers,
#                           max_question_len= args.max_question_len,
#                           ignore_seq_with_no_answers= args.ignore_seq_with_no_answers)
# print(len(dataset))

# # # dl = DataLoader(dataset, batch_size=1, shuffle=None,
# # #                     num_workers=args.num_workers, sampler=None,
# # #                     collate_fn=hotpotqaDataset.collate_one_doc_and_lists)

# example = dataset[3]  
# [input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qids] = example
 

# print(input_ids[0][:20].tolist())
# print(input_mask) 
# print(segment_ids) 
# print(subword_starts) 
# print(subword_ends)
# print(q_type)
# print(sp_sent) 
# print(sp_para) 
# print(qids)
# print(tokenizer.convert_ids_to_tokens(input_ids[0][667:669+1].tolist()))
# 0.0033 * 90447 
# 28*4
# torch.tensor(0.0)

##### configure_ddp

In [20]:
 %%add_to hotpotqa
 # A hook to overwrite to define your own DDP(DistributedDataParallel) implementation init. 
 # The only requirement is that: 
 # 1. On a validation batch the call goes to model.validation_step.
 # 2. On a training batch the call goes to model.training_step.
 # 3. On a testing batch, the call goes to model.test_step
 def configure_ddp(self, model, device_ids):
    model = LightningDistributedDataParallel(
        model,
        device_ids=device_ids,
        find_unused_parameters=True
    )
    return model

##### **configure_optimizers**

In [21]:
%%add_to hotpotqa
def configure_optimizers(self):
    # Set up optimizers and (optionally) learning rate schedulers
    def lr_lambda(current_step):
        if current_step < self.args.warmup:
            return float(current_step) / float(max(1, self.args.warmup))
        return max(0.0, float(self.args.steps - current_step) / float(max(1, self.args.steps - self.args.warmup)))

    optimizer = torch.optim.Adam(self.parameters(), lr=self.args.lr)

    scheduler = LambdaLR(optimizer, lr_lambda, last_epoch=-1)
    return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

##### **training_step**

In [22]:
%%add_to hotpotqa
def training_step(self, batch, batch_nb):
    # do the forward pass and calculate the loss for a batch 
#     input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, qid, answer = batch 
    input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qid, answer = batch
#     print("qid: " + str(qid)) 
    # print("size of input_ids: " + str(input_ids.size())) 
    output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para)
#     answer_loss, type_loss = output[:2]
    # answer_loss, type_loss = output[:4]
    answer_loss, type_loss, sp_para_loss, sp_sent_loss  = output[:4]
    # print("answer_loss: ", answer_loss)
    # print("type_loss: ", type_loss)
    # print("sp_para_loss: ", sp_para_loss)
    # print("sp_sent_loss: ", sp_sent_loss)

#     loss  = answer_loss +  type_loss + sp_para_loss + sp_sent_loss
    loss = answer_loss + 5*type_loss + 10*sp_para_loss + 10*sp_sent_loss
#     print("weighted loss: ", loss)
#     print("self.trainer.optimizers[0].param_groups[0]['lr']: ", self.trainer.optimizers[0].param_groups[0]['lr'])
    lr = loss.new_zeros(1) + self.trainer.optimizers[0].param_groups[0]['lr']  # loss.new_zeros(1) is tensor([0.]), converting 'lr' to tensor' by adding it.  

    tensorboard_logs = {'loss': loss, 'train_answer_loss': answer_loss, 'train_type_loss': type_loss, 
                         'train_sp_para_loss': sp_para_loss, 'train_sp_sent_loss': sp_sent_loss, 
                        'lr': lr #,
                        # 'mem': torch.tensor(torch.cuda.memory_allocated(input_ids.device) / 1024 ** 3).type_as(loss) 
    }
    return tensorboard_logs

##### training_end

In [23]:
# %%add_to hotpotqa
    # # the function is called for each batch after every epoch is completed
    # def training_end(self, output): 
    #     # print("training_end at epoch: ", self.current_epoch)
    # #     print("len(outputs): ",len(outputs))
    # #     print("output: ",output)
    
    #     # one batch only has one example
    #     avg_loss = output['loss']    
    #     avg_answer_loss = output['train_answer_loss']  
    #     avg_type_loss = output['train_type_loss']    
    #     avg_sp_para_loss = output['train_sp_para_loss']   
    #     avg_sp_sent_loss = output['train_sp_sent_loss'] 
    #     avg_lr = output['lr']      
         
     
    #     if self.trainer.use_ddp:
    #         torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_answer_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_answer_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_type_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_type_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_sp_para_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_sp_para_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_sp_sent_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_sp_sent_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_lr, op=torch.distributed.ReduceOp.SUM)
    #         avg_lr /= self.trainer.world_size 
            
     
    #     tensorboard_logs = { #'avg_train_loss': avg_loss, 
    #             'avg_train_answer_loss': avg_answer_loss, 'avg_train_type_loss': avg_type_loss, 'avg_train_sp_para_loss': avg_sp_para_loss, 'avg_train_sp_sent_loss': avg_sp_sent_loss, 'lr': avg_lr
    #           }
    
    #     return {'loss': avg_loss, 'log': tensorboard_logs}


##### validation_step

In [25]:
%%add_to hotpotqa
# When the validation_step is called, the model has been put in eval mode and PyTorch gradients have been disabled. At the end of validation, model goes back to training mode and gradients are enabled.
def validation_step(self, batch, batch_nb):
    print("validation_step")
#     print("batch_nb: ", batch_nb)
    input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qid, answer = batch
#     input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, qid, answer = batch
    print("qid: ", qid)
#     print("q_type: ", q_type)
    output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para)
#     answer_loss, type_loss, start_logits, end_logits, type_logits = output 
    answer_loss, type_loss, sp_para_loss, sp_sent_loss, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output = output 
    loss = answer_loss + 5*type_loss + 10*sp_para_loss + 10*sp_sent_loss

#     if(q_type.item() != -1 ):
    answers_pred, sp_sent_pred, sp_para_pred = self.decode(input_ids, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output)
#         answers_pred  = self.decode(input_ids, start_logits, end_logits, type_logits)
#     else:
#         answers_pred  = [{'text': '', 'score': -1000000, 'start_logit': -1000000, 'end_logit': -1000000, 'p_type_score': 1}]

    if(len(answers_pred) != 1):
        print("len(answers_pred) != 1")
        assert(len(answers_pred) == 1)

    pre_answer_score = answers_pred[0]['score']  # (start_logit + end_logit + p_type_score) / 3
    pre_answer = _normalize_text(answers_pred[0]['text'])
#         print("pred answer_score: " + str(pre_answer_score))
#         print("pred answer_text: " + str(pre_answer)) 

    gold_answer = _normalize_text(answer)
    f1, prec, recall = self.f1_score(pre_answer, gold_answer)
    em = self.exact_match_score(pre_answer, gold_answer) 
    f1 = torch.tensor(f1).type_as(loss)
    prec = torch.tensor(prec).type_as(loss)
    recall = torch.tensor(recall).type_as(loss)
    em = torch.tensor(em).type_as(loss)
#         print("f1: " + str(f1))
#         print("prec: " + str(prec))
#         print("recall: " + str(recall))
#         print("em: " + str(em))  

    if(len(sp_sent_pred) > 0):
        sp_sent_em, sp_sent_precision, sp_sent_recall, sp_sent_f1 = self.sp_metrics(sp_sent_pred, torch.where(sp_sent.squeeze())[0].tolist())
        sp_sent_em = torch.tensor(sp_sent_em).type_as(loss)
        sp_sent_precision = torch.tensor(sp_sent_precision).type_as(loss)
        sp_sent_recall = torch.tensor(sp_sent_recall).type_as(loss)
        sp_sent_f1 = torch.tensor(sp_sent_f1).type_as(loss)

#         print("sp_sent_em: " + str(sp_sent_em))
#         print("sp_sent_precision: " + str(sp_sent_precision))
#         print("sp_sent_recall: " + str(sp_sent_recall))    
#         print("sp_sent_f1: " + str(sp_sent_f1))    

        joint_prec = prec * sp_sent_precision
        joint_recall = recall * sp_sent_recall
        if joint_prec + joint_recall > 0:
            joint_f1 = 2 * joint_prec * joint_recall / (joint_prec + joint_recall)
        else:
            joint_f1 = torch.tensor(0.0).type_as(loss)
        joint_em = em * sp_sent_em 

    else:
        sp_sent_em, sp_sent_precision, sp_sent_recall, sp_sent_f1 = torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss)
        joint_em, joint_f1, joint_prec, joint_recall =  torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss)

    print("pre_answer:\t", pre_answer, "\tgold_answer:\t", gold_answer) 
    print("sp_sent_pred:\t", sp_sent_pred, "\tsp_sent_gold:\t", torch.where(sp_sent.squeeze())[0].tolist(), "\tsp_para_pred:\t", sp_para_pred.tolist(), "\tsp_para_gold:\t", torch.where(sp_para.squeeze())[0].tolist())



    return { 'vloss': loss, 'answer_loss': answer_loss, 'type_loss': type_loss, 
            'sp_para_loss': sp_para_loss, 'sp_sent_loss': sp_sent_loss,
               'answer_score': pre_answer_score, 'f1': f1, 'prec':prec, 'recall':recall, 'em': em ,
              'sp_em': sp_sent_em, 'sp_f1': sp_sent_f1, 'sp_prec': sp_sent_precision, 'sp_recall': sp_sent_recall,
              'joint_em': joint_em, 'joint_f1': joint_f1, 'joint_prec': joint_prec, 'joint_recall': joint_recall

    }


###### decode

In [26]:
%%add_to hotpotqa
def decode(self, input_ids, start_logits, end_logits, type_logits, sp_para_logits, sp_sent_logits):
    print("decode")

    
    # answer decode
    question_end_index = self._get_special_index(input_ids, [QUESTION_END]) 
    # one example per batch
    start_logits = start_logits.squeeze()
    end_logits = end_logits.squeeze()
#     print("start_logits: ", start_logits)
#     print("end_logits: ", end_logits)
    start_logits_indices = start_logits.topk(k=min(self.args.n_best_size, start_logits.size(0)), dim=-1).indices 
    end_logits_indices = end_logits.topk(k=min(self.args.n_best_size, end_logits.size(0)), dim=-1).indices 
    if(len(start_logits_indices.size()) > 1):
        print("len(start_logits_indices.size()): ", len(start_logits_indices.size()))
        assert("len(start_logits_indices.size()) > 1")
    p_type = torch.argmax(type_logits, dim=1).item()
    p_type_score = torch.max(type_logits, dim=1)[0] 
#     print("type_logits: ", type_logits)
    print("p_type: ", p_type)
#     print("p_type_score: ", p_type_score)

    answers = []
    if p_type == 0:
        potential_answers = []
        for start_logit_index in start_logits_indices: 
            for end_logit_index in end_logits_indices: 
                if start_logit_index <= question_end_index.item():
                    continue
                if end_logit_index <= question_end_index.item():
                    continue
                if start_logit_index > end_logit_index:
                    continue
                answer_len = end_logit_index - start_logit_index + 1
                if answer_len > self.args.max_answer_length:
                    continue
                potential_answers.append({'start': start_logit_index, 'end': end_logit_index,
                                          'start_logit': start_logits[start_logit_index],  # single logit score for start position at start_logit_index
                                          'end_logit': end_logits[end_logit_index]})    
        sorted_answers = sorted(potential_answers, key=lambda x: (x['start_logit'] + x['end_logit']), reverse=True) 
#             print("sorted_answers: " + str(sorted_answers))

        if len(sorted_answers) == 0:
            answers.append({'text': 'NoAnswerFound', 'score': -1000000, 'start_logit': -1000000, 'end_logit': -1000000, 'p_type_score': p_type_score})
        else:
            answer = sorted_answers[0]
            answer_token_ids = input_ids[0, answer['start']: answer['end'] + 1]

            answer_tokens = self.tokenizer.convert_ids_to_tokens(answer_token_ids.tolist())

            # remove [/sent], <t> and </t>

            for special_token in [SENT_MARKER_END, TITLE_START, TITLE_END, self.tokenizer.sep_token]:
                try:
                    if(answer_tokens[0] == special_token):
                        answer['start_logit'] = -2000000
                    elif(answer_tokens[-1] == special_token):
                        answer['end_logit'] = -2000000

                    answer_tokens.remove(special_token)
                except:
                    pass

            text = self.tokenizer.convert_tokens_to_string(answer_tokens) 
            score = (answer['start_logit'] + answer['end_logit'] + p_type_score) / 3
            answers.append({'text': text, 'score': score, 'start_logit': answer['start_logit'], 'end_logit': answer['end_logit'], 'p_type_score': p_type_score})

    elif p_type == 1: 
        answers.append({'text': 'yes', 'score': p_type_score, 'start_logit': -1000000, 'end_logit': -1000000, 'p_type_score': p_type_score})
    elif p_type == 2:
        answers.append({'text': 'no', 'score': p_type_score, 'start_logit': -1000000, 'end_logit': -1000000, 'p_type_score': p_type_score}) 
    else:
        assert False 
    
    print('answers: ', answers)
    
    # sp decode
    sent_indexes = self._get_special_index(input_ids, [SENT_MARKER_END])
    para_indexes = self._get_special_index(input_ids, [TITLE_START])

    s_to_p_map = []   
    for s in sent_indexes:
        s_to_p = torch.where(torch.le(para_indexes, s))[0][-1]     # last para_index smaller or equal to s
        s_to_p_map.append(s_to_p.item())  # [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7, 7, 7, 8, 9]
    print('s_to_p_map: ', s_to_p_map)
    if(len(s_to_p_map)>0):      # https://arxiv.org/pdf/2004.06753.pdf section 3.3
        para_sent_logits_sum = torch.tensor([], device=sp_sent_logits.device)  
        evidence_candidates = {}
        para_sents_offset = [0]
        for i in range(s_to_p_map[-1]+1):
            para_sent_logits = torch.masked_select(sp_sent_logits.squeeze(), torch.tensor([p==i for p in s_to_p_map])) 
            print("para_sent_logits: " + str(para_sent_logits))
            para_sent_logits_sum = torch.cat([para_sent_logits_sum, torch.sum(para_sent_logits).unsqueeze(0) ])
            print("para_sent_logits_sum: " + str(para_sent_logits_sum))
            para_sents_offset.append(para_sent_logits.numel()+para_sents_offset[-1])  # [0, 21, 22, 24, 25, 26, 29, 30, 34, 35, 36], one more elements than num of paras   
            print("para_sents_offset: " + str(para_sents_offset))
            evidence_candidates[i] = torch.gt(para_sent_logits, 0.1).nonzero(as_tuple=True)[0]  # 0.1 is the threshold to be a candidate sentences
            print("evidence_candidates: " + str(evidence_candidates))
        # para_sent_logits_sum: tensor([ 7.8180e-01,  6.8700e-02,  1.6170e-01,  7.4000e-02,  6.0000e-04,  2.2680e-01, -3.0400e-02,  9.3400e-02,  1.1200e-01,  1.2470e-01])
        # evidence_candidates: sentences with logits larger than threshold in each para,  [tensor([ 1,  2,  4,  5,  6,  7,  8,  9, 10, 11, 13, 14, 16, 17, 19, 20]), tensor([0]), tensor([0, 1]), tensor([0]), tensor([0]), tensor([0, 1, 2]), tensor([], dtype=torch.int64), tensor([0, 2]), tensor([0]), tensor([0])]
        sp_para_pred = para_sent_logits_sum.squeeze().topk(k=min(para_sent_logits_sum.numel(), 2)).indices  # sp are from <=2 paragraphs
        print("sp_para_pred: " + str(sp_para_pred))
        sp_sent_pred = []
        if(sp_para_pred.numel() > 1):
            for para_idx in sp_para_pred: 
                if(para_idx.item() in evidence_candidates):
                    sp_sent_pred.extend([(para_sents_offset[para_idx]+sent).item() for sent in evidence_candidates[para_idx.item()]]) 
        elif(sp_para_pred.numel()==1 and sp_para_pred.item() in evidence_candidates):
            sp_sent_pred = [(para_sents_offset[sp_para_pred]+sent).item() for sent in evidence_candidates[sp_para_pred.item()]]
    else:
        sp_sent_pred = []
        sp_para_pred = [] 
    return (answers, sp_sent_pred, sp_para_pred)

###### metrics

In [27]:
%%add_to hotpotqa


def f1_score(self, prediction, ground_truth):
    normalized_prediction = _normalize_text(prediction)
    normalized_ground_truth = _normalize_text(ground_truth)
    ZERO_METRIC = (0, 0, 0)

    if normalized_prediction in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC
    if normalized_ground_truth in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return ZERO_METRIC
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1, precision, recall


def exact_match_score(self, prediction, ground_truth):
    return int(_normalize_text(prediction) == _normalize_text(ground_truth))


def sp_metrics(self, prediction, gold): 
    tp, fp, fn = 0, 0, 0
    for e in prediction:
        if e in gold:
            tp += 1
        else:
            fp += 1 
    for e in gold:
        if e not in prediction:
            fn += 1 
    prec = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * recall / (prec + recall) if prec + recall > 0 else 0.0
    em = 1.0 if fp + fn == 0 else 0.0 
    return em, prec, recall, f1 



##### validation_end

In [28]:
%%add_to hotpotqa
# If a validation_step is not defined, this won't be called. Called at the end of the validation loop with the outputs of validation_step.
def validation_epoch_end(self, outputs):
    print("validation_epoch_end")
    avg_loss = torch.stack([x['vloss'] for x in outputs]).mean()  
    avg_answer_loss = torch.stack([x['answer_loss'] for x in outputs]).mean()  
    avg_type_loss = torch.stack([x['type_loss'] for x in outputs]).mean()  
    avg_sp_para_loss = torch.stack([x['sp_para_loss'] for x in outputs]).mean()  
    avg_sp_sent_loss = torch.stack([x['sp_sent_loss'] for x in outputs]).mean()  


    answer_scores = [x['answer_score'] for x in outputs] 
    f1_scores = [x['f1'] for x in outputs]  
    em_scores = [x['em'] for x in outputs]  
    prec_scores =  [x['prec'] for x in outputs] 
    recall_scores = [x['recall'] for x in outputs]  
    sp_sent_f1_scores = [x['sp_f1'] for x in outputs]   
    sp_sent_em_scores = [x['sp_em'] for x in outputs]   
    sp_sent_prec_scores = [x['sp_prec'] for x in outputs]   
    sp_sent_recall_scores = [x['sp_recall'] for x in outputs]   
    joint_f1_scores = [x['joint_f1'] for x in outputs]  
    joint_em_scores = [x['joint_em'] for x in outputs]  
    joint_prec_scores = [x['joint_prec'] for x in outputs]  
    joint_recall_scores = [x['joint_recall'] for x in outputs]



    print(f'before sync --> sizes:  {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}')
    if self.trainer.use_ddp:
        torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM)
        avg_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_answer_loss, op=torch.distributed.ReduceOp.SUM)
        avg_answer_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_type_loss, op=torch.distributed.ReduceOp.SUM)
        avg_type_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_sp_para_loss, op=torch.distributed.ReduceOp.SUM)
        avg_sp_para_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_sp_sent_loss, op=torch.distributed.ReduceOp.SUM)
        avg_sp_sent_loss /= self.trainer.world_size 

        answer_scores = self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float)
        f1_scores = self.sync_list_across_gpus(f1_scores, avg_loss.device, torch.float)
        em_scores = self.sync_list_across_gpus(em_scores, avg_loss.device, torch.float)
        prec_scores = self.sync_list_across_gpus(prec_scores, avg_loss.device, torch.float)
        recall_scores = self.sync_list_across_gpus(recall_scores, avg_loss.device, torch.float)

        sp_sent_f1_scores = self.sync_list_across_gpus(sp_sent_f1_scores, avg_loss.device, torch.float)
        sp_sent_em_scores = self.sync_list_across_gpus(sp_sent_em_scores, avg_loss.device, torch.float)
        sp_sent_prec_scores = self.sync_list_across_gpus(sp_sent_prec_scores, avg_loss.device, torch.float)
        sp_sent_recall_scores = self.sync_list_across_gpus(sp_sent_recall_scores, avg_loss.device, torch.float)

        joint_f1_scores = self.sync_list_across_gpus(joint_f1_scores, avg_loss.device, torch.float)
        joint_em_scores = self.sync_list_across_gpus(joint_em_scores, avg_loss.device, torch.float)
        joint_prec_scores = self.sync_list_across_gpus(joint_prec_scores, avg_loss.device, torch.float)
        joint_recall_scores = self.sync_list_across_gpus(joint_recall_scores, avg_loss.device, torch.float)


    print(f'after sync --> sizes: {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}')

    avg_val_f1 = sum(f1_scores) / len(f1_scores)
    avg_val_em = sum(em_scores) / len(em_scores)
    avg_val_prec = sum(prec_scores) / len(prec_scores)
    avg_val_recall = sum(recall_scores) / len(recall_scores)
    avg_val_sp_sent_f1 = torch.tensor(sum(sp_sent_f1_scores) / len(sp_sent_f1_scores) ).type_as(avg_loss)   
    avg_val_sp_sent_em = torch.tensor(sum(sp_sent_em_scores) / len(sp_sent_em_scores) ).type_as(avg_loss)    
    avg_val_sp_sent_prec = torch.tensor(sum(sp_sent_prec_scores) / len(sp_sent_prec_scores) ).type_as(avg_loss)   
    avg_val_sp_sent_recall = torch.tensor(sum(sp_sent_recall_scores) / len(sp_sent_recall_scores) ).type_as(avg_loss)    
    avg_val_joint_f1 = torch.tensor(sum(joint_f1_scores) / len(joint_f1_scores) ).type_as(avg_loss)  
    avg_val_joint_em = torch.tensor(sum(joint_em_scores) / len(joint_em_scores) ).type_as(avg_loss)  
    avg_val_joint_prec = torch.tensor(sum(joint_prec_scores) / len(joint_prec_scores) ).type_as(avg_loss)   
    avg_val_joint_recall = torch.tensor(sum(joint_recall_scores) / len(joint_recall_scores) ).type_as(avg_loss) 

    print("avg_loss: ", avg_loss, end = '\t')   
    print("avg_answer_loss: ", avg_answer_loss, end = '\t') 
    print("avg_type_loss: ", avg_type_loss, end = '\t') 
    # print("avg_sp_para_loss: ", avg_sp_para_loss, end = '\t')   
    # print("avg_sp_sent_loss: ", avg_sp_sent_loss)   
    print("avg_val_f1: ", avg_val_f1, end = '\t')   
    print("avg_val_em: ", avg_val_em, end = '\t')   
    print("avg_val_prec: ", avg_val_prec, end = '\t')   
    print("avg_val_recall: ", avg_val_recall)   
    print("avg_val_sp_sent_f1: ", avg_val_sp_sent_f1, end = '\t')   
    print("avg_val_sp_sent_em: " , avg_val_sp_sent_em, end = '\t')  
    print("avg_val_sp_sent_prec: ", avg_val_sp_sent_prec, end = '\t')   
    print("avg_val_sp_sent_recall: ", avg_val_sp_sent_recall)   
    print("avg_val_joint_f1: " , avg_val_joint_f1, end = '\t')  
    print("avg_val_joint_em: ", avg_val_joint_em, end = '\t')   
    print("avg_val_joint_prec: ", avg_val_joint_prec, end = '\t')   
    print("avg_val_joint_recall: ", avg_val_joint_recall)   


    logs = {'avg_val_loss': avg_loss, 'avg_val_answer_loss': avg_answer_loss, 'avg_val_type_loss': avg_type_loss, 
        'avg_val_sp_para_loss': avg_sp_para_loss, 'avg_val_sp_sent_loss': avg_sp_sent_loss,   
        'avg_val_f1': avg_val_f1 , 'avg_val_em': avg_val_em,  'avg_val_prec': avg_val_prec, 'avg_val_recall': avg_val_recall ,    
        'avg_val_sp_sent_f1': avg_val_sp_sent_f1, 'avg_val_sp_sent_em': avg_val_sp_sent_em,  'avg_val_sp_sent_prec': avg_val_sp_sent_prec, 'avg_val_sp_sent_recall': avg_val_sp_sent_recall,    
        'avg_val_joint_f1': avg_val_joint_f1, 'avg_val_joint_em': avg_val_joint_em,  'avg_val_joint_prec': avg_val_joint_prec, 'avg_val_joint_recall': avg_val_joint_recall 
    }   

    return logs


def sync_list_across_gpus(self, l, device, dtype):
    l_tensor = torch.tensor(l, device=device, dtype=dtype)
    gather_l_tensor = [torch.ones_like(l_tensor) for _ in range(self.trainer.world_size)]
    torch.distributed.all_gather(gather_l_tensor, l_tensor)
    return torch.cat(gather_l_tensor).tolist()

##### test_step

In [29]:
%%add_to hotpotqa
def test_step(self, batch, batch_nb):
    input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qid, answer = batch
#     input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, qid, answer = batch

    print("test_step of qid: ", qid, end="\t") 
    output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para)
    answer_loss, type_loss, sp_para_loss, sp_sent_loss, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output = output 
#     answer_loss, type_loss, start_logits, end_logits, type_logits = output 
    loss = answer_loss + 5*type_loss + 10*sp_para_loss + 10*sp_sent_loss

    answers_pred, sp_sent_pred, sp_para_pred = self.decode(input_ids, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output)
#     answers_pred = self.decode(input_ids, start_logits, end_logits, type_logits)

    if(len(answers_pred) != 1):
        print("len(answers_pred) != 1")
        assert(len(answers_pred) == 1)

    pre_answer_score = answers_pred[0]['score']  # (start_logit + end_logit + p_type_score) / 3
    start_logit = answers_pred[0]['start_logit']
    end_logit = answers_pred[0]['end_logit']
    type_score = answers_pred[0]['p_type_score']
    pre_answer = _normalize_text(answers_pred[0]['text'])
    # print("pred answer_score: " + str(pre_answer_score))
    # print("pred answer_text: " + str(pre_answer)) 

    gold_answer = _normalize_text(answer)
    f1, prec, recall = self.f1_score(pre_answer, gold_answer)
    em = self.exact_match_score(pre_answer, gold_answer) 
    f1 = torch.tensor(f1).type_as(loss)
    prec = torch.tensor(prec).type_as(loss)
    recall = torch.tensor(recall).type_as(loss)
    em = torch.tensor(em).type_as(loss)

    print("pre_answer:\t", pre_answer, "\tgold_answer:\t", gold_answer) 
    print("sp_sent_pred:\t", sp_sent_pred, "\tsp_sent_gold:\t", torch.where(sp_sent.squeeze())[0].tolist(), "\tsp_para_pred:\t", sp_para_pred.tolist(), "\tsp_para_gold:\t", torch.where(sp_para.squeeze())[0].tolist())

    self.logger.log_metrics({'answer_loss': answer_loss, 'type_loss': type_loss, 
                                'answer_score': pre_answer_score, 'start_logit': start_logit, 'end_logit': end_logit,  
                                'type_score': type_score,
                                'f1': f1, 'prec':prec, 'recall':recall, 'em': em 
                            }) 


    return { 'vloss': loss, 'answer_loss': answer_loss, 'type_loss': type_loss, 
             'answer_score': pre_answer_score, 'start_logit': start_logit, 'end_logit': end_logit, 'type_score': type_score,
             'f1': f1, 'prec':prec, 'recall':recall, 'em': em,
            'sp_para_loss': sp_para_loss, 'sp_sent_loss': sp_sent_loss, 
            }



##### test_end

In [30]:
%%add_to hotpotqa
def test_epoch_end(self, outputs):
    print("test_epoch_end")
    avg_loss = torch.stack([x['vloss'] for x in outputs]).mean()  
    avg_answer_loss = torch.stack([x['answer_loss'] for x in outputs]).mean()  
    avg_type_loss = torch.stack([x['type_loss'] for x in outputs]).mean()  
    avg_sp_para_loss = torch.stack([x['sp_para_loss'] for x in outputs]).mean()  
    avg_sp_sent_loss = torch.stack([x['sp_sent_loss'] for x in outputs]).mean()  

    answer_scores = [x['answer_score'] for x in outputs]  # [item for sublist in outputs for item in sublist['answer_score']] #torch.stack([x['answer_score'] for x in outputs]).mean() # 
    f1_scores = [x['f1'] for x in outputs]  
    em_scores = [x['em'] for x in outputs]  
    prec_scores =  [x['prec'] for x in outputs] 
    recall_scores = [x['recall'] for x in outputs]  

    print(f'before sync --> sizes:  {len(answer_scores)}')
    if self.trainer.use_ddp:
        torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM)
        avg_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_answer_loss, op=torch.distributed.ReduceOp.SUM)
        avg_answer_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_type_loss, op=torch.distributed.ReduceOp.SUM)
        avg_type_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_sp_para_loss, op=torch.distributed.ReduceOp.SUM)
        avg_sp_para_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_sp_sent_loss, op=torch.distributed.ReduceOp.SUM)
        avg_sp_sent_loss /= self.trainer.world_size 
        answer_scores = self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float)
        f1_scores = self.sync_list_across_gpus(f1_scores, avg_loss.device, torch.float)
        em_scores = self.sync_list_across_gpus(em_scores, avg_loss.device, torch.float)
        prec_scores = self.sync_list_across_gpus(prec_scores, avg_loss.device, torch.float)
        recall_scores = self.sync_list_across_gpus(recall_scores, avg_loss.device, torch.float)
#         int_qids = self.sync_list_across_gpus(int_qids, avg_loss.device, torch.int)
        answer_scores = self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float)


    print(f'after sync --> sizes: {len(answer_scores)}')
    print("answer_scores: ", answer_scores)
    avg_test_f1 = sum(f1_scores) / len(f1_scores)
    avg_test_em = sum(em_scores) / len(em_scores)
    avg_test_prec = sum(prec_scores) / len(prec_scores)
    avg_test_recall = sum(recall_scores) / len(recall_scores)     
    print("avg_loss: ", avg_loss, end = '\t') 
    print("avg_answer_loss: ", avg_answer_loss, end = '\t') 
    print("avg_type_loss: ", avg_type_loss, end = '\t') 
    print("avg_sp_para_loss: ", avg_sp_para_loss, end = '\t') 
    print("avg_sp_sent_loss: ", avg_sp_sent_loss, end = '\t')  

    logs = {'avg_test_loss': avg_loss, 'avg_test_answer_loss': avg_answer_loss, 'avg_test_type_loss': avg_type_loss, 
            'avg_test_f1': avg_test_f1 , 'avg_test_em': avg_test_em,  'avg_test_prec': avg_test_prec, 'avg_test_recall': avg_test_recall ,    
            'avg_val_sp_para_loss': avg_sp_para_loss, 'avg_val_sp_sent_loss': avg_sp_sent_loss
           }

    return {'avg_test_loss': avg_loss, 'log': logs}

##### add_model_specific_args

In [31]:
%%add_to hotpotqa
@staticmethod
def add_model_specific_args(parser, root_dir):
    parser.add_argument("--save_dir", type=str, default='jupyter-hotpotqa')
    parser.add_argument("--save_prefix", type=str, required=True)
    parser.add_argument("--train_dataset", type=str, required=False, help="Path to the training squad-format")
    parser.add_argument("--dev_dataset", type=str, required=True, help="Path to the dev squad-format")
    parser.add_argument("--batch_size", type=int, default=2, help="Batch size")
    parser.add_argument("--gpus", type=str, default='0',
                        help="Comma separated list of gpus. Default is gpu 0. To use CPU, use --gpus "" ")
    parser.add_argument("--warmup", type=int, default=1000, help="Number of warmup steps")
    parser.add_argument("--lr", type=float, default=0.00005, help="Maximum learning rate")
    parser.add_argument("--val_every", type=float, default=1.0, help="How often within one training epoch to check the validation set.")
    parser.add_argument("--val_percent_check", default=1.00, type=float, help='Percent of validation data used')
    parser.add_argument("--num_workers", type=int, default=4, help="Number of data loader workers")
    parser.add_argument("--seed", type=int, default=1234, help="Seed")
    parser.add_argument("--epochs", type=int, default=6, help="Number of epochs")
    parser.add_argument("--max_seq_len", type=int, default=4096,
                        help="Maximum length of seq passed to the transformer model")
    parser.add_argument("--max_doc_len", type=int, default=4096,
                        help="Maximum number of wordpieces of the input document")
    parser.add_argument("--max_num_answers", type=int, default=64,
                        help="Maximum number of answer spans per document (64 => 94%)")
    parser.add_argument("--max_question_len", type=int, default=55,
                        help="Maximum length of the question")
    parser.add_argument("--doc_stride", type=int, default=-1,
                        help="Overlap between document chunks. Use -1 to only use the first chunk")
    parser.add_argument("--ignore_seq_with_no_answers", action='store_true',
                        help="each example should have at least one answer. Default is False")
    parser.add_argument("--disable_checkpointing", action='store_true', help="No logging or checkpointing")
    parser.add_argument("--n_best_size", type=int, default=20,
                        help="Number of answer candidates. Used at decoding time")
    parser.add_argument("--max_answer_length", type=int, default=30,
                        help="maximum num of wordpieces/answer. Used at decoding time")
    parser.add_argument("--regular_softmax_loss", action='store_true', help="IF true, use regular softmax. Default is using ORed softmax loss")
    parser.add_argument("--test", action='store_true', help="Test only, no training")
    parser.add_argument("--model_path", type=str,
                        help="Path to the checkpoint directory")
    parser.add_argument("--no_progress_bar", action='store_true', help="no progress bar. Good for printing")
    parser.add_argument("--attention_mode", type=str, choices=['tvm', 'sliding_chunks'],
                        default='sliding_chunks', help='Which implementation of selfattention to use')
    parser.add_argument("--fp32", action='store_true', help="default is fp16. Use --fp32 to switch to fp32")
    parser.add_argument('--train_percent', type=float, default=1.0)
    return parser

##### class info

In [32]:
dir(hotpotqa)

['CHECKPOINT_HYPER_PARAMS_KEY',
 'CHECKPOINT_HYPER_PARAMS_NAME',
 'CHECKPOINT_HYPER_PARAMS_TYPE',
 'T_destination',
 '_LightningModule__get_hparams_assignment_variable',
 '__abstractmethods__',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_apply',
 '_auto_collect_arguments',
 '_call_impl',
 '_forward_unimplemented',
 '_get_name',
 '_get_special_index',
 '_init_slurm_connection',
 '_load_from_state_dict',
 '_load_model_state',
 '_named_members',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',


In [33]:
from inspect import getmembers, isfunction
getmembers(hotpotqa)

[('CHECKPOINT_HYPER_PARAMS_KEY', 'hyper_parameters'),
 ('CHECKPOINT_HYPER_PARAMS_NAME', 'hparams_name'),
 ('CHECKPOINT_HYPER_PARAMS_TYPE', 'hparams_type'),
 ('T_destination', ~T_destination),
 ('_LightningModule__get_hparams_assignment_variable',
  <function pytorch_lightning.core.lightning.LightningModule.__get_hparams_assignment_variable(self)>),
 ('__abstractmethods__', frozenset()),
 ('__annotations__',
  {'_device': Ellipsis, '_dtype': typing.Union[str, torch.dtype]}),
 ('__call__',
  <function torch.nn.modules.module.Module._call_impl(self, *input, **kwargs)>),
 ('__class__', abc.ABCMeta),
 ('__delattr__',
  <function torch.nn.modules.module.Module.__delattr__(self, name)>),
 ('__dict__',
  mappingproxy({'__module__': '__main__',
                '__init__': <function __main__.hotpotqa.__init__(self, args)>,
                'load_model': <function __main__.hotpotqa.load_model(self)>,
                'train_dataloader': <function pytorch_lightning.core.decorators.data_loader.<local

In [34]:
functions_list = [o for o in getmembers(hotpotqa) if isfunction(o[1])]
functions_list

[('_LightningModule__get_hparams_assignment_variable',
  <function pytorch_lightning.core.lightning.LightningModule.__get_hparams_assignment_variable(self)>),
 ('__call__',
  <function torch.nn.modules.module.Module._call_impl(self, *input, **kwargs)>),
 ('__delattr__',
  <function torch.nn.modules.module.Module.__delattr__(self, name)>),
 ('__dir__', <function torch.nn.modules.module.Module.__dir__(self)>),
 ('__getattr__',
  <function torch.nn.modules.module.Module.__getattr__(self, name:str) -> Union[torch.Tensor, _ForwardRef('Module')]>),
 ('__init__', <function __main__.hotpotqa.__init__(self, args)>),
 ('__repr__', <function torch.nn.modules.module.Module.__repr__(self)>),
 ('__setattr__',
  <function torch.nn.modules.module.Module.__setattr__(self, name:str, value:Union[torch.Tensor, _ForwardRef('Module')]) -> None>),
 ('__setstate__',
  <function torch.nn.modules.module.Module.__setstate__(self, state)>),
 ('_apply', <function torch.nn.modules.module.Module._apply(self, fn)>),


In [35]:
import inspect
inspect.getmro(hotpotqa)  # a hierarchy of classes 

(__main__.hotpotqa,
 pytorch_lightning.core.lightning.LightningModule,
 abc.ABC,
 pytorch_lightning.utilities.device_dtype_mixin.DeviceDtypeModuleMixin,
 pytorch_lightning.core.grads.GradInformation,
 pytorch_lightning.core.saving.ModelIO,
 pytorch_lightning.core.hooks.ModelHooks,
 torch.nn.modules.module.Module,
 object)

In [36]:
help(hotpotqa.configure_optimizers)

Help on function configure_optimizers in module __main__:

configure_optimizers(self)



In [37]:
# import inspect
# code, line_no = inspect.getsourcelines(hotpotqa.training_step)
# print(''.join(code))

### main

In [38]:
def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
    if not args.test:     # if it needs to train, remove exsiting folder
        import shutil
        save_folder = os.path.join(args.save_dir, args.save_prefix)
        if os.path.exists(save_folder):
            shutil.rmtree(save_folder, ignore_errors=True)  #delete non-empty folder 
        
    import shutil
    save_folder = os.path.join(args.save_dir, args.save_prefix)
    if os.path.exists(save_folder):
        shutil.rmtree(save_folder, ignore_errors=True)  #delete non-empty folder


In [41]:
    hotpotqa.__abstractmethods__=set()   # without this, got an error "Can't instantiate abstract class hotpotqa with abstract methods" if these two abstract methods are not implemented in the same cell where class hotpotqa defined 
    model = hotpotqa(args)
#     model.to('cuda')    # this is necessary to use gpu
    

self.args.model_path:  /xdisk/msurdeanu/fanluo/hotpotQA/longformer-base-4096
Loaded model with config:
LongformerConfig {
  "attention_dilation": [
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1
  ],
  "attention_mode": "sliding_chunks",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256
  ],
  "autoregressive": false,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 50265
}



In [42]:
    logger = TestTubeLogger( # The TestTubeLogger adds a nicer folder structure to manage experiments and snapshots all hyperparameters you pass to a LightningModule.
        save_dir=args.save_dir,
        name=args.save_prefix,
        version=0  # always use version=0
    )

In [43]:
    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(args.save_dir, args.save_prefix, "checkpoints"),
        save_top_k=5,
        verbose=True,
        monitor='avg_val_f1',
        mode='max',
        prefix=''
    )

In [44]:
    train_set_size = 9 * args.train_percent # 90447 * args.train_percent   # hardcode dataset size. Needed to compute number of steps for the lr scheduler
    print("train_set_size: ", train_set_size) 

    args.gpus = [int(x) for x in args.gpus.split(',')] if args.gpus!='' else None
    num_devices = 1 or len(args.gpus)
    print("num_devices: ", num_devices)

    train_set_size = 90447 * args.train_percent    # hardcode dataset size. Needed to compute number of steps for the lr scheduler
    args.steps = args.epochs * train_set_size / (args.batch_size * num_devices)

    print(f'>>>>>>> #train_set_size: {train_set_size}, #steps: {args.steps},  #warmup steps: {args.warmup}, #epochs: {args.epochs}, batch_size: {args.batch_size * num_devices} <<<<<<<')

train_set_size:  9.0
num_devices:  1
>>>>>>> #train_set_size: 90447.0, #steps: 271341.0,  #warmup steps: 1000, #epochs: 6, batch_size: 2 <<<<<<<


In [45]:
    trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp', # if args.gpus and (len(args.gpus) > 1) else None,
                             track_grad_norm=-1, max_epochs=args.epochs, early_stop_callback=None, replace_sampler_ddp=False,
                             accumulate_grad_batches=args.batch_size,
                             train_percent_check = args.train_percent,
        #                          val_check_interval=args.val_every,
                             val_percent_check=args.val_percent_check,
                             test_percent_check=args.val_percent_check,
                             logger=logger if not args.disable_checkpointing else False,
                             checkpoint_callback=checkpoint_callback if not args.disable_checkpointing else False,
                             show_progress_bar=args.no_progress_bar,
                             use_amp=not args.fp32, 
                             amp_level='O2',
#                              check_val_every_n_epoch=1
                             )


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Multi-processing is handled by Slurm.
CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.


In [46]:
#     if not args.test: 
trainer.fit(model)


initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/1
----------------------------------------------------------------------------------------------------
distributed_backend=ddp
All DDP processes registered. Starting ddp with 1 processes
----------------------------------------------------------------------------------------------------
Set SLURM handle signals.

  | Name        | Type       | Params
-------------------------------------------
0 | model       | Longformer | 148 M 
1 | qa_outputs  | Linear     | 1 K   
2 | linear_type | Linear     | 2 K   
3 | fnn_sp_sent | Sequential | 591 K 
4 | fnn_sp_para | Sequential | 591 K 


reading file: small.json


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

validation_step
qid:  5a879ab05542996e4f30887e
input_ids:  tensor([[    0, 50268, 24761,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 19, 117, 148, 310, 424, 497, 690, 746, 834, 880], device='cuda:0')
sp_para_output:  tensor([[[ 5.4706e-02, -1.5538e-01,  3.4885e-01,  ..., -2.5452e-01,
          -1.9276e-01,  2.0085e-01],
         [-4.6809e-02, -3.1444e-01,  3.3644e-02,  ..., -2.7567e-01,
          -2.7047e-01, -1.7898e-02],
         [ 9.5809e-02,  2.6476e-01,  2.1300e-04,  ...,  2.4332e-01,
          -2.1853e-01,  3.7953e-01],
         ...,
         [-1.4013e-01,  7.2769e-02,  6.7755e-02,  ..., -5.8887e-01,
           1.2882e-01,  2.1166e-01],
         [-1.9286e-01,  2.8433e-01,  1.2068e-01,  ..., -6.7525e-01,
           5.1510e-01,  3.2453e-02],
         [-9.4074e-02, -2.1625e-02,  2.9422e-01,  ..., -2.7439e-01,
          -1.3567e-01,  3.9778e-02]]], device='cuda:0')
sp_para_output_t:  tensor([[[ 0.0200],
         [ 0.0533],
         [ 0.0834],
         [-0.05

validation_step
qid:  5a8d7341554299441c6b9fe5
input_ids:  tensor([[    0, 50268, 13792,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  29,  186,  346,  509,  663,  734, 1082, 1242, 1484, 1664],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.1022, -0.0290,  0.0089,  ..., -0.5491, -0.2408,  0.1749],
         [ 0.1352,  0.1677, -0.0301,  ..., -0.6092, -0.4661,  0.0331],
         [ 0.1261,  0.1799,  0.1295,  ..., -0.7442, -0.1860,  0.2010],
         ...,
         [ 0.0568,  0.0369,  0.0486,  ..., -0.3320, -0.3804,  0.2190],
         [ 0.4530, -0.0914,  0.0478,  ..., -0.4651, -0.2001,  0.1138],
         [ 0.3283, -0.2592, -0.0527,  ..., -0.6081, -0.3060, -0.0805]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[0.0389],
         [0.0751],
         [0.0318],
         [0.1155],
         [0.0593],
         [0.0839],
         [0.0490],
         [0.0625],
         [0.0841],
         [0.0511]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  tor

validation_epoch_end
before sync --> sizes:  2, 2, 2
after sync --> sizes: 2, 2, 2
avg_loss:  tensor(28.4462, device='cuda:0')	avg_answer_loss:  tensor(6.8955, device='cuda:0')	avg_type_loss:  tensor(1.4930, device='cuda:0')	avg_val_f1:  0.0	avg_val_em:  0.0	avg_val_prec:  0.0	avg_val_recall:  0.0
avg_val_sp_sent_f1:  tensor(0., device='cuda:0')	avg_val_sp_sent_em:  tensor(0., device='cuda:0')	avg_val_sp_sent_prec:  tensor(0., device='cuda:0')	avg_val_sp_sent_recall:  tensor(0., device='cuda:0')
avg_val_joint_f1:  tensor(0., device='cuda:0')	avg_val_joint_em:  tensor(0., device='cuda:0')	avg_val_joint_prec:  tensor(0., device='cuda:0')	avg_val_joint_recall:  tensor(0., device='cuda:0')
reading file: small.json


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

input_ids:  tensor([[    0, 50268, 24761,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 19, 117, 148, 310, 424, 497, 690, 746, 834, 880], device='cuda:0')
sp_para_output:  tensor([[[ 0.0082,  0.0719,  0.2380,  ..., -0.5333, -0.1460,  0.1197],
         [-0.4725, -0.2648,  0.0687,  ...,  0.1257, -0.0669,  0.1381],
         [ 0.1906,  0.3724, -0.2047,  ...,  0.1236, -0.3601,  0.3201],
         ...,
         [-0.3070,  0.1384, -0.0398,  ..., -0.3126,  0.1217,  0.2014],
         [-0.1923,  0.2210,  0.0426,  ..., -0.2007, -0.4245, -0.0064],
         [ 0.1185,  0.0360,  0.4657,  ..., -0.0626, -0.3391,  0.2227]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.0115],
         [ 0.0360],
         [ 0.0854],
         [ 0.0357],
         [ 0.0052],
         [ 0.0198],
         [ 0.0256],
         [-0.0456],
         [-0.0083],
         [ 0.0802]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  t



input_ids:  tensor([[    0, 50268, 12196, 26241,    21,  1236, 12336, 29227,  1506,  7259,
           268,  1141, 50269,     2, 50265, 13200,  4306,    50,    42, 25763,
          1848,   232, 50266, 13200,  4306,    50,    42, 25763,  1848,   232,
            16,  9031,   493,  1182, 15919, 25032,  5808,  1982,    30, 29227,
          1506,  7259,   254,    11, 35348,  2517,  3225,   223, 28899,     9,
          5808,  1982,    30,    39,  1141,  1236,  4438, 50267,  1040,   439,
         43179,   454,  8548,  3620,   107,    71,    24,    21,  1982,     8,
           316,   107,    71,  7259,   254,    17,    27,    29,   744, 50267,
            24,    16,  3924,  1687,     7,    28,     9,   773,    55,    25,
           892,     9,  7259,   254,    17,    27,    29, 11419,   434,    87,
            25, 11314,  2125,     9, 11845, 50267,  1437,     2, 50265, 48841,
         35678,   261,    30,  6414,   504,  5243, 50266, 48841, 35678,   261,
            30,  6414,     9,   504,  52

input_ids:  tensor([[    0, 50268,   438,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 20, 125, 185, 346, 416, 577, 675, 764, 808, 950], device='cuda:0')
sp_para_output:  tensor([[[ 0.4456, -0.1763, -0.0232,  ..., -0.0736, -0.3650, -0.1497],
         [ 0.1885,  0.3640,  0.0586,  ..., -0.2565,  0.1183, -0.3278],
         [ 0.1585,  0.2182,  0.0884,  ...,  0.3811, -0.2520, -0.1700],
         ...,
         [ 0.0291,  0.5402,  0.1409,  ..., -0.3148,  0.0525,  0.0923],
         [-0.1543,  0.1070,  0.3409,  ...,  0.2471, -0.1638, -0.2244],
         [ 0.2827,  0.6781,  0.1196,  ...,  0.3327, -0.3066,  0.0122]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.0096],
         [-0.0160],
         [-0.0231],
         [-0.0413],
         [-0.0344],
         [-0.0139],
         [-0.0017],
         [ 0.0275],
         [-0.0724],
         [-0.0612]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  t

logits:  tensor([[ 8.6243e-02, -5.3711e-01, -2.0740e-01, -1.8030e-01, -1.6211e-01,
          7.8491e-02, -1.0577e-01, -6.6711e-02, -1.1823e-01,  3.1647e-02,
         -4.7656e-01, -2.4487e-01, -1.5759e-01, -1.5186e-01, -1.2598e-01,
         -6.1963e-01, -6.0730e-02, -4.4336e-01, -2.0520e-01, -3.4692e-01,
         -4.7998e-01, -2.6807e-01,  5.9128e-03, -1.2286e-01, -7.0557e-02,
          4.6921e-03, -1.0138e-01, -6.6345e-02, -5.9738e-03,  6.7825e-03,
         -9.1858e-02, -1.0992e-01,  6.2439e-02,  4.6310e-03,  1.5488e-02,
          5.9174e-02, -1.2262e-01, -5.2917e-02, -1.9189e-01, -6.6211e-01,
         -7.1899e-02, -1.2286e-01,  1.4270e-01, -4.0210e-01, -3.1299e-01,
         -7.7588e-01, -5.7080e-01, -4.5923e-01, -1.6821e-01, -1.1359e-01,
         -2.6932e-02, -1.4783e-01, -3.5449e-01,  1.0638e-01, -2.7905e-01,
         -1.9995e-01,  1.0785e-01, -3.0396e-01, -4.0588e-02, -3.0441e-02,
         -1.3098e-01, -6.9458e-02, -3.8477e-01, -5.0232e-02, -1.0809e-01,
         -1.4233e-01,  1.5434

sp_para_output:  tensor([[[ 0.0680,  0.1125, -0.0367,  ..., -0.6481, -0.1882,  0.1624],
         [ 0.0428,  0.4086,  0.0937,  ..., -0.3477, -0.6739,  0.2265],
         [-0.1094,  0.0565, -0.0288,  ..., -0.4599, -0.0517,  0.2079],
         ...,
         [ 0.0293,  0.2037, -0.1003,  ...,  0.0287, -0.2246,  0.2885],
         [-0.0019, -0.2994, -0.1558,  ...,  0.2006, -0.1944,  0.2424],
         [ 0.2322, -0.2457,  0.0616,  ...,  0.1121, -0.3075,  0.2188]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[ 0.0365],
         [ 0.0342],
         [ 0.0193],
         [-0.0095],
         [ 0.0129],
         [-0.0170],
         [-0.1013],
         [-0.0184],
         [ 0.0237],
         [ 0.1244]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 1338])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1338])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[551

logits:  tensor([[ 8.6975e-02, -3.9429e-01, -5.8655e-02, -7.0605e-01, -1.5979e-01,
         -2.9199e-01, -1.1920e-01, -6.7188e-01, -3.5181e-01, -3.5610e-03,
         -2.3267e-01, -2.4365e-01, -4.0210e-01, -7.5684e-02, -2.4951e-01,
         -1.4355e-01,  2.7023e-02, -2.1667e-01,  8.6731e-02, -1.8079e-01,
          2.9688e-01, -2.3389e-01, -3.5425e-01,  2.7618e-02, -2.2241e-01,
         -2.5146e-01, -3.9722e-01, -3.6987e-01, -3.5828e-02, -3.0444e-01,
         -2.2778e-01, -1.9336e-01, -1.3269e-01,  1.4099e-01, -1.0529e-01,
         -1.1554e-01,  4.4525e-02,  5.3177e-03,  8.8440e-02,  4.7314e-01,
          6.0364e-02,  1.4258e-01, -7.9773e-02,  1.4307e-01, -1.7114e-01,
          5.5313e-03, -3.7231e-01, -3.7817e-01, -4.7949e-01, -1.4990e-01,
         -1.4600e-01, -1.1566e-01, -2.7319e-01, -1.3660e-01, -1.1603e-01,
          3.6963e-01, -1.5186e-01, -2.5244e-01, -1.3206e-02, -2.7466e-01,
         -3.9404e-01,  1.1682e-01, -1.2433e-01, -4.3640e-02, -7.2510e-02,
         -3.5962e-01, -5.0098

sp_para_output:  tensor([[[ 0.1283,  0.1412,  0.0957,  ..., -0.8707, -0.0167, -0.2518],
         [ 0.1542, -0.2270,  0.1322,  ..., -0.3433,  0.0799,  0.3737],
         [ 0.1466,  0.1123, -0.1020,  ..., -0.2437, -0.3013, -0.0272],
         ...,
         [-0.0484,  0.1713, -0.0401,  ..., -0.6337,  0.0221, -0.1048],
         [ 0.1655,  0.0410,  0.2366,  ..., -0.6278, -0.5289,  0.1084],
         [ 0.0123,  0.0811,  0.1257,  ..., -0.4059, -0.1497, -0.1685]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[ 0.0441],
         [ 0.1068],
         [ 0.0769],
         [-0.0251],
         [-0.0166],
         [ 0.0506],
         [ 0.1299],
         [ 0.0668],
         [ 0.1993],
         [ 0.0967]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 1206])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1206])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[110

input_ids:  tensor([[    0, 50268, 12196,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  22,  123,  225,  306,  392,  519,  575,  803,  922, 1002],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.1336,  0.3324,  0.1681,  ..., -0.7637,  0.0610,  0.1073],
         [ 0.0695,  0.1394,  0.0579,  ..., -0.5199, -0.1725,  0.1277],
         [ 0.0350,  0.0217,  0.0567,  ..., -0.6449,  0.0340,  0.1638],
         ...,
         [ 0.1755,  0.2698, -0.2663,  ..., -0.3152, -0.3696,  0.2511],
         [-0.0021,  0.4027,  0.0333,  ..., -1.0715, -0.2174, -0.0251],
         [ 0.1478,  0.0173, -0.0028,  ..., -0.4742, -0.1046,  0.1628]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[ 0.0190],
         [ 0.0172],
         [ 0.0488],
         [-0.0287],
         [-0.0130],
         [-0.0064],
         [ 0.0527],
         [-0.0396],
         [ 0.0502],
         [ 0.0204]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268,  6588,   102, 11356,   102, 47510,  3250,  2214,  9408,
             8,  1421,  2642, 29219,   346,  1264,   881,    92,  1492,    16,
          2214,    30, 47510,  3250,   385,  4324, 11356,   102,    31,    69,
         14887, 31386,  2453,  4535,  2642,   703,    11,    99,    76, 50269,
             2, 50265,    94,  3836,   385,  4324, 11356,   102,  2214, 50266,
            94,  3836,    16,  2214,    30, 47510,  3250,   385,  4324, 11356,
           102,    31,    69, 14887, 31386,  2453,  4535,  2642,   193, 50267,
         11356,   102,   875,  2214,    19,  1149,  2457,   449,  3979,   449,
          3979,  2262,   118,  1350,     8, 15079,   857,   910,  9422,    19,
           449,  3979,  2262,   118,  1350,  5516,  3686,   931, 50267,    24,
            21,   703,    15,   361, 10668,   428, 48540,   336,    25, 11794,
           200,   881, 50267,  1349,  2092,    15,  2424, 39210,  5403,     9,
           385,  4324, 11356,   102, 502

input_ids:  tensor([[    0, 50268,  8015,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  31,   87,  154,  235,  298,  423,  471,  555,  788, 1046],
       device='cuda:0')
sp_para_output:  tensor([[[-2.1465e-02,  4.7597e-02,  1.4238e-01,  ..., -4.0550e-01,
          -1.8167e-01,  5.6836e-02],
         [ 6.8437e-02, -1.6103e-01, -2.3555e-01,  ..., -4.0803e-01,
          -1.2934e-02,  7.7159e-03],
         [ 5.2226e-02,  1.1833e-01,  3.8024e-02,  ..., -1.3814e-01,
          -1.4397e-01,  5.7350e-01],
         ...,
         [-5.1848e-02, -1.2810e-01,  4.8020e-02,  ..., -4.5231e-02,
           3.5037e-03,  1.8938e-01],
         [-2.8579e-01, -9.0410e-02, -5.4582e-04,  ..., -1.6164e-01,
          -2.6741e-01, -1.1251e-01],
         [ 3.3560e-01, -3.5267e-01, -3.3346e-04,  ...,  2.5118e-03,
           1.6809e-02,  2.2201e-01]]], device='cuda:0',
       grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.0192],
         [ 0.0242],
         [ 0.0022],
         [ 0.

logits:  tensor([[ 2.7905e-01,  6.8555e-01,  2.8296e-01,  6.2109e-01,  3.4570e-01,
          3.2446e-01, -1.2042e-01,  1.6724e-01,  3.7549e-01,  3.0615e-01,
          2.3230e-01,  3.3740e-01,  6.4844e-01,  2.9272e-01,  1.7834e-01,
          1.3696e-01,  7.1973e-01,  2.9688e-01,  1.8750e-01,  4.5361e-01,
          6.1230e-01,  3.6377e-01,  5.4785e-01,  4.8340e-01,  5.7959e-01,
          3.7817e-01,  6.1035e-01,  2.5562e-01,  4.6143e-01,  4.5605e-01,
          5.4492e-01,  4.3433e-01,  2.1997e-01,  3.5913e-01,  3.4180e-01,
          1.3904e-01,  3.9038e-01,  4.6313e-01,  2.3718e-01,  5.9906e-02,
          1.7688e-01, -4.3365e-02, -7.2754e-02,  1.2732e-01,  1.9238e-01,
          2.9590e-01,  7.9834e-01,  2.6782e-01,  5.6055e-01,  4.0063e-01,
          3.7842e-01,  5.5322e-01,  3.3350e-01,  5.5566e-01,  3.8647e-01,
          1.3086e-01,  1.4795e-01,  4.1040e-01,  4.6899e-01,  5.6250e-01,
          3.7964e-01,  5.9863e-01,  2.5415e-01,  2.3010e-01,  7.3291e-01,
          4.5044e-01,  1.5771

input_ids:  tensor([[    0, 50268,  5488, 38187,   260,  8676,   222,   218,   895,  1780,
          4209, 50269,     2, 50265,   475,  4348,   385,  8974,  5691, 50266,
           475,  4348,   385,  8974,  5691,  2421, 31025,    16, 38187,   260,
          8676,     8,  1632,  5799,  8298,    31, 16080,   895, 50267, 26232,
            37,  1665,    25,   919,     9, 16080,   895,   194, 22437,  4561,
           982,   974,   212, 22437,  1418,    11,   425, 50267,   385,  8974,
          5691,  1665,    25,  5688,   884,    11, 16080,   895, 22437, 50267,
          2052,     7,   145,  2736,     7, 16080,   895, 22437,   385,  8974,
          5691,  1665,    11,   194,   790,    31, 15077,     7,  8548, 50267,
            37,  3562,  2052,     7,  2266,  1727,     8,    21,  4209,    30,
         44009,   181,  1368,  4291,  1344, 50267,  1437,     2, 50265,   475,
          8616,  2463,   218,   895,  1780, 50266,   475,  8616,  2463,   218,
           895,  1780,   842,  3320, 225

input_ids:  tensor([[    0, 50268,  5488,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  24,  145,  383,  527,  607,  680,  846,  944,  999, 1072],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.2711,  0.0766, -0.0404,  ..., -0.2340, -0.0678, -0.2027],
         [ 0.2989, -0.0561, -0.2971,  ..., -0.3284, -0.1061, -0.0629],
         [-0.1440,  0.0368, -0.1838,  ..., -0.8559, -0.1021,  0.1196],
         ...,
         [ 0.0480,  0.0921,  0.0208,  ...,  0.0354, -0.3449, -0.1266],
         [ 0.1482, -0.2100, -0.1496,  ..., -0.4224, -0.1854, -0.0964],
         [ 0.1819,  0.1221, -0.0298,  ..., -0.4820, -0.4675, -0.1058]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[ 0.0745],
         [ 0.0064],
         [-0.0229],
         [ 0.2170],
         [ 0.0746],
         [-0.0887],
         [-0.0047],
         [ 0.0047],
         [ 0.0667],
         [-0.0194]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268,  4651,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  31,   92,  193,  291,  328,  448,  602,  660,  983, 1154],
       device='cuda:0')
sp_para_output:  tensor([[[-0.0659,  0.0303,  0.0852,  ..., -1.1116, -0.3138,  0.1394],
         [ 0.3416,  0.0598, -0.0064,  ..., -0.7952, -0.4330,  0.0587],
         [ 0.0676, -0.1207,  0.1481,  ..., -0.3235, -0.2529,  0.0873],
         ...,
         [ 0.2218,  0.2124, -0.0441,  ..., -0.4982, -0.4281,  0.0360],
         [ 0.1032, -0.0306,  0.1138,  ..., -0.3155, -0.1042, -0.0301],
         [ 0.2829, -0.0401, -0.0580,  ..., -0.6398, -0.2050,  0.3655]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[ 0.0290],
         [ 0.1068],
         [-0.0176],
         [-0.0334],
         [-0.0331],
         [-0.0254],
         [ 0.0837],
         [ 0.0533],
         [ 0.0357],
         [ 0.0129]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268, 10569,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  17,  164,  219,  457,  591,  882, 1138, 1299, 1438, 1534],
       device='cuda:0')
sp_para_output:  tensor([[[ 1.8241e-01, -1.4891e-01,  2.4945e-01,  ..., -9.6877e-01,
          -1.6171e-01,  2.0469e-01],
         [ 2.6505e-01,  2.3270e-02, -5.8151e-03,  ..., -4.5993e-01,
           9.3004e-02,  3.8258e-02],
         [ 9.6756e-02,  1.4527e-02, -9.9018e-02,  ..., -5.8512e-01,
           3.8614e-02,  1.1983e-01],
         ...,
         [ 5.8738e-02, -6.7621e-02, -6.5896e-02,  ..., -4.4699e-01,
          -1.4263e-02, -3.7104e-03],
         [ 2.6950e-01, -4.7719e-04,  2.0473e-01,  ..., -7.7463e-01,
          -1.7087e-01, -1.8241e-01],
         [-9.4009e-03,  8.5953e-02,  1.2350e-01,  ..., -4.9501e-01,
          -5.7526e-02, -4.3532e-02]]], device='cuda:0',
       grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[ 0.0567],
         [-0.0040],
         [ 0.0445],
         [ 0.

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

validation_step
qid:  5a879ab05542996e4f30887e
input_ids:  tensor([[    0, 50268, 24761,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 19, 117, 148, 310, 424, 497, 690, 746, 834, 880], device='cuda:0')
sp_para_output:  tensor([[[ 0.0559, -0.1418,  0.3471,  ..., -0.2158, -0.2069,  0.1851],
         [-0.0427, -0.3006,  0.0327,  ..., -0.2294, -0.2789, -0.0275],
         [ 0.0993,  0.2708,  0.0022,  ...,  0.2699, -0.2187,  0.3729],
         ...,
         [-0.1390,  0.0811,  0.0631,  ..., -0.5558,  0.1215,  0.2010],
         [-0.1896,  0.2902,  0.1180,  ..., -0.6545,  0.5086,  0.0233],
         [-0.0927, -0.0123,  0.2885,  ..., -0.2362, -0.1491,  0.0207]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0030],
         [ 0.0299],
         [ 0.0679],
         [-0.0746],
         [-0.0391],
         [ 0.0435],
         [-0.0266],
         [-0.0312],
         [-0.0577],
         [ 0.0264]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size

sp_para_output:  tensor([[[ 0.1094, -0.0175,  0.0025,  ..., -0.5082, -0.2481,  0.1618],
         [ 0.1434,  0.1823, -0.0349,  ..., -0.5685, -0.4697,  0.0242],
         [ 0.1393,  0.1924,  0.1211,  ..., -0.6980, -0.1956,  0.1872],
         ...,
         [ 0.0658,  0.0424,  0.0393,  ..., -0.2976, -0.3847,  0.2081],
         [ 0.4633, -0.0824,  0.0394,  ..., -0.4319, -0.2135,  0.0998],
         [ 0.3340, -0.2535, -0.0512,  ..., -0.5694, -0.3156, -0.0911]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[0.0160],
         [0.0497],
         [0.0012],
         [0.0924],
         [0.0322],
         [0.0573],
         [0.0250],
         [0.0364],
         [0.0594],
         [0.0273]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1810])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1810])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[707,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[-0.1963,  0.2057, -0.0567,  ...,  0.1730,  0.0762,  0.0448],
         [ 0.0358,  0.3019, -0.1001,  ..., -0.1915,  0.0037,  0.2160],
         [ 0.1137,  0.0981, -0.0269,  ..., -0.9151, -0.3209,  0.0984],
         ...,
         [-0.0040,  0.0244,  0.0795,  ..., -0.1934, -0.3553,  0.0573],
         [-0.0577,  0.4538, -0.1598,  ..., -0.3199, -0.0795, -0.0675],
         [ 0.0536,  0.1029,  0.0519,  ..., -0.6355, -0.4005, -0.0237]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0284],
         [-0.0929],
         [-0.0117],
         [-0.0045],
         [-0.0057],
         [ 0.0033],
         [-0.0588],
         [ 0.0524],
         [-0.0599],
         [-0.0319]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 783])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 783])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[448,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

sp_para_output:  tensor([[[ 0.1750, -0.0121,  0.1452,  ...,  0.2263, -0.1886,  0.0381],
         [ 0.1144,  0.1081,  0.0997,  ...,  0.0125,  0.1001, -0.0803],
         [ 0.1251,  0.1800,  0.2139,  ...,  0.0143, -0.1324, -0.0664],
         ...,
         [-0.0427,  0.2299,  0.2609,  ..., -0.0170, -0.1162,  0.1184],
         [-0.1093,  0.2380,  0.0699,  ..., -0.1159, -0.4829, -0.3515],
         [ 0.1661, -0.0357,  0.2417,  ...,  0.3269, -0.3539, -0.1089]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0513],
         [-0.0500],
         [-0.0573],
         [-0.0196],
         [-0.0328],
         [-0.0267],
         [-0.0641],
         [-0.0231],
         [-0.0602],
         [-0.0894]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1098])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1098])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 63, 299, 583, 586, 589, 594, 667, 670,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.1980,  0.0589, -0.0214,  ..., -0.1236, -0.2998, -0.1076],
         [-0.0561,  0.2632,  0.0046,  ..., -0.3893,  0.1180,  0.3905],
         [ 0.2434,  0.2244,  0.0749,  ..., -0.2319, -0.4432, -0.2348],
         ...,
         [ 0.2138,  0.0559, -0.0819,  ..., -0.3699, -0.4306, -0.1853],
         [ 0.1800, -0.0774,  0.1975,  ..., -0.3492, -0.3607,  0.0469],
         [ 0.2486,  0.1541, -0.0471,  ..., -0.5024, -0.4033, -0.1863]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0360],
         [-0.0589],
         [ 0.0035],
         [-0.0111],
         [-0.0173],
         [-0.0273],
         [ 0.0105],
         [-0.0660],
         [ 0.0522],
         [ 0.0174]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1681])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1681])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[955, 961,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.1206,  0.3484,  0.1939,  ...,  0.0037,  0.0813,  0.0131],
         [-0.2079,  0.4304, -0.2877,  ..., -0.6767,  0.3123, -0.1621],
         [-0.0606,  0.0485, -0.1510,  ..., -0.0655, -0.0297, -0.2533],
         ...,
         [-0.0703,  0.2027, -0.2037,  ..., -0.1589, -0.0663, -0.1227],
         [-0.0589,  0.2694, -0.0239,  ..., -0.1567, -0.1887,  0.1265],
         [-0.2561,  0.0577, -0.1282,  ..., -0.1200, -0.0852,  0.0894]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[ 1.6953e-02],
         [ 4.9805e-02],
         [ 1.6464e-02],
         [-2.2385e-02],
         [ 9.3699e-05],
         [ 1.7792e-02],
         [-1.0452e-02],
         [ 3.6285e-02],
         [-1.7776e-02],
         [-3.5217e-02]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 839])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 839])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[431, 642, 795,   0,  

sp_para_output:  tensor([[[-0.0206,  0.0267,  0.0717,  ..., -0.2302, -0.4963, -0.0078],
         [-0.1042,  0.0328, -0.0275,  ..., -0.1432, -0.2008,  0.0373],
         [-0.0165,  0.4253, -0.0713,  ..., -0.7315, -0.0052,  0.1630],
         ...,
         [-0.0411,  0.2064,  0.1267,  ..., -0.2154, -0.3083,  0.0868],
         [-0.0841, -0.2466, -0.0262,  ..., -0.7480, -0.2468,  0.2380],
         [ 0.1048, -0.2189,  0.0839,  ..., -0.0364, -0.2595,  0.1892]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0056],
         [ 0.0516],
         [-0.0275],
         [ 0.0084],
         [-0.0085],
         [ 0.0008],
         [-0.0478],
         [-0.0241],
         [ 0.0759],
         [ 0.0526]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1338])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1338])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[551, 556, 827,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.0446, -0.0151, -0.0490,  ..., -0.1755, -0.2593,  0.1305],
         [ 0.3026,  0.0916, -0.1217,  ..., -0.5165, -0.1533, -0.1551],
         [ 0.0903,  0.1863,  0.0204,  ..., -0.4815,  0.0097, -0.1964],
         ...,
         [ 0.2918, -0.1333,  0.1401,  ..., -0.1480, -0.1536,  0.1073],
         [ 0.3321,  0.0293, -0.1041,  ..., -0.4352, -0.2599, -0.2251],
         [-0.0164,  0.4761,  0.0656,  ..., -0.4854, -0.2042, -0.0610]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[ 0.0170],
         [ 0.0640],
         [-0.0044],
         [-0.0058],
         [ 0.0204],
         [-0.0154],
         [-0.0162],
         [ 0.0020],
         [ 0.1199],
         [ 0.0553]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1340])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1340])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[1173,    0,    0,    0,    0,    0,    0,    0,    0,    0,

sp_para_output:  tensor([[[-0.1643,  0.2312, -0.0998,  ..., -0.3308,  0.0742, -0.1627],
         [-0.0794, -0.4565, -0.4763,  ..., -0.2187,  0.0606,  0.2139],
         [ 0.0110, -0.2773,  0.2597,  ...,  0.3469, -0.6274, -0.1012],
         ...,
         [ 0.0200, -0.0281,  0.0457,  ...,  0.1580, -0.3293,  0.2461],
         [-0.0155,  0.1447,  0.1648,  ..., -0.2573, -0.2658, -0.0185],
         [-0.2068, -0.2260,  0.2520,  ...,  0.3615, -0.3658, -0.1094]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[ 0.0084],
         [ 0.0055],
         [ 0.0671],
         [ 0.0170],
         [ 0.0044],
         [ 0.0773],
         [ 0.0826],
         [ 0.0508],
         [-0.0311],
         [ 0.0244]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 988])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 988])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[93,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  

sp_para_output:  tensor([[[ 0.2738,  0.0379,  0.0304,  ..., -0.3618, -0.1010, -0.0937],
         [ 0.1120,  0.2339, -0.0497,  ..., -0.6682, -0.1864,  0.0080],
         [ 0.0594,  0.0507, -0.1015,  ..., -0.0551, -0.2809, -0.0924],
         ...,
         [-0.0118, -0.0730,  0.1130,  ..., -0.3116,  0.1071,  0.1751],
         [ 0.0108,  0.2587, -0.0556,  ..., -0.4011, -0.0593,  0.0456],
         [-0.0117,  0.0896, -0.0502,  ..., -0.2382, -0.3014, -0.1091]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[ 0.0209],
         [ 0.0544],
         [ 0.0851],
         [-0.0703],
         [-0.0324],
         [-0.0243],
         [ 0.0234],
         [ 0.0158],
         [ 0.0759],
         [ 0.0649]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1206])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1206])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[110,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.0911, -0.0651,  0.1207,  ..., -0.2794, -0.1014,  0.0313],
         [ 0.0602, -0.0311, -0.0824,  ..., -0.2845, -0.1654, -0.0325],
         [ 0.1039,  0.1837,  0.0394,  ..., -0.2754, -0.1646,  0.1565],
         ...,
         [ 0.0151,  0.0433, -0.0093,  ..., -0.3566, -0.2154, -0.1291],
         [ 0.0779, -0.2508,  0.2337,  ..., -0.3894, -0.1805,  0.2653],
         [ 0.1054, -0.0192,  0.1254,  ..., -0.2658, -0.0486, -0.0218]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[ 0.0332],
         [-0.0520],
         [-0.0328],
         [-0.0170],
         [ 0.0681],
         [ 0.0634],
         [ 0.0136],
         [-0.0326],
         [-0.0676],
         [ 0.0021]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1175])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1175])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[853,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.0398,  0.3599,  0.1565,  ..., -0.2592,  0.0197, -0.0312],
         [-0.0436, -0.0208, -0.1863,  ..., -0.4311, -0.1401, -0.0320],
         [-0.0824,  0.0132, -0.1334,  ..., -0.4186, -0.0627,  0.1527],
         ...,
         [ 0.0843, -0.0649, -0.1136,  ..., -0.4628, -0.2517,  0.0283],
         [-0.1359, -0.1913, -0.1056,  ..., -0.4181, -0.1797, -0.1774],
         [-0.0668, -0.1918, -0.1147,  ..., -0.3404, -0.0256,  0.0255]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[ 0.0146],
         [ 0.0083],
         [-0.0389],
         [ 0.0021],
         [ 0.0366],
         [ 0.0340],
         [ 0.0461],
         [ 0.0523],
         [-0.0051],
         [ 0.0417]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1494])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1494])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[254, 270, 276,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.1530,  0.1318,  0.1095,  ..., -0.0352, -0.0745,  0.1962],
         [ 0.1900,  0.2106,  0.0573,  ..., -0.3674, -0.1634,  0.1282],
         [-0.0060,  0.0878,  0.1861,  ..., -0.2203,  0.0664, -0.0319],
         ...,
         [ 0.0996,  0.1238, -0.0418,  ..., -0.0734, -0.1852,  0.2359],
         [ 0.1553,  0.2817, -0.0551,  ..., -0.4810,  0.1211,  0.0916],
         [ 0.1345,  0.2328, -0.0726,  ..., -0.3229, -0.3208, -0.1162]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0048],
         [ 0.0240],
         [ 0.0413],
         [ 0.0196],
         [ 0.0191],
         [ 0.0043],
         [ 0.0469],
         [-0.0782],
         [-0.0285],
         [ 0.0797]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1136])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1136])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[284, 535,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.0543,  0.2945, -0.0314,  ..., -0.5485,  0.1382,  0.2093],
         [ 0.1610, -0.1159, -0.1692,  ..., -0.1644, -0.2417,  0.3190],
         [ 0.3161,  0.0049, -0.1200,  ..., -0.3855, -0.2098,  0.0242],
         ...,
         [ 0.2060,  0.1304, -0.1652,  ..., -0.5430, -0.1034,  0.2725],
         [ 0.1182,  0.2875,  0.0706,  ..., -0.3599,  0.0452,  0.1160],
         [ 0.3438, -0.2727, -0.2658,  ..., -0.1432, -0.1626,  0.0993]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[ 0.0173],
         [ 0.0065],
         [-0.0825],
         [ 0.0181],
         [ 0.0345],
         [ 0.0348],
         [-0.0778],
         [ 0.0381],
         [ 0.0098],
         [ 0.0446]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1196])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1196])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[203, 581,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.1063, -0.0728,  0.0090,  ..., -0.7717, -0.2418,  0.2265],
         [-0.0932,  0.2160, -0.0131,  ..., -0.3659, -0.0672,  0.1152],
         [-0.0925,  0.3950, -0.0118,  ..., -0.5993, -0.1723,  0.1546],
         ...,
         [ 0.0047,  0.1002, -0.0241,  ..., -0.5266, -0.2924, -0.0275],
         [ 0.0562,  0.0793,  0.0392,  ..., -0.1796,  0.1183,  0.3428],
         [-0.0026,  0.0642, -0.0482,  ..., -0.7394, -0.5180, -0.1425]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[ 0.0347],
         [ 0.0030],
         [-0.0272],
         [ 0.0760],
         [ 0.0487],
         [ 0.1178],
         [ 0.0027],
         [ 0.0562],
         [ 0.0352],
         [ 0.0586]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1383])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1383])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[634, 969,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

input_ids:  tensor([[    0, 50268,  6588,   102, 11356,   102, 47510,  3250,  2214,  9408,
             8,  1421,  2642, 29219,   346,  1264,   881,    92,  1492,    16,
          2214,    30, 47510,  3250,   385,  4324, 11356,   102,    31,    69,
         14887, 31386,  2453,  4535,  2642,   703,    11,    99,    76, 50269,
             2, 50265,    94,  3836,   385,  4324, 11356,   102,  2214, 50266,
            94,  3836,    16,  2214,    30, 47510,  3250,   385,  4324, 11356,
           102,    31,    69, 14887, 31386,  2453,  4535,  2642,   193, 50267,
         11356,   102,   875,  2214,    19,  1149,  2457,   449,  3979,   449,
          3979,  2262,   118,  1350,     8, 15079,   857,   910,  9422,    19,
           449,  3979,  2262,   118,  1350,  5516,  3686,   931, 50267,    24,
            21,   703,    15,   361, 10668,   428, 48540,   336,    25, 11794,
           200,   881, 50267,  1349,  2092,    15,  2424, 39210,  5403,     9,
           385,  4324, 11356,   102, 502

logits:  tensor([[ 1.9043e-01, -5.6836e-01, -4.9133e-02, -6.4392e-02,  1.4453e-01,
         -2.4072e-01, -2.8711e-01, -5.2551e-02, -1.4307e-01, -4.9255e-02,
         -1.8811e-01, -8.7463e-02, -5.3857e-01, -1.8225e-01,  5.2948e-02,
         -4.9011e-02, -4.5972e-01,  1.2311e-01, -1.1090e-01, -8.2617e-01,
         -4.3530e-01, -8.7219e-02, -3.2593e-01,  1.3171e-01,  9.3201e-02,
         -4.8035e-02,  7.7820e-02, -1.3525e-01, -1.4563e-01,  9.6985e-02,
          1.1426e-01, -4.8523e-02,  6.9504e-03, -4.2310e-01, -2.6831e-01,
          1.5020e-03, -1.5637e-01, -6.9397e-02, -4.1733e-03, -5.6396e-01,
          1.3452e-01, -7.3926e-01, -1.1314e-02,  2.3096e-01,  1.7261e-01,
         -4.1138e-02,  2.3816e-01, -6.3965e-02, -4.9414e-01, -3.6230e-01,
         -8.7524e-02,  2.6978e-01, -8.5791e-01, -4.0894e-01, -8.6487e-02,
         -3.0591e-01, -1.4824e-02,  1.3867e-01, -1.0577e-01,  1.6406e-01,
         -2.6855e-01, -2.2949e-01,  2.9932e-01,  2.1960e-01,  7.5439e-02,
          1.1877e-01, -4.0283

sp_para_output:  tensor([[[-0.0461,  0.0364,  0.0562,  ..., -0.1261, -0.2172,  0.1457],
         [ 0.0302, -0.2595, -0.3465,  ..., -0.6064, -0.0027, -0.0205],
         [-0.0389,  0.2007, -0.2296,  ..., -0.1371, -0.2736,  0.3103],
         ...,
         [ 0.1355, -0.1150, -0.0084,  ..., -0.0902, -0.0691,  0.1159],
         [-0.1109, -0.0909,  0.0237,  ...,  0.0613, -0.0468,  0.1993],
         [ 0.0956, -0.2759,  0.0380,  ...,  0.1337, -0.1003,  0.1584]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0056],
         [ 0.0058],
         [ 0.0103],
         [-0.0267],
         [-0.1045],
         [-0.0546],
         [-0.0265],
         [-0.0659],
         [-0.0483],
         [-0.0420]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1124])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1124])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[268, 288, 337, 362, 387, 430, 434,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[-5.2266e-02,  1.0341e-01,  1.3051e-01,  ...,  1.6818e-01,
           8.0166e-02,  6.5596e-02],
         [-8.9764e-02,  5.5675e-01, -6.3893e-02,  ..., -1.0684e-01,
          -9.2606e-02, -2.9789e-01],
         [-2.0646e-01,  2.0887e-01, -1.0688e-01,  ..., -3.5919e-01,
           4.0951e-02, -5.0996e-04],
         ...,
         [-5.2665e-02,  4.0593e-01,  2.1783e-03,  ..., -3.2903e-01,
           2.1304e-01,  2.7084e-01],
         [-1.4959e-01,  8.1868e-02, -3.7422e-02,  ..., -2.5061e-01,
          -2.1760e-02,  1.9597e-01],
         [-2.3971e-01,  2.1244e-01, -3.0990e-02,  ..., -1.7529e-02,
           4.9361e-01, -1.1585e-01]]], device='cuda:0')
sp_para_output_t:  tensor([[[ 0.0108],
         [ 0.0437],
         [-0.0141],
         [-0.0154],
         [ 0.0472],
         [ 0.0442],
         [ 0.0329],
         [ 0.0325],
         [ 0.0500],
         [-0.0559]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 877])
start_positions.si

sp_para_output:  tensor([[[ 0.2841, -0.0493,  0.1860,  ..., -0.3209, -0.2123, -0.1174],
         [ 0.0832, -0.1108, -0.2222,  ..., -0.3442, -0.2484, -0.1717],
         [ 0.0129, -0.1504, -0.0954,  ..., -0.3876, -0.0106,  0.0768],
         ...,
         [-0.0210, -0.0893,  0.0099,  ..., -0.3290, -0.2857, -0.0677],
         [ 0.0236, -0.1579, -0.0921,  ..., -0.3793, -0.0165,  0.0575],
         [-0.1862, -0.0255, -0.2033,  ..., -0.0833, -0.4898, -0.0653]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0417],
         [ 0.0018],
         [ 0.0094],
         [-0.0483],
         [-0.0416],
         [ 0.0507],
         [ 0.0198],
         [-0.0037],
         [ 0.0067],
         [-0.0400]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 835])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 835])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[410, 577, 581,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

sp_para_output:  tensor([[[ 1.7246e-01, -2.9101e-01, -1.8987e-01,  ..., -2.2513e-01,
          -3.0006e-01, -1.5289e-01],
         [ 3.4085e-01, -3.5309e-01, -4.2906e-01,  ..., -4.6900e-01,
          -2.2642e-01,  5.3690e-02],
         [-1.6746e-01,  3.2085e-01, -1.7624e-01,  ..., -7.7241e-01,
          -7.2192e-02, -8.2339e-02],
         ...,
         [ 1.6927e-01, -1.7717e-01, -2.6770e-01,  ..., -6.3994e-02,
          -1.7955e-01,  1.0536e-01],
         [ 5.6443e-04,  7.9701e-02, -3.5055e-01,  ..., -2.7156e-01,
          -1.6078e-01, -1.7937e-01],
         [ 1.8807e-01, -1.4659e-01, -1.7571e-01,  ..., -5.5147e-01,
          -2.6773e-01,  1.1354e-01]]], device='cuda:0')
sp_para_output_t:  tensor([[[ 0.0826],
         [-0.0216],
         [-0.0317],
         [-0.0054],
         [ 0.1109],
         [ 0.0035],
         [-0.0027],
         [ 0.0146],
         [ 0.0740],
         [-0.0067]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1177])
start_positions.s

sp_para_output:  tensor([[[ 0.1594,  0.0254,  0.0629,  ...,  0.1633, -0.3948,  0.1131],
         [-0.0229,  0.2830, -0.1069,  ..., -0.4586,  0.1344,  0.2245],
         [ 0.0548, -0.0355, -0.0268,  ...,  0.3034, -0.4550, -0.0732],
         ...,
         [ 0.0322,  0.0032, -0.0320,  ..., -0.0361, -0.5263, -0.1673],
         [-0.0115,  0.1899, -0.0219,  ...,  0.1109, -0.4800, -0.2688],
         [ 0.0951, -0.0503,  0.0755,  ...,  0.0661, -0.3951,  0.1105]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[ 0.0499],
         [-0.0059],
         [ 0.0369],
         [-0.0280],
         [ 0.0913],
         [ 0.0442],
         [-0.0298],
         [ 0.0684],
         [-0.0129],
         [ 0.0804]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1514])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1514])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 624,  917, 1128,    0,    0,    0,    0,    0,    0,    0,

validation_step
qid:  5a76a401554299373536012b
input_ids:  tensor([[    0, 50268,  4651,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  31,   92,  193,  291,  328,  448,  602,  660,  983, 1154],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.0847, -0.0421,  0.0688,  ..., -0.3255, -0.1600,  0.1773],
         [ 0.1516,  0.2865, -0.0853,  ..., -0.6568, -0.4253, -0.1191],
         [ 0.0670, -0.1030,  0.1787,  ..., -0.4241, -0.1991,  0.0600],
         ...,
         [ 0.1020,  0.1746, -0.1284,  ..., -0.6093, -0.4608, -0.1218],
         [ 0.2098,  0.0384,  0.0575,  ..., -0.3998,  0.0084,  0.0500],
         [ 0.3077,  0.0753,  0.0384,  ..., -0.4304, -0.3542,  0.0465]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0120],
         [ 0.0999],
         [-0.0244],
         [-0.0955],
         [-0.0210],
         [-0.0192],
         [ 0.0590],
         [-0.0110],
         [ 0.0535],
         [ 0.0107]]], device='cuda:0', dtype=torch.float16)
start_logits.si

sp_para_output:  tensor([[[ 0.0623,  0.0339,  0.2014,  ...,  0.1768, -0.3218, -0.1019],
         [-0.1463,  0.0625, -0.2073,  ..., -0.0211, -0.1685, -0.3247],
         [ 0.0282, -0.0422, -0.1090,  ..., -0.1675, -0.2729, -0.0836],
         ...,
         [ 0.0441,  0.1917,  0.1422,  ..., -0.3037, -0.2796, -0.2393],
         [ 0.1187,  0.3458,  0.2146,  ..., -0.0519, -0.4219, -0.0456],
         [-0.2058,  0.1245, -0.0975,  ..., -0.1668, -0.2915, -0.3007]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0226],
         [ 0.0365],
         [-0.0777],
         [-0.0844],
         [-0.0087],
         [-0.0366],
         [-0.0658],
         [-0.0235],
         [-0.0708],
         [-0.0087]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1326])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1326])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[1324,    0,    0,    0,    0,    0,    0,    0,    0,    0,

para_sent_logits: tensor([-0.0439, -0.0080, -0.0123], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-0.0927, -0.0642], device='cuda:0')
para_sents_offset: [0, 2, 5]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([0.0331, 0.0046], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-0.0927, -0.0642,  0.0377], device='cuda:0')
para_sents_offset: [0, 2, 5, 7]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([ 0.0617,  0.0046,  0.0206, -0.0142], device='cuda:0',
       dtype=torch.float16)
para_sent_logits_sum: tensor([-0.0927, -0.0642,  0.0377,  0.0727], device='cuda:0')
para_sents_offset: [0, 2, 5, 7, 11]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cud


Epoch 00000: avg_val_f1 reached 0.00000 (best 0.00000), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_0.ckpt as top 5


validation_epoch_end
before sync --> sizes:  29, 29, 29
after sync --> sizes: 29, 29, 29
avg_loss:  tensor(27.2731, device='cuda:0')	avg_answer_loss:  tensor(6.4102, device='cuda:0')	avg_type_loss:  tensor(1.3981, device='cuda:0')	avg_val_f1:  0.0	avg_val_em:  0.0	avg_val_prec:  0.0	avg_val_recall:  0.0
avg_val_sp_sent_f1:  tensor(0.0402, device='cuda:0')	avg_val_sp_sent_em:  tensor(0., device='cuda:0')	avg_val_sp_sent_prec:  tensor(0.0690, device='cuda:0')	avg_val_sp_sent_recall:  tensor(0.0287, device='cuda:0')
avg_val_joint_f1:  tensor(0., device='cuda:0')	avg_val_joint_em:  tensor(0., device='cuda:0')	avg_val_joint_prec:  tensor(0., device='cuda:0')	avg_val_joint_recall:  tensor(0., device='cuda:0')




input_ids:  tensor([[    0, 50268, 24761,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 19, 117, 148, 310, 424, 497, 690, 746, 834, 880], device='cuda:0')
sp_para_output:  tensor([[[ 0.2269, -0.2732,  0.3011,  ...,  0.3894, -0.5265,  0.1928],
         [-0.1099,  0.1951,  0.0692,  ..., -0.1992, -0.4763, -0.0742],
         [-0.0318,  0.2468, -0.0506,  ...,  0.2590, -0.0645,  0.3830],
         ...,
         [ 0.1301,  0.0829,  0.1002,  ..., -0.5629, -0.0943,  0.2597],
         [-0.1271,  0.4070,  0.1372,  ...,  0.1189,  0.0742,  0.0723],
         [-0.1170,  0.0521,  0.0360,  ..., -0.0700, -0.4527,  0.0265]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[ 0.0267],
         [ 0.0252],
         [ 0.0289],
         [-0.0093],
         [ 0.0076],
         [ 0.1455],
         [ 0.0264],
         [ 0.0083],
         [ 0.0412],
         [ 0.0016]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  t

logits:  tensor([[ 9.4629e-01,  6.0986e-01,  4.2749e-01,  5.2393e-01,  2.9761e-01,
          4.2920e-01,  2.1887e-01,  4.9316e-01,  2.1204e-01,  3.1445e-01,
          3.3276e-01,  1.2158e+00,  6.6260e-01,  2.6636e-01,  4.7070e-01,
          3.8965e-01,  3.2544e-01,  1.8872e-01,  5.5713e-01,  1.0869e+00,
          3.6841e-01,  4.5020e-01,  6.6748e-01,  4.0381e-01,  3.4155e-01,
         -1.3745e-01,  5.2490e-01,  3.6572e-01,  7.0557e-01,  5.4443e-01,
          5.1367e-01,  2.4451e-01,  3.0762e-01,  2.5269e-01,  2.6343e-01,
          3.4766e-01,  8.3252e-01,  4.1821e-01,  2.7100e-01,  5.0293e-01,
          6.0254e-01,  4.6533e-01,  2.9053e-01,  5.7666e-01,  3.2397e-01,
          3.7915e-01,  6.6406e-01,  5.5566e-01,  6.0400e-01,  2.0178e-01,
          9.4873e-01,  4.0601e-01,  6.5857e-02,  8.1982e-01,  9.2285e-01,
          5.5566e-01,  3.2275e-01,  8.0615e-01,  5.5127e-01,  3.9520e-02,
          5.7324e-01,  4.7070e-01,  4.0015e-01,  3.4058e-01,  3.4106e-01,
         -3.0457e-02,  4.1846

sp_para_output:  tensor([[[ 1.2163e-01,  3.1898e-02,  9.5316e-02,  ...,  3.1169e-01,
          -2.6663e-01,  3.3187e-01],
         [ 3.5558e-01,  1.9265e-01,  4.7111e-02,  ...,  4.2890e-02,
           9.6017e-02, -5.5065e-02],
         [ 2.2631e-01, -7.8662e-02,  1.6017e-01,  ...,  2.0370e-01,
          -2.2459e-01, -2.4053e-01],
         ...,
         [ 1.0709e-01, -1.7530e-02,  2.7707e-04,  ...,  2.2849e-02,
          -2.9283e-01,  1.5913e-01],
         [ 2.4126e-01,  2.8670e-01,  1.8370e-01,  ..., -3.2787e-01,
          -5.0467e-01, -8.7928e-02],
         [ 6.5476e-02, -5.1941e-02,  7.5422e-03,  ...,  4.0060e-01,
          -2.2887e-02,  1.8789e-01]]], device='cuda:0',
       grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.0097],
         [-0.0961],
         [-0.0414],
         [-0.0464],
         [-0.0461],
         [-0.0458],
         [-0.1099],
         [ 0.0022],
         [-0.0592],
         [-0.1095]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>

start_logits.size():  torch.Size([1, 839])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 839])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[431, 642, 795,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
logits:  tensor([[ 0.3472,  0.5552,  0.2751,  0.4585,  0.2026,  0.9941,  0.2065,  0.4609,
          0.2102,  0.2778,  0.6440,  0.0484,  0.3274,  0.3408,  0.1729,  0.2908,
          0.2484,  0.3738,  0.3057,  0.2036,  0.6973,  0.4597,  0.3320,  0.3518,
          0.4485,  0.1855,  0.3972,  0.3284,  0.2443,  0.0429,  0.3110,  0.2751,
          0.3186,  0.5439,  0.3250,  0.1743,  0.2465,  0.2988,  0.3120,  0.3127,
          0.57

input_ids:  tensor([[    0, 50268,  8155,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  39,  184,  322,  407,  550,  683,  792,  877, 1037, 1088],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.4588,  0.0668, -0.0259,  ..., -0.7679, -0.5238,  0.1524],
         [ 0.0673, -0.0640, -0.0089,  ..., -0.3253,  0.0850, -0.1996],
         [-0.0137, -0.0634,  0.1016,  ...,  0.1205, -0.3383, -0.3571],
         ...,
         [ 0.1711,  0.1694,  0.0248,  ..., -0.2117, -0.1649,  0.3325],
         [ 0.0050, -0.0010, -0.0045,  ..., -0.2812, -0.3738,  0.3281],
         [ 0.2342, -0.1779,  0.0432,  ..., -0.2045, -0.3896,  0.2721]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[ 0.0060],
         [ 0.0452],
         [-0.0446],
         [-0.0462],
         [ 0.0589],
         [-0.0196],
         [-0.1089],
         [-0.0723],
         [-0.0079],
         [ 0.0324]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

logits:  tensor([[ 5.7922e-02, -3.9062e-01, -5.7068e-02, -5.7275e-01, -1.4685e-01,
         -1.6284e-01,  3.6538e-05, -3.9990e-01, -6.6162e-01, -5.1514e-02,
         -3.1104e-01, -7.2388e-02, -3.6353e-01, -1.0162e-01, -1.7468e-01,
          5.0171e-02,  3.2898e-02, -5.1172e-01, -9.7351e-02, -1.1493e-01,
          1.0779e-01, -5.1758e-01, -2.1545e-01, -9.6130e-02, -1.0809e-01,
         -2.9077e-01, -1.7944e-01, -3.1250e-01,  2.1387e-01, -5.5859e-01,
         -2.2876e-01, -3.2373e-01, -1.5088e-01,  1.3733e-01,  1.7041e-01,
         -1.8518e-01, -2.2705e-02,  1.2683e-01,  1.5039e-01,  2.4841e-01,
          8.1787e-02,  9.4543e-02, -8.5388e-02,  1.1670e-01, -1.0870e-01,
         -1.0651e-01, -4.6680e-01, -2.6709e-01, -2.8418e-01, -2.3218e-01,
         -3.4790e-02,  2.8473e-02, -2.3804e-01, -1.3574e-01,  4.0497e-02,
          3.7500e-01, -1.6406e-01, -1.7432e-01,  2.7523e-03, -2.7051e-01,
         -2.1863e-01,  9.8389e-02, -1.0260e-01, -4.9866e-02, -1.1322e-01,
         -3.7915e-01, -5.5859

sp_para_output:  tensor([[[ 0.1593,  0.1626,  0.0809,  ..., -0.5259,  0.0892,  0.0603],
         [ 0.1551,  0.1803, -0.0089,  ..., -0.5252, -0.0303,  0.1115],
         [-0.2687,  0.1169,  0.2234,  ..., -0.5750, -0.6865, -0.2760],
         ...,
         [-0.3116, -0.2399,  0.2593,  ...,  0.1116, -0.2272,  0.3179],
         [ 0.0226,  0.1937,  0.0792,  ..., -0.0830,  0.0787, -0.0244],
         [ 0.0491,  0.1156,  0.0055,  ..., -0.9227, -0.0999, -0.0554]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.0009],
         [-0.0090],
         [ 0.0057],
         [-0.1124],
         [-0.0766],
         [ 0.0362],
         [ 0.0279],
         [ 0.0794],
         [ 0.0497],
         [ 0.0097]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 1206])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1206])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[110

input_ids:  tensor([[    0, 50268, 12196,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  22,  123,  225,  306,  392,  519,  575,  803,  922, 1002],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.1716,  0.3336,  0.0276,  ..., -0.6926, -0.0786, -0.0519],
         [ 0.1944,  0.2591, -0.1237,  ..., -0.6237,  0.1108,  0.0770],
         [ 0.0684,  0.2008, -0.0905,  ..., -0.4408, -0.1092, -0.1474],
         ...,
         [ 0.0948, -0.0531, -0.1608,  ..., -0.7328, -0.2162, -0.3137],
         [ 0.2833,  0.3552,  0.0775,  ...,  0.0890,  0.1246,  0.1045],
         [ 0.1839,  0.1631,  0.0819,  ..., -0.5635, -0.2968, -0.0382]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[ 0.0199],
         [ 0.0282],
         [ 0.0102],
         [-0.0583],
         [-0.0723],
         [ 0.0031],
         [ 0.0587],
         [ 0.0250],
         [-0.0826],
         [ 0.0604]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268,  6588,   102, 11356,   102, 47510,  3250,  2214,  9408,
             8,  1421,  2642, 29219,   346,  1264,   881,    92,  1492,    16,
          2214,    30, 47510,  3250,   385,  4324, 11356,   102,    31,    69,
         14887, 31386,  2453,  4535,  2642,   703,    11,    99,    76, 50269,
             2, 50265,    94,  3836,   385,  4324, 11356,   102,  2214, 50266,
            94,  3836,    16,  2214,    30, 47510,  3250,   385,  4324, 11356,
           102,    31,    69, 14887, 31386,  2453,  4535,  2642,   193, 50267,
         11356,   102,   875,  2214,    19,  1149,  2457,   449,  3979,   449,
          3979,  2262,   118,  1350,     8, 15079,   857,   910,  9422,    19,
           449,  3979,  2262,   118,  1350,  5516,  3686,   931, 50267,    24,
            21,   703,    15,   361, 10668,   428, 48540,   336,    25, 11794,
           200,   881, 50267,  1349,  2092,    15,  2424, 39210,  5403,     9,
           385,  4324, 11356,   102, 502

masked_target:  tensor([[ 68, 225, 248, 302, 386, 417, 495, 540, 647, 656, 693, 747, 788, 805,
         838,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
logits:  tensor([[ 0.3333,  0.5420,  0.2854,  0.2146,  1.0771,  0.3076,  0.5889,  0.4021,
          0.5142,  0.4138,  0.6592,  0.4075,  0.3164,  0.4304,  0.2732,  0.1406,
          0.3569,  0.3481,  0.3018,  0.3914,  0.4155, -0.0782,  0.8022,  0.5801,
          0.3352,  0.4368,  0.9473,  0.6294,  0.0643,  0.6436,  0.5840,  0.3809,
          0.4260,  0.7046,  0.1471,  0.0426,  0.3779,  0.2898,  0.2306,  0.5703,
          0.2421,  0.5864,  0.7148,  0.2438,  0.0252,  0.7305,  0.4587,  0.2217,
          0.5562,  0.8838,  0.5869,  0.2800,  0.4592,  0.3794, -0.2335,  0.7563,
          0.4775, -0.019

input_ids:  tensor([[    0, 50268,  8015,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  31,   87,  154,  235,  298,  423,  471,  555,  788, 1046],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.1632, -0.3676,  0.2576,  ...,  0.2054, -0.2373,  0.3466],
         [-0.1065, -0.5428, -0.5812,  ..., -0.1944, -0.1682,  0.0795],
         [-0.3559,  0.3527, -0.2093,  ...,  0.1107, -0.3095,  0.4230],
         ...,
         [ 0.0669, -0.2038, -0.1170,  ..., -0.2772, -0.2989,  0.3366],
         [-0.1730, -0.2954,  0.0808,  ...,  0.2297,  0.1698,  0.2214],
         [ 0.1288,  0.0874, -0.0090,  ...,  0.2182,  0.1021,  0.3381]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.2450],
         [-0.0195],
         [-0.1345],
         [-0.1244],
         [-0.1716],
         [-0.0797],
         [-0.1201],
         [-0.0863],
         [-0.1464],
         [-0.0970]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

sp_para_output:  tensor([[[ 1.5859e-02,  2.4725e-01,  2.0030e-01,  ...,  1.6505e-01,
           1.3964e-01,  1.5310e-01],
         [ 2.1315e-01,  3.9497e-01,  1.6755e-01,  ..., -6.0459e-01,
          -2.4436e-01, -1.3930e-01],
         [ 5.3154e-04,  2.2671e-01,  5.5575e-02,  ..., -2.5895e-01,
          -4.9063e-02, -4.4125e-03],
         ...,
         [-8.2675e-03,  3.5195e-01,  8.0377e-02,  ..., -5.7709e-01,
           1.4345e-01,  2.6207e-01],
         [-3.6402e-01,  1.5304e-01,  2.3370e-01,  ..., -1.0845e-01,
          -1.2021e-01,  3.5022e-01],
         [ 7.4765e-02,  3.0778e-01,  1.2878e-01,  ..., -2.6240e-01,
          -6.8112e-02, -3.6395e-01]]], device='cuda:0',
       grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[ 0.0016],
         [ 0.0039],
         [-0.0310],
         [-0.0790],
         [ 0.0211],
         [-0.0370],
         [-0.1326],
         [ 0.0358],
         [-0.0373],
         [-0.0875]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>

input_ids:  tensor([[    0, 50268,  5488, 38187,   260,  8676,   222,   218,   895,  1780,
          4209, 50269,     2, 50265,   475,  4348,   385,  8974,  5691, 50266,
           475,  4348,   385,  8974,  5691,  2421, 31025,    16, 38187,   260,
          8676,     8,  1632,  5799,  8298,    31, 16080,   895, 50267, 26232,
            37,  1665,    25,   919,     9, 16080,   895,   194, 22437,  4561,
           982,   974,   212, 22437,  1418,    11,   425, 50267,   385,  8974,
          5691,  1665,    25,  5688,   884,    11, 16080,   895, 22437, 50267,
          2052,     7,   145,  2736,     7, 16080,   895, 22437,   385,  8974,
          5691,  1665,    11,   194,   790,    31, 15077,     7,  8548, 50267,
            37,  3562,  2052,     7,  2266,  1727,     8,    21,  4209,    30,
         44009,   181,  1368,  4291,  1344, 50267,  1437,     2, 50265,   475,
          8616,  2463,   218,   895,  1780, 50266,   475,  8616,  2463,   218,
           895,  1780,   842,  3320, 225

input_ids:  tensor([[    0, 50268,  5488,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  24,  145,  383,  527,  607,  680,  846,  944,  999, 1072],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.1831, -0.1801, -0.2810,  ..., -0.3353, -0.2542, -0.1925],
         [ 0.2332,  0.0607, -0.2440,  ..., -0.0828, -0.2131,  0.3151],
         [ 0.0143,  0.1786, -0.0874,  ..., -0.5686, -0.1353, -0.0593],
         ...,
         [ 0.0734,  0.2577, -0.2738,  ..., -0.1091, -0.0987, -0.0630],
         [ 0.2126, -0.0384, -0.2114,  ...,  0.1145, -0.3488, -0.0313],
         [ 0.0917,  0.1191, -0.2536,  ..., -0.0276, -0.3917,  0.0352]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.0672],
         [-0.0775],
         [-0.0471],
         [-0.1461],
         [ 0.0267],
         [-0.0323],
         [-0.0153],
         [-0.0018],
         [-0.0577],
         [-0.0639]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268,  4651,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  31,   92,  193,  291,  328,  448,  602,  660,  983, 1154],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.0421,  0.0915, -0.0395,  ..., -0.4041, -0.1894,  0.1631],
         [ 0.2626,  0.2964,  0.0210,  ..., -0.2057, -0.3279,  0.1379],
         [ 0.1749, -0.0300,  0.0426,  ..., -0.3094, -0.3831,  0.0710],
         ...,
         [ 0.1974, -0.0278,  0.0035,  ..., -0.3554, -0.2478,  0.1818],
         [ 0.0829,  0.0727,  0.0473,  ..., -0.3932, -0.2042,  0.1853],
         [ 0.4968,  0.0641,  0.2291,  ..., -0.6909, -0.2780,  0.0120]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.0778],
         [-0.0161],
         [-0.1028],
         [-0.1147],
         [-0.0692],
         [-0.0876],
         [-0.0573],
         [-0.1228],
         [-0.0223],
         [-0.1229]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268, 10569,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  17,  164,  219,  457,  591,  882, 1138, 1299, 1438, 1534],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.0909, -0.1331,  0.1706,  ..., -0.5037,  0.0356, -0.0136],
         [ 0.0619,  0.0516,  0.0340,  ..., -0.2818,  0.0272, -0.1878],
         [ 0.1689,  0.1978, -0.1681,  ..., -0.5244,  0.1304,  0.3362],
         ...,
         [ 0.3879, -0.0259,  0.1469,  ..., -0.5202, -0.2818, -0.1196],
         [ 0.3480, -0.1012,  0.1274,  ...,  0.0577, -0.2585,  0.0926],
         [ 0.3477, -0.3480,  0.0092,  ..., -0.6815, -0.0196,  0.1669]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.0464],
         [-0.0966],
         [-0.0023],
         [-0.0332],
         [-0.0267],
         [-0.1456],
         [-0.0714],
         [-0.1671],
         [-0.2467],
         [-0.0495]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

validation_step
qid:  5a879ab05542996e4f30887e
input_ids:  tensor([[    0, 50268, 24761,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 19, 117, 148, 310, 424, 497, 690, 746, 834, 880], device='cuda:0')
sp_para_output:  tensor([[[ 0.0877, -0.0431,  0.3270,  ...,  0.0567, -0.2849,  0.0965],
         [ 0.0080, -0.2065,  0.0199,  ...,  0.1008, -0.3336, -0.0757],
         [ 0.1249,  0.3149,  0.0218,  ...,  0.5049, -0.2487,  0.3016],
         ...,
         [-0.0952,  0.1013,  0.0443,  ..., -0.3021,  0.0667,  0.1216],
         [-0.1849,  0.3424,  0.0972,  ..., -0.4797,  0.4628, -0.0273],
         [-0.0533,  0.0375,  0.2372,  ...,  0.0519, -0.2172, -0.0800]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1427],
         [-0.1047],
         [-0.0201],
         [-0.2142],
         [-0.1558],
         [-0.0816],
         [-0.1559],
         [-0.1300],
         [-0.1248],
         [-0.1221]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size

validation_step
qid:  5a8d7341554299441c6b9fe5
input_ids:  tensor([[    0, 50268, 13792,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  29,  186,  346,  509,  663,  734, 1082, 1242, 1484, 1664],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.1586,  0.0717, -0.0459,  ..., -0.1779, -0.3024,  0.0971],
         [ 0.1957,  0.2853, -0.0615,  ..., -0.2199, -0.4954, -0.0273],
         [ 0.2096,  0.2777,  0.0471,  ..., -0.2919, -0.2811,  0.0785],
         ...,
         [ 0.1246,  0.0897, -0.0383,  ...,  0.0136, -0.4311,  0.1416],
         [ 0.5272, -0.0356, -0.0031,  ..., -0.1072, -0.3130,  0.0192],
         [ 0.3860, -0.1857, -0.0548,  ..., -0.2542, -0.3801, -0.1599]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1244],
         [-0.1094],
         [-0.1781],
         [-0.0501],
         [-0.1279],
         [-0.1058],
         [-0.1210],
         [-0.1245],
         [-0.0875],
         [-0.1145]]], device='cuda:0', dtype=torch.float16)
start_logits.si

sp_para_output:  tensor([[[-0.1519,  0.2589, -0.0725,  ...,  0.3052,  0.0254, -0.0054],
         [ 0.0461,  0.3424, -0.1085,  ..., -0.0568,  0.0024,  0.1755],
         [ 0.1744,  0.1632, -0.0523,  ..., -0.5930, -0.3591,  0.0295],
         ...,
         [ 0.1077,  0.0443,  0.0753,  ...,  0.1634, -0.4175, -0.0025],
         [ 0.0179,  0.5198, -0.1613,  ..., -0.0775, -0.1342, -0.1341],
         [ 0.1290,  0.1648,  0.0047,  ..., -0.2268, -0.4437, -0.1051]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1056],
         [-0.1427],
         [-0.1354],
         [-0.1538],
         [-0.1693],
         [-0.1418],
         [-0.1575],
         [-0.0903],
         [-0.1487],
         [-0.1876]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 783])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 783])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[448,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

sp_para_output:  tensor([[[ 0.2456,  0.0673,  0.1208,  ...,  0.5221, -0.2105, -0.0644],
         [ 0.1750,  0.1594,  0.0889,  ...,  0.2919,  0.0596, -0.1696],
         [ 0.1758,  0.2122,  0.1960,  ...,  0.3395, -0.1538, -0.1508],
         ...,
         [ 0.0034,  0.3096,  0.2320,  ...,  0.2286, -0.1508, -0.0161],
         [-0.0516,  0.2862,  0.0516,  ...,  0.2201, -0.4863, -0.4614],
         [ 0.2238,  0.0291,  0.1870,  ...,  0.6049, -0.3639, -0.2456]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1824],
         [-0.1632],
         [-0.1677],
         [-0.1423],
         [-0.1692],
         [-0.1611],
         [-0.1873],
         [-0.1412],
         [-0.1919],
         [-0.2156]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1098])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1098])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 63, 299, 583, 586, 589, 594, 667, 670,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.2859,  0.1046, -0.0442,  ...,  0.1943, -0.3953, -0.2197],
         [-0.0321,  0.3228,  0.0100,  ..., -0.2607,  0.0702,  0.3232],
         [ 0.3209,  0.2731,  0.0891,  ...,  0.0968, -0.4958, -0.3012],
         ...,
         [ 0.3262,  0.0929, -0.1002,  ...,  0.0009, -0.5186, -0.2743],
         [ 0.2827, -0.0191,  0.1649,  ...,  0.0156, -0.4506, -0.0518],
         [ 0.3326,  0.1976, -0.0713,  ..., -0.1514, -0.4888, -0.2735]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1741],
         [-0.1161],
         [-0.1285],
         [-0.1501],
         [-0.1349],
         [-0.1589],
         [-0.1031],
         [-0.2190],
         [-0.1110],
         [-0.1309]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1681])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1681])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[955, 961,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.1666,  0.3737,  0.1817,  ...,  0.2773,  0.0460, -0.0624],
         [-0.1643,  0.5029, -0.2996,  ..., -0.4602,  0.2909, -0.2605],
         [-0.0161,  0.1006, -0.1454,  ...,  0.1885, -0.0506, -0.2805],
         ...,
         [-0.0092,  0.2213, -0.1802,  ...,  0.1740, -0.0646, -0.1694],
         [ 0.0124,  0.3070, -0.0461,  ...,  0.1796, -0.1973,  0.0376],
         [-0.2019,  0.1236, -0.1197,  ...,  0.1607, -0.1184,  0.0241]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1042],
         [-0.0397],
         [-0.0974],
         [-0.1315],
         [-0.1312],
         [-0.1116],
         [-0.1725],
         [-0.0830],
         [-0.1567],
         [-0.1764]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 839])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 839])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[431, 642, 795,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

sp_para_output:  tensor([[[ 0.1082,  0.0219,  0.0620,  ...,  0.2038, -0.5433, -0.1199],
         [-0.0347,  0.0265, -0.0403,  ...,  0.1788, -0.2339, -0.0173],
         [ 0.0129,  0.4619, -0.0741,  ..., -0.5730, -0.0255,  0.0425],
         ...,
         [ 0.0445,  0.2188,  0.0910,  ...,  0.1282, -0.3878, -0.0186],
         [ 0.0271, -0.2022, -0.0442,  ..., -0.3666, -0.2981,  0.1684],
         [ 0.1810, -0.1901,  0.0639,  ...,  0.3147, -0.3138,  0.0954]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1621],
         [-0.0732],
         [-0.1003],
         [-0.1415],
         [-0.1801],
         [-0.1261],
         [-0.1638],
         [-0.1523],
         [-0.0650],
         [-0.0834]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1338])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1338])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[551, 556, 827,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 1.2587e-01,  2.8466e-02, -6.7368e-02,  ...,  1.4620e-01,
          -2.8599e-01,  9.3440e-05],
         [ 3.7472e-01,  1.5910e-01, -1.2964e-01,  ..., -1.7870e-01,
          -2.0443e-01, -2.5609e-01],
         [ 1.9409e-01,  2.4192e-01, -7.7579e-03,  ..., -2.0322e-01,
          -5.2725e-02, -3.1537e-01],
         ...,
         [ 3.4588e-01, -8.7219e-02,  8.8514e-02,  ...,  1.4843e-01,
          -2.4104e-01, -2.1073e-02],
         [ 3.9501e-01,  1.2027e-01, -1.3152e-01,  ..., -7.4011e-02,
          -2.6299e-01, -3.0400e-01],
         [ 2.0296e-02,  5.1899e-01,  4.3448e-02,  ..., -3.8620e-01,
          -2.0189e-01, -1.0870e-01]]], device='cuda:0')
sp_para_output_t:  tensor([[[-0.1113],
         [-0.0717],
         [-0.1387],
         [-0.1429],
         [-0.0625],
         [-0.1628],
         [-0.1636],
         [-0.1310],
         [-0.0346],
         [ 0.0065]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1340])
start_positions.s

sp_para_output:  tensor([[[-0.1237,  0.2781, -0.0818,  ..., -0.1311,  0.0559, -0.2349],
         [-0.0250, -0.4711, -0.4421,  ...,  0.0157,  0.0150,  0.0937],
         [ 0.0470, -0.2536,  0.2004,  ...,  0.7276, -0.6570, -0.1949],
         ...,
         [ 0.0547, -0.0307,  0.0724,  ...,  0.4901, -0.3574,  0.1327],
         [ 0.0303,  0.1569,  0.1229,  ...,  0.0716, -0.2907, -0.1390],
         [-0.1206, -0.1776,  0.2253,  ...,  0.6877, -0.4028, -0.2384]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0606],
         [-0.0862],
         [-0.0850],
         [-0.0529],
         [-0.1307],
         [-0.0559],
         [-0.0399],
         [-0.0811],
         [-0.1575],
         [-0.1324]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 988])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 988])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[93,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  

sp_para_output:  tensor([[[ 0.2953,  0.1059,  0.0258,  ..., -0.1209, -0.1430, -0.1707],
         [ 0.1819,  0.2676, -0.0730,  ..., -0.4257, -0.2326, -0.0831],
         [ 0.1080,  0.0809, -0.1005,  ...,  0.2601, -0.3202, -0.1625],
         ...,
         [ 0.0331, -0.0155,  0.0772,  ..., -0.0393,  0.0509,  0.0812],
         [ 0.0613,  0.3192, -0.0825,  ..., -0.1470, -0.1158, -0.0593],
         [ 0.0507,  0.1820, -0.0390,  ...,  0.0707, -0.3384, -0.2192]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1104],
         [-0.0518],
         [-0.0536],
         [-0.1746],
         [-0.1849],
         [-0.0786],
         [-0.1067],
         [-0.1177],
         [-0.0256],
         [-0.0734]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1206])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1206])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[110,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.1351, -0.0064,  0.0677,  ...,  0.0345, -0.1664, -0.0791],
         [ 0.1034,  0.0288, -0.1250,  ...,  0.0941, -0.2340, -0.1401],
         [ 0.1644,  0.2244,  0.0229,  ...,  0.0814, -0.2565,  0.0019],
         ...,
         [ 0.0535,  0.0884, -0.0515,  ..., -0.0050, -0.2776, -0.2220],
         [ 0.1283, -0.1888,  0.2040,  ..., -0.0074, -0.2382,  0.1158],
         [ 0.1340, -0.0006,  0.0506,  ...,  0.0332, -0.1113, -0.1102]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1132],
         [-0.1995],
         [-0.1862],
         [-0.1217],
         [-0.0696],
         [-0.0613],
         [-0.1268],
         [-0.1837],
         [-0.2185],
         [-0.1411]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1175])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1175])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[853,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.1172,  0.4000,  0.0309,  ...,  0.1074, -0.3253, -0.0272],
         [ 0.0850,  0.2112, -0.0901,  ..., -0.0523, -0.2823, -0.1630],
         [-0.1402,  0.1902,  0.1364,  ..., -0.0662, -0.0030, -0.0138],
         ...,
         [ 0.0490,  0.1968,  0.1412,  ...,  0.1053, -0.2248, -0.1415],
         [ 0.0837,  0.1853, -0.0679,  ...,  0.5941, -0.2661, -0.2161],
         [ 0.1042, -0.0656, -0.0315,  ..., -0.0228, -0.1625, -0.1088]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1064],
         [-0.1202],
         [-0.1044],
         [-0.0654],
         [-0.0574],
         [-0.0562],
         [-0.0954],
         [-0.1779],
         [-0.1217],
         [-0.0938]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1347])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1347])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[507,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.1005,  0.3591,  0.1283,  ..., -0.0261, -0.0367, -0.1060],
         [-0.0174,  0.0209, -0.2213,  ..., -0.2234, -0.1998, -0.0949],
         [-0.0267,  0.0237, -0.1560,  ..., -0.1483, -0.1378,  0.0614],
         ...,
         [ 0.1465, -0.0065, -0.1423,  ..., -0.1418, -0.3046, -0.0623],
         [-0.0417, -0.1526, -0.1898,  ..., -0.0756, -0.2447, -0.3122],
         [-0.0066, -0.1667, -0.1539,  ..., -0.0554, -0.1104, -0.0673]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0767],
         [-0.0912],
         [-0.1602],
         [-0.1217],
         [-0.0197],
         [-0.1206],
         [-0.0714],
         [-0.0616],
         [-0.1578],
         [-0.0995]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1494])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1494])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[254, 270, 276,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.1872,  0.2416,  0.1060,  ...,  0.3189, -0.1682,  0.0726],
         [ 0.2232,  0.3106,  0.0435,  ..., -0.0496, -0.2206,  0.0505],
         [ 0.0360,  0.1440,  0.1665,  ...,  0.0343, -0.0036, -0.1192],
         ...,
         [ 0.1186,  0.2213, -0.0672,  ...,  0.2223, -0.3043,  0.1333],
         [ 0.1957,  0.3673, -0.0611,  ..., -0.1629,  0.0939, -0.0245],
         [ 0.1697,  0.3225, -0.0725,  ...,  0.0189, -0.3745, -0.1906]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1405],
         [-0.0937],
         [-0.0642],
         [-0.1218],
         [-0.1442],
         [-0.1377],
         [-0.0730],
         [-0.2070],
         [-0.1409],
         [-0.0487]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1136])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1136])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[284, 535,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.1255,  0.3284, -0.0183,  ..., -0.3458,  0.0948,  0.1214],
         [ 0.2264, -0.0335, -0.1794,  ...,  0.1703, -0.3118,  0.2157],
         [ 0.3571,  0.0770, -0.1273,  ..., -0.0579, -0.2647, -0.0868],
         ...,
         [ 0.2549,  0.1663, -0.1877,  ..., -0.2411, -0.1610,  0.1324],
         [ 0.1721,  0.2980,  0.0777,  ..., -0.2474,  0.0355,  0.0793],
         [ 0.4199, -0.2369, -0.3050,  ...,  0.2417, -0.2226, -0.0498]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0558],
         [-0.1337],
         [-0.2102],
         [-0.1239],
         [-0.0938],
         [-0.0952],
         [-0.2034],
         [-0.0812],
         [-0.0423],
         [-0.1024]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1196])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1196])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[203, 581,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.2323, -0.2632, -0.0105,  ...,  0.3149, -0.0837, -0.1557],
         [ 0.0069,  0.3436, -0.1697,  ...,  0.0412, -0.0147,  0.0064],
         [-0.0506,  0.1998, -0.0841,  ...,  0.1369, -0.4603, -0.3494],
         ...,
         [ 0.1484, -0.1644, -0.1029,  ...,  0.2697, -0.3329, -0.2705],
         [ 0.1180, -0.0703, -0.0163,  ...,  0.2395, -0.3069, -0.1289],
         [ 0.0864,  0.1138, -0.0871,  ..., -0.0045, -0.3277, -0.2755]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0616],
         [-0.0648],
         [-0.0975],
         [-0.1082],
         [-0.0457],
         [-0.1294],
         [-0.0986],
         [-0.1400],
         [-0.0970],
         [-0.1207]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1425])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1425])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[1363,    0,    0,    0,    0,    0,    0,    0,    0,    0,

sp_para_output:  tensor([[[ 0.1754,  0.0247, -0.0042,  ..., -0.3448, -0.3000,  0.1335],
         [-0.0640,  0.2748, -0.0129,  ..., -0.1578, -0.1148,  0.0415],
         [-0.0580,  0.4511, -0.0222,  ..., -0.4007, -0.2274,  0.0915],
         ...,
         [ 0.0920,  0.2029, -0.0537,  ..., -0.1403, -0.3360, -0.1336],
         [ 0.0762,  0.1194,  0.0431,  ...,  0.0029,  0.0838,  0.3041],
         [ 0.0691,  0.1668, -0.0732,  ..., -0.3159, -0.5259, -0.2282]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1361],
         [-0.0718],
         [-0.1127],
         [-0.0800],
         [-0.0937],
         [-0.0209],
         [-0.1512],
         [-0.1272],
         [-0.0267],
         [-0.1064]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1383])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1383])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[634, 969,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.3049,  0.0171,  0.0720,  ...,  0.2326, -0.1624, -0.1183],
         [ 0.4060, -0.1404, -0.0199,  ...,  0.0014, -0.3998, -0.0040],
         [ 0.1429,  0.0447,  0.0884,  ...,  0.1854, -0.1934, -0.0401],
         ...,
         [ 0.2916,  0.3099,  0.2186,  ..., -0.2081, -0.5321, -0.0487],
         [ 0.2901, -0.0449,  0.0619,  ...,  0.1463, -0.1319, -0.0757],
         [ 0.0802, -0.0088,  0.1181,  ..., -0.1004, -0.0936, -0.2175]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0912],
         [-0.1602],
         [-0.1406],
         [-0.1755],
         [-0.0764],
         [-0.0354],
         [-0.0758],
         [-0.0814],
         [-0.0718],
         [-0.1327]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 917])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 917])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 68, 225, 248, 302, 386, 417, 495, 540, 647, 656, 693, 747, 7

sp_para_output:  tensor([[[ 0.0206,  0.1148,  0.0282,  ...,  0.2305, -0.2788,  0.0839],
         [ 0.1042, -0.2258, -0.3592,  ..., -0.3113, -0.0767, -0.0702],
         [ 0.0042,  0.2312, -0.2334,  ...,  0.1882, -0.2966,  0.2293],
         ...,
         [ 0.2271, -0.0381, -0.0233,  ...,  0.2566, -0.1377,  0.0140],
         [-0.0662, -0.0336, -0.0142,  ...,  0.3360, -0.0910,  0.1326],
         [ 0.1557, -0.2083,  0.0385,  ...,  0.4942, -0.2132,  0.0793]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1630],
         [-0.1107],
         [-0.1260],
         [-0.1617],
         [-0.2812],
         [-0.1769],
         [-0.1566],
         [-0.2183],
         [-0.1785],
         [-0.1982]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1124])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1124])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[268, 288, 337, 362, 387, 430, 434,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.0221,  0.1744,  0.1338,  ...,  0.4442,  0.0093, -0.0615],
         [-0.0252,  0.6084, -0.0882,  ...,  0.1684, -0.1569, -0.3416],
         [-0.1505,  0.2478, -0.1100,  ..., -0.0894, -0.0135, -0.0740],
         ...,
         [-0.0031,  0.4996, -0.0038,  ..., -0.1248,  0.1813,  0.1363],
         [-0.0584,  0.1495, -0.0591,  ...,  0.0693, -0.0958,  0.0971],
         [-0.1807,  0.2701, -0.0499,  ...,  0.2581,  0.3692, -0.2034]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1174],
         [-0.0812],
         [-0.1263],
         [-0.1353],
         [-0.1052],
         [-0.1038],
         [-0.1158],
         [-0.0488],
         [-0.0816],
         [-0.1676]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 877])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 877])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[271,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

sp_para_output:  tensor([[[ 0.3415,  0.0396,  0.1958,  ..., -0.0533, -0.2455, -0.1776],
         [ 0.1442, -0.0325, -0.2084,  ..., -0.0033, -0.2852, -0.2502],
         [ 0.1257, -0.0890, -0.1127,  ..., -0.0483, -0.0951, -0.0338],
         ...,
         [ 0.0422, -0.0596, -0.0150,  ..., -0.0034, -0.3301, -0.1695],
         [ 0.1318, -0.0971, -0.1115,  ..., -0.0376, -0.1008, -0.0509],
         [-0.1325, -0.0260, -0.1826,  ...,  0.2922, -0.5539, -0.1582]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1737],
         [-0.1527],
         [-0.1443],
         [-0.1888],
         [-0.2233],
         [-0.1213],
         [-0.1620],
         [-0.1492],
         [-0.1472],
         [-0.1945]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 835])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 835])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[410, 577, 581,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

sp_para_output:  tensor([[[ 0.2221, -0.2138, -0.2537,  ...,  0.0846, -0.3372, -0.2785],
         [ 0.3610, -0.2682, -0.4511,  ..., -0.1150, -0.2700, -0.0689],
         [-0.1103,  0.3373, -0.2342,  ..., -0.5270, -0.1103, -0.1616],
         ...,
         [ 0.2044, -0.1166, -0.3100,  ...,  0.2429, -0.2266, -0.0143],
         [ 0.0586,  0.1288, -0.3994,  ...,  0.0246, -0.2043, -0.2685],
         [ 0.2244, -0.0839, -0.2527,  ..., -0.2104, -0.3428, -0.0170]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0640],
         [-0.1425],
         [-0.1316],
         [-0.1377],
         [-0.0240],
         [-0.1149],
         [-0.1215],
         [-0.1222],
         [-0.0637],
         [-0.1493]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1177])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1177])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[608, 612,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_pred: tensor([1, 0], device='cuda:0')
pre_answer:	 yes 	gold_answer:	 16yearold 	sp_sent_pred:	 [] 	sp_sent_gold:	 [0, 18] 	sp_para_pred:	 tensor([1, 0], device='cuda:0') 	sp_para_gold:	 [0, 4]
validation_step
qid:  5ac2c3545542990b17b1548b
input_ids:  tensor([[    0, 50268,   611,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  22,  171,  429,  512,  592,  747,  906, 1208, 1349, 1429],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.2077,  0.0877,  0.0537,  ...,  0.5004, -0.4378,  0.0025],
         [ 0.0108,  0.2998, -0.1231,  ..., -0.3425,  0.1101,  0.1630],
         [ 0.1270,  0.0396, -0.0466,  ...,  0.6033, -0.4674, -0.1416],
         ...,
         [ 0.1088,  0.0553, -0.0361,  ...,  0.3621, -0.5600, -0.2533],
         [ 0.0799,  0.2273, -0.0217,  ...,  0.4305, -0.4926, -0.3378],
         [ 0.1595,  0.0069,  0.0245,  ...,  0.3796, -0.4102,  0.0182]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0987],
         [-0.0724],
         [-0.

validation_step
qid:  5ae3b4d05542992f92d82349
input_ids:  tensor([[    0, 50268,  8155,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  23,  313,  411,  579,  720, 1163, 1269, 1293, 1416, 1697],
       device='cuda:0')
sp_para_output:  tensor([[[-0.2736,  0.2864, -0.1464,  ...,  0.1795, -0.1052,  0.1243],
         [ 0.0649,  0.1102,  0.0924,  ..., -0.1547, -0.2677,  0.0509],
         [ 0.0579,  0.1596, -0.0043,  ..., -0.1579, -0.1784, -0.0375],
         ...,
         [ 0.1078,  0.3160, -0.2061,  ...,  0.0080, -0.4488, -0.1179],
         [ 0.0559,  0.1600, -0.1662,  ..., -0.0305, -0.3444, -0.1633],
         [-0.0704,  0.1117, -0.1006,  ...,  0.0977, -0.1784,  0.0470]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0480],
         [-0.2230],
         [-0.1727],
         [-0.0829],
         [-0.0854],
         [-0.1061],
         [-0.0672],
         [-0.1355],
         [-0.1609],
         [-0.0840]]], device='cuda:0', dtype=torch.float16)
start_logits.si

input_ids:  tensor([[    0, 50268,  4651,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  31,   92,  193,  291,  328,  448,  602,  660,  983, 1154],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.1380,  0.0058,  0.0458,  ...,  0.0039, -0.2263,  0.0835],
         [ 0.1955,  0.3255, -0.0803,  ..., -0.3276, -0.4554, -0.2214],
         [ 0.1039, -0.0592,  0.1458,  ..., -0.1067, -0.2621, -0.0272],
         ...,
         [ 0.1608,  0.1890, -0.1385,  ..., -0.2864, -0.4924, -0.1864],
         [ 0.2551,  0.0887,  0.0384,  ..., -0.1034, -0.0538, -0.0497],
         [ 0.3673,  0.1109,  0.0368,  ..., -0.0412, -0.3987, -0.0745]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1404],
         [-0.0287],
         [-0.1384],
         [-0.2372],
         [-0.1594],
         [-0.1466],
         [-0.0793],
         [-0.1282],
         [-0.0744],
         [-0.1394]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1199])
start_positions.si

sp_para_output:  tensor([[[ 0.1179,  0.0984,  0.1916,  ...,  0.4918, -0.3718, -0.1968],
         [-0.0477,  0.1383, -0.1797,  ...,  0.2940, -0.2611, -0.3895],
         [ 0.0708,  0.0325, -0.1275,  ...,  0.1229, -0.3265, -0.1679],
         ...,
         [ 0.1262,  0.2426,  0.1392,  ...,  0.0526, -0.3255, -0.3152],
         [ 0.1549,  0.3952,  0.1956,  ...,  0.2684, -0.4340, -0.1130],
         [-0.1408,  0.1910, -0.1197,  ...,  0.1405, -0.3418, -0.3624]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1812],
         [-0.0945],
         [-0.1949],
         [-0.2140],
         [-0.1272],
         [-0.1763],
         [-0.2250],
         [-0.1624],
         [-0.1946],
         [-0.1361]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1326])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1326])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[1324,    0,    0,    0,    0,    0,    0,    0,    0,    0,

sp_para_output:  tensor([[[ 0.1265, -0.1862,  0.1533,  ..., -0.4574, -0.1733,  0.0311],
         [ 0.1554, -0.1280, -0.1718,  ..., -0.1620, -0.0925, -0.3073],
         [ 0.1564,  0.1401, -0.1495,  ..., -0.5159, -0.0209, -0.1823],
         ...,
         [ 0.1736,  0.1793,  0.0204,  ..., -0.4385, -0.2405, -0.1088],
         [-0.0353,  0.3278, -0.2195,  ..., -0.2204, -0.1263, -0.2637],
         [ 0.1143,  0.1995, -0.2434,  ..., -0.0975, -0.1977, -0.2109]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.1232],
         [-0.0978],
         [-0.0866],
         [-0.0892],
         [-0.0729],
         [-0.0927],
         [-0.0905],
         [-0.0421],
         [ 0.0188],
         [-0.1350]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 875])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 875])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[873,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

sp_para_output:  tensor([[[-0.0400, -0.1746,  0.1226,  ..., -0.3734,  0.0202,  0.1130],
         [ 0.0472, -0.0062, -0.0869,  ..., -0.2456, -0.0698, -0.0460],
         [-0.0266,  0.0638, -0.0110,  ..., -0.3488, -0.0239, -0.0813],
         ...,
         [ 0.0557, -0.1429, -0.0593,  ..., -0.2299, -0.1274, -0.0128],
         [ 0.1780,  0.3027,  0.0740,  ...,  0.0140,  0.0314, -0.0538],
         [-0.0072, -0.0580,  0.1628,  ..., -0.1119, -0.2730,  0.0841]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0161],
         [-0.0762],
         [-0.0575],
         [-0.0826],
         [-0.0348],
         [-0.0479],
         [-0.0716],
         [-0.1565],
         [-0.0685],
         [-0.0337]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1667])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1667])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 120,  151,  180,  479,  538,  577,  892,  922, 1002, 1464,


Epoch 00001: avg_val_f1 reached 0.00000 (best 0.00000), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_1.ckpt as top 5


validation_epoch_end
before sync --> sizes:  29, 29, 29
after sync --> sizes: 29, 29, 29
avg_loss:  tensor(25.0301, device='cuda:0')	avg_answer_loss:  tensor(6.4050, device='cuda:0')	avg_type_loss:  tensor(1.2331, device='cuda:0')	avg_val_f1:  0.0	avg_val_em:  0.0	avg_val_prec:  0.0	avg_val_recall:  0.0
avg_val_sp_sent_f1:  tensor(0., device='cuda:0')	avg_val_sp_sent_em:  tensor(0., device='cuda:0')	avg_val_sp_sent_prec:  tensor(0., device='cuda:0')	avg_val_sp_sent_recall:  tensor(0., device='cuda:0')
avg_val_joint_f1:  tensor(0., device='cuda:0')	avg_val_joint_em:  tensor(0., device='cuda:0')	avg_val_joint_prec:  tensor(0., device='cuda:0')	avg_val_joint_recall:  tensor(0., device='cuda:0')
input_ids:  tensor([[    0, 50268, 24761,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 19, 117, 148, 310, 424, 497, 690, 746, 834, 880], device='cuda:0')
sp_para_output:  tensor([[[ 0.1402,  0.1593,  0.2932,  ..., -0.0840, -0.1573,  0.0425],
         [ 0.0154, -0.2291, -0.0

sp_para_output:  tensor([[[ 0.1495, -0.0845,  0.2416,  ...,  0.2343, -0.1243,  0.0869],
         [ 0.0134,  0.2245,  0.0618,  ..., -0.3998, -0.2745, -0.0745],
         [ 0.0139,  0.2603, -0.0501,  ..., -0.0091, -0.8101, -0.1506],
         ...,
         [ 0.2339, -0.0230,  0.2007,  ..., -0.0328, -0.2171,  0.0039],
         [ 0.2475, -0.0385, -0.1374,  ...,  0.1578, -0.2437, -0.4099],
         [ 0.0824,  0.2590,  0.2484,  ...,  0.3632, -0.3922,  0.3250]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.0548],
         [-0.1360],
         [-0.1216],
         [-0.2257],
         [-0.2474],
         [-0.2450],
         [-0.0467],
         [-0.1229],
         [-0.2433],
         [-0.1104]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 783])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 783])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[448, 

para_indexes:  tensor([ 20, 125, 185, 346, 416, 577, 675, 764, 808, 950], device='cuda:0')
sp_para_output:  tensor([[[ 0.2139,  0.3352, -0.1271,  ..., -0.0840, -0.1940,  0.0146],
         [ 0.4607,  0.8912, -0.0871,  ...,  0.7904,  0.4582, -0.3705],
         [ 0.1310, -0.0666,  0.3554,  ...,  0.1050, -0.2590, -0.3723],
         ...,
         [ 0.0466,  0.0700,  0.2490,  ..., -0.0349, -0.0730, -0.2110],
         [-0.1264,  0.5779,  0.1859,  ...,  0.1387, -0.3638, -0.3113],
         [ 0.1179,  0.1335,  0.1555,  ...,  0.3259, -0.4177, -0.4128]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.1045],
         [-0.2421],
         [-0.0890],
         [-0.0605],
         [-0.1671],
         [-0.2072],
         [-0.1333],
         [-0.1598],
         [-0.0681],
         [-0.1964]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 1098])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  to

sp_para_output:  tensor([[[ 0.2397,  0.2705,  0.2739,  ...,  0.0913, -0.0933,  0.1342],
         [ 0.0313,  0.4287, -0.2668,  ..., -0.3586,  0.1777,  0.0088],
         [ 0.2439,  0.2378,  0.0085,  ...,  0.5964,  0.0161,  0.0190],
         ...,
         [ 0.0673,  0.0892, -0.1016,  ...,  0.0584, -0.0874, -0.1481],
         [ 0.1964,  0.3137,  0.0391,  ...,  0.2418, -0.0610, -0.1280],
         [ 0.0101,  0.1084,  0.1022,  ..., -0.3967, -0.0492, -0.0185]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.1392],
         [-0.0718],
         [-0.1229],
         [-0.0900],
         [-0.0651],
         [-0.0291],
         [-0.1549],
         [-0.0940],
         [-0.2778],
         [-0.2048]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 839])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 839])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[431, 

input_ids:  tensor([[    0, 50268,  8155,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  39,  184,  322,  407,  550,  683,  792,  877, 1037, 1088],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.2826,  0.2141,  0.0314,  ..., -0.2786, -0.5365, -0.1733],
         [ 0.2353,  0.0710,  0.0648,  ...,  1.1264, -0.6138, -0.0517],
         [ 0.1253,  0.2561, -0.1496,  ..., -0.1590, -0.6489, -0.3664],
         ...,
         [ 0.2456,  0.0841,  0.0093,  ...,  0.0573, -0.2080, -0.0538],
         [ 0.1027, -0.0097, -0.0518,  ..., -0.4028, -0.2817,  0.3053],
         [ 0.2416, -0.3292,  0.2367,  ...,  0.2470, -0.3345,  0.1480]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.1624],
         [-0.1625],
         [-0.1581],
         [-0.2450],
         [-0.2189],
         [-0.1902],
         [-0.2484],
         [-0.1848],
         [-0.1190],
         [-0.1218]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

sp_para_output:  tensor([[[ 0.2764, -0.0689,  0.2079,  ...,  0.8194, -0.3987,  0.2390],
         [-0.0350,  0.0623, -0.1584,  ...,  0.2490,  0.1714,  0.2872],
         [ 0.3829, -0.5325, -0.0654,  ...,  1.1876, -0.5676,  0.1573],
         ...,
         [ 0.1929, -0.0593,  0.0174,  ...,  0.6368, -0.0518,  0.0817],
         [ 0.2631,  0.0712, -0.0293,  ...,  0.5059, -0.1729,  0.2095],
         [-0.1423, -0.2584,  0.1088,  ...,  0.8588, -0.5511,  0.0690]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.2224],
         [-0.1274],
         [-0.1008],
         [-0.1759],
         [-0.1473],
         [-0.0330],
         [-0.0050],
         [-0.0849],
         [-0.1729],
         [-0.1300]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 988])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 988])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[93,  

input_ids:  tensor([[    0, 50268, 16963,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  19,  106,  186,  286,  389,  493,  737,  847,  981, 1064],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.2943, -0.0618, -0.0319,  ..., -0.1427, -0.1015,  0.3654],
         [ 0.1042,  0.0812,  0.1821,  ..., -0.0854,  0.1606,  0.1308],
         [ 0.0902,  0.0342,  0.1346,  ...,  0.2645, -0.1766, -0.4517],
         ...,
         [ 0.2468,  0.0650,  0.2171,  ..., -0.6604, -0.1824, -0.2194],
         [ 0.4379, -0.0844,  0.0676,  ..., -0.0946,  0.0370,  0.0337],
         [ 0.3774,  0.0150, -0.0954,  ..., -0.1263, -0.1422, -0.0415]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.1337],
         [-0.0349],
         [-0.1405],
         [-0.1359],
         [-0.1027],
         [-0.0273],
         [-0.1170],
         [-0.2126],
         [-0.1136],
         [-0.1161]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
logits:  tensor([[ 0.1912, -0.2563, -0.2222,  ..., -0.2349, -0.1628,  0.1320]],
       device='cuda:0', dtype=torch.float16, grad_fn=<SqueezeBackward1>)
input_ids:  tensor([[    0, 50268, 12196,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  22,  123,  225,  306,  392,  519,  575,  803,  922, 1002],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.2609,  0.4366, -0.0371,  ...,  0.6285, -0.2993, -0.0570],
         [ 0.1033,  0.2734, -0.0497,  ...,  0.1251, -0.2686,  0.0975],
         [ 0.1134,  0.3822,  0.0543,  ...,  0.0325, -0.1537, -0.1373],
         ...,
         [ 0.3035,  0.6076,  0.1613,  ...,  0.0977, -0.4741, -0.3376],
         [ 0.1967,  0.5530, -0.0690,  ..., -0.2040, -0.3156, -0.1843],
         [-0.0527,  0.3112, -0.0141,  ..., -0.1880, -0.1743, -0.1659]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.2455],
         [-0.1205],
         [-0.178

input_ids:  tensor([[    0, 50268,  6588,   102, 11356,   102, 47510,  3250,  2214,  9408,
             8,  1421,  2642, 29219,   346,  1264,   881,    92,  1492,    16,
          2214,    30, 47510,  3250,   385,  4324, 11356,   102,    31,    69,
         14887, 31386,  2453,  4535,  2642,   703,    11,    99,    76, 50269,
             2, 50265,    94,  3836,   385,  4324, 11356,   102,  2214, 50266,
            94,  3836,    16,  2214,    30, 47510,  3250,   385,  4324, 11356,
           102,    31,    69, 14887, 31386,  2453,  4535,  2642,   193, 50267,
         11356,   102,   875,  2214,    19,  1149,  2457,   449,  3979,   449,
          3979,  2262,   118,  1350,     8, 15079,   857,   910,  9422,    19,
           449,  3979,  2262,   118,  1350,  5516,  3686,   931, 50267,    24,
            21,   703,    15,   361, 10668,   428, 48540,   336,    25, 11794,
           200,   881, 50267,  1349,  2092,    15,  2424, 39210,  5403,     9,
           385,  4324, 11356,   102, 502

logits:  tensor([[ 0.3481,  0.4873,  0.3723,  0.2983,  0.4302,  0.1158,  0.7778,  0.3857,
          0.4055,  0.4695,  0.6133,  0.3804,  0.3689,  0.4585,  0.2460,  0.2494,
          0.4138,  0.4202,  0.4465,  0.4583,  0.5718,  0.0891,  0.8877,  0.3040,
          0.5190,  0.5312,  0.6177,  0.3357, -0.1143,  0.6338,  0.3828,  0.0643,
          0.3174,  0.2637,  0.1544, -0.1919,  0.3860,  0.2610,  0.2700,  0.7568,
          0.2374,  0.7188,  0.4795,  0.4365,  0.2169,  0.2959,  0.2820,  0.1176,
          0.9536,  0.5815,  0.6528,  0.3232,  0.3875,  0.6934, -0.0759,  0.7656,
          0.3171,  0.1135,  0.2142,  0.4565, -0.0782,  0.0661,  0.7905,  0.9648,
          0.0857,  0.2632,  0.5005,  0.2886,  0.4668,  0.5386,  0.4790,  0.1100,
          0.3933,  0.5322,  0.4519,  0.2693,  0.2839,  0.6787,  0.3030,  0.5747,
          0.3972,  0.4365,  0.3323,  0.6245,  0.0612,  0.2411,  0.1558,  0.1702,
          0.2438,  0.6128,  0.4338,  0.3772,  0.3147,  0.7661,  0.3896,  0.1863,
          0.1638,  

input_ids:  tensor([[    0, 50268,  8015,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  31,   87,  154,  235,  298,  423,  471,  555,  788, 1046],
       device='cuda:0')
sp_para_output:  tensor([[[-0.0061,  0.1210,  0.0836,  ...,  0.4637,  0.2082,  0.1509],
         [ 0.1062,  0.1167, -0.2277,  ...,  0.1917, -0.1942,  0.0599],
         [ 0.2302,  0.2664, -0.1654,  ...,  0.3608,  0.0353, -0.0024],
         ...,
         [ 0.1667,  0.2928,  0.1076,  ...,  0.3451,  0.0549,  0.0739],
         [-0.0421, -0.0874, -0.0952,  ..., -0.1908, -0.0615,  0.0765],
         [-0.0284,  0.0987,  0.0661,  ...,  0.8993, -0.2898,  0.0417]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.2484],
         [-0.1855],
         [-0.2532],
         [-0.2537],
         [-0.4053],
         [-0.3325],
         [-0.2233],
         [-0.2942],
         [-0.1790],
         [-0.1893]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

masked_target:  tensor([[271,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
logits:  tensor([[ 0.2399,  0.9331,  0.1858,  0.5415,  0.6768,  0.1492,  0.0368,  0.2325,
          0.5566,  0.1593,  0.1141,  0.2600,  0.6548,  0.1698,  0.1614,  0.0316,
          0.2101,  0.2932,  0.0989,  0.6196,  0.2520,  0.4502,  0.4587,  0.2861,
          0.6343,  0.3408,  0.3853,  0.1770,  0.9429,  0.3816,  0.8423,  0.3281,
          0.1458,  0.4082,  0.3232,  0.3059,  0.3992,  0.3286,  0.1721,  0.1353,
          0.3230,  0.1049, -0.0453, -0.0577,  0.1871,  0.3601,  0.7974,  0.4175,
          0.5322,  0.4460,  0.4280,  0.4607,  0.2450,  0.5674,  0.8721,  0.1328,
          0.3333,  0.423

input_ids:  tensor([[    0, 50268,  5488, 38187,   260,  8676,   222,   218,   895,  1780,
          4209, 50269,     2, 50265,   475,  4348,   385,  8974,  5691, 50266,
           475,  4348,   385,  8974,  5691,  2421, 31025,    16, 38187,   260,
          8676,     8,  1632,  5799,  8298,    31, 16080,   895, 50267, 26232,
            37,  1665,    25,   919,     9, 16080,   895,   194, 22437,  4561,
           982,   974,   212, 22437,  1418,    11,   425, 50267,   385,  8974,
          5691,  1665,    25,  5688,   884,    11, 16080,   895, 22437, 50267,
          2052,     7,   145,  2736,     7, 16080,   895, 22437,   385,  8974,
          5691,  1665,    11,   194,   790,    31, 15077,     7,  8548, 50267,
            37,  3562,  2052,     7,  2266,  1727,     8,    21,  4209,    30,
         44009,   181,  1368,  4291,  1344, 50267,  1437,     2, 50265,   475,
          8616,  2463,   218,   895,  1780, 50266,   475,  8616,  2463,   218,
           895,  1780,   842,  3320, 225

input_ids:  tensor([[    0, 50268,  5488,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  24,  145,  383,  527,  607,  680,  846,  944,  999, 1072],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.2696,  0.3144, -0.3171,  ...,  0.1237, -0.4681,  0.0295],
         [ 0.2870, -0.0661, -0.3764,  ...,  0.1357, -0.2375, -0.0756],
         [ 0.0922,  0.1017, -0.3904,  ...,  0.3348, -0.1346, -0.1542],
         ...,
         [-0.0658,  0.2688, -0.3452,  ...,  1.4572, -0.3854,  0.1816],
         [ 0.1909,  0.1018, -0.3586,  ...,  0.8559, -0.3272,  0.0413],
         [ 0.3487,  0.1656, -0.3792,  ..., -0.0418, -0.4977, -0.0405]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.2559],
         [-0.2185],
         [-0.2891],
         [-0.1759],
         [-0.2052],
         [-0.1796],
         [-0.1727],
         [-0.2170],
         [-0.2874],
         [-0.2206]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268,  4651,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  31,   92,  193,  291,  328,  448,  602,  660,  983, 1154],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.2267,  0.1081,  0.1525,  ...,  0.2570, -0.5690, -0.1499],
         [ 0.1864,  0.1917, -0.0627,  ...,  0.2647, -0.2123, -0.2870],
         [ 0.0648,  0.1573,  0.0666,  ...,  0.0721, -0.7080,  0.2929],
         ...,
         [ 0.0878,  0.2330, -0.2087,  ...,  0.0903, -0.5700, -0.3107],
         [ 0.1658,  0.0599,  0.0110,  ...,  0.0641, -0.2215, -0.0344],
         [ 0.4832,  0.2891,  0.1514,  ...,  0.7915, -0.4968,  0.3296]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.3130],
         [-0.2078],
         [-0.2812],
         [-0.3813],
         [-0.2617],
         [-0.2629],
         [-0.2338],
         [-0.1853],
         [-0.2025],
         [-0.2927]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268, 10569,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  17,  164,  219,  457,  591,  882, 1138, 1299, 1438, 1534],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.1554,  0.4489,  0.0671,  ..., -0.0942,  0.0505, -0.0261],
         [ 0.2574,  0.1864, -0.0546,  ...,  0.4884, -0.3939, -0.3353],
         [ 0.0513,  0.3227, -0.0194,  ..., -0.0850, -0.0686, -0.0105],
         ...,
         [ 0.2191,  0.1653,  0.0064,  ..., -0.0945, -0.3574, -0.2740],
         [ 0.2589,  0.4427,  0.1646,  ...,  0.1365, -0.3179, -0.2932],
         [ 0.4495,  0.1647,  0.0328,  ...,  0.0939, -0.0171, -0.0833]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.2059],
         [-0.3611],
         [-0.2118],
         [-0.3101],
         [-0.2568],
         [-0.4241],
         [-0.3162],
         [-0.3691],
         [-0.3538],
         [-0.2625]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

validation_step
qid:  5a879ab05542996e4f30887e
input_ids:  tensor([[    0, 50268, 24761,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 19, 117, 148, 310, 424, 497, 690, 746, 834, 880], device='cuda:0')
sp_para_output:  tensor([[[ 0.1689,  0.1725,  0.2136,  ...,  0.4707, -0.3124,  0.0013],
         [ 0.1503,  0.0123, -0.0383,  ...,  0.5139, -0.3708, -0.1662],
         [ 0.1632,  0.4326, -0.0145,  ...,  1.0082, -0.3095,  0.0762],
         ...,
         [ 0.0808,  0.1468,  0.0150,  ...,  0.1752, -0.0216, -0.0207],
         [-0.1847,  0.4490,  0.0178,  ..., -0.0799,  0.2958, -0.1591],
         [ 0.1019,  0.1692,  0.1041,  ...,  0.5462, -0.2075, -0.2308]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.3889],
         [-0.3342],
         [-0.1963],
         [-0.4563],
         [-0.3750],
         [-0.3015],
         [-0.3965],
         [-0.3311],
         [-0.2891],
         [-0.3884]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size

sp_para_output:  tensor([[[ 0.2059,  0.2561, -0.1132,  ...,  0.5111, -0.4447, -0.0872],
         [ 0.2643,  0.4384, -0.0662,  ...,  0.5111, -0.6473, -0.2613],
         [ 0.2831,  0.3942, -0.1354,  ...,  0.5802, -0.4277, -0.2174],
         ...,
         [ 0.2085,  0.2055, -0.1364,  ...,  0.6469, -0.5158, -0.0345],
         [ 0.5132,  0.0313,  0.0350,  ...,  0.7024, -0.5379, -0.1489],
         [ 0.4519,  0.0420, -0.1159,  ...,  0.4862, -0.5111, -0.2653]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.3914],
         [-0.4060],
         [-0.4612],
         [-0.3179],
         [-0.3821],
         [-0.3936],
         [-0.3879],
         [-0.3928],
         [-0.3506],
         [-0.3806]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1810])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1810])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[707,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

         10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 14, 108, 170, 333, 378, 430, 492, 626, 662, 742], device='cuda:0')
sp_para_output:  tensor([[[-0.0540,  0.3415, -0.1378,  ...,  0.5545, -0.1575, -0.1775],
         [-0.0062,  0.3863, -0.2633,  ...,  0.3324,  0.0948, -0.0894],
         [ 0.3094,  0.4076, -0.1550,  ...,  0.0799, -0.5565, -0.0553],
         ...,
         [ 0.2688,  0.1619,  0.0331,  ...,  0.7191, -0.5119, -0.1075],
         [ 0.1050,  0.6689, -0.1581,  ...,  0.4905, -0.4137, -0.3852],
         [ 0.2511,  0.3648, -0.0834,  ...,  0.3918, -0.5031, -0.2733]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.2676],
         [-0.2876],
         [-0.3682],
         [-0.4109],
         [-0.4211],
         [-0.3677],
         [-0.3848],
         [-0.3535],
         [-0.3601],
         [-0.4443]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 783])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size

para_sent_logits_sum: tensor([-2.1807, -2.2969, -4.3862], device='cuda:0')
para_sents_offset: [0, 3, 6, 12]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-0.8408], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-2.1807, -2.2969, -4.3862, -0.8408], device='cuda:0')
para_sents_offset: [0, 3, 6, 12, 13]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-0.6938], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-2.1807, -2.2969, -4.3862, -0.8408, -0.6938], device='cuda:0')
para_sents_offset: [0, 3, 6, 12, 13, 14]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], d

sp_para_output:  tensor([[[ 0.3839,  0.2638, -0.0252,  ...,  0.8440, -0.5256, -0.4131],
         [ 0.0199,  0.4587,  0.0282,  ...,  0.1621, -0.0920,  0.1153],
         [ 0.3909,  0.3629,  0.1360,  ...,  0.7501, -0.5975, -0.4560],
         ...,
         [ 0.3794,  0.2631, -0.0887,  ...,  0.6555, -0.6714, -0.5032],
         [ 0.3787,  0.1517,  0.0819,  ...,  0.7325, -0.6459, -0.2923],
         [ 0.3837,  0.2794, -0.0652,  ...,  0.6322, -0.6596, -0.4763]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.4265],
         [-0.2896],
         [-0.4028],
         [-0.4087],
         [-0.4468],
         [-0.3887],
         [-0.3291],
         [-0.4688],
         [-0.3899],
         [-0.4062]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1681])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1681])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[955, 961,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

validation_step
qid:  5adf44985542993a75d2646d
input_ids:  tensor([[    0, 50268,  5488, 44878,     9, 38727,    11, 14490,  3821,  8377,
           247,  6308,   129,    65,  4707, 50269,     2, 50265,  9473,   493,
         50266,  9473,   493,  4142, 16441,     9,  9473,   493,   741,   298,
         18195,  6528, 26848, 48312,  6382,   271, 18195,   267,  2636,    16,
           247,    11,  2077,    25,   493, 50267,    24,    16,  3821,  8377,
           247,    30,   443,   200,  7877, 24347,   247,    19,    81,   316,
           325,    82,     8,   144, 24347,  4593,    11,   232, 50267,    24,
            16, 43490,    30,  9473,   811,  6444,    15,  2077,  4709,   873,
           811,  3342,    15, 10103,     8, 11751,     9,   741,  3314,   337,
            15, 10171, 50267,    24,   327,  1212,  7562,    19,   181,   677,
          7566,     7,  3072,  1855,  1243,  3087, 18239,     8,   741,   298,
         22165,     7,  9489,     8,   127, 43918, 12601,  1916,     8, 

sp_para_output:  tensor([[[ 0.2618,  0.1079,  0.0618,  ...,  0.9034, -0.5691, -0.2968],
         [ 0.0610,  0.1116, -0.0891,  ...,  0.7909, -0.3034, -0.2027],
         [ 0.0811,  0.5964, -0.1683,  ..., -0.4339, -0.0712, -0.2202],
         ...,
         [ 0.0890,  0.2741,  0.0327,  ...,  0.7691, -0.4839, -0.2181],
         [ 0.1683, -0.0348, -0.0911,  ...,  0.4128, -0.4111,  0.0529],
         [ 0.2579, -0.0327,  0.0360,  ...,  0.9239, -0.3855, -0.1005]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.4319],
         [-0.3469],
         [-0.2661],
         [-0.3877],
         [-0.4526],
         [-0.3594],
         [-0.3679],
         [-0.3899],
         [-0.3162],
         [-0.3381]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1338])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1338])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[551, 556, 827,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.3310,  0.1443, -0.1269,  ...,  0.8949, -0.3475, -0.2480],
         [ 0.4361,  0.3464, -0.1250,  ...,  0.4977, -0.2728, -0.3479],
         [ 0.3930,  0.4341, -0.1013,  ...,  0.3437, -0.1616, -0.4455],
         ...,
         [ 0.4674,  0.0050, -0.0304,  ...,  0.6270, -0.3483, -0.2230],
         [ 0.4517,  0.3072, -0.1456,  ...,  0.6216, -0.2868, -0.4344],
         [ 0.1998,  0.5644, -0.1664,  ..., -0.0754, -0.3378, -0.2569]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.3936],
         [-0.3467],
         [-0.3921],
         [-0.3821],
         [-0.2871],
         [-0.4229],
         [-0.4136],
         [-0.3582],
         [-0.2986],
         [-0.1691]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1340])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1340])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[1173,    0,    0,    0,    0,    0,    0,    0,    0,    0,

validation_step
qid:  5a89372855429951533612e6
input_ids:  tensor([[    0, 50268, 12196,    16,  5933,     9,  1349,   147,  1014, 23321,
           118,   475, 15022, 11824, 30141,   316,  1946,    21, 10899, 50269,
             2, 50265, 14206,  5730, 29762,  9326, 50266, 14206,  5730, 29762,
          9326,    16,  4243,  4930,  1349,  2034,    11, 11824, 30141,    92,
          2077,   885,  4575, 28410,  7085,   493, 50267,    24,    16, 17949,
            15,  9910,    19,  6594,   781,  2523,     9, 14206,  5730, 29762,
             8,   885,   895,  6487,   257,     8,    16,   275,   684,    25,
           184,     9, 11824, 30141, 10775,  4243,  1015,   547,   349, 16874,
         24761,     8, 11824, 30141,   316,  1946,   515,   547,   349, 10668,
           428, 48540, 50267,  5356,  1558,  6301,   251,  1349,    16, 12684,
          2014,  9326,     8,    16,   285,   921,    19,  2340,  2078,  5165,
            77,   117,  4930,  1061,    32,   145,   422,     8,    89, 

logits:  tensor([[ 0.3503,  0.7275,  0.4033,  0.4038,  0.4131,  0.0609,  0.0684,  0.5674,
          0.3047,  0.4312,  0.2216,  0.3843,  0.4487,  0.4175,  0.2125,  0.3530,
          0.0172,  0.3601,  0.3340,  0.5654,  0.2585,  0.5796,  0.3020,  0.4189,
          0.4424,  0.1576,  0.3508,  0.2766,  0.4214,  0.4504,  0.4197,  0.0877,
          0.5195,  0.8853,  0.4922,  0.1678,  0.2944,  0.5088,  0.2150,  0.4734,
          0.6387,  0.5269,  0.3877,  0.2163,  0.0773,  0.6675,  0.4775,  0.3042,
          0.1091,  0.3821,  0.3896,  0.5420,  0.5288,  0.3464,  0.4458,  0.5918,
          0.1996,  0.2622,  0.4050,  0.3499,  0.1425,  0.4868,  0.4751,  0.2261,
          0.3335,  0.3149,  0.2421,  0.0080,  0.0887,  0.1311,  0.6680,  0.3496,
          0.5957,  0.2688,  0.3904,  0.5269,  0.8066,  0.2120,  0.0804,  0.3909,
          0.2874,  0.1224,  0.6636,  0.1740,  0.3354,  0.3416,  0.4539,  0.0638,
         -0.0309,  0.4067,  0.1136,  0.2408,  0.7930,  0.2362,  0.1127,  0.5229,
          0.4478,  

validation_step
qid:  5abd90545542996e802b47d7
input_ids:  tensor([[    0, 50268, 16963,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  19,  106,  186,  286,  389,  493,  737,  847,  981, 1064],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.3768,  0.2297, -0.0058,  ...,  0.2965, -0.2080, -0.2735],
         [ 0.3229,  0.3689, -0.1120,  ...,  0.1828, -0.3291, -0.2425],
         [ 0.1820,  0.2038, -0.1095,  ...,  0.7026, -0.3299, -0.2474],
         ...,
         [ 0.1348,  0.0847, -0.0459,  ...,  0.3567, -0.0375, -0.0366],
         [ 0.1706,  0.3720, -0.1299,  ...,  0.5442, -0.2536, -0.3392],
         [ 0.2170,  0.3198, -0.0450,  ...,  0.6116, -0.3627, -0.3305]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.3313],
         [-0.2859],
         [-0.2957],
         [-0.3713],
         [-0.4175],
         [-0.2598],
         [-0.3428],
         [-0.3555],
         [-0.2722],
         [-0.3276]]], device='cuda:0', dtype=torch.float16)
start_logits.si

sp_para_output:  tensor([[[ 0.2041,  0.1147, -0.0650,  ...,  0.5229, -0.3524, -0.3016],
         [ 0.1428,  0.1955, -0.1989,  ...,  0.5831, -0.3267, -0.3845],
         [ 0.2456,  0.3317, -0.1052,  ...,  0.4935, -0.4014, -0.3225],
         ...,
         [ 0.1163,  0.2220, -0.1058,  ...,  0.4956, -0.4641, -0.3857],
         [ 0.1914, -0.0452,  0.0713,  ...,  0.5399, -0.3057, -0.1786],
         [ 0.1994,  0.0144, -0.1019,  ...,  0.5301, -0.2927, -0.2844]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.3816],
         [-0.4473],
         [-0.4436],
         [-0.3638],
         [-0.3481],
         [-0.3188],
         [-0.3745],
         [-0.4368],
         [-0.4631],
         [-0.4001]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1175])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1175])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[853,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.2981,  0.4854,  0.0302,  ...,  0.6765, -0.4578, -0.2509],
         [ 0.1816,  0.3225, -0.1136,  ...,  0.4154, -0.3696, -0.3185],
         [ 0.0462,  0.3346,  0.0719,  ...,  0.2910, -0.0885, -0.2394],
         ...,
         [ 0.1328,  0.3073,  0.0197,  ...,  0.5632, -0.3321, -0.3155],
         [ 0.1709,  0.2747,  0.0023,  ...,  1.0425, -0.3795, -0.3203],
         [ 0.2088,  0.0534, -0.0351,  ...,  0.5349, -0.2212, -0.2426]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.3796],
         [-0.3706],
         [-0.3594],
         [-0.3022],
         [-0.3047],
         [-0.2898],
         [-0.3748],
         [-0.4238],
         [-0.3682],
         [-0.3374]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1347])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1347])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[507,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.2581,  0.3438,  0.0335,  ...,  0.4558, -0.1756, -0.2218],
         [ 0.0414,  0.1499, -0.2918,  ...,  0.2594, -0.3855, -0.3197],
         [ 0.0825,  0.0654, -0.2203,  ...,  0.3294, -0.3741, -0.1572],
         ...,
         [ 0.2425,  0.1397, -0.2103,  ...,  0.4029, -0.4766, -0.2468],
         [ 0.0693,  0.0090, -0.2862,  ...,  0.4121, -0.4232, -0.4853],
         [ 0.1112, -0.0984, -0.2160,  ...,  0.3855, -0.3165, -0.2112]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.2922],
         [-0.3279],
         [-0.4148],
         [-0.3521],
         [-0.2006],
         [-0.3933],
         [-0.2830],
         [-0.3005],
         [-0.4355],
         [-0.3657]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1494])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1494])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[254, 270, 276,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.2538,  0.3871,  0.1275,  ...,  0.9205, -0.2985, -0.0435],
         [ 0.2773,  0.4882,  0.0063,  ...,  0.5166, -0.3079, -0.1345],
         [ 0.1108,  0.2434,  0.1159,  ...,  0.5524, -0.1488, -0.2357],
         ...,
         [ 0.1595,  0.3676, -0.1162,  ...,  0.7217, -0.4692, -0.0266],
         [ 0.2679,  0.5571, -0.0644,  ...,  0.5149,  0.0065, -0.2887],
         [ 0.2298,  0.4598, -0.0664,  ...,  0.7209, -0.4199, -0.2887]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.3567],
         [-0.3188],
         [-0.2844],
         [-0.3569],
         [-0.3960],
         [-0.3650],
         [-0.2942],
         [-0.4312],
         [-0.3706],
         [-0.2976]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1136])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1136])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[284, 535,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.3115,  0.3703, -0.0139,  ...,  0.2305, -0.0950, -0.1766],
         [ 0.2829,  0.2153, -0.1682,  ...,  0.7566, -0.4391, -0.1893],
         [ 0.4239,  0.2707, -0.1305,  ...,  0.5531, -0.3668, -0.3255],
         ...,
         [ 0.3575,  0.3330, -0.2286,  ...,  0.3035, -0.3128, -0.1289],
         [ 0.3546,  0.2120,  0.1151,  ...,  0.0115,  0.0120, -0.0726],
         [ 0.4889, -0.0657, -0.3584,  ...,  0.8761, -0.3218, -0.3312]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.2507],
         [-0.3970],
         [-0.4541],
         [-0.3665],
         [-0.3376],
         [-0.3640],
         [-0.4490],
         [-0.3110],
         [-0.1758],
         [-0.3828]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1196])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1196])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[203, 581,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

validation_step
qid:  5ac2a5d455429921a00ab01b
input_ids:  tensor([[    0, 50268, 12196,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  23,  136,  330,  467,  616,  808,  859,  950, 1139, 1298],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.2772, -0.1147, -0.0714,  ...,  0.8128, -0.2662, -0.3798],
         [ 0.1623,  0.3302, -0.2929,  ...,  0.4501, -0.0618, -0.2410],
         [-0.0053,  0.2521, -0.1363,  ...,  0.6595, -0.4702, -0.5114],
         ...,
         [ 0.1954, -0.0671, -0.1842,  ...,  0.7063, -0.4141, -0.4468],
         [ 0.1686,  0.0483, -0.0438,  ...,  0.7600, -0.3377, -0.2465],
         [ 0.1289,  0.1993, -0.0929,  ...,  0.5184, -0.3897, -0.4731]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.3149],
         [-0.2764],
         [-0.3530],
         [-0.3259],
         [-0.2983],
         [-0.3809],
         [-0.3364],
         [-0.3767],
         [-0.3398],
         [-0.3672]]], device='cuda:0', dtype=torch.float16)
start_logits.si

sp_para_output:  tensor([[[ 0.2464,  0.2153, -0.0130,  ...,  0.4256, -0.4024, -0.0304],
         [ 0.0280,  0.4806, -0.0457,  ...,  0.4598, -0.2697, -0.2849],
         [-0.0041,  0.6357, -0.1231,  ...,  0.1875, -0.3955, -0.0885],
         ...,
         [ 0.1574,  0.3301, -0.0760,  ...,  0.4323, -0.3712, -0.2858],
         [ 0.0186,  0.1822,  0.0172,  ...,  0.4120, -0.0397,  0.2359],
         [ 0.1377,  0.3575, -0.0516,  ...,  0.4225, -0.5328, -0.3177]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.3894],
         [-0.2805],
         [-0.3276],
         [-0.3335],
         [-0.3462],
         [-0.2717],
         [-0.3870],
         [-0.3853],
         [-0.1913],
         [-0.3687]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1383])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1383])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[634, 969,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 4.1434e-01,  1.5922e-01,  3.8695e-04,  ...,  5.2805e-01,
          -1.6118e-01, -3.6043e-01],
         [ 4.5312e-01,  5.2671e-02, -4.9368e-02,  ...,  4.5268e-01,
          -3.4181e-01, -1.8498e-01],
         [ 3.1354e-01,  2.4162e-01, -5.1873e-02,  ...,  6.0600e-01,
          -2.1092e-01, -2.5079e-01],
         ...,
         [ 4.6284e-01,  4.8849e-01,  1.2943e-01,  ...,  3.6622e-01,
          -5.1475e-01, -2.1460e-01],
         [ 3.7924e-01,  1.8501e-01, -9.5174e-03,  ...,  5.3109e-01,
          -1.5433e-01, -2.7214e-01],
         [ 2.0614e-01,  2.2050e-01,  1.9740e-02,  ...,  2.9384e-01,
          -1.3557e-01, -3.4376e-01]]], device='cuda:0')
sp_para_output_t:  tensor([[[-0.3301],
         [-0.3821],
         [-0.3689],
         [-0.4285],
         [-0.3079],
         [-0.2751],
         [-0.3225],
         [-0.3169],
         [-0.2913],
         [-0.3608]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 917])
start_positions.si

sp_para_output:  tensor([[[ 0.0805,  0.2891, -0.0596,  ...,  0.7433, -0.3882,  0.0334],
         [ 0.2348, -0.0982, -0.3502,  ...,  0.1660, -0.2242, -0.1089],
         [ 0.1075,  0.3069, -0.2687,  ...,  0.6589, -0.3249,  0.1073],
         ...,
         [ 0.3213,  0.1811, -0.0771,  ...,  0.6934, -0.2611, -0.0676],
         [-0.0052,  0.0065, -0.0647,  ...,  0.7418, -0.1292,  0.0033],
         [ 0.1898, -0.0477,  0.0192,  ...,  0.9433, -0.3442,  0.0340]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.4185],
         [-0.3237],
         [-0.3643],
         [-0.4043],
         [-0.5078],
         [-0.4109],
         [-0.3979],
         [-0.4514],
         [-0.3845],
         [-0.4355]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1124])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1124])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[268, 288, 337, 362, 387, 430, 434,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 1.6979e-01,  3.6879e-01,  1.3506e-01,  ...,  8.7043e-01,
          -3.9464e-02, -1.6813e-01],
         [ 9.3890e-02,  7.5204e-01, -1.2595e-01,  ...,  6.2832e-01,
          -2.2662e-01, -3.6766e-01],
         [-2.6281e-02,  4.3827e-01, -1.2732e-01,  ...,  3.9725e-01,
          -5.5156e-02, -1.8325e-01],
         ...,
         [ 4.7732e-02,  6.0617e-01, -6.0498e-02,  ...,  2.9982e-01,
           1.2508e-01, -1.8231e-03],
         [ 1.1387e-01,  3.8865e-01, -7.1477e-02,  ...,  5.5195e-01,
          -1.3447e-01, -1.0477e-01],
         [ 1.8235e-04,  4.0030e-01, -9.0560e-02,  ...,  6.9286e-01,
           1.8788e-01, -3.2990e-01]]], device='cuda:0')
sp_para_output_t:  tensor([[[-0.3435],
         [-0.3091],
         [-0.3342],
         [-0.3408],
         [-0.3479],
         [-0.3320],
         [-0.3367],
         [-0.2124],
         [-0.2947],
         [-0.3645]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 877])
start_positions.si

input_ids:  tensor([[    0, 50268,  5488, 38187,   260,  8676,   222,   218,   895,  1780,
          4209, 50269,     2, 50265,   475,  4348,   385,  8974,  5691, 50266,
           475,  4348,   385,  8974,  5691,  2421, 31025,    16, 38187,   260,
          8676,     8,  1632,  5799,  8298,    31, 16080,   895, 50267, 26232,
            37,  1665,    25,   919,     9, 16080,   895,   194, 22437,  4561,
           982,   974,   212, 22437,  1418,    11,   425, 50267,   385,  8974,
          5691,  1665,    25,  5688,   884,    11, 16080,   895, 22437, 50267,
          2052,     7,   145,  2736,     7, 16080,   895, 22437,   385,  8974,
          5691,  1665,    11,   194,   790,    31, 15077,     7,  8548, 50267,
            37,  3562,  2052,     7,  2266,  1727,     8,    21,  4209,    30,
         44009,   181,  1368,  4291,  1344, 50267,  1437,     2, 50265,   475,
          8616,  2463,   218,   895,  1780, 50266,   475,  8616,  2463,   218,
           895,  1780,   842,  3320, 225

sp_para_output:  tensor([[[ 0.2036, -0.1518, -0.3330,  ...,  0.6988, -0.3654, -0.3335],
         [ 0.4030, -0.1176, -0.5143,  ...,  0.5226, -0.3911, -0.2486],
         [-0.0620,  0.3445, -0.5400,  ...,  0.2084, -0.2012, -0.3607],
         ...,
         [ 0.2239, -0.0326, -0.4352,  ...,  0.8214, -0.2973, -0.1611],
         [ 0.1041,  0.1894, -0.5367,  ...,  0.6322, -0.2674, -0.3807],
         [ 0.2190, -0.0260, -0.3930,  ...,  0.4542, -0.4623, -0.1670]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.3362],
         [-0.3877],
         [-0.3828],
         [-0.3730],
         [-0.2727],
         [-0.3442],
         [-0.3420],
         [-0.3447],
         [-0.3096],
         [-0.4009]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1177])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1177])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[608, 612,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.3900,  0.4365,  0.0703,  ...,  0.8711, -0.4043, -0.3713],
         [ 0.3537,  0.0687, -0.1125,  ...,  0.7527, -0.3433, -0.5825],
         [ 0.2222, -0.1082, -0.0110,  ...,  0.9230, -0.3324, -0.6683],
         ...,
         [ 0.2273,  0.2215, -0.1839,  ...,  0.8059, -0.3784, -0.6910],
         [ 0.3200,  0.0135,  0.0306,  ...,  0.8275, -0.4123, -0.5990],
         [-0.1206,  0.2514, -0.3977,  ...,  0.8480, -0.1818, -0.4191]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.3545],
         [-0.3057],
         [-0.3420],
         [-0.3818],
         [-0.3816],
         [-0.3574],
         [-0.3755],
         [-0.3391],
         [-0.3257],
         [-0.3440]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1395])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1395])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[567,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.2984,  0.2741,  0.0021,  ...,  1.0234, -0.5494, -0.1691],
         [ 0.0537,  0.3208, -0.1611,  ..., -0.1328,  0.0521, -0.0404],
         [ 0.2514,  0.1819, -0.0774,  ...,  0.9673, -0.4915, -0.2788],
         ...,
         [ 0.2532,  0.2001, -0.1076,  ...,  0.9995, -0.6162, -0.4685],
         [ 0.2323,  0.3342, -0.0855,  ...,  0.9304, -0.5164, -0.4858],
         [ 0.2888,  0.1176, -0.0326,  ...,  0.8571, -0.4455, -0.1120]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.3655],
         [-0.2510],
         [-0.3477],
         [-0.1377],
         [-0.3079],
         [-0.3459],
         [-0.4412],
         [-0.3638],
         [-0.3853],
         [-0.3477]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1514])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1514])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 624,  917, 1128,    0,    0,    0,    0,    0,    0,    0,

sp_para_output:  tensor([[[-0.1990,  0.3980, -0.1727,  ...,  0.7842, -0.2581, -0.0881],
         [ 0.1054,  0.2942,  0.0155,  ...,  0.5547, -0.4567, -0.2221],
         [ 0.1380,  0.2483, -0.0891,  ...,  0.5388, -0.2673, -0.2246],
         ...,
         [ 0.1906,  0.4407, -0.2253,  ...,  0.6936, -0.5128, -0.3594],
         [ 0.1163,  0.2517, -0.2197,  ...,  0.6480, -0.3800, -0.3677],
         [ 0.0332,  0.2890, -0.1430,  ...,  0.7779, -0.2431, -0.1725]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.3142],
         [-0.4788],
         [-0.4207],
         [-0.3508],
         [-0.3484],
         [-0.3633],
         [-0.3564],
         [-0.4165],
         [-0.4153],
         [-0.3506]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1806])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1806])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[606,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

answers:  [{'text': ' couple had two children robert and julie  new faces', 'score': tensor([0.5479], device='cuda:0', dtype=torch.float16), 'start_logit': tensor(0.9834, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.6074, device='cuda:0', dtype=torch.float16), 'p_type_score': tensor([0.0530], device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9]
para_sent_logits: tensor([-0.7471, -0.7593], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-1.5063], device='cuda:0')
para_sents_offset: [0, 2]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-0.6494, -0.4541, -0.7490, -0.8120], device='cuda:0',
       dtype=torch.float16)
para_sent_logits_sum: tensor([-1.5063, -2.6646], device='cuda:0')
para_sents_offset: [0, 2, 6]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1

       device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-1.5571, -0.7100, -3.9355, -5.2588], device='cuda:0')
para_sents_offset: [0, 2, 3, 9, 16]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-0.7363, -0.7588, -0.7319, -0.7466, -0.8325], device='cuda:0',
       dtype=torch.float16)
para_sent_logits_sum: tensor([-1.5571, -0.7100, -3.9355, -5.2588, -3.8062], device='cuda:0')
para_sents_offset: [0, 2, 3, 9, 16, 21]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64), 4: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-0.7344, -0.6997, -0.6069, -0.7368, -0.3599, -0.7339], d

answers:  [{'text': ' pornographers  new pornographers new', 'score': tensor([0.4458], device='cuda:0', dtype=torch.float16), 'start_logit': tensor(0.8735, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3701, device='cuda:0', dtype=torch.float16), 'p_type_score': tensor([0.0934], device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 5, 5, 5, 6, 6, 6, 7, 8, 8, 8, 8, 8, 8, 9, 9]
para_sent_logits: tensor([-0.5063, -0.8242], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-1.3306], device='cuda:0')
para_sents_offset: [0, 2]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-0.7534, -0.7681, -0.7568], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-1.3306, -2.2783], device='cuda:0')
para_sents_offset: [0, 2, 5]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: te

s_to_p_map:  [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9]
para_sent_logits: tensor([-0.7373, -0.7358, -0.7920, -0.6890], device='cuda:0',
       dtype=torch.float16)
para_sent_logits_sum: tensor([-2.9541], device='cuda:0')
para_sents_offset: [0, 4]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-0.7402, -0.7148, -0.7549], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-2.9541, -2.2100], device='cuda:0')
para_sents_offset: [0, 4, 7]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-0.7495, -0.7920, -0.4766, -0.7930, -0.7417], device='cuda:0',
       dtype=torch.float16)
para_sent_logits_sum: tensor([-2.9541, -2.2100, -3.5527], device='cuda:0')
para_sents_offset: [0, 4, 7, 12]
evidence_candidates: {0: tensor([


Epoch 00002: avg_val_f1 reached 0.00406 (best 0.00406), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_2.ckpt as top 5


validation_epoch_end
before sync --> sizes:  29, 29, 29
after sync --> sizes: 29, 29, 29
avg_loss:  tensor(21.5743, device='cuda:0')	avg_answer_loss:  tensor(6.3912, device='cuda:0')	avg_type_loss:  tensor(0.9522, device='cuda:0')	avg_val_f1:  0.004056795146958581	avg_val_em:  0.0	avg_val_prec:  0.0024630542972992205	avg_val_recall:  0.011494253216118648
avg_val_sp_sent_f1:  tensor(0., device='cuda:0')	avg_val_sp_sent_em:  tensor(0., device='cuda:0')	avg_val_sp_sent_prec:  tensor(0., device='cuda:0')	avg_val_sp_sent_recall:  tensor(0., device='cuda:0')
avg_val_joint_f1:  tensor(0., device='cuda:0')	avg_val_joint_em:  tensor(0., device='cuda:0')	avg_val_joint_prec:  tensor(0., device='cuda:0')	avg_val_joint_recall:  tensor(0., device='cuda:0')
input_ids:  tensor([[    0, 50268, 24761,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 19, 117, 148, 310, 424, 497, 690, 746, 834, 880], device='cuda:0')
sp_para_output:  tensor([[[ 0.1723,  0.1181,  0.1589,  ...,  0.3732,

start_logits.size():  torch.Size([1, 783])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 783])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[448,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
logits:  tensor([[ 3.5107e-01,  5.5371e-01,  3.7158e-01,  6.5137e-01,  4.8242e-01,
          4.9316e-01,  4.6826e-01,  5.6689e-01,  2.8271e-01,  3.3447e-01,
          1.7737e-01,  6.0645e-01,  4.3604e-01,  2.9248e-01,  4.3823e-01,
          4.5312e-01,  6.1572e-01,  1.4563e-01,  8.1299e-01,  4.9023e-01,
          4.0869e-01,  5.3223e-01,  7.5635e-01,  9.9414e-01,  3.9355e-01,
         -1.4636e-01,  3.5229e-01,  9.6143e-01,  

sp_para_output:  tensor([[[ 0.3672,  0.3144,  0.1487,  ...,  0.9885, -0.1900, -0.1960],
         [ 0.2590,  0.2990, -0.0079,  ...,  0.6747, -0.2218,  0.0278],
         [ 0.2980,  0.5363,  0.1031,  ...,  0.6536, -0.5476, -0.5821],
         ...,
         [ 0.1472,  0.1167,  0.0795,  ...,  0.6454, -0.2084, -0.2289],
         [ 0.2145,  0.7937,  0.0168,  ...,  0.7339, -0.7202, -0.8343],
         [ 0.2010,  0.1992,  0.0949,  ...,  0.5268, -0.5709, -0.4285]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.4138],
         [-0.3350],
         [-0.3379],
         [-0.3665],
         [-0.3657],
         [-0.2898],
         [-0.3711],
         [-0.3325],
         [-0.4121],
         [-0.3606]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 1098])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1098])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 63

sp_para_output:  tensor([[[ 0.4225,  0.2165,  0.1478,  ...,  0.8646, -0.1530, -0.3845],
         [ 0.2520,  0.3599,  0.0846,  ...,  0.4359, -0.3968, -0.6659],
         [ 0.0541,  0.2734, -0.0087,  ...,  1.0859,  0.0115, -0.3742],
         ...,
         [ 0.2134,  0.5440, -0.1698,  ...,  0.9936, -0.0755,  0.1134],
         [ 0.0205,  0.3548,  0.0857,  ...,  0.7334, -0.2530, -0.3769],
         [ 0.0743,  0.3283, -0.0489,  ...,  0.5522, -0.2283, -0.0928]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.4382],
         [-0.3335],
         [-0.3164],
         [-0.3201],
         [-0.3127],
         [-0.2499],
         [-0.4004],
         [-0.2505],
         [-0.4543],
         [-0.4028]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 839])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 839])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[431, 

input_ids:  tensor([[    0, 50268,  8155,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  39,  184,  322,  407,  550,  683,  792,  877, 1037, 1088],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.4693,  0.0631,  0.1874,  ...,  0.6907, -0.5545, -0.0697],
         [ 0.3887,  0.1650,  0.2073,  ...,  1.0639, -0.6748, -0.2163],
         [ 0.0067,  0.4582, -0.2318,  ..., -0.2189, -0.3047, -0.3905],
         ...,
         [ 0.2378,  0.4421, -0.1395,  ...,  0.4503, -0.3272, -0.1556],
         [ 0.3876,  0.2899, -0.2068,  ...,  0.2165, -0.6424, -0.2219],
         [ 0.3417, -0.2352,  0.1273,  ...,  0.7078, -0.4530, -0.0435]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.3716],
         [-0.3809],
         [-0.2778],
         [-0.5327],
         [-0.3904],
         [-0.2710],
         [-0.3293],
         [-0.4248],
         [-0.2041],
         [-0.3794]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

masked_target:  tensor([[93,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0]], device='cuda:0')
logits:  tensor([[ 2.5806e-01,  8.4961e-01,  3.9990e-01,  4.6021e-01,  2.9688e-01,
          2.1985e-01,  4.3970e-01,  5.7031e-01,  2.9150e-01,  4.3311e-01,
          1.4978e-01,  2.1716e-01,  3.4521e-01,  3.1934e-01,  1.5417e-01,
          4.2944e-01,  6.4331e-02,  4.2529e-01,  3.0786e-01,  5.0732e-01,
          2.3230e-01,  3.3374e-01,  3.0957e-01,  4.3799e-01,  3.9819e-01,
          2.4036e-01,  5.2197e-01,  3.5962e-01,  3.6157e-01,  5.2539e-01,
          4.4336e-01,  1.1273e-01,  4.6973e-01,  6.4355e-01,  3.1470e-01,
          3.5376e-01,  4.7021e-01,  4.1431e-01,  3.9111e-01,  4.5508e-01,
          5.8740e-01,  9.9902e-01,  8.7500e-01,  4.0674e-01,  2.2693e-01

para_indexes:  tensor([  19,  106,  186,  286,  389,  493,  737,  847,  981, 1064],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.5095, -0.0629, -0.2380,  ...,  0.4836, -0.3628, -0.3021],
         [ 0.2822,  0.5319, -0.3526,  ...,  0.4225, -0.3567, -0.2673],
         [ 0.4841,  0.2695, -0.3015,  ...,  1.4274, -0.1611, -0.1907],
         ...,
         [ 0.3111,  0.2420, -0.2785,  ...,  0.3835, -0.0892,  0.1092],
         [ 0.4166,  0.3746, -0.2000,  ...,  0.3220, -0.4792, -0.2102],
         [ 0.3688,  0.4063, -0.1390,  ...,  0.4369, -0.2243, -0.1897]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.4119],
         [-0.3745],
         [-0.4124],
         [-0.3228],
         [-0.5537],
         [-0.2698],
         [-0.4214],
         [-0.3181],
         [-0.2849],
         [-0.3293]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 1206])
start_positions.size():  torch.Size([1, 64])
end_l

input_ids:  tensor([[    0, 50268, 12196,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  22,  123,  225,  306,  392,  519,  575,  803,  922, 1002],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.2437,  0.2367,  0.0873,  ...,  0.9133, -0.1825, -0.0384],
         [ 0.2645,  0.6133, -0.0271,  ...,  0.7925, -0.2751, -0.2000],
         [ 0.3733,  0.6749, -0.1820,  ...,  0.5484, -0.4827, -0.2648],
         ...,
         [ 0.3034,  0.4016, -0.1441,  ...,  1.0164, -0.5857, -0.2804],
         [ 0.3529,  0.6795,  0.0135,  ...,  0.3567, -0.1982, -0.3090],
         [ 0.3835,  0.3553, -0.1225,  ...,  1.0588, -0.1835,  0.2624]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.3286],
         [-0.4514],
         [-0.4678],
         [-0.4949],
         [-0.5532],
         [-0.5415],
         [-0.3635],
         [-0.5186],
         [-0.4731],
         [-0.4121]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268,  6588,   102, 11356,   102, 47510,  3250,  2214,  9408,
             8,  1421,  2642, 29219,   346,  1264,   881,    92,  1492,    16,
          2214,    30, 47510,  3250,   385,  4324, 11356,   102,    31,    69,
         14887, 31386,  2453,  4535,  2642,   703,    11,    99,    76, 50269,
             2, 50265,    94,  3836,   385,  4324, 11356,   102,  2214, 50266,
            94,  3836,    16,  2214,    30, 47510,  3250,   385,  4324, 11356,
           102,    31,    69, 14887, 31386,  2453,  4535,  2642,   193, 50267,
         11356,   102,   875,  2214,    19,  1149,  2457,   449,  3979,   449,
          3979,  2262,   118,  1350,     8, 15079,   857,   910,  9422,    19,
           449,  3979,  2262,   118,  1350,  5516,  3686,   931, 50267,    24,
            21,   703,    15,   361, 10668,   428, 48540,   336,    25, 11794,
           200,   881, 50267,  1349,  2092,    15,  2424, 39210,  5403,     9,
           385,  4324, 11356,   102, 502

logits:  tensor([[ 7.1777e-01,  5.9375e-01,  1.8579e-01,  2.2559e-01,  3.5181e-01,
          2.1155e-01,  8.3301e-01,  4.2334e-01,  7.2705e-01,  4.6582e-01,
          3.9722e-01,  4.5605e-01,  3.9331e-01,  2.8931e-01,  2.8589e-01,
          1.2128e-01,  3.0762e-01,  4.4751e-01,  6.7920e-01,  4.9805e-01,
          5.6689e-01, -1.4929e-01,  8.5059e-01,  4.1968e-01,  4.9622e-02,
          4.6362e-01,  5.6982e-01,  2.4011e-01, -1.3196e-01,  5.5615e-01,
          4.7900e-01,  4.3896e-01,  4.3164e-01,  3.2227e-01, -2.6733e-02,
         -2.2766e-01,  3.9014e-01,  5.5762e-01,  3.8037e-01,  2.0825e-01,
          2.6758e-01,  4.1333e-01,  5.8545e-01,  4.9707e-01, -1.3123e-01,
          2.7075e-01,  2.8394e-01,  1.6272e-01,  5.5811e-01,  6.0547e-01,
          8.0029e-01,  3.8770e-01,  4.7119e-01,  5.2930e-01, -1.8042e-01,
          8.2178e-01,  3.3594e-01,  6.2752e-03,  5.7227e-01,  6.5283e-01,
          8.8013e-02, -3.2104e-02,  8.6035e-01,  6.1865e-01,  3.1567e-01,
          3.7451e-01,  3.0957

sp_para_output:  tensor([[[ 0.1049,  0.4877, -0.2818,  ...,  0.8390, -0.1584, -0.1819],
         [ 0.3066,  0.3557, -0.3117,  ...,  0.4716, -0.3026, -0.2735],
         [ 0.1521,  0.1927, -0.3428,  ...,  0.6976, -0.1405,  0.3250],
         ...,
         [ 0.5368,  0.0800, -0.1592,  ...,  1.1115, -0.5907, -0.3915],
         [ 0.0826,  0.3953, -0.2547,  ...,  0.4508,  0.0688, -0.4498],
         [ 0.1931,  0.3841, -0.2245,  ...,  0.6936, -0.3776,  0.1273]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.5566],
         [-0.4968],
         [-0.4526],
         [-0.5269],
         [-0.6118],
         [-0.5215],
         [-0.5498],
         [-0.6597],
         [-0.7036],
         [-0.6665]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 1124])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1124])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[268

logits:  tensor([[ 3.1250e-01,  4.9219e-01,  2.6245e-01,  9.1406e-01,  5.1514e-01,
          2.2815e-01, -2.2656e-01,  2.0947e-01,  7.7051e-01,  3.9429e-01,
          2.3938e-01,  4.6899e-01,  6.7676e-01,  3.5815e-01,  2.6123e-01,
          2.1094e-01,  3.1738e-01,  3.2104e-01,  1.3098e-01,  4.5801e-01,
          4.8584e-01,  4.4116e-01,  4.9219e-01,  2.1790e-01,  2.3083e-01,
          2.9004e-01,  5.2783e-01,  1.9470e-01,  4.5679e-01,  2.5317e-01,
          4.3408e-01,  3.9819e-01,  1.0614e-01,  2.5171e-01,  3.8330e-01,
          4.7485e-01,  5.1270e-01,  5.8252e-01,  2.4719e-01,  9.5215e-02,
          2.5220e-01,  1.9019e-01, -2.7856e-01, -3.0380e-02,  1.9446e-01,
          4.7437e-01,  3.5083e-01,  4.1382e-01,  7.0117e-01,  7.7588e-01,
          4.6167e-01,  5.0732e-01,  4.8584e-01,  1.2686e+00,  2.3669e-01,
          1.5564e-01,  1.4758e-01,  4.0942e-01,  3.7012e-01,  5.6299e-01,
          4.2700e-01,  4.7607e-01,  2.6074e-01,  1.7615e-01,  7.1045e-01,
          5.3906e-01,  2.8638

input_ids:  tensor([[    0, 50268,  5488, 38187,   260,  8676,   222,   218,   895,  1780,
          4209, 50269,     2, 50265,   475,  4348,   385,  8974,  5691, 50266,
           475,  4348,   385,  8974,  5691,  2421, 31025,    16, 38187,   260,
          8676,     8,  1632,  5799,  8298,    31, 16080,   895, 50267, 26232,
            37,  1665,    25,   919,     9, 16080,   895,   194, 22437,  4561,
           982,   974,   212, 22437,  1418,    11,   425, 50267,   385,  8974,
          5691,  1665,    25,  5688,   884,    11, 16080,   895, 22437, 50267,
          2052,     7,   145,  2736,     7, 16080,   895, 22437,   385,  8974,
          5691,  1665,    11,   194,   790,    31, 15077,     7,  8548, 50267,
            37,  3562,  2052,     7,  2266,  1727,     8,    21,  4209,    30,
         44009,   181,  1368,  4291,  1344, 50267,  1437,     2, 50265,   475,
          8616,  2463,   218,   895,  1780, 50266,   475,  8616,  2463,   218,
           895,  1780,   842,  3320, 225

input_ids:  tensor([[    0, 50268,  5488,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  24,  145,  383,  527,  607,  680,  846,  944,  999, 1072],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.4910,  0.0401,  0.1287,  ...,  0.8315, -0.2699, -0.1117],
         [ 0.4966,  0.1674, -0.5519,  ...,  0.7799, -0.6998, -0.1747],
         [ 0.2477,  0.3306, -0.7532,  ...,  0.7874, -0.4975, -0.3763],
         ...,
         [ 0.3015,  0.2098, -0.4540,  ...,  0.9092, -0.4802, -0.2706],
         [ 0.5881,  0.3160, -0.6255,  ...,  0.2340, -0.4758, -0.2099],
         [ 0.2853,  0.0353, -0.6363,  ...,  0.9813, -0.4638, -0.1093]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.5522],
         [-0.5483],
         [-0.6875],
         [-0.5596],
         [-0.4414],
         [-0.6636],
         [-0.6162],
         [-0.5635],
         [-0.5420],
         [-0.6143]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268,  4651,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  31,   92,  193,  291,  328,  448,  602,  660,  983, 1154],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.5999,  0.4046, -0.2858,  ...,  1.2554, -0.6595, -0.1834],
         [ 0.3888,  0.6196, -0.2640,  ...,  0.7561, -0.4606,  0.2082],
         [ 0.3739,  0.3808, -0.2646,  ...,  0.2846, -0.8286, -0.5019],
         ...,
         [ 0.4942,  0.0498, -0.2409,  ...,  0.7382, -0.3554, -0.3813],
         [ 0.5748,  0.1703, -0.3457,  ...,  1.0918, -0.3381, -0.2992],
         [ 0.5205,  0.5358, -0.1922,  ...,  1.1130, -0.7195, -0.0863]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.6538],
         [-0.6572],
         [-0.6504],
         [-0.6479],
         [-0.6704],
         [-0.6118],
         [-0.6016],
         [-0.5513],
         [-0.5029],
         [-0.6748]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268, 10569,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  17,  164,  219,  457,  591,  882, 1138, 1299, 1438, 1534],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.5204,  0.4071, -0.2823,  ...,  0.3703, -0.4714, -0.4136],
         [ 0.2018,  0.2851, -0.1220,  ...,  0.3368, -0.0363, -0.4988],
         [ 0.3353,  0.3948, -0.5119,  ...,  0.5499, -0.4721, -0.2466],
         ...,
         [ 0.4887,  0.6418,  0.0402,  ...,  0.5598, -0.6160, -0.9830],
         [ 0.3936,  0.6333, -0.3229,  ...,  0.8261, -0.2764, -0.3370],
         [ 0.5159,  0.5862, -0.2881,  ...,  0.5648, -0.3948, -0.4207]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.6147],
         [-0.5972],
         [-0.6187],
         [-0.6221],
         [-0.5454],
         [-0.6377],
         [-0.6133],
         [-0.6523],
         [-0.6641],
         [-0.6504]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

validation_step
qid:  5a879ab05542996e4f30887e
input_ids:  tensor([[    0, 50268, 24761,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 19, 117, 148, 310, 424, 497, 690, 746, 834, 880], device='cuda:0')
sp_para_output:  tensor([[[ 0.3512,  0.4823, -0.0565,  ...,  0.6767, -0.2264, -0.3390],
         [ 0.3871,  0.3776, -0.2302,  ...,  0.6744, -0.2994, -0.4511],
         [ 0.2259,  0.5762, -0.1088,  ...,  1.5733, -0.3206, -0.2468],
         ...,
         [ 0.3210,  0.4698, -0.1968,  ...,  0.6510, -0.0076, -0.3757],
         [-0.0027,  0.5886, -0.1365,  ...,  0.5848, -0.1079, -0.4980],
         [ 0.3795,  0.4462, -0.1572,  ...,  0.8941, -0.1351, -0.5893]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.7280],
         [-0.6592],
         [-0.5586],
         [-0.7583],
         [-0.7305],
         [-0.7065],
         [-0.7207],
         [-0.6943],
         [-0.6802],
         [-0.7231]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size

input_ids:  tensor([[    0, 50268, 13792,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  29,  186,  346,  509,  663,  734, 1082, 1242, 1484, 1664],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.4810,  0.4282, -0.2266,  ...,  0.8801, -0.5413, -0.5085],
         [ 0.4545,  0.5922, -0.1993,  ...,  0.9252, -0.6838, -0.5273],
         [ 0.5396,  0.4669, -0.2667,  ...,  1.0342, -0.4978, -0.4921],
         ...,
         [ 0.4642,  0.3410, -0.2532,  ...,  1.0673, -0.5451, -0.3947],
         [ 0.6513,  0.2120, -0.0512,  ...,  1.0128, -0.5830, -0.4088],
         [ 0.5514,  0.2269, -0.2539,  ...,  0.9068, -0.4406, -0.3537]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.7671],
         [-0.7700],
         [-0.8174],
         [-0.7144],
         [-0.7397],
         [-0.7466],
         [-0.7705],
         [-0.7681],
         [-0.7256],
         [-0.7524]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1810])
start_positions.si

sp_para_output:  tensor([[[ 0.0655,  0.6240, -0.2336,  ...,  0.9132, -0.4581, -0.5362],
         [-0.0153,  0.4639, -0.4122,  ...,  0.6831, -0.2403, -0.7174],
         [ 0.3287,  0.6495, -0.2732,  ...,  0.5385, -0.6261, -0.2918],
         ...,
         [ 0.3436,  0.4962, -0.1399,  ...,  1.0556, -0.5272, -0.3868],
         [ 0.1844,  0.6984, -0.0834,  ...,  0.7857, -0.6347, -0.5249],
         [ 0.3683,  0.6378, -0.1935,  ...,  0.7521, -0.4911, -0.5121]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.6738],
         [-0.6782],
         [-0.7153],
         [-0.7773],
         [-0.7695],
         [-0.7427],
         [-0.7427],
         [-0.7197],
         [-0.7290],
         [-0.7427]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 783])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 783])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[448,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

para_sent_logits: tensor([-1.2197], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-3.6582, -3.6729, -7.2295, -1.2578, -1.1748, -2.4648, -4.8340, -1.1836,
        -1.2217, -1.2197], device='cuda:0')
para_sents_offset: [0, 3, 6, 12, 13, 14, 16, 20, 21, 22, 23]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64), 4: tensor([], device='cuda:0', dtype=torch.int64), 5: tensor([], device='cuda:0', dtype=torch.int64), 6: tensor([], device='cuda:0', dtype=torch.int64), 7: tensor([], device='cuda:0', dtype=torch.int64), 8: tensor([], device='cuda:0', dtype=torch.int64), 9: tensor([], device='cuda:0', dtype=torch.int64)}
sp_para_pred: tensor([4, 7], device='cuda:0')
pre_answer:	 novel written by henry 	gold_answer:	 american 	sp_sent_pred:	 [] 	sp_sent_gold:	 [13, 14, 15] 	sp_para_pred:	 tensor([4, 7], de

validation_step
qid:  5a7e36045542991319bc9440
input_ids:  tensor([[    0, 50268,  5488,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  22,  275,  642,  711,  854,  954, 1016, 1262, 1479, 1603],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.6102,  0.5177, -0.1444,  ...,  1.0712, -0.5205, -0.7159],
         [ 0.1967,  0.7689, -0.0976,  ...,  1.0109, -0.3802, -0.5074],
         [ 0.5373,  0.5923, -0.0188,  ...,  1.1314, -0.5828, -0.6659],
         ...,
         [ 0.5381,  0.5245, -0.1614,  ...,  1.0152, -0.6281, -0.7592],
         [ 0.4717,  0.4203, -0.0745,  ...,  1.0986, -0.5702, -0.6586],
         [ 0.5226,  0.4984, -0.1678,  ...,  1.0365, -0.6366, -0.7156]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.7847],
         [-0.7534],
         [-0.7627],
         [-0.7437],
         [-0.8032],
         [-0.7681],
         [-0.7314],
         [-0.7983],
         [-0.7495],
         [-0.7720]]], device='cuda:0', dtype=torch.float16)
start_logits.si

input_ids:  tensor([[    0, 50268,  5488, 44878,     9, 38727,    11, 14490,  3821,  8377,
           247,  6308,   129,    65,  4707, 50269,     2, 50265,  9473,   493,
         50266,  9473,   493,  4142, 16441,     9,  9473,   493,   741,   298,
         18195,  6528, 26848, 48312,  6382,   271, 18195,   267,  2636,    16,
           247,    11,  2077,    25,   493, 50267,    24,    16,  3821,  8377,
           247,    30,   443,   200,  7877, 24347,   247,    19,    81,   316,
           325,    82,     8,   144, 24347,  4593,    11,   232, 50267,    24,
            16, 43490,    30,  9473,   811,  6444,    15,  2077,  4709,   873,
           811,  3342,    15, 10103,     8, 11751,     9,   741,  3314,   337,
            15, 10171, 50267,    24,   327,  1212,  7562,    19,   181,   677,
          7566,     7,  3072,  1855,  1243,  3087, 18239,     8,   741,   298,
         22165,     7,  9489,     8,   127, 43918, 12601,  1916,     8, 14225,
           462, 42589,     7,  3017, 502

answers:  [{'text': ' where it has been recorded from nova scotia to fl', 'score': tensor([0.5181], device='cuda:0', dtype=torch.float16), 'start_logit': tensor(0.9297, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3025, device='cuda:0', dtype=torch.float16), 'p_type_score': tensor([0.3223], device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9]
para_sent_logits: tensor([-1.2090, -1.2236, -1.1670, -1.2588, -1.1631, -1.2295], device='cuda:0',
       dtype=torch.float16)
para_sent_logits_sum: tensor([-7.2510], device='cuda:0')
para_sents_offset: [0, 6]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-1.2051, -1.2168], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-7.2510, -2.4219], device='cuda:0')
para_sents_offset: [0, 6, 8]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor

sp_para_output:  tensor([[[ 0.5993,  0.4454, -0.3318,  ...,  1.3850, -0.3775, -0.5497],
         [ 0.6058,  0.5606, -0.3257,  ...,  0.9787, -0.2538, -0.5499],
         [ 0.6468,  0.7105, -0.2708,  ...,  0.8381, -0.2005, -0.7418],
         ...,
         [ 0.7261,  0.2501, -0.2467,  ...,  0.9862, -0.3245, -0.5889],
         [ 0.6348,  0.5775, -0.2843,  ...,  1.2105, -0.2399, -0.7124],
         [ 0.3356,  0.4894, -0.4327,  ...,  0.9771, -0.3570, -0.4571]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.7578],
         [-0.7217],
         [-0.7603],
         [-0.7456],
         [-0.7549],
         [-0.7676],
         [-0.7642],
         [-0.7607],
         [-0.6709],
         [-0.6660]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1340])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1340])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[1173,    0,    0,    0,    0,    0,    0,    0,    0,    0,

input_ids:  tensor([[    0, 50268, 12196,    16,  5933,     9,  1349,   147,  1014, 23321,
           118,   475, 15022, 11824, 30141,   316,  1946,    21, 10899, 50269,
             2, 50265, 14206,  5730, 29762,  9326, 50266, 14206,  5730, 29762,
          9326,    16,  4243,  4930,  1349,  2034,    11, 11824, 30141,    92,
          2077,   885,  4575, 28410,  7085,   493, 50267,    24,    16, 17949,
            15,  9910,    19,  6594,   781,  2523,     9, 14206,  5730, 29762,
             8,   885,   895,  6487,   257,     8,    16,   275,   684,    25,
           184,     9, 11824, 30141, 10775,  4243,  1015,   547,   349, 16874,
         24761,     8, 11824, 30141,   316,  1946,   515,   547,   349, 10668,
           428, 48540, 50267,  5356,  1558,  6301,   251,  1349,    16, 12684,
          2014,  9326,     8,    16,   285,   921,    19,  2340,  2078,  5165,
            77,   117,  4930,  1061,    32,   145,   422,     8,    89,    32,
           171, 23251,    61,    64,   1

para_sent_logits: tensor([-1.2012, -1.2012], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-3.4453, -2.3828, -5.9141, -2.4014, -3.6045, -3.5820, -4.7451, -2.4023],
       device='cuda:0')
para_sents_offset: [0, 3, 5, 10, 12, 15, 18, 22, 24]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64), 4: tensor([], device='cuda:0', dtype=torch.int64), 5: tensor([], device='cuda:0', dtype=torch.int64), 6: tensor([], device='cuda:0', dtype=torch.int64), 7: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-1.1699, -1.1904, -1.2344], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-3.4453, -2.3828, -5.9141, -2.4014, -3.6045, -3.5820, -4.7451, -2.4023,
        -3.5947], device='cuda:0')
para_sents_offset: [0, 3, 5, 10, 12, 15, 18, 22, 24, 27]
evidence_candidates: 

       device='cuda:0')
sp_para_output:  tensor([[[ 0.4212,  0.3130, -0.2868,  ...,  0.7006, -0.4309, -0.6214],
         [ 0.3343,  0.4493, -0.4022,  ...,  0.6697, -0.3254, -0.6716],
         [ 0.3956,  0.5128, -0.3645,  ...,  0.6173, -0.3964, -0.7086],
         ...,
         [ 0.2768,  0.3915, -0.2805,  ...,  0.7194, -0.5048, -0.6725],
         [ 0.3407,  0.2513, -0.2081,  ...,  0.7513, -0.2840, -0.5951],
         [ 0.4723,  0.1455, -0.2979,  ...,  0.6623, -0.3512, -0.5495]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.7437],
         [-0.7808],
         [-0.7925],
         [-0.7412],
         [-0.7256],
         [-0.7075],
         [-0.7300],
         [-0.7759],
         [-0.7881],
         [-0.7739]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1175])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1175])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[853,   0,   0,   0,   0,   0,   0, 

sp_para_output:  tensor([[[ 0.4509,  0.6801, -0.1299,  ...,  1.0084, -0.5249, -0.6142],
         [ 0.3289,  0.5361, -0.2765,  ...,  0.7032, -0.4674, -0.6066],
         [ 0.2755,  0.5990, -0.2254,  ...,  0.6950, -0.2680, -0.5815],
         ...,
         [ 0.2387,  0.5224, -0.2070,  ...,  0.8349, -0.4118, -0.6293],
         [ 0.2738,  0.5012, -0.0592,  ...,  1.2850, -0.4537, -0.6441],
         [ 0.3657,  0.3176, -0.1980,  ...,  0.9026, -0.3735, -0.5081]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.7441],
         [-0.7212],
         [-0.7280],
         [-0.6602],
         [-0.6689],
         [-0.6719],
         [-0.7466],
         [-0.7529],
         [-0.7236],
         [-0.6997]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1347])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1347])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[507,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.4400,  0.4729, -0.2548,  ...,  0.7639, -0.4353, -0.5199],
         [ 0.2440,  0.4448, -0.4380,  ...,  0.6063, -0.4959, -0.6417],
         [ 0.1610,  0.2799, -0.3873,  ...,  0.5649, -0.4949, -0.5384],
         ...,
         [ 0.2655,  0.3477, -0.4051,  ...,  0.7332, -0.5722, -0.5508],
         [ 0.2102,  0.2965, -0.4478,  ...,  0.6856, -0.4905, -0.6896],
         [ 0.3335,  0.1796, -0.3811,  ...,  0.6420, -0.3918, -0.5558]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.7437],
         [-0.7393],
         [-0.7798],
         [-0.7314],
         [-0.6318],
         [-0.7695],
         [-0.6699],
         [-0.7002],
         [-0.7896],
         [-0.7593]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1494])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1494])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[254, 270, 276,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.4210,  0.6603,  0.0062,  ...,  1.2523, -0.3587, -0.3425],
         [ 0.3845,  0.6996, -0.2109,  ...,  0.9126, -0.3251, -0.5208],
         [ 0.3643,  0.5635, -0.1093,  ...,  1.0002, -0.2703, -0.4784],
         ...,
         [ 0.2780,  0.6223, -0.2911,  ...,  1.0963, -0.4350, -0.4049],
         [ 0.4611,  0.8259, -0.1521,  ...,  0.9967, -0.2176, -0.5332],
         [ 0.3655,  0.6924, -0.2003,  ...,  1.1310, -0.3739, -0.5655]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.6812],
         [-0.7090],
         [-0.6743],
         [-0.7363],
         [-0.7271],
         [-0.6836],
         [-0.6626],
         [-0.7559],
         [-0.7388],
         [-0.6543]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1136])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1136])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[284, 535,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

para_indexes:  tensor([  29,  207,  372,  479,  594,  720,  806,  984, 1044, 1098],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.6249,  0.5486, -0.2941,  ...,  0.9883, -0.4226, -0.6396],
         [ 0.4566,  0.5417, -0.3164,  ...,  1.1074, -0.3703, -0.5624],
         [ 0.5378,  0.5048, -0.3063,  ...,  1.0677, -0.4067, -0.5546],
         ...,
         [ 0.5711,  0.5811, -0.4214,  ...,  0.8943, -0.4231, -0.4504],
         [ 0.6963,  0.2278, -0.0821,  ...,  0.7520, -0.1958, -0.5932],
         [ 0.5754,  0.3005, -0.4119,  ...,  1.2021, -0.2623, -0.6001]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.7666],
         [-0.7944],
         [-0.8188],
         [-0.7275],
         [-0.7271],
         [-0.7734],
         [-0.8159],
         [-0.7158],
         [-0.6694],
         [-0.7441]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1196])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1196])
end_positions.size(

sp_para_output:  tensor([[[ 0.4201,  0.1921, -0.2968,  ...,  0.9197, -0.3786, -0.5881],
         [ 0.2459,  0.4282, -0.4321,  ...,  0.9807, -0.2873, -0.5728],
         [ 0.1691,  0.4161, -0.3273,  ...,  0.8857, -0.4437, -0.6734],
         ...,
         [ 0.3126,  0.1723, -0.3921,  ...,  0.8195, -0.4070, -0.6747],
         [ 0.2689,  0.2368, -0.2757,  ...,  0.9129, -0.3965, -0.4232],
         [ 0.3122,  0.3715, -0.2876,  ...,  0.6647, -0.3921, -0.6475]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.6655],
         [-0.6831],
         [-0.6938],
         [-0.6846],
         [-0.6558],
         [-0.7021],
         [-0.6978],
         [-0.7100],
         [-0.6729],
         [-0.6919]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1425])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1425])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[1363,    0,    0,    0,    0,    0,    0,    0,    0,    0,

sp_para_output:  tensor([[[ 0.4296,  0.4437, -0.0898,  ...,  0.8850, -0.3962, -0.3835],
         [ 0.2329,  0.8752, -0.2259,  ...,  1.1598, -0.4948, -0.7437],
         [ 0.1797,  0.8724, -0.3359,  ...,  0.8503, -0.5531, -0.4705],
         ...,
         [ 0.2899,  0.4544, -0.1616,  ...,  0.8111, -0.3377, -0.5844],
         [ 0.1534,  0.3607, -0.2090,  ...,  0.9767, -0.1963, -0.2374],
         [ 0.3473,  0.5983, -0.1221,  ...,  0.8021, -0.4055, -0.5951]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.7266],
         [-0.6978],
         [-0.7290],
         [-0.6880],
         [-0.7085],
         [-0.6484],
         [-0.7158],
         [-0.7344],
         [-0.6025],
         [-0.7129]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1383])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1383])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[634, 969,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.6336,  0.4877, -0.2435,  ...,  0.6625, -0.1860, -0.7216],
         [ 0.6271,  0.3381, -0.2495,  ...,  0.6821, -0.2943, -0.5093],
         [ 0.5067,  0.6000, -0.3257,  ...,  0.7583, -0.1515, -0.5383],
         ...,
         [ 0.6122,  0.6630, -0.1403,  ...,  0.5967, -0.3705, -0.4274],
         [ 0.5113,  0.5234, -0.2604,  ...,  0.6977, -0.1851, -0.5415],
         [ 0.4274,  0.5573, -0.2443,  ...,  0.5855, -0.1415, -0.5400]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.6763],
         [-0.6963],
         [-0.6904],
         [-0.7334],
         [-0.6636],
         [-0.6445],
         [-0.6704],
         [-0.6533],
         [-0.6250],
         [-0.6738]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 917])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 917])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 68, 225, 248, 302, 386, 417, 495, 540, 647, 656, 693, 747, 7

       dtype=torch.float16)
para_sent_logits_sum: tensor([-5.9219, -8.3242, -6.0078, -3.5303, -2.3770, -4.7285, -3.6123, -4.6660],
       device='cuda:0')
para_sents_offset: [0, 5, 12, 17, 20, 22, 26, 29, 33]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64), 4: tensor([], device='cuda:0', dtype=torch.int64), 5: tensor([], device='cuda:0', dtype=torch.int64), 6: tensor([], device='cuda:0', dtype=torch.int64), 7: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-1.1924, -1.2041, -1.1943, -1.1836], device='cuda:0',
       dtype=torch.float16)
para_sent_logits_sum: tensor([-5.9219, -8.3242, -6.0078, -3.5303, -2.3770, -4.7285, -3.6123, -4.6660,
        -4.7744], device='cuda:0')
para_sents_offset: [0, 5, 12, 17, 20, 22, 26, 29, 33, 37]
evidence_candidates: {0: tensor([], device='cuda:0', dtype

input_ids:  tensor([[    0, 50268,  5488,   194,   473,  1262,  2326,     9,    61,  8635,
           139,    16,   997,  2558,   741,  1506,   927,    32,  2034, 50269,
             2, 50265,   201,   102,  1262, 50266,   201,   102,  1262,    16,
          9742,  1400,     8, 15288,  3206,    11, 10409,   982,    14,  4497,
            55,    87, 10572,  2326,    11,  4709,   330, 38058, 15983, 39572,
          2649,  3006, 37679,  2649, 22805,     8,   326,  4734, 39941, 50267,
            24,    21,  4790,    11, 22716, 37372,  4709,   330, 38058,    11,
         11265, 50267,    63,  4351,    32,   202,  2034,    11, 22716, 37372,
            19,   943,  4088,    11,   410,  3152,  4709,   330, 38058,   326,
         11418,   102, 15983, 39572,     8, 26012, 42377,   326,  4734, 39941,
         50267,    63,  4095,   138, 21788,   571, 18656,  4497,  2422,   385,
          1262,  2326,     8,  1437, 13349,  1844,  6720,  1262,   258,  3566,
            11, 16874, 24761,  7528,    

logits:  tensor([[ 1.0803e-01, -5.0537e-01, -3.5669e-01, -2.5171e-01, -3.0005e-01,
         -2.2729e-01,  1.9434e-01, -3.1812e-01, -6.3232e-01, -1.7590e-01,
         -7.1106e-02, -1.0176e+00, -1.5857e-01, -2.4063e-02,  3.9520e-03,
          2.7481e-02, -1.2793e-01, -6.3721e-01, -3.0396e-01, -5.3271e-01,
          1.0626e-01, -5.8740e-01, -1.6577e-01, -5.1221e-01, -3.4375e-01,
         -6.1377e-01, -2.1255e-02, -2.9150e-01, -1.4465e-01, -9.9365e-01,
          8.4167e-02,  2.3364e-01, -3.4155e-01,  3.6621e-02, -1.8652e-01,
         -5.1611e-01, -4.3750e-01, -1.2793e-01, -3.3887e-01, -2.9160e-02,
          9.7504e-03,  2.2852e-01,  8.7891e-02,  1.3879e-01, -3.0640e-01,
         -1.9089e-02, -2.1216e-01, -2.4048e-01, -2.9712e-01, -8.8043e-03,
         -1.5808e-01, -1.5649e-01, -2.6880e-01, -1.3733e-01, -2.0972e-01,
         -1.5381e-01,  1.6125e-01, -1.1285e-01, -1.2274e-01, -6.9189e-01,
         -9.4434e-01, -5.9375e-01, -1.3171e-01, -2.2144e-01,  2.5439e-01,
          9.7290e-02, -3.2959

sp_para_output:  tensor([[[ 0.5503,  0.4511, -0.1363,  ...,  0.5163, -0.2727, -0.6054],
         [ 0.3556,  0.4084, -0.2718,  ...,  0.7051, -0.3483, -0.6583],
         [ 0.4117,  0.4010, -0.2991,  ...,  0.6545, -0.1628, -0.6445],
         ...,
         [ 0.3740,  0.3725, -0.2276,  ...,  0.6808, -0.2610, -0.7427],
         [ 0.4137,  0.3873, -0.2932,  ...,  0.6753, -0.1641, -0.6574],
         [ 0.2661,  0.4016, -0.2567,  ...,  0.7266, -0.3354, -0.6828]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.7852],
         [-0.7402],
         [-0.7520],
         [-0.7832],
         [-0.7803],
         [-0.7715],
         [-0.7695],
         [-0.7617],
         [-0.7510],
         [-0.7754]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 835])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 835])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[410, 577, 581,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

        -2.4072, -7.2441], device='cuda:0')
para_sents_offset: [0, 5, 6, 11, 14, 18, 19, 23, 27, 29, 35]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64), 4: tensor([], device='cuda:0', dtype=torch.int64), 5: tensor([], device='cuda:0', dtype=torch.int64), 6: tensor([], device='cuda:0', dtype=torch.int64), 7: tensor([], device='cuda:0', dtype=torch.int64), 8: tensor([], device='cuda:0', dtype=torch.int64), 9: tensor([], device='cuda:0', dtype=torch.int64)}
sp_para_pred: tensor([1, 5], device='cuda:0')
pre_answer:	 furutani warren t furutani born october 16 	gold_answer:	 kelli ward 	sp_sent_pred:	 [] 	sp_sent_gold:	 [16, 23] 	sp_para_pred:	 tensor([1, 5], device='cuda:0') 	sp_para_gold:	 [4, 7]
validation_step
qid:  5ac2a912554299218029dae8
input_ids:  tensor([[    0, 50268,  5488,  ..., 10932,  2362,     2]], de

validation_step
qid:  5a7272eb5542997f827839d7
input_ids:  tensor([[    0, 50268,  9178,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  13,  107,  197,  378,  541,  644,  784,  930, 1152, 1294],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.5686,  0.6675, -0.2203,  ...,  1.3481, -0.3962, -0.7141],
         [ 0.4498,  0.3408, -0.3095,  ...,  1.1768, -0.3462, -0.8195],
         [ 0.4085,  0.1785, -0.1647,  ...,  1.3466, -0.4799, -0.9918],
         ...,
         [ 0.3657,  0.4685, -0.2874,  ...,  1.3084, -0.3776, -0.9286],
         [ 0.4436,  0.3184, -0.1099,  ...,  1.3746, -0.4001, -0.9294],
         [ 0.1744,  0.3723, -0.3220,  ...,  1.5388, -0.3898, -0.7297]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.6875],
         [-0.6406],
         [-0.6733],
         [-0.7080],
         [-0.7119],
         [-0.6919],
         [-0.6958],
         [-0.6797],
         [-0.6606],
         [-0.7075]]], device='cuda:0', dtype=torch.float16)
start_logits.si

sp_para_output:  tensor([[[ 0.4178,  0.4910, -0.2273,  ...,  1.1485, -0.5570, -0.4680],
         [ 0.2439,  0.5339, -0.2946,  ...,  0.6664, -0.3291, -0.7262],
         [ 0.4513,  0.3348, -0.2309,  ...,  1.1037, -0.5139, -0.5667],
         ...,
         [ 0.4055,  0.3535, -0.3326,  ...,  1.1876, -0.5332, -0.6289],
         [ 0.3794,  0.4573, -0.2682,  ...,  1.1269, -0.5086, -0.6806],
         [ 0.5301,  0.2873, -0.1698,  ...,  1.0393, -0.4388, -0.4189]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.6992],
         [-0.7188],
         [-0.7100],
         [-0.3049],
         [-0.6826],
         [-0.7085],
         [-0.7661],
         [-0.7114],
         [-0.7432],
         [-0.7280]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1514])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1514])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 624,  917, 1128,    0,    0,    0,    0,    0,    0,    0,

sp_para_output:  tensor([[[ 0.2319,  0.6644, -0.3098,  ...,  0.9451, -0.2840, -0.4201],
         [ 0.2975,  0.5804, -0.2485,  ...,  1.0553, -0.3478, -0.5465],
         [ 0.3578,  0.5206, -0.2358,  ...,  0.7941, -0.2404, -0.5028],
         ...,
         [ 0.3861,  0.6805, -0.3249,  ...,  1.0279, -0.3643, -0.5799],
         [ 0.2928,  0.4926, -0.3083,  ...,  0.8579, -0.2979, -0.5763],
         [ 0.3181,  0.5606, -0.2308,  ...,  1.0726, -0.2300, -0.4614]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.7207],
         [-0.7993],
         [-0.7651],
         [-0.7222],
         [-0.7158],
         [-0.7192],
         [-0.7280],
         [-0.7563],
         [-0.7661],
         [-0.7178]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1806])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1806])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[606,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

       device='cuda:0', dtype=torch.float16)
masked_target:  tensor([[264, 295, 300,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
logits:  tensor([[ 0.1577, -0.2017,  0.1951,  ..., -0.3120, -0.3152, -0.3105]],
       device='cuda:0', dtype=torch.float16)
decode
p_type:  0
answers:  [{'text': ' couple had two children robert and julie  new faces of 1952 new faces', 'score': tensor([0.7070], device='cuda:0', dtype=torch.float16), 'start_logit': tensor(0.9565, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.7119, device='cuda:0', dtype=torch.float16), 'p_type_score': tensor([0.4539], device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 

decode
p_type:  0
answers:  [{'text': ' novelist tereska torres  leonid levin leonid anatoliev', 'score': tensor([0.6631], device='cuda:0', dtype=torch.float16), 'start_logit': tensor(1.2705, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3542, device='cuda:0', dtype=torch.float16), 'p_type_score': tensor([0.3647], device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 7, 7, 7, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9]
para_sent_logits: tensor([-1.2500, -1.2656], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-2.5156], device='cuda:0')
para_sents_offset: [0, 2]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-1.2178], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-2.5156, -1.2178], device='cuda:0')
para_sents_offset: [0, 2, 3]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([

logits:  tensor([[ 0.3740,  0.6353,  0.4004,  0.0425,  0.4319,  0.7310,  0.1646,  0.1179,
          0.7188,  0.3213,  0.3354,  0.1237,  0.5508,  0.3389,  0.3867,  0.2512,
          0.3901,  0.2805,  0.2421,  0.5234,  0.2163,  0.7002,  0.5181,  0.4729,
          0.2148,  0.6499,  0.1059,  0.2098,  0.4346,  0.2435,  0.0673,  0.4075,
          0.2247,  0.2910,  0.4771,  0.4966,  0.4067,  0.7378,  0.2102,  0.6826,
          0.2289,  0.2007, -0.2123,  0.3162,  0.5044,  0.1903,  0.2416,  0.1920,
         -0.0349,  0.1841,  0.5454,  0.1591,  0.2372,  0.1573,  0.4292,  0.2413,
          0.2460,  0.5317,  0.6992,  0.2465,  0.0536,  0.4712,  0.2190, -0.0497,
          0.4482,  0.5049,  0.2769,  0.6558,  0.3716,  0.3240,  0.4502,  0.3831,
          0.6445,  0.3213,  0.2659,  0.4541,  0.4468,  0.3799,  0.1376,  0.1572,
          0.4553,  0.0479,  0.5752,  0.5029,  0.6313,  0.5435,  0.1841,  0.4475,
          0.6123,  0.1506,  0.4651,  0.2472,  0.6353,  0.1466,  0.6221,  0.6426,
          0.5342,  

sp_para_output:  tensor([[[ 0.3727,  0.5964, -0.1304,  ...,  0.7512, -0.4302, -0.4576],
         [ 0.4054,  0.4007, -0.2664,  ...,  0.7677, -0.3797, -0.5284],
         [ 0.2700,  0.5099, -0.2413,  ...,  0.7908, -0.3881, -0.4951],
         ...,
         [ 0.3194,  0.4914, -0.2567,  ...,  0.7433, -0.3354, -0.5403],
         [ 0.3321,  0.6531, -0.2770,  ...,  1.0002, -0.3902, -0.4801],
         [ 0.2938,  0.4233, -0.2529,  ...,  0.8946, -0.4648, -0.4477]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.6885],
         [-0.6738],
         [-0.6860],
         [-0.6489],
         [-0.6406],
         [-0.6831],
         [-0.6484],
         [-0.7070],
         [-0.7505],
         [-0.6299]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1667])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1667])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 120,  151,  180,  479,  538,  577,  892,  922, 1002, 1464,


Epoch 00003: avg_val_f1 reached 0.01258 (best 0.01258), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_3.ckpt as top 5


validation_epoch_end
before sync --> sizes:  29, 29, 29
after sync --> sizes: 29, 29, 29
avg_loss:  tensor(18.0548, device='cuda:0')	avg_answer_loss:  tensor(6.3534, device='cuda:0')	avg_type_loss:  tensor(0.5834, device='cuda:0')	avg_val_f1:  0.01258473355194618	avg_val_em:  0.0	avg_val_prec:  0.0072915555331213725	avg_val_recall:  0.08045977045749796
avg_val_sp_sent_f1:  tensor(0., device='cuda:0')	avg_val_sp_sent_em:  tensor(0., device='cuda:0')	avg_val_sp_sent_prec:  tensor(0., device='cuda:0')	avg_val_sp_sent_recall:  tensor(0., device='cuda:0')
avg_val_joint_f1:  tensor(0., device='cuda:0')	avg_val_joint_em:  tensor(0., device='cuda:0')	avg_val_joint_prec:  tensor(0., device='cuda:0')	avg_val_joint_recall:  tensor(0., device='cuda:0')
input_ids:  tensor([[    0, 50268, 24761,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 19, 117, 148, 310, 424, 497, 690, 746, 834, 880], device='cuda:0')
sp_para_output:  tensor([[[ 0.8668,  0.4036,  0.0775,  ...,  0.4180, -

masked_target:  tensor([[448,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
logits:  tensor([[ 0.3977,  0.8560,  0.6768,  0.6958,  0.7690,  0.5571,  0.6104,  0.5293,
          0.2639,  0.4448,  0.3660,  0.9365,  0.6436,  0.8491,  0.5776,  0.7437,
          0.3711, -0.0065,  0.5435,  0.5098,  0.2661,  0.4685,  0.8687,  0.6499,
          0.8794, -0.2896,  0.2313,  0.4819,  0.3376,  0.4280,  0.8579,  0.2776,
          0.5151,  0.4031,  0.2034,  0.2479,  1.0918,  0.3240,  0.0470,  0.6328,
          0.3689,  0.4441,  0.1859,  0.6128,  0.2075,  0.2925,  0.6162,  0.7017,
          0.4636,  0.2167,  1.0244,  0.1132,  0.1752,  0.7227,  0.8169,  0.5869,
          0.2856,  0.623

para_indexes:  tensor([ 20, 125, 185, 346, 416, 577, 675, 764, 808, 950], device='cuda:0')
sp_para_output:  tensor([[[ 0.5973,  0.5803,  0.0931,  ...,  1.2123,  0.0887, -0.7880],
         [ 0.4743,  0.8855, -0.2872,  ...,  0.5878, -0.1267, -0.6754],
         [ 0.5847,  0.9965, -0.2667,  ...,  0.7469, -0.4742, -0.6677],
         ...,
         [ 0.7439,  1.1653, -0.1130,  ...,  0.9319, -0.3643, -0.8967],
         [ 0.4002,  0.8960,  0.1750,  ...,  0.0425, -0.0911, -0.6466],
         [ 0.8237,  0.7467, -0.0693,  ...,  1.1266, -0.5296, -0.1209]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.7466],
         [-0.5845],
         [-0.6606],
         [-0.6870],
         [-0.6562],
         [-0.6475],
         [-0.6606],
         [-0.7383],
         [-0.8828],
         [-0.6772]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 1098])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  to

sp_para_output_t:  tensor([[[-0.6069],
         [-0.6279],
         [-0.6924],
         [-0.6880],
         [-0.6519],
         [-0.3950],
         [-0.7144],
         [-0.7441],
         [-0.6382],
         [-0.8394]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 839])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 839])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[431, 642, 795,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
logits:  tensor([[ 0.2386,  0.5977,  0.3147,  0.4800,  0.3054,  0.5703,  0.2416,  0.7158,
          0.2778,  0.1979,  0.6821, -0.0948,  0.

input_ids:  tensor([[    0, 50268,  8155,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  39,  184,  322,  407,  550,  683,  792,  877, 1037, 1088],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.5720,  0.2148, -0.2837,  ...,  0.6935, -0.4754, -0.5592],
         [ 0.3053,  0.2792, -0.2706,  ...,  0.8218, -0.3419, -0.3980],
         [ 0.5406,  0.7374, -0.4320,  ...,  0.0237, -0.1088, -0.9360],
         ...,
         [ 0.2894,  0.5417, -0.1732,  ...,  1.2853, -0.3721, -0.6993],
         [ 0.5116,  0.2902, -0.4975,  ...,  0.3130, -0.5642, -0.3683],
         [ 0.9483,  0.5797, -0.1769,  ...,  1.1132, -0.7352, -0.2395]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.7798],
         [-0.7617],
         [-0.7954],
         [-0.8555],
         [-0.7622],
         [-0.6753],
         [-0.7725],
         [-0.6938],
         [-0.7129],
         [-0.7476]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

logits:  tensor([[ 5.2582e-02, -7.3633e-01, -1.1456e-01, -1.1895e+00, -1.9958e-01,
         -5.4980e-01, -8.7433e-03, -8.9307e-01, -4.4873e-01, -1.0297e-01,
         -3.8647e-01, -7.5073e-02, -5.6299e-01, -4.0314e-02, -3.0396e-01,
          1.4429e-01,  6.6650e-02, -8.0615e-01, -3.4253e-01, -5.6885e-01,
          2.3163e-02, -7.6318e-01, -5.4736e-01, -1.4771e-01, -1.9861e-01,
         -1.2781e-01, -4.6216e-01, -1.9910e-01, -3.6224e-02, -1.9629e-01,
         -3.3643e-01, -3.7573e-01, -2.5244e-01,  1.6174e-01, -8.3679e-02,
         -1.8835e-01, -2.4365e-01,  3.7659e-02,  1.2915e-01,  2.9761e-01,
          1.1353e-01,  2.8027e-01, -1.3220e-01,  1.9104e-01, -2.0715e-01,
         -1.5125e-01, -5.6836e-01, -5.1904e-01, -5.3906e-01, -5.3467e-01,
          3.0365e-02, -5.1727e-03, -2.3950e-01, -2.3584e-01, -2.9785e-01,
          2.3840e-01, -1.5161e-01, -3.4961e-01,  1.7627e-01, -2.9688e-01,
         -3.8037e-01,  8.7585e-02, -1.9189e-01, -6.7291e-03, -2.5131e-02,
         -3.9014e-01, -6.3330

sp_para_output:  tensor([[[ 0.4683,  0.6780, -0.2743,  ...,  0.6039, -0.6157, -0.5707],
         [ 0.2792,  0.6361, -0.3274,  ...,  0.5623, -0.4485, -0.1423],
         [ 0.4670,  0.4487, -0.3240,  ...,  0.2947, -0.3075, -0.5417],
         ...,
         [ 0.6198,  0.5354,  0.0368,  ...,  0.7927, -0.2071, -0.8642],
         [ 0.3391,  0.6659, -0.0021,  ...,  0.5149, -0.6712, -0.7742],
         [ 0.7278,  0.5414, -0.2277,  ...,  0.9817, -0.1312, -0.5510]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.8223],
         [-0.7920],
         [-0.7070],
         [-0.8276],
         [-0.8247],
         [-0.7593],
         [-0.7734],
         [-0.6968],
         [-0.6152],
         [-0.7949]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 1206])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1206])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[110

input_ids:  tensor([[    0, 50268, 12196,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  22,  123,  225,  306,  392,  519,  575,  803,  922, 1002],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.6374,  1.0112, -0.1392,  ...,  0.9208, -0.1954, -0.4513],
         [ 0.6007,  0.5003, -0.6685,  ...,  1.2740, -0.0398, -0.2223],
         [ 0.4733,  0.7633, -0.4875,  ...,  0.9137, -0.1256, -0.4278],
         ...,
         [ 0.3291,  0.6956, -0.4392,  ...,  1.3371, -0.4518, -0.4627],
         [ 0.3866,  0.8271, -0.2548,  ..., -0.0359, -0.0549, -0.7941],
         [ 0.3925,  0.6031, -0.5291,  ...,  1.1325, -0.3399, -0.7778]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.8418],
         [-0.8599],
         [-0.8340],
         [-0.8652],
         [-0.8198],
         [-0.7866],
         [-0.7837],
         [-0.8843],
         [-0.8120],
         [-0.7607]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268,  6588,   102, 11356,   102, 47510,  3250,  2214,  9408,
             8,  1421,  2642, 29219,   346,  1264,   881,    92,  1492,    16,
          2214,    30, 47510,  3250,   385,  4324, 11356,   102,    31,    69,
         14887, 31386,  2453,  4535,  2642,   703,    11,    99,    76, 50269,
             2, 50265,    94,  3836,   385,  4324, 11356,   102,  2214, 50266,
            94,  3836,    16,  2214,    30, 47510,  3250,   385,  4324, 11356,
           102,    31,    69, 14887, 31386,  2453,  4535,  2642,   193, 50267,
         11356,   102,   875,  2214,    19,  1149,  2457,   449,  3979,   449,
          3979,  2262,   118,  1350,     8, 15079,   857,   910,  9422,    19,
           449,  3979,  2262,   118,  1350,  5516,  3686,   931, 50267,    24,
            21,   703,    15,   361, 10668,   428, 48540,   336,    25, 11794,
           200,   881, 50267,  1349,  2092,    15,  2424, 39210,  5403,     9,
           385,  4324, 11356,   102, 502

logits:  tensor([[ 0.3967,  0.5850,  0.4590,  0.4478,  0.5972,  0.0347,  1.2207,  0.5239,
          1.2109,  0.5723,  0.4346,  0.7471,  0.3191,  0.4575,  0.4800,  0.1697,
          0.4089,  0.4785,  0.5928,  0.4177,  0.6934, -0.3289,  1.0771,  0.6226,
          0.1055,  0.2783,  0.5776,  0.0717, -0.5815,  0.7051,  0.3787,  0.2374,
          0.2742,  0.4170,  0.1059, -0.2101,  0.3579,  0.4077,  0.4888,  0.4722,
          0.5498,  0.5532,  0.9507,  0.7583,  0.0445,  0.2727,  0.5801,  0.3767,
          0.5308,  0.7212,  0.9814,  0.6401,  0.5645,  0.6714, -0.2529,  1.1738,
          0.4282,  0.3481,  0.5054,  0.6240, -0.0284,  0.2443,  0.8340,  0.5366,
          0.3367,  0.4395,  0.4470,  0.0818,  0.1890,  0.5439,  0.7222,  0.3044,
          0.4521,  0.5127,  0.6255,  0.4314,  0.4041,  0.6724,  0.6060,  0.6338,
          0.3271,  0.6270,  0.5845,  0.7861, -0.0912,  0.5884,  0.3313,  0.1512,
          0.4104,  0.6172,  0.5386,  0.2064,  0.3215,  0.5825,  0.9712,  0.4346,
          0.5957,  

sp_para_output:  tensor([[[ 0.7176,  0.3034, -0.2930,  ...,  1.2712, -0.3028, -0.4848],
         [ 0.4282,  0.4763, -0.9835,  ...,  0.4980, -0.2321, -0.6501],
         [ 0.5920,  0.3584, -0.3675,  ...,  1.1041, -0.4902, -0.5029],
         ...,
         [ 0.8940,  0.3988, -0.3503,  ...,  1.3068, -0.2132, -0.5215],
         [ 0.4435,  0.0814, -0.5834,  ...,  1.1653, -0.0872, -0.8930],
         [ 0.3438,  0.2430, -0.7031,  ...,  0.6112, -0.2446, -0.4555]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.8662],
         [-0.8623],
         [-0.8921],
         [-0.9087],
         [-0.8896],
         [-0.9326],
         [-0.8843],
         [-0.9082],
         [-0.9785],
         [-0.9028]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 1124])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1124])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[268

sp_para_output:  tensor([[[ 4.4593e-01,  5.6962e-01, -1.4121e-03,  ...,  1.2954e+00,
           3.1403e-03, -5.1664e-01],
         [ 7.8830e-01,  1.0782e+00, -2.1468e-01,  ...,  1.1921e+00,
          -4.7879e-01, -8.3177e-01],
         [ 6.6661e-01,  4.8524e-01, -5.6537e-01,  ...,  1.2405e+00,
          -2.5445e-01, -8.0172e-01],
         ...,
         [ 8.1024e-01,  8.4882e-01, -1.5116e-01,  ...,  1.5401e+00,
          -5.1191e-01, -7.6578e-01],
         [ 5.3575e-01,  7.0536e-01, -4.9177e-01,  ...,  6.6863e-01,
          -3.6767e-01, -5.3552e-01],
         [ 6.2523e-01,  7.7610e-01, -2.6876e-01,  ...,  1.1702e+00,
          -2.1142e-01, -8.1581e-01]]], device='cuda:0',
       grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.8203],
         [-0.7319],
         [-0.9033],
         [-0.7944],
         [-0.7920],
         [-0.8340],
         [-0.7461],
         [-0.7759],
         [-0.7935],
         [-0.8467]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>

input_ids:  tensor([[    0, 50268,  5488, 38187,   260,  8676,   222,   218,   895,  1780,
          4209, 50269,     2, 50265,   475,  4348,   385,  8974,  5691, 50266,
           475,  4348,   385,  8974,  5691,  2421, 31025,    16, 38187,   260,
          8676,     8,  1632,  5799,  8298,    31, 16080,   895, 50267, 26232,
            37,  1665,    25,   919,     9, 16080,   895,   194, 22437,  4561,
           982,   974,   212, 22437,  1418,    11,   425, 50267,   385,  8974,
          5691,  1665,    25,  5688,   884,    11, 16080,   895, 22437, 50267,
          2052,     7,   145,  2736,     7, 16080,   895, 22437,   385,  8974,
          5691,  1665,    11,   194,   790,    31, 15077,     7,  8548, 50267,
            37,  3562,  2052,     7,  2266,  1727,     8,    21,  4209,    30,
         44009,   181,  1368,  4291,  1344, 50267,  1437,     2, 50265,   475,
          8616,  2463,   218,   895,  1780, 50266,   475,  8616,  2463,   218,
           895,  1780,   842,  3320, 225

input_ids:  tensor([[    0, 50268,  5488,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  24,  145,  383,  527,  607,  680,  846,  944,  999, 1072],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.9531,  0.4409, -0.7674,  ...,  0.9247, -0.3053, -0.2385],
         [ 0.8258,  0.3734, -0.6512,  ...,  0.9292, -0.3742, -0.5558],
         [ 0.4626,  0.5609, -0.0913,  ...,  1.0076, -0.4040, -0.4269],
         ...,
         [ 0.3501,  0.8040, -0.5627,  ...,  0.9767, -0.5737, -0.6342],
         [ 0.4328,  0.0335, -0.8110,  ...,  1.0669, -0.6372, -0.5710],
         [ 0.8852,  0.4233, -0.4970,  ...,  1.1165, -0.6319, -0.8075]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.7627],
         [-0.8418],
         [-0.8276],
         [-0.8188],
         [-0.7324],
         [-0.8447],
         [-0.8135],
         [-0.8594],
         [-0.8833],
         [-0.8701]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268,  4651,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  31,   92,  193,  291,  328,  448,  602,  660,  983, 1154],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.1002,  0.7477, -0.1411,  ...,  1.4090, -0.3977, -0.7174],
         [ 0.4377,  0.3707, -0.5450,  ...,  1.3408, -0.4776, -0.5154],
         [ 1.0272,  0.7549, -0.4304,  ...,  1.1092, -0.2670, -0.6286],
         ...,
         [ 0.8368,  0.6723, -0.5011,  ...,  1.2210, -0.5527, -0.0644],
         [ 0.6168,  0.7856, -0.4505,  ...,  0.6597, -0.6131, -0.5952],
         [ 0.9832,  0.3586, -0.3159,  ...,  1.5023, -0.6312, -0.0796]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.9463],
         [-0.8560],
         [-0.8950],
         [-0.9590],
         [-0.8091],
         [-0.9185],
         [-0.8267],
         [-0.8477],
         [-0.8442],
         [-0.8550]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268, 10569,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  17,  164,  219,  457,  591,  882, 1138, 1299, 1438, 1534],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.7687,  0.5978,  0.0227,  ...,  0.5784, -0.2771, -0.1951],
         [ 0.8881,  0.5987, -0.6254,  ...,  1.0545, -0.5909, -0.2885],
         [ 0.8206,  0.5145, -0.4919,  ...,  1.4998, -0.5562, -0.7723],
         ...,
         [-0.0038,  0.4300, -0.4587,  ...,  0.9940, -0.6130, -0.5578],
         [ 0.7884,  0.9218, -0.5401,  ...,  0.9868, -0.4852, -0.8799],
         [ 0.6945,  0.6668, -0.6769,  ...,  0.3491, -0.5708, -0.7516]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.8213],
         [-0.8760],
         [-0.9194],
         [-0.9487],
         [-0.8550],
         [-0.9805],
         [-0.8486],
         [-0.9224],
         [-0.9932],
         [-0.8530]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

validation_step
qid:  5a879ab05542996e4f30887e
input_ids:  tensor([[    0, 50268, 24761,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 19, 117, 148, 310, 424, 497, 690, 746, 834, 880], device='cuda:0')
sp_para_output:  tensor([[[ 0.6442,  0.5969, -0.3157,  ...,  1.0204, -0.2452, -0.6970],
         [ 0.6856,  0.5401, -0.4355,  ...,  1.0030, -0.2701, -0.7751],
         [ 0.5301,  0.7166, -0.4430,  ...,  1.2505, -0.3311, -0.8036],
         ...,
         [ 0.6214,  0.6408, -0.4712,  ...,  1.0289, -0.0712, -0.7314],
         [ 0.4200,  0.7220, -0.3064,  ...,  1.1457, -0.1562, -0.8621],
         [ 0.6677,  0.5821, -0.4104,  ...,  1.2247, -0.2187, -0.8870]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.9766],
         [-0.9194],
         [-0.9648],
         [-0.9863],
         [-0.9912],
         [-0.9326],
         [-0.9658],
         [-0.9551],
         [-1.0088],
         [-0.9639]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size

sp_para_output:  tensor([[[ 0.9527,  0.5645, -0.4463,  ...,  1.4167, -0.6054, -0.7151],
         [ 0.8265,  0.6479, -0.4420,  ...,  1.5042, -0.5955, -0.7040],
         [ 0.9269,  0.5815, -0.4488,  ...,  1.5606, -0.5210, -0.6339],
         ...,
         [ 0.9040,  0.4896, -0.4509,  ...,  1.5633, -0.5425, -0.6678],
         [ 0.9481,  0.4257, -0.3395,  ...,  1.4044, -0.5308, -0.6968],
         [ 0.9058,  0.3613, -0.4568,  ...,  1.3735, -0.4603, -0.5328]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.0020],
         [-1.0234],
         [-1.0625],
         [-1.0078],
         [-1.0078],
         [-1.0088],
         [-1.0342],
         [-1.0322],
         [-1.0088],
         [-1.0098]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1810])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1810])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[707,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

input_ids:  tensor([[    0, 50268, 12196, 26241,    21,  1236, 12336, 29227,  1506,  7259,
           268,  1141, 50269,     2, 50265, 13200,  4306,    50,    42, 25763,
          1848,   232, 50266, 13200,  4306,    50,    42, 25763,  1848,   232,
            16,  9031,   493,  1182, 15919, 25032,  5808,  1982,    30, 29227,
          1506,  7259,   254,    11, 35348,  2517,  3225,   223, 28899,     9,
          5808,  1982,    30,    39,  1141,  1236,  4438, 50267,  1040,   439,
         43179,   454,  8548,  3620,   107,    71,    24,    21,  1982,     8,
           316,   107,    71,  7259,   254,    17,    27,    29,   744, 50267,
            24,    16,  3924,  1687,     7,    28,     9,   773,    55,    25,
           892,     9,  7259,   254,    17,    27,    29, 11419,   434,    87,
            25, 11314,  2125,     9, 11845, 50267,  1437,     2, 50265, 48841,
         35678,   261,    30,  6414,   504,  5243, 50266, 48841, 35678,   261,
            30,  6414,     9,   504,  52

answers:  [{'text': ' his wife june miller psychoanalyst otto rank her father', 'score': tensor([1.1660], device='cuda:0', dtype=torch.float16), 'start_logit': tensor(1.2188, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.7964, device='cuda:0', dtype=torch.float16), 'p_type_score': tensor([1.4854], device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 5, 5, 6, 6, 6, 6, 7, 8, 9]
para_sent_logits: tensor([-1.4727, -1.4873, -1.4502], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-4.4102], device='cuda:0')
para_sents_offset: [0, 3]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-1.4678, -1.4658, -1.4561], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-4.4102, -4.3896], device='cuda:0')
para_sents_offset: [0, 3, 6]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64)}
para

sp_para_output:  tensor([[[ 0.9730,  0.6497, -0.3562,  ...,  1.2736, -0.5275, -0.8309],
         [ 0.6696,  0.9060, -0.2797,  ...,  1.3524, -0.5580, -0.8668],
         [ 0.8705,  0.7241, -0.3259,  ...,  1.3034, -0.5196, -0.7806],
         ...,
         [ 0.9013,  0.6514, -0.3879,  ...,  1.2523, -0.5494, -0.8708],
         [ 0.8024,  0.6377, -0.2907,  ...,  1.3191, -0.4772, -0.8594],
         [ 0.8935,  0.6359, -0.3923,  ...,  1.2791, -0.5539, -0.8267]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.0195],
         [-1.0674],
         [-1.0068],
         [-0.9863],
         [-1.0068],
         [-1.0508],
         [-1.0273],
         [-1.0088],
         [-0.9961],
         [-1.0078]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1681])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1681])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[955, 961,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

input_ids:  tensor([[    0, 50268,  5488, 44878,     9, 38727,    11, 14490,  3821,  8377,
           247,  6308,   129,    65,  4707, 50269,     2, 50265,  9473,   493,
         50266,  9473,   493,  4142, 16441,     9,  9473,   493,   741,   298,
         18195,  6528, 26848, 48312,  6382,   271, 18195,   267,  2636,    16,
           247,    11,  2077,    25,   493, 50267,    24,    16,  3821,  8377,
           247,    30,   443,   200,  7877, 24347,   247,    19,    81,   316,
           325,    82,     8,   144, 24347,  4593,    11,   232, 50267,    24,
            16, 43490,    30,  9473,   811,  6444,    15,  2077,  4709,   873,
           811,  3342,    15, 10103,     8, 11751,     9,   741,  3314,   337,
            15, 10171, 50267,    24,   327,  1212,  7562,    19,   181,   677,
          7566,     7,  3072,  1855,  1243,  3087, 18239,     8,   741,   298,
         22165,     7,  9489,     8,   127, 43918, 12601,  1916,     8, 14225,
           462, 42589,     7,  3017, 502

answers:  [{'text': ' mark moth which is found from new jersey south to fl', 'score': tensor([1.0195], device='cuda:0', dtype=torch.float16), 'start_logit': tensor(1.0430, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4426, device='cuda:0', dtype=torch.float16), 'p_type_score': tensor([1.5732], device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9]
para_sent_logits: tensor([-1.4512, -1.4697, -1.5332, -1.4717, -1.4541, -1.4482], device='cuda:0',
       dtype=torch.float16)
para_sent_logits_sum: tensor([-8.8281], device='cuda:0')
para_sents_offset: [0, 6]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-1.4512, -1.4805], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-8.8281, -2.9316], device='cuda:0')
para_sents_offset: [0, 6, 8]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: ten

sp_para_output:  tensor([[[ 0.9267,  0.6116, -0.5222,  ...,  1.5764, -0.3548, -0.6346],
         [ 0.9022,  0.6563, -0.5382,  ...,  1.3534, -0.3052, -0.7043],
         [ 0.9375,  0.7867, -0.4544,  ...,  1.2435, -0.3068, -0.8124],
         ...,
         [ 0.9928,  0.4684, -0.4031,  ...,  1.3710, -0.2927, -0.7421],
         [ 0.9148,  0.6789, -0.4852,  ...,  1.5609, -0.2777, -0.8512],
         [ 0.7330,  0.5814, -0.5987,  ...,  1.4120, -0.3475, -0.6942]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.9775],
         [-0.9468],
         [-0.9717],
         [-0.9746],
         [-0.9761],
         [-0.9814],
         [-0.9902],
         [-0.9980],
         [-0.9351],
         [-0.9453]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1340])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1340])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[1173,    0,    0,    0,    0,    0,    0,    0,    0,    0,

validation_step
qid:  5a89372855429951533612e6
input_ids:  tensor([[    0, 50268, 12196,    16,  5933,     9,  1349,   147,  1014, 23321,
           118,   475, 15022, 11824, 30141,   316,  1946,    21, 10899, 50269,
             2, 50265, 14206,  5730, 29762,  9326, 50266, 14206,  5730, 29762,
          9326,    16,  4243,  4930,  1349,  2034,    11, 11824, 30141,    92,
          2077,   885,  4575, 28410,  7085,   493, 50267,    24,    16, 17949,
            15,  9910,    19,  6594,   781,  2523,     9, 14206,  5730, 29762,
             8,   885,   895,  6487,   257,     8,    16,   275,   684,    25,
           184,     9, 11824, 30141, 10775,  4243,  1015,   547,   349, 16874,
         24761,     8, 11824, 30141,   316,  1946,   515,   547,   349, 10668,
           428, 48540, 50267,  5356,  1558,  6301,   251,  1349,    16, 12684,
          2014,  9326,     8,    16,   285,   921,    19,  2340,  2078,  5165,
            77,   117,  4930,  1061,    32,   145,   422,     8,    89, 

logits:  tensor([[ 5.5225e-01,  4.6362e-01,  7.1875e-01,  5.0830e-01,  6.6016e-01,
          1.8079e-01,  3.5156e-02,  5.3125e-01,  3.2910e-01,  5.8105e-01,
          2.3596e-01,  4.0210e-01,  5.2002e-01,  6.0059e-01,  1.1945e-01,
          3.8086e-01, -5.5647e-04,  3.5571e-01,  1.5198e-01,  3.0469e-01,
          3.4204e-01,  4.1650e-01,  2.1777e-01,  4.5850e-01,  8.4814e-01,
          8.3374e-02,  3.7549e-01,  1.9373e-01,  3.7427e-01,  7.1533e-01,
          4.3604e-01,  3.8330e-02,  6.9482e-01,  1.0537e+00,  5.3271e-01,
         -1.3733e-01,  2.7661e-01,  7.2266e-01,  2.0605e-01,  8.8232e-01,
          1.1240e+00,  8.3008e-01,  6.0059e-01,  2.6172e-01,  1.0944e-01,
          8.2861e-01,  3.1519e-01,  3.0005e-01, -2.5757e-02,  4.3457e-01,
          5.1855e-01,  8.0566e-01,  7.7637e-01,  4.5483e-01,  5.3320e-01,
          5.1660e-01,  1.9507e-01,  3.7061e-01,  4.1577e-01,  5.4102e-01,
         -1.2006e-01,  5.7227e-01,  5.6445e-01,  2.7344e-01,  2.5220e-01,
          5.3192e-02,  2.6538

sp_para_output:  tensor([[[ 0.7186,  0.6520, -0.5320,  ...,  0.9276, -0.2998, -0.6264],
         [ 0.7311,  0.6454, -0.6183,  ...,  1.0124, -0.3384, -0.5694],
         [ 0.6065,  0.6660, -0.5523,  ...,  1.1539, -0.2963, -0.5647],
         ...,
         [ 0.6029,  0.6683, -0.5812,  ...,  1.0516, -0.1762, -0.5985],
         [ 0.6222,  0.8137, -0.5174,  ...,  1.2404, -0.2546, -0.6808],
         [ 0.6104,  0.7311, -0.5135,  ...,  1.1714, -0.3449, -0.6685]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.9702],
         [-0.9609],
         [-0.9551],
         [-0.9912],
         [-1.0029],
         [-0.9321],
         [-0.9575],
         [-0.9678],
         [-0.9473],
         [-0.9536]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1206])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1206])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[110,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.8117,  0.4401, -0.4899,  ...,  1.0012, -0.4114, -0.7353],
         [ 0.7258,  0.5413, -0.5902,  ...,  0.9662, -0.3745, -0.7774],
         [ 0.7411,  0.6162, -0.5730,  ...,  0.9584, -0.3766, -0.8704],
         ...,
         [ 0.6616,  0.4665, -0.5654,  ...,  1.0540, -0.4434, -0.8193],
         [ 0.7341,  0.4770, -0.4552,  ...,  0.9735, -0.2794, -0.8270],
         [ 0.8989,  0.3148, -0.5079,  ...,  0.9931, -0.3885, -0.6801]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.9824],
         [-1.0078],
         [-1.0361],
         [-0.9980],
         [-0.9766],
         [-0.9619],
         [-0.9980],
         [-1.0244],
         [-1.0068],
         [-1.0371]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1175])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1175])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[853,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

validation_step
qid:  5a7722d655429966f1a36c99
input_ids:  tensor([[    0, 50268,  8569,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  24,  264,  335,  472,  606,  730,  846,  933, 1037, 1156],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.8905,  0.8351, -0.3891,  ...,  1.3299, -0.4935, -0.6774],
         [ 0.7575,  0.7018, -0.4011,  ...,  1.1098, -0.5044, -0.6688],
         [ 0.7301,  0.7573, -0.4529,  ...,  1.1887, -0.3365, -0.6587],
         ...,
         [ 0.6624,  0.7307, -0.4133,  ...,  1.1957, -0.4349, -0.7624],
         [ 0.8072,  0.7634, -0.3129,  ...,  1.6164, -0.5057, -0.7441],
         [ 0.8032,  0.5391, -0.3724,  ...,  1.3079, -0.4796, -0.5965]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.0176],
         [-0.9941],
         [-1.0273],
         [-0.9541],
         [-0.9702],
         [-0.9771],
         [-1.0352],
         [-1.0254],
         [-1.0332],
         [-0.9834]]], device='cuda:0', dtype=torch.float16)
start_logits.si

sp_para_output:  tensor([[[ 0.7363,  0.5983, -0.5086,  ...,  1.1298, -0.4747, -0.7873],
         [ 0.6047,  0.5434, -0.6539,  ...,  1.0711, -0.4487, -0.8549],
         [ 0.5192,  0.4350, -0.6217,  ...,  1.0609, -0.4633, -0.7869],
         ...,
         [ 0.5440,  0.4304, -0.6319,  ...,  1.1820, -0.4968, -0.6942],
         [ 0.5482,  0.4776, -0.6396,  ...,  1.1269, -0.4420, -0.8130],
         [ 0.6403,  0.3637, -0.5993,  ...,  1.0802, -0.3851, -0.7787]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.0010],
         [-1.0225],
         [-1.0176],
         [-1.0088],
         [-0.9805],
         [-0.9951],
         [-0.9341],
         [-0.9814],
         [-1.0146],
         [-0.9961]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1494])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1494])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[254, 270, 276,   0,   0,   0,   0,   0,   0,   0,   0,   0,

validation_step
qid:  5ab381b155429969a97a816b
input_ids:  tensor([[    0, 50268, 12196,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  22,  123,  225,  306,  392,  519,  575,  803,  922, 1002],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.7021,  0.9012, -0.2967,  ...,  1.5519, -0.2940, -0.6959],
         [ 0.6812,  0.8559, -0.5131,  ...,  1.3182, -0.2870, -0.7939],
         [ 0.6419,  0.7645, -0.3911,  ...,  1.3053, -0.2737, -0.6958],
         ...,
         [ 0.5588,  0.8037, -0.4929,  ...,  1.4640, -0.2971, -0.7260],
         [ 0.7266,  0.9198, -0.3356,  ...,  1.4086, -0.2604, -0.7234],
         [ 0.6630,  0.8838, -0.4224,  ...,  1.3739, -0.2733, -0.8192]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.9214],
         [-0.9727],
         [-0.9355],
         [-0.9580],
         [-0.9404],
         [-0.9189],
         [-0.9434],
         [-0.9458],
         [-0.9492],
         [-0.9175]]], device='cuda:0', dtype=torch.float16)
start_logits.si

sp_para_output:  tensor([[[ 0.9869,  0.6912, -0.5593,  ...,  1.3039, -0.3875, -0.8218],
         [ 0.6979,  0.5749, -0.5574,  ...,  1.4523, -0.2910, -0.8208],
         [ 0.7082,  0.6329, -0.5283,  ...,  1.2312, -0.3473, -0.7386],
         ...,
         [ 0.7755,  0.7354, -0.6001,  ...,  1.2399, -0.3692, -0.7465],
         [ 1.0546,  0.4839, -0.4480,  ...,  1.2134, -0.3121, -0.8239],
         [ 0.7947,  0.4733, -0.5559,  ...,  1.4025, -0.2546, -0.8259]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.0195],
         [-1.0254],
         [-1.0244],
         [-0.9902],
         [-0.9922],
         [-1.0088],
         [-1.0254],
         [-0.9951],
         [-1.0068],
         [-0.9497]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1196])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1196])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[203, 581,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

validation_step
qid:  5ac2a5d455429921a00ab01b
input_ids:  tensor([[    0, 50268, 12196,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  23,  136,  330,  467,  616,  808,  859,  950, 1139, 1298],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.7560,  0.3644, -0.5502,  ...,  1.1662, -0.4499, -0.7063],
         [ 0.6494,  0.4544, -0.5233,  ...,  1.2054, -0.4175, -0.7273],
         [ 0.5531,  0.5123, -0.4792,  ...,  1.1786, -0.4215, -0.8430],
         ...,
         [ 0.6594,  0.3392, -0.5815,  ...,  1.0348, -0.4414, -0.8309],
         [ 0.6355,  0.3541, -0.5719,  ...,  1.1611, -0.4078, -0.6270],
         [ 0.6285,  0.4534, -0.5123,  ...,  0.9449, -0.4560, -0.7937]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.9458],
         [-0.9341],
         [-0.9390],
         [-0.9458],
         [-0.9150],
         [-0.9351],
         [-0.9570],
         [-0.9683],
         [-0.9502],
         [-0.9185]]], device='cuda:0', dtype=torch.float16)
start_logits.si

sp_para_output:  tensor([[[ 0.7973,  0.5415, -0.2809,  ...,  1.3630, -0.4081, -0.6873],
         [ 0.6699,  0.8200, -0.3870,  ...,  1.4633, -0.4582, -0.7856],
         [ 0.5739,  0.8117, -0.4162,  ...,  1.3165, -0.4650, -0.7222],
         ...,
         [ 0.7251,  0.5553, -0.3013,  ...,  1.2443, -0.3340, -0.8475],
         [ 0.5975,  0.5079, -0.3760,  ...,  1.3929, -0.2791, -0.6589],
         [ 0.7315,  0.6609, -0.3053,  ...,  1.2773, -0.3982, -0.8033]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.9683],
         [-0.9756],
         [-0.9814],
         [-0.9541],
         [-0.9590],
         [-0.9434],
         [-0.9517],
         [-0.9961],
         [-1.0059],
         [-0.9629]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1383])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1383])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[634, 969,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.8243,  0.7348, -0.4450,  ...,  1.1932, -0.2875, -0.7107],
         [ 0.8378,  0.5422, -0.4414,  ...,  1.1636, -0.3573, -0.6578],
         [ 0.8024,  0.7488, -0.4383,  ...,  1.2046, -0.2620, -0.6452],
         ...,
         [ 0.8715,  0.7070, -0.4121,  ...,  1.1108, -0.3585, -0.6131],
         [ 0.7749,  0.6725, -0.4658,  ...,  1.2019, -0.3255, -0.6546],
         [ 0.7453,  0.7144, -0.4234,  ...,  1.1440, -0.2402, -0.6561]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.8989],
         [-0.9326],
         [-0.9199],
         [-0.9478],
         [-0.9468],
         [-0.9028],
         [-0.9263],
         [-0.8872],
         [-0.8911],
         [-0.9111]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 917])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 917])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 68, 225, 248, 302, 386, 417, 495, 540, 647, 656, 693, 747, 7

sp_para_output:  tensor([[[ 0.4138,  0.5632, -0.4195,  ...,  1.1397, -0.4293, -0.6027],
         [ 0.5283,  0.4323, -0.5843,  ...,  0.9735, -0.3763, -0.5948],
         [ 0.3636,  0.4622, -0.5993,  ...,  1.0411, -0.3054, -0.5365],
         ...,
         [ 0.4959,  0.5928, -0.4153,  ...,  1.1580, -0.4072, -0.5663],
         [ 0.4306,  0.3995, -0.3817,  ...,  1.1730, -0.2521, -0.5742],
         [ 0.4632,  0.4106, -0.3395,  ...,  1.2548, -0.4068, -0.5633]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.0010],
         [-0.9648],
         [-0.9971],
         [-1.0000],
         [-1.0156],
         [-0.9971],
         [-0.9736],
         [-0.9912],
         [-0.9893],
         [-0.9941]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1124])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1124])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[268, 288, 337, 362, 387, 430, 434,   0,   0,   0,   0,   0,

para_indexes:  tensor([ 21, 178, 251, 276, 378, 441, 520, 601, 706, 801], device='cuda:0')
sp_para_output:  tensor([[[ 0.7962,  0.7847, -0.2137,  ...,  1.0880, -0.0241, -0.7491],
         [ 0.7412,  0.9269, -0.3390,  ...,  1.0693, -0.1400, -0.9341],
         [ 0.6689,  0.8452, -0.3432,  ...,  0.9528, -0.0431, -0.8355],
         ...,
         [ 0.6331,  1.0283, -0.3869,  ...,  1.1929, -0.1071, -0.7275],
         [ 0.7051,  0.7561, -0.3227,  ...,  1.1123, -0.0800, -0.7833],
         [ 0.6964,  0.7131, -0.2526,  ...,  1.1013,  0.0257, -0.8060]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.9258],
         [-0.9238],
         [-0.9243],
         [-0.9243],
         [-0.9238],
         [-0.9521],
         [-0.9414],
         [-0.9321],
         [-0.9116],
         [-0.9180]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 877])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 877])
end_positions.size():  torch.Size([1, 

answers:  [{'text': ' pine', 'score': tensor([1.0742], device='cuda:0', dtype=torch.float16), 'start_logit': tensor(1.1934, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4714, device='cuda:0', dtype=torch.float16), 'p_type_score': tensor([1.5576], device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 0, 0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9]
para_sent_logits: tensor([-1.4941, -1.4355, -1.4463, -1.4512, -1.4082, -1.4600], device='cuda:0',
       dtype=torch.float16)
para_sent_logits_sum: tensor([-8.6953], device='cuda:0')
para_sents_offset: [0, 6]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-1.4326, -1.4639], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-8.6953, -2.8965], device='cuda:0')
para_sents_offset: [0, 6, 8]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=to

sp_para_output:  tensor([[[ 0.7005,  0.7031, -0.3833,  ...,  0.8875, -0.2786, -0.8159],
         [ 0.6062,  0.6167, -0.4306,  ...,  1.0258, -0.2808, -0.8677],
         [ 0.6783,  0.6570, -0.4952,  ...,  1.0352, -0.1354, -0.8507],
         ...,
         [ 0.6456,  0.5423, -0.4143,  ...,  0.9681, -0.2303, -0.8937],
         [ 0.6821,  0.6454, -0.4906,  ...,  1.0526, -0.1362, -0.8574],
         [ 0.5307,  0.5142, -0.4619,  ...,  1.0393, -0.2407, -0.9043]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.0332],
         [-0.9692],
         [-1.0029],
         [-1.0273],
         [-1.0166],
         [-1.0479],
         [-1.0146],
         [-1.0137],
         [-1.0010],
         [-1.0137]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 835])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 835])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[410, 577, 581,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

para_sent_logits: tensor([-1.4561, -1.4766], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-7.2998, -1.4385, -7.2500, -4.3955, -5.8828, -1.4863, -5.8213, -5.7432,
        -2.9326], device='cuda:0')
para_sents_offset: [0, 5, 6, 11, 14, 18, 19, 23, 27, 29]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64), 4: tensor([], device='cuda:0', dtype=torch.int64), 5: tensor([], device='cuda:0', dtype=torch.int64), 6: tensor([], device='cuda:0', dtype=torch.int64), 7: tensor([], device='cuda:0', dtype=torch.int64), 8: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-1.4434, -1.4219, -1.4404, -1.4775, -1.4707, -1.4229], device='cuda:0',
       dtype=torch.float16)
para_sent_logits_sum: tensor([-7.2998, -1.4385, -7.2500, -4.3955, -5.8828, -1.4863, -5.8213, -5.7432,
        -2.932

        -4.4648, -5.9287], device='cuda:0')
para_sents_offset: [0, 4, 8, 13, 16, 18, 24, 28, 30, 33, 37]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64), 4: tensor([], device='cuda:0', dtype=torch.int64), 5: tensor([], device='cuda:0', dtype=torch.int64), 6: tensor([], device='cuda:0', dtype=torch.int64), 7: tensor([], device='cuda:0', dtype=torch.int64), 8: tensor([], device='cuda:0', dtype=torch.int64), 9: tensor([], device='cuda:0', dtype=torch.int64)}
sp_para_pred: tensor([4, 7], device='cuda:0')
pre_answer:	 loves husband kurt cobain died in their home 	gold_answer:	 wolfhounds 	sp_sent_pred:	 [] 	sp_sent_gold:	 [16, 33, 35] 	sp_para_pred:	 tensor([4, 7], device='cuda:0') 	sp_para_gold:	 [4, 9]
validation_step
qid:  5a7272eb5542997f827839d7
input_ids:  tensor([[    0, 50268,  9178,  ..., 10932,  2362,     2

validation_step
qid:  5ac2c3545542990b17b1548b
input_ids:  tensor([[    0, 50268,   611,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  22,  171,  429,  512,  592,  747,  906, 1208, 1349, 1429],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.6969,  0.5752, -0.4595,  ...,  1.1949, -0.5028, -0.7872],
         [ 0.6025,  0.4929, -0.3939,  ...,  1.1333, -0.4909, -0.9341],
         [ 0.7068,  0.4625, -0.4635,  ...,  1.2101, -0.4771, -0.7921],
         ...,
         [ 0.6404,  0.4019, -0.5462,  ...,  1.3173, -0.4867, -0.7499],
         [ 0.6528,  0.5575, -0.4576,  ...,  1.2571, -0.4489, -0.8514],
         [ 0.8257,  0.4472, -0.3435,  ...,  1.1110, -0.4146, -0.7514]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.9365],
         [-0.9844],
         [-0.9644],
         [-0.7310],
         [-0.9268],
         [-0.9463],
         [-0.9746],
         [-0.9482],
         [-0.9805],
         [-0.9614]]], device='cuda:0', dtype=torch.float16)
start_logits.si

sp_para_output:  tensor([[[ 0.7280,  0.8182, -0.5608,  ...,  1.2015, -0.3322, -0.7236],
         [ 0.6670,  0.7345, -0.4971,  ...,  1.2871, -0.3187, -0.7907],
         [ 0.7264,  0.7133, -0.4455,  ...,  1.0908, -0.3047, -0.8122],
         ...,
         [ 0.7383,  0.7971, -0.4926,  ...,  1.2122, -0.3759, -0.7946],
         [ 0.7237,  0.7433, -0.4741,  ...,  1.0656, -0.3423, -0.8179],
         [ 0.7106,  0.7320, -0.4140,  ...,  1.2469, -0.3117, -0.7887]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.9824],
         [-1.0264],
         [-0.9873],
         [-0.9814],
         [-0.9893],
         [-0.9658],
         [-0.9624],
         [-0.9785],
         [-1.0049],
         [-0.9717]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1806])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1806])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[606,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

masked_target:  tensor([[264, 295, 300,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
logits:  tensor([[ 0.2091, -0.0056,  0.1796,  ..., -0.3149, -0.4614, -0.4465]],
       device='cuda:0', dtype=torch.float16)
decode
p_type:  0
answers:  [{'text': ' picture in 1954 it helped jump start careers of several young', 'score': tensor([1.2070], device='cuda:0', dtype=torch.float16), 'start_logit': tensor(1.3662, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.7324, device='cuda:0', dtype=torch.float16), 'p_type_score': tensor([1.5225], device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7,

sp_para_output:  tensor([[[ 0.6581,  0.4442, -0.5420,  ...,  0.9429, -0.3840, -0.6627],
         [ 0.7052,  0.4980, -0.5255,  ...,  1.1244, -0.4135, -0.8351],
         [ 0.7427,  0.5339, -0.6638,  ...,  0.8305, -0.4366, -0.7947],
         ...,
         [ 0.8262,  0.5924, -0.5512,  ...,  1.1445, -0.4696, -0.8566],
         [ 0.7023,  0.5140, -0.5469,  ...,  1.2151, -0.4915, -0.9269],
         [ 0.7806,  0.5373, -0.5351,  ...,  1.1638, -0.3853, -0.7968]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.0010],
         [-0.9834],
         [-0.9751],
         [-0.9683],
         [-1.0176],
         [-1.0000],
         [-0.9795],
         [-0.9951],
         [-0.9702],
         [-0.9751]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 875])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 875])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[873,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

sp_para_output:  tensor([[[ 0.6774,  0.7082, -0.2728,  ...,  1.1895, -0.4189, -0.6375],
         [ 0.7321,  0.5457, -0.3731,  ...,  1.2701, -0.3796, -0.7115],
         [ 0.6193,  0.6292, -0.3777,  ...,  1.2678, -0.3728, -0.6912],
         ...,
         [ 0.6337,  0.5728, -0.3666,  ...,  1.2369, -0.3716, -0.7019],
         [ 0.5987,  0.7277, -0.3690,  ...,  1.3824, -0.3727, -0.7109],
         [ 0.6333,  0.5523, -0.4277,  ...,  1.2514, -0.4024, -0.7249]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.9351],
         [-0.9360],
         [-0.9443],
         [-0.9478],
         [-0.9360],
         [-0.9688],
         [-0.9170],
         [-0.9521],
         [-0.9844],
         [-0.9033]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1667])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1667])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 120,  151,  180,  479,  538,  577,  892,  922, 1002, 1464,


Epoch 00004: avg_val_f1 reached 0.01478 (best 0.01478), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_4.ckpt as top 5


validation_epoch_end
before sync --> sizes:  29, 29, 29
after sync --> sizes: 29, 29, 29
avg_loss:  tensor(15.8513, device='cuda:0')	avg_answer_loss:  tensor(6.2963, device='cuda:0')	avg_type_loss:  tensor(0.2755, device='cuda:0')	avg_val_f1:  0.01477832526996218	avg_val_em:  0.0	avg_val_prec:  0.008062880357791638	avg_val_recall:  0.10344827586206896
avg_val_sp_sent_f1:  tensor(0., device='cuda:0')	avg_val_sp_sent_em:  tensor(0., device='cuda:0')	avg_val_sp_sent_prec:  tensor(0., device='cuda:0')	avg_val_sp_sent_recall:  tensor(0., device='cuda:0')
avg_val_joint_f1:  tensor(0., device='cuda:0')	avg_val_joint_em:  tensor(0., device='cuda:0')	avg_val_joint_prec:  tensor(0., device='cuda:0')	avg_val_joint_recall:  tensor(0., device='cuda:0')
input_ids:  tensor([[    0, 50268, 24761,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 19, 117, 148, 310, 424, 497, 690, 746, 834, 880], device='cuda:0')
sp_para_output:  tensor([[[ 0.9317,  0.8034, -0.6595,  ...,  1.1401, -0

logits:  tensor([[ 7.8857e-01,  5.1172e-01,  1.0879e+00,  1.2246e+00,  5.1904e-01,
          6.8164e-01,  4.6021e-01,  6.2646e-01,  7.0898e-01,  6.1572e-01,
          5.6445e-01,  8.9453e-01,  7.5537e-01,  4.7754e-01,  5.1660e-01,
          8.0811e-01,  5.7031e-01, -3.1104e-01,  6.1182e-01,  6.0596e-01,
          6.5576e-01,  1.0029e+00,  7.1045e-01,  8.0420e-01,  5.2637e-01,
         -1.3931e-02,  7.0361e-01,  6.7773e-01,  4.8877e-01,  9.1162e-01,
          4.3506e-01,  5.5322e-01,  6.1279e-01,  2.7075e-01,  4.4873e-01,
          2.1338e-01,  1.4004e+00,  5.6592e-01,  3.5278e-01,  8.1592e-01,
          6.1182e-01,  5.9473e-01,  2.9297e-01,  5.9717e-01,  3.0493e-01,
          4.3652e-01,  6.4990e-01,  8.3838e-01,  1.0684e+00,  2.7612e-01,
          1.1465e+00,  4.0332e-01,  2.9565e-01,  9.6631e-01,  8.3203e-01,
          6.1133e-01,  3.9673e-01,  7.0508e-01,  7.4805e-01, -2.6221e-01,
          4.4678e-01,  2.8015e-02,  5.0098e-01,  4.5874e-01,  2.8516e-01,
          3.8116e-02,  4.9463

sp_para_output:  tensor([[[ 1.1405,  0.7082, -0.6965,  ...,  1.3408, -0.4239, -0.5302],
         [ 0.8524,  0.2693, -0.7126,  ...,  2.1192, -0.1936, -0.6374],
         [ 1.0521,  0.7265, -0.7552,  ...,  0.3684, -0.0552, -0.8249],
         ...,
         [ 0.9125,  0.4951, -0.4321,  ...,  1.7290, -0.2282, -0.6184],
         [ 0.7518,  0.9581, -0.6731,  ...,  1.2288, -0.2756, -0.6345],
         [ 0.8918,  1.0490, -0.4546,  ...,  1.5248, -0.2553, -0.9218]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.8403],
         [-0.9380],
         [-0.8716],
         [-0.9863],
         [-0.9375],
         [-0.9111],
         [-0.8994],
         [-0.9185],
         [-0.9541],
         [-0.8936]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 1098])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1098])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 63

logits:  tensor([[ 3.9014e-01,  1.6980e-01,  2.5854e-01,  5.1074e-01,  2.0593e-01,
          8.3984e-01,  2.4988e-01,  7.8516e-01,  4.8145e-01,  1.3281e-01,
          6.0938e-01, -6.0181e-02,  2.4255e-01,  7.4512e-01,  4.2236e-01,
          6.7676e-01,  4.8535e-01,  3.5034e-01,  4.0234e-01,  3.4155e-01,
          3.3545e-01,  9.1748e-01,  4.2847e-01,  6.1963e-01,  5.2051e-01,
         -1.2366e-01,  5.0195e-01,  4.4067e-01,  2.6807e-01,  1.2854e-01,
          4.8779e-01,  1.1908e-01,  7.3779e-01,  7.2266e-01,  1.7517e-01,
          5.4413e-02,  2.9468e-01,  4.4409e-01,  5.6543e-01,  1.4111e-01,
          1.4648e-01,  3.5181e-01,  4.3237e-01,  3.0664e-01,  8.2178e-01,
          2.7295e-01,  4.5630e-01, -2.7783e-01,  4.3921e-01,  1.3757e-01,
          1.0107e-01,  1.7798e-01,  1.3220e-01,  1.0712e-01,  9.6680e-02,
          1.1884e-01,  2.7759e-01,  5.4932e-01,  3.0786e-01,  3.3301e-01,
          1.4868e-01,  6.1865e-01, -2.5439e-01,  7.9407e-02,  5.9052e-02,
          4.8096e-01, -1.4307

       device='cuda:0')
sp_para_output:  tensor([[[ 0.9908,  0.5031, -0.5895,  ...,  1.6971, -0.3717, -0.2995],
         [ 0.4379,  0.3523, -0.5542,  ...,  1.3218, -0.3779, -0.6012],
         [ 0.7734,  0.8879, -0.6098,  ...,  1.0927, -0.6075, -0.4209],
         ...,
         [ 0.9301,  0.7109, -0.6878,  ...,  1.6247, -0.3873, -0.7555],
         [ 0.7686,  0.4629, -0.4785,  ...,  1.3975, -0.5939, -0.2506],
         [ 0.9695,  0.2628, -0.2132,  ...,  1.8366, -0.2564, -0.3486]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.9844],
         [-1.0674],
         [-1.0303],
         [-1.0400],
         [-1.0449],
         [-1.0977],
         [-1.0752],
         [-0.9692],
         [-1.0273],
         [-0.9702]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 1338])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1338])
end_positions.size():  torch.Size([1, 64])
mask

logits:  tensor([[ 2.1313e-01, -5.9375e-01, -7.4658e-01, -7.1240e-01, -2.9492e-01,
         -4.2041e-01, -9.0332e-02, -5.6592e-01, -3.2910e-01, -1.1823e-01,
         -5.5615e-01, -1.2421e-01, -8.0225e-01, -1.8326e-02, -4.0552e-01,
          9.9365e-02,  1.8173e-02, -4.3213e-01, -2.3584e-01, -2.4707e-01,
          9.8999e-02, -5.8643e-01, -3.5034e-01,  2.7808e-01, -3.1372e-01,
         -4.6436e-01, -2.3178e-02, -2.6147e-01,  2.3901e-01, -3.8721e-01,
         -1.6150e-01, -4.4092e-01, -2.9395e-01,  3.7964e-01,  2.5317e-01,
         -3.5889e-01, -1.0675e-01,  1.9727e-01,  9.3079e-03,  4.4238e-01,
         -9.2041e-02,  1.2012e-01, -2.5391e-01,  2.4011e-01, -1.5503e-01,
          3.7170e-02, -2.9224e-01, -5.3564e-01, -4.1919e-01, -6.5381e-01,
          8.1665e-02, -2.9526e-02, -2.0056e-01,  5.9540e-02,  2.2839e-01,
          2.4219e-01, -8.3740e-01, -2.5244e-01,  2.4951e-01, -2.2412e-01,
         -7.7197e-01,  3.0396e-01, -1.8750e-01, -2.6318e-01,  5.1727e-03,
         -4.8877e-01, -6.6260

sp_para_output:  tensor([[[ 0.9273, -0.1310, -0.0545,  ...,  1.1597, -0.2000, -0.6879],
         [ 0.9341,  0.8207, -0.9525,  ...,  0.7668, -0.7076, -0.8463],
         [ 0.9964,  0.3566, -0.6484,  ...,  1.1279, -0.4234, -0.8501],
         ...,
         [ 0.3374,  0.6975, -0.4516,  ...,  1.0197, -0.3451, -0.7337],
         [ 0.8037,  0.6396, -0.5520,  ...,  1.4510, -0.3518, -0.7048],
         [ 0.9796,  0.5292, -0.4781,  ...,  1.2469, -0.4780, -0.6763]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-0.9912],
         [-0.9521],
         [-0.9961],
         [-1.0420],
         [-1.0430],
         [-0.9556],
         [-1.0410],
         [-0.8896],
         [-1.0791],
         [-0.9961]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 1206])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1206])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[110

input_ids:  tensor([[    0, 50268, 12196,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  22,  123,  225,  306,  392,  519,  575,  803,  922, 1002],
       device='cuda:0')
sp_para_output:  tensor([[[ 1.0053,  1.0057,  0.0682,  ...,  1.4544, -0.5880, -0.8349],
         [ 0.3096,  0.5963, -0.4057,  ...,  1.3239, -0.1295, -0.8441],
         [ 0.7289,  0.8691, -0.3493,  ...,  1.4191, -0.4411, -0.6500],
         ...,
         [ 0.8369,  0.5791, -0.7429,  ...,  0.2032, -0.3331, -0.9011],
         [ 0.8714,  0.4808, -0.3227,  ...,  1.2452, -0.2801, -0.1486],
         [ 0.6235,  0.5630, -0.4661,  ...,  1.0468, -0.2621, -0.0951]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-1.0244],
         [-1.0635],
         [-1.0957],
         [-1.0850],
         [-1.0508],
         [-1.0059],
         [-1.0918],
         [-1.0518],
         [-1.0859],
         [-1.0352]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268,  6588,   102, 11356,   102, 47510,  3250,  2214,  9408,
             8,  1421,  2642, 29219,   346,  1264,   881,    92,  1492,    16,
          2214,    30, 47510,  3250,   385,  4324, 11356,   102,    31,    69,
         14887, 31386,  2453,  4535,  2642,   703,    11,    99,    76, 50269,
             2, 50265,    94,  3836,   385,  4324, 11356,   102,  2214, 50266,
            94,  3836,    16,  2214,    30, 47510,  3250,   385,  4324, 11356,
           102,    31,    69, 14887, 31386,  2453,  4535,  2642,   193, 50267,
         11356,   102,   875,  2214,    19,  1149,  2457,   449,  3979,   449,
          3979,  2262,   118,  1350,     8, 15079,   857,   910,  9422,    19,
           449,  3979,  2262,   118,  1350,  5516,  3686,   931, 50267,    24,
            21,   703,    15,   361, 10668,   428, 48540,   336,    25, 11794,
           200,   881, 50267,  1349,  2092,    15,  2424, 39210,  5403,     9,
           385,  4324, 11356,   102, 502

logits:  tensor([[ 0.4067,  0.5586,  0.3894,  0.3687,  0.3918,  0.3752,  1.0459,  0.4443,
          1.0459,  0.3796,  0.4019,  0.7041,  0.5117,  0.3584,  0.7598,  0.4824,
          0.2644,  0.5015,  0.8013,  0.7153,  0.5308,  0.0257,  1.0215,  0.2004,
          0.2649,  0.1790,  0.5425,  0.2737, -0.3884,  0.7515,  0.5122,  0.2386,
          0.5288,  0.5156, -0.1187, -0.1161,  0.2776,  0.6831,  0.6919,  0.6611,
          0.9019,  0.3289,  0.9399,  1.0020,  0.2222,  0.4043,  0.5352,  0.4182,
          0.3162,  0.6445,  1.0898,  0.6606,  0.2979,  0.6821, -0.0779,  0.8125,
          0.4031,  0.1902,  0.3472,  0.5342,  0.1832,  0.1196,  0.7681,  0.8613,
          0.1630,  0.5874,  0.6572,  0.0205,  0.2810,  0.5054,  0.7324,  0.3806,
          0.4229,  0.7563,  0.6685,  0.6382,  0.5615,  0.8052,  0.3201,  0.9048,
          0.4451,  0.4956,  0.4399,  1.1016,  0.0975,  0.2900,  0.3186,  0.0398,
          0.4336,  0.6553,  0.3804,  0.5674,  0.4382,  0.6235,  0.9204,  0.2150,
          0.6299,  

sp_para_output:  tensor([[[ 0.4082,  0.3519, -0.6980,  ...,  1.3427, -0.8878, -0.8618],
         [ 0.8822,  0.6828, -0.6081,  ...,  0.2256, -0.1880, -0.1900],
         [ 0.5688,  0.1995, -0.6590,  ...,  1.1558, -0.3898, -0.6160],
         ...,
         [ 0.7290,  0.5009, -0.2952,  ...,  1.1710, -0.2689, -0.4070],
         [ 0.6849,  0.0448, -0.5779,  ...,  1.0064, -0.5883, -0.5889],
         [ 0.9114,  0.4144, -0.4056,  ...,  1.3653, -0.4874, -0.9423]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-1.1553],
         [-1.0498],
         [-1.1299],
         [-1.1484],
         [-1.2041],
         [-1.1650],
         [-1.0703],
         [-1.0889],
         [-1.0996],
         [-1.1553]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([1, 1124])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1124])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[268

          1131, 20400, 50267, 50270, 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 21, 178, 251, 276, 378, 441, 520, 601, 706, 801], device='cuda:0')
sp_para_output:  tensor([[[ 0.8331,  0.3261, -0.3191,  ...,  1.1745, -0.2866, -0.5620],
         [ 0.8340,  0.8458, -0.3952,  ...,  0.1707, -0.2110, -0.4318],
         [ 0.6671,  0.5831, -0.2690,  ...,  0.6925, -0.1141, -0.7304],
         ...,
         [ 0.8391,  0.6789, -0.4898,  ...,  1.3777, -0.1240, -0.6852],
         [ 0.8715,  0.5797, -0.2565,  ...,  1.0320, -0.3416, -0.1428],
         [ 0.6951,  0.7357, -0.4922,  ...,  1.3257, -0.4130, -0.7842]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-1.1240],
         [-1.0234],
         [-1.0986],
         [-1.0801],
         [-1.0381],
         [-1.0967],
         [-1.0986],
         [-1.1680],
         [-1.0752],
         [-1.1445]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_logits.size():  torch.Size([

input_ids:  tensor([[    0, 50268,  5488, 38187,   260,  8676,   222,   218,   895,  1780,
          4209, 50269,     2, 50265,   475,  4348,   385,  8974,  5691, 50266,
           475,  4348,   385,  8974,  5691,  2421, 31025,    16, 38187,   260,
          8676,     8,  1632,  5799,  8298,    31, 16080,   895, 50267, 26232,
            37,  1665,    25,   919,     9, 16080,   895,   194, 22437,  4561,
           982,   974,   212, 22437,  1418,    11,   425, 50267,   385,  8974,
          5691,  1665,    25,  5688,   884,    11, 16080,   895, 22437, 50267,
          2052,     7,   145,  2736,     7, 16080,   895, 22437,   385,  8974,
          5691,  1665,    11,   194,   790,    31, 15077,     7,  8548, 50267,
            37,  3562,  2052,     7,  2266,  1727,     8,    21,  4209,    30,
         44009,   181,  1368,  4291,  1344, 50267,  1437,     2, 50265,   475,
          8616,  2463,   218,   895,  1780, 50266,   475,  8616,  2463,   218,
           895,  1780,   842,  3320, 225

input_ids:  tensor([[    0, 50268,  5488,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  24,  145,  383,  527,  607,  680,  846,  944,  999, 1072],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.6663,  0.6410, -0.7407,  ...,  1.7985, -0.6580, -0.3145],
         [ 0.6780,  0.1013, -0.6615,  ...,  1.8534, -0.3242, -0.7081],
         [ 0.5075,  0.3068, -0.5974,  ...,  1.3277, -0.6127, -0.8360],
         ...,
         [ 1.1172,  0.3538, -0.7244,  ...,  1.2369, -0.4024, -0.5715],
         [ 0.4128,  0.3120, -0.7445,  ...,  1.7269, -0.6152, -0.5704],
         [ 0.8670,  0.5853, -0.6748,  ...,  1.8472, -0.5781, -0.5813]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-1.1514],
         [-1.1270],
         [-1.1328],
         [-1.1113],
         [-1.1045],
         [-1.2012],
         [-1.1855],
         [-1.1787],
         [-1.0908],
         [-1.1543]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

input_ids:  tensor([[    0, 50268,  4651,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  31,   92,  193,  291,  328,  448,  602,  660,  983, 1154],
       device='cuda:0')
sp_para_output:  tensor([[[ 1.1833,  0.4465, -0.3850,  ...,  1.3612, -0.5068, -0.8050],
         [ 1.0582,  0.3250, -0.6548,  ...,  1.0487, -0.4866, -0.7809],
         [ 0.9597,  0.4577, -0.5777,  ...,  1.3505, -0.5427, -0.3362],
         ...,
         [ 0.9781,  0.4821, -0.5497,  ...,  1.5086, -0.4393, -0.3582],
         [ 1.2342,  0.5442, -0.4318,  ...,  0.9723, -0.4157, -0.8143],
         [ 1.0362,  0.2418, -0.0473,  ...,  1.0356, -0.4624, -0.4578]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-1.2422],
         [-1.2100],
         [-1.1729],
         [-1.2900],
         [-1.1904],
         [-1.3086],
         [-1.1260],
         [-1.1543],
         [-1.1367],
         [-1.1133]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

logits:  tensor([[ 2.5024e-01, -2.9248e-01, -2.8824e-02, -1.7859e-01,  7.0435e-02,
          2.9517e-01, -1.2079e-01, -6.5869e-01,  9.3933e-02, -1.2952e-01,
          9.2529e-02,  2.3596e-01,  3.1952e-02,  8.7524e-02,  2.2079e-02,
         -7.8674e-02, -2.1484e-01,  2.3474e-01, -6.4392e-02, -5.8154e-01,
          2.3218e-01,  1.4209e-01,  1.1803e-02, -5.2734e-01,  1.9714e-02,
          1.4648e-01, -6.1475e-01, -4.0619e-02, -4.3481e-01, -1.1383e-01,
         -7.1838e-02,  3.3618e-01,  1.6968e-01, -1.0933e-02,  3.3545e-01,
          3.5156e-02,  5.0244e-01,  3.1714e-01,  3.4839e-01,  8.1726e-02,
         -3.4692e-01, -5.0244e-01,  8.4656e-02,  4.8615e-02,  2.0447e-01,
          3.7451e-01,  8.4900e-02, -2.7173e-01, -5.7764e-01, -3.4399e-01,
          4.6533e-01, -2.4023e-01, -4.0283e-01, -2.1545e-01, -5.4492e-01,
          3.2471e-01, -3.8422e-02,  3.1104e-01,  2.8760e-01,  3.2745e-02,
          3.8501e-01,  1.3171e-01,  8.6975e-02, -1.6663e-01,  2.2192e-01,
          2.3572e-01, -7.7271

input_ids:  tensor([[    0, 50268, 10569,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  17,  164,  219,  457,  591,  882, 1138, 1299, 1438, 1534],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.7638,  0.3730, -0.4980,  ...,  1.6031, -0.5253, -0.6092],
         [ 0.3065,  0.6804, -0.4394,  ...,  1.2698, -0.3506, -0.7629],
         [ 0.6072,  0.6455, -0.4985,  ...,  0.7011, -0.5275, -0.7763],
         ...,
         [ 0.5601,  0.8209, -0.2430,  ...,  1.5704, -0.4454, -0.7441],
         [ 0.7396,  0.7085, -0.3678,  ...,  1.5761, -0.4980, -0.5641],
         [ 0.5184,  0.4496, -0.7681,  ...,  1.4873, -0.6802, -0.5202]]],
       device='cuda:0', grad_fn=<IndexBackward>)
sp_para_output_t:  tensor([[[-1.1348],
         [-1.1484],
         [-1.1455],
         [-1.1816],
         [-1.0107],
         [-1.1895],
         [-1.1602],
         [-1.2178],
         [-1.1875],
         [-1.1777]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
start_

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

validation_step
qid:  5a879ab05542996e4f30887e
input_ids:  tensor([[    0, 50268, 24761,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([ 19, 117, 148, 310, 424, 497, 690, 746, 834, 880], device='cuda:0')
sp_para_output:  tensor([[[ 0.7716,  0.6052, -0.3287,  ...,  1.2317, -0.3828, -0.7057],
         [ 0.8132,  0.5344, -0.4433,  ...,  1.2059, -0.3897, -0.7592],
         [ 0.7129,  0.6566, -0.4680,  ...,  1.3497, -0.3827, -0.7562],
         ...,
         [ 0.7366,  0.6617, -0.4666,  ...,  1.1912, -0.2192, -0.7318],
         [ 0.6119,  0.7162, -0.3574,  ...,  1.3259, -0.2884, -0.8267],
         [ 0.7824,  0.5982, -0.4292,  ...,  1.4045, -0.3635, -0.8559]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2734],
         [-1.2217],
         [-1.2471],
         [-1.2910],
         [-1.2734],
         [-1.2529],
         [-1.2754],
         [-1.2500],
         [-1.3115],
         [-1.2666]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size

sp_para_output:  tensor([[[ 1.0005,  0.5933, -0.4517,  ...,  1.5871, -0.7074, -0.7612],
         [ 0.8432,  0.6292, -0.4667,  ...,  1.6144, -0.6687, -0.7434],
         [ 0.9862,  0.5958, -0.4494,  ...,  1.6780, -0.6105, -0.7126],
         ...,
         [ 0.9535,  0.5022, -0.4664,  ...,  1.6663, -0.6306, -0.7583],
         [ 0.9889,  0.4498, -0.3405,  ...,  1.5276, -0.6395, -0.7620],
         [ 0.9618,  0.3647, -0.4540,  ...,  1.5332, -0.5777, -0.5987]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2803],
         [-1.2705],
         [-1.3320],
         [-1.2607],
         [-1.2676],
         [-1.2969],
         [-1.2832],
         [-1.2930],
         [-1.2676],
         [-1.2812]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1810])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1810])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[707,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

para_indexes:  tensor([ 14, 108, 170, 333, 378, 430, 492, 626, 662, 742], device='cuda:0')
sp_para_output:  tensor([[[ 0.6271,  0.7575, -0.4035,  ...,  1.4041, -0.5401, -0.6471],
         [ 0.5366,  0.5842, -0.3153,  ...,  1.4248, -0.5853, -0.8775],
         [ 0.6329,  0.6572, -0.4486,  ...,  1.2551, -0.6095, -0.6331],
         ...,
         [ 0.7200,  0.5662, -0.4037,  ...,  1.6439, -0.5569, -0.7084],
         [ 0.5591,  0.5549, -0.3822,  ...,  1.5376, -0.6160, -0.6718],
         [ 0.7631,  0.6655, -0.4214,  ...,  1.5074, -0.5893, -0.7864]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2676],
         [-1.2471],
         [-1.2393],
         [-1.3164],
         [-1.2998],
         [-1.2979],
         [-1.2842],
         [-1.2676],
         [-1.2451],
         [-1.2773]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 783])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 783])
end_positions.size():  torch.Size([1, 

answers:  [{'text': ' nin it features nins relationships with writer henry miller his wife june miller psychoanalyst otto rank her father', 'score': tensor([1.2852], device='cuda:0', dtype=torch.float16), 'start_logit': tensor(1.2627, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.9185, device='cuda:0', dtype=torch.float16), 'p_type_score': tensor([1.6738], device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 5, 5, 6, 6, 6, 6, 7, 8, 9]
para_sent_logits: tensor([-1.9619, -1.9541, -1.9258], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-5.8418], device='cuda:0')
para_sents_offset: [0, 3]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int64)}
para_sent_logits: tensor([-1.9336, -1.9443, -1.9473], device='cuda:0', dtype=torch.float16)
para_sent_logits_sum: tensor([-5.8418, -5.8252], device='cuda:0')
para_sents_offset: [0, 3, 6]
evidence_candidates: {0: tensor([], device='cuda:0', dtype=torch.int

sp_para_output:  tensor([[[ 1.0367,  0.6014, -0.3700,  ...,  1.4589, -0.6927, -0.8438],
         [ 0.8065,  0.7863, -0.3162,  ...,  1.4691, -0.7020, -0.8822],
         [ 0.9137,  0.6583, -0.3473,  ...,  1.4677, -0.6783, -0.7917],
         ...,
         [ 0.9926,  0.6235, -0.4068,  ...,  1.4624, -0.7093, -0.8990],
         [ 0.8748,  0.5988, -0.3097,  ...,  1.4994, -0.6416, -0.8675],
         [ 0.9533,  0.5918, -0.4023,  ...,  1.4701, -0.7035, -0.8616]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.3096],
         [-1.3389],
         [-1.2832],
         [-1.2930],
         [-1.3203],
         [-1.3203],
         [-1.3086],
         [-1.3145],
         [-1.2852],
         [-1.3037]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1681])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1681])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[955, 961,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

input_ids:  tensor([[    0, 50268,  5488, 44878,     9, 38727,    11, 14490,  3821,  8377,
           247,  6308,   129,    65,  4707, 50269,     2, 50265,  9473,   493,
         50266,  9473,   493,  4142, 16441,     9,  9473,   493,   741,   298,
         18195,  6528, 26848, 48312,  6382,   271, 18195,   267,  2636,    16,
           247,    11,  2077,    25,   493, 50267,    24,    16,  3821,  8377,
           247,    30,   443,   200,  7877, 24347,   247,    19,    81,   316,
           325,    82,     8,   144, 24347,  4593,    11,   232, 50267,    24,
            16, 43490,    30,  9473,   811,  6444,    15,  2077,  4709,   873,
           811,  3342,    15, 10103,     8, 11751,     9,   741,  3314,   337,
            15, 10171, 50267,    24,   327,  1212,  7562,    19,   181,   677,
          7566,     7,  3072,  1855,  1243,  3087, 18239,     8,   741,   298,
         22165,     7,  9489,     8,   127, 43918, 12601,  1916,     8, 14225,
           462, 42589,     7,  3017, 502

sp_para_output:  tensor([[[ 0.9138,  0.5225, -0.3533,  ...,  1.5449, -0.4744, -0.6126],
         [ 0.7706,  0.4838, -0.4304,  ...,  1.5338, -0.4683, -0.6609],
         [ 0.7619,  0.6836, -0.3976,  ...,  1.4257, -0.5096, -0.7042],
         ...,
         [ 0.8032,  0.5717, -0.3829,  ...,  1.4921, -0.5059, -0.7029],
         [ 0.8271,  0.4108, -0.4698,  ...,  1.4207, -0.4585, -0.5026],
         [ 0.9217,  0.3647, -0.4005,  ...,  1.6144, -0.4974, -0.6242]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.3037],
         [-1.3174],
         [-1.3486],
         [-1.3027],
         [-1.3076],
         [-1.2842],
         [-1.2910],
         [-1.3115],
         [-1.2666],
         [-1.3018]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1338])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1338])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[551, 556, 827,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 1.0061,  0.5925, -0.5369,  ...,  1.7178, -0.4975, -0.6412],
         [ 0.9903,  0.6233, -0.5438,  ...,  1.5262, -0.4734, -0.7129],
         [ 1.0452,  0.7257, -0.4708,  ...,  1.4887, -0.4818, -0.8099],
         ...,
         [ 1.0690,  0.4746, -0.4177,  ...,  1.5125, -0.4650, -0.7622],
         [ 0.9626,  0.6352, -0.5076,  ...,  1.6830, -0.4646, -0.8462],
         [ 0.7572,  0.5111, -0.5855,  ...,  1.5387, -0.4849, -0.7314]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2852],
         [-1.2568],
         [-1.2988],
         [-1.2812],
         [-1.2930],
         [-1.2998],
         [-1.3037],
         [-1.3125],
         [-1.2510],
         [-1.2480]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1340])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1340])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[1173,    0,    0,    0,    0,    0,    0,    0,    0,    0,

validation_step
qid:  5a89372855429951533612e6
input_ids:  tensor([[    0, 50268, 12196,    16,  5933,     9,  1349,   147,  1014, 23321,
           118,   475, 15022, 11824, 30141,   316,  1946,    21, 10899, 50269,
             2, 50265, 14206,  5730, 29762,  9326, 50266, 14206,  5730, 29762,
          9326,    16,  4243,  4930,  1349,  2034,    11, 11824, 30141,    92,
          2077,   885,  4575, 28410,  7085,   493, 50267,    24,    16, 17949,
            15,  9910,    19,  6594,   781,  2523,     9, 14206,  5730, 29762,
             8,   885,   895,  6487,   257,     8,    16,   275,   684,    25,
           184,     9, 11824, 30141, 10775,  4243,  1015,   547,   349, 16874,
         24761,     8, 11824, 30141,   316,  1946,   515,   547,   349, 10668,
           428, 48540, 50267,  5356,  1558,  6301,   251,  1349,    16, 12684,
          2014,  9326,     8,    16,   285,   921,    19,  2340,  2078,  5165,
            77,   117,  4930,  1061,    32,   145,   422,     8,    89, 

logits:  tensor([[ 0.5488,  0.5137,  0.6943,  0.4980,  0.6821,  0.2408,  0.1109,  0.4839,
          0.3379,  0.6240,  0.2430,  0.4087,  0.5176,  0.6250,  0.1022,  0.3657,
          0.0411,  0.3376,  0.1586,  0.3418,  0.3835,  0.4480,  0.3313,  0.5459,
          0.9370,  0.1161,  0.4868,  0.3667,  0.4319,  0.7754,  0.4714, -0.0176,
          0.7266,  1.0771,  0.5361, -0.1788,  0.2637,  0.8301,  0.2396,  0.9014,
          1.1719,  0.8677,  0.6802,  0.2888,  0.1201,  0.8423,  0.3606,  0.3101,
         -0.0476,  0.4456,  0.5352,  0.9814,  0.7661,  0.4492,  0.5439,  0.4824,
          0.1620,  0.5088,  0.4971,  0.6167, -0.2551,  0.6279,  0.6533,  0.3232,
          0.2489,  0.0081,  0.2448, -0.1461, -0.1765, -0.0886,  0.7222,  0.4148,
          0.9424,  0.3823,  0.4419,  0.7227,  0.6934,  0.0401, -0.0038,  0.5312,
          0.3828, -0.0119,  0.8911,  0.3108,  0.4397,  0.4285,  0.4600, -0.0961,
         -0.1592,  0.6084,  0.1342,  0.3650,  0.4795,  0.6260,  0.1227,  0.3567,
          0.3984,  

sp_para_output:  tensor([[[ 0.8359,  0.6084, -0.4939,  ...,  1.1289, -0.3991, -0.6639],
         [ 0.8464,  0.5815, -0.5939,  ...,  1.1665, -0.4514, -0.6042],
         [ 0.6994,  0.6027, -0.5130,  ...,  1.3304, -0.4087, -0.5990],
         ...,
         [ 0.7098,  0.6488, -0.5523,  ...,  1.2407, -0.3462, -0.6948],
         [ 0.7757,  0.7570, -0.4848,  ...,  1.4162, -0.3839, -0.7245],
         [ 0.7342,  0.7024, -0.4999,  ...,  1.3622, -0.4547, -0.7088]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2617],
         [-1.2422],
         [-1.2422],
         [-1.2607],
         [-1.2754],
         [-1.2422],
         [-1.2529],
         [-1.2666],
         [-1.2539],
         [-1.2568]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1206])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1206])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[110,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.9477,  0.4163, -0.4970,  ...,  1.1920, -0.5432, -0.7056],
         [ 0.8880,  0.5011, -0.5678,  ...,  1.1313, -0.4936, -0.7421],
         [ 0.8732,  0.5611, -0.5630,  ...,  1.1123, -0.5015, -0.8128],
         ...,
         [ 0.8228,  0.4213, -0.5822,  ...,  1.2202, -0.5387, -0.7771],
         [ 0.8684,  0.4311, -0.4532,  ...,  1.1206, -0.4351, -0.7795],
         [ 1.0378,  0.3100, -0.5114,  ...,  1.1767, -0.5076, -0.6558]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2822],
         [-1.3145],
         [-1.3330],
         [-1.2920],
         [-1.2656],
         [-1.2646],
         [-1.2939],
         [-1.3203],
         [-1.3164],
         [-1.3115]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1175])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1175])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[853,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.9475,  0.7762, -0.4066,  ...,  1.4058, -0.6032, -0.6796],
         [ 0.7934,  0.6702, -0.4090,  ...,  1.2234, -0.6110, -0.6918],
         [ 0.8402,  0.7171, -0.4441,  ...,  1.3183, -0.4523, -0.6700],
         ...,
         [ 0.7495,  0.6845, -0.4179,  ...,  1.3043, -0.5562, -0.7567],
         [ 0.8837,  0.7241, -0.3414,  ...,  1.6644, -0.6506, -0.7454],
         [ 0.8415,  0.5170, -0.3916,  ...,  1.4121, -0.5968, -0.6214]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2812],
         [-1.2871],
         [-1.3252],
         [-1.2598],
         [-1.2549],
         [-1.2793],
         [-1.3330],
         [-1.3086],
         [-1.2842],
         [-1.2686]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1347])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1347])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[507,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.7928,  0.5767, -0.5286,  ...,  1.2847, -0.5826, -0.7763],
         [ 0.6641,  0.5221, -0.6454,  ...,  1.2269, -0.5529, -0.8448],
         [ 0.6168,  0.4376, -0.6231,  ...,  1.2497, -0.5724, -0.7913],
         ...,
         [ 0.6421,  0.3937, -0.6043,  ...,  1.3325, -0.5951, -0.6992],
         [ 0.6291,  0.4647, -0.6363,  ...,  1.2830, -0.5458, -0.8110],
         [ 0.7340,  0.3615, -0.5885,  ...,  1.2623, -0.5027, -0.7939]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.3018],
         [-1.3018],
         [-1.3135],
         [-1.3027],
         [-1.2568],
         [-1.3086],
         [-1.2363],
         [-1.2715],
         [-1.3145],
         [-1.3086]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1494])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1494])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[254, 270, 276,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.8302,  0.8465, -0.3870,  ...,  1.6933, -0.4445, -0.7355],
         [ 0.8150,  0.8076, -0.5584,  ...,  1.4790, -0.4291, -0.7768],
         [ 0.7656,  0.7481, -0.4369,  ...,  1.4853, -0.4278, -0.7330],
         ...,
         [ 0.6664,  0.7548, -0.5620,  ...,  1.5956, -0.4275, -0.7765],
         [ 0.8691,  0.8646, -0.3698,  ...,  1.5994, -0.4305, -0.7490],
         [ 0.7767,  0.8350, -0.4755,  ...,  1.5302, -0.4290, -0.8328]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2139],
         [-1.2422],
         [-1.2461],
         [-1.2686],
         [-1.2598],
         [-1.2148],
         [-1.2266],
         [-1.2588],
         [-1.2607],
         [-1.2217]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1136])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1136])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[284, 535,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 1.0556,  0.6354, -0.5648,  ...,  1.4250, -0.5248, -0.8495],
         [ 0.8547,  0.5328, -0.5601,  ...,  1.5641, -0.4302, -0.8636],
         [ 0.8267,  0.5850, -0.5603,  ...,  1.3669, -0.5190, -0.7683],
         ...,
         [ 0.8855,  0.6839, -0.5872,  ...,  1.4009, -0.5288, -0.8242],
         [ 1.1253,  0.5015, -0.4811,  ...,  1.3509, -0.4953, -0.8617],
         [ 0.9398,  0.4517, -0.5505,  ...,  1.5574, -0.4369, -0.8989]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.3037],
         [-1.3330],
         [-1.3291],
         [-1.2881],
         [-1.2783],
         [-1.3203],
         [-1.3301],
         [-1.2822],
         [-1.2881],
         [-1.2744]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1196])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1196])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[203, 581,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.8476,  0.3341, -0.5330,  ...,  1.3658, -0.5987, -0.7300],
         [ 0.7953,  0.3905, -0.5203,  ...,  1.4365, -0.5558, -0.7569],
         [ 0.6767,  0.4493, -0.4657,  ...,  1.3819, -0.5742, -0.8898],
         ...,
         [ 0.7440,  0.2983, -0.5451,  ...,  1.2561, -0.5956, -0.8372],
         [ 0.7743,  0.2970, -0.5326,  ...,  1.3447, -0.5443, -0.6728],
         [ 0.7448,  0.4013, -0.5000,  ...,  1.1507, -0.5880, -0.8384]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2500],
         [-1.2490],
         [-1.2695],
         [-1.2637],
         [-1.2354],
         [-1.2598],
         [-1.2773],
         [-1.2793],
         [-1.2461],
         [-1.2432]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1425])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1425])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[1363,    0,    0,    0,    0,    0,    0,    0,    0,    0,

validation_step
qid:  5adc1309554299438c868d3b
input_ids:  tensor([[    0, 50268, 19003,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  20,  128,  364,  473,  500,  619,  785,  949, 1096, 1272],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.9318,  0.5167, -0.2867,  ...,  1.5755, -0.5363, -0.7056],
         [ 0.7852,  0.6898, -0.3618,  ...,  1.5649, -0.5712, -0.7948],
         [ 0.7278,  0.6871, -0.3751,  ...,  1.4905, -0.5426, -0.7744],
         ...,
         [ 0.8152,  0.5236, -0.2753,  ...,  1.4238, -0.4646, -0.8673],
         [ 0.6626,  0.4637, -0.3735,  ...,  1.4644, -0.3848, -0.7030],
         [ 0.8372,  0.6308, -0.2889,  ...,  1.4665, -0.5218, -0.8205]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2822],
         [-1.2725],
         [-1.3096],
         [-1.2666],
         [-1.2637],
         [-1.2432],
         [-1.2686],
         [-1.2998],
         [-1.2568],
         [-1.2783]]], device='cuda:0', dtype=torch.float16)
start_logits.si

sp_para_output:  tensor([[[ 0.9070,  0.7316, -0.4053,  ...,  1.4154, -0.4428, -0.7186],
         [ 0.9019,  0.5144, -0.3964,  ...,  1.3461, -0.5174, -0.6659],
         [ 0.8961,  0.7014, -0.3933,  ...,  1.4102, -0.4394, -0.6803],
         ...,
         [ 0.9752,  0.6731, -0.4405,  ...,  1.3139, -0.5527, -0.6776],
         [ 0.8517,  0.6144, -0.4469,  ...,  1.4107, -0.5103, -0.7094],
         [ 0.8308,  0.6654, -0.4012,  ...,  1.3621, -0.4285, -0.7123]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2285],
         [-1.2490],
         [-1.2520],
         [-1.2764],
         [-1.2627],
         [-1.2178],
         [-1.2412],
         [-1.2168],
         [-1.2246],
         [-1.2422]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 917])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 917])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 68, 225, 248, 302, 386, 417, 495, 540, 647, 656, 693, 747, 7

sp_para_output:  tensor([[[ 0.5434,  0.5439, -0.4329,  ...,  1.3476, -0.5224, -0.6571],
         [ 0.6433,  0.4117, -0.5627,  ...,  1.2047, -0.4891, -0.6969],
         [ 0.4909,  0.4535, -0.5894,  ...,  1.2564, -0.4356, -0.6143],
         ...,
         [ 0.6204,  0.5764, -0.4410,  ...,  1.3760, -0.5106, -0.6350],
         [ 0.5094,  0.4268, -0.4343,  ...,  1.3840, -0.4117, -0.7081],
         [ 0.5966,  0.4430, -0.3776,  ...,  1.4671, -0.5169, -0.6684]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2754],
         [-1.2461],
         [-1.2686],
         [-1.2842],
         [-1.3115],
         [-1.2900],
         [-1.2559],
         [-1.2744],
         [-1.2969],
         [-1.2822]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1124])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1124])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[268, 288, 337, 362, 387, 430, 434,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.9441,  0.7884, -0.2459,  ...,  1.2553, -0.1441, -0.7123],
         [ 0.8660,  0.8561, -0.3544,  ...,  1.2023, -0.2414, -0.8690],
         [ 0.8184,  0.8094, -0.3370,  ...,  1.0610, -0.1548, -0.8167],
         ...,
         [ 0.8254,  0.9755, -0.3684,  ...,  1.2896, -0.1702, -0.7103],
         [ 0.8590,  0.7379, -0.3457,  ...,  1.2455, -0.1717, -0.7594],
         [ 0.8406,  0.7159, -0.2794,  ...,  1.2913, -0.1314, -0.8231]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2217],
         [-1.2109],
         [-1.2188],
         [-1.2305],
         [-1.2275],
         [-1.2344],
         [-1.2217],
         [-1.2041],
         [-1.2109],
         [-1.2373]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 877])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 877])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[271,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

validation_step
qid:  5adf732a5542993a75d264e9
input_ids:  tensor([[    0, 50268,  5488, 38187,   260,  8676,   222,   218,   895,  1780,
          4209, 50269,     2, 50265,   475,  4348,   385,  8974,  5691, 50266,
           475,  4348,   385,  8974,  5691,  2421, 31025,    16, 38187,   260,
          8676,     8,  1632,  5799,  8298,    31, 16080,   895, 50267, 26232,
            37,  1665,    25,   919,     9, 16080,   895,   194, 22437,  4561,
           982,   974,   212, 22437,  1418,    11,   425, 50267,   385,  8974,
          5691,  1665,    25,  5688,   884,    11, 16080,   895, 22437, 50267,
          2052,     7,   145,  2736,     7, 16080,   895, 22437,   385,  8974,
          5691,  1665,    11,   194,   790,    31, 15077,     7,  8548, 50267,
            37,  3562,  2052,     7,  2266,  1727,     8,    21,  4209,    30,
         44009,   181,  1368,  4291,  1344, 50267,  1437,     2, 50265,   475,
          8616,  2463,   218,   895,  1780, 50266,   475,  8616,  2463, 

validation_step
qid:  5ac2a912554299218029dae8
input_ids:  tensor([[    0, 50268,  5488,  ..., 10932,  2362,     2]], device='cuda:0')
para_indexes:  tensor([  24,  145,  383,  527,  607,  680,  846,  944,  999, 1072],
       device='cuda:0')
sp_para_output:  tensor([[[ 0.8948,  0.3393, -0.5704,  ...,  1.5085, -0.4629, -0.6716],
         [ 0.8521,  0.3274, -0.6495,  ...,  1.3678, -0.5451, -0.6628],
         [ 0.6858,  0.3255, -0.5810,  ...,  1.3451, -0.4781, -0.6490],
         ...,
         [ 0.8520,  0.4098, -0.6650,  ...,  1.4777, -0.4141, -0.6133],
         [ 0.7519,  0.4124, -0.5817,  ...,  1.5271, -0.4647, -0.7259],
         [ 0.7923,  0.4284, -0.6250,  ...,  1.3869, -0.5927, -0.6243]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2275],
         [-1.2324],
         [-1.2627],
         [-1.2441],
         [-1.2168],
         [-1.2451],
         [-1.2666],
         [-1.2549],
         [-1.2324],
         [-1.2422]]], device='cuda:0', dtype=torch.float16)
start_logits.si

sp_para_output:  tensor([[[ 0.9457,  0.7770, -0.4742,  ...,  1.7945, -0.5671, -0.8431],
         [ 0.8632,  0.5366, -0.5030,  ...,  1.6287, -0.5401, -0.9453],
         [ 0.8956,  0.4802, -0.4073,  ...,  1.8564, -0.6690, -1.1226],
         ...,
         [ 0.8274,  0.6823, -0.4352,  ...,  1.7937, -0.5811, -1.0598],
         [ 0.8196,  0.5502, -0.3743,  ...,  1.8637, -0.6135, -1.0113],
         [ 0.7772,  0.5948, -0.4867,  ...,  1.9943, -0.5838, -0.8480]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2080],
         [-1.1885],
         [-1.2305],
         [-1.2490],
         [-1.2324],
         [-1.2227],
         [-1.2305],
         [-1.2236],
         [-1.1973],
         [-1.2432]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1395])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1395])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[567,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output:  tensor([[[ 0.8209,  0.5477, -0.4425,  ...,  1.3615, -0.6250, -0.8163],
         [ 0.7312,  0.4839, -0.4207,  ...,  1.2720, -0.6118, -0.8952],
         [ 0.7945,  0.4323, -0.4615,  ...,  1.3467, -0.6191, -0.8094],
         ...,
         [ 0.7583,  0.4087, -0.5226,  ...,  1.4760, -0.6067, -0.7982],
         [ 0.7453,  0.5525, -0.4619,  ...,  1.3917, -0.5812, -0.8790],
         [ 0.9512,  0.4465, -0.3664,  ...,  1.2902, -0.5690, -0.7638]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2393],
         [-1.2568],
         [-1.2588],
         [-0.9556],
         [-1.2295],
         [-1.2471],
         [-1.2793],
         [-1.2480],
         [-1.2881],
         [-1.2705]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1514])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1514])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 624,  917, 1128,    0,    0,    0,    0,    0,    0,    0,

sp_para_output:  tensor([[[ 0.8540,  0.7362, -0.5623,  ...,  1.3619, -0.4920, -0.7484],
         [ 0.7839,  0.6820, -0.5097,  ...,  1.4485, -0.4857, -0.8160],
         [ 0.8508,  0.6792, -0.4587,  ...,  1.2871, -0.4687, -0.8577],
         ...,
         [ 0.8496,  0.7558, -0.4936,  ...,  1.3815, -0.5423, -0.8471],
         [ 0.8368,  0.7060, -0.4697,  ...,  1.2410, -0.5122, -0.8690],
         [ 0.8172,  0.7104, -0.4278,  ...,  1.3949, -0.4942, -0.8315]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2656],
         [-1.3203],
         [-1.2979],
         [-1.2900],
         [-1.2822],
         [-1.2510],
         [-1.2764],
         [-1.2920],
         [-1.3086],
         [-1.2686]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1806])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1806])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[606,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

sp_para_output_t:  tensor([[[-1.2939],
         [-1.2637],
         [-1.3027],
         [-1.3164],
         [-1.3184],
         [-1.3018],
         [-1.2861],
         [-1.2812],
         [-1.2725],
         [-1.2979]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1199])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1199])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[261, 292, 297,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
logits:  tensor([[0.5923, 0.6182, 0.5664,  ..., 0.8481, 0.8306, 0.8276]],
       device='cuda:0', dtype=torch.float16)
masked_target:  tensor([[264, 295, 300,   0,   0

start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1326])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[1324,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]], device='cuda:0')
logits:  tensor([[0.5386, 0.5127, 0.5913,  ..., 0.8130, 0.6890, 0.8623]],
       device='cuda:0', dtype=torch.float16)
masked_target:  tensor([[1324,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0

sp_para_output_t:  tensor([[[-1.2578],
         [-1.2432],
         [-1.2393],
         [-1.2383],
         [-1.2891],
         [-1.2598],
         [-1.2627],
         [-1.2480],
         [-1.2295],
         [-1.2637]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 875])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 875])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[873,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
logits:  tensor([[ 0.5615,  0.8237,  0.5493,  0.1049,  0.5239,  0.7725,  0.3394,  0.1340,
          0.9697,  0.4324,  0.3967,  0.1743,  0.6094,  0.4402,  0.5283,  0.4358

sp_para_output:  tensor([[[ 0.7496,  0.7213, -0.2966,  ...,  1.3744, -0.5397, -0.6414],
         [ 0.7886,  0.5632, -0.3674,  ...,  1.4328, -0.5115, -0.7231],
         [ 0.7071,  0.6333, -0.3995,  ...,  1.4116, -0.4942, -0.7140],
         ...,
         [ 0.7386,  0.5672, -0.3370,  ...,  1.4125, -0.5091, -0.7169],
         [ 0.7065,  0.7206, -0.3796,  ...,  1.5157, -0.4877, -0.7098],
         [ 0.7272,  0.5460, -0.4011,  ...,  1.4206, -0.5413, -0.7309]]],
       device='cuda:0')
sp_para_output_t:  tensor([[[-1.2275],
         [-1.2188],
         [-1.2373],
         [-1.2256],
         [-1.2207],
         [-1.2607],
         [-1.2109],
         [-1.2529],
         [-1.2705],
         [-1.1963]]], device='cuda:0', dtype=torch.float16)
start_logits.size():  torch.Size([1, 1667])
start_positions.size():  torch.Size([1, 64])
end_logits.size():  torch.Size([1, 1667])
end_positions.size():  torch.Size([1, 64])
masked_target:  tensor([[ 120,  151,  180,  479,  538,  577,  892,  922, 1002, 1464,


Epoch 00005: avg_val_f1 reached 0.02105 (best 0.02105), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_5.ckpt as top 5


validation_epoch_end
before sync --> sizes:  29, 29, 29
after sync --> sizes: 29, 29, 29
avg_loss:  tensor(15.0574, device='cuda:0')	avg_answer_loss:  tensor(6.2189, device='cuda:0')	avg_type_loss:  tensor(0.2508, device='cuda:0')	avg_val_f1:  0.021047917933299624	avg_val_em:  0.0	avg_val_prec:  0.011511156271243918	avg_val_recall:  0.13793103448275862
avg_val_sp_sent_f1:  tensor(0., device='cuda:0')	avg_val_sp_sent_em:  tensor(0., device='cuda:0')	avg_val_sp_sent_prec:  tensor(0., device='cuda:0')	avg_val_sp_sent_recall:  tensor(0., device='cuda:0')
avg_val_joint_f1:  tensor(0., device='cuda:0')	avg_val_joint_em:  tensor(0., device='cuda:0')	avg_val_joint_prec:  tensor(0., device='cuda:0')	avg_val_joint_recall:  tensor(0., device='cuda:0')



1

In [None]:
#     trainer.test(model)

In [None]:
### To install apex ### 
#     !git clone https://github.com/NVIDIA/apex
#     os.chdir("/xdisk/msurdeanu/fanluo/hotpotQA/apex/")
#     !module load cuda101/neuralnet/7/7.6.4  
#     !module load cuda10.1/toolkit/10.1.243 
#     !conda install -c conda-forge cudatoolkit-dev --yes
#     !conda install -c conda-forge/label/cf201901 cudatoolkit-dev --yes
#     !conda install -c conda-forge/label/cf202003 cudatoolkit-dev --yes
#     !which nvcc
#     !python -m pip install -v --no-cache-dir ./
#     os.chdir("/xdisk/msurdeanu/fanluo/hotpotQA/")

In [40]:
# debug: check args
import shlex
argString ='--train_dataset small.json --dev_dataset small.json  \
    --gpus 0 --num_workers 4 \
    --max_seq_len 4096 --doc_stride -1  \
    --save_prefix hotpotqa-longformer_jupyter  --model_path /xdisk/msurdeanu/fanluo/hotpotQA/longformer-base-4096'
# hotpot_dev_distractor_v1.json
#  --train_dataset /xdisk/msurdeanu/fanluo/hotpotQA/Data/reduced_questions/hotpot_reduced_context_04-08-2021-01:12:53/hotpot_train_reduced_context_coref_fuzzy.json --dev_dataset /xdisk/msurdeanu/fanluo/hotpotQA/Data/reduced_questions/hotpot_reduced_context_04-08-2021-01:12:53/hotpot_dev_reduced_context_coref_fuzzy.json  \ 

import argparse 
if __name__ == "__main__":
    main_arg_parser = argparse.ArgumentParser(description="hotpotqa")
    parser = hotpotqa.add_model_specific_args(main_arg_parser, os.getcwd())
    args = parser.parse_args(shlex.split(argString)) 
    for arg in vars(args):
        print((arg, getattr(args, arg)))
    main(args)


('save_dir', 'jupyter-hotpotqa')
('save_prefix', 'hotpotqa-longformer_jupyter')
('train_dataset', 'small.json')
('dev_dataset', 'small.json')
('batch_size', 2)
('gpus', '0')
('warmup', 1000)
('lr', 5e-05)
('val_every', 1.0)
('val_percent_check', 1.0)
('num_workers', 4)
('seed', 1234)
('epochs', 6)
('max_seq_len', 4096)
('max_doc_len', 4096)
('max_num_answers', 64)
('max_question_len', 55)
('doc_stride', -1)
('ignore_seq_with_no_answers', False)
('disable_checkpointing', False)
('n_best_size', 20)
('max_answer_length', 30)
('regular_softmax_loss', False)
('test', False)
('model_path', '/xdisk/msurdeanu/fanluo/hotpotQA/longformer-base-4096')
('no_progress_bar', False)
('attention_mode', 'sliding_chunks')
('fp32', False)
('train_percent', 1.0)


### Sandbox

In [None]:
import torch
from longformer.longformer import Longformer, LongformerConfig
from longformer.sliding_chunks import pad_to_window_size
from transformers import RobertaTokenizer

config = LongformerConfig.from_pretrained('/xdisk/msurdeanu/fanluo/hotpotQA/longformer-base-4096') 
# choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
# 'n2': for regular n2 attantion
# 'tvm': a custom CUDA kernel implementation of our sliding window attention
# 'sliding_chunks': a PyTorch implementation of our sliding window attention
config.attention_mode = 'sliding_chunks'

In [None]:
model = Longformer.from_pretrained('/xdisk/msurdeanu/fanluo/hotpotQA/longformer-base-4096', config=config)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer.model_max_length = model.config.max_position_embeddings

In [None]:
SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document

input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

# TVM code doesn't work on CPU. Uncomment this if `config.attention_mode = 'tvm'`
model = model.cuda() 
input_ids = input_ids.cuda()

In [None]:
# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
attention_mask[:, [1, 4, 21,]] =  2  # Set global attention based on the task. For example,
                                     # classification: the <s> token
                                     # QA: question tokens

# padding seqlen to the nearest multiple of 512. Needed for the 'sliding_chunks' attention
input_ids, attention_mask = pad_to_window_size(
        input_ids, attention_mask, config.attention_window[0], tokenizer.pad_token_id)

In [None]:
output = model(input_ids, attention_mask=attention_mask)[0]
output

In [None]:

!nvidia-smi
!nvidia-smi -L

In [None]:
import torch
crossentropy_average = torch.nn.CrossEntropyLoss(reduction = 'mean', ignore_index=-1) 
crossentropy_weighted_average = torch.nn.CrossEntropyLoss(weight=torch.tensor([1.0, 4.0]), reduction = 'mean', ignore_index=-1) 

sp_para_output_t = torch.tensor([[
         [ 0.0227],
         [ 0.0365],
         [ 0.0054],
         [ 0.0401],
         [ 0.0590],
         [ 0.0134],
         [ 0.0336],
         [-0.0062],
         [ 0.0631],
         [ 0.0811]]])
sp_para_output_aux = torch.zeros(sp_para_output_t.shape, dtype=torch.float) 
predict_support_para = torch.cat([sp_para_output_aux, sp_para_output_t], dim=-1).view(-1,2)
# predict_support_para = torch.cat([sp_para_output_t, sp_para_output_aux], dim=-1).view(-1,2)
predict_support_para

sp_para =  torch.tensor([0, 0, 0, 0, 0, 1, 1, 0, 0, 0])
#sp_para_loss:  tensor(0.7061, device='cuda:0')

crossentropy_average(predict_support_para, sp_para)

crossentropy_weighted_average(predict_support_para, sp_para)

sp_para

positive_index = torch.nonzero(sp_para.view(-1)).view(-1).tolist()

from random import choice
choice([i for i in range(0,9) if i not in poositive_index])

set(range(10))

predict_support_para_1 = torch.tensor([[ 0.0000,  0.0227]])
sp_para_1 =  torch.tensor([0])
loss1 = crossentropy_average(predict_support_para_1, sp_para_1)
loss1

predict_support_para_2 = torch.tensor([[ 0.0000,  0.0365]])
sp_para_2 =  torch.tensor([0])
loss2 = crossentropy_average(predict_support_para_2, sp_para_2)
loss2

predict_support_para_3 = torch.tensor([[ 0.0000,  0.0054]])
sp_para_3 =  torch.tensor([0])
loss3 = crossentropy_average(predict_support_para_3, sp_para_3)
loss3

predict_support_para_4 = torch.tensor([[ 0.0000,  0.0401]])
sp_para_4 =  torch.tensor([0])
loss4 = crossentropy_average(predict_support_para_4, sp_para_4)
loss4

predict_support_para_5 = torch.tensor([[ 0.0000,  0.0590]])
sp_para_5 =  torch.tensor([0])
loss5 = crossentropy_average(predict_support_para_5, sp_para_5)
loss5

predict_support_para_6 = torch.tensor([[ 0.0000,  0.0134]])
sp_para_6 =  torch.tensor([1])
loss6 = crossentropy_average(predict_support_para_6, sp_para_6)
loss6

predict_support_para_7 = torch.tensor([[ 0.0000,  0.0336]])
sp_para_7 =  torch.tensor([1])
loss7 = crossentropy_average(predict_support_para_7, sp_para_7)
loss7

predict_support_para_8 = torch.tensor([[ 0.0000, -0.0062]])
sp_para_8 =  torch.tensor([0])
loss8 = crossentropy_average(predict_support_para_8, sp_para_8)
loss8

predict_support_para_9 = torch.tensor([[ 0.0000,  0.0631]])
sp_para_9 =  torch.tensor([0])
loss9 = crossentropy_average(predict_support_para_9, sp_para_9)
loss9

predict_support_para_10 = torch.tensor([[ 0.0000,  0.0811]])
sp_para_10 =  torch.tensor([0])
loss10 = crossentropy_average(predict_support_para_10, sp_para_10)
loss10

(loss1+loss2+loss3+loss4+loss5+loss6+loss7+loss8+loss9+loss10)/10

(loss1+loss2+loss3+loss4+loss5+4.0*loss6+4.0*loss7+loss8+loss9+loss10)/(8+2*4)

In [None]:
s_to_p_map = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7, 7, 7, 8, 9]
sp_sent_logits = torch.tensor([[[-0.0143],
         [ 0.0416],
         [ 0.1065],
         [-0.0007],
         [ 0.0407],
         [ 0.1273],
         [ 0.0663],
         [ 0.0500],
         [ 0.0630],
         [ 0.1230],
         [ 0.0140],
         [ 0.0205],
         [-0.0505],
         [ 0.0224],
         [ 0.0907],
         [-0.0142],
         [ 0.0180],
         [ 0.0564],
         [-0.0132],
         [ 0.0016],
         [ 0.0327],
         [ 0.0687],
         [ 0.1049],
         [ 0.0568],
         [ 0.0740],
         [ 0.0006],
         [ 0.1159],
         [ 0.0566],
         [ 0.0543],
         [-0.0304],
         [ 0.1315],
         [-0.0143],
         [ 0.0444],
         [-0.0682],
         [ 0.1120],
         [ 0.1247]]])


In [None]:

if(len(s_to_p_map)>0):
    for i in range(s_to_p_map[-1]+1)
        sent_logits_torch.masked_select(sp_sent_logits.squeeze(), torch.tensor([p==0 for p in s_to_p_map]))

In [None]:
list(range(0))
