In [1]:
# increase the cell width 
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; } </style>"))   

# need to run this every time start this notebook, to add python3.7/site-packages to sys.pat, in order to import ipywidgets, which is used when RobertaTokenizer.from_pretrained('roberta-base') 
import sys
# sys.path.insert(0, '/xdisk/msurdeanu/fanluo/miniconda3/envs/hotpotqa/lib/python3.7/site-packages') 

### convert hotpotqa to squard format

According to Longformer: use the following input format with special tokens:  “[CLS] [q] question [/q] [p] sent1,1 [s] sent1,2 [s] ... [p] sent2,1 [s] sent2,2 [s] ...” 
where [s] and [p] are special tokens representing sentences and paragraphs. The special tokens were added to the RoBERTa vocabulary and randomly initialized before task finetuning.

In [2]:
# helper functions to convert hotpotqa to squard format modified from  https://github.com/chiayewken/bert-qa/blob/master/run_hotpot.py

import tqdm 
from datetime import datetime 
import pytz 
timeZ_Az = pytz.timezone('US/Mountain') 
#!pip install -U transformers
#!pip install torch==1.6.0 torchvision==0.7.0
import transformers 

QUESTION_START = '[question]'
QUESTION_END = '[/question]' 
TITLE_START = '<t>'  # indicating the start of the title of a paragraph (also used for loss over paragraphs)
TITLE_END = '</t>'   # indicating the end of the title of a paragraph
SENT_MARKER_END = '[/sent]'  # indicating the end of the title of a sentence (used for loss over sentences)
PAR = '[/par]'  # used for indicating end of the regular context and beginning of `yes/no/null` answers
EXTRA_ANSWERS = " yes no null"

 
def create_example_dict(context, answer, id, question, is_sup_fact, is_supporting_para):
    return {
        "context": context,
        "qas": [                        # each context corresponds to only one qa in hotpotqa
            {
                "answer": answer,
                "id": id,
                "question": question,
                "is_sup_fact": is_sup_fact,
                "is_supporting_para": is_supporting_para
            }
        ],
    }

def create_para_dict(example_dicts):
    if type(example_dicts) == dict:
        example_dicts = [example_dicts]   # each paragraph corresponds to only one [context, qas] in hotpotqa
    return {"paragraphs": example_dicts}   


In [3]:
# !python -m pip install tqdm 
# !python -m pip install git+https://github.com/allenai/longformer.git 
# !python -m pip install pytorch-lightning==0.6.0
# !python -m pip install jdc  
# !wget https://ai2-s2-research.s3-us-west-2.amazonaws.com/longformer/longformer-base-4096.tar.gz
# !tar -xf longformer-base-4096.tar.gz

In [4]:
import re
import string

def convert_hotpot_to_squad_format(json_dict, gold_paras_only=False):
    
    """function to convert hotpotqa to squard format.


    Note: A context corresponds to several qas in SQuard. In hotpotqa, one question corresponds to several paragraphs as context. 
          "paragraphs" means different: each paragraph in SQuard contains a context and a list of qas; while 10 paragraphs in hotpotqa concatenated into a context for one question.

    Args:
        json_dict: The original data load from hotpotqa file.
        gold_paras_only: when is true, only use the 2 paragraphs that contain the gold supporting facts; if false, use all the 10 paragraphs
 

    Returns:
        new_dict: The converted dict of hotpotqa dataset, use it as a dict would load from SQuAD json file
                  usage: input_data = new_dict["data"]   https://github.com/google-research/bert/blob/eedf5716ce1268e56f0a50264a88cafad334ac61/run_squad.py#L230

    """
 
    new_dict = {"data": []} 
    for example in json_dict: 

        support_para = set(
            para_title for para_title, _ in example["supporting_facts"]
        )
        sp_set = set(list(map(tuple, example['supporting_facts'])))
        
        raw_contexts = example["context"]
        if gold_paras_only: 
            raw_contexts = [lst for lst in raw_contexts if lst[0] in support_para]
            
        is_supporting_para = []  # a boolean list with 10 True/False elements, one for each paragraph
        is_sup_fact = []         # a boolean list with True/False elements, one for each context sentence
        for para_title, para_lines in raw_contexts:
            is_supporting_para.append(para_title in support_para)   
            for sent_id, sent in enumerate(para_lines):
                is_sup_fact.append( (para_title, sent_id) in sp_set )    
        
        contexts = []   
        for para_id, para in enumerate(raw_contexts):   
            title = _normalize_text(para[0])    
            sents = [_normalize_text(sent) for sent in para[1]] 
            
            if("kept_para_sent" in example):    # reduceded context 
                sent_joint = '' 
                for sent_id, sent in enumerate(sents):  
                    if(sent_id > 0 and example["kept_para_sent"][para_id][sent_id] - example["kept_para_sent"][para_id][sent_id-1] > 1):    
                        sent_joint += (' </s> ' + sent + ' ' + SENT_MARKER_END )   # </s> indicates at least one sentence is removed    
                    else:   
                        sent_joint += (sent + ' ' + SENT_MARKER_END )   
            else:   
                sent_joint =  (' ' + SENT_MARKER_END +' ').join(sents) + ' ' + SENT_MARKER_END      
                
            contexts.append(TITLE_START + ' ' + title + ' ' + TITLE_END + ' ' +  sent_joint)    
            
        # contexts = [TITLE_START + ' ' + lst[0]  + ' ' + TITLE_END + ' ' + (' ' + SENT_MARKER_END +' ').join(lst[1]) + ' ' + SENT_MARKER_END for lst in raw_contexts]    
 
        answer = _normalize_text(example["answer"])  
        
        if(len(answer) > 0):   # answer can be '' after normalize
            new_dict["data"].append(
                create_para_dict(
                    create_example_dict(
                        context=contexts,
                        answer=answer,
                        id = example["_id"],
                        question=_normalize_text(example["question"]),
                        is_sup_fact = is_sup_fact,
                        is_supporting_para = is_supporting_para 
                    )
                )
            ) 
#     print("number of questions with answer not found in context: ", num_null_answer)
#     print("number of questions with answer 'yes': ", num_yes_answer)
#     print("number of questions with answer 'no': ", num_no_answer)
    return new_dict

def _normalize_text(s):

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

In [5]:
# debug: check whether convert_hotpot_to_squad_format() works
import os
os.chdir('/xdisk/msurdeanu/fanluo/hotpotQA/Data')
# !cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/hotpot_train_v1.1.json | ../../helper/jq-linux64 -c '.[1:30]' > small.json
#!cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/hotpot_train_v1.1.json | ../../helper/jq-linux64 -c '.[37:50]' > small_dev.json
# !cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/hotpot_train_v1.1.json | ../../helper/jq-linux64 -c '.[31:50]' > sample.json
# !cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/reduced_questions/hotpot_reduced_context_04-08-2021-01:12:53/hotpot_dev_reduced_context_coref_fuzzy.json | ../../helper/jq-linux64 -c '.[6666:7000]' > small_dev.json
# !cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/reduced_questions/hotpot_reduced_context_04-08-2021-01:12:53/hotpot_dev_reduced_context_coref_fuzzy.json | ../../helper/jq-linux64 -c '.[1515:1525]' > small_dev3.json
# !cat /xdisk/msurdeanu/fanluo/hotpotQA/Data/reduced_questions/hotpot_reduced_context_04-08-2021-01:12:53/hotpot_dev_reduced_context_coref_fuzzy.json | ../../helper/jq-linux64 -c '.[6800:7405]' > small_end.json    
    
import json
with open("small.json", "r", encoding='utf-8') as f:  
    json_dict = convert_hotpot_to_squad_format(json.load(f))['data']
    print(json.dumps(json_dict[3], indent=2))

{
  "paragraphs": [
    {
      "context": [
        "<t> cadmium chloride </t> cadmium chloride is white crystalline compound of cadmium and chlorine with formula cdcl [/sent] it is hygroscopic solid that is highly soluble in water and slightly soluble in alcohol [/sent] although it is considered to be ionic it has considerable covalent character to its bonding [/sent] crystal structure of cadmium chloride described below composed of twodimensional layers of ions is reference for describing other crystal structures [/sent] also known are cdcl\u2022ho and cdcl\u20225ho [/sent]",
        "<t> water blue </t> water blue also known as aniline blue acid blue 22 soluble blue 3m marine blue v or ci 42755 is chemical compound used as stain in histology [/sent] water blue stains collagen blue in tissue sections [/sent] it is soluble in water and slightly soluble in ethanol [/sent]",
        "<t> diflucortolone valerate </t> diflucortolone valerate also nerisone creamoily creamointment neriderm

### longfomer's fine-tuning


- For answer span extraction we use BERT’s QA model with addition of a question type (yes/no/span) classification head over the first special token ([CLS]).

- For evidence extraction we apply 2 layer feedforward networks on top of the representations corresponding to sentence and paragraph tokens to get the corresponding evidence prediction scores and use binary cross entropy loss to train the model.

- We combine span, question classification, sentence, and paragraphs losses and train the model in a multitask way using linear combination of losses.


In [6]:
### Section2: This is modified from longfomer's fine-tuning with triviaqa.py from https://github.com/allenai/longformer/blob/master/scripts/triviaqa.py

# !pip uninstall longformer -y
# !python -m pip uninstall longformer -y
# !pip install git+https://github.com/allenai/longformer.git 
# !python -m pip uninstall pytorch-lightning -y
# !pip uninstall pytorch-lightning -y
# !python -m pip install git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning
#!pip install torch==1.6.0 torchvision==0.7.0
 


####requirements.txt:torch>=1.2.0, transformers>=3.0.2, tensorboardX, pytorch-lightning==0.6.0, test-tube==0.7.5
# !conda install transformers --yes
# !conda install cudatoolkit=10.0 --yes
# !python -m pip install git+https://github.com/allenai/longformer.git
# !conda install -c conda-forge regex --force-reinstall --yes
# !conda install pytorch-lightning -c conda-forge
#!python -m pip install jdc 
# !pip install test-tube 
#!python -m pip install ipywidgets 
# !conda update --force conda --yes  
# !jupyter nbextension enable --py widgetsnbextension 
# !conda install jupyter --yes


import os
import json
import string
import random
import numpy as np
import torch
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader, Dataset 

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel
from pytorch_lightning.logging import TestTubeLogger    # sometimes pytorch_lightning.loggers works instead

from longformer.longformer import Longformer, LongformerConfig
from longformer.sliding_chunks import pad_to_window_size
from transformers import RobertaTokenizer
import jdc
from more_itertools import locate
from collections import Counter
from collections import defaultdict

In [7]:
print(pl.__file__)

/home/u32/fanluo/.local/lib/python3.6/site-packages/pytorch_lightning/__init__.py


#### class hotpotqaDataset

##### \_\_init\_\_, \_\_getitem\_\_ and \_\_len\_\_ 

In [8]:
class hotpotqaDataset(Dataset):
    """
    Largely based on
    https://github.com/allenai/allennlp/blob/master/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py
    and
    https://github.com/huggingface/transformers/blob/master/examples/run_squad.py
    """
    
    
    def __init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride,
                 max_num_answers, ignore_seq_with_no_answers, max_question_len):
        assert os.path.isfile(file_path)
        self.file_path = file_path
#         if("reduced_context" not in self.file_path):
        with open(self.file_path, "r", encoding='utf-8') as f:
            print(f'reading file: {self.file_path}')
            self.data_json = convert_hotpot_to_squad_format(json.load(f))['data']
                
#         else:
#             with open(self.file_path, "r", encoding='utf-8') as f:
#                 print(f'reading file: {self.file_path}')
#                 self.data_json = json.load(f)['data']            
#                 print(self.data_json[0])
            
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.max_doc_len = max_doc_len
        self.doc_stride = doc_stride
        self.max_num_answers = max_num_answers
        self.ignore_seq_with_no_answers = ignore_seq_with_no_answers
        self.max_question_len = max_question_len


#         print(tokenizer.all_special_tokens) 
    
        # A mapping from qid to an int, which can be synched across gpus using `torch.distributed`
        if 'train' not in self.file_path:  # only for the evaluation set 
            self.val_qid_string_to_int_map =                  {
                    entry["paragraphs"][0]['qas'][0]['id']: index
                    for index, entry in enumerate(self.data_json)
                }
        else:
            self.val_qid_string_to_int_map = None
            
            
    def __len__(self):
        return len(self.data_json)

    def __getitem__(self, idx):
        entry = self.data_json[idx]
        tensors_list = self.one_example_to_tensors(entry, idx)
        if(len(tensors_list) != 1):
            print("tensors_list: ", tensors_list)
        assert len(tensors_list) == 1
        return tensors_list[0]

##### one_example_to_tensors

In [9]:
    %%add_to hotpotqaDataset
    def one_example_to_tensors(self, example, idx):
        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
                return True
            return False
        
        def map_answer_positions(char_to_word_offset, orig_to_tok_index, answer_start, answer_end, slice_start, slice_end, doc_offset):
            # char offset to word offset
            if(answer_start >= len(char_to_word_offset)):
                print("answer_start: ", answer_start)
                print("len(char_to_word_offset): ", len(char_to_word_offset))
            # char offset to word offset
            start_word_position = char_to_word_offset[answer_start]
            end_word_position = char_to_word_offset[answer_end-1] 

#             print("start_word_position: ", start_word_position)
#             print("end_word_position: ", end_word_position)
            # sub_tokens postion reletive to context
            tok_start_position_in_doc = orig_to_tok_index[start_word_position]  
            not_end_of_doc = int(end_word_position + 1 < len(orig_to_tok_index))
            tok_end_position_in_doc = orig_to_tok_index[end_word_position + not_end_of_doc] - not_end_of_doc
            
            if tok_start_position_in_doc < slice_start or tok_end_position_in_doc > slice_end:
                return (-1, -1) # this answer is outside the current slice                     
            
            # sub_tokens postion reletive to begining of all the tokens, including query sub tokens  
            start_position = tok_start_position_in_doc + doc_offset  
            end_position = tok_end_position_in_doc + doc_offset
            
            return (start_position, end_position)
        
#         print("idx: ", idx)
#         print("len(example): ", "len(example)")
        if(len(example["paragraphs"])==0):
            print("idx: ", idx, "'s len(example[‘paragraphs’])==0")

        tensors_list = []
        for paragraph in example["paragraphs"]:  # example["paragraphs"] only contains one paragraph in hotpotqa
            # print("for paragraph in example['paragraphs']: ") 
            context = self.tokenizer.sep_token + ' ' + (' ' + self.tokenizer.sep_token + ' ').join(paragraph["context"] )   
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in context:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c) # add a new token
                    else:
                        doc_tokens[-1] += c  # append the character to the last token
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
            
#             print("len(char_to_word_offset): ", len(char_to_word_offset))
#             print("char_to_word_offset: ", char_to_word_offset)
            for qa in paragraph["qas"]:
                question_text = qa["question"]
                # print("question text: ", question_text)  
                sp_sent = qa["is_sup_fact"]
                sp_para = qa["is_supporting_para"]
                start_position = None
                end_position = None
                orig_answer_text = None 

#                     print("len(sp_sent):", len(sp_sent))
#                     print("sp_sent", sp_sent) 
#                     print("doc_tokens", doc_tokens)
 
                # keep all answers in the document, not just the first matched answer. It also added the list of textual answers to make evaluation easy.
                
                   
                # ===== Given an example, convert it into tensors  =============
                 
                query_tokens = self.tokenizer.tokenize(question_text)
                query_tokens = query_tokens[:self.max_question_len]
                tok_to_orig_index = []
                orig_to_tok_index = []
                all_doc_tokens = []
                
                # each original token in the context is tokenized to multiple sub_tokens
                for (i, token) in enumerate(doc_tokens):
                    orig_to_tok_index.append(len(all_doc_tokens))
                    # hack: the line below should have been `self.tokenizer.tokenize(token')`
                    # but roberta tokenizer uses a different subword if the token is the beginning of the string
                    # or in the middle. So for all tokens other than the first, simulate that it is not the first
                    # token by prepending a period before tokenizing, then dropping the period afterwards
                    sub_tokens = self.tokenizer.tokenize(f'. {token}')[1:] if i > 0 else self.tokenizer.tokenize(token)
                    for sub_token in sub_tokens:
                        tok_to_orig_index.append(i)
                        all_doc_tokens.append(sub_token)
                
                # all sub tokens, truncate up to limit
                all_doc_tokens = all_doc_tokens[:self.max_doc_len-7] 

                # The -7 accounts for CLS, QUESTION_START, QUESTION_END， [/par]， yes， no， </s>   
                max_tokens_per_doc_slice = self.max_seq_len - len(query_tokens) - 7
                if(max_tokens_per_doc_slice <= 0):
                    print("(max_tokens_per_doc_slice <= 0)")
                assert max_tokens_per_doc_slice > 0
                if self.doc_stride < 0:                           # default
                    # negative doc_stride indicates no sliding window, but using first slice
                    self.doc_stride = -100 * len(all_doc_tokens)  # large -negtive value for the next loop to execute once
                
                # inputs to the model
                input_ids_list = []
                input_mask_list = []
                segment_ids_list = []
                start_positions_list = []
                end_positions_list = []
                q_type_list = []
                sp_sent_list =  [1 if ss else 0 for ss in sp_sent]
                sp_para_list = [1 if sp else 0 for sp in sp_para]
                
                if(len(all_doc_tokens) == 0):
                    print("idx: ", idx, " len(all_doc_tokens) == 0")
#               
                
                for slice_start in range(0, len(all_doc_tokens), max_tokens_per_doc_slice - self.doc_stride):    # execute once by default
                
                    # print("slice_start in range") 
                    slice_end = min(slice_start + max_tokens_per_doc_slice, len(all_doc_tokens))

                    doc_slice_tokens = all_doc_tokens[slice_start:slice_end]
                    tokens = [self.tokenizer.cls_token] + [QUESTION_START] + query_tokens + [QUESTION_END] + doc_slice_tokens + [PAR] + self.tokenizer.tokenize("yes") + self.tokenizer.tokenize("no") + [self.tokenizer.eos_token]   
                    segment_ids = [0] * (len(query_tokens) + 3) + [1] * (len(doc_slice_tokens) + 4) 
#                     if(len(segment_ids) != len(tokens)):
#                         print("len(segment_ids): ", len(segment_ids))
#                         print("len(tokens): ", len(tokens))
                    assert len(segment_ids) == len(tokens)

                    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)   
                    input_mask = [1] * len(input_ids)

                    doc_offset = len(query_tokens) + 3 - slice_start  # where context starts
                    
                    # ===== answer positions tensors  ============
                    start_positions = []
                    end_positions = []
 
                    answer = qa["answer"] 
                    # print("idx: ", idx, " qa['id']: ", qa['id'], " answer: ", answer)
                    if answer == '':
                        q_type = -1
                        start_positions.append(-1)   
                        end_positions.append(-1)           
                    
                    elif answer == 'yes':
                        q_type = 1
                        start_positions.append(len(tokens)-3)   
                        end_positions.append(len(tokens)-3) 
                    elif answer == 'no':
                        q_type = 2
                        start_positions.append(len(tokens)-2)   
                        end_positions.append(len(tokens)-2)  
                    else:
                        # keep all the occurences of answer in the context 
#                         for m in re.finditer("\s?".join(answer.split()), context):   # "\s?".join(answer.split()) in order to match even with extra space in answer or context
                        for m in re.finditer(_normalize_text(answer), context, re.IGNORECASE):
                            answer_start, answer_end = m.span() 
                            start_position, end_position = map_answer_positions(char_to_word_offset, orig_to_tok_index, answer_start, answer_end, slice_start, slice_end, doc_offset)
                            if(start_position != -1):
                                start_positions.append(start_position)   
                                end_positions.append(end_position)
                            
                        if(len(start_positions) > 0): 
                            q_type = 0
                        else: # answer not found in context
                            q_type = -1
                            start_positions.append(-1)   
                            end_positions.append(-1) 


                    # answers from start_positions and end_positions if > self.max_num_answers
                    start_positions = start_positions[:self.max_num_answers]
                    end_positions = end_positions[:self.max_num_answers]

                    # -1 padding up to self.max_num_answers
                    padding_len = self.max_num_answers - len(start_positions)
                    start_positions.extend([-1] * padding_len)
                    end_positions.extend([-1] * padding_len)

                    # replace duplicate start/end positions with `-1` because duplicates can result into -ve loss values
                    found_start_positions = set()
                    found_end_positions = set()
                    for i, (start_position, end_position) in enumerate(zip(start_positions, end_positions)):
                        
                        if start_position in found_start_positions:
                            start_positions[i] = -1
                        if end_position in found_end_positions:
                            end_positions[i] = -1
                        found_start_positions.add(start_position)
                        found_end_positions.add(end_position)
                    
                                         
                    if self.doc_stride >= 0:  # no need to pad if document is not strided
                        # Zero-pad up to the sequence length.
                        padding_len = self.max_seq_len - len(input_ids)
                        input_ids.extend([self.tokenizer.pad_token_id] * padding_len)
                        input_mask.extend([0] * padding_len)
                        segment_ids.extend([0] * padding_len)
                        
                        print("self.doc_stride >= 0")
                        assert len(input_ids) == self.max_seq_len
                        assert len(input_mask) == self.max_seq_len
                        assert len(segment_ids) == self.max_seq_len  
                        
                    input_ids_list.append(input_ids)
                    input_mask_list.append(input_mask)
                    segment_ids_list.append(segment_ids)
                    start_positions_list.append(start_positions)
                    end_positions_list.append(end_positions)
                    q_type_list.append(q_type)
                    
                tensors_list.append((torch.tensor(input_ids_list), torch.tensor(input_mask_list), torch.tensor(segment_ids_list),
                                     torch.tensor(start_positions_list), torch.tensor(end_positions_list), torch.tensor(q_type_list),
                                     torch.tensor([sp_sent_list]),  torch.tensor([sp_para_list]),
                                     qa['id'], answer))     
        return tensors_list



##### collate_one_doc_and_lists

In [10]:
    %%add_to hotpotqaDataset
    @staticmethod
    def collate_one_doc_and_lists(batch):
        num_metadata_fields = 2  # qid and answer  
        fields = [x for x in zip(*batch)]
        stacked_fields = [torch.stack(field) for field in fields[:-num_metadata_fields]]  # don't stack metadata fields
        stacked_fields.extend(fields[-num_metadata_fields:])  # add them as lists not torch tensors

        # always use batch_size=1 where each batch is one document
        # will use grad_accum to increase effective batch size
        assert len(batch) == 1
        fields_with_batch_size_one = [f[0] for f in stacked_fields]
        return fields_with_batch_size_one


##### class info

In [11]:
dir(hotpotqaDataset)

['__add__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'collate_one_doc_and_lists',
 'one_example_to_tensors']

In [12]:
from inspect import getmembers
getmembers(hotpotqaDataset)

[('__add__', <function torch.utils.data.dataset.Dataset.__add__(self, other)>),
 ('__class__', type),
 ('__delattr__', <slot wrapper '__delattr__' of 'object' objects>),
 ('__dict__',
  mappingproxy({'__module__': '__main__',
                '__doc__': '\n    Largely based on\n    https://github.com/allenai/allennlp/blob/master/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py\n    and\n    https://github.com/huggingface/transformers/blob/master/examples/run_squad.py\n    ',
                '__init__': <function __main__.hotpotqaDataset.__init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride, max_num_answers, ignore_seq_with_no_answers, max_question_len)>,
                '__len__': <function __main__.hotpotqaDataset.__len__(self)>,
                '__getitem__': <function __main__.hotpotqaDataset.__getitem__(self, idx)>,
                'one_example_to_tensors': <function __main__.one_example_to_tensors(self, example, idx)>,
                'collate_one

In [13]:
from inspect import isfunction
functions_list = [o for o in getmembers(hotpotqaDataset) if isfunction(o[1])]
functions_list

[('__add__', <function torch.utils.data.dataset.Dataset.__add__(self, other)>),
 ('__getitem__', <function __main__.hotpotqaDataset.__getitem__(self, idx)>),
 ('__init__',
  <function __main__.hotpotqaDataset.__init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride, max_num_answers, ignore_seq_with_no_answers, max_question_len)>),
 ('__len__', <function __main__.hotpotqaDataset.__len__(self)>),
 ('collate_one_doc_and_lists',
  <function __main__.collate_one_doc_and_lists(batch)>),
 ('one_example_to_tensors',
  <function __main__.one_example_to_tensors(self, example, idx)>)]

In [14]:
import inspect
inspect.getmro(hotpotqaDataset)  # a hierarchy of classes 

(__main__.hotpotqaDataset, torch.utils.data.dataset.Dataset, object)

In [15]:
inspect.getfullargspec(hotpotqaDataset.one_example_to_tensors)

FullArgSpec(args=['self', 'example', 'idx'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={})

In [16]:
help(hotpotqaDataset)

Help on class hotpotqaDataset in module __main__:

class hotpotqaDataset(torch.utils.data.dataset.Dataset)
 |  Largely based on
 |  https://github.com/allenai/allennlp/blob/master/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py
 |  and
 |  https://github.com/huggingface/transformers/blob/master/examples/run_squad.py
 |  
 |  Method resolution order:
 |      hotpotqaDataset
 |      torch.utils.data.dataset.Dataset
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(self, idx)
 |  
 |  __init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride, max_num_answers, ignore_seq_with_no_answers, max_question_len)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __len__(self)
 |  
 |  one_example_to_tensors(self, example, idx)
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  collate_one_doc_and_lists(batch)
 |  
 |  -----------------------

#### class hotpotqa

##### \_\_init\_\_,  forward, dataloaders

In [17]:
class hotpotqa(pl.LightningModule):
    def __init__(self, args):
        super(hotpotqa, self).__init__()
        self.args = args
        self.hparams = args
 
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        num_new_tokens = self.tokenizer.add_special_tokens({"additional_special_tokens": [TITLE_START, TITLE_END, SENT_MARKER_END, QUESTION_START , QUESTION_END, PAR]})
#         print(self.tokenizer.all_special_tokens)
        self.tokenizer.model_max_length = self.args.max_seq_len
        self.model_1 = self.load_model()
        self.model_1.resize_token_embeddings(len(self.tokenizer))
  
        self.fnn_sp_para_1 = torch.nn.Sequential(
          torch.nn.Linear(self.model_1.config.hidden_size, self.model_1.config.hidden_size), 
          torch.nn.GELU(),
          torch.nn.Linear(self.model_1.config.hidden_size, 1),      # score for 'yes', while 0 for 'no'
        )
         
        self.model_2 = self.load_model()
        self.model_2.resize_token_embeddings(len(self.tokenizer))
        
        self.num_labels = 2
        self.qa_outputs = torch.nn.Linear(self.model_2.config.hidden_size, self.num_labels)
         
        self.linear_type = torch.nn.Linear(self.model_2.config.hidden_size, 3)   #  question type (yes/no/span) classification 
       
        self.fnn_sp_sent = torch.nn.Sequential(
          torch.nn.Linear(self.model_2.config.hidden_size, self.model_2.config.hidden_size), 
          torch.nn.GELU(),
          torch.nn.Linear(self.model_2.config.hidden_size, 1),      # score for 'yes', while 0 for 'no'
        )
        
        self.fnn_sp_para_2 = torch.nn.Sequential(
          torch.nn.Linear(self.model_2.config.hidden_size, self.model_2.config.hidden_size), 
          torch.nn.GELU(),
          torch.nn.Linear(self.model_2.config.hidden_size, 1),      # score for 'yes', while 0 for 'no'
        )
        
        self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None
    
    def load_model(self):
        
        # config = LongformerConfig.from_pretrained(self.args.model_path) 
        # config.attention_mode = self.args.attention_mode
        # model = Longformer.from_pretrained(self.args.model_path, config=config)
        
        if 'longformer' in self.args.model_path:
            model = Longformer.from_pretrained(self.args.model_path) 

            for layer in model.encoder.layer:
                layer.attention.self.attention_mode = self.args.attention_mode
                self.args.attention_window = layer.attention.self.attention_window
        else:
            model = AutoModel.from_pretrained(self.args.model_path)
            
        print("Loaded model with config:")
        print(model.config)

        for p in model.parameters():
            p.requires_grad_(True)
        model.train()
        return model

#%%add_to hotpotqa    # does not seems to work for the @pl.data_loader decorator, missing which causes error "validation_step() takes 3 positional arguments but 4 were given"    
###################################################### dataloaders ########################################################### 
 
    @pl.data_loader
    def train_dataloader(self):
        if self.train_dataloader_object is not None:
            return self.train_dataloader_object
        dataset = hotpotqaDataset(file_path=self.args.train_dataset, tokenizer=self.tokenizer,
                                  max_seq_len=self.args.max_seq_len, max_doc_len=self.args.max_doc_len,
                                  doc_stride=self.args.doc_stride,
                                  max_num_answers=self.args.max_num_answers,
                                  max_question_len=self.args.max_question_len,
                                  ignore_seq_with_no_answers=self.args.ignore_seq_with_no_answers)
        sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=True) if self.trainer.use_ddp else None
        dl = DataLoader(dataset, batch_size=1, shuffle=(sampler is None),
                        num_workers=self.args.num_workers, sampler=sampler,
                        collate_fn=hotpotqaDataset.collate_one_doc_and_lists)
        self.train_dataloader_object = dl
        return self.train_dataloader_object

    @pl.data_loader
    def val_dataloader(self):
        if self.val_dataloader_object is not None:
            return self.val_dataloader_object
        dataset = hotpotqaDataset(file_path=self.args.dev_dataset, tokenizer=self.tokenizer,
                                  max_seq_len=self.args.max_seq_len, max_doc_len=self.args.max_doc_len,
                                  doc_stride=self.args.doc_stride,
                                  max_num_answers=self.args.max_num_answers,
                                  max_question_len=self.args.max_question_len,
                                  ignore_seq_with_no_answers=False)  # evaluation data should keep all examples
        sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=False) if self.trainer.use_ddp else None
        dl = DataLoader(dataset, batch_size=1, shuffle=False,
                        num_workers=self.args.num_workers, sampler=sampler,
                        collate_fn=hotpotqaDataset.collate_one_doc_and_lists)
        self.val_dataloader_object = dl
        return self.val_dataloader_object

    @pl.data_loader
    def test_dataloader(self):
        if self.test_dataloader_object is not None:
            return self.test_dataloader_object
        dataset = hotpotqaDataset(file_path=self.args.dev_dataset, tokenizer=self.tokenizer,
                                  max_seq_len=self.args.max_seq_len, max_doc_len=self.args.max_doc_len,
                                  doc_stride=self.args.doc_stride,
                                  max_num_answers=self.args.max_num_answers,
                                  max_question_len=self.args.max_question_len,
                                  ignore_seq_with_no_answers=False)  # evaluation data should keep all examples
        dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=False) if self.trainer.use_ddp else None
        dl = DataLoader(dataset, batch_size=1, shuffle=False,
                        num_workers=self.args.num_workers, sampler=dist_sampler,
                        collate_fn=hotpotqaDataset.collate_one_doc_and_lists)
        self.test_dataloader_object = dl
        return self.test_dataloader_object


#%%add_to hotpotqa  
    def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_positions, q_type, sp_sent, sp_para):

        if(input_ids.size(0) > 1):
            print("multi rows per document")
        assert(input_ids.size(0)==1)
        # Each batch is one document, and each row of the batch is a chunck of the document.    ????
        # Make sure all rows have the same question length.
        
        ########################################################## stage 1 ############################################################################
        # local attention everywhere
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
        
        # global attention for the cls and all question tokens
        question_end_index = self._get_special_index(input_ids, [QUESTION_END])  
        attention_mask[:,:question_end_index[0].item()+1] = 2  # from <cls> until </q> 
        
        # global attention for the sentence and paragraph special tokens  
        sent_indexes = self._get_special_index(input_ids, [SENT_MARKER_END])
        attention_mask[:, sent_indexes] = 2
        
        para_indexes = self._get_special_index(input_ids, [TITLE_START])
        attention_mask[:, para_indexes] = 2       
         

        # sliding_chunks implemenation of selfattention requires that seqlen is multiple of window size
        input_ids, attention_mask = pad_to_window_size(
            input_ids, attention_mask, self.args.attention_window, self.tokenizer.pad_token_id)

        sequence_output = self.model_1(
                input_ids,
                attention_mask=attention_mask)[0]
#         print("size of sequence_output: " + str(sequence_output.size()))

        # The pretrained hotpotqa model wasn't trained with padding, so remove padding tokens
        # before computing loss and decoding.
        padding_len = input_ids[0].eq(self.tokenizer.pad_token_id).sum()
        if padding_len > 0:
            sequence_output = sequence_output[:, :-padding_len]  
            
        ### supporting paragraph classification ###  
        sp_para_output = sequence_output[:,para_indexes,:]  
        sp_para_output_t = self.fnn_sp_para_1(sp_para_output)     
            
        #  keep up to 5 paragraphs whose raw score is higher than a pre-specified threshold (-3.0)  
        print('sp_para_output_t: ', sp_para_output_t)
        para_stage2_indices = sp_para_output_t.flatten().topk(k=min(5, sp_para_output_t.numel()), dim=-1).indices  
   
        i = para_stage2_indices.numel() - 1   # the lowest score
        while i >= 0:
            if(sp_para_output_t.flatten()[para_stage2_indices[i]] > -3.0):
                para_stage2_indices = para_stage2_indices[:i+1]  # slice at the last score that is > -3.0
                break
            else:
                i -= 1
        if(i == -1):
            para_stage2_indices = sp_para_output_t.flatten().topk(k=min(2, sp_para_output_t.numel()), dim=-1).indices
        para_stage2_indices = para_stage2_indices.sort().values
        # relocate sp     
#         print('sp_para: ', sp_para)
        sp_para_stage2 = sp_para[:,para_stage2_indices]
#         print('sp_para_stage2: ', sp_para_stage2)
        
        sp_para_map = dict(zip(range(para_stage2_indices.numel()), para_stage2_indices.tolist())) # used to map back to sp_para in decode 
         
#         print('sent_indexes: ', sent_indexes)
#         print('para_indexes: ', para_indexes)
#         print('sp_sent: ', sp_sent)
        
        s_to_p_map = []   
        for s in sent_indexes:
            s_to_p = torch.where(torch.le(para_indexes, s))[0][-1]     # last para_index smaller or equal to s
            s_to_p_map.append(s_to_p.item())   # [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7, 7, 7, 8, 9]
        sp_sent_stage2 = []
        sp_sent_map = {}
        for idx, s_to_p in enumerate(s_to_p_map):
            if(s_to_p in para_stage2_indices):
                sp_sent_map[len(sp_sent_stage2)] = idx
                sp_sent_stage2.append(sp_sent[0][idx].item())   
                
        sp_sent_stage2 = torch.tensor([sp_sent_stage2]).type_as(sp_para_stage2) 
        
        # input_stage2_ids
        idx_to_get = list(range(para_indexes[0]))    # before the first para: question 
        for idx, p_i in enumerate(para_stage2_indices):
            if(p_i < para_indexes.numel()-1):
                idx_to_get.extend(range(para_indexes[p_i], para_indexes[p_i+1]))    # between the para begin to next para begin 
            else:
                idx_to_get.extend(range(para_indexes[p_i], input_ids.size(1)))     # last para
        input_stage2_ids = input_ids[:,idx_to_get]
         
        # reposition answer  
#         print('start_positions: ', start_positions)
#         print('idx_to_get: ', idx_to_get)
        for i , start_pos in enumerate(start_positions[0]):
            if(start_pos != -1):   
                if(start_pos in idx_to_get):
                    start_positions[0][i] = idx_to_get.index(start_pos)
                else:
                    start_positions[0][i] = -1
        for i , end_pos in enumerate(end_positions[0]):
            if(end_pos != -1):   
                if(end_pos in idx_to_get):
                    end_positions[0][i] = idx_to_get.index(end_pos)
                else:
                    end_positions[0][i] = -1
        

                
            
        ########################################################## stage 2 ############################################################################    
        # local attention everywhere
        attention_mask = torch.ones(input_stage2_ids.shape, dtype=torch.long, device=input_stage2_ids.device)
        
        # global attention for the cls and all question tokens
        if(question_end_index != self._get_special_index(input_stage2_ids, [QUESTION_END]) ):
            print("suppose to be same as stage 1's question_end_index")
            assert(question_end_index == self._get_special_index(input_stage2_ids, [QUESTION_END]))
        question_end_index = self._get_special_index(input_stage2_ids, [QUESTION_END])  
        attention_mask[:,:question_end_index[0].item()+1] = 2  # from <cls> until </q> 
        
        # global attention for the sentence and paragraph special tokens  
        sent_indexes = self._get_special_index(input_stage2_ids, [SENT_MARKER_END])
        attention_mask[:, sent_indexes] = 2
        
        para_indexes = self._get_special_index(input_stage2_ids, [TITLE_START])
        attention_mask[:, para_indexes] = 2       
         

        # sliding_chunks implemenation of selfattention requires that seqlen is multiple of window size
        input_stage2_ids, attention_mask = pad_to_window_size(
            input_stage2_ids, attention_mask, self.args.attention_window, self.tokenizer.pad_token_id)

        sequence_output = self.model_2(
                input_stage2_ids,
                attention_mask=attention_mask)[0]            
            
        # The pretrained hotpotqa model wasn't trained with padding, so remove padding tokens
        # before computing loss and decoding.
        padding_len = input_stage2_ids[0].eq(self.tokenizer.pad_token_id).sum()
        if padding_len > 0:
            sequence_output = sequence_output[:, :-padding_len]              
        
        ################### layers on top of sequence_output ############## 
        
        ### 1. answer start and end positions classification ###   
        logits = self.qa_outputs(sequence_output) 
        start_logits, end_logits = logits.split(1, dim=-1) 
        start_logits = start_logits.squeeze(-1) 
        end_logits = end_logits.squeeze(-1)
 
        ### 2. type classification, similar as class LongformerClassificationHead(nn.Module) https://huggingface.co/transformers/_modules/transformers/modeling_longformer.html#LongformerForSequenceClassification.forward ### 
        type_logits = self.linear_type(sequence_output[:,0]) 
        
        ### 3. supporting paragraph classification ###  
        sp_para_output = sequence_output[:,para_indexes,:]  
        sp_para_output_t = self.fnn_sp_para_2(sp_para_output) 

         # linear_sp_sent generates a single score for each sentence, instead of 2 scores for yes and no.   
        # Argument the score with additional score=0. The same way did in the HOTPOTqa paper
        sp_para_output_aux = torch.zeros(sp_para_output_t.shape, dtype=torch.float, device=sp_para_output_t.device) 
        predict_support_para = torch.cat([sp_para_output_aux, sp_para_output_t], dim=-1).contiguous() 
 
        ### 4. supporting fact classification ###     
        # the first sentence in a paragraph is leading by <p>, other sentences are leading by <s>
 
        sp_sent_output = sequence_output[:,sent_indexes,:]  
        sp_sent_output_t = self.fnn_sp_sent(sp_sent_output)     
        sp_sent_output_aux = torch.zeros(sp_sent_output_t.shape, dtype=torch.float, device=sp_sent_output_t.device) 
        predict_support_sent = torch.cat([sp_sent_output_aux, sp_sent_output_t], dim=-1).contiguous() 
        
        outputs = (start_logits, end_logits, type_logits, sp_para_output_t, sp_sent_output_t, input_stage2_ids, sp_para_map, sp_sent_map)  
        answer_loss, type_loss, sp_para_loss, sp_sent_loss  = self.loss_computation(start_positions, end_positions, start_logits, end_logits, q_type, type_logits, sp_para_stage2, predict_support_para, sp_sent_stage2, predict_support_sent)
 
        outputs = (answer_loss, type_loss, sp_para_loss, sp_sent_loss,) + outputs    
        return outputs
    
    def loss_computation(self, start_positions, end_positions, start_logits, end_logits, q_type, type_logits, sp_para, predict_support_para, sp_sent, predict_support_sent):
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)

            if not self.args.regular_softmax_loss:
                # loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf
                # NOTE: this returns sum of losses, not mean, so loss won't be normalized across different batch sizes
                # but batch size is always 1, so this is not a problem
                start_loss = self.or_softmax_cross_entropy_loss_one_doc(start_logits, start_positions, ignore_index=-1)
                end_loss = self.or_softmax_cross_entropy_loss_one_doc(end_logits, end_positions, ignore_index=-1)
            else: 
                start_positions = start_positions[:, 0:1]   # only use the top1 start_position considering only one appearance of the answer string
                end_positions = end_positions[:, 0:1]
                start_loss = crossentropy(start_logits, start_positions[:, 0])
                end_loss = crossentropy(end_logits, end_positions[:, 0])
                
 
            crossentropy = torch.nn.CrossEntropyLoss(ignore_index=-1)
            type_loss = crossentropy(type_logits, q_type)  
            
            crossentropy_average = torch.nn.CrossEntropyLoss(reduction = 'mean', ignore_index=-1)   
            if(sp_para.view(-1).size(0) > 0):
                sp_para_loss = crossentropy_average(predict_support_para.view(-1, 2), sp_para.view(-1))
            else:
                sp_para_loss = torch.tensor(0).type_as(type_loss)  # when raw_context is []
                
            if(sp_sent.view(-1).size(0) > 0):    
                print('predict_support_sent.view(-1, 2).size(): ', predict_support_sent.view(-1, 2).size())
                print('sp_sent.view(-1).size(): ', sp_sent.view(-1).size())
                print('predict_support_sent.view(-1, 2): ', predict_support_sent.view(-1, 2))
                print('sp_sent.view(-1): ', sp_sent.view(-1))
                
                sp_sent_loss = crossentropy_average(predict_support_sent.view(-1, 2), sp_sent.view(-1))      
            else:
                sp_sent_loss = torch.tensor(0).type_as(type_loss)  # when raw_context is []
 
            answer_loss = (start_loss + end_loss) / 2 
        return answer_loss, type_loss, sp_para_loss, sp_sent_loss  


#     %%add_to hotpotqa    
    def _get_special_index(self, input_ids, special_tokens):
        assert(input_ids.size(0)==1) 
        mask = input_ids != input_ids # initilaize 
        for special_token in special_tokens:
            mask = torch.logical_or(mask, input_ids.eq(self.tokenizer.convert_tokens_to_ids(special_token))) 
 
        token_indices = torch.nonzero(mask, as_tuple=False)        
         
 
        return token_indices[:,1]    

    def or_softmax_cross_entropy_loss_one_doc(self, logits, target, ignore_index=-1, dim=-1):
        """loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf"""
        assert logits.ndim == 2
        assert target.ndim == 2
        assert logits.size(0) == target.size(0) 
        
        # with regular CrossEntropyLoss, the numerator is only one of the logits specified by the target, considing only one correct target 
        # here, the numerator is the sum of a few potential targets, where some of them is the correct answer, considing more correct targets

        # target are indexes of tokens, padded with ignore_index=-1
        # logits are scores (one for each label) for each token
 
        # compute a target mask
        target_mask = target == ignore_index
        # replaces ignore_index with 0, so `gather` will select logit at index 0 for the masked targets
        masked_target = target * (1 - target_mask.long())                 # replace all -1 in target with 0， tensor([[447,   0,   0,   0, ...]])
    
        # gather logits
        gathered_logits = logits.gather(dim=dim, index=masked_target)     # tensor([[0.4382, 0.2340, 0.2340, 0.2340 ... ]]), padding logits are all replaced by logits[0] 
 
        # Apply the mask to gathered_logits. Use a mask of -inf because exp(-inf) = 0
        gathered_logits[target_mask] = float('-inf')                      # padding logits are all replaced by -inf
 
        # each batch is one example
        gathered_logits = gathered_logits.view(1, -1)
        logits = logits.view(1, -1)
 
        # numerator = log(sum(exp(gathered logits)))
        log_score = torch.logsumexp(gathered_logits, dim=dim, keepdim=False)
 
        log_norm = torch.logsumexp(logits, dim=dim, keepdim=False)
        
        # compute the loss
        loss = -(log_score - log_norm) 
        
        # some of the examples might have a loss of `inf` when `target` is all `ignore_index`: when computing start_loss and end_loss for question with the gold answer of yes/no 
        # when `target` is all `ignore_index`, loss is 0 
        loss = loss[~torch.isinf(loss)].sum()
#         loss = torch.tanh(loss)
#         print("final loss: " + str(loss)) 
        return loss  


In [18]:
# debug
# input_ids = torch.tensor([[-1, 5, -1, 2]])
# input_ids.size(0)
# token_indices =  torch.nonzero(input_ids == torch.tensor(-1))[:,1]
# # token_indices
# # token_indices.item()
# # indices =  torch.LongTensor([[2],[0,2]])

# # torch.gather(input_ids, 1, token_indices.unsqueeze(0))
# # p_index = token_indices.view(input_ids.size(0), -1)[:,1::2]   
# # attention_mask = torch.ones(input_ids.shape, dtype=torch.long) 
# # attention_mask[:,token_indices] = 2
# # attention_mask
# p_index = torch.tensor([1, 3, 4])
# s_index = torch.tensor([1,3,6])
# torch.sort(torch.cat((s_index, p_index)))[0]
# attention_mask.view(-1)[ p_index.view(-1), :].view(attention_mask.size(0), -1)
# # for pi in p_index[0]:
# #     attention_mask[:, pi] = 2
# # attention_mask
# # s_index = torch.tensor([[1,3]])
# # torch.sort(torch.cat((p_index, s_index), -1), -1)

# sequence_output  = torch.tensor([[[-1, 5, -1, 2],
#                                  [-2, 27, 2, 9],
#                                  [3, 6, 1, 65],
#                                  [52, 36, 13, 2],
#                                  [73, 26, 1, 7]
#                                 ]])

# sp_para_output_t   = torch.tensor([[[-1],
#                                  [-2 ],
#                                  [3],
#                                  [52],
#                                  [73]
#                                 ]])
# torch.zeros(sp_para_output_t.shape, dtype=torch.float) 

# print("size of sequence_output: " + str(sequence_output.size()))
# # print("size of p_index.unsqueeze(0).unsqueeze(-1): " + str(p_index.unsqueeze(0).size()))
# sequence_output[:,p_index,:]
# b = torch.tensor([0, 1, 2, 3])
# p_index.unsqueeze(-1) * b

# input_ids = torch.tensor([[0.2, 0.0, 0.6, 0.6], [0.2, 0.6, 0.0, 0.0]]) 
# # input_ids.tolist()
# p_index =  torch.nonzero(input_ids == torch.tensor(0.2))
# print(p_index)
# s_index =  torch.nonzero(input_ids == torch.tensor(0.6))
# print(s_index)

# sp_sent = torch.tensor([[0, 1, 1, 0]])
# torch.nonzero(sp_sent, as_tuple=True)[1]
# cat_index = torch.tensor([])
# cat_index = torch.cat((cat_index, ids[0][1]))
# print(ids)
# print(cat_index)
# p_index[p_index[:,0] == 0]

# cat_index[cat_index[:,0].argsort()]

# sorted(torch.cat((p_index, s_index)), key = lambda x: x[0])
# torch.sort(torch.cat((p_index, s_index)), 0)[0]
# for cor in token_indices:
#     attention_mask[cor[0].item()][cor[1].item()] = 2
# attention_mask 
# input_ids = torch.tensor([[-1, 5, -6, 2]])
# print(input_ids.size())
# input_ids.topk(k=2, dim=-1).indices

# predict_type = torch.tensor([[-0.0925, -0.0999, -0.1671]])
# p_type = torch.argmax(predict_type, dim=1).item()
# p_type_score = torch.max(predict_type, dim=1)[0].item()
# print("predict_type: ", predict_type)
# print("p_type: ", p_type)
# print("p_type_score: ", p_type_score)
    
# a = torch.tensor([[0.9213,  1.0887, -0.8858, -1.7683]])
# a.view(-1).size() 
# print(torch.sigmoid(a))
# a = torch.tensor([ 9.213,  1.0887, -0.8858, 7683])
# print(torch.sigmoid(a))

# a = torch.tensor([[[1],[2],[4],[-1],[-1]]])
# a= a.squeeze(-1)
# a.size() 
# a[:, torch.where(a!=-1)[1]]
# m = torch.nn.Sigmoid()
# print("m: ", m)
# loss = torch.nn.BCELoss()
# # input = torch.randn(3, requires_grad=True)
# # print("input: ", input)
# # target = torch.empty(3).random_(2)
# # print("target: ", target)
# # output = loss(m(input), target)
# # print("output: ", output)

# input = torch.tensor([1.0293, -0.1585,  1.1408], requires_grad=True)
# print("input: ", input)
# print("Sigmoid(input): ", m(input))
# target = torch.tensor([0., 1., 0.])
# print("target: ", target)
# output = loss(m(input), target)
# print("output: ", output)

# input = torch.tensor([[1.0293, -0.1585,  1.1408]], requires_grad=True)
# print("input: ", input)
# target = torch.tensor([[0., 1., 0.]])
# print("target: ", target)
# output = loss(m(input), target)
# print("output: ", output)

# 1.1761 * 3
# soft_input = torch.nn.Softmax(dim=-1)
# log_soft_input = torch.log(soft_input(input))
# loss=torch.nn.NLLLoss() 
# loss(log_soft_input, target)
# input = torch.log(soft_input(input))
# loss=torch.nn.NLLLoss()
# loss(input,target)

# loss =torch.nn.CrossEntropyLoss()
# loss(input,target) 

# sp_sent_logits =torch.tensor([[[0.0988],
#          [0.0319],
#          [0.0314]]])
# sp_sent_logits.squeeze()

# input_ids = torch.tensor([[0.6, 0.0, 0.6, 0.0]]) 
# token_indices =  torch.nonzero(input_ids == torch.tensor(0.6))
# token_indices[:,1][0].item()

# def or_softmax_cross_entropy_loss_one_doc(logits, target, ignore_index=-1, dim=-1):
#     """loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf"""
#     assert logits.ndim == 2
#     assert target.ndim == 2
#     assert logits.size(0) == target.size(0) 

#     # with regular CrossEntropyLoss, the numerator is only one of the logits specified by the target, considing only one correct target 
#     # here, the numerator is the sum of a few potential targets, where some of them is the correct answer, considing more correct targets

#     # target are indexes of tokens, padded with ignore_index=-1
#     # logits are scores (one for each label) for each token
# #         print("or_softmax_cross_entropy_loss_one_doc" ) 
# #         print("size of logits: " + str(logits.size()))                    # torch.Size([1, 746]), 746 is number of all tokens 
# #         print("size of target: " + str(target.size()))                    # torch.Size([1, 64]),  -1 padded
#     print("target: " + str(target)) 

#     # compute a target mask
#     target_mask = target == ignore_index
#     # replaces ignore_index with 0, so `gather` will select logit at index 0 for the masked targets
#     masked_target = target * (1 - target_mask.long())                 # replace all -1 in target with 0， tensor([[447,   0,   0,   0, ...]])
#     print("masked_target: " + str(masked_target))     
#     # gather logits
#     gathered_logits = logits.gather(dim=dim, index=masked_target)     # tensor([[0.4382, 0.2340, 0.2340, 0.2340 ... ]]), padding logits are all replaced by logits[0] 
# #         print("size of gathered_logits: " + str(gathered_logits.size()))  # torch.Size([1, 64])
#     print("gathered_logits: " + str(gathered_logits)) 
#     # Apply the mask to gathered_logits. Use a mask of -inf because exp(-inf) = 0
#     gathered_logits[target_mask] = float('-inf')                      # padding logits are all replaced by -inf
#     print("gathered_logits after -inf: " + str(gathered_logits))      # tensor([[0.4382,   -inf,   -inf,   -inf,   -inf,...]])

#     # each batch is one example
#     gathered_logits = gathered_logits.view(1, -1)
#     logits = logits.view(1, -1)
# #         print("size of gathered_logits after view: " + str(gathered_logits.size()))  # torch.Size([1, 64])
# #         print("size of logits after view: " + str(logits.size()))                    # torch.Size([1, 746])　　

#     # numerator = log(sum(exp(gathered logits)))
#     log_score = torch.logsumexp(gathered_logits, dim=dim, keepdim=False)
#     print("log_score: " + str(log_score)) 
#     # denominator = log(sum(exp(logits)))
#     log_norm = torch.logsumexp(logits, dim=dim, keepdim=False)
#     print("log_norm: " + str(log_norm)) 

#     # compute the loss
#     loss = -(log_score - log_norm)
#     print("loss: " + str(loss))


#     # some of the examples might have a loss of `inf` when `target` is all `ignore_index`: when computing start_loss and end_loss for question with the gold answer of yes/no 
#     # replace -inf with 0
#     loss = loss[~torch.isinf(loss)].sum()
#     print("final loss: " + str(loss)) 
#     return loss 

# # input = torch.tensor([[ 0,  0.0780],
# #         [0, 0.9253 ],
# #         [0, 0.0987]])
# # target = torch.tensor([0,1,0])
# # target.size(0) < 1
# # input = torch.tensor([[ 1.1879,  1.0780,  0.5312],
# #         [-0.3499, -1.9253, -1.5725],
# #         [-0.6578, -0.0987,  1.1570]])
# # target=torch.tensor([0,1,2])
# # predict_support_para.view(-1, 2), sp_para.view(-1)
# # input = torch.tensor([[ 1.1879,  1.0780,  0.5312]])
# # target=torch.tensor([0])
# # or_softmax_cross_entropy_loss_one_doc(input, target.unsqueeze(-1))
# # target=torch.tensor([1])
# # or_softmax_cross_entropy_loss_one_doc(input, target.unsqueeze(-1))
# # target=torch.tensor([2])
# # or_softmax_cross_entropy_loss_one_doc(input, target.unsqueeze(-1))
# # target=torch.tensor([-1])
# # or_softmax_cross_entropy_loss_one_doc(input, target.unsqueeze(-1))
# a = torch.tensor([6.4062])    
# b = torch.tensor([2.23])
# torch.cat((a,b))
 
# for a in list_tensor
# from functools import reduce
# reduce(lambda x,y: torch.cat((x,y)), list_tensor[:-1])

# torch.tanh(a)
# # if(torch.isinf(a)):
# #     print("is inf")
# 5 * 1e-2


# import torch
# special_tokens = [1,2]
# input_ids = torch.tensor([[ 1, 0, 2, 1, 0, 2]])

# mask = input_ids != input_ids # initilaize 
# for special_token in special_tokens:
#     mask = torch.logical_or(mask, input_ids.eq(special_token)) 
#     print("mask: ", mask)
# torch.nonzero(mask)    
 

In [19]:
# # debug: check loaded dataset by DataLoader
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# num_new_tokens = tokenizer.add_special_tokens({"additional_special_tokens": ["<p>", "<q>", "</q>"]})
# # # # print(tokenizer.all_special_tokens)    
# # # # print(tokenizer.all_special_ids)     
# # # # tokenizer.convert_tokens_to_ids("<s>")
# # # # tokenizer.sep_token
# print(tokenizer.tokenize("yes"))
# print(tokenizer.tokenize("no"))
# print(tokenizer.tokenize("null"))
# # # all_doc_tokens = []
# # # orig_to_tok_index = []
# # # tok_to_orig_index = []
# # # for (i, token) in enumerate(["<s>", "da", "tell", "<p>", "say"]):
# # #     orig_to_tok_index.append(len(all_doc_tokens))
# # #     sub_tokens = tokenizer.tokenize(f'. {token}')[1:] if i > 0 else tokenizer.tokenize(token)
# # #     for sub_token in sub_tokens:
# # #         tok_to_orig_index.append(i)
# # #         all_doc_tokens.append(sub_token)
# # # all_doc_tokens


 

# dataset = hotpotqaDataset(file_path= args.train_dataset, tokenizer=tokenizer,
#                           max_seq_len= args.max_seq_len, max_doc_len= args.max_doc_len,
#                           doc_stride= args.doc_stride,
#                           max_num_answers= args.max_num_answers,
#                           max_question_len= args.max_question_len,
#                           ignore_seq_with_no_answers= args.ignore_seq_with_no_answers)
# print(len(dataset))

# # # dl = DataLoader(dataset, batch_size=1, shuffle=None,
# # #                     num_workers=args.num_workers, sampler=None,
# # #                     collate_fn=hotpotqaDataset.collate_one_doc_and_lists)

# example = dataset[3]  
# [input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qids] = example
 

# print(input_ids[0][:20].tolist())
# print(input_mask) 
# print(segment_ids) 
# print(subword_starts) 
# print(subword_ends)
# print(q_type)
# print(sp_sent) 
# print(sp_para) 
# print(qids)
# print(tokenizer.convert_ids_to_tokens(input_ids[0][667:669+1].tolist()))
# 0.0033 * 90447 
# 28*4
# torch.tensor(0.0)

##### configure_ddp

In [20]:
 %%add_to hotpotqa
 # A hook to overwrite to define your own DDP(DistributedDataParallel) implementation init. 
 # The only requirement is that: 
 # 1. On a validation batch the call goes to model.validation_step.
 # 2. On a training batch the call goes to model.training_step.
 # 3. On a testing batch, the call goes to model.test_step
 def configure_ddp(self, model, device_ids):
    model = LightningDistributedDataParallel(
        model,
        device_ids=device_ids,
        find_unused_parameters=True
    )
    return model

##### **configure_optimizers**

In [21]:
%%add_to hotpotqa
def configure_optimizers(self):
    # Set up optimizers and (optionally) learning rate schedulers
    def lr_lambda(current_step):
        if current_step < self.args.warmup:
            return float(current_step) / float(max(1, self.args.warmup))
        return max(0.0, float(self.args.steps - current_step) / float(max(1, self.args.steps - self.args.warmup)))

    optimizer = torch.optim.Adam(self.parameters(), lr=self.args.lr)

    scheduler = LambdaLR(optimizer, lr_lambda, last_epoch=-1)
    return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

##### **training_step**

In [22]:
%%add_to hotpotqa
def training_step(self, batch, batch_nb):
    # do the forward pass and calculate the loss for a batch 
    input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qid, answer = batch 
    # print("size of input_ids: " + str(input_ids.size())) 
    output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para)
    answer_loss, type_loss, sp_para_loss, sp_sent_loss  = output[:4]
    # print("answer_loss: ", answer_loss)
    # print("type_loss: ", type_loss)
    # print("sp_para_loss: ", sp_para_loss)
    # print("sp_sent_loss: ", sp_sent_loss)

#     loss  = answer_loss +  type_loss + sp_para_loss + sp_sent_loss
    loss = answer_loss + 5*type_loss + 10*sp_para_loss + 10*sp_sent_loss
#     print("weighted loss: ", loss)
#     print("self.trainer.optimizers[0].param_groups[0]['lr']: ", self.trainer.optimizers[0].param_groups[0]['lr'])
    lr = loss.new_zeros(1) + self.trainer.optimizers[0].param_groups[0]['lr']  # loss.new_zeros(1) is tensor([0.]), converting 'lr' to tensor' by adding it.  

    tensorboard_logs = {'loss': loss, 'train_answer_loss': answer_loss, 'train_type_loss': type_loss, 
                        'train_sp_para_loss': sp_para_loss, 'train_sp_sent_loss': sp_sent_loss, 
                        'lr': lr #,
                        # 'mem': torch.tensor(torch.cuda.memory_allocated(input_ids.device) / 1024 ** 3).type_as(loss) 
    }
    return tensorboard_logs

##### training_end

In [23]:
# %%add_to hotpotqa
    # # the function is called for each batch after every epoch is completed
    # def training_end(self, output): 
    #     # print("training_end at epoch: ", self.current_epoch)
    # #     print("len(outputs): ",len(outputs))
    # #     print("output: ",output)
    
    #     # one batch only has one example
    #     avg_loss = output['loss']    
    #     avg_answer_loss = output['train_answer_loss']  
    #     avg_type_loss = output['train_type_loss']    
    #     avg_sp_para_loss = output['train_sp_para_loss']   
    #     avg_sp_sent_loss = output['train_sp_sent_loss'] 
    #     avg_lr = output['lr']      
         
     
    #     if self.trainer.use_ddp:
    #         torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_answer_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_answer_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_type_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_type_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_sp_para_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_sp_para_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_sp_sent_loss, op=torch.distributed.ReduceOp.SUM)
    #         avg_sp_sent_loss /= self.trainer.world_size 
    #         torch.distributed.all_reduce(avg_lr, op=torch.distributed.ReduceOp.SUM)
    #         avg_lr /= self.trainer.world_size 
            
     
    #     tensorboard_logs = { #'avg_train_loss': avg_loss, 
    #             'avg_train_answer_loss': avg_answer_loss, 'avg_train_type_loss': avg_type_loss, 'avg_train_sp_para_loss': avg_sp_para_loss, 'avg_train_sp_sent_loss': avg_sp_sent_loss, 'lr': avg_lr
    #           }
    
    #     return {'loss': avg_loss, 'log': tensorboard_logs}


##### validation_step

In [24]:
%%add_to hotpotqa
# When the validation_step is called, the model has been put in eval mode and PyTorch gradients have been disabled. At the end of validation, model goes back to training mode and gradients are enabled.
def validation_step(self, batch, batch_nb):
    input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qid, answer = batch

    output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para)
    answer_loss, type_loss, sp_para_loss, sp_sent_loss, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output, input_stage2_ids, sp_para_map, sp_sent_map = output 
    loss = answer_loss + 5*type_loss + 10*sp_para_loss + 10*sp_sent_loss

    answers_pred, sp_sent_pred, sp_para_pred = self.decode(input_stage2_ids, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output, sp_para_map, sp_sent_map)


    if(len(answers_pred) != 1):
        print("len(answers_pred) != 1")
        assert(len(answers_pred) == 1)

    pre_answer_score = answers_pred[0]['score']  # (start_logit + end_logit + p_type_score) / 3
    pre_answer = _normalize_text(answers_pred[0]['text'])
#         print("pred answer_score: " + str(pre_answer_score))
#         print("pred answer_text: " + str(pre_answer)) 

    gold_answer = _normalize_text(answer)
    f1, prec, recall = self.f1_score(pre_answer, gold_answer)
    em = self.exact_match_score(pre_answer, gold_answer) 
    f1 = torch.tensor(f1).type_as(loss)
    prec = torch.tensor(prec).type_as(loss)
    recall = torch.tensor(recall).type_as(loss)
    em = torch.tensor(em).type_as(loss)
#         print("f1: " + str(f1))
#         print("prec: " + str(prec))
#         print("recall: " + str(recall))
#         print("em: " + str(em))  

    if(len(sp_sent_pred) > 0):
        sp_sent_em, sp_sent_precision, sp_sent_recall, sp_sent_f1 = self.sp_metrics(sp_sent_pred, torch.where(sp_sent.squeeze())[0].tolist())
        sp_sent_em = torch.tensor(sp_sent_em).type_as(loss)
        sp_sent_precision = torch.tensor(sp_sent_precision).type_as(loss)
        sp_sent_recall = torch.tensor(sp_sent_recall).type_as(loss)
        sp_sent_f1 = torch.tensor(sp_sent_f1).type_as(loss)

#         print("sp_sent_em: " + str(sp_sent_em))
#         print("sp_sent_precision: " + str(sp_sent_precision))
#         print("sp_sent_recall: " + str(sp_sent_recall))    
#         print("sp_sent_f1: " + str(sp_sent_f1))    


        joint_prec = prec * sp_sent_precision
        joint_recall = recall * sp_sent_recall
        if joint_prec + joint_recall > 0:
            joint_f1 = 2 * joint_prec * joint_recall / (joint_prec + joint_recall)
        else:
            joint_f1 = torch.tensor(0.0).type_as(loss)
        joint_em = em * sp_sent_em 

    else:
        sp_sent_em, sp_sent_precision, sp_sent_recall, sp_sent_f1 = torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss)
        joint_em, joint_f1, joint_prec, joint_recall =  torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss)

    if(len(sp_para_pred) > 0): 
        sp_para_em, sp_para_precision, sp_para_recall, sp_para_f1 = self.sp_metrics(sp_para_pred, torch.where(sp_para.squeeze())[0].tolist())
        sp_para_em = torch.tensor(sp_para_em).type_as(loss)
        sp_para_precision = torch.tensor(sp_para_precision).type_as(loss)
        sp_para_recall = torch.tensor(sp_para_recall).type_as(loss)
        sp_para_f1 = torch.tensor(sp_para_f1).type_as(loss)
    else:
        sp_para_em, sp_para_precision, sp_para_recall, sp_para_f1 = torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss)


    return { 'vloss': loss, 'answer_loss': answer_loss, 'type_loss': type_loss, 'sp_para_loss': sp_para_loss, 'sp_sent_loss': sp_sent_loss,
               'answer_score': pre_answer_score, 'f1': f1, 'prec':prec, 'recall':recall, 'em': em,
               'sp_sent_em': sp_sent_em, 'sp_sent_f1': sp_sent_f1, 'sp_sent_precision': sp_sent_precision, 'sp_sent_recall': sp_sent_recall,
               'sp_para_em': sp_para_em, 'sp_para_f1': sp_para_f1, 'sp_para_precision': sp_para_precision, 'sp_para_recall': sp_para_recall,
               'joint_em': joint_em, 'joint_f1': joint_f1, 'joint_prec': joint_prec, 'joint_recall': joint_recall}



###### decode

In [25]:
%%add_to hotpotqa
def decode(self, input_ids, start_logits, end_logits, type_logits, sp_para_logits, sp_sent_logits, sp_para_map, sp_sent_map):
#         print("decode")

    question_end_index = self._get_special_index(input_ids, [QUESTION_END])
    print("question_end_index: ", question_end_index)

    # one example per batch
    start_logits = start_logits.squeeze()
    end_logits = end_logits.squeeze()
    print("start_logits: ", start_logits)
    print("end_logits: ", end_logits)
    start_logits_indices = start_logits.topk(k=min(self.args.n_best_size, start_logits.size(0)), dim=-1).indices
    end_logits_indices = end_logits.topk(k=min(self.args.n_best_size, end_logits.size(0)), dim=-1).indices
    if(len(start_logits_indices.size()) > 1):
        print("len(start_logits_indices.size()): ", len(start_logits_indices.size()))
        assert("len(start_logits_indices.size()) > 1")
    p_type = torch.argmax(type_logits, dim=1).item()
    p_type_score = torch.max(type_logits, dim=1)[0] 
    print("type_logits: ", type_logits)
#     print("p_type: ", p_type)
#     print("p_type_score: ", p_type_score)

    answers = []
    if p_type == 0:
        potential_answers = []
        for start_logit_index in start_logits_indices: 
            for end_logit_index in end_logits_indices: 
                if start_logit_index <= question_end_index.item():
                    continue
                if end_logit_index <= question_end_index.item():
                    continue
                if start_logit_index > end_logit_index:
                    continue
                answer_len = end_logit_index - start_logit_index + 1
                if answer_len > self.args.max_answer_length:
                    continue
                potential_answers.append({'start': start_logit_index, 'end': end_logit_index,
                                          'start_logit': start_logits[start_logit_index],  # single logit score for start position at start_logit_index
                                          'end_logit': end_logits[end_logit_index]})    
        sorted_answers = sorted(potential_answers, key=lambda x: (x['start_logit'] + x['end_logit']), reverse=True) 
        print("sorted_answers: " + str(sorted_answers))

        if len(sorted_answers) == 0:
            answers.append({'text': 'NoAnswerFound', 'score': -1000000, 'start_logit': -1000000, 'end_logit': -1000000, 'p_type_score': p_type_score})
        else:
            answer = sorted_answers[0]
            answer_token_ids = input_ids[0, answer['start']: answer['end'] + 1]

            answer_tokens = self.tokenizer.convert_ids_to_tokens(answer_token_ids.tolist())
            # remove [/sent], <t> and </t>
            for special_token in [SENT_MARKER_END, TITLE_START, TITLE_END, self.tokenizer.sep_token]:   
                try:    
                    if(answer_tokens[0] == special_token):  
                        answer['start_logit'] = -2000000    
                    elif(answer_tokens[-1] == special_token):   
                        answer['end_logit'] = -2000000  

                    answer_tokens.remove(special_token) 
                except: 
                    pass    


            text = self.tokenizer.convert_tokens_to_string(answer_tokens)
            score = (answer['start_logit'] + answer['end_logit'] + p_type_score) / 3
            # score = (torch.sigmoid(answer['start_logit']) + torch.sigmoid(answer['end_logit']) + torch.sigmoid(p_type_score)) / 3
            answers.append({'text': text, 'score': score, 'start_logit': answer['start_logit'], 'end_logit': answer['end_logit'], 'p_type_score': p_type_score})
#             print("answers: " + str(answers))
    elif p_type == 1:   
        answers.append({'text': 'yes', 'score': p_type_score, 'start_logit': -1000000, 'end_logit': -1000000, 'p_type_score': p_type_score})    
    elif p_type == 2:   
        answers.append({'text': 'no', 'score': p_type_score, 'start_logit': -1000000, 'end_logit': -1000000, 'p_type_score': p_type_score}) 
    else:
        assert False 


    sent_indexes = self._get_special_index(input_ids, [SENT_MARKER_END])
    para_indexes = self._get_special_index(input_ids, [TITLE_START])

    s_to_p_map = []   
    for s in sent_indexes:
        s_to_p = torch.where(torch.le(para_indexes, s))[0][-1]     # last para_index smaller or equal to s
        s_to_p_map.append(s_to_p.item())  # [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7, 7, 7, 8, 9]
    print('s_to_p_map: ', s_to_p_map)
    
    if(len(s_to_p_map)>0):      # https://arxiv.org/pdf/2004.06753.pdf section 3.3
        para_sent_logits_sum = torch.tensor([], device=sp_sent_logits.device)  
        evidence_candidates = {}
        para_sents_offset = [0]
        for i in range(s_to_p_map[-1]+1):
            para_sent_logits = torch.masked_select(sp_sent_logits.squeeze(), torch.tensor([p==i for p in s_to_p_map])) 

            para_sent_logits_sum = torch.cat([para_sent_logits_sum, torch.sum(para_sent_logits).unsqueeze(0) ]) 
            para_sents_offset.append(para_sent_logits.numel()+para_sents_offset[-1])  # [0, 21, 22, 24, 25, 26, 29, 30, 34, 35, 36], one more elements than num of paras   
            evidence_candidates[i] = torch.gt(para_sent_logits, 0.1).nonzero(as_tuple=True)[0]  # 0.1 is the threshold to be a candidate sentences
        print('evidence_candidates: ', evidence_candidates)
        print('para_sents_offset: ', para_sents_offset)
        # para_sent_logits_sum: tensor([ 7.8180e-01,  6.8700e-02,  1.6170e-01,  7.4000e-02,  6.0000e-04,  2.2680e-01, -3.0400e-02,  9.3400e-02,  1.1200e-01,  1.2470e-01])
        # evidence_candidates: sentences with logits larger than threshold in each para,  [tensor([ 1,  2,  4,  5,  6,  7,  8,  9, 10, 11, 13, 14, 16, 17, 19, 20]), tensor([0]), tensor([0, 1]), tensor([0]), tensor([0]), tensor([0, 1, 2]), tensor([], dtype=torch.int64), tensor([0, 2]), tensor([0]), tensor([0])]
        sp_para_pred = para_sent_logits_sum.squeeze().topk(k=min(para_sent_logits_sum.numel(), 2)).indices  # sp are from <=2 paragraphs
        print('sp_para_pred: ', sp_para_pred)
        sp_sent_pred = []
        if(sp_para_pred.numel() > 1):
            for para_idx in sp_para_pred: 
                if(para_idx.item() in evidence_candidates):
                    sp_sent_pred.extend([(para_sents_offset[para_idx]+sent).item() for sent in evidence_candidates[para_idx.item()]]) 
        elif(sp_para_pred.numel()==1 and sp_para_pred.item() in evidence_candidates):
            sp_sent_pred = [(para_sents_offset[sp_para_pred]+sent).item() for sent in evidence_candidates[sp_para_pred.item()]]
        print('sp_sent_pred: ', sp_sent_pred)
        print('sp_sent_map: ', sp_sent_map)
        print('sp_para_map: ', sp_para_map)
        for idx, sp_sent in enumerate(sp_sent_pred):
            sp_sent_pred[idx] = sp_sent_map[sp_sent.item()]
        for idx, sp_para in enumerate(sp_para_pred):
            sp_para_pred[idx] = sp_para_map[sp_para.item()]
    else:
        sp_sent_pred = []
        sp_para_pred = [] 
    return (answers, sp_sent_pred, sp_para_pred)


###### metrics

In [26]:
%%add_to hotpotqa


def f1_score(self, prediction, ground_truth):
    normalized_prediction = _normalize_text(prediction)
    normalized_ground_truth = _normalize_text(ground_truth)
    ZERO_METRIC = (0, 0, 0)

    if normalized_prediction in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC
    if normalized_ground_truth in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return ZERO_METRIC
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1, precision, recall


def exact_match_score(self, prediction, ground_truth):
    return int(_normalize_text(prediction) == _normalize_text(ground_truth))


def sp_metrics(self, prediction, gold): 
    tp, fp, fn = 0, 0, 0
    for e in prediction:
        if e in gold:
            tp += 1
        else:
            fp += 1 
    for e in gold:
        if e not in prediction:
            fn += 1 
    prec = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * recall / (prec + recall) if prec + recall > 0 else 0.0
    em = 1.0 if fp + fn == 0 else 0.0 
    return em, prec, recall, f1 



##### validation_end

In [27]:
%%add_to hotpotqa
# If a validation_step is not defined, this won't be called. Called at the end of the validation loop with the outputs of validation_step.
def validation_epoch_end(self, outputs):
    print("validation_epoch_end")
    avg_loss = torch.stack([x['vloss'] for x in outputs]).mean()  
    avg_answer_loss = torch.stack([x['answer_loss'] for x in outputs]).mean()  
    avg_type_loss = torch.stack([x['type_loss'] for x in outputs]).mean()  
    avg_sp_para_loss = torch.stack([x['sp_para_loss'] for x in outputs]).mean()  
    avg_sp_sent_loss = torch.stack([x['sp_sent_loss'] for x in outputs]).mean()  


    answer_scores = [x['answer_score'] for x in outputs] 
    f1_scores = [x['f1'] for x in outputs]  
    em_scores = [x['em'] for x in outputs]  
    prec_scores =  [x['prec'] for x in outputs] 
    recall_scores = [x['recall'] for x in outputs]  
    sp_sent_f1_scores = [x['sp_sent_f1'] for x in outputs]   
    sp_sent_em_scores = [x['sp_sent_em'] for x in outputs]   
    sp_sent_prec_scores = [x['sp_sent_precision'] for x in outputs]   
    sp_sent_recall_scores = [x['sp_sent_recall'] for x in outputs]   
    sp_para_f1_scores = [x['sp_para_f1'] for x in outputs]   
    sp_para_em_scores = [x['sp_para_em'] for x in outputs]   
    sp_para_prec_scores = [x['sp_para_precision'] for x in outputs]   
    sp_para_recall_scores = [x['sp_para_recall'] for x in outputs]   
    joint_f1_scores = [x['joint_f1'] for x in outputs]  
    joint_em_scores = [x['joint_em'] for x in outputs]  
    joint_prec_scores = [x['joint_prec'] for x in outputs]  
    joint_recall_scores = [x['joint_recall'] for x in outputs]


    print(f'before sync --> sizes:  {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}')
    if self.trainer.use_ddp:
        torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM)
        avg_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_answer_loss, op=torch.distributed.ReduceOp.SUM)
        avg_answer_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_type_loss, op=torch.distributed.ReduceOp.SUM)
        avg_type_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_sp_para_loss, op=torch.distributed.ReduceOp.SUM)
        avg_sp_para_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_sp_sent_loss, op=torch.distributed.ReduceOp.SUM)
        avg_sp_sent_loss /= self.trainer.world_size 

        answer_scores = self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float)
        f1_scores = self.sync_list_across_gpus(f1_scores, avg_loss.device, torch.float)
        em_scores = self.sync_list_across_gpus(em_scores, avg_loss.device, torch.float)
        prec_scores = self.sync_list_across_gpus(prec_scores, avg_loss.device, torch.float)
        recall_scores = self.sync_list_across_gpus(recall_scores, avg_loss.device, torch.float)

        sp_sent_f1_scores = self.sync_list_across_gpus(sp_sent_f1_scores, avg_loss.device, torch.float)
        sp_sent_em_scores = self.sync_list_across_gpus(sp_sent_em_scores, avg_loss.device, torch.float)
        sp_sent_prec_scores = self.sync_list_across_gpus(sp_sent_prec_scores, avg_loss.device, torch.float)
        sp_sent_recall_scores = self.sync_list_across_gpus(sp_sent_recall_scores, avg_loss.device, torch.float)

        sp_para_f1_scores = self.sync_list_across_gpus(sp_para_f1_scores, avg_loss.device, torch.float)
        sp_para_em_scores = self.sync_list_across_gpus(sp_para_em_scores, avg_loss.device, torch.float)
        sp_para_prec_scores = self.sync_list_across_gpus(sp_para_prec_scores, avg_loss.device, torch.float)
        sp_para_recall_scores = self.sync_list_across_gpus(sp_para_recall_scores, avg_loss.device, torch.float)

        joint_f1_scores = self.sync_list_across_gpus(joint_f1_scores, avg_loss.device, torch.float)
        joint_em_scores = self.sync_list_across_gpus(joint_em_scores, avg_loss.device, torch.float)
        joint_prec_scores = self.sync_list_across_gpus(joint_prec_scores, avg_loss.device, torch.float)
        joint_recall_scores = self.sync_list_across_gpus(joint_recall_scores, avg_loss.device, torch.float)


    print(f'after sync --> sizes: {len(answer_scores)}, {len(f1_scores)}, {len(em_scores)}')

    avg_val_f1 = sum(f1_scores) / len(f1_scores)    
    avg_val_em = sum(em_scores) / len(em_scores)    
    avg_val_prec = sum(prec_scores) / len(prec_scores)  
    avg_val_recall = sum(recall_scores) / len(recall_scores)
    avg_val_sp_sent_f1 = sum(sp_sent_f1_scores) / len(sp_sent_f1_scores)  
    avg_val_sp_sent_em = sum(sp_sent_em_scores) / len(sp_sent_em_scores)  
    avg_val_sp_sent_prec = sum(sp_sent_prec_scores) / len(sp_sent_prec_scores)  
    avg_val_sp_sent_recall = sum(sp_sent_recall_scores) / len(sp_sent_recall_scores)   
    avg_val_sp_para_f1 = sum(sp_para_f1_scores) / len(sp_para_f1_scores)  
    avg_val_sp_para_em = sum(sp_para_em_scores) / len(sp_para_em_scores)  
    avg_val_sp_para_prec = sum(sp_para_prec_scores) / len(sp_para_prec_scores)  
    avg_val_sp_para_recall = sum(sp_para_recall_scores) / len(sp_para_recall_scores)   
    avg_val_joint_f1 = sum(joint_f1_scores) / len(joint_f1_scores) 
    avg_val_joint_em = sum(joint_em_scores) / len(joint_em_scores)  
    avg_val_joint_prec = sum(joint_prec_scores) / len(joint_prec_scores)
    avg_val_joint_recall = sum(joint_recall_scores) / len(joint_recall_scores)

    print("avg_loss: ", avg_loss, end = '\t')   
    print("avg_answer_loss: ", avg_answer_loss, end = '\t') 
    print("avg_type_loss: ", avg_type_loss, end = '\t') 
    print("avg_sp_para_loss: ", avg_sp_para_loss, end = '\t')   
    print("avg_sp_sent_loss: ", avg_sp_sent_loss)   
    print("avg_val_f1: ", avg_val_f1, end = '\t')   
    print("avg_val_em: ", avg_val_em, end = '\t')   
    print("avg_val_prec: ", avg_val_prec, end = '\t')   
    print("avg_val_recall: ", avg_val_recall)   
    print("avg_val_sp_sent_f1: ", avg_val_sp_sent_f1, end = '\t')   
    print("avg_val_sp_sent_em: " , avg_val_sp_sent_em, end = '\t')  
    print("avg_val_sp_sent_prec: ", avg_val_sp_sent_prec, end = '\t')   
    print("avg_val_sp_sent_recall: ", avg_val_sp_sent_recall)   
    print("avg_val_sp_para_f1: ", avg_val_sp_para_f1, end = '\t')   
    print("avg_val_sp_para_em: " , avg_val_sp_para_em, end = '\t')  
    print("avg_val_sp_para_prec: ", avg_val_sp_para_prec, end = '\t')   
    print("avg_val_sp_para_recall: ", avg_val_sp_para_recall)   
    print("avg_val_joint_f1: " , avg_val_joint_f1, end = '\t')  
    print("avg_val_joint_em: ", avg_val_joint_em, end = '\t')   
    print("avg_val_joint_prec: ", avg_val_joint_prec, end = '\t')   
    print("avg_val_joint_recall: ", avg_val_joint_recall)   


    logs = {'avg_val_loss': avg_loss, 'avg_val_answer_loss': avg_answer_loss, 'avg_val_type_loss': avg_type_loss, 
            'avg_val_sp_para_loss': avg_sp_para_loss, 'avg_val_sp_sent_loss': avg_sp_sent_loss,   
            'avg_val_f1': avg_val_f1 , 'avg_val_em': avg_val_em,  'avg_val_prec': avg_val_prec, 'avg_val_recall': avg_val_recall ,    
            'avg_val_sp_sent_f1': avg_val_sp_sent_f1, 'avg_val_sp_sent_em': avg_val_sp_sent_em,  'avg_val_sp_sent_prec': avg_val_sp_sent_prec, 'avg_val_sp_sent_recall': avg_val_sp_sent_recall, 
            'avg_val_sp_para_f1': avg_val_sp_para_f1, 'avg_val_sp_para_em': avg_val_sp_para_em,  'avg_val_sp_para_prec': avg_val_sp_para_prec, 'avg_val_sp_para_recall': avg_val_sp_para_recall, 
            'avg_val_joint_f1': avg_val_joint_f1, 'avg_val_joint_em': avg_val_joint_em,  'avg_val_joint_prec': avg_val_joint_prec, 'avg_val_joint_recall': avg_val_joint_recall 
    }   


    return logs


def sync_list_across_gpus(self, l, device, dtype):
    l_tensor = torch.tensor(l, device=device, dtype=dtype)
    gather_l_tensor = [torch.ones_like(l_tensor) for _ in range(self.trainer.world_size)]
    torch.distributed.all_gather(gather_l_tensor, l_tensor)
    return torch.cat(gather_l_tensor).tolist()

##### test_step

In [28]:
%%add_to hotpotqa
def test_step(self, batch, batch_nb):
    input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qid, answer = batch

    print("test_step of qid: ", qid, end="\t") 
    output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para)
    answer_loss, type_loss, sp_para_loss, sp_sent_loss, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output, input_stage2_ids, sp_para_map, sp_sent_map = output 
    loss = answer_loss + 5*type_loss + 10*sp_para_loss + 10*sp_sent_loss

    answers_pred, sp_sent_pred, sp_para_pred = self.decode(input_stage2_ids, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output, sp_para_map, sp_sent_map)

    if(len(answers_pred) != 1):
        print("len(answers_pred) != 1")
        assert(len(answers_pred) == 1)

    pre_answer_score = answers_pred[0]['score']  # (start_logit + end_logit + p_type_score) / 3
    pre_answer = _normalize_text(answers_pred[0]['text'])
    start_logit = answers_pred[0]['start_logit']    
    end_logit = answers_pred[0]['end_logit']    
    type_score = answers_pred[0]['p_type_score']

    gold_answer = _normalize_text(answer)
    f1, prec, recall = self.f1_score(pre_answer, gold_answer)
    em = self.exact_match_score(pre_answer, gold_answer) 
    f1 = torch.tensor(f1).type_as(loss)
    prec = torch.tensor(prec).type_as(loss)
    recall = torch.tensor(recall).type_as(loss)
    em = torch.tensor(em).type_as(loss)

    if(len(sp_sent_pred) > 0):
        sp_sent_em, sp_sent_precision, sp_sent_recall, sp_sent_f1 = self.sp_metrics(sp_sent_pred, torch.where(sp_sent.squeeze())[0].tolist())
        sp_sent_em = torch.tensor(sp_sent_em).type_as(loss)
        sp_sent_precision = torch.tensor(sp_sent_precision).type_as(loss)
        sp_sent_recall = torch.tensor(sp_sent_recall).type_as(loss)
        sp_sent_f1 = torch.tensor(sp_sent_f1).type_as(loss)

#         print("sp_sent_em: " + str(sp_sent_em))
#         print("sp_sent_precision: " + str(sp_sent_precision))
#         print("sp_sent_recall: " + str(sp_sent_recall))    
#         print("sp_sent_f1: " + str(sp_sent_f1))    

        joint_prec = prec * sp_sent_precision
        joint_recall = recall * sp_sent_recall
        if joint_prec + joint_recall > 0:
            joint_f1 = 2 * joint_prec * joint_recall / (joint_prec + joint_recall)
        else:
            joint_f1 = torch.tensor(0.0).type_as(loss)
        joint_em = em * sp_sent_em 

    else:
        sp_sent_em, sp_sent_precision, sp_sent_recall, sp_sent_f1 = torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss)
        joint_em, joint_f1, joint_prec, joint_recall =  torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss)


    if(len(sp_para_pred) > 0):
        sp_para_em, sp_para_precision, sp_para_recall, sp_para_f1 = self.sp_metrics(sp_para_pred, torch.where(sp_para.squeeze())[0].tolist())
        sp_para_em = torch.tensor(sp_para_em).type_as(loss)
        sp_para_precision = torch.tensor(sp_para_precision).type_as(loss)
        sp_para_recall = torch.tensor(sp_para_recall).type_as(loss)
        sp_para_f1 = torch.tensor(sp_para_f1).type_as(loss)
    else:
        sp_para_em, sp_para_precision, sp_para_recall, sp_para_f1 = torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss), torch.tensor(0.0).type_as(loss)




    self.logger.log_metrics({'answer_loss': answer_loss, 'type_loss': type_loss, 'sp_para_loss': sp_para_loss, 'sp_sent_loss': sp_sent_loss,    
                                'answer_score': pre_answer_score, 'start_logit': start_logit, 'end_logit': end_logit,   
                                'type_score': type_score,   
                                'f1': f1, 'prec':prec, 'recall':recall, 'em': em,
                                'sp_sent_f1': sp_sent_f1, 'sp_sent_precision': sp_sent_precision, 'sp_sent_recall': sp_sent_recall,  'sp_sent_em': sp_sent_em, 
                                'sp_para_f1': sp_para_f1, 'sp_para_precision': sp_para_precision, 'sp_para_recall': sp_para_recall,  'sp_para_em': sp_para_em, 
                                'joint_f1': joint_f1, 'joint_prec': joint_prec, 'joint_recall': joint_recall, 'joint_em': joint_em
                            }) 

    # print("pre_answer:\t", pre_answer, "\tgold_answer:\t", gold_answer, "\tstart_logits:\t", start_logits.cpu(), "\tend_logits:\t", end_logits.cpu(), "\ttype_logits:\t", type_logits.cpu()) 
    print("pre_answer:\t", pre_answer, "\tgold_answer:\t", gold_answer)

    return { 'vloss': loss, 'answer_loss': answer_loss, 'type_loss': type_loss, 'sp_para_loss': sp_para_loss, 'sp_sent_loss': sp_sent_loss,
               'answer_score': pre_answer_score, 'f1': f1, 'prec':prec, 'recall':recall, 'em': em,
                'sp_sent_f1': sp_sent_f1, 'sp_sent_precision': sp_sent_precision, 'sp_sent_recall': sp_sent_recall,  'sp_sent_em': sp_sent_em, 
                'sp_para_f1': sp_para_f1, 'sp_para_precision': sp_para_precision, 'sp_para_recall': sp_para_recall,  'sp_para_em': sp_para_em, 
               'joint_em': joint_em, 'joint_f1': joint_f1, 'joint_prec': joint_prec, 'joint_recall': joint_recall}



##### test_end

In [29]:
%%add_to hotpotqa
def test_epoch_end(self, outputs):
    print("test_epoch_end")
    avg_loss = torch.stack([x['vloss'] for x in outputs]).mean()  
    avg_answer_loss = torch.stack([x['answer_loss'] for x in outputs]).mean()  
    avg_type_loss = torch.stack([x['type_loss'] for x in outputs]).mean()  
    avg_sp_para_loss = torch.stack([x['sp_para_loss'] for x in outputs]).mean()  
    avg_sp_sent_loss = torch.stack([x['sp_sent_loss'] for x in outputs]).mean()  

    answer_scores = [x['answer_score'] for x in outputs]  # [item for sublist in outputs for item in sublist['answer_score']] #torch.stack([x['answer_score'] for x in outputs]).mean() # 
    f1_scores = [x['f1'] for x in outputs]  
    em_scores = [x['em'] for x in outputs]  
    prec_scores =  [x['prec'] for x in outputs] 
    recall_scores = [x['recall'] for x in outputs]  

    print(f'before sync --> sizes:  {len(answer_scores)}')
    if self.trainer.use_ddp:
        torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM)
        avg_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_answer_loss, op=torch.distributed.ReduceOp.SUM)
        avg_answer_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_type_loss, op=torch.distributed.ReduceOp.SUM)
        avg_type_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_sp_para_loss, op=torch.distributed.ReduceOp.SUM)
        avg_sp_para_loss /= self.trainer.world_size 
        torch.distributed.all_reduce(avg_sp_sent_loss, op=torch.distributed.ReduceOp.SUM)
        avg_sp_sent_loss /= self.trainer.world_size 
        answer_scores = self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float)
        f1_scores = self.sync_list_across_gpus(f1_scores, avg_loss.device, torch.float)
        em_scores = self.sync_list_across_gpus(em_scores, avg_loss.device, torch.float)
        prec_scores = self.sync_list_across_gpus(prec_scores, avg_loss.device, torch.float)
        recall_scores = self.sync_list_across_gpus(recall_scores, avg_loss.device, torch.float)
#         int_qids = self.sync_list_across_gpus(int_qids, avg_loss.device, torch.int)
        answer_scores = self.sync_list_across_gpus(answer_scores, avg_loss.device, torch.float)


    print(f'after sync --> sizes: {len(answer_scores)}')
    print("answer_scores: ", answer_scores)
    avg_test_f1 = sum(f1_scores) / len(f1_scores)
    avg_test_em = sum(em_scores) / len(em_scores)
    avg_test_prec = sum(prec_scores) / len(prec_scores)
    avg_test_recall = sum(recall_scores) / len(recall_scores)     
    print("avg_loss: ", avg_loss, end = '\t') 
    print("avg_answer_loss: ", avg_answer_loss, end = '\t') 
    print("avg_type_loss: ", avg_type_loss, end = '\t') 
    print("avg_sp_para_loss: ", avg_sp_para_loss, end = '\t') 
    print("avg_sp_sent_loss: ", avg_sp_sent_loss, end = '\t')  

    logs = {'avg_test_loss': avg_loss, 'avg_test_answer_loss': avg_answer_loss, 'avg_test_type_loss': avg_type_loss, 
            'avg_test_f1': avg_test_f1 , 'avg_test_em': avg_test_em,  'avg_test_prec': avg_test_prec, 'avg_test_recall': avg_test_recall ,    
            'avg_val_sp_para_loss': avg_sp_para_loss, 'avg_val_sp_sent_loss': avg_sp_sent_loss
           }

    return {'avg_test_loss': avg_loss, 'log': logs}

##### add_model_specific_args

In [30]:
%%add_to hotpotqa
@staticmethod
def add_model_specific_args(parser, root_dir):
    parser.add_argument("--save_dir", type=str, default='jupyter-hotpotqa')
    parser.add_argument("--save_prefix", type=str, required=True)
    parser.add_argument("--train_dataset", type=str, required=False, help="Path to the training squad-format")
    parser.add_argument("--dev_dataset", type=str, required=True, help="Path to the dev squad-format")
    parser.add_argument("--batch_size", type=int, default=2, help="Batch size")
    parser.add_argument("--gpus", type=str, default='0',
                        help="Comma separated list of gpus. Default is gpu 0. To use CPU, use --gpus "" ")
    parser.add_argument("--warmup", type=int, default=1000, help="Number of warmup steps")
    parser.add_argument("--lr", type=float, default=0.00005, help="Maximum learning rate")
    parser.add_argument("--val_every", type=float, default=1.0, help="How often within one training epoch to check the validation set.")
    parser.add_argument("--val_percent_check", default=1.00, type=float, help='Percent of validation data used')
    parser.add_argument("--num_workers", type=int, default=4, help="Number of data loader workers")
    parser.add_argument("--seed", type=int, default=1234, help="Seed")
    parser.add_argument("--epochs", type=int, default=6, help="Number of epochs")
    parser.add_argument("--max_seq_len", type=int, default=4096,
                        help="Maximum length of seq passed to the transformer model")
    parser.add_argument("--max_doc_len", type=int, default=4096,
                        help="Maximum number of wordpieces of the input document")
    parser.add_argument("--max_num_answers", type=int, default=64,
                        help="Maximum number of answer spans per document (64 => 94%)")
    parser.add_argument("--max_question_len", type=int, default=55,
                        help="Maximum length of the question")
    parser.add_argument("--doc_stride", type=int, default=-1,
                        help="Overlap between document chunks. Use -1 to only use the first chunk")
    parser.add_argument("--ignore_seq_with_no_answers", action='store_true',
                        help="each example should have at least one answer. Default is False")
    parser.add_argument("--disable_checkpointing", action='store_true', help="No logging or checkpointing")
    parser.add_argument("--n_best_size", type=int, default=20,
                        help="Number of answer candidates. Used at decoding time")
    parser.add_argument("--max_answer_length", type=int, default=30,
                        help="maximum num of wordpieces/answer. Used at decoding time")
    parser.add_argument("--regular_softmax_loss", action='store_true', help="IF true, use regular softmax. Default is using ORed softmax loss")
    parser.add_argument("--test", action='store_true', help="Test only, no training")
    parser.add_argument("--model_path", type=str,
                        help="Path to the checkpoint directory")
    parser.add_argument("--no_progress_bar", action='store_true', help="no progress bar. Good for printing")
    parser.add_argument("--attention_mode", type=str, choices=['tvm', 'sliding_chunks'],
                        default='sliding_chunks', help='Which implementation of selfattention to use')
    parser.add_argument("--fp32", action='store_true', help="default is fp16. Use --fp32 to switch to fp32")
    parser.add_argument('--train_percent', type=float, default=1.0)
    return parser

##### class info

In [31]:
dir(hotpotqa)

['CHECKPOINT_HYPER_PARAMS_KEY',
 'CHECKPOINT_HYPER_PARAMS_NAME',
 'CHECKPOINT_HYPER_PARAMS_TYPE',
 'T_destination',
 '_LightningModule__get_hparams_assignment_variable',
 '__abstractmethods__',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_apply',
 '_auto_collect_arguments',
 '_call_impl',
 '_forward_unimplemented',
 '_get_name',
 '_get_special_index',
 '_init_slurm_connection',
 '_load_from_state_dict',
 '_load_model_state',
 '_named_members',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',


In [32]:
from inspect import getmembers, isfunction
getmembers(hotpotqa)

[('CHECKPOINT_HYPER_PARAMS_KEY', 'hyper_parameters'),
 ('CHECKPOINT_HYPER_PARAMS_NAME', 'hparams_name'),
 ('CHECKPOINT_HYPER_PARAMS_TYPE', 'hparams_type'),
 ('T_destination', ~T_destination),
 ('_LightningModule__get_hparams_assignment_variable',
  <function pytorch_lightning.core.lightning.LightningModule.__get_hparams_assignment_variable(self)>),
 ('__abstractmethods__', frozenset()),
 ('__annotations__',
  {'_device': Ellipsis, '_dtype': typing.Union[str, torch.dtype]}),
 ('__call__',
  <function torch.nn.modules.module.Module._call_impl(self, *input, **kwargs)>),
 ('__class__', abc.ABCMeta),
 ('__delattr__',
  <function torch.nn.modules.module.Module.__delattr__(self, name)>),
 ('__dict__',
  mappingproxy({'__module__': '__main__',
                '__init__': <function __main__.hotpotqa.__init__(self, args)>,
                'load_model': <function __main__.hotpotqa.load_model(self)>,
                'train_dataloader': <function pytorch_lightning.core.decorators.data_loader.<local

In [33]:
functions_list = [o for o in getmembers(hotpotqa) if isfunction(o[1])]
functions_list

[('_LightningModule__get_hparams_assignment_variable',
  <function pytorch_lightning.core.lightning.LightningModule.__get_hparams_assignment_variable(self)>),
 ('__call__',
  <function torch.nn.modules.module.Module._call_impl(self, *input, **kwargs)>),
 ('__delattr__',
  <function torch.nn.modules.module.Module.__delattr__(self, name)>),
 ('__dir__', <function torch.nn.modules.module.Module.__dir__(self)>),
 ('__getattr__',
  <function torch.nn.modules.module.Module.__getattr__(self, name:str) -> Union[torch.Tensor, _ForwardRef('Module')]>),
 ('__init__', <function __main__.hotpotqa.__init__(self, args)>),
 ('__repr__', <function torch.nn.modules.module.Module.__repr__(self)>),
 ('__setattr__',
  <function torch.nn.modules.module.Module.__setattr__(self, name:str, value:Union[torch.Tensor, _ForwardRef('Module')]) -> None>),
 ('__setstate__',
  <function torch.nn.modules.module.Module.__setstate__(self, state)>),
 ('_apply', <function torch.nn.modules.module.Module._apply(self, fn)>),


In [34]:
import inspect
inspect.getmro(hotpotqa)  # a hierarchy of classes 

(__main__.hotpotqa,
 pytorch_lightning.core.lightning.LightningModule,
 abc.ABC,
 pytorch_lightning.utilities.device_dtype_mixin.DeviceDtypeModuleMixin,
 pytorch_lightning.core.grads.GradInformation,
 pytorch_lightning.core.saving.ModelIO,
 pytorch_lightning.core.hooks.ModelHooks,
 torch.nn.modules.module.Module,
 object)

In [35]:
help(hotpotqa.configure_optimizers)

Help on function configure_optimizers in module __main__:

configure_optimizers(self)



In [36]:
# import inspect
# code, line_no = inspect.getsourcelines(hotpotqa.training_step)
# print(''.join(code))

### main

In [37]:
def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
    if not args.test:     # if it needs to train, remove exsiting folder
        import shutil
        save_folder = os.path.join(args.save_dir, args.save_prefix)
        if os.path.exists(save_folder):
            shutil.rmtree(save_folder, ignore_errors=True)  #delete non-empty folder 
        
    import shutil
    save_folder = os.path.join(args.save_dir, args.save_prefix)
    if os.path.exists(save_folder):
        shutil.rmtree(save_folder, ignore_errors=True)  #delete non-empty folder


In [40]:
    hotpotqa.__abstractmethods__=set()   # without this, got an error "Can't instantiate abstract class hotpotqa with abstract methods" if these two abstract methods are not implemented in the same cell where class hotpotqa defined 
    model = hotpotqa(args)
#     model.to('cuda')    # this is necessary to use gpu
    

Loaded model with config:
RobertaConfig {
  "attention_dilation": [
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1
  ],
  "attention_mode": "tvm",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256
  ],
  "autoregressive": false,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

Loaded model with config:
RobertaConfig {
  "attention_dilation": [
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1
  ],

In [41]:
    logger = TestTubeLogger( # The TestTubeLogger adds a nicer folder structure to manage experiments and snapshots all hyperparameters you pass to a LightningModule.
        save_dir=args.save_dir,
        name=args.save_prefix,
        version=0  # always use version=0
    )

In [42]:
    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(args.save_dir, args.save_prefix, "checkpoints"),
        save_top_k=5,
        verbose=True,
        monitor='avg_val_f1',
        mode='max',
        prefix=''
    )

In [43]:
    train_set_size = 9 * args.train_percent # 90447 * args.train_percent   # hardcode dataset size. Needed to compute number of steps for the lr scheduler
    print("train_set_size: ", train_set_size) 

    args.gpus = [int(x) for x in args.gpus.split(',')] if args.gpus!='' else None
    num_devices = 1 or len(args.gpus)
    print("num_devices: ", num_devices)

    train_set_size = 90447 * args.train_percent    # hardcode dataset size. Needed to compute number of steps for the lr scheduler
    args.steps = args.epochs * train_set_size / (args.batch_size * num_devices)

    print(f'>>>>>>> #train_set_size: {train_set_size}, #steps: {args.steps},  #warmup steps: {args.warmup}, #epochs: {args.epochs}, batch_size: {args.batch_size * num_devices} <<<<<<<')

train_set_size:  9.0
num_devices:  1
>>>>>>> #train_set_size: 90447.0, #steps: 271341.0,  #warmup steps: 1000, #epochs: 6, batch_size: 2 <<<<<<<


In [44]:
    trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp', # if args.gpus and (len(args.gpus) > 1) else None,
                             track_grad_norm=-1, max_epochs=args.epochs, early_stop_callback=None, replace_sampler_ddp=False,
                             accumulate_grad_batches=args.batch_size,
                             train_percent_check = args.train_percent,
        #                          val_check_interval=args.val_every,
                             val_percent_check=args.val_percent_check,
                             test_percent_check=args.val_percent_check,
                             logger=logger if not args.disable_checkpointing else False,
                             checkpoint_callback=checkpoint_callback if not args.disable_checkpointing else False,
                             show_progress_bar=args.no_progress_bar,
                             use_amp=not args.fp32, 
                             amp_level='O2',
#                              check_val_every_n_epoch=1
                             )


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Multi-processing is handled by Slurm.
CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.


In [45]:
#     if not args.test: 
trainer.fit(model)


initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/1
----------------------------------------------------------------------------------------------------
distributed_backend=ddp
All DDP processes registered. Starting ddp with 1 processes
----------------------------------------------------------------------------------------------------
Set SLURM handle signals.

  | Name          | Type       | Params
---------------------------------------------
0 | model_1       | Longformer | 148 M 
1 | fnn_sp_para_1 | Sequential | 591 K 
2 | model_2       | Longformer | 148 M 
3 | qa_outputs    | Linear     | 1 K   
4 | linear_type   | Linear     | 2 K   
5 | fnn_sp_sent   | Sequential | 591 K 
6 | fnn_sp_para_2 | Sequential | 591 K 


reading file: small.json


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

sp_para_output_t:  tensor([[[ 0.0901],
         [-0.0195],
         [-0.0067],
         [ 0.0177],
         [ 0.0817],
         [ 0.0364],
         [ 0.0061],
         [ 0.0639],
         [-0.0488],
         [ 0.0101]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1005],
        [ 0.0000, -0.1093],
        [ 0.0000, -0.1243],
        [ 0.0000, -0.0584],
        [ 0.0000, -0.0650],
        [ 0.0000, -0.0635],
        [ 0.0000, -0.0795],
        [ 0.0000, -0.0632],
        [ 0.0000, -0.0554],
        [ 0.0000, -0.1245],
        [ 0.0000, -0.0650],
        [ 0.0000, -0.1293],
        [ 0.0000, -0.1738],
        [ 0.0000, -0.0656],
        [ 0.0000, -0.1196],
        [ 0.0000, -0.1105],
        [ 0.0000, -0.0721],
        [ 0.0000, -0.0809],
        [ 0.0000, -0.0926],
        [ 0.0000, -0.1753],
        [ 0.0000, -0.1396],
        [ 0.0

sp_para_output_t:  tensor([[[ 0.0432],
         [-0.0465],
         [ 0.0324],
         [ 0.0601],
         [ 0.0131],
         [-0.0060],
         [ 0.0294],
         [ 0.0386],
         [-0.0420],
         [-0.0339]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([36, 2])
sp_sent.view(-1).size():  torch.Size([36])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.0876],
        [ 0.0000, -0.0456],
        [ 0.0000, -0.0508],
        [ 0.0000, -0.0901],
        [ 0.0000, -0.0432],
        [ 0.0000, -0.0509],
        [ 0.0000, -0.1257],
        [ 0.0000, -0.1373],
        [ 0.0000, -0.0398],
        [ 0.0000, -0.0657],
        [ 0.0000, -0.1189],
        [ 0.0000, -0.1131],
        [ 0.0000, -0.0875],
        [ 0.0000, -0.0788],
        [ 0.0000, -0.1246],
        [ 0.0000, -0.0721],
        [ 0.0000, -0.1602],
        [ 0.0000, -0.1426],
        [ 0.0000, -0.1064],
        [ 0.0000, -0.1799],
        [ 0.0000, -0.1158],
        [ 0.0

validation_epoch_end
before sync --> sizes:  2, 2, 2
after sync --> sizes: 2, 2, 2
avg_loss:  tensor(22.9866, device='cuda:0')	avg_answer_loss:  tensor(3.2949, device='cuda:0')	avg_type_loss:  tensor(1.2849, device='cuda:0')	avg_sp_para_loss:  tensor(0.6760, device='cuda:0')	avg_sp_sent_loss:  tensor(0.6507, device='cuda:0')
avg_val_f1:  0.0	avg_val_em:  0.0	avg_val_prec:  0.0	avg_val_recall:  0.0
avg_val_sp_sent_f1:  0.0	avg_val_sp_sent_em:  0.0	avg_val_sp_sent_prec:  0.0	avg_val_sp_sent_recall:  0.0
avg_val_sp_para_f1:  0.0	avg_val_sp_para_em:  0.0	avg_val_sp_para_prec:  0.0	avg_val_sp_para_recall:  0.0
avg_val_joint_f1:  0.0	avg_val_joint_em:  0.0	avg_val_joint_prec:  0.0	avg_val_joint_recall:  0.0
reading file: small.json


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

sp_para_output_t:  tensor([[[-0.0122],
         [-0.0311],
         [-0.0012],
         [-0.0434],
         [-0.0545],
         [-0.0186],
         [-0.0059],
         [-0.0011],
         [-0.0336],
         [-0.1005]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([29, 2])
sp_sent.view(-1).size():  torch.Size([29])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.0983],
        [ 0.0000, -0.0744],
        [ 0.0000, -0.1346],
        [ 0.0000, -0.0178],
        [ 0.0000,  0.0026],
        [ 0.0000, -0.0936],
        [ 0.0000, -0.0363],
        [ 0.0000, -0.1317],
        [ 0.0000, -0.0341],
        [ 0.0000, -0.1174],
        [ 0.0000, -0.0701],
        [ 0.0000, -0.0614],
        [ 0.0000, -0.1014],
        [ 0.0000, -0.1104],
        [ 0.0000, -0.0389],
        [ 0.0000, -0.1311],
        [ 0.0000, -0.0700],
        [ 0.0000, -0.0677],
        [ 0.0000, -0.0633],
        [ 0.0000, -0.0930],
        [ 



sp_para_output_t:  tensor([[[-0.0404],
         [-0.0805],
         [-0.0778],
         [ 0.0349],
         [ 0.0441],
         [ 0.0102],
         [ 0.0201],
         [ 0.0666],
         [ 0.0136],
         [ 0.0392]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([27, 2])
sp_sent.view(-1).size():  torch.Size([27])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1187],
        [ 0.0000, -0.1299],
        [ 0.0000, -0.0412],
        [ 0.0000, -0.0503],
        [ 0.0000, -0.0226],
        [ 0.0000, -0.1156],
        [ 0.0000, -0.0370],
        [ 0.0000, -0.0758],
        [ 0.0000, -0.1355],
        [ 0.0000, -0.0964],
        [ 0.0000, -0.1010],
        [ 0.0000, -0.1302],
        [ 0.0000, -0.1924],
        [ 0.0000, -0.0544],
        [ 0.0000, -0.1322],
        [ 0.0000, -0.0628],
        [ 0.0000, -0.1676],
        [ 0.0000, -0.0663],
        [ 0.0000, -0.0846],
        [ 0.0000, -0.0421],
        [ 

sp_para_output_t:  tensor([[[ 0.0933],
         [-0.0259],
         [ 0.0154],
         [-0.0977],
         [ 0.1887],
         [ 0.0248],
         [ 0.1147],
         [ 0.0562],
         [ 0.0878],
         [ 0.1188]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([17, 2])
sp_sent.view(-1).size():  torch.Size([17])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.0900],
        [ 0.0000, -0.2162],
        [ 0.0000, -0.0622],
        [ 0.0000, -0.1052],
        [ 0.0000, -0.0958],
        [ 0.0000, -0.0784],
        [ 0.0000, -0.0961],
        [ 0.0000, -0.1142],
        [ 0.0000, -0.1013],
        [ 0.0000, -0.0570],
        [ 0.0000, -0.0864],
        [ 0.0000, -0.1050],
        [ 0.0000, -0.0794],
        [ 0.0000, -0.0938],
        [ 0.0000, -0.1562],
        [ 0.0000, -0.1105],
        [ 0.0000, -0.0507]], device='cuda:0', grad_fn=<ViewBackward>)
sp_sent.view(-1):  tensor([1, 1, 0, 0, 0, 0, 0, 0, 0

sp_para_output_t:  tensor([[[-0.0446],
         [ 0.0044],
         [ 0.1429],
         [ 0.0334],
         [ 0.0349],
         [-0.0367],
         [-0.1092],
         [-0.0043],
         [-0.0683],
         [-0.0656]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([31, 2])
sp_sent.view(-1).size():  torch.Size([31])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1163],
        [ 0.0000, -0.0510],
        [ 0.0000, -0.1686],
        [ 0.0000, -0.0275],
        [ 0.0000, -0.0627],
        [ 0.0000, -0.0645],
        [ 0.0000, -0.1069],
        [ 0.0000, -0.0664],
        [ 0.0000, -0.0113],
        [ 0.0000, -0.1230],
        [ 0.0000, -0.0915],
        [ 0.0000, -0.1288],
        [ 0.0000, -0.0714],
        [ 0.0000, -0.0844],
        [ 0.0000, -0.1222],
        [ 0.0000, -0.0631],
        [ 0.0000, -0.0971],
        [ 0.0000, -0.2064],
        [ 0.0000, -0.1875],
        [ 0.0000, -0.1105],
        [ 

sp_para_output_t:  tensor([[[ 0.0172],
         [ 0.0117],
         [ 0.1004],
         [-0.0019],
         [ 0.0141],
         [ 0.0373],
         [ 0.0097],
         [-0.0186],
         [-0.0215],
         [-0.0275]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1101],
        [ 0.0000, -0.0927],
        [ 0.0000, -0.0914],
        [ 0.0000, -0.0537],
        [ 0.0000, -0.0988],
        [ 0.0000, -0.0978],
        [ 0.0000, -0.1593],
        [ 0.0000, -0.0617],
        [ 0.0000,  0.0065],
        [ 0.0000, -0.1586],
        [ 0.0000, -0.0284],
        [ 0.0000, -0.1183],
        [ 0.0000, -0.1387],
        [ 0.0000, -0.0753],
        [ 0.0000, -0.1316],
        [ 0.0000, -0.0692],
        [ 0.0000, -0.0469],
        [ 0.0000, -0.1082],
        [ 0.0000, -0.0950],
        [ 0.0000, -0.0888],
        [ 

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

sp_para_output_t:  tensor([[[ 0.0901],
         [-0.0195],
         [-0.0067],
         [ 0.0177],
         [ 0.0817],
         [ 0.0364],
         [ 0.0061],
         [ 0.0639],
         [-0.0488],
         [ 0.0101]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1157],
        [ 0.0000, -0.1259],
        [ 0.0000, -0.1418],
        [ 0.0000, -0.0744],
        [ 0.0000, -0.0833],
        [ 0.0000, -0.0793],
        [ 0.0000, -0.0952],
        [ 0.0000, -0.0806],
        [ 0.0000, -0.0716],
        [ 0.0000, -0.1438],
        [ 0.0000, -0.0844],
        [ 0.0000, -0.1506],
        [ 0.0000, -0.1929],
        [ 0.0000, -0.0848],
        [ 0.0000, -0.1407],
        [ 0.0000, -0.1292],
        [ 0.0000, -0.0892],
        [ 0.0000, -0.0972],
        [ 0.0000, -0.1099],
        [ 0.0000, -0.1913],
        [ 0.0000, -0.1587],
        [ 0.0

predict_support_sent.view(-1, 2).size():  torch.Size([36, 2])
sp_sent.view(-1).size():  torch.Size([36])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1068],
        [ 0.0000, -0.0640],
        [ 0.0000, -0.0676],
        [ 0.0000, -0.1085],
        [ 0.0000, -0.0630],
        [ 0.0000, -0.0736],
        [ 0.0000, -0.1416],
        [ 0.0000, -0.1567],
        [ 0.0000, -0.0592],
        [ 0.0000, -0.0861],
        [ 0.0000, -0.1407],
        [ 0.0000, -0.1338],
        [ 0.0000, -0.1104],
        [ 0.0000, -0.0996],
        [ 0.0000, -0.1478],
        [ 0.0000, -0.0901],
        [ 0.0000, -0.1766],
        [ 0.0000, -0.1583],
        [ 0.0000, -0.1232],
        [ 0.0000, -0.1980],
        [ 0.0000, -0.1346],
        [ 0.0000, -0.0945],
        [ 0.0000, -0.0834],
        [ 0.0000, -0.1376],
        [ 0.0000, -0.1074],
        [ 0.0000, -0.1254],
        [ 0.0000,  0.0041],
        [ 0.0000, -0.0866],
        [ 0.0000, -0.0796],
        [ 0.0000,  0.0129],
        [ 0.0000, -0

predict_support_sent.view(-1, 2).size():  torch.Size([6, 2])
sp_sent.view(-1).size():  torch.Size([6])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1124],
        [ 0.0000, -0.0406],
        [ 0.0000, -0.0723],
        [ 0.0000, -0.0481],
        [ 0.0000, -0.0615],
        [ 0.0000, -0.0533]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 1, 1, 1, 0, 0], device='cuda:0')
question_end_index:  tensor([12], device='cuda:0')
start_logits:  tensor([ 0.1436,  0.0114,  0.0687,  0.1251,  0.3259,  0.0838,  0.2172,  0.0881,
         0.2820, -0.0008,  0.2222,  0.3418,  0.2158,  0.3408,  0.2158,  0.1436,
         0.2216, -0.0031,  0.2573,  0.2080,  0.0301,  0.1644, -0.2681,  0.0871,
         0.2042, -0.0099,  0.2720,  0.1788, -0.0342,  0.3594,  0.0970,  0.5493,
         0.0942, -0.0945,  0.2214, -0.0228,  0.2920, -0.1113,  0.6606,  0.3142,
         0.2710,  0.2822,  0.1389,  0.1366,  0.1277,  0.2139,  0.0567,  0.0771,
         0.1787, -0.0349,  0.2500,  0.2252, -0.0858,  0.1891, -0.17

predict_support_sent.view(-1, 2).size():  torch.Size([28, 2])
sp_sent.view(-1).size():  torch.Size([28])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1059],
        [ 0.0000, -0.1111],
        [ 0.0000, -0.0841],
        [ 0.0000, -0.0674],
        [ 0.0000, -0.0891],
        [ 0.0000, -0.0496],
        [ 0.0000, -0.0903],
        [ 0.0000, -0.0882],
        [ 0.0000, -0.0849],
        [ 0.0000, -0.0974],
        [ 0.0000, -0.0829],
        [ 0.0000, -0.0496],
        [ 0.0000, -0.1169],
        [ 0.0000, -0.1060],
        [ 0.0000, -0.0505],
        [ 0.0000, -0.1137],
        [ 0.0000, -0.0881],
        [ 0.0000, -0.0760],
        [ 0.0000, -0.1248],
        [ 0.0000, -0.0894],
        [ 0.0000, -0.1262],
        [ 0.0000, -0.1137],
        [ 0.0000, -0.0737],
        [ 0.0000, -0.0793],
        [ 0.0000, -0.1091],
        [ 0.0000, -0.0986],
        [ 0.0000, -0.0656],
        [ 0.0000, -0.0496]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0

predict_support_sent.view(-1, 2).size():  torch.Size([16, 2])
sp_sent.view(-1).size():  torch.Size([16])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.0491],
        [ 0.0000, -0.0528],
        [ 0.0000, -0.0107],
        [ 0.0000, -0.0584],
        [ 0.0000, -0.0735],
        [ 0.0000, -0.0463],
        [ 0.0000, -0.0881],
        [ 0.0000, -0.0582],
        [ 0.0000, -0.0368],
        [ 0.0000, -0.0839],
        [ 0.0000, -0.0842],
        [ 0.0000, -0.0479],
        [ 0.0000, -0.0687],
        [ 0.0000, -0.0406],
        [ 0.0000, -0.0947],
        [ 0.0000, -0.0844]], device='cuda:0')
sp_sent.view(-1):  tensor([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], device='cuda:0')
question_end_index:  tensor([15], device='cuda:0')
start_logits:  tensor([ 0.3318,  0.3398,  0.3687,  0.4067,  0.5371,  0.5737,  0.3877,  0.3132,
         0.1542,  0.2434,  0.5337,  0.5264,  0.6816,  0.3657,  0.3655,  0.3655,
         0.3345,  0.2350,  0.3430,  0.2010,  0.0657,  0.3428,  0.2330,  0.3

predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.0957],
        [ 0.0000, -0.0897],
        [ 0.0000, -0.1116],
        [ 0.0000, -0.0258],
        [ 0.0000, -0.1223],
        [ 0.0000, -0.1537],
        [ 0.0000, -0.0848],
        [ 0.0000, -0.1791],
        [ 0.0000, -0.1703],
        [ 0.0000, -0.1243],
        [ 0.0000, -0.1794],
        [ 0.0000, -0.1186],
        [ 0.0000, -0.1510],
        [ 0.0000, -0.1345],
        [ 0.0000, -0.1110],
        [ 0.0000, -0.1620],
        [ 0.0000, -0.1257],
        [ 0.0000, -0.0626]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([37], device='cuda:0')
start_logits:  tensor([ 0.2612,  0.2117,  0.2786,  0.7520,  0.2546,  0.2474,  0.7002,  0.4451,
         0.0506,  0.5273,  0.5073,  0.2378,  0.6411,  0.6982,  0.7427,  0.3977,
         0.43

predict_support_sent.view(-1, 2).size():  torch.Size([29, 2])
sp_sent.view(-1).size():  torch.Size([29])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.0677],
        [ 0.0000, -0.0418],
        [ 0.0000, -0.0683],
        [ 0.0000, -0.1189],
        [ 0.0000, -0.0700],
        [ 0.0000, -0.0538],
        [ 0.0000, -0.0656],
        [ 0.0000, -0.0869],
        [ 0.0000, -0.0522],
        [ 0.0000, -0.0946],
        [ 0.0000, -0.0304],
        [ 0.0000, -0.0472],
        [ 0.0000, -0.0301],
        [ 0.0000, -0.0375],
        [ 0.0000, -0.0398],
        [ 0.0000, -0.1040],
        [ 0.0000, -0.0388],
        [ 0.0000, -0.0764],
        [ 0.0000, -0.0591],
        [ 0.0000, -0.0634],
        [ 0.0000, -0.1202],
        [ 0.0000, -0.0975],
        [ 0.0000, -0.0797],
        [ 0.0000, -0.0804],
        [ 0.0000, -0.0851],
        [ 0.0000, -0.0847],
        [ 0.0000, -0.0981],
        [ 0.0000, -0.0632],
        [ 0.0000, -0.0600]], device='cuda:0')
sp_sent.view(-1):  tensor([1, 

predict_support_sent.view(-1, 2).size():  torch.Size([13, 2])
sp_sent.view(-1).size():  torch.Size([13])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.0862],
        [ 0.0000, -0.0834],
        [ 0.0000,  0.0076],
        [ 0.0000, -0.0702],
        [ 0.0000, -0.1074],
        [ 0.0000, -0.0404],
        [ 0.0000, -0.0043],
        [ 0.0000, -0.0199],
        [ 0.0000, -0.0089],
        [ 0.0000, -0.0643],
        [ 0.0000, -0.0600],
        [ 0.0000, -0.0181],
        [ 0.0000, -0.0622]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([19], device='cuda:0')
start_logits:  tensor([ 3.2642e-01,  1.5979e-01,  2.5073e-01,  4.8999e-01,  2.3975e-01,
         4.0796e-01,  4.4531e-01,  1.8457e-01,  1.1261e-01,  2.7539e-01,
         3.4351e-01,  5.7861e-01,  7.7942e-02,  3.3752e-02,  1.6809e-01,
         2.6611e-01,  4.3311e-01,  5.7422e-01,  5.3223e-01,  7.8278e-03,
         3.3691e-01,  1.8738e-02,  1

predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1014],
        [ 0.0000, -0.0510],
        [ 0.0000, -0.0374],
        [ 0.0000, -0.0500],
        [ 0.0000, -0.0685],
        [ 0.0000, -0.0669],
        [ 0.0000, -0.0377],
        [ 0.0000, -0.0519],
        [ 0.0000, -0.1173],
        [ 0.0000, -0.0779],
        [ 0.0000, -0.1087],
        [ 0.0000, -0.1014],
        [ 0.0000, -0.1036],
        [ 0.0000, -0.0992],
        [ 0.0000, -0.1137],
        [ 0.0000, -0.1177],
        [ 0.0000, -0.0368],
        [ 0.0000, -0.0935]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([17], device='cuda:0')
start_logits:  tensor([ 0.3086,  0.2441,  0.4805,  0.6699,  0.3679,  0.5083,  0.5693,  0.5493,
         0.6191,  0.3079,  0.6025,  0.3621,  0.0825,  0.2385,  0.2048,  0.2559,
         0.20

predict_support_sent.view(-1, 2).size():  torch.Size([24, 2])
sp_sent.view(-1).size():  torch.Size([24])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1157],
        [ 0.0000, -0.0610],
        [ 0.0000, -0.0718],
        [ 0.0000, -0.0910],
        [ 0.0000, -0.0425],
        [ 0.0000, -0.0900],
        [ 0.0000, -0.0400],
        [ 0.0000, -0.0339],
        [ 0.0000, -0.0626],
        [ 0.0000, -0.0759],
        [ 0.0000, -0.0481],
        [ 0.0000, -0.0610],
        [ 0.0000, -0.0400],
        [ 0.0000, -0.0892],
        [ 0.0000, -0.0355],
        [ 0.0000, -0.1018],
        [ 0.0000, -0.1055],
        [ 0.0000, -0.1156],
        [ 0.0000, -0.0730],
        [ 0.0000, -0.0834],
        [ 0.0000, -0.0400],
        [ 0.0000, -0.0761],
        [ 0.0000, -0.0400],
        [ 0.0000, -0.0400]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([13], device='cuda

predict_support_sent.view(-1, 2).size():  torch.Size([17, 2])
sp_sent.view(-1).size():  torch.Size([17])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1225],
        [ 0.0000, -0.0771],
        [ 0.0000, -0.1059],
        [ 0.0000, -0.0454],
        [ 0.0000, -0.0479],
        [ 0.0000, -0.0721],
        [ 0.0000, -0.0909],
        [ 0.0000, -0.0646],
        [ 0.0000, -0.0768],
        [ 0.0000, -0.1004],
        [ 0.0000, -0.1317],
        [ 0.0000, -0.0312],
        [ 0.0000, -0.0538],
        [ 0.0000, -0.1381],
        [ 0.0000, -0.0922],
        [ 0.0000, -0.0517],
        [ 0.0000, -0.1138]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([22], device='cuda:0')
start_logits:  tensor([ 3.1396e-01,  2.1191e-01,  1.5344e-01,  2.9150e-01,  2.9321e-01,
         4.1064e-01,  2.8052e-01,  5.6104e-01,  5.6641e-01,  3.1934e-01,
         2.6392e-01,  1.7053e-01,  2.4582e-02,  4.8853e-01

predict_support_sent.view(-1, 2).size():  torch.Size([21, 2])
sp_sent.view(-1).size():  torch.Size([21])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1069],
        [ 0.0000, -0.1327],
        [ 0.0000, -0.1263],
        [ 0.0000, -0.1140],
        [ 0.0000, -0.0640],
        [ 0.0000, -0.1625],
        [ 0.0000, -0.1154],
        [ 0.0000, -0.0433],
        [ 0.0000, -0.1032],
        [ 0.0000, -0.0400],
        [ 0.0000, -0.1571],
        [ 0.0000, -0.0638],
        [ 0.0000, -0.0623],
        [ 0.0000, -0.1569],
        [ 0.0000, -0.1423],
        [ 0.0000, -0.1130],
        [ 0.0000, -0.1473],
        [ 0.0000, -0.0924],
        [ 0.0000, -0.1042],
        [ 0.0000, -0.0834],
        [ 0.0000, -0.1147]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([17], device='cuda:0')
start_logits:  tensor([ 3.2153e-01,  1.5759e-01,  3.4033e-01,  2.3462e-01,  2.4719e-01,


predict_support_sent.view(-1, 2).size():  torch.Size([19, 2])
sp_sent.view(-1).size():  torch.Size([19])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1294],
        [ 0.0000, -0.0897],
        [ 0.0000, -0.1693],
        [ 0.0000, -0.1714],
        [ 0.0000, -0.0572],
        [ 0.0000, -0.1118],
        [ 0.0000, -0.0903],
        [ 0.0000, -0.0479],
        [ 0.0000, -0.0368],
        [ 0.0000, -0.0232],
        [ 0.0000, -0.1019],
        [ 0.0000, -0.0678],
        [ 0.0000, -0.1147],
        [ 0.0000, -0.1885],
        [ 0.0000, -0.1086],
        [ 0.0000, -0.1666],
        [ 0.0000, -0.0899],
        [ 0.0000, -0.1093],
        [ 0.0000, -0.1414]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([20], device='cuda:0')
start_logits:  tensor([ 2.3181e-01,  5.4016e-02,  1.7529e-01,  2.7930e-01,  6.6772e-02,
         8.1152e-01,  4.2456e-01,  1.1835e-01,  2.6929e-01,  1

predict_support_sent.view(-1, 2).size():  torch.Size([22, 2])
sp_sent.view(-1).size():  torch.Size([22])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.0908],
        [ 0.0000, -0.0757],
        [ 0.0000, -0.0664],
        [ 0.0000, -0.0544],
        [ 0.0000, -0.0500],
        [ 0.0000, -0.0432],
        [ 0.0000, -0.0461],
        [ 0.0000, -0.0707],
        [ 0.0000, -0.0432],
        [ 0.0000, -0.0432],
        [ 0.0000, -0.0670],
        [ 0.0000, -0.0775],
        [ 0.0000, -0.0431],
        [ 0.0000, -0.0432],
        [ 0.0000, -0.0432],
        [ 0.0000, -0.0486],
        [ 0.0000, -0.1055],
        [ 0.0000, -0.1464],
        [ 0.0000, -0.1136],
        [ 0.0000, -0.0432],
        [ 0.0000, -0.0890],
        [ 0.0000, -0.0941]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([27], device='cuda:0')
start_logits:  tensor([ 2.8540e-01,  3.0640e-02,  2.6562e

predict_support_sent.view(-1, 2).size():  torch.Size([24, 2])
sp_sent.view(-1).size():  torch.Size([24])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1396],
        [ 0.0000, -0.0621],
        [ 0.0000, -0.1002],
        [ 0.0000, -0.0840],
        [ 0.0000, -0.0803],
        [ 0.0000, -0.1105],
        [ 0.0000, -0.0925],
        [ 0.0000, -0.0807],
        [ 0.0000, -0.1227],
        [ 0.0000, -0.0727],
        [ 0.0000, -0.0537],
        [ 0.0000, -0.1103],
        [ 0.0000, -0.0359],
        [ 0.0000, -0.1301],
        [ 0.0000, -0.1362],
        [ 0.0000, -0.1442],
        [ 0.0000, -0.1396],
        [ 0.0000, -0.1339],
        [ 0.0000, -0.1438],
        [ 0.0000, -0.0500],
        [ 0.0000, -0.0892],
        [ 0.0000, -0.0825],
        [ 0.0000, -0.1193],
        [ 0.0000, -0.1373]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([21], device='cuda

predict_support_sent.view(-1, 2).size():  torch.Size([30, 2])
sp_sent.view(-1).size():  torch.Size([30])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1366],
        [ 0.0000, -0.1190],
        [ 0.0000, -0.0848],
        [ 0.0000, -0.0585],
        [ 0.0000, -0.1444],
        [ 0.0000, -0.0884],
        [ 0.0000, -0.0504],
        [ 0.0000, -0.0856],
        [ 0.0000, -0.0541],
        [ 0.0000, -0.0436],
        [ 0.0000, -0.0482],
        [ 0.0000, -0.0831],
        [ 0.0000, -0.0567],
        [ 0.0000, -0.0734],
        [ 0.0000, -0.0630],
        [ 0.0000, -0.1659],
        [ 0.0000, -0.1353],
        [ 0.0000, -0.0627],
        [ 0.0000, -0.1425],
        [ 0.0000, -0.1589],
        [ 0.0000, -0.1809],
        [ 0.0000, -0.1309],
        [ 0.0000, -0.1299],
        [ 0.0000, -0.1385],
        [ 0.0000, -0.0709],
        [ 0.0000, -0.0892],
        [ 0.0000, -0.1550],
        [ 0.0000, -0.1475],
        [ 0.0000, -0.1060],
        [ 0.0000, -0.1185]], device='cuda:0')
sp

predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.0136],
        [ 0.0000, -0.0547],
        [ 0.0000, -0.0530],
        [ 0.0000, -0.0834],
        [ 0.0000, -0.1025],
        [ 0.0000, -0.0990],
        [ 0.0000, -0.0414],
        [ 0.0000, -0.1121],
        [ 0.0000, -0.0555],
        [ 0.0000, -0.1298],
        [ 0.0000, -0.1089],
        [ 0.0000, -0.0497],
        [ 0.0000, -0.1566],
        [ 0.0000, -0.1140],
        [ 0.0000, -0.0418],
        [ 0.0000, -0.1682],
        [ 0.0000, -0.1481],
        [ 0.0000, -0.1285]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([39], device='cuda:0')
start_logits:  tensor([ 2.9956e-01,  1.3904e-01,  1.4014e-01,  2.2522e-01,  3.1323e-01,
         4.6973e-01,  3.1299e-01,  3.6816e-01,  2.3596e-01,  3.0444e-01,
         7.8955e-01,  3.266

predict_support_sent.view(-1, 2).size():  torch.Size([21, 2])
sp_sent.view(-1).size():  torch.Size([21])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.0891],
        [ 0.0000, -0.0854],
        [ 0.0000, -0.0398],
        [ 0.0000, -0.0823],
        [ 0.0000, -0.0398],
        [ 0.0000, -0.0735],
        [ 0.0000, -0.0352],
        [ 0.0000, -0.0398],
        [ 0.0000, -0.0398],
        [ 0.0000, -0.0398],
        [ 0.0000, -0.0495],
        [ 0.0000, -0.0662],
        [ 0.0000, -0.0635],
        [ 0.0000, -0.0398],
        [ 0.0000, -0.0398],
        [ 0.0000, -0.0398],
        [ 0.0000, -0.0964],
        [ 0.0000, -0.0398],
        [ 0.0000, -0.0398],
        [ 0.0000, -0.0741],
        [ 0.0000, -0.0398]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([29], device='cuda:0')
start_logits:  tensor([ 2.2510e-01,  1.1040e-02,  1.0999e-01,  2.6978e-01,  2.2095e-01,


predict_support_sent.view(-1, 2).size():  torch.Size([20, 2])
sp_sent.view(-1).size():  torch.Size([20])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1045],
        [ 0.0000, -0.1135],
        [ 0.0000, -0.1072],
        [ 0.0000, -0.1261],
        [ 0.0000, -0.1105],
        [ 0.0000, -0.1116],
        [ 0.0000, -0.0935],
        [ 0.0000, -0.1481],
        [ 0.0000, -0.1259],
        [ 0.0000, -0.1255],
        [ 0.0000, -0.1212],
        [ 0.0000, -0.1371],
        [ 0.0000, -0.1410],
        [ 0.0000, -0.1229],
        [ 0.0000, -0.1277],
        [ 0.0000, -0.0886],
        [ 0.0000, -0.0861],
        [ 0.0000, -0.1218],
        [ 0.0000, -0.1176],
        [ 0.0000, -0.0891]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([19], device='cuda:0')
start_logits:  tensor([ 1.9873e-01,  4.2725e-02,  2.4060e-01,  1.1639e-01,  3.9233e-01,
         4.6069e-01,  4.6045e-0

predict_support_sent.view(-1, 2).size():  torch.Size([15, 2])
sp_sent.view(-1).size():  torch.Size([15])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1334],
        [ 0.0000, -0.1633],
        [ 0.0000, -0.1482],
        [ 0.0000, -0.1203],
        [ 0.0000, -0.0792],
        [ 0.0000, -0.1342],
        [ 0.0000, -0.1528],
        [ 0.0000, -0.1796],
        [ 0.0000, -0.1573],
        [ 0.0000, -0.0838],
        [ 0.0000, -0.1877],
        [ 0.0000, -0.2148],
        [ 0.0000, -0.0590],
        [ 0.0000, -0.1560],
        [ 0.0000, -0.1025]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([11], device='cuda:0')
start_logits:  tensor([ 0.3337,  0.1531,  0.3093,  0.4683,  0.4150,  0.4285,  0.4260,  0.0325,
        -0.0597,  0.2573,  0.2457,  0.0149,  0.3579, -0.1857,  0.4246,  0.1212,
         0.2969,  0.2107,  0.2301,  0.0230,  0.4021,  0.0560,  0.2715,  0.1299,
         0.2306,  0.5557, 

predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1144],
        [ 0.0000, -0.0363],
        [ 0.0000, -0.0449],
        [ 0.0000, -0.0572],
        [ 0.0000, -0.0363],
        [ 0.0000, -0.0363],
        [ 0.0000, -0.0363],
        [ 0.0000, -0.0363],
        [ 0.0000, -0.0363],
        [ 0.0000, -0.0760],
        [ 0.0000, -0.0363],
        [ 0.0000, -0.0363],
        [ 0.0000, -0.0363],
        [ 0.0000, -0.0363],
        [ 0.0000, -0.1185],
        [ 0.0000, -0.0363],
        [ 0.0000, -0.0363],
        [ 0.0000, -0.0363]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0], device='cuda:0')
question_end_index:  tensor([22], device='cuda:0')
start_logits:  tensor([ 0.3191,  0.1268,  0.2384,  0.4580,  0.6060,  0.1632,  0.4763,  0.0811,
         0.1913,  0.3528,  0.2089,  0.2676,  0.0329,  0.1796,  0.3806,  0.4563,
         0.08

start_logits:  tensor([ 2.6880e-01,  7.6111e-02,  1.5063e-01,  1.3599e-01,  7.6074e-01,
         3.6572e-01,  2.7881e-01,  1.0907e-01,  4.2529e-01,  2.2119e-01,
         4.5874e-01, -3.5522e-01,  3.2739e-01,  2.2180e-01,  6.7200e-02,
         3.0273e-01, -1.1542e-01,  3.6896e-02,  4.4189e-01,  8.4277e-01,
         5.6006e-01,  5.4395e-01,  5.9521e-01,  5.7275e-01,  2.6099e-01,
         2.7783e-01,  8.9551e-01,  3.8232e-01,  5.2832e-01,  3.2593e-01,
         2.2546e-01, -1.0950e-01,  4.1260e-01,  2.0154e-01,  4.7046e-01,
         4.9170e-01,  2.6245e-01, -1.3977e-01,  4.9463e-01,  3.7378e-01,
        -3.5474e-01,  6.2402e-01,  4.7974e-01,  6.1584e-02,  5.2637e-01,
         6.6357e-01,  3.7085e-01, -4.9639e-04,  3.8062e-01,  4.8608e-01,
         4.0820e-01,  2.4268e-01,  4.5654e-01,  1.4380e-01,  1.7371e-01,
         2.9443e-01,  1.5784e-01,  7.9395e-01,  1.2341e-01,  6.0645e-01,
         4.3433e-01,  2.9150e-01,  2.8503e-02,  2.9480e-02,  3.4766e-01,
         2.1692e-01,  2.5024e-01, -5

predict_support_sent.view(-1, 2).size():  torch.Size([33, 2])
sp_sent.view(-1).size():  torch.Size([33])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.0196],
        [ 0.0000, -0.0987],
        [ 0.0000, -0.1239],
        [ 0.0000, -0.0630],
        [ 0.0000, -0.0889],
        [ 0.0000, -0.0446],
        [ 0.0000, -0.1365],
        [ 0.0000, -0.0564],
        [ 0.0000, -0.1294],
        [ 0.0000, -0.0900],
        [ 0.0000, -0.1260],
        [ 0.0000, -0.1394],
        [ 0.0000, -0.1287],
        [ 0.0000, -0.1213],
        [ 0.0000, -0.1461],
        [ 0.0000, -0.0724],
        [ 0.0000, -0.1100],
        [ 0.0000, -0.0573],
        [ 0.0000, -0.1108],
        [ 0.0000, -0.0981],
        [ 0.0000, -0.0647],
        [ 0.0000, -0.0516],
        [ 0.0000, -0.0854],
        [ 0.0000, -0.0913],
        [ 0.0000, -0.0873],
        [ 0.0000, -0.0518],
        [ 0.0000, -0.0264],
        [ 0.0000, -0.0651],
        [ 0.0000, -0.1177],
        [ 0.0000, -0.0608],
        [ 0.0000, -0

predict_support_sent.view(-1, 2).size():  torch.Size([31, 2])
sp_sent.view(-1).size():  torch.Size([31])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.0697],
        [ 0.0000, -0.1179],
        [ 0.0000, -0.1326],
        [ 0.0000, -0.1074],
        [ 0.0000, -0.0827],
        [ 0.0000, -0.0988],
        [ 0.0000, -0.0622],
        [ 0.0000, -0.0712],
        [ 0.0000, -0.1191],
        [ 0.0000, -0.1517],
        [ 0.0000, -0.1210],
        [ 0.0000, -0.1588],
        [ 0.0000, -0.1277],
        [ 0.0000, -0.1934],
        [ 0.0000, -0.1863],
        [ 0.0000, -0.1705],
        [ 0.0000, -0.1598],
        [ 0.0000, -0.0899],
        [ 0.0000, -0.1232],
        [ 0.0000, -0.1119],
        [ 0.0000, -0.1677],
        [ 0.0000, -0.1232],
        [ 0.0000, -0.1058],
        [ 0.0000, -0.0898],
        [ 0.0000, -0.1145],
        [ 0.0000, -0.1040],
        [ 0.0000, -0.1111],
        [ 0.0000, -0.0558],
        [ 0.0000, -0.1617],
        [ 0.0000, -0.0848],
        [ 0.0000, -0

predict_support_sent.view(-1, 2).size():  torch.Size([19, 2])
sp_sent.view(-1).size():  torch.Size([19])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.0762],
        [ 0.0000, -0.0370],
        [ 0.0000, -0.0372],
        [ 0.0000, -0.0643],
        [ 0.0000, -0.0371],
        [ 0.0000, -0.0370],
        [ 0.0000, -0.0371],
        [ 0.0000, -0.0370],
        [ 0.0000, -0.0511],
        [ 0.0000, -0.0370],
        [ 0.0000, -0.0615],
        [ 0.0000, -0.0768],
        [ 0.0000, -0.0525],
        [ 0.0000, -0.0785],
        [ 0.0000, -0.0681],
        [ 0.0000, -0.0370],
        [ 0.0000, -0.0954],
        [ 0.0000, -0.0811],
        [ 0.0000, -0.0370]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([29], device='cuda:0')
start_logits:  tensor([ 1.5344e-01,  1.0889e-01,  1.6748e-01,  1.6785e-01,  3.5840e-01,
        -2.0813e-02,  6.6406e-01,  4.3457e-02,  1.1523e-01, -3

start_logits:  tensor([ 3.1592e-01,  1.3904e-01,  3.5620e-01,  3.3398e-01,  2.9221e-02,
         5.3174e-01,  9.3201e-02,  1.8774e-01,  2.0789e-01,  9.9805e-01,
         3.4399e-01,  1.9128e-01,  1.4856e-01,  3.2129e-01, -1.5625e-01,
         3.9819e-01,  4.8633e-01,  3.5840e-01,  1.2622e-01,  4.2554e-01,
         3.7354e-01, -5.4504e-02,  3.2739e-01, -2.1790e-01,  1.6907e-01,
         1.2091e-01,  1.6577e-01,  4.6436e-01, -2.5574e-02,  5.7745e-04,
         1.9824e-01,  1.5564e-01,  1.3562e-01,  1.2311e-01,  3.3936e-02,
         6.9031e-02,  1.7468e-01,  2.1228e-01,  4.4604e-01, -9.9609e-02,
         3.2422e-01, -8.6243e-02,  2.0898e-01,  4.5044e-02, -3.3020e-02,
         8.5571e-02,  1.1823e-01,  5.4565e-02,  4.8553e-02,  1.9543e-01,
         2.3364e-01,  3.2397e-01,  7.4768e-02,  2.9022e-02,  2.5537e-01,
         2.8467e-01,  2.0288e-01, -4.6112e-02,  1.7395e-01,  1.9507e-01,
        -6.2073e-02,  3.9093e-02,  1.2793e-01,  8.1848e-02,  1.1078e-01,
         5.5511e-02,  7.2021e-02,  1

sp_para_output_t:  tensor([[[ 0.0108],
         [-0.0140],
         [-0.0169],
         [ 0.0097],
         [-0.0190],
         [ 0.0539],
         [ 0.0052],
         [ 0.1025],
         [ 0.0374],
         [ 0.0879]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1527],
        [ 0.0000, -0.1215],
        [ 0.0000, -0.0950],
        [ 0.0000, -0.1228],
        [ 0.0000, -0.0545],
        [ 0.0000, -0.0404],
        [ 0.0000, -0.0457],
        [ 0.0000, -0.0822],
        [ 0.0000, -0.0457],
        [ 0.0000, -0.0492],
        [ 0.0000, -0.0457],
        [ 0.0000, -0.1312],
        [ 0.0000, -0.0457],
        [ 0.0000, -0.1366],
        [ 0.0000, -0.1456],
        [ 0.0000, -0.0457],
        [ 0.0000, -0.1017],
        [ 0.0000, -0.0854],
        [ 0.0000, -0.1107],
        [ 0.0000, -0.1043],
        [ 0.0000, -0.0986],
        [ 0.0


Epoch 00000: avg_val_f1 reached 0.06897 (best 0.06897), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_0.ckpt as top 5


0.06896551724137931	avg_val_prec:  0.06896551724137931	avg_val_recall:  0.06896551724137931
avg_val_sp_sent_f1:  0.0	avg_val_sp_sent_em:  0.0	avg_val_sp_sent_prec:  0.0	avg_val_sp_sent_recall:  0.0
avg_val_sp_para_f1:  0.1896551724137931	avg_val_sp_para_em:  0.0	avg_val_sp_para_prec:  0.1896551724137931	avg_val_sp_para_recall:  0.1896551724137931
avg_val_joint_f1:  0.0	avg_val_joint_em:  0.0	avg_val_joint_prec:  0.0	avg_val_joint_recall:  0.0




sp_para_output_t:  tensor([[[-0.0509],
         [-0.0191],
         [ 0.0381],
         [ 0.0396],
         [-0.0136],
         [ 0.0641],
         [ 0.1584],
         [ 0.0478],
         [ 0.1272],
         [-0.0372]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([14, 2])
sp_sent.view(-1).size():  torch.Size([14])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000,  0.0399],
        [ 0.0000, -0.0376],
        [ 0.0000, -0.0552],
        [ 0.0000, -0.0975],
        [ 0.0000, -0.1348],
        [ 0.0000, -0.0093],
        [ 0.0000, -0.1501],
        [ 0.0000, -0.0681],
        [ 0.0000, -0.0219],
        [ 0.0000, -0.0176],
        [ 0.0000, -0.1108],
        [ 0.0000, -0.1471],
        [ 0.0000, -0.0740],
        [ 0.0000, -0.1420]], device='cuda:0', grad_fn=<ViewBackward>)
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], device='cuda:0')
sp_para_output_t:  tensor([[[-0.0558],
         [

sp_para_output_t:  tensor([[[-0.0616],
         [-0.0662],
         [-0.0764],
         [-0.0524],
         [-0.0311],
         [ 0.0721],
         [-0.0159],
         [-0.0465],
         [-0.0079],
         [-0.0019]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([15, 2])
sp_sent.view(-1).size():  torch.Size([15])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.0510],
        [ 0.0000, -0.0267],
        [ 0.0000, -0.0214],
        [ 0.0000, -0.0139],
        [ 0.0000, -0.0700],
        [ 0.0000, -0.1085],
        [ 0.0000, -0.0767],
        [ 0.0000, -0.0145],
        [ 0.0000, -0.0790],
        [ 0.0000, -0.0974],
        [ 0.0000,  0.0045],
        [ 0.0000, -0.1089],
        [ 0.0000, -0.1142],
        [ 0.0000, -0.0982],
        [ 0.0000, -0.0573]], device='cuda:0', grad_fn=<ViewBackward>)
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
sp_para_output_t: 

sp_para_output_t:  tensor([[[0.0109],
         [0.1251],
         [0.0630],
         [0.0298],
         [0.0084],
         [0.0264],
         [0.0301],
         [0.1399],
         [0.0637],
         [0.0825]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([19, 2])
sp_sent.view(-1).size():  torch.Size([19])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1925],
        [ 0.0000, -0.1139],
        [ 0.0000, -0.0947],
        [ 0.0000, -0.1936],
        [ 0.0000, -0.1348],
        [ 0.0000, -0.1647],
        [ 0.0000, -0.0933],
        [ 0.0000, -0.1569],
        [ 0.0000, -0.1097],
        [ 0.0000, -0.1405],
        [ 0.0000, -0.1348],
        [ 0.0000, -0.1089],
        [ 0.0000, -0.1234],
        [ 0.0000, -0.1293],
        [ 0.0000, -0.0609],
        [ 0.0000, -0.0380],
        [ 0.0000, -0.0950],
        [ 0.0000, -0.2032],
        [ 0.0000, -0.0701]], device='cuda:0', grad_fn=<ViewBackward>)
sp_sen

predict_support_sent.view(-1, 2).size():  torch.Size([26, 2])
sp_sent.view(-1).size():  torch.Size([26])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2510],
        [ 0.0000, -0.0971],
        [ 0.0000, -0.4031],
        [ 0.0000, -0.2302],
        [ 0.0000, -0.1031],
        [ 0.0000, -0.2339],
        [ 0.0000, -0.1816],
        [ 0.0000, -0.1589],
        [ 0.0000, -0.1622],
        [ 0.0000, -0.0986],
        [ 0.0000, -0.1526],
        [ 0.0000, -0.2517],
        [ 0.0000, -0.2698],
        [ 0.0000, -0.2629],
        [ 0.0000, -0.2144],
        [ 0.0000, -0.1592],
        [ 0.0000, -0.2288],
        [ 0.0000, -0.2542],
        [ 0.0000, -0.2343],
        [ 0.0000, -0.2288],
        [ 0.0000, -0.1735],
        [ 0.0000, -0.2874],
        [ 0.0000, -0.2079],
        [ 0.0000, -0.2413],
        [ 0.0000, -0.2537],
        [ 0.0000, -0.2455]], device='cuda:0', grad_fn=<ViewBackward>)
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

sp_para_output_t:  tensor([[[ 0.0901],
         [-0.0195],
         [-0.0067],
         [ 0.0177],
         [ 0.0817],
         [ 0.0364],
         [ 0.0061],
         [ 0.0639],
         [-0.0488],
         [ 0.0101]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2617],
        [ 0.0000, -0.2759],
        [ 0.0000, -0.3049],
        [ 0.0000, -0.2286],
        [ 0.0000, -0.2634],
        [ 0.0000, -0.2291],
        [ 0.0000, -0.2480],
        [ 0.0000, -0.2463],
        [ 0.0000, -0.2366],
        [ 0.0000, -0.3286],
        [ 0.0000, -0.2771],
        [ 0.0000, -0.3459],
        [ 0.0000, -0.3743],
        [ 0.0000, -0.2732],
        [ 0.0000, -0.3474],
        [ 0.0000, -0.3074],
        [ 0.0000, -0.2581],
        [ 0.0000, -0.2632],
        [ 0.0000, -0.2783],
        [ 0.0000, -0.3367],
        [ 0.0000, -0.3367],
        [ 0.0

sp_para_output_t:  tensor([[[ 0.0432],
         [-0.0465],
         [ 0.0324],
         [ 0.0601],
         [ 0.0131],
         [-0.0060],
         [ 0.0294],
         [ 0.0386],
         [-0.0420],
         [-0.0339]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([36, 2])
sp_sent.view(-1).size():  torch.Size([36])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2866],
        [ 0.0000, -0.2434],
        [ 0.0000, -0.2285],
        [ 0.0000, -0.2930],
        [ 0.0000, -0.2427],
        [ 0.0000, -0.2976],
        [ 0.0000, -0.2949],
        [ 0.0000, -0.3418],
        [ 0.0000, -0.2522],
        [ 0.0000, -0.2864],
        [ 0.0000, -0.3511],
        [ 0.0000, -0.3398],
        [ 0.0000, -0.3333],
        [ 0.0000, -0.3091],
        [ 0.0000, -0.3711],
        [ 0.0000, -0.2717],
        [ 0.0000, -0.3408],
        [ 0.0000, -0.3162],
        [ 0.0000, -0.2905],
        [ 0.0000, -0.3745],
        [ 0.0000, -0.3286],
        [ 0.0

sp_para_output_t:  tensor([[[-0.0082],
         [-0.0432],
         [ 0.0058],
         [ 0.0692],
         [ 0.0869],
         [ 0.0693],
         [-0.0261],
         [ 0.0582],
         [-0.0482],
         [ 0.0462]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([6, 2])
sp_sent.view(-1).size():  torch.Size([6])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2191],
        [ 0.0000, -0.1307],
        [ 0.0000, -0.1694],
        [ 0.0000, -0.1425],
        [ 0.0000, -0.1742],
        [ 0.0000, -0.1094]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 1, 1, 1, 0, 0], device='cuda:0')
question_end_index:  tensor([12], device='cuda:0')
start_logits:  tensor([ 0.1138, -0.0881,  0.0205,  0.0424,  0.2844,  0.0497,  0.1992,  0.0443,
         0.2690, -0.0451,  0.2144,  0.2971,  0.2114,  0.2808,  0.2115,  0.0974,
         0.2070, -0.0312,  0.2388,  0.1903,  0.0021,  0.1071, -0.3989,  0.0452,
         0.1819, -0.0382,  0.2556,  0.1575, -0.0

predict_support_sent.view(-1, 2).size():  torch.Size([28, 2])
sp_sent.view(-1).size():  torch.Size([28])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2817],
        [ 0.0000, -0.2920],
        [ 0.0000, -0.2083],
        [ 0.0000, -0.1937],
        [ 0.0000, -0.3232],
        [ 0.0000, -0.2357],
        [ 0.0000, -0.2312],
        [ 0.0000, -0.3000],
        [ 0.0000, -0.3386],
        [ 0.0000, -0.3042],
        [ 0.0000, -0.2952],
        [ 0.0000, -0.2810],
        [ 0.0000, -0.2703],
        [ 0.0000, -0.3242],
        [ 0.0000, -0.3423],
        [ 0.0000, -0.3540],
        [ 0.0000, -0.3162],
        [ 0.0000, -0.3369],
        [ 0.0000, -0.3025],
        [ 0.0000, -0.2961],
        [ 0.0000, -0.3672],
        [ 0.0000, -0.3125],
        [ 0.0000, -0.2830],
        [ 0.0000, -0.2952],
        [ 0.0000, -0.3169],
        [ 0.0000, -0.2922],
        [ 0.0000, -0.2502],
        [ 0.0000, -0.1417]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0

predict_support_sent.view(-1, 2).size():  torch.Size([28, 2])
sp_sent.view(-1).size():  torch.Size([28])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2098],
        [ 0.0000, -0.2145],
        [ 0.0000, -0.1818],
        [ 0.0000, -0.1577],
        [ 0.0000, -0.1932],
        [ 0.0000, -0.1270],
        [ 0.0000, -0.2089],
        [ 0.0000, -0.2036],
        [ 0.0000, -0.1975],
        [ 0.0000, -0.2115],
        [ 0.0000, -0.2014],
        [ 0.0000, -0.2070],
        [ 0.0000, -0.2334],
        [ 0.0000, -0.2145],
        [ 0.0000, -0.1876],
        [ 0.0000, -0.2227],
        [ 0.0000, -0.1936],
        [ 0.0000, -0.2089],
        [ 0.0000, -0.2515],
        [ 0.0000, -0.2009],
        [ 0.0000, -0.2365],
        [ 0.0000, -0.2174],
        [ 0.0000, -0.1661],
        [ 0.0000, -0.1698],
        [ 0.0000, -0.2140],
        [ 0.0000, -0.1895],
        [ 0.0000, -0.1542],
        [ 0.0000, -0.0774]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0

sp_para_output_t:  tensor([[[ 0.1250],
         [-0.0010],
         [ 0.0551],
         [-0.0382],
         [ 0.1122],
         [ 0.1360],
         [ 0.0975],
         [ 0.1000],
         [ 0.0876],
         [ 0.1666]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([16, 2])
sp_sent.view(-1).size():  torch.Size([16])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1552],
        [ 0.0000, -0.1663],
        [ 0.0000, -0.1322],
        [ 0.0000, -0.1851],
        [ 0.0000, -0.1587],
        [ 0.0000, -0.1686],
        [ 0.0000, -0.1880],
        [ 0.0000, -0.1683],
        [ 0.0000, -0.1516],
        [ 0.0000, -0.2000],
        [ 0.0000, -0.1962],
        [ 0.0000, -0.1748],
        [ 0.0000, -0.1776],
        [ 0.0000, -0.1549],
        [ 0.0000, -0.2090],
        [ 0.0000, -0.1871]], device='cuda:0')
sp_sent.view(-1):  tensor([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], device='cuda:0')
question_end_index:  tensor([15], device='

       dtype=torch.float16)
type_logits:  tensor([[0.0047, 0.0699, 0.2017]], device='cuda:0', dtype=torch.float16)
s_to_p_map:  [0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4]
evidence_candidates:  {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64), 4: tensor([], device='cuda:0', dtype=torch.int64)}
para_sents_offset:  [0, 6, 9, 12, 14, 16]
sp_para_pred:  tensor([3, 4], device='cuda:0')
sp_sent_pred:  []
sp_sent_map:  {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 16, 7: 17, 8: 18, 9: 19, 10: 20, 11: 21, 12: 25, 13: 26, 14: 30, 15: 31}
sp_para_map:  {0: 0, 1: 4, 2: 5, 3: 7, 4: 9}
sp_para_output_t:  tensor([[[ 0.1091],
         [-0.0345],
         [-0.0087],
         [ 0.0497],
         [ 0.0955],
         [ 0.0043],
         [ 0.0641],
         [ 0.1007],
         [ 0.0546],
         [-0.0192]]], device='cuda:0', dtype=torch.float16)
predict

predict_support_sent.view(-1, 2).size():  torch.Size([29, 2])
sp_sent.view(-1).size():  torch.Size([29])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2275],
        [ 0.0000, -0.2185],
        [ 0.0000, -0.2495],
        [ 0.0000, -0.3157],
        [ 0.0000, -0.2510],
        [ 0.0000, -0.2627],
        [ 0.0000, -0.2576],
        [ 0.0000, -0.2644],
        [ 0.0000, -0.2415],
        [ 0.0000, -0.2903],
        [ 0.0000, -0.2289],
        [ 0.0000, -0.2551],
        [ 0.0000, -0.2260],
        [ 0.0000, -0.2396],
        [ 0.0000, -0.2382],
        [ 0.0000, -0.3325],
        [ 0.0000, -0.2566],
        [ 0.0000, -0.2739],
        [ 0.0000, -0.2437],
        [ 0.0000, -0.2277],
        [ 0.0000, -0.3088],
        [ 0.0000, -0.2986],
        [ 0.0000, -0.2725],
        [ 0.0000, -0.2546],
        [ 0.0000, -0.1997],
        [ 0.0000, -0.2585],
        [ 0.0000, -0.2832],
        [ 0.0000, -0.1493],
        [ 0.0000, -0.2598]], device='cuda:0')
sp_sent.view(-1):  tensor([1, 

predict_support_sent.view(-1, 2).size():  torch.Size([13, 2])
sp_sent.view(-1).size():  torch.Size([13])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2173],
        [ 0.0000, -0.2072],
        [ 0.0000, -0.0488],
        [ 0.0000, -0.2303],
        [ 0.0000, -0.2847],
        [ 0.0000, -0.1785],
        [ 0.0000, -0.0771],
        [ 0.0000, -0.1265],
        [ 0.0000, -0.1147],
        [ 0.0000, -0.2004],
        [ 0.0000, -0.1940],
        [ 0.0000, -0.1074],
        [ 0.0000, -0.2174]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([19], device='cuda:0')
start_logits:  tensor([ 0.3157,  0.1024,  0.2311,  0.4924,  0.2185,  0.4050,  0.4321,  0.1545,
         0.0781,  0.2406,  0.3372,  0.5635,  0.0463, -0.0010,  0.1288,  0.2534,
         0.4329,  0.5156,  0.4888, -0.1968,  0.3240, -0.1606,  0.1481,  0.4690,
         0.1065,  0.2832,  0.1382,  0.3643, -0.4238, -0.0540,  0.3369,  0.0075,
        

predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2734],
        [ 0.0000, -0.1814],
        [ 0.0000, -0.2122],
        [ 0.0000, -0.2090],
        [ 0.0000, -0.2380],
        [ 0.0000, -0.2610],
        [ 0.0000, -0.2150],
        [ 0.0000, -0.2306],
        [ 0.0000, -0.2720],
        [ 0.0000, -0.2874],
        [ 0.0000, -0.2869],
        [ 0.0000, -0.2661],
        [ 0.0000, -0.2854],
        [ 0.0000, -0.3035],
        [ 0.0000, -0.2881],
        [ 0.0000, -0.2625],
        [ 0.0000, -0.1920],
        [ 0.0000, -0.2477]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([17], device='cuda:0')
start_logits:  tensor([ 0.3000,  0.1941,  0.4380,  0.6504,  0.3274,  0.4944,  0.5225,  0.5405,
         0.5986,  0.2759,  0.5894,  0.3447,  0.0186,  0.2153,  0.1841,  0.2379,
         0.17

predict_support_sent.view(-1, 2).size():  torch.Size([24, 2])
sp_sent.view(-1).size():  torch.Size([24])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2223],
        [ 0.0000, -0.1760],
        [ 0.0000, -0.1793],
        [ 0.0000, -0.1934],
        [ 0.0000, -0.1247],
        [ 0.0000, -0.2115],
        [ 0.0000, -0.1311],
        [ 0.0000, -0.1322],
        [ 0.0000, -0.1641],
        [ 0.0000, -0.1709],
        [ 0.0000, -0.1462],
        [ 0.0000, -0.1501],
        [ 0.0000, -0.0687],
        [ 0.0000, -0.2109],
        [ 0.0000, -0.0878],
        [ 0.0000, -0.2120],
        [ 0.0000, -0.2125],
        [ 0.0000, -0.2273],
        [ 0.0000, -0.1990],
        [ 0.0000, -0.1947],
        [ 0.0000, -0.0687],
        [ 0.0000, -0.1577],
        [ 0.0000, -0.0688],
        [ 0.0000, -0.0687]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([13], device='cuda

sp_para_output_t:  tensor([[[0.0422],
         [0.1191],
         [0.1064],
         [0.1142],
         [0.0518],
         [0.0273],
         [0.1257],
         [0.1339],
         [0.0790],
         [0.0389]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([17, 2])
sp_sent.view(-1).size():  torch.Size([17])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2773],
        [ 0.0000, -0.2413],
        [ 0.0000, -0.2769],
        [ 0.0000, -0.2102],
        [ 0.0000, -0.2253],
        [ 0.0000, -0.2507],
        [ 0.0000, -0.2837],
        [ 0.0000, -0.2520],
        [ 0.0000, -0.2805],
        [ 0.0000, -0.3020],
        [ 0.0000, -0.3091],
        [ 0.0000, -0.1982],
        [ 0.0000, -0.1846],
        [ 0.0000, -0.3325],
        [ 0.0000, -0.2313],
        [ 0.0000, -0.1952],
        [ 0.0000, -0.2744]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  

sp_para_output_t:  tensor([[[ 0.0594],
         [-0.0594],
         [ 0.0709],
         [-0.0013],
         [-0.0516],
         [ 0.0499],
         [-0.0126],
         [-0.0082],
         [ 0.0620],
         [ 0.0451]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([21, 2])
sp_sent.view(-1).size():  torch.Size([21])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2766],
        [ 0.0000, -0.2944],
        [ 0.0000, -0.2600],
        [ 0.0000, -0.2637],
        [ 0.0000, -0.2157],
        [ 0.0000, -0.3506],
        [ 0.0000, -0.2896],
        [ 0.0000, -0.2065],
        [ 0.0000, -0.2668],
        [ 0.0000, -0.2185],
        [ 0.0000, -0.3650],
        [ 0.0000, -0.2233],
        [ 0.0000, -0.2205],
        [ 0.0000, -0.3420],
        [ 0.0000, -0.2686],
        [ 0.0000, -0.2654],
        [ 0.0000, -0.3120],
        [ 0.0000, -0.1447],
        [ 0.0000, -0.2448],
        [ 0.0000, -0.2275],
        [ 0.0000, -0.3015]], device='cuda

sorted_answers: [{'start': tensor(439, device='cuda:0'), 'end': tensor(443, device='cuda:0'), 'start_logit': tensor(1.0254, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4990, device='cuda:0', dtype=torch.float16)}, {'start': tensor(58, device='cuda:0'), 'end': tensor(72, device='cuda:0'), 'start_logit': tensor(1.0059, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4456, device='cuda:0', dtype=torch.float16)}, {'start': tensor(504, device='cuda:0'), 'end': tensor(518, device='cuda:0'), 'start_logit': tensor(1.0322, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2996, device='cuda:0', dtype=torch.float16)}, {'start': tensor(504, device='cuda:0'), 'end': tensor(507, device='cuda:0'), 'start_logit': tensor(1.0322, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2438, device='cuda:0', dtype=torch.float16)}, {'start': tensor(588, device='cuda:0'), 'end': tensor(596, device='cuda:0'), 'start_logit': tensor(0.9893, device='cuda:0', dtype

sp_para_output_t:  tensor([[[ 0.0203],
         [-0.0322],
         [ 0.0471],
         [ 0.0707],
         [-0.0113],
         [-0.0396],
         [ 0.0366],
         [ 0.0046],
         [ 0.0272],
         [ 0.0562]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([22, 2])
sp_sent.view(-1).size():  torch.Size([22])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1934],
        [ 0.0000, -0.1838],
        [ 0.0000, -0.1636],
        [ 0.0000, -0.1504],
        [ 0.0000, -0.1543],
        [ 0.0000, -0.0699],
        [ 0.0000, -0.1470],
        [ 0.0000, -0.1718],
        [ 0.0000, -0.0699],
        [ 0.0000, -0.0699],
        [ 0.0000, -0.1881],
        [ 0.0000, -0.1821],
        [ 0.0000, -0.0699],
        [ 0.0000, -0.0699],
        [ 0.0000, -0.0699],
        [ 0.0000, -0.1451],
        [ 0.0000, -0.2152],
        [ 0.0000, -0.2520],
        [ 0.0000, -0.2236],
        [ 0.0000, -0.0699],
        [ 0.0000, -0.1941],
        [ 0.0

sorted_answers: [{'start': tensor(76, device='cuda:0'), 'end': tensor(77, device='cuda:0'), 'start_logit': tensor(1.0234, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5479, device='cuda:0', dtype=torch.float16)}, {'start': tensor(48, device='cuda:0'), 'end': tensor(77, device='cuda:0'), 'start_logit': tensor(0.9878, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5479, device='cuda:0', dtype=torch.float16)}, {'start': tensor(76, device='cuda:0'), 'end': tensor(88, device='cuda:0'), 'start_logit': tensor(1.0234, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4663, device='cuda:0', dtype=torch.float16)}, {'start': tensor(56, device='cuda:0'), 'end': tensor(77, device='cuda:0'), 'start_logit': tensor(0.9414, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5479, device='cuda:0', dtype=torch.float16)}, {'start': tensor(76, device='cuda:0'), 'end': tensor(84, device='cuda:0'), 'start_logit': tensor(1.0234, device='cuda:0', dtype=torch.f

predict_support_sent.view(-1, 2).size():  torch.Size([30, 2])
sp_sent.view(-1).size():  torch.Size([30])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.3018],
        [ 0.0000, -0.2668],
        [ 0.0000, -0.2269],
        [ 0.0000, -0.1431],
        [ 0.0000, -0.3245],
        [ 0.0000, -0.2722],
        [ 0.0000, -0.2294],
        [ 0.0000, -0.2834],
        [ 0.0000, -0.2487],
        [ 0.0000, -0.2327],
        [ 0.0000, -0.2563],
        [ 0.0000, -0.2844],
        [ 0.0000, -0.2423],
        [ 0.0000, -0.2693],
        [ 0.0000, -0.2861],
        [ 0.0000, -0.3364],
        [ 0.0000, -0.3264],
        [ 0.0000, -0.2439],
        [ 0.0000, -0.3423],
        [ 0.0000, -0.3403],
        [ 0.0000, -0.3379],
        [ 0.0000, -0.2896],
        [ 0.0000, -0.3062],
        [ 0.0000, -0.2949],
        [ 0.0000, -0.2114],
        [ 0.0000, -0.2324],
        [ 0.0000, -0.3289],
        [ 0.0000, -0.3337],
        [ 0.0000, -0.2661],
        [ 0.0000, -0.3364]], device='cuda:0')
sp

predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1841],
        [ 0.0000, -0.2173],
        [ 0.0000, -0.2197],
        [ 0.0000, -0.2534],
        [ 0.0000, -0.3196],
        [ 0.0000, -0.2913],
        [ 0.0000, -0.2213],
        [ 0.0000, -0.3267],
        [ 0.0000, -0.2595],
        [ 0.0000, -0.3540],
        [ 0.0000, -0.3154],
        [ 0.0000, -0.2289],
        [ 0.0000, -0.3337],
        [ 0.0000, -0.3228],
        [ 0.0000, -0.2336],
        [ 0.0000, -0.3606],
        [ 0.0000, -0.3406],
        [ 0.0000, -0.3157]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([39], device='cuda:0')
start_logits:  tensor([ 0.2291,  0.0737,  0.1187,  0.1869,  0.2905,  0.4275,  0.2703,  0.3347,
         0.2075,  0.2805,  0.7822,  0.3013,  0.4275,  0.3779,  0.4290,  0.1906,
         0.19

sp_para_output_t:  tensor([[[ 0.1254],
         [-0.0410],
         [ 0.0015],
         [ 0.0570],
         [ 0.0084],
         [ 0.1212],
         [ 0.0782],
         [ 0.0258],
         [ 0.1570],
         [ 0.1115]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([21, 2])
sp_sent.view(-1).size():  torch.Size([21])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1857],
        [ 0.0000, -0.1814],
        [ 0.0000, -0.0681],
        [ 0.0000, -0.1794],
        [ 0.0000, -0.0682],
        [ 0.0000, -0.1704],
        [ 0.0000, -0.1135],
        [ 0.0000, -0.0682],
        [ 0.0000, -0.0681],
        [ 0.0000, -0.0681],
        [ 0.0000, -0.1451],
        [ 0.0000, -0.1768],
        [ 0.0000, -0.1755],
        [ 0.0000, -0.1421],
        [ 0.0000, -0.1686],
        [ 0.0000, -0.0681],
        [ 0.0000, -0.2108],
        [ 0.0000, -0.0681],
        [ 0.0000, -0.0681],
        [ 0.0000, -0.1766],
        [ 0.0000, -0.0682]], device='cuda

sp_para_output_t:  tensor([[[ 0.1019],
         [-0.0118],
         [ 0.0320],
         [ 0.0769],
         [ 0.0533],
         [ 0.0350],
         [ 0.1267],
         [-0.0481],
         [-0.0014],
         [ 0.0663]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([20, 2])
sp_sent.view(-1).size():  torch.Size([20])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2188],
        [ 0.0000, -0.2249],
        [ 0.0000, -0.2213],
        [ 0.0000, -0.2673],
        [ 0.0000, -0.2408],
        [ 0.0000, -0.2341],
        [ 0.0000, -0.1881],
        [ 0.0000, -0.2820],
        [ 0.0000, -0.2866],
        [ 0.0000, -0.2493],
        [ 0.0000, -0.2476],
        [ 0.0000, -0.2539],
        [ 0.0000, -0.2859],
        [ 0.0000, -0.2477],
        [ 0.0000, -0.2908],
        [ 0.0000, -0.2112],
        [ 0.0000, -0.2250],
        [ 0.0000, -0.2253],
        [ 0.0000, -0.2332],
        [ 0.0000, -0.2367]], device='cuda:0')
sp_sent.view(-1):  tens

sp_para_output_t:  tensor([[[0.0670],
         [0.0795],
         [0.0413],
         [0.0567],
         [0.0617],
         [0.0717],
         [0.1141],
         [0.0519],
         [0.0416],
         [0.0511]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([15, 2])
sp_sent.view(-1).size():  torch.Size([15])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2561],
        [ 0.0000, -0.3394],
        [ 0.0000, -0.2937],
        [ 0.0000, -0.2744],
        [ 0.0000, -0.2487],
        [ 0.0000, -0.3105],
        [ 0.0000, -0.3369],
        [ 0.0000, -0.3635],
        [ 0.0000, -0.3406],
        [ 0.0000, -0.2708],
        [ 0.0000, -0.3538],
        [ 0.0000, -0.3916],
        [ 0.0000, -0.2354],
        [ 0.0000, -0.3450],
        [ 0.0000, -0.3179]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([11], device='cuda:0')
start_logits:  tensor([ 3.2471e-

sp_para_output_t:  tensor([[[ 0.0717],
         [-0.0243],
         [ 0.0627],
         [ 0.0905],
         [-0.0133],
         [-0.0021],
         [ 0.0086],
         [ 0.0320],
         [ 0.0137],
         [ 0.0578]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2162],
        [ 0.0000, -0.0687],
        [ 0.0000, -0.1670],
        [ 0.0000, -0.1350],
        [ 0.0000, -0.0687],
        [ 0.0000, -0.0686],
        [ 0.0000, -0.1282],
        [ 0.0000, -0.1357],
        [ 0.0000, -0.0687],
        [ 0.0000, -0.1875],
        [ 0.0000, -0.0687],
        [ 0.0000, -0.0687],
        [ 0.0000, -0.0687],
        [ 0.0000, -0.0687],
        [ 0.0000, -0.2432],
        [ 0.0000, -0.0687],
        [ 0.0000, -0.0687],
        [ 0.0000, -0.0686]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0

sp_para_output_t:  tensor([[[ 0.1049],
         [ 0.0013],
         [-0.1049],
         [ 0.0597],
         [ 0.0991],
         [ 0.0183],
         [ 0.0242],
         [ 0.0580],
         [ 0.0475],
         [ 0.0079]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([27, 2])
sp_sent.view(-1).size():  torch.Size([27])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2262],
        [ 0.0000, -0.2974],
        [ 0.0000, -0.3162],
        [ 0.0000, -0.3035],
        [ 0.0000, -0.2214],
        [ 0.0000, -0.2705],
        [ 0.0000, -0.2896],
        [ 0.0000, -0.2617],
        [ 0.0000, -0.2573],
        [ 0.0000, -0.2925],
        [ 0.0000, -0.3955],
        [ 0.0000, -0.3452],
        [ 0.0000, -0.2673],
        [ 0.0000, -0.3545],
        [ 0.0000, -0.2939],
        [ 0.0000, -0.2683],
        [ 0.0000, -0.2485],
        [ 0.0000, -0.1464],
        [ 0.0000, -0.3269],
        [ 0.0000, -0.2737],
        [ 0.0000, -0.2417],
        [ 0.0

sp_para_output_t:  tensor([[[ 0.0620],
         [ 0.0051],
         [ 0.0133],
         [-0.0933],
         [ 0.0021],
         [-0.0166],
         [ 0.0784],
         [-0.0864],
         [-0.0267],
         [ 0.0768]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([33, 2])
sp_sent.view(-1).size():  torch.Size([33])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1113],
        [ 0.0000, -0.2627],
        [ 0.0000, -0.3152],
        [ 0.0000, -0.1942],
        [ 0.0000, -0.2727],
        [ 0.0000, -0.1511],
        [ 0.0000, -0.3394],
        [ 0.0000, -0.1328],
        [ 0.0000, -0.1918],
        [ 0.0000, -0.2581],
        [ 0.0000, -0.3083],
        [ 0.0000, -0.3201],
        [ 0.0000, -0.3284],
        [ 0.0000, -0.2864],
        [ 0.0000, -0.3137],
        [ 0.0000, -0.2391],
        [ 0.0000, -0.2913],
        [ 0.0000, -0.2131],
        [ 0.0000, -0.2461],
        [ 0.0000, -0.2465],
        [ 0.0000, -0.1517],
        [ 0.0

predict_support_sent.view(-1, 2).size():  torch.Size([31, 2])
sp_sent.view(-1).size():  torch.Size([31])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1705],
        [ 0.0000, -0.3013],
        [ 0.0000, -0.2727],
        [ 0.0000, -0.2505],
        [ 0.0000, -0.2280],
        [ 0.0000, -0.2642],
        [ 0.0000, -0.1318],
        [ 0.0000, -0.2744],
        [ 0.0000, -0.3086],
        [ 0.0000, -0.3384],
        [ 0.0000, -0.2615],
        [ 0.0000, -0.3525],
        [ 0.0000, -0.2505],
        [ 0.0000, -0.4026],
        [ 0.0000, -0.3594],
        [ 0.0000, -0.3521],
        [ 0.0000, -0.3384],
        [ 0.0000, -0.2849],
        [ 0.0000, -0.2993],
        [ 0.0000, -0.2502],
        [ 0.0000, -0.3206],
        [ 0.0000, -0.2856],
        [ 0.0000, -0.2568],
        [ 0.0000, -0.2935],
        [ 0.0000, -0.3052],
        [ 0.0000, -0.2671],
        [ 0.0000, -0.3127],
        [ 0.0000, -0.1364],
        [ 0.0000, -0.3550],
        [ 0.0000, -0.2021],
        [ 0.0000, -0

predict_support_sent.view(-1, 2).size():  torch.Size([19, 2])
sp_sent.view(-1).size():  torch.Size([19])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.1644],
        [ 0.0000, -0.0651],
        [ 0.0000, -0.1520],
        [ 0.0000, -0.1532],
        [ 0.0000, -0.1018],
        [ 0.0000, -0.0650],
        [ 0.0000, -0.1421],
        [ 0.0000, -0.0650],
        [ 0.0000, -0.1497],
        [ 0.0000, -0.0650],
        [ 0.0000, -0.1671],
        [ 0.0000, -0.1787],
        [ 0.0000, -0.1543],
        [ 0.0000, -0.1736],
        [ 0.0000, -0.1697],
        [ 0.0000, -0.0650],
        [ 0.0000, -0.1873],
        [ 0.0000, -0.1748],
        [ 0.0000, -0.0650]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([29], device='cuda:0')
start_logits:  tensor([ 0.0837, -0.0039,  0.0990,  0.0756,  0.3127, -0.1125,  0.6890, -0.1260,
         0.0533, -0.1037,  0.0555,  0.0438,  0.1774,  0

predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2302],
        [ 0.0000, -0.1672],
        [ 0.0000, -0.2371],
        [ 0.0000, -0.2659],
        [ 0.0000, -0.2418],
        [ 0.0000, -0.2883],
        [ 0.0000, -0.3501],
        [ 0.0000, -0.3132],
        [ 0.0000, -0.3054],
        [ 0.0000, -0.3132],
        [ 0.0000, -0.3345],
        [ 0.0000, -0.3281],
        [ 0.0000, -0.2986],
        [ 0.0000, -0.3135],
        [ 0.0000, -0.2847],
        [ 0.0000, -0.3252],
        [ 0.0000, -0.3911],
        [ 0.0000, -0.2280],
        [ 0.0000, -0.3376],
        [ 0.0000, -0.2939],
        [ 0.0000, -0.2573],
        [ 0.0000, -0.2391],
        [ 0.0000, -0.2927]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       device='cuda:0')
question_end_index:  tensor([21], device='cuda:0')
start_logits:  tensor([ 0.

predict_support_sent.view(-1, 2).size():  torch.Size([13, 2])
sp_sent.view(-1).size():  torch.Size([13])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.3521],
        [ 0.0000, -0.3059],
        [ 0.0000, -0.3440],
        [ 0.0000, -0.2703],
        [ 0.0000, -0.2515],
        [ 0.0000, -0.3132],
        [ 0.0000, -0.3069],
        [ 0.0000, -0.2446],
        [ 0.0000, -0.2612],
        [ 0.0000, -0.3672],
        [ 0.0000, -0.3926],
        [ 0.0000, -0.3772],
        [ 0.0000, -0.3342]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([16], device='cuda:0')
start_logits:  tensor([ 0.2649,  0.2345,  0.2493,  0.3691,  0.2480,  0.2598,  0.0433,  0.7578,
         0.3674,  0.4136,  0.2576,  0.1793,  0.3845,  0.5381,  0.3723,  0.5347,
        -0.1750,  0.3757, -0.1125,  0.1122, -0.1843,  0.2410, -0.1216,  0.0278,
        -0.2419,  0.2469,  1.0527,  0.1061,  0.4531,  0.5835,  0.6479,  0.3289,
        

predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2588],
        [ 0.0000, -0.2205],
        [ 0.0000, -0.1761],
        [ 0.0000, -0.2284],
        [ 0.0000, -0.1434],
        [ 0.0000, -0.1284],
        [ 0.0000, -0.1698],
        [ 0.0000, -0.1758],
        [ 0.0000, -0.0726],
        [ 0.0000, -0.1681],
        [ 0.0000, -0.0726],
        [ 0.0000, -0.2346],
        [ 0.0000, -0.0726],
        [ 0.0000, -0.2495],
        [ 0.0000, -0.2581],
        [ 0.0000, -0.0726],
        [ 0.0000, -0.2014],
        [ 0.0000, -0.1844],
        [ 0.0000, -0.2126],
        [ 0.0000, -0.1931],
        [ 0.0000, -0.1965],
        [ 0.0000, -0.1577],
        [ 0.0000, -0.1042]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([15], device='cuda:0')
start_logits:  tensor([ 0.


Epoch 00001: avg_val_f1 reached 0.00000 (best 0.06897), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_1.ckpt as top 5


validation_epoch_end
before sync --> sizes:  29, 29, 29
after sync --> sizes: 29, 29, 29
avg_loss:  tensor(21.2503, device='cuda:0')	avg_answer_loss:  tensor(3.4176, device='cuda:0')	avg_type_loss:  tensor(1.0982, device='cuda:0')	avg_sp_para_loss:  tensor(0.6352, device='cuda:0')	avg_sp_sent_loss:  tensor(0.5990, device='cuda:0')
avg_val_f1:  0.0	avg_val_em:  0.0	avg_val_prec:  0.0	avg_val_recall:  0.0
avg_val_sp_sent_f1:  0.0	avg_val_sp_sent_em:  0.0	avg_val_sp_sent_prec:  0.0	avg_val_sp_sent_recall:  0.0
avg_val_sp_para_f1:  0.1724137931034483	avg_val_sp_para_em:  0.034482758620689655	avg_val_sp_para_prec:  0.1724137931034483	avg_val_sp_para_recall:  0.1724137931034483
avg_val_joint_f1:  0.0	avg_val_joint_em:  0.0	avg_val_joint_prec:  0.0	avg_val_joint_recall:  0.0
sp_para_output_t:  tensor([[[ 0.0256],
         [ 0.0057],
         [ 0.0095],
         [ 0.0616],
         [-0.0436],
         [-0.0953],
         [ 0.0067],
         [ 0.1423],
         [ 0.1337],
         [-0.0346]]], 

sp_para_output_t:  tensor([[[ 0.0109],
         [-0.0445],
         [ 0.0240],
         [ 0.0623],
         [ 0.0459],
         [-0.0368],
         [-0.0467],
         [ 0.0947],
         [ 0.0175],
         [-0.1643]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([10, 2])
sp_sent.view(-1).size():  torch.Size([10])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.4062],
        [ 0.0000, -0.2568],
        [ 0.0000, -0.2983],
        [ 0.0000, -0.3787],
        [ 0.0000, -0.2598],
        [ 0.0000, -0.2423],
        [ 0.0000, -0.3252],
        [ 0.0000, -0.4612],
        [ 0.0000, -0.3328],
        [ 0.0000, -0.4287]], device='cuda:0', grad_fn=<ViewBackward>)
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0], device='cuda:0')
sp_para_output_t:  tensor([[[-0.0555],
         [-0.0282],
         [ 0.0398],
         [-0.0471],
         [ 0.1859],
         [ 0.0110],
         [-0.0028],
         [-0.0

sp_para_output_t:  tensor([[[ 0.0563],
         [-0.0659],
         [-0.0284],
         [ 0.0443],
         [-0.0219],
         [ 0.0302],
         [-0.0353],
         [-0.0208],
         [ 0.0086],
         [ 0.0652]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([28, 2])
sp_sent.view(-1).size():  torch.Size([28])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2732],
        [ 0.0000, -0.2245],
        [ 0.0000, -0.3323],
        [ 0.0000, -0.2783],
        [ 0.0000, -0.3457],
        [ 0.0000, -0.2296],
        [ 0.0000, -0.2097],
        [ 0.0000, -0.2271],
        [ 0.0000, -0.4575],
        [ 0.0000, -0.4409],
        [ 0.0000, -0.2881],
        [ 0.0000, -0.2349],
        [ 0.0000, -0.3167],
        [ 0.0000, -0.4104],
        [ 0.0000, -0.2365],
        [ 0.0000, -0.1030],
        [ 0.0000, -0.3289],
        [ 0.0000, -0.3083],
        [ 0.0000, -0.3403],
        [ 0.0000, -0.4080],
        [ 

       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([20, 2])
sp_sent.view(-1).size():  torch.Size([20])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.2952],
        [ 0.0000, -0.2834],
        [ 0.0000, -0.3000],
        [ 0.0000, -0.2964],
        [ 0.0000, -0.4910],
        [ 0.0000, -0.3301],
        [ 0.0000, -0.2744],
        [ 0.0000, -0.2566],
        [ 0.0000, -0.3142],
        [ 0.0000, -0.2712],
        [ 0.0000, -0.2949],
        [ 0.0000, -0.2866],
        [ 0.0000, -0.2583],
        [ 0.0000, -0.2866],
        [ 0.0000, -0.3132],
        [ 0.0000, -0.5244],
        [ 0.0000, -0.3430],
        [ 0.0000, -0.0953],
        [ 0.0000, -0.0879],
        [ 0.0000, -0.3562]], device='cuda:0', grad_fn=<ViewBackward>)
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       device='cuda:0')
sp_para_output_t:  tensor([[[-0.0212],
         [-0.0233],
         [ 0.0508],
         [ 0.0057],
         [-0.057

sp_para_output_t:  tensor([[[ 0.0140],
         [ 0.0072],
         [-0.0703],
         [-0.0051],
         [-0.0306],
         [-0.0456],
         [ 0.0322],
         [ 0.0030],
         [-0.0731],
         [-0.0471]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([24, 2])
sp_sent.view(-1).size():  torch.Size([24])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.4299],
        [ 0.0000, -0.4707],
        [ 0.0000, -0.7500],
        [ 0.0000, -0.4321],
        [ 0.0000, -0.4309],
        [ 0.0000, -0.3440],
        [ 0.0000, -0.3848],
        [ 0.0000, -0.5371],
        [ 0.0000, -0.6138],
        [ 0.0000, -0.5801],
        [ 0.0000, -0.6367],
        [ 0.0000, -0.4304],
        [ 0.0000, -0.5854],
        [ 0.0000, -0.4675],
        [ 0.0000, -0.4141],
        [ 0.0000, -0.4807],
        [ 0.0000, -0.3721],
        [ 0.0000, -0.5439],
        [ 0.0000, -0.5552],
        [ 0.0000, -0.4570],
        [ 

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

sp_para_output_t:  tensor([[[ 0.0901],
         [-0.0195],
         [-0.0067],
         [ 0.0177],
         [ 0.0817],
         [ 0.0364],
         [ 0.0061],
         [ 0.0639],
         [-0.0488],
         [ 0.0101]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.5029],
        [ 0.0000, -0.4910],
        [ 0.0000, -0.5508],
        [ 0.0000, -0.4783],
        [ 0.0000, -0.5269],
        [ 0.0000, -0.4626],
        [ 0.0000, -0.4878],
        [ 0.0000, -0.5073],
        [ 0.0000, -0.5264],
        [ 0.0000, -0.6118],
        [ 0.0000, -0.5684],
        [ 0.0000, -0.6055],
        [ 0.0000, -0.6206],
        [ 0.0000, -0.5854],
        [ 0.0000, -0.6689],
        [ 0.0000, -0.5874],
        [ 0.0000, -0.5249],
        [ 0.0000, -0.5493],
        [ 0.0000, -0.5503],
        [ 0.0000, -0.5615],
        [ 0.0000, -0.5874],
        [ 0.0

sp_para_output_t:  tensor([[[ 0.0432],
         [-0.0465],
         [ 0.0324],
         [ 0.0601],
         [ 0.0131],
         [-0.0060],
         [ 0.0294],
         [ 0.0386],
         [-0.0420],
         [-0.0339]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([36, 2])
sp_sent.view(-1).size():  torch.Size([36])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.5566],
        [ 0.0000, -0.5098],
        [ 0.0000, -0.4758],
        [ 0.0000, -0.5854],
        [ 0.0000, -0.4753],
        [ 0.0000, -0.5967],
        [ 0.0000, -0.5400],
        [ 0.0000, -0.6113],
        [ 0.0000, -0.5620],
        [ 0.0000, -0.5903],
        [ 0.0000, -0.6362],
        [ 0.0000, -0.6411],
        [ 0.0000, -0.6470],
        [ 0.0000, -0.6060],
        [ 0.0000, -0.6729],
        [ 0.0000, -0.5835],
        [ 0.0000, -0.6157],
        [ 0.0000, -0.5625],
        [ 0.0000, -0.5542],
        [ 0.0000, -0.6406],
        [ 0.0000, -0.6138],
        [ 0.0

sorted_answers: [{'start': tensor(41, device='cuda:0'), 'end': tensor(51, device='cuda:0'), 'start_logit': tensor(1.0537, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4976, device='cuda:0', dtype=torch.float16)}, {'start': tensor(369, device='cuda:0'), 'end': tensor(383, device='cuda:0'), 'start_logit': tensor(1.1484, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3562, device='cuda:0', dtype=torch.float16)}, {'start': tensor(377, device='cuda:0'), 'end': tensor(383, device='cuda:0'), 'start_logit': tensor(1.1084, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3562, device='cuda:0', dtype=torch.float16)}, {'start': tensor(633, device='cuda:0'), 'end': tensor(633, device='cuda:0'), 'start_logit': tensor(0.9131, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4570, device='cuda:0', dtype=torch.float16)}, {'start': tensor(365, device='cuda:0'), 'end': tensor(383, device='cuda:0'), 'start_logit': tensor(0.9561, device='cuda:0', dtype

sp_para_output_t:  tensor([[[0.1230],
         [0.0267],
         [0.0003],
         [0.0997],
         [0.1021],
         [0.0622],
         [0.0459],
         [0.1267],
         [0.0909],
         [0.0672]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([28, 2])
sp_sent.view(-1).size():  torch.Size([28])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.5249],
        [ 0.0000, -0.5278],
        [ 0.0000, -0.4106],
        [ 0.0000, -0.4102],
        [ 0.0000, -0.5376],
        [ 0.0000, -0.4629],
        [ 0.0000, -0.4509],
        [ 0.0000, -0.5327],
        [ 0.0000, -0.5771],
        [ 0.0000, -0.5483],
        [ 0.0000, -0.5518],
        [ 0.0000, -0.5200],
        [ 0.0000, -0.5200],
        [ 0.0000, -0.5991],
        [ 0.0000, -0.5938],
        [ 0.0000, -0.6162],
        [ 0.0000, -0.5815],
        [ 0.0000, -0.5864],
        [ 0.0000, -0.5396],
        [ 0.0000, -0.5386],
        [ 0.0000, -0.6206],
        [ 0.0000, -0.54

sp_para_output_t:  tensor([[[ 0.0556],
         [ 0.0083],
         [ 0.0294],
         [-0.0126],
         [ 0.0560],
         [ 0.0119],
         [ 0.0243],
         [ 0.0448],
         [ 0.0230],
         [ 0.0520]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([28, 2])
sp_sent.view(-1).size():  torch.Size([28])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.4033],
        [ 0.0000, -0.4109],
        [ 0.0000, -0.3699],
        [ 0.0000, -0.3091],
        [ 0.0000, -0.3899],
        [ 0.0000, -0.3677],
        [ 0.0000, -0.4309],
        [ 0.0000, -0.4104],
        [ 0.0000, -0.3979],
        [ 0.0000, -0.4158],
        [ 0.0000, -0.4036],
        [ 0.0000, -0.4124],
        [ 0.0000, -0.4368],
        [ 0.0000, -0.4182],
        [ 0.0000, -0.4053],
        [ 0.0000, -0.4126],
        [ 0.0000, -0.3926],
        [ 0.0000, -0.4238],
        [ 0.0000, -0.4626],
        [ 0.0000, -0.3958],
        [ 0.0000, -0.4380],
        [ 0.0

sp_para_output_t:  tensor([[[ 0.1250],
         [-0.0010],
         [ 0.0551],
         [-0.0382],
         [ 0.1122],
         [ 0.1360],
         [ 0.0975],
         [ 0.1000],
         [ 0.0876],
         [ 0.1666]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([16, 2])
sp_sent.view(-1).size():  torch.Size([16])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.3936],
        [ 0.0000, -0.3748],
        [ 0.0000, -0.4067],
        [ 0.0000, -0.4087],
        [ 0.0000, -0.3877],
        [ 0.0000, -0.4619],
        [ 0.0000, -0.4172],
        [ 0.0000, -0.3625],
        [ 0.0000, -0.4492],
        [ 0.0000, -0.4482],
        [ 0.0000, -0.4294],
        [ 0.0000, -0.4331],
        [ 0.0000, -0.3955],
        [ 0.0000, -0.3970],
        [ 0.0000, -0.4324],
        [ 0.0000, -0.4451]], device='cuda:0')
sp_sent.view(-1):  tensor([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], device='cuda:0')
question_end_index:  tensor([15], device='

sorted_answers: [{'start': tensor(332, device='cuda:0'), 'end': tensor(345, device='cuda:0'), 'start_logit': tensor(1.0107, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.1169, device='cuda:0', dtype=torch.float16)}, {'start': tensor(331, device='cuda:0'), 'end': tensor(345, device='cuda:0'), 'start_logit': tensor(1.0010, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.1169, device='cuda:0', dtype=torch.float16)}, {'start': tensor(239, device='cuda:0'), 'end': tensor(240, device='cuda:0'), 'start_logit': tensor(1.0186, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.0938, device='cuda:0', dtype=torch.float16)}, {'start': tensor(344, device='cuda:0'), 'end': tensor(345, device='cuda:0'), 'start_logit': tensor(0.9780, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.1169, device='cuda:0', dtype=torch.float16)}, {'start': tensor(70, device='cuda:0'), 'end': tensor(77, device='cuda:0'), 'start_logit': tensor(0.8975, device='cuda:0', dtype

sp_para_output_t:  tensor([[[ 0.0761],
         [ 0.0216],
         [ 0.0409],
         [ 0.0370],
         [ 0.0201],
         [ 0.0386],
         [ 0.0670],
         [ 0.1090],
         [ 0.0017],
         [-0.1193]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([29, 2])
sp_sent.view(-1).size():  torch.Size([29])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.4900],
        [ 0.0000, -0.4741],
        [ 0.0000, -0.4917],
        [ 0.0000, -0.5830],
        [ 0.0000, -0.5386],
        [ 0.0000, -0.5566],
        [ 0.0000, -0.5352],
        [ 0.0000, -0.5444],
        [ 0.0000, -0.5298],
        [ 0.0000, -0.5830],
        [ 0.0000, -0.5264],
        [ 0.0000, -0.5835],
        [ 0.0000, -0.5317],
        [ 0.0000, -0.5566],
        [ 0.0000, -0.5444],
        [ 0.0000, -0.6348],
        [ 0.0000, -0.5762],
        [ 0.0000, -0.5708],
        [ 0.0000, -0.5259],
        [ 0.0000, -0.4963],
        [ 0.0000, -0.5957],
        [ 0.0

sorted_answers: [{'start': tensor(459, device='cuda:0'), 'end': tensor(484, device='cuda:0'), 'start_logit': tensor(0.8940, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4099, device='cuda:0', dtype=torch.float16)}, {'start': tensor(87, device='cuda:0'), 'end': tensor(110, device='cuda:0'), 'start_logit': tensor(0.9526, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3340, device='cuda:0', dtype=torch.float16)}, {'start': tensor(332, device='cuda:0'), 'end': tensor(340, device='cuda:0'), 'start_logit': tensor(0.9390, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3293, device='cuda:0', dtype=torch.float16)}, {'start': tensor(643, device='cuda:0'), 'end': tensor(652, device='cuda:0'), 'start_logit': tensor(0.9458, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3201, device='cuda:0', dtype=torch.float16)}, {'start': tensor(315, device='cuda:0'), 'end': tensor(340, device='cuda:0'), 'start_logit': tensor(0.9351, device='cuda:0', dtyp

sp_para_output_t:  tensor([[[ 0.0764],
         [ 0.0457],
         [-0.0152],
         [ 0.0510],
         [ 0.0996],
         [-0.0208],
         [ 0.0554],
         [ 0.0989],
         [ 0.0224],
         [ 0.0266]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.5405],
        [ 0.0000, -0.3953],
        [ 0.0000, -0.5137],
        [ 0.0000, -0.4731],
        [ 0.0000, -0.5107],
        [ 0.0000, -0.5298],
        [ 0.0000, -0.5034],
        [ 0.0000, -0.5132],
        [ 0.0000, -0.5176],
        [ 0.0000, -0.5767],
        [ 0.0000, -0.5859],
        [ 0.0000, -0.5513],
        [ 0.0000, -0.5796],
        [ 0.0000, -0.6084],
        [ 0.0000, -0.5640],
        [ 0.0000, -0.5186],
        [ 0.0000, -0.4490],
        [ 0.0000, -0.4900]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0

sp_para_output_t:  tensor([[[-0.0049],
         [-0.0566],
         [ 0.0391],
         [-0.0248],
         [-0.0542],
         [-0.0328],
         [-0.0478],
         [-0.0393],
         [ 0.0304],
         [ 0.0587]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([24, 2])
sp_sent.view(-1).size():  torch.Size([24])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.4148],
        [ 0.0000, -0.3718],
        [ 0.0000, -0.3826],
        [ 0.0000, -0.3513],
        [ 0.0000, -0.2644],
        [ 0.0000, -0.4111],
        [ 0.0000, -0.3650],
        [ 0.0000, -0.3333],
        [ 0.0000, -0.3760],
        [ 0.0000, -0.3723],
        [ 0.0000, -0.3352],
        [ 0.0000, -0.3293],
        [ 0.0000, -0.1083],
        [ 0.0000, -0.4312],
        [ 0.0000, -0.1748],
        [ 0.0000, -0.4114],
        [ 0.0000, -0.3994],
        [ 0.0000, -0.3994],
        [ 0.0000, -0.4148],
        [ 0.0000, -0.3582],
        [ 0.0000, -0.1083],
        [ 0.0

sorted_answers: [{'start': tensor(214, device='cuda:0'), 'end': tensor(241, device='cuda:0'), 'start_logit': tensor(0.8599, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2888, device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4]
evidence_candidates:  {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64), 4: tensor([], device='cuda:0', dtype=torch.int64)}
para_sents_offset:  [0, 7, 13, 14, 19, 24]
sp_para_pred:  tensor([2, 4], device='cuda:0')
sp_sent_pred:  []
sp_sent_map:  {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 9, 8: 10, 9: 11, 10: 12, 11: 13, 12: 14, 13: 15, 14: 28, 15: 29, 16: 30, 17: 31, 18: 32, 19: 33, 20: 34, 21: 35, 22: 36, 23: 37}
sp_para_map:  {0: 0, 1: 2, 2: 3, 3: 8, 4: 9}
sp_para_output_t:  tensor([[[0.0422],
         [0.1191],
      

sp_para_output_t:  tensor([[[ 0.0594],
         [-0.0594],
         [ 0.0709],
         [-0.0013],
         [-0.0516],
         [ 0.0499],
         [-0.0126],
         [-0.0082],
         [ 0.0620],
         [ 0.0451]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([21, 2])
sp_sent.view(-1).size():  torch.Size([21])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.4973],
        [ 0.0000, -0.5239],
        [ 0.0000, -0.4749],
        [ 0.0000, -0.4937],
        [ 0.0000, -0.4570],
        [ 0.0000, -0.5908],
        [ 0.0000, -0.5508],
        [ 0.0000, -0.4587],
        [ 0.0000, -0.5132],
        [ 0.0000, -0.4771],
        [ 0.0000, -0.6328],
        [ 0.0000, -0.4719],
        [ 0.0000, -0.4614],
        [ 0.0000, -0.5879],
        [ 0.0000, -0.4458],
        [ 0.0000, -0.5073],
        [ 0.0000, -0.5361],
        [ 0.0000, -0.2744],
        [ 0.0000, -0.4802],
        [ 0.0000, -0.4680],
        [ 0.0000, -0.5723]], device='cuda

sorted_answers: [{'start': tensor(439, device='cuda:0'), 'end': tensor(443, device='cuda:0'), 'start_logit': tensor(0.9902, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5200, device='cuda:0', dtype=torch.float16)}, {'start': tensor(58, device='cuda:0'), 'end': tensor(72, device='cuda:0'), 'start_logit': tensor(0.9839, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4907, device='cuda:0', dtype=torch.float16)}, {'start': tensor(72, device='cuda:0'), 'end': tensor(72, device='cuda:0'), 'start_logit': tensor(0.8628, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4907, device='cuda:0', dtype=torch.float16)}, {'start': tensor(58, device='cuda:0'), 'end': tensor(77, device='cuda:0'), 'start_logit': tensor(0.9839, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2834, device='cuda:0', dtype=torch.float16)}, {'start': tensor(504, device='cuda:0'), 'end': tensor(518, device='cuda:0'), 'start_logit': tensor(0.9819, device='cuda:0', dtype=tor

sp_para_output_t:  tensor([[[ 0.0203],
         [-0.0322],
         [ 0.0471],
         [ 0.0707],
         [-0.0113],
         [-0.0396],
         [ 0.0366],
         [ 0.0046],
         [ 0.0272],
         [ 0.0562]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([22, 2])
sp_sent.view(-1).size():  torch.Size([22])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.3721],
        [ 0.0000, -0.3657],
        [ 0.0000, -0.3242],
        [ 0.0000, -0.3286],
        [ 0.0000, -0.3301],
        [ 0.0000, -0.1091],
        [ 0.0000, -0.3037],
        [ 0.0000, -0.3462],
        [ 0.0000, -0.2644],
        [ 0.0000, -0.1090],
        [ 0.0000, -0.4038],
        [ 0.0000, -0.3635],
        [ 0.0000, -0.1090],
        [ 0.0000, -0.1090],
        [ 0.0000, -0.3840],
        [ 0.0000, -0.3152],
        [ 0.0000, -0.4265],
        [ 0.0000, -0.4241],
        [ 0.0000, -0.4211],
        [ 0.0000, -0.3970],
        [ 0.0000, -0.3530],
        [ 0.0

end_logits:  tensor([-0.3806, -0.5161, -0.3694, -0.0435, -0.3323, -0.2461, -0.2078, -0.2585,
        -0.2078, -0.4109, -0.0332,  0.2108, -0.3135,  0.4165, -0.2040,  0.2423,
        -0.2126, -0.0067, -0.3572, -0.0154, -0.3679, -0.2078, -0.0318, -0.1852,
        -0.1842, -0.3811, -0.1919, -0.5674, -0.0953, -0.4990,  0.0674, -0.0972,
        -0.2871, -0.3376,  0.1024,  0.1055, -0.1316, -0.3420, -0.1711,  0.0608,
        -0.1056, -0.3127,  0.0481, -0.3337, -0.2333, -0.4268, -0.3376, -0.1796,
         0.1511,  0.0640, -0.5410,  0.3555,  0.2125,  0.0093, -0.5049,  0.3469,
        -0.0753,  0.0682, -0.2993,  0.0513, -0.2130, -0.1104, -0.0302, -0.1215,
         0.0555, -0.4128, -0.2849,  0.3103, -0.1711, -0.0017,  0.1110, -0.0643,
         0.4358, -0.1130, -0.4553,  0.3921,  0.0745,  0.6001,  0.1146, -0.1117,
         0.1073,  0.2347,  0.2240,  0.1172,  0.4128, -0.0923, -0.4573, -0.1759,
         0.4434,  0.3821,  0.2820, -0.1760,  0.1030, -0.4502,  0.2135,  0.0168,
         0.1428, -0.2588, -

sp_para_output_t:  tensor([[[ 0.0051],
         [-0.0299],
         [ 0.0267],
         [ 0.0029],
         [-0.0100],
         [ 0.0106],
         [ 0.0380],
         [ 0.0071],
         [ 0.0284],
         [ 0.0005]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([24, 2])
sp_sent.view(-1).size():  torch.Size([24])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.6143],
        [ 0.0000, -0.4128],
        [ 0.0000, -0.4778],
        [ 0.0000, -0.4575],
        [ 0.0000, -0.5205],
        [ 0.0000, -0.5562],
        [ 0.0000, -0.5625],
        [ 0.0000, -0.5312],
        [ 0.0000, -0.5864],
        [ 0.0000, -0.5327],
        [ 0.0000, -0.5737],
        [ 0.0000, -0.6611],
        [ 0.0000, -0.4463],
        [ 0.0000, -0.6621],
        [ 0.0000, -0.6279],
        [ 0.0000, -0.5918],
        [ 0.0000, -0.5664],
        [ 0.0000, -0.6001],
        [ 0.0000, -0.6147],
        [ 0.0000, -0.4089],
        [ 0.0000, -0.4636],
        [ 0.0

sorted_answers: [{'start': tensor(472, device='cuda:0'), 'end': tensor(497, device='cuda:0'), 'start_logit': tensor(0.9805, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4683, device='cuda:0', dtype=torch.float16)}, {'start': tensor(471, device='cuda:0'), 'end': tensor(497, device='cuda:0'), 'start_logit': tensor(0.9751, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4683, device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4]
evidence_candidates:  {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64), 4: tensor([], device='cuda:0', dtype=torch.int64)}
para_sents_offset:  [0, 6, 8, 12, 17, 24]
sp_para_pred:  tensor([1, 2], device='cuda:0')
sp_sent_pred:  []
sp_sent_map:  {0: 9, 1: 10, 2: 11, 3: 12, 4: 13, 5: 14, 6: 27, 7: 28, 8: 29, 9: 30, 

sp_para_output_t:  tensor([[[ 0.0070],
         [ 0.0013],
         [ 0.0386],
         [ 0.1003],
         [ 0.0265],
         [ 0.1079],
         [ 0.0259],
         [ 0.1171],
         [-0.0012],
         [ 0.0015]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.4531],
        [ 0.0000, -0.4685],
        [ 0.0000, -0.4800],
        [ 0.0000, -0.5327],
        [ 0.0000, -0.6479],
        [ 0.0000, -0.5820],
        [ 0.0000, -0.5034],
        [ 0.0000, -0.6338],
        [ 0.0000, -0.5806],
        [ 0.0000, -0.6582],
        [ 0.0000, -0.6211],
        [ 0.0000, -0.5068],
        [ 0.0000, -0.6187],
        [ 0.0000, -0.6289],
        [ 0.0000, -0.5405],
        [ 0.0000, -0.6138],
        [ 0.0000, -0.6309],
        [ 0.0000, -0.5869]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

sp_para_output_t:  tensor([[[ 0.1254],
         [-0.0410],
         [ 0.0015],
         [ 0.0570],
         [ 0.0084],
         [ 0.1212],
         [ 0.0782],
         [ 0.0258],
         [ 0.1570],
         [ 0.1115]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([21, 2])
sp_sent.view(-1).size():  torch.Size([21])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.3555],
        [ 0.0000, -0.3618],
        [ 0.0000, -0.2747],
        [ 0.0000, -0.3767],
        [ 0.0000, -0.3206],
        [ 0.0000, -0.3420],
        [ 0.0000, -0.2622],
        [ 0.0000, -0.1142],
        [ 0.0000, -0.1142],
        [ 0.0000, -0.3608],
        [ 0.0000, -0.3325],
        [ 0.0000, -0.3784],
        [ 0.0000, -0.3740],
        [ 0.0000, -0.3792],
        [ 0.0000, -0.4316],
        [ 0.0000, -0.1143],
        [ 0.0000, -0.4065],
        [ 0.0000, -0.4172],
        [ 0.0000, -0.3843],
        [ 0.0000, -0.3594],
        [ 0.0000, -0.1142]], device='cuda

sp_para_output_t:  tensor([[[ 0.1019],
         [-0.0118],
         [ 0.0320],
         [ 0.0769],
         [ 0.0533],
         [ 0.0350],
         [ 0.1267],
         [-0.0481],
         [-0.0014],
         [ 0.0663]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([20, 2])
sp_sent.view(-1).size():  torch.Size([20])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.4419],
        [ 0.0000, -0.4351],
        [ 0.0000, -0.4548],
        [ 0.0000, -0.4895],
        [ 0.0000, -0.4717],
        [ 0.0000, -0.4480],
        [ 0.0000, -0.3962],
        [ 0.0000, -0.4856],
        [ 0.0000, -0.5181],
        [ 0.0000, -0.4678],
        [ 0.0000, -0.4946],
        [ 0.0000, -0.4663],
        [ 0.0000, -0.5254],
        [ 0.0000, -0.4695],
        [ 0.0000, -0.5181],
        [ 0.0000, -0.4519],
        [ 0.0000, -0.4629],
        [ 0.0000, -0.4197],
        [ 0.0000, -0.4705],
        [ 0.0000, -0.5044]], device='cuda:0')
sp_sent.view(-1):  tens

sp_para_output_t:  tensor([[[0.0670],
         [0.0795],
         [0.0413],
         [0.0567],
         [0.0617],
         [0.0717],
         [0.1141],
         [0.0519],
         [0.0416],
         [0.0511]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([15, 2])
sp_sent.view(-1).size():  torch.Size([15])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.4695],
        [ 0.0000, -0.5928],
        [ 0.0000, -0.5425],
        [ 0.0000, -0.5068],
        [ 0.0000, -0.5054],
        [ 0.0000, -0.5410],
        [ 0.0000, -0.6128],
        [ 0.0000, -0.6240],
        [ 0.0000, -0.6030],
        [ 0.0000, -0.5317],
        [ 0.0000, -0.5903],
        [ 0.0000, -0.6523],
        [ 0.0000, -0.5020],
        [ 0.0000, -0.6108],
        [ 0.0000, -0.6187]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([11], device='cuda:0')
start_logits:  tensor([ 0.3247, 

sorted_answers: [{'start': tensor(95, device='cuda:0'), 'end': tensor(96, device='cuda:0'), 'start_logit': tensor(0.8740, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3169, device='cuda:0', dtype=torch.float16)}, {'start': tensor(95, device='cuda:0'), 'end': tensor(97, device='cuda:0'), 'start_logit': tensor(0.8740, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2991, device='cuda:0', dtype=torch.float16)}, {'start': tensor(96, device='cuda:0'), 'end': tensor(96, device='cuda:0'), 'start_logit': tensor(0.7573, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3169, device='cuda:0', dtype=torch.float16)}, {'start': tensor(97, device='cuda:0'), 'end': tensor(97, device='cuda:0'), 'start_logit': tensor(0.7676, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2991, device='cuda:0', dtype=torch.float16)}, {'start': tensor(96, device='cuda:0'), 'end': tensor(97, device='cuda:0'), 'start_logit': tensor(0.7573, device='cuda:0', dtype=torch.f

predict_support_sent.view(-1, 2).size():  torch.Size([27, 2])
sp_sent.view(-1).size():  torch.Size([27])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.4829],
        [ 0.0000, -0.5303],
        [ 0.0000, -0.5703],
        [ 0.0000, -0.5586],
        [ 0.0000, -0.4707],
        [ 0.0000, -0.5239],
        [ 0.0000, -0.5854],
        [ 0.0000, -0.5396],
        [ 0.0000, -0.5464],
        [ 0.0000, -0.5322],
        [ 0.0000, -0.6567],
        [ 0.0000, -0.6206],
        [ 0.0000, -0.5371],
        [ 0.0000, -0.6255],
        [ 0.0000, -0.5479],
        [ 0.0000, -0.5234],
        [ 0.0000, -0.5024],
        [ 0.0000, -0.3088],
        [ 0.0000, -0.5737],
        [ 0.0000, -0.5073],
        [ 0.0000, -0.4856],
        [ 0.0000, -0.4495],
        [ 0.0000, -0.5566],
        [ 0.0000, -0.5767],
        [ 0.0000, -0.5815],
        [ 0.0000, -0.4609],
        [ 0.0000, -0.5913]], device='cuda:0')
sp_sent.view(-1):  tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

predict_support_sent.view(-1, 2).size():  torch.Size([33, 2])
sp_sent.view(-1).size():  torch.Size([33])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.3433],
        [ 0.0000, -0.5386],
        [ 0.0000, -0.6265],
        [ 0.0000, -0.4797],
        [ 0.0000, -0.5645],
        [ 0.0000, -0.3882],
        [ 0.0000, -0.6392],
        [ 0.0000, -0.4600],
        [ 0.0000, -0.3975],
        [ 0.0000, -0.5503],
        [ 0.0000, -0.5942],
        [ 0.0000, -0.6035],
        [ 0.0000, -0.6255],
        [ 0.0000, -0.5396],
        [ 0.0000, -0.5757],
        [ 0.0000, -0.5181],
        [ 0.0000, -0.5698],
        [ 0.0000, -0.4937],
        [ 0.0000, -0.4897],
        [ 0.0000, -0.5005],
        [ 0.0000, -0.4060],
        [ 0.0000, -0.4832],
        [ 0.0000, -0.5376],
        [ 0.0000, -0.5181],
        [ 0.0000, -0.6025],
        [ 0.0000, -0.5083],
        [ 0.0000, -0.4814],
        [ 0.0000, -0.5449],
        [ 0.0000, -0.6206],
        [ 0.0000, -0.5308],
        [ 0.0000, -0

predict_support_sent.view(-1, 2).size():  torch.Size([31, 2])
sp_sent.view(-1).size():  torch.Size([31])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.3523],
        [ 0.0000, -0.5693],
        [ 0.0000, -0.5283],
        [ 0.0000, -0.4924],
        [ 0.0000, -0.4773],
        [ 0.0000, -0.5327],
        [ 0.0000, -0.2773],
        [ 0.0000, -0.5874],
        [ 0.0000, -0.5933],
        [ 0.0000, -0.5830],
        [ 0.0000, -0.5034],
        [ 0.0000, -0.6274],
        [ 0.0000, -0.4949],
        [ 0.0000, -0.6772],
        [ 0.0000, -0.6040],
        [ 0.0000, -0.6084],
        [ 0.0000, -0.6133],
        [ 0.0000, -0.5693],
        [ 0.0000, -0.5654],
        [ 0.0000, -0.4692],
        [ 0.0000, -0.5498],
        [ 0.0000, -0.5220],
        [ 0.0000, -0.5312],
        [ 0.0000, -0.5928],
        [ 0.0000, -0.5894],
        [ 0.0000, -0.5298],
        [ 0.0000, -0.6025],
        [ 0.0000, -0.4175],
        [ 0.0000, -0.6235],
        [ 0.0000, -0.4272],
        [ 0.0000, -0

predict_support_sent.view(-1, 2).size():  torch.Size([19, 2])
sp_sent.view(-1).size():  torch.Size([19])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.3228],
        [ 0.0000, -0.3672],
        [ 0.0000, -0.3411],
        [ 0.0000, -0.3228],
        [ 0.0000, -0.3357],
        [ 0.0000, -0.1085],
        [ 0.0000, -0.3560],
        [ 0.0000, -0.1085],
        [ 0.0000, -0.3176],
        [ 0.0000, -0.3237],
        [ 0.0000, -0.3547],
        [ 0.0000, -0.3647],
        [ 0.0000, -0.3237],
        [ 0.0000, -0.3408],
        [ 0.0000, -0.3567],
        [ 0.0000, -0.1086],
        [ 0.0000, -0.3428],
        [ 0.0000, -0.3352],
        [ 0.0000, -0.3442]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([29], device='cuda:0')
start_logits:  tensor([-8.9050e-02, -2.5049e-01, -2.9510e-02, -1.1682e-01,  2.7856e-01,
        -2.5195e-01,  7.0508e-01, -3.9746e-01, -4.5349e-02, -2

predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.4612],
        [ 0.0000, -0.3855],
        [ 0.0000, -0.4963],
        [ 0.0000, -0.5161],
        [ 0.0000, -0.5161],
        [ 0.0000, -0.5625],
        [ 0.0000, -0.6133],
        [ 0.0000, -0.6387],
        [ 0.0000, -0.5869],
        [ 0.0000, -0.5854],
        [ 0.0000, -0.6191],
        [ 0.0000, -0.5957],
        [ 0.0000, -0.5591],
        [ 0.0000, -0.5747],
        [ 0.0000, -0.5308],
        [ 0.0000, -0.5850],
        [ 0.0000, -0.6426],
        [ 0.0000, -0.4683],
        [ 0.0000, -0.5889],
        [ 0.0000, -0.5571],
        [ 0.0000, -0.4980],
        [ 0.0000, -0.4832],
        [ 0.0000, -0.5698]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       device='cuda:0')
question_end_index:  tensor([21], device='cuda:0')
start_logits:  tensor([ 2.

predict_support_sent.view(-1, 2).size():  torch.Size([13, 2])
sp_sent.view(-1).size():  torch.Size([13])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.6333],
        [ 0.0000, -0.5903],
        [ 0.0000, -0.6050],
        [ 0.0000, -0.5435],
        [ 0.0000, -0.5303],
        [ 0.0000, -0.6074],
        [ 0.0000, -0.6104],
        [ 0.0000, -0.5615],
        [ 0.0000, -0.5342],
        [ 0.0000, -0.6851],
        [ 0.0000, -0.7085],
        [ 0.0000, -0.6743],
        [ 0.0000, -0.6475]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([16], device='cuda:0')
start_logits:  tensor([ 2.4561e-01,  1.8750e-01,  1.1841e-01,  2.6221e-01,  2.0300e-01,
         1.8091e-01, -8.1482e-02,  7.2949e-01,  3.4326e-01,  4.0723e-01,
         2.3242e-01,  1.4368e-01,  3.5400e-01,  5.1953e-01,  3.4082e-01,
         5.4102e-01, -3.1494e-01,  3.7061e-01, -2.8394e-01,  6.3416e-02,
        -2.7417e-01,  2.3267e-01, -2

predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.4473],
        [ 0.0000, -0.4153],
        [ 0.0000, -0.3420],
        [ 0.0000, -0.4121],
        [ 0.0000, -0.3176],
        [ 0.0000, -0.2935],
        [ 0.0000, -0.3875],
        [ 0.0000, -0.3467],
        [ 0.0000, -0.3467],
        [ 0.0000, -0.3596],
        [ 0.0000, -0.1107],
        [ 0.0000, -0.4321],
        [ 0.0000, -0.1107],
        [ 0.0000, -0.4414],
        [ 0.0000, -0.4500],
        [ 0.0000, -0.4062],
        [ 0.0000, -0.3735],
        [ 0.0000, -0.3638],
        [ 0.0000, -0.3696],
        [ 0.0000, -0.3594],
        [ 0.0000, -0.3765],
        [ 0.0000, -0.3450],
        [ 0.0000, -0.2455]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([15], device='cuda:0')
start_logits:  tensor([ 3.


Epoch 00002: avg_val_f1 reached 0.00000 (best 0.06897), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_2.ckpt as top 5


validation_epoch_end
before sync --> sizes:  29, 29, 29
after sync --> sizes: 29, 29, 29
avg_loss:  tensor(18.2765, device='cuda:0')	avg_answer_loss:  tensor(3.4152, device='cuda:0')	avg_type_loss:  tensor(0.8292, device='cuda:0')	avg_sp_para_loss:  tensor(0.5558, device='cuda:0')	avg_sp_sent_loss:  tensor(0.5157, device='cuda:0')
avg_val_f1:  0.0	avg_val_em:  0.0	avg_val_prec:  0.0	avg_val_recall:  0.0
avg_val_sp_sent_f1:  0.0	avg_val_sp_sent_em:  0.0	avg_val_sp_sent_prec:  0.0	avg_val_sp_sent_recall:  0.0
avg_val_sp_para_f1:  0.1896551724137931	avg_val_sp_para_em:  0.06896551724137931	avg_val_sp_para_prec:  0.1896551724137931	avg_val_sp_para_recall:  0.1896551724137931
avg_val_joint_f1:  0.0	avg_val_joint_em:  0.0	avg_val_joint_prec:  0.0	avg_val_joint_recall:  0.0
sp_para_output_t:  tensor([[[-0.0415],
         [ 0.0132],
         [ 0.0264],
         [-0.0581],
         [ 0.0077],
         [ 0.0364],
         [ 0.0784],
         [ 0.0259],
         [ 0.0271],
         [ 0.0319]]], d

sp_para_output_t:  tensor([[[-0.1091],
         [-0.0285],
         [ 0.0205],
         [ 0.0302],
         [-0.0194],
         [-0.0462],
         [ 0.0247],
         [-0.0090],
         [-0.0099],
         [-0.0341]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([21, 2])
sp_sent.view(-1).size():  torch.Size([21])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.5747],
        [ 0.0000, -0.4209],
        [ 0.0000, -0.5469],
        [ 0.0000, -0.5112],
        [ 0.0000, -0.5542],
        [ 0.0000, -0.5776],
        [ 0.0000, -0.4563],
        [ 0.0000, -0.3865],
        [ 0.0000, -0.5513],
        [ 0.0000, -0.4487],
        [ 0.0000, -0.4844],
        [ 0.0000, -0.6934],
        [ 0.0000, -0.5371],
        [ 0.0000, -0.7217],
        [ 0.0000, -0.5063],
        [ 0.0000, -0.5898],
        [ 0.0000, -0.5771],
        [ 0.0000, -0.4958],
        [ 0.0000, -0.5137],
        [ 0.0000, -0.5337],
        [ 

sp_para_output_t:  tensor([[[-0.0398],
         [ 0.0288],
         [ 0.0378],
         [ 0.0723],
         [ 0.0612],
         [ 0.0764],
         [ 0.0172],
         [ 0.0789],
         [-0.0321],
         [ 0.0215]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([20, 2])
sp_sent.view(-1).size():  torch.Size([20])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.5029],
        [ 0.0000, -0.6484],
        [ 0.0000, -0.7344],
        [ 0.0000, -0.6538],
        [ 0.0000, -0.6660],
        [ 0.0000, -0.6982],
        [ 0.0000, -0.6646],
        [ 0.0000, -0.6323],
        [ 0.0000, -0.5308],
        [ 0.0000, -0.7017],
        [ 0.0000, -0.6152],
        [ 0.0000, -0.5947],
        [ 0.0000, -0.6050],
        [ 0.0000, -0.6348],
        [ 0.0000, -0.6309],
        [ 0.0000, -0.5479],
        [ 0.0000, -0.6309],
        [ 0.0000, -0.4626],
        [ 0.0000, -0.5854],
        [ 0.0000, -0.3694]], device='c

predict_support_sent.view(-1, 2).size():  torch.Size([13, 2])
sp_sent.view(-1).size():  torch.Size([13])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.7905],
        [ 0.0000, -0.6118],
        [ 0.0000, -0.5864],
        [ 0.0000, -0.7197],
        [ 0.0000, -0.6426],
        [ 0.0000, -0.5308],
        [ 0.0000, -0.5796],
        [ 0.0000, -0.5840],
        [ 0.0000, -0.6523],
        [ 0.0000, -0.6646],
        [ 0.0000, -0.5703],
        [ 0.0000, -0.5371],
        [ 0.0000, -0.5698]], device='cuda:0', grad_fn=<ViewBackward>)
sp_sent.view(-1):  tensor([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
sp_para_output_t:  tensor([[[ 0.0120],
         [ 0.0204],
         [ 0.0392],
         [-0.0146],
         [-0.0252],
         [ 0.0135],
         [ 0.0360],
         [ 0.0641],
         [-0.0548],
         [-0.0923]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([33, 2])
sp_sent.view(-1).s

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

sp_para_output_t:  tensor([[[ 0.0901],
         [-0.0195],
         [-0.0067],
         [ 0.0177],
         [ 0.0817],
         [ 0.0364],
         [ 0.0061],
         [ 0.0639],
         [-0.0488],
         [ 0.0101]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.8696],
        [ 0.0000, -0.8037],
        [ 0.0000, -0.8955],
        [ 0.0000, -0.8369],
        [ 0.0000, -0.8755],
        [ 0.0000, -0.8247],
        [ 0.0000, -0.8423],
        [ 0.0000, -0.8711],
        [ 0.0000, -0.9160],
        [ 0.0000, -0.9629],
        [ 0.0000, -0.9321],
        [ 0.0000, -0.9512],
        [ 0.0000, -0.9453],
        [ 0.0000, -0.9575],
        [ 0.0000, -1.0020],
        [ 0.0000, -0.9521],
        [ 0.0000, -0.8887],
        [ 0.0000, -0.9297],
        [ 0.0000, -0.9087],
        [ 0.0000, -0.9087],
        [ 0.0000, -0.9429],
        [ 0.0

sp_para_output_t:  tensor([[[ 0.0432],
         [-0.0465],
         [ 0.0324],
         [ 0.0601],
         [ 0.0131],
         [-0.0060],
         [ 0.0294],
         [ 0.0386],
         [-0.0420],
         [-0.0339]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([36, 2])
sp_sent.view(-1).size():  torch.Size([36])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.9268],
        [ 0.0000, -0.8682],
        [ 0.0000, -0.8594],
        [ 0.0000, -0.9458],
        [ 0.0000, -0.8247],
        [ 0.0000, -0.9634],
        [ 0.0000, -0.9082],
        [ 0.0000, -0.9629],
        [ 0.0000, -0.9600],
        [ 0.0000, -0.9526],
        [ 0.0000, -1.0020],
        [ 0.0000, -1.0234],
        [ 0.0000, -1.0107],
        [ 0.0000, -0.9878],
        [ 0.0000, -1.0244],
        [ 0.0000, -0.9805],
        [ 0.0000, -0.9834],
        [ 0.0000, -0.9258],
        [ 0.0000, -0.9131],
        [ 0.0000, -0.9868],
        [ 0.0000, -0.9639],
        [ 0.0

sorted_answers: [{'start': tensor(41, device='cuda:0'), 'end': tensor(51, device='cuda:0'), 'start_logit': tensor(0.9639, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.6191, device='cuda:0', dtype=torch.float16)}, {'start': tensor(369, device='cuda:0'), 'end': tensor(383, device='cuda:0'), 'start_logit': tensor(1.1211, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3511, device='cuda:0', dtype=torch.float16)}, {'start': tensor(377, device='cuda:0'), 'end': tensor(383, device='cuda:0'), 'start_logit': tensor(1.0518, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3511, device='cuda:0', dtype=torch.float16)}, {'start': tensor(535, device='cuda:0'), 'end': tensor(537, device='cuda:0'), 'start_logit': tensor(0.9131, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4104, device='cuda:0', dtype=torch.float16)}, {'start': tensor(365, device='cuda:0'), 'end': tensor(383, device='cuda:0'), 'start_logit': tensor(0.9360, device='cuda:0', dtype

sp_para_output_t:  tensor([[[0.1230],
         [0.0267],
         [0.0003],
         [0.0997],
         [0.1021],
         [0.0622],
         [0.0459],
         [0.1267],
         [0.0909],
         [0.0672]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([28, 2])
sp_sent.view(-1).size():  torch.Size([28])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.8496],
        [ 0.0000, -0.8486],
        [ 0.0000, -0.7246],
        [ 0.0000, -0.7495],
        [ 0.0000, -0.8418],
        [ 0.0000, -0.7915],
        [ 0.0000, -0.7793],
        [ 0.0000, -0.8550],
        [ 0.0000, -0.9058],
        [ 0.0000, -0.8867],
        [ 0.0000, -0.8999],
        [ 0.0000, -0.8623],
        [ 0.0000, -0.8667],
        [ 0.0000, -0.9331],
        [ 0.0000, -0.9331],
        [ 0.0000, -0.9468],
        [ 0.0000, -0.9282],
        [ 0.0000, -0.9277],
        [ 0.0000, -0.8647],
        [ 0.0000, -0.8623],
        [ 0.0000, -0.9546],
        [ 0.0000, -0.87

sp_para_output_t:  tensor([[[ 0.0556],
         [ 0.0083],
         [ 0.0294],
         [-0.0126],
         [ 0.0560],
         [ 0.0119],
         [ 0.0243],
         [ 0.0448],
         [ 0.0230],
         [ 0.0520]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([28, 2])
sp_sent.view(-1).size():  torch.Size([28])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.7158],
        [ 0.0000, -0.7354],
        [ 0.0000, -0.6826],
        [ 0.0000, -0.6108],
        [ 0.0000, -0.7197],
        [ 0.0000, -0.7114],
        [ 0.0000, -0.7808],
        [ 0.0000, -0.7065],
        [ 0.0000, -0.7031],
        [ 0.0000, -0.7427],
        [ 0.0000, -0.7075],
        [ 0.0000, -0.7158],
        [ 0.0000, -0.7500],
        [ 0.0000, -0.7578],
        [ 0.0000, -0.7310],
        [ 0.0000, -0.6948],
        [ 0.0000, -0.7349],
        [ 0.0000, -0.7510],
        [ 0.0000, -0.7852],
        [ 0.0000, -0.7202],
        [ 0.0000, -0.7471],
        [ 0.0

sp_para_output_t:  tensor([[[ 0.1250],
         [-0.0010],
         [ 0.0551],
         [-0.0382],
         [ 0.1122],
         [ 0.1360],
         [ 0.0975],
         [ 0.1000],
         [ 0.0876],
         [ 0.1666]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([16, 2])
sp_sent.view(-1).size():  torch.Size([16])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.7021],
        [ 0.0000, -0.6846],
        [ 0.0000, -0.7383],
        [ 0.0000, -0.7134],
        [ 0.0000, -0.7114],
        [ 0.0000, -0.7822],
        [ 0.0000, -0.7485],
        [ 0.0000, -0.6519],
        [ 0.0000, -0.8145],
        [ 0.0000, -0.7686],
        [ 0.0000, -0.7207],
        [ 0.0000, -0.7549],
        [ 0.0000, -0.7368],
        [ 0.0000, -0.7158],
        [ 0.0000, -0.7310],
        [ 0.0000, -0.7705]], device='cuda:0')
sp_sent.view(-1):  tensor([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], device='cuda:0')
question_end_index:  tensor([15], device='

sorted_answers: [{'start': tensor(332, device='cuda:0'), 'end': tensor(345, device='cuda:0'), 'start_logit': tensor(1.0186, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.0831, device='cuda:0', dtype=torch.float16)}, {'start': tensor(95, device='cuda:0'), 'end': tensor(113, device='cuda:0'), 'start_logit': tensor(0.9102, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.1788, device='cuda:0', dtype=torch.float16)}, {'start': tensor(95, device='cuda:0'), 'end': tensor(123, device='cuda:0'), 'start_logit': tensor(0.9102, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.1656, device='cuda:0', dtype=torch.float16)}, {'start': tensor(331, device='cuda:0'), 'end': tensor(345, device='cuda:0'), 'start_logit': tensor(0.9902, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.0831, device='cuda:0', dtype=torch.float16)}, {'start': tensor(70, device='cuda:0'), 'end': tensor(77, device='cuda:0'), 'start_logit': tensor(0.8926, device='cuda:0', dtype=t

sp_para_output_t:  tensor([[[ 0.0761],
         [ 0.0216],
         [ 0.0409],
         [ 0.0370],
         [ 0.0201],
         [ 0.0386],
         [ 0.0670],
         [ 0.1090],
         [ 0.0017],
         [-0.1193]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([29, 2])
sp_sent.view(-1).size():  torch.Size([29])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.8745],
        [ 0.0000, -0.8174],
        [ 0.0000, -0.8311],
        [ 0.0000, -0.9580],
        [ 0.0000, -0.9336],
        [ 0.0000, -0.9302],
        [ 0.0000, -0.8979],
        [ 0.0000, -0.9160],
        [ 0.0000, -0.9102],
        [ 0.0000, -0.9531],
        [ 0.0000, -0.9141],
        [ 0.0000, -0.9785],
        [ 0.0000, -0.9199],
        [ 0.0000, -0.9521],
        [ 0.0000, -0.9253],
        [ 0.0000, -0.9912],
        [ 0.0000, -0.9751],
        [ 0.0000, -0.9326],
        [ 0.0000, -0.8921],
        [ 0.0000, -0.8511],
        [ 0.0000, -0.9712],
        [ 0.0

sorted_answers: [{'start': tensor(643, device='cuda:0'), 'end': tensor(652, device='cuda:0'), 'start_logit': tensor(0.9883, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3762, device='cuda:0', dtype=torch.float16)}, {'start': tensor(87, device='cuda:0'), 'end': tensor(110, device='cuda:0'), 'start_logit': tensor(0.9360, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3887, device='cuda:0', dtype=torch.float16)}, {'start': tensor(459, device='cuda:0'), 'end': tensor(484, device='cuda:0'), 'start_logit': tensor(0.8560, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4414, device='cuda:0', dtype=torch.float16)}, {'start': tensor(66, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': tensor(0.9546, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3245, device='cuda:0', dtype=torch.float16)}, {'start': tensor(332, device='cuda:0'), 'end': tensor(340, device='cuda:0'), 'start_logit': tensor(0.9097, device='cuda:0', dtype=

sp_para_output_t:  tensor([[[ 0.0764],
         [ 0.0457],
         [-0.0152],
         [ 0.0510],
         [ 0.0996],
         [-0.0208],
         [ 0.0554],
         [ 0.0989],
         [ 0.0224],
         [ 0.0266]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.8877],
        [ 0.0000, -0.7212],
        [ 0.0000, -0.8882],
        [ 0.0000, -0.8281],
        [ 0.0000, -0.8735],
        [ 0.0000, -0.8833],
        [ 0.0000, -0.8779],
        [ 0.0000, -0.8911],
        [ 0.0000, -0.8867],
        [ 0.0000, -0.9150],
        [ 0.0000, -0.9624],
        [ 0.0000, -0.9204],
        [ 0.0000, -0.9541],
        [ 0.0000, -0.9697],
        [ 0.0000, -0.9121],
        [ 0.0000, -0.8491],
        [ 0.0000, -0.8335],
        [ 0.0000, -0.8726]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0

sp_para_output_t:  tensor([[[-0.0049],
         [-0.0566],
         [ 0.0391],
         [-0.0248],
         [-0.0542],
         [-0.0328],
         [-0.0478],
         [-0.0393],
         [ 0.0304],
         [ 0.0587]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([24, 2])
sp_sent.view(-1).size():  torch.Size([24])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.7012],
        [ 0.0000, -0.6704],
        [ 0.0000, -0.7119],
        [ 0.0000, -0.6572],
        [ 0.0000, -0.4990],
        [ 0.0000, -0.7310],
        [ 0.0000, -0.6694],
        [ 0.0000, -0.6685],
        [ 0.0000, -0.6929],
        [ 0.0000, -0.7168],
        [ 0.0000, -0.6484],
        [ 0.0000, -0.6387],
        [ 0.0000, -0.7656],
        [ 0.0000, -0.7446],
        [ 0.0000, -0.3113],
        [ 0.0000, -0.7329],
        [ 0.0000, -0.7012],
        [ 0.0000, -0.6982],
        [ 0.0000, -0.7334],
        [ 0.0000, -0.6245],
        [ 0.0000, -0.7744],
        [ 0.0

sorted_answers: [{'start': tensor(20, device='cuda:0'), 'end': tensor(42, device='cuda:0'), 'start_logit': tensor(0.8257, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2426, device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4]
evidence_candidates:  {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64), 4: tensor([], device='cuda:0', dtype=torch.int64)}
para_sents_offset:  [0, 7, 13, 14, 19, 24]
sp_para_pred:  tensor([2, 3], device='cuda:0')
sp_sent_pred:  []
sp_sent_map:  {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 9, 8: 10, 9: 11, 10: 12, 11: 13, 12: 14, 13: 15, 14: 28, 15: 29, 16: 30, 17: 31, 18: 32, 19: 33, 20: 34, 21: 35, 22: 36, 23: 37}
sp_para_map:  {0: 0, 1: 2, 2: 3, 3: 8, 4: 9}
sp_para_output_t:  tensor([[[0.0422],
         [0.1191],
        

sp_para_output_t:  tensor([[[ 0.0594],
         [-0.0594],
         [ 0.0709],
         [-0.0013],
         [-0.0516],
         [ 0.0499],
         [-0.0126],
         [-0.0082],
         [ 0.0620],
         [ 0.0451]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([21, 2])
sp_sent.view(-1).size():  torch.Size([21])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.8213],
        [ 0.0000, -0.8555],
        [ 0.0000, -0.7979],
        [ 0.0000, -0.8276],
        [ 0.0000, -0.8145],
        [ 0.0000, -0.9194],
        [ 0.0000, -0.9004],
        [ 0.0000, -0.8193],
        [ 0.0000, -0.8550],
        [ 0.0000, -0.8091],
        [ 0.0000, -0.9629],
        [ 0.0000, -0.8140],
        [ 0.0000, -0.8125],
        [ 0.0000, -0.9116],
        [ 0.0000, -0.7300],
        [ 0.0000, -0.8604],
        [ 0.0000, -0.8760],
        [ 0.0000, -0.5366],
        [ 0.0000, -0.8218],
        [ 0.0000, -0.8184],
        [ 0.0000, -0.9229]], device='cuda

sorted_answers: [{'start': tensor(439, device='cuda:0'), 'end': tensor(443, device='cuda:0'), 'start_logit': tensor(0.9800, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5396, device='cuda:0', dtype=torch.float16)}, {'start': tensor(58, device='cuda:0'), 'end': tensor(72, device='cuda:0'), 'start_logit': tensor(0.9658, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5430, device='cuda:0', dtype=torch.float16)}, {'start': tensor(588, device='cuda:0'), 'end': tensor(614, device='cuda:0'), 'start_logit': tensor(0.9761, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3120, device='cuda:0', dtype=torch.float16)}, {'start': tensor(58, device='cuda:0'), 'end': tensor(77, device='cuda:0'), 'start_logit': tensor(0.9658, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2991, device='cuda:0', dtype=torch.float16)}, {'start': tensor(29, device='cuda:0'), 'end': tensor(58, device='cuda:0'), 'start_logit': tensor(0.9917, device='cuda:0', dtype=tor

sp_para_output_t:  tensor([[[ 0.0203],
         [-0.0322],
         [ 0.0471],
         [ 0.0707],
         [-0.0113],
         [-0.0396],
         [ 0.0366],
         [ 0.0046],
         [ 0.0272],
         [ 0.0562]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([22, 2])
sp_sent.view(-1).size():  torch.Size([22])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.6704],
        [ 0.0000, -0.6719],
        [ 0.0000, -0.6230],
        [ 0.0000, -0.6343],
        [ 0.0000, -0.5977],
        [ 0.0000, -0.1704],
        [ 0.0000, -0.5957],
        [ 0.0000, -0.6543],
        [ 0.0000, -0.5210],
        [ 0.0000, -0.1703],
        [ 0.0000, -0.7207],
        [ 0.0000, -0.6743],
        [ 0.0000, -0.6909],
        [ 0.0000, -0.7041],
        [ 0.0000, -0.7002],
        [ 0.0000, -0.6016],
        [ 0.0000, -0.7505],
        [ 0.0000, -0.6948],
        [ 0.0000, -0.7407],
        [ 0.0000, -0.6924],
        [ 0.0000, -0.6396],
        [ 0.0

sorted_answers: [{'start': tensor(76, device='cuda:0'), 'end': tensor(77, device='cuda:0'), 'start_logit': tensor(0.9746, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.6479, device='cuda:0', dtype=torch.float16)}, {'start': tensor(48, device='cuda:0'), 'end': tensor(77, device='cuda:0'), 'start_logit': tensor(0.9121, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.6479, device='cuda:0', dtype=torch.float16)}, {'start': tensor(56, device='cuda:0'), 'end': tensor(77, device='cuda:0'), 'start_logit': tensor(0.8862, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.6479, device='cuda:0', dtype=torch.float16)}, {'start': tensor(76, device='cuda:0'), 'end': tensor(88, device='cuda:0'), 'start_logit': tensor(0.9746, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4238, device='cuda:0', dtype=torch.float16)}, {'start': tensor(76, device='cuda:0'), 'end': tensor(84, device='cuda:0'), 'start_logit': tensor(0.9746, device='cuda:0', dtype=torch.f

sp_para_output_t:  tensor([[[ 0.0377],
         [-0.0421],
         [ 0.0136],
         [-0.0545],
         [ 0.0500],
         [-0.0019],
         [-0.0224],
         [-0.0699],
         [-0.0588],
         [-0.0276]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([30, 2])
sp_sent.view(-1).size():  torch.Size([30])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.9082],
        [ 0.0000, -0.9341],
        [ 0.0000, -0.8560],
        [ 0.0000, -0.7524],
        [ 0.0000, -0.9424],
        [ 0.0000, -0.9097],
        [ 0.0000, -0.8901],
        [ 0.0000, -0.9512],
        [ 0.0000, -0.9458],
        [ 0.0000, -0.9380],
        [ 0.0000, -0.9609],
        [ 0.0000, -1.0127],
        [ 0.0000, -0.8911],
        [ 0.0000, -0.9414],
        [ 0.0000, -1.0059],
        [ 0.0000, -0.9473],
        [ 0.0000, -0.9741],
        [ 0.0000, -0.9551],
        [ 0.0000, -0.9976],
        [ 0.0000, -0.9595],
        [ 0.0000, -0.9551],
        [ 0.0

sp_para_output_t:  tensor([[[ 0.0070],
         [ 0.0013],
         [ 0.0386],
         [ 0.1003],
         [ 0.0265],
         [ 0.1079],
         [ 0.0259],
         [ 0.1171],
         [-0.0012],
         [ 0.0015]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.8203],
        [ 0.0000, -0.8218],
        [ 0.0000, -0.8384],
        [ 0.0000, -0.9194],
        [ 0.0000, -1.0166],
        [ 0.0000, -0.9424],
        [ 0.0000, -0.8784],
        [ 0.0000, -0.9873],
        [ 0.0000, -0.9458],
        [ 0.0000, -0.9917],
        [ 0.0000, -0.9868],
        [ 0.0000, -0.8804],
        [ 0.0000, -0.9980],
        [ 0.0000, -1.0029],
        [ 0.0000, -0.9141],
        [ 0.0000, -0.9556],
        [ 0.0000, -0.9819],
        [ 0.0000, -0.9463]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

sp_para_output_t:  tensor([[[ 0.1254],
         [-0.0410],
         [ 0.0015],
         [ 0.0570],
         [ 0.0084],
         [ 0.1212],
         [ 0.0782],
         [ 0.0258],
         [ 0.1570],
         [ 0.1115]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([21, 2])
sp_sent.view(-1).size():  torch.Size([21])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.6401],
        [ 0.0000, -0.6455],
        [ 0.0000, -0.6172],
        [ 0.0000, -0.6680],
        [ 0.0000, -0.5850],
        [ 0.0000, -0.6396],
        [ 0.0000, -0.5083],
        [ 0.0000, -0.1780],
        [ 0.0000, -0.1781],
        [ 0.0000, -0.6577],
        [ 0.0000, -0.6245],
        [ 0.0000, -0.6475],
        [ 0.0000, -0.6504],
        [ 0.0000, -0.6733],
        [ 0.0000, -0.7256],
        [ 0.0000, -0.7295],
        [ 0.0000, -0.7178],
        [ 0.0000, -0.7012],
        [ 0.0000, -0.6377],
        [ 0.0000, -0.6270],
        [ 0.0000, -0.1781]], device='cuda

sp_para_output_t:  tensor([[[ 0.1019],
         [-0.0118],
         [ 0.0320],
         [ 0.0769],
         [ 0.0533],
         [ 0.0350],
         [ 0.1267],
         [-0.0481],
         [-0.0014],
         [ 0.0663]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([20, 2])
sp_sent.view(-1).size():  torch.Size([20])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.7368],
        [ 0.0000, -0.7383],
        [ 0.0000, -0.7578],
        [ 0.0000, -0.8193],
        [ 0.0000, -0.7871],
        [ 0.0000, -0.7437],
        [ 0.0000, -0.7202],
        [ 0.0000, -0.7944],
        [ 0.0000, -0.8496],
        [ 0.0000, -0.7754],
        [ 0.0000, -0.8394],
        [ 0.0000, -0.7837],
        [ 0.0000, -0.8442],
        [ 0.0000, -0.7861],
        [ 0.0000, -0.8623],
        [ 0.0000, -0.7915],
        [ 0.0000, -0.7681],
        [ 0.0000, -0.6997],
        [ 0.0000, -0.7871],
        [ 0.0000, -0.8105]], device='cuda:0')
sp_sent.view(-1):  tens

sp_para_output_t:  tensor([[[0.0670],
         [0.0795],
         [0.0413],
         [0.0567],
         [0.0617],
         [0.0717],
         [0.1141],
         [0.0519],
         [0.0416],
         [0.0511]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([15, 2])
sp_sent.view(-1).size():  torch.Size([15])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.8081],
        [ 0.0000, -0.9419],
        [ 0.0000, -0.9023],
        [ 0.0000, -0.8472],
        [ 0.0000, -0.8662],
        [ 0.0000, -0.8721],
        [ 0.0000, -0.9370],
        [ 0.0000, -0.9468],
        [ 0.0000, -0.9268],
        [ 0.0000, -0.8799],
        [ 0.0000, -0.9385],
        [ 0.0000, -0.9766],
        [ 0.0000, -0.8682],
        [ 0.0000, -0.9478],
        [ 0.0000, -0.9624]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([11], device='cuda:0')
start_logits:  tensor([ 0.3186, 

sorted_answers: [{'start': tensor(95, device='cuda:0'), 'end': tensor(96, device='cuda:0'), 'start_logit': tensor(0.8462, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3306, device='cuda:0', dtype=torch.float16)}, {'start': tensor(95, device='cuda:0'), 'end': tensor(97, device='cuda:0'), 'start_logit': tensor(0.8462, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3149, device='cuda:0', dtype=torch.float16)}, {'start': tensor(161, device='cuda:0'), 'end': tensor(162, device='cuda:0'), 'start_logit': tensor(0.8164, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2477, device='cuda:0', dtype=torch.float16)}, {'start': tensor(97, device='cuda:0'), 'end': tensor(97, device='cuda:0'), 'start_logit': tensor(0.7310, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3149, device='cuda:0', dtype=torch.float16)}, {'start': tensor(27, device='cuda:0'), 'end': tensor(28, device='cuda:0'), 'start_logit': tensor(0.7974, device='cuda:0', dtype=torch

predict_support_sent.view(-1, 2).size():  torch.Size([27, 2])
sp_sent.view(-1).size():  torch.Size([27])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.8340],
        [ 0.0000, -0.8521],
        [ 0.0000, -0.9131],
        [ 0.0000, -0.9204],
        [ 0.0000, -0.8228],
        [ 0.0000, -0.8721],
        [ 0.0000, -0.9546],
        [ 0.0000, -0.8804],
        [ 0.0000, -0.9004],
        [ 0.0000, -0.8374],
        [ 0.0000, -0.9731],
        [ 0.0000, -0.9673],
        [ 0.0000, -0.8896],
        [ 0.0000, -0.9580],
        [ 0.0000, -0.8994],
        [ 0.0000, -0.8594],
        [ 0.0000, -0.8604],
        [ 0.0000, -0.6392],
        [ 0.0000, -0.9072],
        [ 0.0000, -0.8657],
        [ 0.0000, -0.8247],
        [ 0.0000, -0.7876],
        [ 0.0000, -0.9102],
        [ 0.0000, -0.9204],
        [ 0.0000, -0.9321],
        [ 0.0000, -0.8115],
        [ 0.0000, -0.9209]], device='cuda:0')
sp_sent.view(-1):  tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

predict_support_sent.view(-1, 2).size():  torch.Size([33, 2])
sp_sent.view(-1).size():  torch.Size([33])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.7300],
        [ 0.0000, -0.9053],
        [ 0.0000, -0.9863],
        [ 0.0000, -0.8696],
        [ 0.0000, -0.9604],
        [ 0.0000, -0.7383],
        [ 0.0000, -1.0029],
        [ 0.0000, -0.9307],
        [ 0.0000, -0.7466],
        [ 0.0000, -0.9424],
        [ 0.0000, -0.9590],
        [ 0.0000, -0.9678],
        [ 0.0000, -0.9888],
        [ 0.0000, -0.8896],
        [ 0.0000, -0.9238],
        [ 0.0000, -0.8940],
        [ 0.0000, -0.9360],
        [ 0.0000, -0.8955],
        [ 0.0000, -0.8491],
        [ 0.0000, -0.8813],
        [ 0.0000, -0.8066],
        [ 0.0000, -0.8774],
        [ 0.0000, -0.9160],
        [ 0.0000, -0.9009],
        [ 0.0000, -0.9810],
        [ 0.0000, -0.8882],
        [ 0.0000, -0.8809],
        [ 0.0000, -0.9062],
        [ 0.0000, -0.9883],
        [ 0.0000, -0.9160],
        [ 0.0000, -0

predict_support_sent.view(-1, 2).size():  torch.Size([31, 2])
sp_sent.view(-1).size():  torch.Size([31])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.6055],
        [ 0.0000, -0.9331],
        [ 0.0000, -0.8984],
        [ 0.0000, -0.8696],
        [ 0.0000, -0.8867],
        [ 0.0000, -0.9204],
        [ 0.0000, -0.5083],
        [ 0.0000, -0.9551],
        [ 0.0000, -0.9712],
        [ 0.0000, -0.9404],
        [ 0.0000, -0.8735],
        [ 0.0000, -0.9839],
        [ 0.0000, -0.8848],
        [ 0.0000, -1.0117],
        [ 0.0000, -0.9385],
        [ 0.0000, -0.9692],
        [ 0.0000, -0.9766],
        [ 0.0000, -0.9312],
        [ 0.0000, -0.9229],
        [ 0.0000, -0.8208],
        [ 0.0000, -0.9258],
        [ 0.0000, -0.8799],
        [ 0.0000, -0.9224],
        [ 0.0000, -0.9541],
        [ 0.0000, -0.9565],
        [ 0.0000, -0.9028],
        [ 0.0000, -0.9829],
        [ 0.0000, -0.8091],
        [ 0.0000, -0.9790],
        [ 0.0000, -0.7744],
        [ 0.0000, -0

predict_support_sent.view(-1, 2).size():  torch.Size([19, 2])
sp_sent.view(-1).size():  torch.Size([19])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.5781],
        [ 0.0000, -0.6382],
        [ 0.0000, -0.6230],
        [ 0.0000, -0.5996],
        [ 0.0000, -0.6450],
        [ 0.0000, -0.1696],
        [ 0.0000, -0.6475],
        [ 0.0000, -0.6982],
        [ 0.0000, -0.5933],
        [ 0.0000, -0.6035],
        [ 0.0000, -0.6377],
        [ 0.0000, -0.6509],
        [ 0.0000, -0.5977],
        [ 0.0000, -0.6118],
        [ 0.0000, -0.6372],
        [ 0.0000, -0.6616],
        [ 0.0000, -0.6187],
        [ 0.0000, -0.5991],
        [ 0.0000, -0.6016]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([29], device='cuda:0')
start_logits:  tensor([-1.8408e-01, -5.1416e-01, -1.7139e-01, -3.3643e-01,  2.4341e-01,
        -4.7266e-01,  5.8398e-01, -6.4307e-01, -1.3953e-01, -3

predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.7974],
        [ 0.0000, -0.7202],
        [ 0.0000, -0.8618],
        [ 0.0000, -0.8794],
        [ 0.0000, -0.8896],
        [ 0.0000, -0.9185],
        [ 0.0000, -0.9521],
        [ 0.0000, -0.9849],
        [ 0.0000, -0.9565],
        [ 0.0000, -0.9268],
        [ 0.0000, -0.9585],
        [ 0.0000, -0.9355],
        [ 0.0000, -0.9111],
        [ 0.0000, -0.9307],
        [ 0.0000, -0.8677],
        [ 0.0000, -0.9365],
        [ 0.0000, -0.9775],
        [ 0.0000, -0.8032],
        [ 0.0000, -0.9390],
        [ 0.0000, -0.8965],
        [ 0.0000, -0.8530],
        [ 0.0000, -0.8364],
        [ 0.0000, -0.9463]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       device='cuda:0')
question_end_index:  tensor([21], device='cuda:0')
start_logits:  tensor([ 2.

predict_support_sent.view(-1, 2).size():  torch.Size([13, 2])
sp_sent.view(-1).size():  torch.Size([13])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.0039],
        [ 0.0000, -0.9561],
        [ 0.0000, -0.9766],
        [ 0.0000, -0.9473],
        [ 0.0000, -0.9272],
        [ 0.0000, -0.9834],
        [ 0.0000, -0.9751],
        [ 0.0000, -0.9888],
        [ 0.0000, -0.8979],
        [ 0.0000, -1.0684],
        [ 0.0000, -1.0615],
        [ 0.0000, -1.0303],
        [ 0.0000, -1.0439]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([16], device='cuda:0')
start_logits:  tensor([ 0.2446,  0.1082,  0.0422,  0.1421,  0.1597,  0.1064, -0.1919,  0.6719,
         0.3225,  0.3860,  0.2119,  0.0931,  0.3633,  0.5127,  0.2849,  0.5264,
        -0.3945,  0.3677, -0.3921,  0.0266, -0.3606,  0.2546, -0.2620, -0.0695,
        -0.4529,  0.2130,  0.9731,  0.0808,  0.4419,  0.5767,  0.6538,  0.3918,
        

predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.7319],
        [ 0.0000, -0.7163],
        [ 0.0000, -0.6201],
        [ 0.0000, -0.6968],
        [ 0.0000, -0.6001],
        [ 0.0000, -0.5913],
        [ 0.0000, -0.6934],
        [ 0.0000, -0.6157],
        [ 0.0000, -0.6670],
        [ 0.0000, -0.6509],
        [ 0.0000, -0.1694],
        [ 0.0000, -0.7012],
        [ 0.0000, -0.7256],
        [ 0.0000, -0.7490],
        [ 0.0000, -0.7295],
        [ 0.0000, -0.6987],
        [ 0.0000, -0.6548],
        [ 0.0000, -0.6538],
        [ 0.0000, -0.6401],
        [ 0.0000, -0.6357],
        [ 0.0000, -0.6763],
        [ 0.0000, -0.6489],
        [ 0.0000, -0.5137]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([15], device='cuda:0')
start_logits:  tensor([-4.


Epoch 00003: avg_val_f1 reached 0.00000 (best 0.06897), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_3.ckpt as top 5


validation_epoch_end
before sync --> sizes:  29, 29, 29
after sync --> sizes: 29, 29, 29
avg_loss:  tensor(15.6381, device='cuda:0')	avg_answer_loss:  tensor(3.4111, device='cuda:0')	avg_type_loss:  tensor(0.5865, device='cuda:0')	avg_sp_para_loss:  tensor(0.5074, device='cuda:0')	avg_sp_sent_loss:  tensor(0.4221, device='cuda:0')
avg_val_f1:  0.0	avg_val_em:  0.0	avg_val_prec:  0.0	avg_val_recall:  0.0
avg_val_sp_sent_f1:  0.0	avg_val_sp_sent_em:  0.0	avg_val_sp_sent_prec:  0.0	avg_val_sp_sent_recall:  0.0
avg_val_sp_para_f1:  0.1724137931034483	avg_val_sp_para_em:  0.034482758620689655	avg_val_sp_para_prec:  0.1724137931034483	avg_val_sp_para_recall:  0.1724137931034483
avg_val_joint_f1:  0.0	avg_val_joint_em:  0.0	avg_val_joint_prec:  0.0	avg_val_joint_recall:  0.0
sp_para_output_t:  tensor([[[0.0810],
         [0.0656],
         [0.0286],
         [0.1017],
         [0.1542],
         [0.0816],
         [0.0820],
         [0.1010],
         [0.0636],
         [0.0726]]], device='cu

sp_para_output_t:  tensor([[[ 0.0626],
         [-0.0596],
         [ 0.0381],
         [ 0.1136],
         [ 0.0433],
         [-0.0141],
         [-0.0143],
         [-0.0493],
         [ 0.0110],
         [ 0.0421]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([13, 2])
sp_sent.view(-1).size():  torch.Size([13])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.5659],
        [ 0.0000, -0.1619],
        [ 0.0000, -0.5669],
        [ 0.0000, -0.5093],
        [ 0.0000, -0.6230],
        [ 0.0000, -0.1617],
        [ 0.0000, -0.5444],
        [ 0.0000, -0.6104],
        [ 0.0000, -0.5400],
        [ 0.0000, -0.6880],
        [ 0.0000, -0.1467],
        [ 0.0000, -0.6323],
        [ 0.0000, -0.5698]], device='cuda:0', grad_fn=<ViewBackward>)
sp_sent.view(-1):  tensor([0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
sp_para_output_t:  tensor([[[ 0.0079],
         [ 0.0354],
         [ 0.0033],
 

sp_para_output_t:  tensor([[[ 0.0654],
         [ 0.0425],
         [ 0.0219],
         [ 0.0512],
         [-0.0777],
         [-0.0211],
         [ 0.0533],
         [ 0.0424],
         [ 0.1085],
         [-0.0297]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([28, 2])
sp_sent.view(-1).size():  torch.Size([28])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -0.9883],
        [ 0.0000, -0.9458],
        [ 0.0000, -1.1221],
        [ 0.0000, -0.9766],
        [ 0.0000, -0.9619],
        [ 0.0000, -0.9907],
        [ 0.0000, -0.9785],
        [ 0.0000, -1.0107],
        [ 0.0000, -0.7720],
        [ 0.0000, -1.1416],
        [ 0.0000, -0.9985],
        [ 0.0000, -1.0088],
        [ 0.0000, -1.0723],
        [ 0.0000, -1.0996],
        [ 0.0000, -0.9043],
        [ 0.0000, -1.0811],
        [ 0.0000, -0.8872],
        [ 0.0000, -1.0703],
        [ 0.0000, -1.1406],
        [ 0.0000, -1.1543],
        [ 

sp_para_output_t:  tensor([[[ 0.1595],
         [ 0.0757],
         [ 0.0237],
         [-0.0014],
         [ 0.0218],
         [ 0.0596],
         [ 0.1371],
         [-0.0062],
         [-0.0123],
         [-0.0035]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.1904],
        [ 0.0000, -1.0391],
        [ 0.0000, -1.1094],
        [ 0.0000, -1.1924],
        [ 0.0000, -1.2139],
        [ 0.0000, -1.0459],
        [ 0.0000, -1.1387],
        [ 0.0000, -1.0537],
        [ 0.0000, -1.1074],
        [ 0.0000, -1.1094],
        [ 0.0000, -1.2061],
        [ 0.0000, -0.9717],
        [ 0.0000, -1.2500],
        [ 0.0000, -1.0625],
        [ 0.0000, -1.1377],
        [ 0.0000, -1.1572],
        [ 0.0000, -0.9111],
        [ 0.0000, -1.1494]], device='cuda:0', grad_fn=<ViewBackward>)
sp_sent.view(-1):  tenso

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

sp_para_output_t:  tensor([[[ 0.0901],
         [-0.0195],
         [-0.0067],
         [ 0.0177],
         [ 0.0817],
         [ 0.0364],
         [ 0.0061],
         [ 0.0639],
         [-0.0488],
         [ 0.0101]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.3174],
        [ 0.0000, -1.2686],
        [ 0.0000, -1.3135],
        [ 0.0000, -1.2842],
        [ 0.0000, -1.3555],
        [ 0.0000, -1.3164],
        [ 0.0000, -1.3271],
        [ 0.0000, -1.2988],
        [ 0.0000, -1.2891],
        [ 0.0000, -1.3447],
        [ 0.0000, -1.3564],
        [ 0.0000, -1.3984],
        [ 0.0000, -1.3887],
        [ 0.0000, -1.3086],
        [ 0.0000, -1.3262],
        [ 0.0000, -1.3643],
        [ 0.0000, -1.3076],
        [ 0.0000, -1.3154],
        [ 0.0000, -1.3135],
        [ 0.0000, -1.3389],
        [ 0.0000, -1.3584],
        [ 0.0

sp_para_output_t:  tensor([[[ 0.0432],
         [-0.0465],
         [ 0.0324],
         [ 0.0601],
         [ 0.0131],
         [-0.0060],
         [ 0.0294],
         [ 0.0386],
         [-0.0420],
         [-0.0339]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([36, 2])
sp_sent.view(-1).size():  torch.Size([36])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.3760],
        [ 0.0000, -1.2881],
        [ 0.0000, -1.3643],
        [ 0.0000, -1.3740],
        [ 0.0000, -1.3145],
        [ 0.0000, -1.3848],
        [ 0.0000, -1.3682],
        [ 0.0000, -1.4053],
        [ 0.0000, -1.3799],
        [ 0.0000, -1.3379],
        [ 0.0000, -1.4004],
        [ 0.0000, -1.3994],
        [ 0.0000, -1.3955],
        [ 0.0000, -1.3711],
        [ 0.0000, -1.3994],
        [ 0.0000, -1.3408],
        [ 0.0000, -1.3779],
        [ 0.0000, -1.4268],
        [ 0.0000, -1.3076],
        [ 0.0000, -1.3926],
        [ 0.0000, -1.3818],
        [ 0.0

sorted_answers: [{'start': tensor(535, device='cuda:0'), 'end': tensor(537, device='cuda:0'), 'start_logit': tensor(0.9268, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4595, device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4]
evidence_candidates:  {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64), 4: tensor([], device='cuda:0', dtype=torch.int64)}
para_sents_offset:  [0, 7, 13, 21, 27, 36]
sp_para_pred:  tensor([3, 1], device='cuda:0')
sp_sent_pred:  []
sp_sent_map:  {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 13, 8: 14, 9: 15, 10: 16, 11: 17, 12: 18, 13: 19, 14: 20, 15: 21, 16: 22, 17: 23, 18: 24, 19: 25, 20: 26, 21: 41, 22: 42, 23: 43, 24: 44, 25: 45, 26: 46, 27: 47, 28: 48, 29: 49, 30: 50, 31: 51, 32: 52,

sp_para_output_t:  tensor([[[0.1230],
         [0.0267],
         [0.0003],
         [0.0997],
         [0.1021],
         [0.0622],
         [0.0459],
         [0.1267],
         [0.0909],
         [0.0672]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([28, 2])
sp_sent.view(-1).size():  torch.Size([28])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2725],
        [ 0.0000, -1.3076],
        [ 0.0000, -1.1982],
        [ 0.0000, -1.2266],
        [ 0.0000, -1.2783],
        [ 0.0000, -1.2705],
        [ 0.0000, -1.2412],
        [ 0.0000, -1.3066],
        [ 0.0000, -1.3359],
        [ 0.0000, -1.3047],
        [ 0.0000, -1.3047],
        [ 0.0000, -1.3184],
        [ 0.0000, -1.3027],
        [ 0.0000, -1.3057],
        [ 0.0000, -1.3564],
        [ 0.0000, -1.3086],
        [ 0.0000, -1.3125],
        [ 0.0000, -1.3613],
        [ 0.0000, -1.3125],
        [ 0.0000, -1.2705],
        [ 0.0000, -1.3799],
        [ 0.0000, -1.29

sp_para_output_t:  tensor([[[ 0.0556],
         [ 0.0083],
         [ 0.0294],
         [-0.0126],
         [ 0.0560],
         [ 0.0119],
         [ 0.0243],
         [ 0.0448],
         [ 0.0230],
         [ 0.0520]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([28, 2])
sp_sent.view(-1).size():  torch.Size([28])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2715],
        [ 0.0000, -1.2783],
        [ 0.0000, -1.2412],
        [ 0.0000, -1.2412],
        [ 0.0000, -1.2803],
        [ 0.0000, -1.2852],
        [ 0.0000, -1.3418],
        [ 0.0000, -1.2480],
        [ 0.0000, -1.2490],
        [ 0.0000, -1.3154],
        [ 0.0000, -1.2559],
        [ 0.0000, -1.2734],
        [ 0.0000, -1.2871],
        [ 0.0000, -1.3047],
        [ 0.0000, -1.2959],
        [ 0.0000, -1.2324],
        [ 0.0000, -1.2812],
        [ 0.0000, -1.3086],
        [ 0.0000, -1.3516],
        [ 0.0000, -1.2930],
        [ 0.0000, -1.2920],
        [ 0.0

sp_para_output_t:  tensor([[[ 0.1250],
         [-0.0010],
         [ 0.0551],
         [-0.0382],
         [ 0.1122],
         [ 0.1360],
         [ 0.0975],
         [ 0.1000],
         [ 0.0876],
         [ 0.1666]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([16, 2])
sp_sent.view(-1).size():  torch.Size([16])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2383],
        [ 0.0000, -1.2041],
        [ 0.0000, -1.2676],
        [ 0.0000, -1.2549],
        [ 0.0000, -1.2354],
        [ 0.0000, -1.2773],
        [ 0.0000, -1.2432],
        [ 0.0000, -1.1992],
        [ 0.0000, -1.2783],
        [ 0.0000, -1.2549],
        [ 0.0000, -1.2119],
        [ 0.0000, -1.2666],
        [ 0.0000, -1.2324],
        [ 0.0000, -1.2285],
        [ 0.0000, -1.2373],
        [ 0.0000, -1.3018]], device='cuda:0')
sp_sent.view(-1):  tensor([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], device='cuda:0')
question_end_index:  tensor([15], device='

sorted_answers: [{'start': tensor(95, device='cuda:0'), 'end': tensor(113, device='cuda:0'), 'start_logit': tensor(0.9580, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2252, device='cuda:0', dtype=torch.float16)}, {'start': tensor(96, device='cuda:0'), 'end': tensor(113, device='cuda:0'), 'start_logit': tensor(0.9087, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2252, device='cuda:0', dtype=torch.float16)}, {'start': tensor(95, device='cuda:0'), 'end': tensor(123, device='cuda:0'), 'start_logit': tensor(0.9580, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.1587, device='cuda:0', dtype=torch.float16)}, {'start': tensor(72, device='cuda:0'), 'end': tensor(77, device='cuda:0'), 'start_logit': tensor(0.8999, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.1711, device='cuda:0', dtype=torch.float16)}, {'start': tensor(96, device='cuda:0'), 'end': tensor(123, device='cuda:0'), 'start_logit': tensor(0.9087, device='cuda:0', dtype=tor

sp_para_output_t:  tensor([[[ 0.0761],
         [ 0.0216],
         [ 0.0409],
         [ 0.0370],
         [ 0.0201],
         [ 0.0386],
         [ 0.0670],
         [ 0.1090],
         [ 0.0017],
         [-0.1193]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([29, 2])
sp_sent.view(-1).size():  torch.Size([29])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.3506],
        [ 0.0000, -1.3252],
        [ 0.0000, -1.3467],
        [ 0.0000, -1.3779],
        [ 0.0000, -1.3965],
        [ 0.0000, -1.3301],
        [ 0.0000, -1.3750],
        [ 0.0000, -1.4072],
        [ 0.0000, -1.3945],
        [ 0.0000, -1.4170],
        [ 0.0000, -1.3828],
        [ 0.0000, -1.3711],
        [ 0.0000, -1.3701],
        [ 0.0000, -1.3750],
        [ 0.0000, -1.3750],
        [ 0.0000, -1.3945],
        [ 0.0000, -1.4033],
        [ 0.0000, -1.4072],
        [ 0.0000, -1.3408],
        [ 0.0000, -1.3047],
        [ 0.0000, -1.4453],
        [ 0.0

sorted_answers: [{'start': tensor(643, device='cuda:0'), 'end': tensor(652, device='cuda:0'), 'start_logit': tensor(0.9336, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5249, device='cuda:0', dtype=torch.float16)}, {'start': tensor(87, device='cuda:0'), 'end': tensor(110, device='cuda:0'), 'start_logit': tensor(0.9077, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4619, device='cuda:0', dtype=torch.float16)}, {'start': tensor(482, device='cuda:0'), 'end': tensor(484, device='cuda:0'), 'start_logit': tensor(0.8433, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4961, device='cuda:0', dtype=torch.float16)}, {'start': tensor(66, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': tensor(0.9644, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3708, device='cuda:0', dtype=torch.float16)}, {'start': tensor(643, device='cuda:0'), 'end': tensor(650, device='cuda:0'), 'start_logit': tensor(0.9336, device='cuda:0', dtype=

sp_para_output_t:  tensor([[[ 0.0764],
         [ 0.0457],
         [-0.0152],
         [ 0.0510],
         [ 0.0996],
         [-0.0208],
         [ 0.0554],
         [ 0.0989],
         [ 0.0224],
         [ 0.0266]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.3447],
        [ 0.0000, -1.2119],
        [ 0.0000, -1.3301],
        [ 0.0000, -1.2861],
        [ 0.0000, -1.2998],
        [ 0.0000, -1.3525],
        [ 0.0000, -1.3066],
        [ 0.0000, -1.3105],
        [ 0.0000, -1.3809],
        [ 0.0000, -1.3320],
        [ 0.0000, -1.3359],
        [ 0.0000, -1.2852],
        [ 0.0000, -1.3721],
        [ 0.0000, -1.3779],
        [ 0.0000, -1.3428],
        [ 0.0000, -1.2344],
        [ 0.0000, -1.3105],
        [ 0.0000, -1.3135]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0

sp_para_output_t:  tensor([[[-0.0049],
         [-0.0566],
         [ 0.0391],
         [-0.0248],
         [-0.0542],
         [-0.0328],
         [-0.0478],
         [-0.0393],
         [ 0.0304],
         [ 0.0587]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([24, 2])
sp_sent.view(-1).size():  torch.Size([24])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2412],
        [ 0.0000, -1.2266],
        [ 0.0000, -1.2852],
        [ 0.0000, -1.2285],
        [ 0.0000, -1.0547],
        [ 0.0000, -1.2793],
        [ 0.0000, -1.2412],
        [ 0.0000, -1.2510],
        [ 0.0000, -1.2529],
        [ 0.0000, -1.2979],
        [ 0.0000, -1.2197],
        [ 0.0000, -1.2246],
        [ 0.0000, -1.3232],
        [ 0.0000, -1.2900],
        [ 0.0000, -0.6460],
        [ 0.0000, -1.2871],
        [ 0.0000, -1.2568],
        [ 0.0000, -1.2637],
        [ 0.0000, -1.2949],
        [ 0.0000, -1.1982],
        [ 0.0000, -1.2998],
        [ 0.0

sorted_answers: [{'start': tensor(619, device='cuda:0'), 'end': tensor(623, device='cuda:0'), 'start_logit': tensor(0.7842, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4016, device='cuda:0', dtype=torch.float16)}, {'start': tensor(619, device='cuda:0'), 'end': tensor(624, device='cuda:0'), 'start_logit': tensor(0.7842, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3625, device='cuda:0', dtype=torch.float16)}, {'start': tensor(226, device='cuda:0'), 'end': tensor(241, device='cuda:0'), 'start_logit': tensor(0.7983, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3452, device='cuda:0', dtype=torch.float16)}, {'start': tensor(214, device='cuda:0'), 'end': tensor(241, device='cuda:0'), 'start_logit': tensor(0.7563, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3452, device='cuda:0', dtype=torch.float16)}, {'start': tensor(619, device='cuda:0'), 'end': tensor(626, device='cuda:0'), 'start_logit': tensor(0.7842, device='cuda:0', dty

sp_para_output_t:  tensor([[[ 0.0594],
         [-0.0594],
         [ 0.0709],
         [-0.0013],
         [-0.0516],
         [ 0.0499],
         [-0.0126],
         [-0.0082],
         [ 0.0620],
         [ 0.0451]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([21, 2])
sp_sent.view(-1).size():  torch.Size([21])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2617],
        [ 0.0000, -1.2832],
        [ 0.0000, -1.2793],
        [ 0.0000, -1.2871],
        [ 0.0000, -1.2588],
        [ 0.0000, -1.3379],
        [ 0.0000, -1.2949],
        [ 0.0000, -1.3145],
        [ 0.0000, -1.3271],
        [ 0.0000, -1.2324],
        [ 0.0000, -1.3516],
        [ 0.0000, -1.2637],
        [ 0.0000, -1.2646],
        [ 0.0000, -1.3457],
        [ 0.0000, -1.2119],
        [ 0.0000, -1.2900],
        [ 0.0000, -1.3076],
        [ 0.0000, -1.0225],
        [ 0.0000, -1.2793],
        [ 0.0000, -1.2656],
        [ 0.0000, -1.3105]], device='cuda

sorted_answers: [{'start': tensor(58, device='cuda:0'), 'end': tensor(72, device='cuda:0'), 'start_logit': tensor(0.9150, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5996, device='cuda:0', dtype=torch.float16)}, {'start': tensor(439, device='cuda:0'), 'end': tensor(443, device='cuda:0'), 'start_logit': tensor(0.9614, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5527, device='cuda:0', dtype=torch.float16)}, {'start': tensor(122, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': tensor(0.8281, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5728, device='cuda:0', dtype=torch.float16)}, {'start': tensor(117, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': tensor(0.8208, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5728, device='cuda:0', dtype=torch.float16)}, {'start': tensor(588, device='cuda:0'), 'end': tensor(614, device='cuda:0'), 'start_logit': tensor(1.0010, device='cuda:0', dtype

sp_para_output_t:  tensor([[[ 0.0203],
         [-0.0322],
         [ 0.0471],
         [ 0.0707],
         [-0.0113],
         [-0.0396],
         [ 0.0366],
         [ 0.0046],
         [ 0.0272],
         [ 0.0562]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([22, 2])
sp_sent.view(-1).size():  torch.Size([22])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2051],
        [ 0.0000, -1.2012],
        [ 0.0000, -1.1777],
        [ 0.0000, -1.1855],
        [ 0.0000, -1.1445],
        [ 0.0000, -1.2080],
        [ 0.0000, -1.1436],
        [ 0.0000, -1.2109],
        [ 0.0000, -1.0352],
        [ 0.0000, -0.2678],
        [ 0.0000, -1.2568],
        [ 0.0000, -1.2158],
        [ 0.0000, -1.2080],
        [ 0.0000, -1.1982],
        [ 0.0000, -1.2373],
        [ 0.0000, -1.1514],
        [ 0.0000, -1.2871],
        [ 0.0000, -1.2090],
        [ 0.0000, -1.2627],
        [ 0.0000, -1.2158],
        [ 0.0000, -1.1748],
        [ 0.0

end_logits:  tensor([-5.0342e-01, -6.2646e-01, -3.0054e-01,  6.3110e-02, -5.4980e-01,
        -5.1270e-01, -5.5762e-01, -3.7402e-01, -2.1265e-01, -6.2061e-01,
        -1.1505e-01,  5.2686e-01, -5.9277e-01,  3.9062e-01, -4.7168e-01,
         5.3925e-02, -3.5059e-01,  5.4398e-03, -5.4395e-01,  4.9469e-02,
        -5.2783e-01, -2.1265e-01,  4.5746e-02, -2.0056e-01, -2.4902e-01,
        -5.0244e-01, -1.9788e-01, -6.5576e-01, -3.6469e-02, -6.7188e-01,
         2.0410e-01, -5.6839e-03, -2.8711e-01, -3.4668e-01,  1.3159e-01,
         1.2415e-01, -7.8491e-02, -4.3530e-01, -1.7664e-01,  1.4905e-01,
        -2.8030e-02, -3.2104e-01,  5.4962e-02, -4.0967e-01, -2.5562e-01,
        -4.7705e-01, -4.4629e-01, -1.6821e-01,  2.8320e-01,  2.0471e-01,
        -5.8447e-01,  4.3359e-01,  2.8076e-01,  7.0068e-02, -6.8994e-01,
         2.2815e-01, -9.5825e-02,  9.9548e-02, -4.3774e-01, -1.1365e-01,
        -2.7637e-01, -9.4055e-02, -3.6255e-02, -2.1484e-01,  1.4793e-02,
        -5.7861e-01, -3.2861e-01,  3.6

sp_para_output_t:  tensor([[[ 0.0051],
         [-0.0299],
         [ 0.0267],
         [ 0.0029],
         [-0.0100],
         [ 0.0106],
         [ 0.0380],
         [ 0.0071],
         [ 0.0284],
         [ 0.0005]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([24, 2])
sp_sent.view(-1).size():  torch.Size([24])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2969],
        [ 0.0000, -1.2178],
        [ 0.0000, -1.2793],
        [ 0.0000, -1.2764],
        [ 0.0000, -1.3604],
        [ 0.0000, -1.3848],
        [ 0.0000, -1.3457],
        [ 0.0000, -1.3652],
        [ 0.0000, -1.3350],
        [ 0.0000, -1.3125],
        [ 0.0000, -1.2764],
        [ 0.0000, -1.3867],
        [ 0.0000, -1.2852],
        [ 0.0000, -1.3037],
        [ 0.0000, -1.3057],
        [ 0.0000, -1.2607],
        [ 0.0000, -1.3408],
        [ 0.0000, -1.3086],
        [ 0.0000, -1.2910],
        [ 0.0000, -1.2383],
        [ 0.0000, -1.2451],
        [ 0.0

sorted_answers: [{'start': tensor(90, device='cuda:0'), 'end': tensor(101, device='cuda:0'), 'start_logit': tensor(0.7998, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.6265, device='cuda:0', dtype=torch.float16)}, {'start': tensor(90, device='cuda:0'), 'end': tensor(96, device='cuda:0'), 'start_logit': tensor(0.7998, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5991, device='cuda:0', dtype=torch.float16)}, {'start': tensor(90, device='cuda:0'), 'end': tensor(98, device='cuda:0'), 'start_logit': tensor(0.7998, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5967, device='cuda:0', dtype=torch.float16)}, {'start': tensor(90, device='cuda:0'), 'end': tensor(115, device='cuda:0'), 'start_logit': tensor(0.7998, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5942, device='cuda:0', dtype=torch.float16)}, {'start': tensor(115, device='cuda:0'), 'end': tensor(115, device='cuda:0'), 'start_logit': tensor(0.7935, device='cuda:0', dtype=tor

sp_para_output_t:  tensor([[[ 0.0070],
         [ 0.0013],
         [ 0.0386],
         [ 0.1003],
         [ 0.0265],
         [ 0.1079],
         [ 0.0259],
         [ 0.1171],
         [-0.0012],
         [ 0.0015]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2773],
        [ 0.0000, -1.2744],
        [ 0.0000, -1.2832],
        [ 0.0000, -1.3447],
        [ 0.0000, -1.3076],
        [ 0.0000, -1.3740],
        [ 0.0000, -1.3213],
        [ 0.0000, -1.3330],
        [ 0.0000, -1.3389],
        [ 0.0000, -1.3555],
        [ 0.0000, -1.3711],
        [ 0.0000, -1.3232],
        [ 0.0000, -1.4395],
        [ 0.0000, -1.3740],
        [ 0.0000, -1.3027],
        [ 0.0000, -1.3438],
        [ 0.0000, -1.3438],
        [ 0.0000, -1.3770]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

sp_para_output_t:  tensor([[[ 0.1254],
         [-0.0410],
         [ 0.0015],
         [ 0.0570],
         [ 0.0084],
         [ 0.1212],
         [ 0.0782],
         [ 0.0258],
         [ 0.1570],
         [ 0.1115]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([21, 2])
sp_sent.view(-1).size():  torch.Size([21])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.1602],
        [ 0.0000, -1.1621],
        [ 0.0000, -1.1113],
        [ 0.0000, -1.2021],
        [ 0.0000, -1.0703],
        [ 0.0000, -1.1924],
        [ 0.0000, -0.9951],
        [ 0.0000, -0.2700],
        [ 0.0000, -0.2703],
        [ 0.0000, -1.2207],
        [ 0.0000, -1.1719],
        [ 0.0000, -1.1768],
        [ 0.0000, -1.2031],
        [ 0.0000, -1.2080],
        [ 0.0000, -1.2295],
        [ 0.0000, -1.2588],
        [ 0.0000, -1.2422],
        [ 0.0000, -1.2246],
        [ 0.0000, -1.1523],
        [ 0.0000, -1.1348],
        [ 0.0000, -0.2703]], device='cuda

sp_para_output_t:  tensor([[[ 0.1019],
         [-0.0118],
         [ 0.0320],
         [ 0.0769],
         [ 0.0533],
         [ 0.0350],
         [ 0.1267],
         [-0.0481],
         [-0.0014],
         [ 0.0663]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([20, 2])
sp_sent.view(-1).size():  torch.Size([20])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2461],
        [ 0.0000, -1.2529],
        [ 0.0000, -1.2500],
        [ 0.0000, -1.3174],
        [ 0.0000, -1.2842],
        [ 0.0000, -1.2568],
        [ 0.0000, -1.2617],
        [ 0.0000, -1.3066],
        [ 0.0000, -1.3486],
        [ 0.0000, -1.2910],
        [ 0.0000, -1.3193],
        [ 0.0000, -1.3154],
        [ 0.0000, -1.3438],
        [ 0.0000, -1.3252],
        [ 0.0000, -1.3662],
        [ 0.0000, -1.3037],
        [ 0.0000, -1.2637],
        [ 0.0000, -1.2773],
        [ 0.0000, -1.2705],
        [ 0.0000, -1.2920]], device='cuda:0')
sp_sent.view(-1):  tens

sp_para_output_t:  tensor([[[0.0670],
         [0.0795],
         [0.0413],
         [0.0567],
         [0.0617],
         [0.0717],
         [0.1141],
         [0.0519],
         [0.0416],
         [0.0511]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([15, 2])
sp_sent.view(-1).size():  torch.Size([15])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2500],
        [ 0.0000, -1.3311],
        [ 0.0000, -1.2949],
        [ 0.0000, -1.3301],
        [ 0.0000, -1.2939],
        [ 0.0000, -1.3154],
        [ 0.0000, -1.2432],
        [ 0.0000, -1.2666],
        [ 0.0000, -1.2578],
        [ 0.0000, -1.2949],
        [ 0.0000, -1.3584],
        [ 0.0000, -1.3213],
        [ 0.0000, -1.2920],
        [ 0.0000, -1.3486],
        [ 0.0000, -1.2920]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([11], device='cuda:0')
start_logits:  tensor([ 0.3257, 

sorted_answers: [{'start': tensor(95, device='cuda:0'), 'end': tensor(96, device='cuda:0'), 'start_logit': tensor(0.8428, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3687, device='cuda:0', dtype=torch.float16)}, {'start': tensor(95, device='cuda:0'), 'end': tensor(97, device='cuda:0'), 'start_logit': tensor(0.8428, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3560, device='cuda:0', dtype=torch.float16)}, {'start': tensor(161, device='cuda:0'), 'end': tensor(162, device='cuda:0'), 'start_logit': tensor(0.8345, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3098, device='cuda:0', dtype=torch.float16)}, {'start': tensor(161, device='cuda:0'), 'end': tensor(169, device='cuda:0'), 'start_logit': tensor(0.8345, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2656, device='cuda:0', dtype=torch.float16)}, {'start': tensor(27, device='cuda:0'), 'end': tensor(28, device='cuda:0'), 'start_logit': tensor(0.7886, device='cuda:0', dtype=tor

predict_support_sent.view(-1, 2).size():  torch.Size([27, 2])
sp_sent.view(-1).size():  torch.Size([27])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2900],
        [ 0.0000, -1.3066],
        [ 0.0000, -1.2998],
        [ 0.0000, -1.3516],
        [ 0.0000, -1.3096],
        [ 0.0000, -1.3057],
        [ 0.0000, -1.3447],
        [ 0.0000, -1.3330],
        [ 0.0000, -1.3555],
        [ 0.0000, -1.2393],
        [ 0.0000, -1.3730],
        [ 0.0000, -1.3584],
        [ 0.0000, -1.3174],
        [ 0.0000, -1.3701],
        [ 0.0000, -1.3760],
        [ 0.0000, -1.2822],
        [ 0.0000, -1.3145],
        [ 0.0000, -1.2119],
        [ 0.0000, -1.3604],
        [ 0.0000, -1.3457],
        [ 0.0000, -1.2832],
        [ 0.0000, -1.2754],
        [ 0.0000, -1.3398],
        [ 0.0000, -1.3105],
        [ 0.0000, -1.3271],
        [ 0.0000, -1.2812],
        [ 0.0000, -1.3340]], device='cuda:0')
sp_sent.view(-1):  tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

predict_support_sent.view(-1, 2).size():  torch.Size([33, 2])
sp_sent.view(-1).size():  torch.Size([33])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2432],
        [ 0.0000, -1.3594],
        [ 0.0000, -1.3594],
        [ 0.0000, -1.3633],
        [ 0.0000, -1.3613],
        [ 0.0000, -1.2041],
        [ 0.0000, -1.3594],
        [ 0.0000, -1.3721],
        [ 0.0000, -1.2676],
        [ 0.0000, -1.3604],
        [ 0.0000, -1.3994],
        [ 0.0000, -1.3818],
        [ 0.0000, -1.3779],
        [ 0.0000, -1.3799],
        [ 0.0000, -1.3584],
        [ 0.0000, -1.3389],
        [ 0.0000, -1.3594],
        [ 0.0000, -1.3311],
        [ 0.0000, -1.3369],
        [ 0.0000, -1.3379],
        [ 0.0000, -1.2822],
        [ 0.0000, -1.3213],
        [ 0.0000, -1.3389],
        [ 0.0000, -1.3584],
        [ 0.0000, -1.3389],
        [ 0.0000, -1.3281],
        [ 0.0000, -1.3105],
        [ 0.0000, -1.2852],
        [ 0.0000, -1.3672],
        [ 0.0000, -1.3682],
        [ 0.0000, -1

predict_support_sent.view(-1, 2).size():  torch.Size([31, 2])
sp_sent.view(-1).size():  torch.Size([31])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.0889],
        [ 0.0000, -1.3828],
        [ 0.0000, -1.3770],
        [ 0.0000, -1.3877],
        [ 0.0000, -1.3711],
        [ 0.0000, -1.4102],
        [ 0.0000, -0.9341],
        [ 0.0000, -1.3584],
        [ 0.0000, -1.3877],
        [ 0.0000, -1.4072],
        [ 0.0000, -1.3477],
        [ 0.0000, -1.3770],
        [ 0.0000, -1.3945],
        [ 0.0000, -1.3877],
        [ 0.0000, -1.3848],
        [ 0.0000, -1.4307],
        [ 0.0000, -1.3652],
        [ 0.0000, -1.3701],
        [ 0.0000, -1.3242],
        [ 0.0000, -1.3340],
        [ 0.0000, -1.3877],
        [ 0.0000, -1.3613],
        [ 0.0000, -1.3975],
        [ 0.0000, -1.3701],
        [ 0.0000, -1.3721],
        [ 0.0000, -1.3789],
        [ 0.0000, -1.4258],
        [ 0.0000, -1.2900],
        [ 0.0000, -1.3799],
        [ 0.0000, -1.3076],
        [ 0.0000, -1

predict_support_sent.view(-1, 2).size():  torch.Size([19, 2])
sp_sent.view(-1).size():  torch.Size([19])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.1035],
        [ 0.0000, -1.1855],
        [ 0.0000, -1.1484],
        [ 0.0000, -1.1221],
        [ 0.0000, -1.1875],
        [ 0.0000, -0.2554],
        [ 0.0000, -1.1777],
        [ 0.0000, -1.2119],
        [ 0.0000, -1.1367],
        [ 0.0000, -1.1611],
        [ 0.0000, -1.1924],
        [ 0.0000, -1.1777],
        [ 0.0000, -1.1074],
        [ 0.0000, -1.1309],
        [ 0.0000, -1.1592],
        [ 0.0000, -1.1582],
        [ 0.0000, -1.1689],
        [ 0.0000, -1.1514],
        [ 0.0000, -1.1309]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([29], device='cuda:0')
start_logits:  tensor([-2.5586e-01, -8.0664e-01, -3.3179e-01, -5.1709e-01,  1.9080e-01,
        -7.6611e-01,  4.0210e-01, -7.8223e-01, -2.6001e-01, -6

predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2773],
        [ 0.0000, -1.1494],
        [ 0.0000, -1.3271],
        [ 0.0000, -1.2969],
        [ 0.0000, -1.3027],
        [ 0.0000, -1.3604],
        [ 0.0000, -1.3242],
        [ 0.0000, -1.3525],
        [ 0.0000, -1.3809],
        [ 0.0000, -1.3682],
        [ 0.0000, -1.3477],
        [ 0.0000, -1.3955],
        [ 0.0000, -1.3955],
        [ 0.0000, -1.3906],
        [ 0.0000, -1.3594],
        [ 0.0000, -1.3652],
        [ 0.0000, -1.3682],
        [ 0.0000, -1.3105],
        [ 0.0000, -1.3594],
        [ 0.0000, -1.3320],
        [ 0.0000, -1.3203],
        [ 0.0000, -1.3037],
        [ 0.0000, -1.3945]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       device='cuda:0')
question_end_index:  tensor([21], device='cuda:0')
start_logits:  tensor([ 2.

predict_support_sent.view(-1, 2).size():  torch.Size([13, 2])
sp_sent.view(-1).size():  torch.Size([13])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.3799],
        [ 0.0000, -1.3906],
        [ 0.0000, -1.4150],
        [ 0.0000, -1.3730],
        [ 0.0000, -1.3271],
        [ 0.0000, -1.3662],
        [ 0.0000, -1.3350],
        [ 0.0000, -1.3369],
        [ 0.0000, -1.3496],
        [ 0.0000, -1.3730],
        [ 0.0000, -1.3574],
        [ 0.0000, -1.3486],
        [ 0.0000, -1.4258]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([16], device='cuda:0')
start_logits:  tensor([ 2.8027e-01,  6.8588e-03,  1.6891e-02,  2.2888e-03,  1.3782e-01,
         3.3966e-02, -2.5586e-01,  5.9180e-01,  2.8955e-01,  3.1372e-01,
         1.4563e-01, -2.4124e-02,  4.0674e-01,  4.7339e-01,  2.3218e-01,
         5.2051e-01, -2.9297e-01,  3.7061e-01, -3.4399e-01,  2.8870e-02,
        -4.4507e-01,  3.2300e-01, -1

predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2305],
        [ 0.0000, -1.2422],
        [ 0.0000, -1.1387],
        [ 0.0000, -1.2129],
        [ 0.0000, -1.1543],
        [ 0.0000, -1.1582],
        [ 0.0000, -1.2383],
        [ 0.0000, -1.1309],
        [ 0.0000, -1.2051],
        [ 0.0000, -1.1885],
        [ 0.0000, -1.2129],
        [ 0.0000, -1.1836],
        [ 0.0000, -1.2178],
        [ 0.0000, -1.2676],
        [ 0.0000, -1.2363],
        [ 0.0000, -1.2227],
        [ 0.0000, -1.1865],
        [ 0.0000, -1.1914],
        [ 0.0000, -1.1650],
        [ 0.0000, -1.1670],
        [ 0.0000, -1.1992],
        [ 0.0000, -1.1963],
        [ 0.0000, -1.0869]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([15], device='cuda:0')
start_logits:  tensor([-1.


Epoch 00004: avg_val_f1 reached 0.00287 (best 0.06897), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_4.ckpt as top 5


validation_epoch_end
before sync --> sizes:  29, 29, 29
after sync --> sizes: 29, 29, 29
avg_loss:  tensor(13.4025, device='cuda:0')	avg_answer_loss:  tensor(3.4093, device='cuda:0')	avg_type_loss:  tensor(0.3800, device='cuda:0')	avg_sp_para_loss:  tensor(0.4756, device='cuda:0')	avg_sp_sent_loss:  tensor(0.3337, device='cuda:0')
avg_val_f1:  0.002873563304029662	avg_val_em:  0.0	avg_val_prec:  0.002873563304029662	avg_val_recall:  0.002873563304029662
avg_val_sp_sent_f1:  0.0	avg_val_sp_sent_em:  0.0	avg_val_sp_sent_prec:  0.0	avg_val_sp_sent_recall:  0.0
avg_val_sp_para_f1:  0.1896551724137931	avg_val_sp_para_em:  0.034482758620689655	avg_val_sp_para_prec:  0.1896551724137931	avg_val_sp_para_recall:  0.1896551724137931
avg_val_joint_f1:  0.0	avg_val_joint_em:  0.0	avg_val_joint_prec:  0.0	avg_val_joint_recall:  0.0
sp_para_output_t:  tensor([[[ 0.1004],
         [ 0.0486],
         [ 0.1609],
         [-0.0250],
         [-0.0430],
         [-0.0130],
         [-0.0659],
         [ 

sp_para_output_t:  tensor([[[ 0.0165],
         [-0.0964],
         [-0.0284],
         [ 0.0037],
         [ 0.1122],
         [-0.0352],
         [ 0.0569],
         [ 0.0079],
         [-0.0625],
         [-0.0296]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([25, 2])
sp_sent.view(-1).size():  torch.Size([25])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.2295],
        [ 0.0000, -1.4014],
        [ 0.0000, -1.3213],
        [ 0.0000, -1.3447],
        [ 0.0000, -1.2510],
        [ 0.0000, -1.4463],
        [ 0.0000, -1.2939],
        [ 0.0000, -1.3105],
        [ 0.0000, -1.4629],
        [ 0.0000, -1.3828],
        [ 0.0000, -1.3252],
        [ 0.0000, -1.3428],
        [ 0.0000, -1.4248],
        [ 0.0000, -1.4082],
        [ 0.0000, -1.3340],
        [ 0.0000, -1.3945],
        [ 0.0000, -1.3799],
        [ 0.0000, -1.4268],
        [ 0.0000, -1.3809],
        [ 0.0000, -1.4570],
        [ 

sp_para_output_t:  tensor([[[ 0.0344],
         [ 0.0089],
         [-0.0667],
         [ 0.0234],
         [ 0.0041],
         [ 0.0380],
         [ 0.0354],
         [ 0.0096],
         [-0.0129],
         [-0.0707]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([19, 2])
sp_sent.view(-1).size():  torch.Size([19])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.4775],
        [ 0.0000, -1.5605],
        [ 0.0000, -1.3984],
        [ 0.0000, -1.3008],
        [ 0.0000, -1.4102],
        [ 0.0000, -1.3291],
        [ 0.0000, -1.3330],
        [ 0.0000, -1.3740],
        [ 0.0000, -1.3418],
        [ 0.0000, -1.3838],
        [ 0.0000, -1.3867],
        [ 0.0000, -1.3740],
        [ 0.0000, -1.2979],
        [ 0.0000, -1.2920],
        [ 0.0000, -1.5859],
        [ 0.0000, -1.4033],
        [ 0.0000, -1.4160],
        [ 0.0000, -1.4375],
        [ 0.0000, -1.4287]], device='cuda:0', grad_fn=<ViewBackwar

sp_para_output_t:  tensor([[[ 0.1055],
         [-0.0745],
         [ 0.0563],
         [-0.0114],
         [ 0.0594],
         [-0.0293],
         [ 0.0183],
         [-0.0314],
         [-0.0731],
         [ 0.0047]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddBackward0>)
predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.4541],
        [ 0.0000, -1.4219],
        [ 0.0000, -1.4355],
        [ 0.0000, -1.4727],
        [ 0.0000, -1.5010],
        [ 0.0000, -1.3701],
        [ 0.0000, -1.3789],
        [ 0.0000, -1.5205],
        [ 0.0000, -1.5215],
        [ 0.0000, -1.4150],
        [ 0.0000, -1.4111],
        [ 0.0000, -1.3906],
        [ 0.0000, -1.4844],
        [ 0.0000, -1.3398],
        [ 0.0000, -1.3096],
        [ 0.0000, -1.3535],
        [ 0.0000, -1.2803],
        [ 0.0000, -1.5195]], device='cuda:0', grad_fn=<ViewBackward>)
sp_sent.view(-1):  tenso

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

sp_para_output_t:  tensor([[[ 0.0901],
         [-0.0195],
         [-0.0067],
         [ 0.0177],
         [ 0.0817],
         [ 0.0364],
         [ 0.0061],
         [ 0.0639],
         [-0.0488],
         [ 0.0101]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.6924],
        [ 0.0000, -1.6553],
        [ 0.0000, -1.6885],
        [ 0.0000, -1.6650],
        [ 0.0000, -1.7168],
        [ 0.0000, -1.6943],
        [ 0.0000, -1.7139],
        [ 0.0000, -1.6689],
        [ 0.0000, -1.6426],
        [ 0.0000, -1.6904],
        [ 0.0000, -1.6846],
        [ 0.0000, -1.7402],
        [ 0.0000, -1.7441],
        [ 0.0000, -1.6865],
        [ 0.0000, -1.6865],
        [ 0.0000, -1.7236],
        [ 0.0000, -1.6846],
        [ 0.0000, -1.6865],
        [ 0.0000, -1.6914],
        [ 0.0000, -1.7119],
        [ 0.0000, -1.6885],
        [ 0.0

sp_para_output_t:  tensor([[[ 0.0432],
         [-0.0465],
         [ 0.0324],
         [ 0.0601],
         [ 0.0131],
         [-0.0060],
         [ 0.0294],
         [ 0.0386],
         [-0.0420],
         [-0.0339]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([36, 2])
sp_sent.view(-1).size():  torch.Size([36])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.7559],
        [ 0.0000, -1.7061],
        [ 0.0000, -1.7529],
        [ 0.0000, -1.7520],
        [ 0.0000, -1.7246],
        [ 0.0000, -1.7246],
        [ 0.0000, -1.7432],
        [ 0.0000, -1.7715],
        [ 0.0000, -1.7168],
        [ 0.0000, -1.7500],
        [ 0.0000, -1.7510],
        [ 0.0000, -1.7656],
        [ 0.0000, -1.7344],
        [ 0.0000, -1.7412],
        [ 0.0000, -1.7695],
        [ 0.0000, -1.7158],
        [ 0.0000, -1.7656],
        [ 0.0000, -1.7881],
        [ 0.0000, -1.6943],
        [ 0.0000, -1.7373],
        [ 0.0000, -1.7471],
        [ 0.0

sorted_answers: [{'start': tensor(271, device='cuda:0'), 'end': tensor(291, device='cuda:0'), 'start_logit': tensor(0.8052, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5005, device='cuda:0', dtype=torch.float16)}]
s_to_p_map:  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4]
evidence_candidates:  {0: tensor([], device='cuda:0', dtype=torch.int64), 1: tensor([], device='cuda:0', dtype=torch.int64), 2: tensor([], device='cuda:0', dtype=torch.int64), 3: tensor([], device='cuda:0', dtype=torch.int64), 4: tensor([], device='cuda:0', dtype=torch.int64)}
para_sents_offset:  [0, 7, 13, 21, 27, 36]
sp_para_pred:  tensor([3, 1], device='cuda:0')
sp_sent_pred:  []
sp_sent_map:  {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 13, 8: 14, 9: 15, 10: 16, 11: 17, 12: 18, 13: 19, 14: 20, 15: 21, 16: 22, 17: 23, 18: 24, 19: 25, 20: 26, 21: 41, 22: 42, 23: 43, 24: 44, 25: 45, 26: 46, 27: 47, 28: 48, 29: 49, 30: 50, 31: 51, 32: 52,

sorted_answers: [{'start': tensor(151, device='cuda:0'), 'end': tensor(153, device='cuda:0'), 'start_logit': tensor(0.7222, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3757, device='cuda:0', dtype=torch.float16)}, {'start': tensor(83, device='cuda:0'), 'end': tensor(83, device='cuda:0'), 'start_logit': tensor(0.6055, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4395, device='cuda:0', dtype=torch.float16)}, {'start': tensor(151, device='cuda:0'), 'end': tensor(155, device='cuda:0'), 'start_logit': tensor(0.7222, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3098, device='cuda:0', dtype=torch.float16)}, {'start': tensor(135, device='cuda:0'), 'end': tensor(136, device='cuda:0'), 'start_logit': tensor(0.5029, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5137, device='cuda:0', dtype=torch.float16)}, {'start': tensor(151, device='cuda:0'), 'end': tensor(152, device='cuda:0'), 'start_logit': tensor(0.7222, device='cuda:0', dtype

predict_support_sent.view(-1, 2).size():  torch.Size([28, 2])
sp_sent.view(-1).size():  torch.Size([28])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.6904],
        [ 0.0000, -1.7227],
        [ 0.0000, -1.6260],
        [ 0.0000, -1.6650],
        [ 0.0000, -1.6963],
        [ 0.0000, -1.6973],
        [ 0.0000, -1.6709],
        [ 0.0000, -1.7207],
        [ 0.0000, -1.7471],
        [ 0.0000, -1.6982],
        [ 0.0000, -1.7139],
        [ 0.0000, -1.7217],
        [ 0.0000, -1.7119],
        [ 0.0000, -1.6963],
        [ 0.0000, -1.7402],
        [ 0.0000, -1.7012],
        [ 0.0000, -1.7129],
        [ 0.0000, -1.7578],
        [ 0.0000, -1.7197],
        [ 0.0000, -1.6875],
        [ 0.0000, -1.7666],
        [ 0.0000, -1.7090],
        [ 0.0000, -1.6758],
        [ 0.0000, -1.6934],
        [ 0.0000, -1.6709],
        [ 0.0000, -1.6924],
        [ 0.0000, -1.7129],
        [ 0.0000, -1.6221]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0

sp_para_output_t:  tensor([[[ 0.0556],
         [ 0.0083],
         [ 0.0294],
         [-0.0126],
         [ 0.0560],
         [ 0.0119],
         [ 0.0243],
         [ 0.0448],
         [ 0.0230],
         [ 0.0520]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([28, 2])
sp_sent.view(-1).size():  torch.Size([28])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.7080],
        [ 0.0000, -1.7227],
        [ 0.0000, -1.7148],
        [ 0.0000, -1.7148],
        [ 0.0000, -1.7344],
        [ 0.0000, -1.7461],
        [ 0.0000, -1.7646],
        [ 0.0000, -1.6924],
        [ 0.0000, -1.7139],
        [ 0.0000, -1.7520],
        [ 0.0000, -1.6953],
        [ 0.0000, -1.7041],
        [ 0.0000, -1.7168],
        [ 0.0000, -1.7383],
        [ 0.0000, -1.7500],
        [ 0.0000, -1.6934],
        [ 0.0000, -1.7168],
        [ 0.0000, -1.7529],
        [ 0.0000, -1.7744],
        [ 0.0000, -1.7246],
        [ 0.0000, -1.7275],
        [ 0.0

sp_para_output_t:  tensor([[[ 0.1250],
         [-0.0010],
         [ 0.0551],
         [-0.0382],
         [ 0.1122],
         [ 0.1360],
         [ 0.0975],
         [ 0.1000],
         [ 0.0876],
         [ 0.1666]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([16, 2])
sp_sent.view(-1).size():  torch.Size([16])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.6572],
        [ 0.0000, -1.6514],
        [ 0.0000, -1.6943],
        [ 0.0000, -1.6816],
        [ 0.0000, -1.6572],
        [ 0.0000, -1.6650],
        [ 0.0000, -1.6523],
        [ 0.0000, -1.6484],
        [ 0.0000, -1.6416],
        [ 0.0000, -1.6436],
        [ 0.0000, -1.6387],
        [ 0.0000, -1.6826],
        [ 0.0000, -1.6562],
        [ 0.0000, -1.6475],
        [ 0.0000, -1.6455],
        [ 0.0000, -1.7148]], device='cuda:0')
sp_sent.view(-1):  tensor([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], device='cuda:0')
question_end_index:  tensor([15], device='

sorted_answers: [{'start': tensor(95, device='cuda:0'), 'end': tensor(113, device='cuda:0'), 'start_logit': tensor(0.9956, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2435, device='cuda:0', dtype=torch.float16)}, {'start': tensor(96, device='cuda:0'), 'end': tensor(113, device='cuda:0'), 'start_logit': tensor(0.9321, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2435, device='cuda:0', dtype=torch.float16)}, {'start': tensor(95, device='cuda:0'), 'end': tensor(123, device='cuda:0'), 'start_logit': tensor(0.9956, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.1115, device='cuda:0', dtype=torch.float16)}, {'start': tensor(96, device='cuda:0'), 'end': tensor(123, device='cuda:0'), 'start_logit': tensor(0.9321, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.1115, device='cuda:0', dtype=torch.float16)}, {'start': tensor(239, device='cuda:0'), 'end': tensor(246, device='cuda:0'), 'start_logit': tensor(0.9409, device='cuda:0', dtype=t

sp_para_output_t:  tensor([[[ 0.0761],
         [ 0.0216],
         [ 0.0409],
         [ 0.0370],
         [ 0.0201],
         [ 0.0386],
         [ 0.0670],
         [ 0.1090],
         [ 0.0017],
         [-0.1193]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([29, 2])
sp_sent.view(-1).size():  torch.Size([29])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.7080],
        [ 0.0000, -1.7314],
        [ 0.0000, -1.7480],
        [ 0.0000, -1.7402],
        [ 0.0000, -1.7598],
        [ 0.0000, -1.6777],
        [ 0.0000, -1.7529],
        [ 0.0000, -1.8105],
        [ 0.0000, -1.7559],
        [ 0.0000, -1.7822],
        [ 0.0000, -1.7480],
        [ 0.0000, -1.7119],
        [ 0.0000, -1.7236],
        [ 0.0000, -1.7236],
        [ 0.0000, -1.7480],
        [ 0.0000, -1.7510],
        [ 0.0000, -1.7363],
        [ 0.0000, -1.7607],
        [ 0.0000, -1.7178],
        [ 0.0000, -1.7158],
        [ 0.0000, -1.7920],
        [ 0.0

sorted_answers: [{'start': tensor(482, device='cuda:0'), 'end': tensor(484, device='cuda:0'), 'start_logit': tensor(0.7651, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5278, device='cuda:0', dtype=torch.float16)}, {'start': tensor(483, device='cuda:0'), 'end': tensor(484, device='cuda:0'), 'start_logit': tensor(0.6968, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5278, device='cuda:0', dtype=torch.float16)}, {'start': tensor(238, device='cuda:0'), 'end': tensor(255, device='cuda:0'), 'start_logit': tensor(0.7959, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4060, device='cuda:0', dtype=torch.float16)}, {'start': tensor(593, device='cuda:0'), 'end': tensor(607, device='cuda:0'), 'start_logit': tensor(0.8003, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3979, device='cuda:0', dtype=torch.float16)}, {'start': tensor(332, device='cuda:0'), 'end': tensor(340, device='cuda:0'), 'start_logit': tensor(0.7373, device='cuda:0', dty

sp_para_output_t:  tensor([[[ 0.0764],
         [ 0.0457],
         [-0.0152],
         [ 0.0510],
         [ 0.0996],
         [-0.0208],
         [ 0.0554],
         [ 0.0989],
         [ 0.0224],
         [ 0.0266]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.7324],
        [ 0.0000, -1.6836],
        [ 0.0000, -1.7168],
        [ 0.0000, -1.7129],
        [ 0.0000, -1.7070],
        [ 0.0000, -1.7568],
        [ 0.0000, -1.7266],
        [ 0.0000, -1.7158],
        [ 0.0000, -1.7627],
        [ 0.0000, -1.7188],
        [ 0.0000, -1.7207],
        [ 0.0000, -1.6973],
        [ 0.0000, -1.7744],
        [ 0.0000, -1.7490],
        [ 0.0000, -1.7168],
        [ 0.0000, -1.6338],
        [ 0.0000, -1.7246],
        [ 0.0000, -1.7139]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0

sp_para_output_t:  tensor([[[-0.0049],
         [-0.0566],
         [ 0.0391],
         [-0.0248],
         [-0.0542],
         [-0.0328],
         [-0.0478],
         [-0.0393],
         [ 0.0304],
         [ 0.0587]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([24, 2])
sp_sent.view(-1).size():  torch.Size([24])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.7051],
        [ 0.0000, -1.7061],
        [ 0.0000, -1.7256],
        [ 0.0000, -1.7070],
        [ 0.0000, -1.6230],
        [ 0.0000, -1.7236],
        [ 0.0000, -1.6904],
        [ 0.0000, -1.6924],
        [ 0.0000, -1.7100],
        [ 0.0000, -1.7227],
        [ 0.0000, -1.6631],
        [ 0.0000, -1.6904],
        [ 0.0000, -1.7949],
        [ 0.0000, -1.7236],
        [ 0.0000, -1.1963],
        [ 0.0000, -1.7295],
        [ 0.0000, -1.7158],
        [ 0.0000, -1.7285],
        [ 0.0000, -1.7490],
        [ 0.0000, -1.6709],
        [ 0.0000, -1.7207],
        [ 0.0

sorted_answers: [{'start': tensor(306, device='cuda:0'), 'end': tensor(315, device='cuda:0'), 'start_logit': tensor(0.9277, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2302, device='cuda:0', dtype=torch.float16)}, {'start': tensor(226, device='cuda:0'), 'end': tensor(241, device='cuda:0'), 'start_logit': tensor(0.7002, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3899, device='cuda:0', dtype=torch.float16)}, {'start': tensor(407, device='cuda:0'), 'end': tensor(431, device='cuda:0'), 'start_logit': tensor(0.7466, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3196, device='cuda:0', dtype=torch.float16)}, {'start': tensor(491, device='cuda:0'), 'end': tensor(499, device='cuda:0'), 'start_logit': tensor(0.6147, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3677, device='cuda:0', dtype=torch.float16)}, {'start': tensor(403, device='cuda:0'), 'end': tensor(431, device='cuda:0'), 'start_logit': tensor(0.6353, device='cuda:0', dty

sp_para_output_t:  tensor([[[ 0.0594],
         [-0.0594],
         [ 0.0709],
         [-0.0013],
         [-0.0516],
         [ 0.0499],
         [-0.0126],
         [-0.0082],
         [ 0.0620],
         [ 0.0451]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([21, 2])
sp_sent.view(-1).size():  torch.Size([21])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.6748],
        [ 0.0000, -1.6875],
        [ 0.0000, -1.7031],
        [ 0.0000, -1.7090],
        [ 0.0000, -1.6836],
        [ 0.0000, -1.7246],
        [ 0.0000, -1.6914],
        [ 0.0000, -1.7324],
        [ 0.0000, -1.7588],
        [ 0.0000, -1.6543],
        [ 0.0000, -1.7002],
        [ 0.0000, -1.6904],
        [ 0.0000, -1.7100],
        [ 0.0000, -1.7344],
        [ 0.0000, -1.6611],
        [ 0.0000, -1.6924],
        [ 0.0000, -1.7041],
        [ 0.0000, -1.4863],
        [ 0.0000, -1.6982],
        [ 0.0000, -1.6816],
        [ 0.0000, -1.7021]], device='cuda

sorted_answers: [{'start': tensor(607, device='cuda:0'), 'end': tensor(623, device='cuda:0'), 'start_logit': tensor(0.7969, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5806, device='cuda:0', dtype=torch.float16)}, {'start': tensor(439, device='cuda:0'), 'end': tensor(443, device='cuda:0'), 'start_logit': tensor(0.8560, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5142, device='cuda:0', dtype=torch.float16)}, {'start': tensor(122, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': tensor(0.7803, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5845, device='cuda:0', dtype=torch.float16)}, {'start': tensor(117, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': tensor(0.7700, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.5845, device='cuda:0', dtype=torch.float16)}, {'start': tensor(588, device='cuda:0'), 'end': tensor(614, device='cuda:0'), 'start_logit': tensor(0.8770, device='cuda:0', dty

sp_para_output_t:  tensor([[[ 0.0203],
         [-0.0322],
         [ 0.0471],
         [ 0.0707],
         [-0.0113],
         [-0.0396],
         [ 0.0366],
         [ 0.0046],
         [ 0.0272],
         [ 0.0562]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([22, 2])
sp_sent.view(-1).size():  torch.Size([22])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.6826],
        [ 0.0000, -1.6904],
        [ 0.0000, -1.6768],
        [ 0.0000, -1.6904],
        [ 0.0000, -1.6475],
        [ 0.0000, -1.6748],
        [ 0.0000, -1.6592],
        [ 0.0000, -1.7051],
        [ 0.0000, -1.5889],
        [ 0.0000, -0.4568],
        [ 0.0000, -1.7227],
        [ 0.0000, -1.6973],
        [ 0.0000, -1.6973],
        [ 0.0000, -1.6895],
        [ 0.0000, -1.6855],
        [ 0.0000, -1.6885],
        [ 0.0000, -1.7324],
        [ 0.0000, -1.6846],
        [ 0.0000, -1.7109],
        [ 0.0000, -1.6963],
        [ 0.0000, -1.6699],
        [ 0.0

end_logits:  tensor([-0.5889, -0.5845, -0.4141,  0.0633, -0.6187, -0.5186, -0.6060, -0.4280,
        -0.2303, -0.6450, -0.2140,  0.3501, -0.6489,  0.1814, -0.5200, -0.1394,
        -0.5688, -0.1061, -0.7026, -0.1238, -0.5767, -0.0372, -0.0689, -0.2524,
        -0.3599, -0.4695, -0.2856, -0.5981, -0.0880, -0.6284,  0.2047, -0.0399,
        -0.2917, -0.3662,  0.0331, -0.0040, -0.0882, -0.4963, -0.1913,  0.1375,
        -0.0627, -0.2893, -0.0392, -0.4358, -0.3528, -0.5381, -0.5552, -0.0527,
         0.2788,  0.2542, -0.5103,  0.4192,  0.3079,  0.1198, -0.6821,  0.0825,
        -0.0989,  0.0428, -0.5630, -0.3152, -0.2976, -0.1554, -0.1219, -0.2422,
        -0.1550, -0.5601, -0.3164,  0.2340, -0.2502, -0.0254,  0.0875, -0.0452,
         0.4558, -0.2761, -0.6577,  0.1091, -0.0504,  0.6685, -0.0089, -0.2489,
        -0.1140,  0.1127,  0.0338, -0.1680,  0.3738, -0.3198, -0.6309, -0.2327,
         0.3147,  0.4377,  0.2888, -0.4219,  0.1218, -0.4707,  0.3557,  0.0402,
         0.3120, -0.5962, -

sp_para_output_t:  tensor([[[ 0.0051],
         [-0.0299],
         [ 0.0267],
         [ 0.0029],
         [-0.0100],
         [ 0.0106],
         [ 0.0380],
         [ 0.0071],
         [ 0.0284],
         [ 0.0005]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([24, 2])
sp_sent.view(-1).size():  torch.Size([24])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.6992],
        [ 0.0000, -1.6641],
        [ 0.0000, -1.7227],
        [ 0.0000, -1.7178],
        [ 0.0000, -1.7539],
        [ 0.0000, -1.7637],
        [ 0.0000, -1.7295],
        [ 0.0000, -1.7559],
        [ 0.0000, -1.7188],
        [ 0.0000, -1.7344],
        [ 0.0000, -1.7236],
        [ 0.0000, -1.7402],
        [ 0.0000, -1.7168],
        [ 0.0000, -1.7236],
        [ 0.0000, -1.7305],
        [ 0.0000, -1.6982],
        [ 0.0000, -1.7559],
        [ 0.0000, -1.7373],
        [ 0.0000, -1.6973],
        [ 0.0000, -1.6777],
        [ 0.0000, -1.6895],
        [ 0.0

sorted_answers: [{'start': tensor(90, device='cuda:0'), 'end': tensor(101, device='cuda:0'), 'start_logit': tensor(0.6875, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.6772, device='cuda:0', dtype=torch.float16)}, {'start': tensor(90, device='cuda:0'), 'end': tensor(98, device='cuda:0'), 'start_logit': tensor(0.6875, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.6641, device='cuda:0', dtype=torch.float16)}, {'start': tensor(90, device='cuda:0'), 'end': tensor(115, device='cuda:0'), 'start_logit': tensor(0.6875, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.6274, device='cuda:0', dtype=torch.float16)}, {'start': tensor(115, device='cuda:0'), 'end': tensor(115, device='cuda:0'), 'start_logit': tensor(0.6562, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.6274, device='cuda:0', dtype=torch.float16)}, {'start': tensor(90, device='cuda:0'), 'end': tensor(111, device='cuda:0'), 'start_logit': tensor(0.6875, device='cuda:0', dtype=to

sp_para_output_t:  tensor([[[ 0.0070],
         [ 0.0013],
         [ 0.0386],
         [ 0.1003],
         [ 0.0265],
         [ 0.1079],
         [ 0.0259],
         [ 0.1171],
         [-0.0012],
         [ 0.0015]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([18, 2])
sp_sent.view(-1).size():  torch.Size([18])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.6406],
        [ 0.0000, -1.6338],
        [ 0.0000, -1.6611],
        [ 0.0000, -1.7002],
        [ 0.0000, -1.6201],
        [ 0.0000, -1.7197],
        [ 0.0000, -1.6699],
        [ 0.0000, -1.6328],
        [ 0.0000, -1.6758],
        [ 0.0000, -1.6738],
        [ 0.0000, -1.6865],
        [ 0.0000, -1.6641],
        [ 0.0000, -1.7637],
        [ 0.0000, -1.6777],
        [ 0.0000, -1.6377],
        [ 0.0000, -1.6865],
        [ 0.0000, -1.6738],
        [ 0.0000, -1.7236]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

sp_para_output_t:  tensor([[[ 0.1254],
         [-0.0410],
         [ 0.0015],
         [ 0.0570],
         [ 0.0084],
         [ 0.1212],
         [ 0.0782],
         [ 0.0258],
         [ 0.1570],
         [ 0.1115]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([21, 2])
sp_sent.view(-1).size():  torch.Size([21])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.6289],
        [ 0.0000, -1.6406],
        [ 0.0000, -1.5781],
        [ 0.0000, -1.6895],
        [ 0.0000, -1.5518],
        [ 0.0000, -1.6836],
        [ 0.0000, -1.5195],
        [ 0.0000, -0.4688],
        [ 0.0000, -1.3906],
        [ 0.0000, -1.6914],
        [ 0.0000, -1.6533],
        [ 0.0000, -1.6367],
        [ 0.0000, -1.6787],
        [ 0.0000, -1.6650],
        [ 0.0000, -1.6680],
        [ 0.0000, -1.7119],
        [ 0.0000, -1.6699],
        [ 0.0000, -1.6475],
        [ 0.0000, -1.6367],
        [ 0.0000, -1.6250],
        [ 0.0000, -1.6289]], device='cuda

sp_para_output_t:  tensor([[[ 0.1019],
         [-0.0118],
         [ 0.0320],
         [ 0.0769],
         [ 0.0533],
         [ 0.0350],
         [ 0.1267],
         [-0.0481],
         [-0.0014],
         [ 0.0663]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([20, 2])
sp_sent.view(-1).size():  torch.Size([20])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.6650],
        [ 0.0000, -1.6621],
        [ 0.0000, -1.6533],
        [ 0.0000, -1.6953],
        [ 0.0000, -1.6465],
        [ 0.0000, -1.6445],
        [ 0.0000, -1.6553],
        [ 0.0000, -1.6768],
        [ 0.0000, -1.7031],
        [ 0.0000, -1.6660],
        [ 0.0000, -1.6592],
        [ 0.0000, -1.6934],
        [ 0.0000, -1.7012],
        [ 0.0000, -1.6816],
        [ 0.0000, -1.7148],
        [ 0.0000, -1.6533],
        [ 0.0000, -1.6309],
        [ 0.0000, -1.6758],
        [ 0.0000, -1.6465],
        [ 0.0000, -1.6758]], device='cuda:0')
sp_sent.view(-1):  tens

sp_para_output_t:  tensor([[[0.0670],
         [0.0795],
         [0.0413],
         [0.0567],
         [0.0617],
         [0.0717],
         [0.1141],
         [0.0519],
         [0.0416],
         [0.0511]]], device='cuda:0', dtype=torch.float16)
predict_support_sent.view(-1, 2).size():  torch.Size([15, 2])
sp_sent.view(-1).size():  torch.Size([15])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.6602],
        [ 0.0000, -1.7012],
        [ 0.0000, -1.6768],
        [ 0.0000, -1.7188],
        [ 0.0000, -1.6777],
        [ 0.0000, -1.6973],
        [ 0.0000, -1.6143],
        [ 0.0000, -1.6523],
        [ 0.0000, -1.6377],
        [ 0.0000, -1.6748],
        [ 0.0000, -1.7129],
        [ 0.0000, -1.6768],
        [ 0.0000, -1.6465],
        [ 0.0000, -1.7041],
        [ 0.0000, -1.6494]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([11], device='cuda:0')
start_logits:  tensor([ 0.2499, 

sorted_answers: [{'start': tensor(220, device='cuda:0'), 'end': tensor(247, device='cuda:0'), 'start_logit': tensor(1.0225, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.2145, device='cuda:0', dtype=torch.float16)}, {'start': tensor(95, device='cuda:0'), 'end': tensor(97, device='cuda:0'), 'start_logit': tensor(0.7300, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3979, device='cuda:0', dtype=torch.float16)}, {'start': tensor(72, device='cuda:0'), 'end': tensor(90, device='cuda:0'), 'start_logit': tensor(0.6675, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.4590, device='cuda:0', dtype=torch.float16)}, {'start': tensor(95, device='cuda:0'), 'end': tensor(96, device='cuda:0'), 'start_logit': tensor(0.7300, device='cuda:0', dtype=torch.float16), 'end_logit': tensor(0.3835, device='cuda:0', dtype=torch.float16)}, {'start': tensor(72, device='cuda:0'), 'end': tensor(97, device='cuda:0'), 'start_logit': tensor(0.6675, device='cuda:0', dtype=torch

predict_support_sent.view(-1, 2).size():  torch.Size([27, 2])
sp_sent.view(-1).size():  torch.Size([27])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.7236],
        [ 0.0000, -1.7314],
        [ 0.0000, -1.6924],
        [ 0.0000, -1.7227],
        [ 0.0000, -1.7480],
        [ 0.0000, -1.7217],
        [ 0.0000, -1.7393],
        [ 0.0000, -1.7334],
        [ 0.0000, -1.7178],
        [ 0.0000, -1.6689],
        [ 0.0000, -1.7627],
        [ 0.0000, -1.7412],
        [ 0.0000, -1.6895],
        [ 0.0000, -1.7695],
        [ 0.0000, -1.7812],
        [ 0.0000, -1.7197],
        [ 0.0000, -1.7197],
        [ 0.0000, -1.6436],
        [ 0.0000, -1.7598],
        [ 0.0000, -1.7383],
        [ 0.0000, -1.6855],
        [ 0.0000, -1.6904],
        [ 0.0000, -1.7266],
        [ 0.0000, -1.7061],
        [ 0.0000, -1.7295],
        [ 0.0000, -1.7236],
        [ 0.0000, -1.7109]], device='cuda:0')
sp_sent.view(-1):  tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

predict_support_sent.view(-1, 2).size():  torch.Size([33, 2])
sp_sent.view(-1).size():  torch.Size([33])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.6680],
        [ 0.0000, -1.7480],
        [ 0.0000, -1.7354],
        [ 0.0000, -1.7686],
        [ 0.0000, -1.7334],
        [ 0.0000, -1.6748],
        [ 0.0000, -1.7510],
        [ 0.0000, -1.7529],
        [ 0.0000, -1.6855],
        [ 0.0000, -1.7373],
        [ 0.0000, -1.7783],
        [ 0.0000, -1.7559],
        [ 0.0000, -1.7520],
        [ 0.0000, -1.7822],
        [ 0.0000, -1.7520],
        [ 0.0000, -1.7344],
        [ 0.0000, -1.7598],
        [ 0.0000, -1.7363],
        [ 0.0000, -1.7559],
        [ 0.0000, -1.7148],
        [ 0.0000, -1.6816],
        [ 0.0000, -1.7256],
        [ 0.0000, -1.7246],
        [ 0.0000, -1.7559],
        [ 0.0000, -1.7119],
        [ 0.0000, -1.7217],
        [ 0.0000, -1.6797],
        [ 0.0000, -1.6797],
        [ 0.0000, -1.7373],
        [ 0.0000, -1.7461],
        [ 0.0000, -1

predict_support_sent.view(-1, 2).size():  torch.Size([31, 2])
sp_sent.view(-1).size():  torch.Size([31])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.5967],
        [ 0.0000, -1.7568],
        [ 0.0000, -1.7637],
        [ 0.0000, -1.7764],
        [ 0.0000, -1.7646],
        [ 0.0000, -1.7686],
        [ 0.0000, -1.4277],
        [ 0.0000, -1.7236],
        [ 0.0000, -1.7568],
        [ 0.0000, -1.7676],
        [ 0.0000, -1.7314],
        [ 0.0000, -1.7422],
        [ 0.0000, -1.7832],
        [ 0.0000, -1.7471],
        [ 0.0000, -1.7520],
        [ 0.0000, -1.7979],
        [ 0.0000, -1.7490],
        [ 0.0000, -1.7480],
        [ 0.0000, -1.7051],
        [ 0.0000, -1.7490],
        [ 0.0000, -1.7627],
        [ 0.0000, -1.7559],
        [ 0.0000, -1.7617],
        [ 0.0000, -1.7363],
        [ 0.0000, -1.7461],
        [ 0.0000, -1.7568],
        [ 0.0000, -1.7705],
        [ 0.0000, -1.7012],
        [ 0.0000, -1.7441],
        [ 0.0000, -1.7207],
        [ 0.0000, -1

predict_support_sent.view(-1, 2).size():  torch.Size([19, 2])
sp_sent.view(-1).size():  torch.Size([19])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.6406],
        [ 0.0000, -1.7051],
        [ 0.0000, -1.6621],
        [ 0.0000, -1.6436],
        [ 0.0000, -1.6953],
        [ 0.0000, -0.4255],
        [ 0.0000, -1.6709],
        [ 0.0000, -1.6885],
        [ 0.0000, -1.6621],
        [ 0.0000, -1.6816],
        [ 0.0000, -1.6953],
        [ 0.0000, -1.6729],
        [ 0.0000, -1.6191],
        [ 0.0000, -1.6357],
        [ 0.0000, -1.6582],
        [ 0.0000, -1.6738],
        [ 0.0000, -1.6865],
        [ 0.0000, -1.6562],
        [ 0.0000, -1.6289]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([29], device='cuda:0')
start_logits:  tensor([-2.9639e-01, -8.3008e-01, -6.0107e-01, -5.0732e-01, -7.5989e-02,
        -8.5938e-01,  6.1432e-02, -7.1143e-01, -3.9673e-01, -7

predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.6934],
        [ 0.0000, -1.6357],
        [ 0.0000, -1.7432],
        [ 0.0000, -1.7070],
        [ 0.0000, -1.7119],
        [ 0.0000, -1.7539],
        [ 0.0000, -1.7314],
        [ 0.0000, -1.7295],
        [ 0.0000, -1.7695],
        [ 0.0000, -1.7783],
        [ 0.0000, -1.7490],
        [ 0.0000, -1.7754],
        [ 0.0000, -1.7959],
        [ 0.0000, -1.7861],
        [ 0.0000, -1.7695],
        [ 0.0000, -1.7539],
        [ 0.0000, -1.7520],
        [ 0.0000, -1.7402],
        [ 0.0000, -1.7480],
        [ 0.0000, -1.7285],
        [ 0.0000, -1.7227],
        [ 0.0000, -1.7275],
        [ 0.0000, -1.7764]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       device='cuda:0')
question_end_index:  tensor([21], device='cuda:0')
start_logits:  tensor([ 1.

predict_support_sent.view(-1, 2).size():  torch.Size([13, 2])
sp_sent.view(-1).size():  torch.Size([13])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.7441],
        [ 0.0000, -1.7744],
        [ 0.0000, -1.7852],
        [ 0.0000, -1.7598],
        [ 0.0000, -1.7383],
        [ 0.0000, -1.7510],
        [ 0.0000, -1.6992],
        [ 0.0000, -1.7148],
        [ 0.0000, -1.7598],
        [ 0.0000, -1.7148],
        [ 0.0000, -1.7490],
        [ 0.0000, -1.7500],
        [ 0.0000, -1.7959]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
question_end_index:  tensor([16], device='cuda:0')
start_logits:  tensor([ 2.5342e-01, -1.2091e-01, -5.7526e-02, -2.0764e-01,  1.1035e-01,
        -9.0332e-02, -2.3853e-01,  4.4751e-01,  1.8665e-01,  1.4685e-01,
         4.4525e-02, -2.3230e-01,  4.3359e-01,  3.7134e-01,  1.9641e-01,
         4.4141e-01, -2.0557e-01,  3.6499e-01, -2.8101e-01,  1.4290e-02,
        -4.8682e-01,  3.6255e-01, -7

predict_support_sent.view(-1, 2).size():  torch.Size([23, 2])
sp_sent.view(-1).size():  torch.Size([23])
predict_support_sent.view(-1, 2):  tensor([[ 0.0000, -1.6621],
        [ 0.0000, -1.6738],
        [ 0.0000, -1.6426],
        [ 0.0000, -1.6592],
        [ 0.0000, -1.6445],
        [ 0.0000, -1.6396],
        [ 0.0000, -1.6748],
        [ 0.0000, -1.6240],
        [ 0.0000, -1.6211],
        [ 0.0000, -1.6387],
        [ 0.0000, -1.6631],
        [ 0.0000, -1.6084],
        [ 0.0000, -1.6611],
        [ 0.0000, -1.6846],
        [ 0.0000, -1.6680],
        [ 0.0000, -1.6533],
        [ 0.0000, -1.6592],
        [ 0.0000, -1.6572],
        [ 0.0000, -1.6299],
        [ 0.0000, -1.6455],
        [ 0.0000, -1.6553],
        [ 0.0000, -1.6289],
        [ 0.0000, -1.6299]], device='cuda:0')
sp_sent.view(-1):  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:0')
question_end_index:  tensor([15], device='cuda:0')
start_logits:  tensor([ 7.


Epoch 00005: avg_val_f1 reached 0.00887 (best 0.06897), saving model to /xdisk/msurdeanu/fanluo/hotpotQA/Data/jupyter-hotpotqa/hotpotqa-longformer_jupyter/_ckpt_epoch_5.ckpt as top 5


validation_epoch_end
before sync --> sizes:  29, 29, 29
after sync --> sizes: 29, 29, 29
avg_loss:  tensor(12.2815, device='cuda:0')	avg_answer_loss:  tensor(3.3925, device='cuda:0')	avg_type_loss:  tensor(0.2978, device='cuda:0')	avg_sp_para_loss:  tensor(0.4540, device='cuda:0')	avg_sp_sent_loss:  tensor(0.2859, device='cuda:0')
avg_val_f1:  0.008870564914982894	avg_val_em:  0.0	avg_val_prec:  0.006157635614789766	avg_val_recall:  0.03735632192471932
avg_val_sp_sent_f1:  0.0	avg_val_sp_sent_em:  0.0	avg_val_sp_sent_prec:  0.0	avg_val_sp_sent_recall:  0.0
avg_val_sp_para_f1:  0.1724137931034483	avg_val_sp_para_em:  0.034482758620689655	avg_val_sp_para_prec:  0.1724137931034483	avg_val_sp_para_recall:  0.1724137931034483
avg_val_joint_f1:  0.0	avg_val_joint_em:  0.0	avg_val_joint_prec:  0.0	avg_val_joint_recall:  0.0



1

In [None]:
#     trainer.test(model)

In [None]:
### To install apex ### 
#     !git clone https://github.com/NVIDIA/apex
#     os.chdir("/xdisk/msurdeanu/fanluo/hotpotQA/apex/")
#     !module load cuda101/neuralnet/7/7.6.4  
#     !module load cuda10.1/toolkit/10.1.243 
#     !conda install -c conda-forge cudatoolkit-dev --yes
#     !conda install -c conda-forge/label/cf201901 cudatoolkit-dev --yes
#     !conda install -c conda-forge/label/cf202003 cudatoolkit-dev --yes
#     !which nvcc
#     !python -m pip install -v --no-cache-dir ./
#     os.chdir("/xdisk/msurdeanu/fanluo/hotpotQA/")

In [39]:
# debug: check args
import shlex
argString ='--train_dataset small.json --dev_dataset small.json  \
    --gpus 0 --num_workers 4 \
    --max_seq_len 4096 --doc_stride -1  \
    --save_prefix hotpotqa-longformer_jupyter  --model_path /xdisk/msurdeanu/fanluo/hotpotQA/longformer-base-4096'
# hotpot_dev_distractor_v1.json
#  --train_dataset /xdisk/msurdeanu/fanluo/hotpotQA/Data/reduced_questions/hotpot_reduced_context_04-08-2021-01:12:53/hotpot_train_reduced_context_coref_fuzzy.json --dev_dataset /xdisk/msurdeanu/fanluo/hotpotQA/Data/reduced_questions/hotpot_reduced_context_04-08-2021-01:12:53/hotpot_dev_reduced_context_coref_fuzzy.json  \ 

import argparse 
if __name__ == "__main__":
    main_arg_parser = argparse.ArgumentParser(description="hotpotqa")
    parser = hotpotqa.add_model_specific_args(main_arg_parser, os.getcwd())
    args = parser.parse_args(shlex.split(argString)) 
    for arg in vars(args):
        print((arg, getattr(args, arg)))
    main(args)


('save_dir', 'jupyter-hotpotqa')
('save_prefix', 'hotpotqa-longformer_jupyter')
('train_dataset', 'small.json')
('dev_dataset', 'small.json')
('batch_size', 2)
('gpus', '0')
('warmup', 1000)
('lr', 5e-05)
('val_every', 1.0)
('val_percent_check', 1.0)
('num_workers', 4)
('seed', 1234)
('epochs', 6)
('max_seq_len', 4096)
('max_doc_len', 4096)
('max_num_answers', 64)
('max_question_len', 55)
('doc_stride', -1)
('ignore_seq_with_no_answers', False)
('disable_checkpointing', False)
('n_best_size', 20)
('max_answer_length', 30)
('regular_softmax_loss', False)
('test', False)
('model_path', '/xdisk/msurdeanu/fanluo/hotpotQA/longformer-base-4096')
('no_progress_bar', False)
('attention_mode', 'sliding_chunks')
('fp32', False)
('train_percent', 1.0)


### Sandbox

In [None]:
import torch
from longformer.longformer import Longformer, LongformerConfig
from longformer.sliding_chunks import pad_to_window_size
from transformers import RobertaTokenizer

config = LongformerConfig.from_pretrained('/xdisk/msurdeanu/fanluo/hotpotQA/longformer-base-4096') 
# choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
# 'n2': for regular n2 attantion
# 'tvm': a custom CUDA kernel implementation of our sliding window attention
# 'sliding_chunks': a PyTorch implementation of our sliding window attention
config.attention_mode = 'sliding_chunks'

In [None]:
model = Longformer.from_pretrained('/xdisk/msurdeanu/fanluo/hotpotQA/longformer-base-4096', config=config)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer.model_max_length = model.config.max_position_embeddings

In [None]:
SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document

input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

# TVM code doesn't work on CPU. Uncomment this if `config.attention_mode = 'tvm'`
model = model.cuda() 
input_ids = input_ids.cuda()

In [None]:
# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
attention_mask[:, [1, 4, 21,]] =  2  # Set global attention based on the task. For example,
                                     # classification: the <s> token
                                     # QA: question tokens

# padding seqlen to the nearest multiple of 512. Needed for the 'sliding_chunks' attention
input_ids, attention_mask = pad_to_window_size(
        input_ids, attention_mask, config.attention_window[0], tokenizer.pad_token_id)

In [None]:
output = model(input_ids, attention_mask=attention_mask)[0]
output

In [None]:

!nvidia-smi
!nvidia-smi -L

In [None]:
import torch
crossentropy_average = torch.nn.CrossEntropyLoss(reduction = 'mean', ignore_index=-1) 
crossentropy_weighted_average = torch.nn.CrossEntropyLoss(weight=torch.tensor([1.0, 4.0]), reduction = 'mean', ignore_index=-1) 

sp_para_output_t = torch.tensor([[
         [ 0.0227],
         [ 0.0365],
         [ 0.0054],
         [ 0.0401],
         [ 0.0590],
         [ 0.0134],
         [ 0.0336],
         [-0.0062],
         [ 0.0631],
         [ 0.0811]]])
sp_para_output_aux = torch.zeros(sp_para_output_t.shape, dtype=torch.float) 
predict_support_para = torch.cat([sp_para_output_aux, sp_para_output_t], dim=-1).view(-1,2)
# predict_support_para = torch.cat([sp_para_output_t, sp_para_output_aux], dim=-1).view(-1,2)
predict_support_para

sp_para =  torch.tensor([0, 0, 0, 0, 0, 1, 1, 0, 0, 0])
#sp_para_loss:  tensor(0.7061, device='cuda:0')

crossentropy_average(predict_support_para, sp_para)

crossentropy_weighted_average(predict_support_para, sp_para)

sp_para

positive_index = torch.nonzero(sp_para.view(-1)).view(-1).tolist()

from random import choice
choice([i for i in range(0,9) if i not in poositive_index])

set(range(10))

predict_support_para_1 = torch.tensor([[ 0.0000,  0.0227]])
sp_para_1 =  torch.tensor([0])
loss1 = crossentropy_average(predict_support_para_1, sp_para_1)
loss1

predict_support_para_2 = torch.tensor([[ 0.0000,  0.0365]])
sp_para_2 =  torch.tensor([0])
loss2 = crossentropy_average(predict_support_para_2, sp_para_2)
loss2

predict_support_para_3 = torch.tensor([[ 0.0000,  0.0054]])
sp_para_3 =  torch.tensor([0])
loss3 = crossentropy_average(predict_support_para_3, sp_para_3)
loss3

predict_support_para_4 = torch.tensor([[ 0.0000,  0.0401]])
sp_para_4 =  torch.tensor([0])
loss4 = crossentropy_average(predict_support_para_4, sp_para_4)
loss4

predict_support_para_5 = torch.tensor([[ 0.0000,  0.0590]])
sp_para_5 =  torch.tensor([0])
loss5 = crossentropy_average(predict_support_para_5, sp_para_5)
loss5

predict_support_para_6 = torch.tensor([[ 0.0000,  0.0134]])
sp_para_6 =  torch.tensor([1])
loss6 = crossentropy_average(predict_support_para_6, sp_para_6)
loss6

predict_support_para_7 = torch.tensor([[ 0.0000,  0.0336]])
sp_para_7 =  torch.tensor([1])
loss7 = crossentropy_average(predict_support_para_7, sp_para_7)
loss7

predict_support_para_8 = torch.tensor([[ 0.0000, -0.0062]])
sp_para_8 =  torch.tensor([0])
loss8 = crossentropy_average(predict_support_para_8, sp_para_8)
loss8

predict_support_para_9 = torch.tensor([[ 0.0000,  0.0631]])
sp_para_9 =  torch.tensor([0])
loss9 = crossentropy_average(predict_support_para_9, sp_para_9)
loss9

predict_support_para_10 = torch.tensor([[ 0.0000,  0.0811]])
sp_para_10 =  torch.tensor([0])
loss10 = crossentropy_average(predict_support_para_10, sp_para_10)
loss10

(loss1+loss2+loss3+loss4+loss5+loss6+loss7+loss8+loss9+loss10)/10

(loss1+loss2+loss3+loss4+loss5+4.0*loss6+4.0*loss7+loss8+loss9+loss10)/(8+2*4)

In [None]:
s_to_p_map = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7, 7, 7, 8, 9]
sp_sent_logits = torch.tensor([[[-0.0143],
         [ 0.0416],
         [ 0.1065],
         [-0.0007],
         [ 0.0407],
         [ 0.1273],
         [ 0.0663],
         [ 0.0500],
         [ 0.0630],
         [ 0.1230],
         [ 0.0140],
         [ 0.0205],
         [-0.0505],
         [ 0.0224],
         [ 0.0907],
         [-0.0142],
         [ 0.0180],
         [ 0.0564],
         [-0.0132],
         [ 0.0016],
         [ 0.0327],
         [ 0.0687],
         [ 0.1049],
         [ 0.0568],
         [ 0.0740],
         [ 0.0006],
         [ 0.1159],
         [ 0.0566],
         [ 0.0543],
         [-0.0304],
         [ 0.1315],
         [-0.0143],
         [ 0.0444],
         [-0.0682],
         [ 0.1120],
         [ 0.1247]]])


In [None]:

if(len(s_to_p_map)>0):
    for i in range(s_to_p_map[-1]+1)
        sent_logits_torch.masked_select(sp_sent_logits.squeeze(), torch.tensor([p==0 for p in s_to_p_map]))

In [None]:
list(range(0))
