### Convert HotpotQA to SQuAD format

According to Longformer: use the following input format with special tokens:  “[CLS] [q] question [/q] [p] sent1,1 [s] sent1,2 [s] ... [p] sent2,1 [s] sent2,2 [s] ...” 
where [s] and [p] are special tokens representing sentences and paragraphs. The special tokens were added to the RoBERTa vocabulary and randomly initialized before task finetuning.

In [1]:
# Helper functions to convert HotpotQA to SQuAD format, modified from https://github.com/chiayewken/bert-qa/blob/master/run_hotpot.py

import tqdm

def create_example_dict(context, answers, id, is_impossible, question, is_sup_fact, is_supporting_para):
    """Package one question and its context as a SQuAD-style paragraph entry.

    Args:
        context: the context string the question is asked about.
        answers: list of answer dicts for the question.
        id: identifier of the question.
        is_impossible: flag stored alongside the question.
        question: the question text.
        is_sup_fact: per-sentence supporting-fact flags.
        is_supporting_para: per-paragraph supporting flags.

    Returns:
        A dict with the "context" and a one-element "qas" list; in hotpotqa
        each context corresponds to exactly one question/answer entry.
    """
    qa_entry = {
        "answers": answers,
        "id": id,
        "is_impossible": is_impossible,
        "question": question,
        "is_sup_fact": is_sup_fact,
        "is_supporting_para": is_supporting_para,
    }
    return {"context": context, "qas": [qa_entry]}


def create_para_dict(example_dicts):
    """Wrap one or more example dicts under a SQuAD-style "paragraphs" key.

    Args:
        example_dicts: either a single example mapping (as built by
            `create_example_dict`) or an already-built list of them.

    Returns:
        ``{"paragraphs": [...]}``. A single mapping is wrapped in a
        one-element list, because each paragraph corresponds to only one
        [context, qas] pair in hotpotqa; any other value is passed through
        unchanged.
    """
    # isinstance (rather than an exact type comparison) so that dict
    # subclasses such as OrderedDict are wrapped in a list as well.
    if isinstance(example_dicts, dict):
        example_dicts = [example_dicts]
    return {"paragraphs": example_dicts}

In [2]:
import re
def convert_hotpot_to_squad_format(json_dict, gold_paras_only=False):
    """Convert loaded HotpotQA examples into a SQuAD-style dictionary.

    Note: in SQuAD one context corresponds to several qas, and each
    "paragraph" holds a context plus a list of qas. In HotpotQA one question
    comes with several paragraphs, which are concatenated here into a single
    context string: every paragraph is introduced by " <p> " and its
    sentences are separated by " <s> ".

    Args:
        json_dict: the original data loaded from a HotpotQA file (an iterable
            of examples with "supporting_facts", "context", "answer",
            "question" and "_id" keys).
        gold_paras_only: when true, keep only the paragraphs whose title
            appears in the supporting facts; when false, use every paragraph
            of the example.

    Returns:
        A dict ``{"data": [...]}`` that can be used like a loaded SQuAD json
        file (usage: ``input_data = new_dict["data"]``, see
        https://github.com/google-research/bert/blob/eedf5716ce1268e56f0a50264a88cafad334ac61/run_squad.py#L230).
        For each question the answers are encoded as follows: "yes" gets
        answer_start/answer_end of -1, "no" gets -2, and any other answer
        gets one entry per occurrence of the answer text in the context.
    """
    squad_entries = []
    for example in json_dict:
        supporting_facts = example["supporting_facts"]
        gold_titles = {title for title, _ in supporting_facts}
        gold_sentences = {tuple(fact) for fact in supporting_facts}

        paragraphs = example["context"]
        if gold_paras_only:
            paragraphs = [para for para in paragraphs if para[0] in gold_titles]

        # Extra spaces around the markers are harmless: the context is split
        # on whitespace later, and most sentences already start with a space.
        context = " <p> " + " <p> ".join(" <s> ".join(para[1]) for para in paragraphs)

        # One flag per paragraph, and one flag per context sentence.
        is_supporting_para = [title in gold_titles for title, _ in paragraphs]
        is_sup_fact = [
            (title, sent_id) in gold_sentences
            for title, sentences in paragraphs
            for sent_id, _ in enumerate(sentences)
        ]

        answer = example["answer"].strip()
        lowered = answer.lower()
        if lowered == 'yes':
            answers = [{"answer_start": -1, "answer_end": -1, "text": answer}]
        elif lowered == 'no':
            answers = [{"answer_start": -2, "answer_end": -2, "text": answer}]
        else:
            # Keep every occurrence of the answer text in the context.
            answers = [
                {"answer_start": match.start(), "answer_end": match.end(), "text": answer}
                for match in re.finditer(re.escape(answer), context)
            ]

        example_dict = create_example_dict(
            context=context,
            answers=answers,
            id=example["_id"],
            is_impossible=not answers,
            question=example["question"],
            is_sup_fact=is_sup_fact,
            is_supporting_para=is_supporting_para,
        )
        squad_entries.append(create_para_dict(example_dict))
    return {"data": squad_entries}

In [3]:
# debug: check whether convert_hotpot_to_squad_format() works
import os
# NOTE: hard-coded, machine-specific working directory; the relative file
# names used below are resolved against it.
os.chdir('/xdisk/msurdeanu/fanluo/hotpotQA/')
# The two commented-out shell commands below (kept for reference) slice the
# full training file with jq into the small sample files small.json and
# small_dev.json.
#!cat /xdisk/msurdeanu/fanluo/hotpotQA/hotpot_train_v1.1.json | ../jq-linux64 -c '.[0:16]' > small.json
#!cat /xdisk/msurdeanu/fanluo/hotpotQA/hotpot_train_v1.1.json | ../jq-linux64 -c '.[17:30]' > small_dev.json

import json
# Convert the sample file and pretty-print the first converted entry so the
# resulting structure can be inspected by eye.
with open("small.json", "r", encoding='utf-8') as f:  
    json_dict = convert_hotpot_to_squad_format(json.load(f))['data']
    print(json.dumps(json_dict[0], indent=2))

{
  "paragraphs": [
    {
      "context": " <p> Radio City is India's first private FM radio station and was started on 3 July 2001. <s>  It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003). <s>  It plays Hindi, English and regional songs. <s>  It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007. <s>  Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features. <s>  The Radio station currently plays a mix of Hindi and Regional music. <s>  Abraham Thomas is the CEO of the company. <p> Football in Albania existed before the Albanian Football Federation (FSHF) was created. <s>  This was evidenced by the team's registration at the Balkan Cup tournament during 1929-1931, which started in 

### Longformer's fine-tuning


- For answer span extraction we use BERT’s QA model with addition of a question type (yes/no/span) classification head over the first special token ([CLS]).

- For evidence extraction we apply 2 layer feedforward networks on top of the representations corresponding to sentence and paragraph tokens to get the corresponding evidence prediction scores and use binary cross entropy loss to train the model.

- We combine span, question classification, sentence, and paragraphs losses and train the model in a multitask way using linear combination of losses.


In [4]:
### Section 2: This is modified from Longformer's fine-tuning script triviaqa.py: https://github.com/allenai/longformer/blob/master/scripts/triviaqa.py
# !conda install transformers --yes
# !conda install cudatoolkit=10.0 --yes
# !python -m pip install git+https://github.com/allenai/longformer.git
# !conda install -c conda-forge regex --force-reinstall --yes
# !conda install pytorch-lightning -c conda-forge
# !pip install jdc 
# !pip install test-tube 
# !conda install ipywidgets --yes
# !conda update --force conda --yes  
# !jupyter nbextension enable --py widgetsnbextension 
# !conda install jupyter --yes

# This needs to be run every time the notebook is started: it adds python3.7/site-packages to sys.path so that ipywidgets can be imported, which is needed by RobertaTokenizer.from_pretrained('roberta-base')
import sys
sys.path.insert(-1, '/xdisk/msurdeanu/fanluo/miniconda3/lib/python3.7/site-packages')

import os
from collections import defaultdict
import json
import string
import random
import numpy as np
import torch
from torch.optim.lr_scheduler import LambdaLR

from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer 

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel
from pytorch_lightning.logging import TestTubeLogger    # sometimes pytorch_lightning.loggers works instead


from longformer.longformer import Longformer
from longformer.sliding_chunks import pad_to_window_size
import jdc
from more_itertools import locate
from collections import Counter

#### class hotpotqaDataset

##### \_\_init\_\_, \_\_getitem\_\_ and \_\_len\_\_ 

In [5]:
class hotpotqaDataset(Dataset):
    """Dataset of HotpotQA examples converted to a SQuAD-style layout.

    Largely based on
    https://github.com/allenai/allennlp/blob/master/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py
    and
    https://github.com/huggingface/transformers/blob/master/examples/run_squad.py
    """

    def __init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride,
                 max_num_answers, ignore_seq_with_no_answers, max_question_len):
        """Load and convert the HotpotQA file and store the preprocessing settings.

        Args:
            file_path: path of the HotpotQA json file; it must exist.
            tokenizer: tokenizer object; its special tokens and ids are printed.
            max_seq_len, max_doc_len, doc_stride, max_num_answers,
            ignore_seq_with_no_answers, max_question_len: preprocessing
                settings, stored unchanged as attributes of the same name.
        """
        assert os.path.isfile(file_path)
        self.file_path = file_path
        with open(self.file_path, "r", encoding='utf-8') as handle:
            print(f'reading file: {self.file_path}')
            raw_examples = json.load(handle)
            self.data_json = convert_hotpot_to_squad_format(raw_examples)['data']

        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.max_doc_len = max_doc_len
        self.doc_stride = doc_stride
        self.max_num_answers = max_num_answers
        self.ignore_seq_with_no_answers = ignore_seq_with_no_answers
        self.max_question_len = max_question_len

        print(tokenizer.all_special_tokens)
        print(tokenizer.all_special_ids)

        # A mapping from qid to an int, which can be synched across gpus using
        # `torch.distributed`. It is only built when the file path does not
        # contain 'train', i.e. for the evaluation set.
        if 'train' in self.file_path:
            self.val_qid_string_to_int_map = None
        else:
            qid_to_index = {}
            for index, entry in enumerate(self.data_json):
                qid = entry["paragraphs"][0]['qas'][0]['id']
                qid_to_index[qid] = index
            self.val_qid_string_to_int_map = qid_to_index

    def __len__(self):
        """Return the number of converted examples."""
        return len(self.data_json)

    def __getitem__(self, idx):
        """Return the tensors of example `idx`; exactly one tuple is expected per example."""
        tensors_list = self.one_example_to_tensors(self.data_json[idx], idx)
        assert len(tensors_list) == 1
        return tensors_list[0]
    

##### one_example_to_tensors

In [6]:
    %%add_to hotpotqaDataset
    def one_example_to_tensors(self, example, idx):
        """Turn one converted HotpotQA example into the tensors consumed by the model.

        Args:
            example: one entry of the converted data: a dict with a
                "paragraphs" list whose items hold a "context" string and a
                "qas" list (the layout produced by
                `convert_hotpot_to_squad_format`).
            idx: position of the example in the dataset; only used in the
                message printed when an answer offset cannot be mapped.

        Returns:
            A list with one tuple per question:
            ``(input_ids, input_mask, segment_ids, start_positions,
            end_positions, q_type, sp_sent, sp_para, qid)``.
            The first six items are tensors with one row/entry per document
            slice. `q_type` is 0 for a span answer, 1 when the answer start is
            -1 and 2 when it is -2 (the "yes" / "no" encodings of the
            converter). `sp_sent` and `sp_para` are 0/1 tensors with a single
            row, and `qid` is the question id stored in the example.
        """
        def is_whitespace(c):
            return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F

        tensors_list = []
        for paragraph in example["paragraphs"]:  # example["paragraphs"] only contains one paragraph in hotpotqa
            paragraph_text = paragraph["context"]

            # Split the context on whitespace and record, for every character,
            # the index of the word it belongs to.
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)  # add a new token
                    else:
                        doc_tokens[-1] += c   # append the character to the last token
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            # Word positions of the paragraph and sentence marker tokens.
            p_list = [i for i, tok in enumerate(doc_tokens) if tok == "<p>"]
            s_list = [i for i, tok in enumerate(doc_tokens) if tok == "<s>"]

            for qa in paragraph["qas"]:
                question_text = qa["question"]
                sp_sent = qa["is_sup_fact"]
                sp_para = qa["is_supporting_para"]

                # One <p> per paragraph flag; every sentence is introduced by
                # either a <p> or a <s> marker.
                assert len(p_list) == len(sp_para)
                assert len(s_list) + len(p_list) == len(sp_sent)

                # Keep all answers in the document, not just the first matched answer.
                answer_spans = []
                for answer in qa["answers"]:
                    answer_start = answer["answer_start"]
                    answer_end = answer["answer_end"]
                    if answer_start >= 0 and answer_end > 0:
                        try:
                            start_word_position = char_to_word_offset[answer_start]
                            end_word_position = char_to_word_offset[answer_end - 1]
                        except IndexError:
                            # Character offset outside the context: mark the
                            # span with -3 so it is skipped further down.
                            print(f'error: Reading example {idx} failed')
                            start_word_position = -3
                            end_word_position = -3
                    else:
                        # Negative offsets are passed through unchanged.
                        start_word_position = answer_start
                        end_word_position = answer_end
                    answer_spans.append({'start': start_word_position, 'end': end_word_position})

                # ===== Given an example, convert it into tensors  =============
                query_tokens = self.tokenizer.tokenize(question_text)
                query_tokens = query_tokens[:self.max_question_len]
                tok_to_orig_index = []
                orig_to_tok_index = []
                all_doc_tokens = []

                # each original token in the context is tokenized to multiple sub_tokens
                for (i, token) in enumerate(doc_tokens):
                    orig_to_tok_index.append(len(all_doc_tokens))
                    # hack: the line below should have been `self.tokenizer.tokenize(token)`
                    # but roberta tokenizer uses a different subword if the token is the beginning of the string
                    # or in the middle. So for all tokens other than the first, simulate that it is not the first
                    # token by prepending a period before tokenizing, then dropping the period afterwards
                    sub_tokens = self.tokenizer.tokenize(f'. {token}')[1:] if i > 0 else self.tokenizer.tokenize(token)
                    for sub_token in sub_tokens:
                        tok_to_orig_index.append(i)
                        all_doc_tokens.append(sub_token)

                # all sub tokens, truncate up to limit
                all_doc_tokens = all_doc_tokens[:self.max_doc_len - 3]

                # The -3 accounts for [CLS], [q], [/q]
                max_tokens_per_doc_slice = self.max_seq_len - len(query_tokens) - 3
                assert max_tokens_per_doc_slice > 0

                # A negative configured doc_stride means "no sliding window, use
                # only the first slice". The effective stride is kept in a local
                # so the configured value on `self` is not overwritten per example.
                use_sliding_window = self.doc_stride >= 0
                if use_sliding_window:
                    doc_stride = self.doc_stride
                else:
                    # large negative value so that the slice loop executes once
                    doc_stride = -100 * len(all_doc_tokens)

                # inputs to the model
                input_ids_list = []
                input_mask_list = []
                segment_ids_list = []
                start_positions_list = []
                end_positions_list = []
                q_type_list = []
                sp_sent_list = [1 if ss else 0 for ss in sp_sent]
                sp_para_list = [1 if sp else 0 for sp in sp_para]

                for slice_start in range(0, len(all_doc_tokens), max_tokens_per_doc_slice - doc_stride):
                    slice_end = min(slice_start + max_tokens_per_doc_slice, len(all_doc_tokens))

                    doc_slice_tokens = all_doc_tokens[slice_start:slice_end]
                    tokens = ["<cls>"] + ["<q>"] + query_tokens + ["</q>"] + doc_slice_tokens
                    segment_ids = [0] * (len(query_tokens) + 3) + [1] * len(doc_slice_tokens)
                    assert len(segment_ids) == len(tokens)

                    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                    input_mask = [1] * len(input_ids)

                    if use_sliding_window:  # no need to pad if document is not strided
                        # Pad up to the sequence length.
                        padding_len = self.max_seq_len - len(input_ids)
                        input_ids.extend([self.tokenizer.pad_token_id] * padding_len)
                        input_mask.extend([0] * padding_len)
                        segment_ids.extend([0] * padding_len)

                        assert len(input_ids) == self.max_seq_len
                        assert len(input_mask) == self.max_seq_len
                        assert len(segment_ids) == self.max_seq_len

                    # ===== answer positions tensors  ============
                    doc_offset = len(query_tokens) + 3 - slice_start  # where context starts
                    start_positions = []
                    end_positions = []
                    q_type = None
                    for answer_span in answer_spans:
                        span_start = answer_span['start']   # word position relative to context
                        span_end = answer_span['end']
                        if span_start >= 0:
                            tok_start_position_in_doc = orig_to_tok_index[span_start]  # sub_token position relative to context
                            not_end_of_doc = int(span_end + 1 < len(orig_to_tok_index))
                            tok_end_position_in_doc = orig_to_tok_index[span_end + not_end_of_doc] - not_end_of_doc
                            if tok_start_position_in_doc < slice_start or tok_end_position_in_doc > slice_end:
                                # this answer is outside the current slice
                                continue
                            # sub_token positions relative to the beginning of all tokens, including the query
                            start_positions.append(tok_start_position_in_doc + doc_offset)
                            end_positions.append(tok_end_position_in_doc + doc_offset)
                            q_type = 0
                        elif span_start == -1:
                            q_type = 1
                            start_positions.append(-1)  # will be ignored
                            end_positions.append(-1)
                        elif span_start == -2:
                            q_type = 2
                            start_positions.append(-1)
                            end_positions.append(-1)
                        else:
                            # unknown start position (e.g. the -3 marker set above)
                            continue
                    assert len(start_positions) == len(end_positions)

                    if self.ignore_seq_with_no_answers and len(start_positions) == 0:
                        continue

                    # drop answers beyond self.max_num_answers
                    start_positions = start_positions[:self.max_num_answers]
                    end_positions = end_positions[:self.max_num_answers]

                    # -1 padding up to self.max_num_answers
                    padding_len = self.max_num_answers - len(start_positions)
                    start_positions.extend([-1] * padding_len)
                    end_positions.extend([-1] * padding_len)

                    # replace duplicate start/end positions with `-1` because duplicates can result into -ve loss values
                    found_start_positions = set()
                    found_end_positions = set()
                    for i, (start, end) in enumerate(zip(start_positions, end_positions)):
                        if start in found_start_positions:
                            start_positions[i] = -1
                        if end in found_end_positions:
                            end_positions[i] = -1
                        found_start_positions.add(start)
                        found_end_positions.add(end)

                    input_ids_list.append(input_ids)
                    input_mask_list.append(input_mask)
                    segment_ids_list.append(segment_ids)
                    start_positions_list.append(start_positions)
                    end_positions_list.append(end_positions)
                    q_type_list.append(q_type)

                tensors_list.append((torch.tensor(input_ids_list), torch.tensor(input_mask_list), torch.tensor(segment_ids_list),
                                     torch.tensor(start_positions_list), torch.tensor(end_positions_list), torch.tensor(q_type_list),
                                     torch.tensor([sp_sent_list]), torch.tensor([sp_para_list]),
                                     qa['id']))
        return tensors_list

##### collate_one_doc_and_lists

In [7]:
    %%add_to hotpotqaDataset
    @staticmethod
    def collate_one_doc_and_lists(batch):
        num_metadata_fields = 1  # qids  
        fields = [x for x in zip(*batch)]
        stacked_fields = [torch.stack(field) for field in fields[:-num_metadata_fields]]  # don't stack metadata fields
        stacked_fields.extend(fields[-num_metadata_fields:])  # add them as lists not torch tensors

        # always use batch_size=1 where each batch is one document
        # will use grad_accum to increase effective batch size
        assert len(batch) == 1
        fields_with_batch_size_one = [f[0] for f in stacked_fields]
        return fields_with_batch_size_one

##### class info

In [8]:
dir(hotpotqaDataset)

['__add__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'collate_one_doc_and_lists',
 'one_example_to_tensors']

In [9]:
from inspect import getmembers
getmembers(hotpotqaDataset)

[('__add__', <function torch.utils.data.dataset.Dataset.__add__(self, other)>),
 ('__class__', type),
 ('__delattr__', <slot wrapper '__delattr__' of 'object' objects>),
 ('__dict__',
  mappingproxy({'__module__': '__main__',
                '__doc__': '\n    Largely based on\n    https://github.com/allenai/allennlp/blob/master/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py\n    and\n    https://github.com/huggingface/transformers/blob/master/examples/run_squad.py\n    ',
                '__init__': <function __main__.hotpotqaDataset.__init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride, max_num_answers, ignore_seq_with_no_answers, max_question_len)>,
                '__len__': <function __main__.hotpotqaDataset.__len__(self)>,
                '__getitem__': <function __main__.hotpotqaDataset.__getitem__(self, idx)>,
                'one_example_to_tensors': <function __main__.one_example_to_tensors(self, example, idx)>,
                'collate_one

In [10]:
from inspect import isfunction
# Keep only the (name, value) members of hotpotqaDataset whose value is a
# plain function, then display them as the cell result.
functions_list = [o for o in getmembers(hotpotqaDataset) if isfunction(o[1])]
functions_list

[('__add__', <function torch.utils.data.dataset.Dataset.__add__(self, other)>),
 ('__getitem__', <function __main__.hotpotqaDataset.__getitem__(self, idx)>),
 ('__init__',
  <function __main__.hotpotqaDataset.__init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride, max_num_answers, ignore_seq_with_no_answers, max_question_len)>),
 ('__len__', <function __main__.hotpotqaDataset.__len__(self)>),
 ('collate_one_doc_and_lists',
  <function __main__.collate_one_doc_and_lists(batch)>),
 ('one_example_to_tensors',
  <function __main__.one_example_to_tensors(self, example, idx)>)]

In [11]:
import inspect
inspect.getmro(hotpotqaDataset)  # a hierarchy of classes 

(__main__.hotpotqaDataset, torch.utils.data.dataset.Dataset, object)

In [12]:
inspect.getfullargspec(hotpotqaDataset.one_example_to_tensors)

FullArgSpec(args=['self', 'example', 'idx'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={})

In [13]:
help(hotpotqaDataset)

Help on class hotpotqaDataset in module __main__:

class hotpotqaDataset(torch.utils.data.dataset.Dataset)
 |  Largely based on
 |  https://github.com/allenai/allennlp/blob/master/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py
 |  and
 |  https://github.com/huggingface/transformers/blob/master/examples/run_squad.py
 |  
 |  Method resolution order:
 |      hotpotqaDataset
 |      torch.utils.data.dataset.Dataset
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(self, idx)
 |  
 |  __init__(self, file_path, tokenizer, max_seq_len, max_doc_len, doc_stride, max_num_answers, ignore_seq_with_no_answers, max_question_len)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __len__(self)
 |  
 |  one_example_to_tensors(self, example, idx)
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  collate_one_doc_and_lists(batch)
 |  
 |  -----------------------

#### class hotpotqa

##### \_\_init\_\_,  forward, dataloaders

In [98]:
class hotpotqa(pl.LightningModule):
    def __init__(self, args):
        """Build the tokenizer, the Longformer encoder and the task heads.

        Args:
            args: run configuration, stored as both ``self.args`` and
                ``self.hparams``. ``args.max_seq_len`` is read here to cap the
                tokenizer's maximum length.
        """
        super().__init__()
        self.args = args
        self.hparams = args

        # RoBERTa vocabulary extended with the marker tokens used in the input.
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        tokenizer.add_special_tokens({"additional_special_tokens": ["<cls>", "<p>", "<q>", "</q>"]})
        tokenizer.model_max_length = self.args.max_seq_len
        self.tokenizer = tokenizer

        # Encoder; its embedding matrix is resized to cover the added tokens.
        self.model = self.load_model()
        self.model.resize_token_embeddings(len(self.tokenizer))
        hidden_size = self.model.config.hidden_size

        # Answer span head: one start score and one end score per token.
        self.num_labels = 2
        self.qa_outputs = torch.nn.Linear(hidden_size, self.num_labels)

        # Question type head (yes / no / span).
        self.dense_type = torch.nn.Linear(hidden_size, hidden_size)
        self.linear_type = torch.nn.Linear(hidden_size, 3)

        # Supporting sentence head: a single score per sentence marker.
        self.dense_sp_sent = torch.nn.Linear(hidden_size, hidden_size)
        self.linear_sp_sent = torch.nn.Linear(hidden_size, 1)

        # Supporting paragraph head: a single score per paragraph marker.
        self.dense_sp_para = torch.nn.Linear(hidden_size, hidden_size)
        self.linear_sp_para = torch.nn.Linear(hidden_size, 1)

        # Dataloaders are created lazily and cached by the *_dataloader methods.
        self.train_dataloader_object = None
        self.val_dataloader_object = None
        self.test_dataloader_object = None
    
    def load_model(self):
        """Load the pretrained Longformer and prepare it for fine-tuning.

        Every encoder layer's attention mode is set from
        ``self.args.attention_mode`` and ``self.args.attention_window`` is
        overwritten with the layer's attention window (the value of the last
        layer wins). All parameters are made trainable and the model is put
        in training mode.

        Returns:
            The loaded Longformer model.
        """
        # The checkpoint name is fixed here; self.args.model_path is not used.
        longformer = Longformer.from_pretrained('longformer-base-4096')
        for encoder_layer in longformer.encoder.layer:
            self_attention = encoder_layer.attention.self
            self_attention.attention_mode = self.args.attention_mode
            self.args.attention_window = self_attention.attention_window

        print("Loaded model with config:")
        print(longformer.config)

        for param in longformer.parameters():
            param.requires_grad_(True)
        longformer.train()
        return longformer

#%%add_to hotpotqa    # does not seem to work with the @pl.data_loader decorator; omitting the decorator causes the error "validation_step() takes 3 positional arguments but 4 were given"
###################################################### dataloaders ########################################################### 
    @pl.data_loader
    def train_dataloader(self):
        """Return the training DataLoader, building and caching it on first use.

        The loader yields one example per batch. Under DDP a
        DistributedSampler is used; otherwise the DataLoader shuffles itself.
        """
        cached = self.train_dataloader_object
        if cached is not None:
            return cached

        cfg = self.args
        train_set = hotpotqaDataset(
            file_path=cfg.train_dataset,
            tokenizer=self.tokenizer,
            max_seq_len=cfg.max_seq_len,
            max_doc_len=cfg.max_doc_len,
            doc_stride=cfg.doc_stride,
            max_num_answers=cfg.max_num_answers,
            max_question_len=cfg.max_question_len,
            ignore_seq_with_no_answers=cfg.ignore_seq_with_no_answers,
        )

        if self.trainer.use_ddp:
            sampler = torch.utils.data.distributed.DistributedSampler(train_set)
        else:
            sampler = None

        # shuffle and sampler are mutually exclusive in DataLoader.
        self.train_dataloader_object = DataLoader(
            train_set,
            batch_size=1,
            shuffle=(sampler is None),
            num_workers=cfg.num_workers,
            sampler=sampler,
            collate_fn=hotpotqaDataset.collate_one_doc_and_lists,
        )
        return self.train_dataloader_object

    @pl.data_loader
    def val_dataloader(self):
        """Return the validation DataLoader, building and caching it on first use.

        The dev set is loaded with ``ignore_seq_with_no_answers=False`` so that
        every example is evaluated. The loader yields one example per batch and
        does not shuffle: validation order should be stable between runs, and
        shuffling evaluation data brings no benefit.
        """
        if self.val_dataloader_object is not None:
            return self.val_dataloader_object
        dataset = hotpotqaDataset(file_path=self.args.dev_dataset, tokenizer=self.tokenizer,
                                  max_seq_len=self.args.max_seq_len, max_doc_len=self.args.max_doc_len,
                                  doc_stride=self.args.doc_stride,
                                  max_num_answers=self.args.max_num_answers,
                                  max_question_len=self.args.max_question_len,
                                  ignore_seq_with_no_answers=False)  # evaluation data should keep all examples
        # Under DDP each process evaluates its own shard of the dev set.
        sampler = torch.utils.data.distributed.DistributedSampler(dataset) if self.trainer.use_ddp else None
        dl = DataLoader(dataset, batch_size=1, shuffle=False,
                        num_workers=self.args.num_workers, sampler=sampler,
                        collate_fn=hotpotqaDataset.collate_one_doc_and_lists)
        self.val_dataloader_object = dl
        return self.val_dataloader_object

    @pl.data_loader
    def test_dataloader(self):
        """Return the test DataLoader, building and caching it on first use.

        The test loader reads ``self.args.dev_dataset`` (the same file as
        validation), keeps every example, never shuffles and uses no sampler.
        """
        cached = self.test_dataloader_object
        if cached is not None:
            return cached

        cfg = self.args
        eval_set = hotpotqaDataset(
            file_path=cfg.dev_dataset,
            tokenizer=self.tokenizer,
            max_seq_len=cfg.max_seq_len,
            max_doc_len=cfg.max_doc_len,
            doc_stride=cfg.doc_stride,
            max_num_answers=cfg.max_num_answers,
            max_question_len=cfg.max_question_len,
            ignore_seq_with_no_answers=False,  # evaluation data should keep all examples
        )

        self.test_dataloader_object = DataLoader(
            eval_set,
            batch_size=1,
            shuffle=False,
            num_workers=cfg.num_workers,
            sampler=None,
            collate_fn=hotpotqaDataset.collate_one_doc_and_lists,
        )
        return self.test_dataloader_object

#%%add_to hotpotqa  
    def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_positions, q_type, sp_sent, sp_para):
        """Encode one example with Longformer and compute the joint multi-task loss.

        The incoming ``attention_mask`` is discarded and rebuilt here: 1 (local
        attention) everywhere, 2 (global attention) for every token before
        ``</q>`` and for every ``<p>`` / ``<s>`` marker. ``segment_ids`` is
        accepted but not used.

        Shapes seen in a debug run (batch size is always 1):
            input_ids / attention_mask / segment_ids: [1, 1495]
            start_positions / end_positions: [1, 64], padded with -1 up to
                max_num_answers (several occurrences of the same answer string)
            sp_sent: [1, 40] (one entry per sentence in the context)
            sp_para: [1, 10]

        Returns:
            Tuple ``(total_loss, start, end, q_type, sp_para, sp_sent)`` where
            every element after the loss is the sigmoid of the matching logits.

        Raises:
            AssertionError: if ``input_ids`` holds more than one row.
        """
        print("size of input_ids: " + str(input_ids.size())) 
        print("size of attention_mask: " + str(attention_mask.size()))
        print("size of segment_ids: " + str(segment_ids.size()))
        print("size of start_positions: " + str(start_positions.size()))
        print("size of end_positions:" + str(end_positions.size()))
        print("q_type: " + str(q_type))
        print("size of sp_sent: " + str(sp_sent.size()))
        print("size of sp_para: " + str(sp_para.size()))
        # Each batch is exactly one example. The original `assert("...")` asserted a
        # non-empty string and therefore could never fail.
        assert input_ids.size(0) == 1, "multi rows per document"

        # local attention everywhere
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)

        # global attention for the cls and all question tokens
        question_end_index = self._get_special_index(input_ids, "</q>")
        attention_mask[:,:question_end_index.item()] = 2  # from <cls> until </q>

        # global attention for the sentence and paragraph special tokens
        p_index = self._get_special_index(input_ids, "<p>")
        print("size of p_index: " + str(p_index.size()))
        attention_mask[:, p_index] = 2

        s_index = self._get_special_index(input_ids, "<s>")
        print("size of s_index: " + str(s_index.size()))
        attention_mask[:, s_index] = 2

        print("p_index:", p_index) 

        # sliding_chunks implementation of self-attention requires that seqlen is a multiple of window size
        input_ids, attention_mask = pad_to_window_size(
            input_ids, attention_mask, self.args.attention_window, self.tokenizer.pad_token_id)

        sequence_output = self.model(
                input_ids,
                attention_mask=attention_mask)[0]

        # The pretrained hotpotqa model wasn't trained with padding, so remove padding tokens
        # before computing loss and decoding.
        padding_len = input_ids[0].eq(self.tokenizer.pad_token_id).sum()
        if padding_len > 0:
            sequence_output = sequence_output[:, :-padding_len]
        print("size of sequence_output: " + str(sequence_output.size()))

        ###################################### layers on top of sequence_output ##################################

        ### 1. answer start and end positions classification ###
        logits = self.qa_outputs(sequence_output)
        print("size of logits: " + str(logits.size())) 
        start_logits, end_logits = logits.split(1, dim=-1)
        print("size of start_logits: " + str(start_logits.size())) 
        start_logits = start_logits.squeeze(-1)
        print("size of start_logits after squeeze: " + str(start_logits.size())) 
        end_logits = end_logits.squeeze(-1)

        ### 2. type classification on the first token (dense -> tanh -> linear), similar to
        ### LongformerClassificationHead in huggingface transformers ###
        print("size of sequence_output[:,0]: " + str(sequence_output[:,0].size()))
        type_logits = self.dense_type(sequence_output[:,0])
        print("size of type_logits after dense: " + str(type_logits.size()))
        # Non-linearity
        type_logits = torch.tanh(type_logits) 
        print("size of type_logits after tanh: " + str(type_logits.size()))
        type_logits = self.linear_type(type_logits)
        print("size of type_logits: " + str(type_logits.size()))

        ### 3. supporting paragraph classification: one score per <p> marker ###
        sp_para_output = sequence_output[:,p_index,:]
        print("size of sp_para_output: " + str(sp_para_output.size()))      

        sp_para_output_t = self.dense_sp_para(sp_para_output)
        print("size of sp_para_output_t after dense: " + str(sp_para_output_t.size()))   
        # Non-linearity
        sp_para_output_t = torch.tanh(sp_para_output_t) 
        print("size of sp_para_output_t after tanh: " + str(sp_para_output_t.size()))
        sp_para_output_t = self.linear_sp_para(sp_para_output_t)
        print("size of sp_para_output_t: " + str(sp_para_output_t.size()))   

        # The head produces a single score per paragraph instead of two scores for yes and no.
        # Augment it with an additional constant score of 0 (as done in the HotpotQA paper) so that
        # a 2-way cross entropy can be applied.
        sp_para_output_aux = torch.zeros(sp_para_output_t.shape, dtype=torch.float, device=sp_para_output_t.device) 
        print("size of sp_para_output_aux: " + str(sp_para_output_aux.size()))   
        predict_support_para = torch.cat([sp_para_output_aux, sp_para_output_t], dim=-1).contiguous()
        print("size of predict_support_para: " + str(predict_support_para.size()))              

        ### 4. supporting fact classification ###
        # the first sentence in a paragraph is led by <p>, the other sentences are led by <s>
        sent_indexes = torch.sort(torch.cat((s_index, p_index)))[0] # torch.sort returns (values, indices); keep the values
        print("size of sent_indexes: " + str(sent_indexes.size()))
        print("sent_indexes: ", sent_indexes)
        sp_sent_output = sequence_output[:,sent_indexes,:]
        print("size of sp_sent_output: " + str(sp_sent_output.size()))      

        sp_sent_output_t = self.dense_sp_sent(sp_sent_output)
        print("size of sp_sent_output_t after dense: " + str(sp_sent_output_t.size()))      
        # Non-linearity
        sp_sent_output_t = torch.tanh(sp_sent_output_t) 
        print("size of sp_sent_output_t after tanh: " + str(sp_sent_output_t.size()))        
        sp_sent_output_t = self.linear_sp_sent(sp_sent_output_t)
        print("size of sp_sent_output_t: " + str(sp_sent_output_t.size()))       

        # Same [0, score] augmentation as for the paragraph head.
        sp_sent_output_aux = torch.zeros(sp_sent_output_t.shape, dtype=torch.float, device=sp_sent_output_t.device) 
        print("size of sp_sent_output_aux: " + str(sp_sent_output_aux.size()))  
        predict_support_sent = torch.cat([sp_sent_output_aux, sp_sent_output_t], dim=-1).contiguous()
        print("size of predict_support_sent: " + str(predict_support_sent.size()))  

        outputs = (torch.sigmoid(start_logits), torch.sigmoid(end_logits), torch.sigmoid(type_logits), torch.sigmoid(sp_para_output_t), torch.sigmoid(sp_sent_output_t))  
        total_loss = self.loss_computation(start_positions, end_positions, start_logits, end_logits, q_type, type_logits, sp_para_output_t, sp_para, predict_support_para, sp_sent_output_t, sp_sent, predict_support_sent)
        print("total_loss: " + str(total_loss))
        outputs = (total_loss,) + outputs    
        return outputs
    
    def loss_computation(self, start_positions, end_positions, start_logits, end_logits, q_type, type_logits, sp_para_output_t, sp_para, predict_support_para, sp_sent_output_t, sp_sent, predict_support_sent):
        """Combine answer-span, question-type and supporting-fact losses.

        total = (start_loss + end_loss) / 2 + type_loss + sp_para_loss + sp_sent_loss

        When ``self.args.regular_softmax_loss`` is false, the span and type
        losses use the "or" softmax loss (any of the gold positions counts as
        correct); otherwise a regular cross entropy against the first gold
        position is used. The supporting paragraph / sentence losses are always
        a 2-way cross entropy over the ``[0, score]`` pairs, averaged over the
        targets that are not -1.

        Args:
            start_positions, end_positions: gold token indices padded with -1.
            start_logits, end_logits: per-token span scores.
            q_type, type_logits: gold question type and its logits.
            sp_para, predict_support_para: gold paragraph labels and 2-way logits.
            sp_sent, predict_support_sent: gold sentence labels and 2-way logits.
            sp_para_output_t, sp_sent_output_t: raw single scores; not used
                here, kept for interface compatibility.

        Returns:
            The summed loss, or ``None`` when no gold positions are supplied
            (previously this case failed with an UnboundLocalError).
        """
        if start_positions is None or end_positions is None:
            return None

        # If we are on multi-GPU, split adds a dimension
        if len(start_positions.size()) > 1:
            start_positions = start_positions.squeeze(-1)
        if len(end_positions.size()) > 1:
            end_positions = end_positions.squeeze(-1)

        if not self.args.regular_softmax_loss:
            # loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf
            # NOTE: this returns sum of losses, not mean, so loss won't be normalized across different batch sizes
            # but batch size is always 1, so this is not a problem
            start_loss = self.or_softmax_cross_entropy_loss_one_doc(start_logits, start_positions, ignore_index=-1)
            print("start_positions: " + str(start_positions)) 
            print("start_loss: " + str(start_loss)) 

            end_loss = self.or_softmax_cross_entropy_loss_one_doc(end_logits, end_positions, ignore_index=-1)
            print("end_positions: " + str(end_positions)) 
            print("end_loss: " + str(end_loss)) 

            type_loss = self.or_softmax_cross_entropy_loss_one_doc(type_logits, q_type.unsqueeze(0), ignore_index=-1)
        else:
            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-1)
            # only use the first gold position, i.e. consider a single appearance of the answer string
            start_loss = loss_fct(start_logits, start_positions[:, 0])
            end_loss = loss_fct(end_logits, end_positions[:, 0])
            type_loss = loss_fct(type_logits, q_type)

        # `reduction='mean'` replaces the deprecated `size_average=True`; it is also the default
        # used by the regular-softmax branch, so both branches share this computation.
        nll_average = torch.nn.CrossEntropyLoss(reduction='mean', ignore_index=-1)
        sp_para_loss = nll_average(predict_support_para.view(-1, 2), sp_para.view(-1))
        sp_sent_loss = nll_average(predict_support_sent.view(-1, 2), sp_sent.view(-1))

        answer_loss = (start_loss + end_loss) / 2 
        total_loss = answer_loss + type_loss + sp_para_loss + sp_sent_loss
        return total_loss

#     %%add_to hotpotqa    
    def _get_special_index(self, input_ids, special_token):
        """Return the column positions at which ``special_token`` occurs.

        Args:
            input_ids: token id tensor with exactly one row.
            special_token: token string, converted to an id with the tokenizer.

        Returns:
            1-D tensor with the second-dimension index of every match.
        """
        assert(input_ids.size(0)==1)
        token_id = self.tokenizer.convert_tokens_to_ids(special_token)
        # nonzero() yields one (row, column) pair per match; keep the columns.
        matches = (input_ids == token_id).nonzero()
        return matches[:, 1]

    def or_softmax_cross_entropy_loss_one_doc(self, logits, target, ignore_index=-1, dim=-1):
        """Cross entropy in which any of several gold targets counts as correct.

        Loss suggested in section 2.2 of https://arxiv.org/pdf/1710.10723.pdf.
        A regular cross entropy puts the single gold logit in the numerator;
        here the numerator sums over all gold targets:

            loss = -(logsumexp(logits[targets]) - logsumexp(logits))

        Args:
            logits: 2-D tensor of scores, e.g. [1, num_tokens].
            target: 2-D tensor of gold indices padded with ``ignore_index``,
                e.g. [1, 64]; its first dimension must match ``logits``.
            ignore_index: padding value inside ``target``.
            dim: dimension to gather and reduce over.

        Returns:
            Scalar tensor: the sum of the finite losses (0 when every target
            is ``ignore_index``).
        """
        assert logits.ndim == 2
        assert target.ndim == 2
        assert logits.size(0) == target.size(0)

        print("or_softmax_cross_entropy_loss_one_doc" ) 
        print("size of logits: " + str(logits.size()))
        print("size of target: " + str(target.size()))
        print("target: " + str(target)) 

        # Padding entries are redirected to index 0 so that gather stays in range;
        # whatever they pick up is overwritten with -inf right after.
        is_padding = target == ignore_index
        safe_target = (1 - is_padding.long()) * target
        print("masked_target: " + str(safe_target))     

        picked = logits.gather(dim=dim, index=safe_target)
        print("size of gathered_logits: " + str(picked.size()))
        print("gathered_logits: " + str(picked)) 
        # exp(-inf) == 0, so padded slots contribute nothing to the numerator.
        picked.masked_fill_(is_padding, float('-inf'))
        print("gathered_logits after -inf: " + str(picked))

        # Each batch is one example: flatten both tensors to a single row.
        picked = picked.view(1, -1)
        flat_logits = logits.view(1, -1)
        print("size of gathered_logits after view: " + str(picked.size()))
        print("size of logits after view: " + str(flat_logits.size()))

        # log of the numerator: log(sum(exp(gathered logits)))
        log_score = torch.logsumexp(picked, dim=dim)
        print("log_score: " + str(log_score)) 
        # log of the denominator: log(sum(exp(logits)))
        log_norm = torch.logsumexp(flat_logits, dim=dim)
        print("log_norm: " + str(log_norm)) 

        loss = -(log_score - log_norm)
        print("loss: " + str(loss)) 

        # A row whose targets are all ignore_index has an infinite loss; drop it
        # before summing. Sum rather than mean because it is easier to compute.
        finite = ~torch.isinf(loss)
        loss = loss[finite].sum()
        print("final loss: " + str(loss)) 
        return loss 
 

In [97]:
# debug
# input_ids = torch.tensor([[-1, 5, -1, 2]])
# input_ids.size(0)
# token_indices =  torch.nonzero(input_ids == torch.tensor(-1))[:,1]
# # token_indices
# # token_indices.item()
# # indices =  torch.LongTensor([[2],[0,2]])

# # torch.gather(input_ids, 1, token_indices.unsqueeze(0))
# # p_index = token_indices.view(input_ids.size(0), -1)[:,1::2]   
# # attention_mask = torch.ones(input_ids.shape, dtype=torch.long) 
# # attention_mask[:,token_indices] = 2
# # attention_mask
# p_index = torch.tensor([1, 3, 4])
# s_index = torch.tensor([1,3,6])
# torch.sort(torch.cat((s_index, p_index)))[0]
# attention_mask.view(-1)[ p_index.view(-1), :].view(attention_mask.size(0), -1)
# # for pi in p_index[0]:
# #     attention_mask[:, pi] = 2
# # attention_mask
# # s_index = torch.tensor([[1,3]])
# # torch.sort(torch.cat((p_index, s_index), -1), -1)

# sequence_output  = torch.tensor([[[-1, 5, -1, 2],
#                                  [-2, 27, 2, 9],
#                                  [3, 6, 1, 65],
#                                  [52, 36, 13, 2],
#                                  [73, 26, 1, 7]
#                                 ]])

# sp_para_output_t   = torch.tensor([[[-1],
#                                  [-2 ],
#                                  [3],
#                                  [52],
#                                  [73]
#                                 ]])
# torch.zeros(sp_para_output_t.shape, dtype=torch.float) 

# print("size of sequence_output: " + str(sequence_output.size()))
# # print("size of p_index.unsqueeze(0).unsqueeze(-1): " + str(p_index.unsqueeze(0).size()))
# sequence_output[:,p_index,:]
# b = torch.tensor([0, 1, 2, 3])
# p_index.unsqueeze(-1) * b

# input_ids = torch.tensor([[0.2, 0.0, 0.6, 0.6], [0.2, 0.6, 0.0, 0.0]]) 
# # input_ids.tolist()
# p_index =  torch.nonzero(input_ids == torch.tensor(0.2))
# print(p_index)
# s_index =  torch.nonzero(input_ids == torch.tensor(0.6))
# print(s_index)

# sp_sent = torch.tensor([[0, 1, 1, 0]])
# torch.where(sp_sent.squeeze())[0]
# cat_index = torch.tensor([])
# cat_index = torch.cat((cat_index, ids[0][1]))
# print(ids)
# print(cat_index)
# p_index[p_index[:,0] == 0]

# cat_index[cat_index[:,0].argsort()]

# sorted(torch.cat((p_index, s_index)), key = lambda x: x[0])
# torch.sort(torch.cat((p_index, s_index)), 0)[0]
# for cor in token_indices:
#     attention_mask[cor[0].item()][cor[1].item()] = 2
# attention_mask 
# input_ids = torch.tensor([[-1, 5, -6, 2]])
# print(input_ids.size())
# input_ids.topk(k=2, dim=-1).indices

# predict_type = torch.tensor([[-0.0925, -0.0999, -0.1671]])
# p_type = torch.argmax(predict_type, dim=1).item()
# p_type_score = torch.max(predict_type, dim=1)[0].item()
# print("predict_type: ", predict_type)
# print("p_type: ", p_type)
# print("p_type_score: ", p_type_score)
    
# Scratch check: flattening a [1, 4] tensor with view(-1) gives torch.Size([4]).
a = torch.tensor([[0.9213,  1.0887, -0.8858, -1.7683]])
a.view(-1).size() 
# print(torch.sigmoid(a))
# a = torch.tensor([ 9.213,  1.0887, -0.8858, 7683])
# print(torch.sigmoid(a))

# a = torch.tensor([[[1],[2],[4],[-1],[-1]]])
# a= a.squeeze(-1)
# a.size() 
# a[:, torch.where(a!=-1)[1]]
# m = torch.nn.Sigmoid()
# print("m: ", m)
# loss = torch.nn.BCELoss()
# # input = torch.randn(3, requires_grad=True)
# # print("input: ", input)
# # target = torch.empty(3).random_(2)
# # print("target: ", target)
# # output = loss(m(input), target)
# # print("output: ", output)

# input = torch.tensor([1.0293, -0.1585,  1.1408], requires_grad=True)
# print("input: ", input)
# print("Sigmoid(input): ", m(input))
# target = torch.tensor([0., 1., 0.])
# print("target: ", target)
# output = loss(m(input), target)
# print("output: ", output)

# input = torch.tensor([[1.0293, -0.1585,  1.1408]], requires_grad=True)
# print("input: ", input)
# target = torch.tensor([[0., 1., 0.]])
# print("target: ", target)
# output = loss(m(input), target)
# print("output: ", output)

# 1.1761 * 3


torch.Size([4])

In [16]:
# # debug: check loaded dataset by DataLoader
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# num_new_tokens = tokenizer.add_special_tokens({"additional_special_tokens": ["<p>", "<q>", "</q>"]})
# # print(tokenizer.all_special_tokens)    
# # print(tokenizer.all_special_ids)     
# # tokenizer.convert_tokens_to_ids("<s>")
# # tokenizer.sep_token

# # all_doc_tokens = []
# # orig_to_tok_index = []
# # tok_to_orig_index = []
# # for (i, token) in enumerate(["<s>", "da", "tell", "<p>", "say"]):
# #     orig_to_tok_index.append(len(all_doc_tokens))
# #     sub_tokens = tokenizer.tokenize(f'. {token}')[1:] if i > 0 else tokenizer.tokenize(token)
# #     for sub_token in sub_tokens:
# #         tok_to_orig_index.append(i)
# #         all_doc_tokens.append(sub_token)
# # all_doc_tokens


 

# dataset = hotpotqaDataset(file_path= args.train_dataset, tokenizer=tokenizer,
#                           max_seq_len= args.max_seq_len, max_doc_len= args.max_doc_len,
#                           doc_stride= args.doc_stride,
#                           max_num_answers= args.max_num_answers,
#                           max_question_len= args.max_question_len,
#                           ignore_seq_with_no_answers= args.ignore_seq_with_no_answers)
# print(len(dataset))

# # dl = DataLoader(dataset, batch_size=1, shuffle=None,
# #                     num_workers=args.num_workers, sampler=None,
# #                     collate_fn=hotpotqaDataset.collate_one_doc_and_lists)

# example = dataset[0]  
# [input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qids] = example
 

# print(input_ids[0][:20].tolist())
# print(input_mask) 
# print(segment_ids) 
# print(subword_starts) 
# print(subword_ends)
# print(q_type)
# print(sp_sent) 
# print(sp_para) 
# print(qids)
# print(tokenizer.convert_ids_to_tokens(input_ids[0][667:669+1].tolist()))


##### configure_ddp

In [99]:
 %%add_to hotpotqa
 # A hook you can override to define your own DDP (DistributedDataParallel) initialization.
 # The only requirement is that: 
 # 1. On a validation batch the call goes to model.validation_step.
 # 2. On a training batch the call goes to model.training_step.
 # 3. On a testing batch, the call goes to model.test_step
 def configure_ddp(self, model, device_ids):
     """Wrap ``model`` for distributed data-parallel training.

     ``find_unused_parameters=True`` is passed to the wrapper.

     Args:
         model: the module to wrap.
         device_ids: devices handed to the wrapper.

     Returns:
         The wrapped model.
     """
     return LightningDistributedDataParallel(
         model,
         device_ids=device_ids,
         find_unused_parameters=True,
     )

##### **configure_optimizers**

In [100]:
%%add_to hotpotqa
def configure_optimizers(self):
    """Create the Adam optimizer and a linear warmup / linear decay schedule.

    The learning-rate multiplier rises linearly from 0 to 1 over
    ``self.args.warmup`` steps, then falls linearly to 0 at
    ``self.args.steps``. The scheduler is stored on ``self.scheduler`` and
    positioned at ``self.global_step``.

    Returns:
        The optimizer.
    """
    def lr_lambda(current_step):
        warmup_steps = self.args.warmup
        if current_step >= warmup_steps:
            # Decay phase: fraction of the post-warmup steps still remaining.
            total_steps = self.args.steps
            remaining = float(total_steps - current_step)
            decay_span = float(max(1, total_steps - warmup_steps))
            return max(0.0, remaining / decay_span)
        # Warmup phase.
        return float(current_step) / float(max(1, warmup_steps))

    adam = torch.optim.Adam(self.parameters(), lr=self.args.lr)

    # The scheduler is not saved in the checkpoint, but global_step is, which is enough to restart.
    scheduler = LambdaLR(adam, lr_lambda, last_epoch=-1)
    self.scheduler = scheduler
    self.scheduler.step(self.global_step)

    return adam
    

##### optimizer_step

In [101]:
%%add_to hotpotqa
# A hook to do a lot of non-standard training tricks such as learning-rate warm-up
def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_i, second_order_closure=None):
    """Apply one optimizer update, clear gradients and advance the LR schedule.

    Only ``optimizer`` is used; the remaining arguments are part of the hook
    signature and are ignored here.
    """
    optimizer.step()
    optimizer.zero_grad()
    # Position the schedule at the trainer's current global step.
    step_now = self.global_step
    self.scheduler.step(step_now)

##### **training_step**

In [102]:
%%add_to hotpotqa
def training_step(self, batch, batch_nb):
    """Run the forward pass on one batch and return the loss with its logs.

    Args:
        batch: 9-tuple ``(input_ids, input_mask, segment_ids, subword_starts,
            subword_ends, q_type, sp_sent, sp_para, qids)``.
        batch_nb: index of the batch (unused).

    Returns:
        ``{'loss': loss, 'log': {...}}`` where the log holds the training
        loss, the current learning rate (as a tensor), the number of input
        tokens and the allocated GPU memory in GiB (0.0 when the input is not
        on a CUDA device).
    """
    input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qids = batch 
    output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para)
    loss = output[0]
    print("returned loss: ", loss)
    current_lr = self.trainer.optimizers[0].param_groups[0]['lr']
    print("self.trainer.optimizers[0].param_groups[0]['lr']: ", current_lr)
    # loss.new_zeros(1) is tensor([0.]) on the loss's device/dtype; adding the float turns 'lr' into a tensor.
    lr = loss.new_zeros(1) + current_lr
    print("loss: ", loss)
    print("lr: ", lr)    # lr increases during warmup
    # torch.cuda.memory_allocated rejects non-CUDA devices on some torch versions,
    # so only query it when the batch actually lives on a GPU.
    if input_ids.is_cuda:
        mem_gb = torch.cuda.memory_allocated(input_ids.device) / 1024 ** 3
    else:
        mem_gb = 0.0
    tensorboard_logs = {'train_loss': loss, 'lr': lr,
                        'input_size': input_ids.numel(),
                        'mem': mem_gb}
    return {'loss': loss, 'log': tensorboard_logs}

##### validation_step

In [103]:
%%add_to hotpotqa
# When the validation_step is called, the model has been put in eval mode and PyTorch gradients have been disabled. At the end of validation, model goes back to training mode and gradients are enabled.
def validation_step(self, batch, batch_nb):
    """Evaluate one validation batch: loss, answer metrics, supporting-sentence metrics and joint metrics.

    The code indexes ``input_ids[0]`` / ``subword_starts[0][0]`` and asserts a single decoded
    answer, so it is written for one example per batch.

    Args:
        batch: 9-tuple ``(input_ids, input_mask, segment_ids, subword_starts, subword_ends,
            q_type, sp_sent, sp_para, qids)``.
        batch_nb: Index of the batch (unused).

    Returns:
        dict with ``'vloss'`` (the loss returned by ``forward``) and, each wrapped in a
        one-element list so ``validation_end`` can flatten them: ``qids``, ``answer_score``,
        answer ``f1``/``prec``/``recall``/``em``, supporting-sentence ``sp_*`` metrics and
        ``joint_*`` metrics.

    Raises:
        AssertionError: if ``decode`` does not return exactly one answer.
    """
    input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qids = batch
    print("validation_step")
    output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para)
    loss, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output = output 
    print("loss: " + str(loss))

    answers_pred, sp_sent_pred, sp_para_pred = self.decode(input_ids, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output)
    print("answers_pred: " + str(answers_pred))
    
    # answers_pred only contains the top one predicted answer['text', 'score']
#     answers_pred = sorted(answers_pred, key=lambda x: x['score'], reverse=True)[0:1] # each batch is one document
#     print("answers_pred after sorted: " + str(answers_pred))
    if(len(answers_pred) != 1):
        print("len(answers_pred) != 1")
        assert(len(answers_pred) == 1)
    answers_pred = answers_pred[0]

    answer_score = answers_pred['score']  # (start_logit + end_logit + p_type_score) / 3
    print("answer_score: " + str(answer_score))
    
    print("answer_text: " + str(answers_pred['text'])) 

    # even though there can be multiple gold start_postion (subword_start) and end_position(subword_end), the corresponing answer string are same
    # Rebuild the gold answer text from the first gold span (inclusive end) of the first example.
    # NOTE(review): how yes/no questions encode their span in subword_starts/ends is not visible here - confirm this slice is meaningful for them.
    answer_gold_token_ids = input_ids[0, subword_starts[0][0]: subword_ends[0][0] + 1]
    print("answer_gold_token_ids: " + str(answer_gold_token_ids))
    answer_gold_tokens = self.tokenizer.convert_ids_to_tokens(answer_gold_token_ids.tolist())
    print("answer_gold_tokens: " + str(answer_gold_tokens))
    answer_gold = self.tokenizer.convert_tokens_to_string(answer_gold_tokens)
    print("answer_gold: " + str(answer_gold))

    # Answer-level metrics on the normalized predicted vs. gold text.
    f1, prec, recall = self.f1_score(answers_pred['text'], answer_gold)
    em = self.exact_match_score(answers_pred['text'], answer_gold) 
    print("f1: " + str(f1))
    print("prec: " + str(prec))
    print("recall: " + str(recall))
    print("em: " + str(em)) 

    if(len(sp_sent_pred) > 0):
        # Gold supporting sentences are the indices of the non-zero entries of the squeezed sp_sent tensor.
        sp_sent_em, sp_sent_precision, sp_sent_recall, sp_sent_f1 = self.sp_metrics(sp_sent_pred, torch.where(sp_sent.squeeze())[0].tolist())
        print("sp_sent_em: " + str(sp_sent_em))
        print("sp_sent_precision: " + str(sp_sent_precision))
        print("sp_sent_recall: " + str(sp_sent_recall))    
        print("sp_sent_f1: " + str(sp_sent_f1))    
        
        # Joint metrics multiply the answer metric with the supporting-sentence metric.
        joint_prec = prec * sp_sent_precision
        joint_recall = recall * sp_sent_recall
        if joint_prec + joint_recall > 0:
            joint_f1 = 2 * joint_prec * joint_recall / (joint_prec + joint_recall)
        else:
            joint_f1 = 0.
        joint_em = em * sp_sent_em 

    else:
        # No supporting sentence was predicted: every supporting-fact and joint metric is scored 0.
        sp_sent_em, sp_sent_precision, sp_sent_recall, sp_sent_f1 = 0, 0, 0, 0
        joint_em, joint_f1, joint_prec, joint_recall = 0, 0, 0, 0
         

    return {'qids': [qids], 'vloss': loss, 
            'answer_score': [answer_score], 'f1': [f1], 'prec':[prec], 'recall':[recall], 'em': [em],
            'sp_em': [sp_sent_em], 'sp_f1': [sp_sent_f1], 'sp_prec': [sp_sent_precision], 'sp_recall': [sp_sent_recall],
            'joint_em': [joint_em], 'joint_f1': [joint_f1], 'joint_prec': [joint_prec], 'joint_recall': [joint_recall]}



###### decode

In [104]:
%%add_to hotpotqa
def decode(self, input_ids, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output):
    """Turn the model outputs for ONE example into an answer and supporting-fact predictions.

    The answer type is the argmax of ``type_logits``: 0 selects a text span, 1 the literal
    answer 'yes' and 2 the literal answer 'no'. For spans, the ``n_best_size`` highest start
    and end positions are combined; candidates that start or end at/before the ``</q>``
    marker, have start > end, or exceed ``max_answer_length`` tokens are dropped, and the
    candidate with the highest ``start_logit + end_logit`` wins.

    A sentence is predicted as supporting when its score exceeds ``args.sp_sent_th`` and the
    score of the paragraph it belongs to exceeds ``args.sp_para_th``.

    Args:
        input_ids: Token ids; row 0 is the example that is decoded.
        start_logits, end_logits: Per-token span logits (squeezed to 1-D here).
        type_logits: Answer-type logits, indexed along ``dim=1``.
        sp_para_output: Per-paragraph supporting scores.
        sp_sent_output: Per-sentence supporting scores.

    Returns:
        ``(answers, sp_sent_pred, sp_para_pred)``: a one-element list holding a
        ``{'text', 'score'}`` dict, the set of predicted supporting sentence indices, and the
        set of predicted supporting paragraph indices.

    Raises:
        ValueError: if the span logits are not 1-D after squeezing (more than one example in
            the batch) or the predicted answer type is not 0, 1 or 2.
    """
    print("decode")

    question_end_index = self._get_special_index(input_ids, "</q>")
    print("question_end_index: ", question_end_index)

    # one example per batch
    start_logits = start_logits.squeeze()
    end_logits = end_logits.squeeze()
    if start_logits.dim() > 1 or end_logits.dim() > 1:
        # The original guard was `assert("...")`, which asserts a non-empty string and never fires.
        raise ValueError(
            "decode expects a single example per batch, got span logits of shape "
            f"{tuple(start_logits.shape)} / {tuple(end_logits.shape)}"
        )

    # topk raises when k exceeds the number of positions, so clamp for short inputs.
    n_best = min(self.args.n_best_size, start_logits.numel())
    start_logits_indices = start_logits.topk(k=n_best, dim=-1).indices
    print("start_logits_indices: ", start_logits_indices)
    end_logits_indices = end_logits.topk(k=n_best, dim=-1).indices

    p_type = torch.argmax(type_logits, dim=1).item()
    p_type_score = torch.max(type_logits, dim=1)[0].item()
    print("type_logits: ", type_logits)
    print("p_type: ", p_type)
    print("p_type_score: ", p_type_score)

    answers = []
    if p_type == 0:
        question_end = question_end_index.item()
        end_candidates = end_logits_indices.tolist()
        potential_answers = []
        for start in start_logits_indices.tolist():
            if start <= question_end:
                continue
            for end in end_candidates:
                if end <= question_end or start > end:
                    continue
                if end - start + 1 > self.args.max_answer_length:
                    continue
                potential_answers.append({'start': start, 'end': end,
                                          'start_logit': start_logits[start].item(),  # single logit score for the start position
                                          'end_logit': end_logits[end].item()})
        # sorted() is stable, so ties keep the candidate that was generated first
        sorted_answers = sorted(potential_answers, key=lambda x: (x['start_logit'] + x['end_logit']), reverse=True)
        print("sorted_answers: " + str(sorted_answers))
        if len(sorted_answers) == 0:
            answers.append({'text': 'NoAnswerFound', 'score': -1000000})
        else:
            answer = sorted_answers[0]
            answer_token_ids = input_ids[0, answer['start']: answer['end'] + 1]
            answer_tokens = self.tokenizer.convert_ids_to_tokens(answer_token_ids.tolist())
            text = self.tokenizer.convert_tokens_to_string(answer_tokens)
            score = (answer['start_logit'] + answer['end_logit'] + p_type_score) / 3
            answers.append({'text': text, 'score': score})
            print("answers: " + str(answers))
    elif p_type == 1:
        answers.append({'text': 'yes', 'score': p_type_score})
    elif p_type == 2:
        answers.append({'text': 'no', 'score': p_type_score})
    else:
        # A bare `assert False` would be stripped under `python -O`.
        raise ValueError(f"unexpected answer type {p_type}; expected 0 (span), 1 (yes) or 2 (no)")

    p_index = self._get_special_index(input_ids, "<p>")
    print("p_index: " + str(p_index))
    s_index = self._get_special_index(input_ids, "<s>")
    print("s_index: " + str(s_index))
    # Both "<s>" and "<p>" positions are treated as sentence positions, in document order.
    sent_indexes = torch.sort(torch.cat((s_index, p_index)))[0]

    s_to_p_map = []
    for s in sent_indexes:
        s_to_p = torch.where(torch.le(p_index, s))[0][-1]     # last p_index smaller or equal to s
        s_to_p_map.append(s_to_p.item())
    print("s_to_p_map: " + str(s_to_p_map))

    print("sp_sent_output", sp_sent_output)
    print("sp_para_output", sp_para_output)
    sp_sent_pred = set()
    sp_para_pred = set()
    para_scores = sp_para_output.squeeze()
    for i, sp_sent_logit in enumerate(sp_sent_output.squeeze()):
        sp_para_logit = para_scores[s_to_p_map[i]]
        if sp_sent_logit > self.args.sp_sent_th and sp_para_logit > self.args.sp_para_th:
            sp_sent_pred.add(i)
            sp_para_pred.add(s_to_p_map[i])
    print("sp_sent_pred: " + str(sp_sent_pred))
    print("sp_para_pred: " + str(sp_para_pred))
    return (answers, sp_sent_pred, sp_para_pred)

###### metrics

In [105]:
%%add_to hotpotqa
def normalize_answer(self, s):
    """Normalize an answer string for comparison.

    Lower-cases ``s``, removes every character in ``string.punctuation``, replaces the
    stand-alone words a/an/the with a space, and collapses runs of whitespace.
    """
    lowered = s.lower()
    # one pass that deletes all ASCII punctuation characters
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    return ' '.join(without_articles.split())


def f1_score(self, prediction, ground_truth):
    """Token-level F1 between a predicted and a gold answer.

    Both strings are normalized with ``self.normalize_answer`` first. If either side is one of
    the special answers 'yes' / 'no' / 'noanswer' and the two normalized strings differ, the
    score is zero.

    Returns:
        ``(f1, precision, recall)``; ``(0, 0, 0)`` when there is no token overlap.
    """
    pred_norm = self.normalize_answer(prediction)
    gold_norm = self.normalize_answer(ground_truth)
    no_score = (0, 0, 0)
    special_answers = ('yes', 'no', 'noanswer')

    # special answers only count when they match exactly
    if pred_norm != gold_norm and (pred_norm in special_answers or gold_norm in special_answers):
        return no_score

    pred_tokens = pred_norm.split()
    gold_tokens = gold_norm.split()
    # multiset intersection: each shared token counts as often as it occurs on both sides
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return no_score

    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall), precision, recall


def exact_match_score(self, prediction, ground_truth):
    """Return 1 if prediction and gold answer are equal after normalization, else 0."""
    normalized_prediction = self.normalize_answer(prediction)
    normalized_gold = self.normalize_answer(ground_truth)
    is_match = normalized_prediction == normalized_gold
    return int(is_match)


def sp_metrics(self, prediction, gold):
    """Exact match, precision, recall and F1 of predicted supporting facts against the gold ones.

    Args:
        prediction: Collection of predicted items.
        gold: Collection of gold items.

    Returns:
        ``(em, prec, recall, f1)`` as floats. A ratio whose denominator is zero is reported
        as 0.0; ``em`` is 1.0 only when there are neither false positives nor false negatives.
    """
    # one membership test per predicted item: True = true positive, False = false positive
    hits = [e in gold for e in prediction]
    tp = sum(hits)
    fp = len(hits) - tp
    fn = sum(e not in prediction for e in gold)

    prec = 0.0
    if tp + fp > 0:
        prec = 1.0 * tp / (tp + fp)

    recall = 0.0
    if tp + fn > 0:
        recall = 1.0 * tp / (tp + fn)

    f1 = 0.0
    if prec + recall > 0:
        f1 = 2 * prec * recall / (prec + recall)

    em = 0.0 if fp + fn else 1.0
    return em, prec, recall, f1
 

##### validation_end

In [106]:
%%add_to hotpotqa
# If a validation_step is not defined, this won't be called. Called at the end of the validation loop with the outputs of validation_step.
def validation_end(self, outputs):
    """Aggregate the per-step validation outputs into averaged metrics.

    Args:
        outputs: List of the dicts returned by ``validation_step``.

    Returns:
        dict with ``'avg_val_loss'`` (mean of the ``vloss`` values) and ``'log'`` /
        ``'progress_bar'`` (the same dict object), which holds ``val_loss`` and the averaged
        answer, supporting-sentence (``sp_sent``) and joint metrics. When ``trainer.use_ddp``
        is set, the loss is summed across processes and divided by the world size, and every
        list is gathered from all processes before averaging.
    """
    print("validation_end")

    def collect(key):
        # flatten the per-step lists stored under ``key`` into one list
        return [value for step_output in outputs for value in step_output[key]]

    # (key in the validation_step output, key in the logs); this order is used for
    # collecting, syncing, averaging and for the layout of the log dict
    metric_names = [
        ('f1', 'avg_val_f1'), ('em', 'avg_val_em'),
        ('prec', 'avg_val_prec'), ('recall', 'avg_val_recall'),
        ('sp_f1', 'avg_val_sp_sent_f1'), ('sp_em', 'avg_val_sp_sent_em'),
        ('sp_prec', 'avg_val_sp_sent_prec'), ('sp_recall', 'avg_val_sp_sent_recall'),
        ('joint_f1', 'avg_val_joint_f1'), ('joint_em', 'avg_val_joint_em'),
        ('joint_prec', 'avg_val_joint_prec'), ('joint_recall', 'avg_val_joint_recall'),
    ]
    # only the answer-level metrics also report their list length
    report_length = ('f1', 'em', 'prec', 'recall')

    avg_loss = torch.stack([step_output['vloss'] for step_output in outputs]).mean()
    int_qids = [self.val_dataloader_object.dataset.val_qid_string_to_int_map[qid] for qid in collect('qids')]
    answer_scores = collect('answer_score')
    scores = {key: collect(key) for key, _ in metric_names}

    print(f'before sync --> sizes: {len(int_qids)}, {len(answer_scores)}, {len(scores["f1"])}, {len(scores["em"])}')
    if self.trainer.use_ddp:
        torch.distributed.all_reduce(avg_loss, op=torch.distributed.ReduceOp.SUM)
        avg_loss /= self.trainer.world_size

        device = avg_loss.device
        int_qids = self.sync_list_across_gpus(int_qids, device, torch.int)
        answer_scores = self.sync_list_across_gpus(answer_scores, device, torch.float)
        for key, _ in metric_names:
            scores[key] = self.sync_list_across_gpus(scores[key], avg_loss.device, torch.float)

    print(f'after sync --> sizes: {len(int_qids)}, {len(answer_scores)}, {len(scores["f1"])}, {len(scores["em"])}')
    print("answer_scores: ", answer_scores)
    print("f1_scores: ", scores['f1'])
    print("em_scores: ", scores['em'])

    # Each question is expected to appear once for hotpotqa, so the metrics are averaged
    # directly without de-duplicating by question id.
    logs = {'val_loss': avg_loss}
    for key, log_name in metric_names:
        values = scores[key]
        average = sum(values) / len(values)
        if key in report_length:
            print(f"len({key}_scores): ", len(values))
        print(f"{log_name}: ", average)
        logs[log_name] = average

    print("avg_loss: ", avg_loss)

    return {'avg_val_loss': avg_loss, 'log': logs, 'progress_bar': logs}

def sync_list_across_gpus(self, l, device, dtype):
    """Gather a list of numbers from every process and return the concatenation.

    The receive buffers are shaped like the local tensor, so this assumes every process
    contributes a list of the same length.

    Args:
        l: Local list of numbers.
        device: Device on which the tensors used for the exchange are created.
        dtype: Torch dtype of those tensors.

    Returns:
        A plain Python list with the values of all ``self.trainer.world_size`` processes.
    """
    local_values = torch.tensor(l, device=device, dtype=dtype)
    # one receive buffer per process
    receive_buffers = []
    for _ in range(self.trainer.world_size):
        receive_buffers.append(torch.ones_like(local_values))
    torch.distributed.all_gather(receive_buffers, local_values)
    merged = torch.cat(receive_buffers)
    return merged.tolist()


##### test_step

In [107]:
%%add_to hotpotqa
def test_step(self, batch, batch_nb):
    input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para, qids = batch
    output = self.forward(input_ids, input_mask, segment_ids, subword_starts, subword_ends, q_type, sp_sent, sp_para)
    loss, start_logits, end_logits = output[:3]
    answers = self.decode(input_ids, start_logits, end_logits)

    # each batch is one document
    answers = sorted(answers, key=lambda x: x['score'], reverse=True)[0:1]
    qids = [qids]
    assert len(answers) == len(qids)
    return {'qids': qids, 'answers': answers}

##### test_end

In [108]:
%%add_to hotpotqa
def test_end(self, outputs):
    qids = [item for sublist in outputs for item in sublist['qids']]
    answers = [item for sublist in outputs for item in sublist['answers']]

    qa_with_duplicates = defaultdict(list)
    for qid, answer in zip(qids, answers):
        qa_with_duplicates[qid].append({'answer_score': answer['score'], 'answer_text': answer['text'], })

    qid_to_answer_text = {}
    for qid, answer_metrics in qa_with_duplicates.items():
        top_answer = sorted(answer_metrics, key=lambda x: x['answer_score'], reverse=True)[0]
        qid_to_answer_text[qid] = top_answer['answer_text']

    with open('predictions.json', 'w') as f:
        json.dump(qid_to_answer_text, f)

    return {'count': len(qid_to_answer_text)}

##### add_model_specific_args

In [109]:
%%add_to hotpotqa
@staticmethod
def add_model_specific_args(parser, root_dir):
    """Register the command-line options of the HotpotQA model on ``parser``.

    Args:
        parser: An ``argparse.ArgumentParser`` (or compatible) to extend.
        root_dir: Unused.

    Returns:
        The same ``parser`` with all options added. ``--save_prefix``, ``--dev_dataset`` and
        ``--model_path`` are required.
    """
    parser.add_argument("--save_dir", type=str, default='hotpotqa')
    parser.add_argument("--save_prefix", type=str, required=True)
    parser.add_argument("--train_dataset", type=str, required=False, help="Path to the training squad-format")
    parser.add_argument("--dev_dataset", type=str, required=True, help="Path to the dev squad-format")
    parser.add_argument("--batch_size", type=int, default=8, help="Batch size")
    # The former help text was two adjacent string literals (`... --gpus "" "`), which
    # silently concatenated and dropped the quotes that were meant to be shown.
    parser.add_argument("--gpus", type=str, default='0',
                        help="Comma separated list of gpus. Default is gpu 0. To use CPU, use --gpus ''")
    parser.add_argument("--warmup", type=int, default=1000, help="Number of warmup steps")
    parser.add_argument("--lr", type=float, default=0.00005, help="Maximum learning rate")
    parser.add_argument("--val_every", type=float, default=0.2, help="How often within one training epoch to check the validation set.")
    parser.add_argument("--val_percent_check", default=1.00, type=float, help='Percent of validation data used')
    parser.add_argument("--num_workers", type=int, default=4, help="Number of data loader workers")
    parser.add_argument("--seed", type=int, default=1234, help="Seed")
    parser.add_argument("--epochs", type=int, default=26, help="Number of epochs")
    parser.add_argument("--max_seq_len", type=int, default=4096,
                        help="Maximum length of seq passed to the transformer model")
    parser.add_argument("--max_doc_len", type=int, default=4096,
                        help="Maximum number of wordpieces of the input document")
    # argparse %-formats help strings, so a literal percent sign must be written as '%%';
    # a bare '%)' makes help rendering fail.
    parser.add_argument("--max_num_answers", type=int, default=64,
                        help="Maximum number of answer spans per document (64 => 94%%)")
    parser.add_argument("--max_question_len", type=int, default=55,
                        help="Maximum length of the question")
    parser.add_argument("--doc_stride", type=int, default=-1,
                        help="Overlap between document chunks. Use -1 to only use the first chunk")
    parser.add_argument("--ignore_seq_with_no_answers", action='store_true',
                        help="each example should have at least one answer. Default is False")
    parser.add_argument("--disable_checkpointing", action='store_true', help="No logging or checkpointing")
    parser.add_argument("--n_best_size", type=int, default=20,
                        help="Number of answer candidates. Used at decoding time")
    parser.add_argument("--max_answer_length", type=int, default=30,
                        help="maximum num of wordpieces/answer. Used at decoding time")
    parser.add_argument("--regular_softmax_loss", action='store_true', help="IF true, use regular softmax. Default is using ORed softmax loss")
    parser.add_argument("--test", action='store_true', help="Test only, no training")
    parser.add_argument("--model_path", type=str, required=True,
                        help="Path to the checkpoint directory")
    parser.add_argument("--no_progress_bar", action='store_true', help="no progress bar. Good for printing")
    parser.add_argument("--attention_mode", type=str, choices=['tvm', 'sliding_chunks'],
                        default='sliding_chunks', help='Which implementation of selfattention to use')
    parser.add_argument("--fp32", action='store_true', help="default is fp16. Use --fp32 to switch to fp32")
    # thresholds used by decode() for supporting sentences / paragraphs
    parser.add_argument('--sp_sent_th', type=float, default=0.4)
    parser.add_argument('--sp_para_th', type=float, default=0.45)
    return parser

##### class info

In [28]:
dir(hotpotqa)

['T_destination',
 '__abstractmethods__',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_apply',
 '_call_impl',
 '_forward_unimplemented',
 '_get_name',
 '_get_special_index',
 '_load_from_state_dict',
 '_named_members',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_replicate_for_data_parallel',
 '_save_to_state_dict',
 '_slow_forward',
 '_version',
 'add_model_specific_args',
 'add_module',
 'apply',
 'backward',
 'bfloat16',
 'buffers',
 'children',
 'configure_apex',
 'configure_ddp',


In [29]:
from inspect import getmembers, isfunction
getmembers(hotpotqa)

[('T_destination', ~T_destination),
 ('__abstractmethods__', frozenset({'configure_optimizers', 'training_step'})),
 ('__annotations__',
  {'dump_patches': bool,
   '_version': int,
   'training': bool,
   'forward': typing.Callable[..., typing.Any],
   '__call__': typing.Callable[..., typing.Any]}),
 ('__call__',
  <function torch.nn.modules.module.Module._call_impl(self, *input, **kwargs)>),
 ('__class__', abc.ABCMeta),
 ('__delattr__',
  <function torch.nn.modules.module.Module.__delattr__(self, name)>),
 ('__dict__',
  mappingproxy({'__module__': '__main__',
                '__init__': <function __main__.hotpotqa.__init__(self, args)>,
                'load_model': <function __main__.hotpotqa.load_model(self)>,
                'train_dataloader': <function pytorch_lightning.core.decorators.data_loader.<locals>._get_data_loader(self)>,
                'val_dataloader': <function pytorch_lightning.core.decorators.data_loader.<locals>._get_data_loader(self)>,
                'test_dat

In [30]:
functions_list = [o for o in getmembers(hotpotqa) if isfunction(o[1])]
functions_list

[('__call__',
  <function torch.nn.modules.module.Module._call_impl(self, *input, **kwargs)>),
 ('__delattr__',
  <function torch.nn.modules.module.Module.__delattr__(self, name)>),
 ('__dir__', <function torch.nn.modules.module.Module.__dir__(self)>),
 ('__getattr__',
  <function torch.nn.modules.module.Module.__getattr__(self, name:str) -> Union[torch.Tensor, _ForwardRef('Module')]>),
 ('__init__', <function __main__.hotpotqa.__init__(self, args)>),
 ('__repr__', <function torch.nn.modules.module.Module.__repr__(self)>),
 ('__setattr__',
  <function torch.nn.modules.module.Module.__setattr__(self, name:str, value:Union[torch.Tensor, _ForwardRef('Module')]) -> None>),
 ('__setstate__',
  <function torch.nn.modules.module.Module.__setstate__(self, state)>),
 ('_apply', <function torch.nn.modules.module.Module._apply(self, fn)>),
 ('_call_impl',
  <function torch.nn.modules.module.Module._call_impl(self, *input, **kwargs)>),
 ('_forward_unimplemented',
  <function torch.nn.modules.modul

In [31]:
import inspect
inspect.getmro(hotpotqa)  # a hierarchy of classes 

(__main__.hotpotqa,
 pytorch_lightning.core.lightning.LightningModule,
 abc.ABC,
 pytorch_lightning.core.grads.GradInformation,
 pytorch_lightning.core.saving.ModelIO,
 pytorch_lightning.core.hooks.ModelHooks,
 torch.nn.modules.module.Module,
 object)

In [32]:
help(hotpotqa)

Help on class hotpotqa in module __main__:

class hotpotqa(pytorch_lightning.core.lightning.LightningModule)
 |  Helper class that provides a standard way to create an ABC using
 |  inheritance.
 |  
 |  Method resolution order:
 |      hotpotqa
 |      pytorch_lightning.core.lightning.LightningModule
 |      abc.ABC
 |      pytorch_lightning.core.grads.GradInformation
 |      pytorch_lightning.core.saving.ModelIO
 |      pytorch_lightning.core.hooks.ModelHooks
 |      torch.nn.modules.module.Module
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, args)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  configure_ddp(self, model, device_ids)
 |  
 |  configure_optimizers(self)
 |  
 |  decode(self, input_ids, start_logits, end_logits, type_logits, sp_para_output, sp_sent_output)
 |  
 |  exact_match_score(self, prediction, ground_truth)
 |  
 |  f1_score(self, prediction, ground_truth)
 |  
 |  forward(self, input_ids, attentio

In [33]:
help(hotpotqa.configure_optimizers)

Help on function configure_optimizers in module __main__:

configure_optimizers(self)



In [34]:
# import inspect
# code, line_no = inspect.getsourcelines(hotpotqa.training_step)
# print(''.join(code))

### main

In [35]:
def main(args):
    """Seed the Python, NumPy and PyTorch random number generators with ``args.seed``."""
    seed = args.seed
    # CPU-side generators first, in the same order as before
    for seed_generator in (random.seed, np.random.seed, torch.manual_seed):
        seed_generator(seed)
    # CUDA generators only exist when a GPU is usable
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [110]:
    # Build the model. The methods of hotpotqa are attached cell by cell, so the class still
    # lists abstract methods at this point; clearing the set makes it instantiable.
    hotpotqa.__abstractmethods__=set()   # without this, got an error "Can't instantiate abstract class hotpotqa with abstract methods" if these two abstract methods are not implemented in the same cell where class hotpotqa defined 
    model = hotpotqa(args)
    # NOTE(review): the move to 'cuda' is unconditional, although --gpus also documents a CPU mode - confirm this cell is only run on GPU machines.
    model.to('cuda')    # this is necessary to use gpu

INFO:transformers.tokenization_utils_base:loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json from cache at /home/u32/fanluo/.cache/torch/transformers/d0c5776499adc1ded22493fae699da0971c1ee4c2587111707a4d177d20257a2.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
INFO:transformers.tokenization_utils_base:loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt from cache at /home/u32/fanluo/.cache/torch/transformers/b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
INFO:transformers.tokenization_utils_base:Assigning ['<cls>', '<p>', '<q>', '</q>'] to the additional_special_tokens key of the tokenizer
INFO:transformers.tokenization_utils:Adding <cls> to the vocabulary
INFO:transformers.tokenization_utils:Adding <p> to the vocabulary
INFO:transformers.tokenization_utils:Adding <q> to the vocabulary
INFO:transformers.tokeniza

Loaded model with config:
RobertaConfig {
  "attention_dilation": [
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1
  ],
  "attention_mode": "tvm",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256,
    256
  ],
  "autoregressive": false,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 50265
}



hotpotqa(
  (model): Longformer(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50269, 768)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_global): Linear(in_features=768, out_features=768, bias=True)
        

In [39]:
    # Experiment logger: files go under <save_dir>/<save_prefix>, always as version 0.
    logger = TestTubeLogger( # The TestTubeLogger adds a nicer folder structure to manage experiments and snapshots all hyperparameters you pass to a LightningModule.
        save_dir=args.save_dir,
        name=args.save_prefix,
        version=0  # always use version=0
    )

In [40]:
    # Keep the 5 checkpoints with the highest 'avg_val_f1' (the key logged by validation_end)
    # under <save_dir>/<save_prefix>/checkpoints.
    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(args.save_dir, args.save_prefix, "checkpoints"),
        save_top_k=5,
        verbose=True,
        monitor='avg_val_f1',
        mode='max',  # larger F1 is better
        prefix=''
    )

  f"Checkpoint directory {filepath} exists and is not empty with save_top_k != 0."


In [112]:
    # Parse the comma separated --gpus string into a list of ints; an empty string selects the CPU.
    # Compare by value: `is not ""` tests object identity, which is not guaranteed for equal
    # strings and triggers a SyntaxWarning on recent Python versions.
    args.gpus = [int(x) for x in args.gpus.split(',')] if args.gpus != "" else None  # use CPU if no gpu provided
    train_set_size = 16 # 90447   # hardcode dataset size. Needed to compute number of steps for the lr scheduler
    # NOTE(review): `1 or len(args.gpus)` always evaluates to 1, so the number of GPUs never
    # enters the step count. Kept as is because changing it alters the lr schedule - confirm the intent.
    num_devices = 1 or len(args.gpus)
    print("num_devices: ", num_devices)
    print("train_set_size: ", train_set_size)
    # total number of optimizer steps, used by the lr scheduler's linear warmup/decay
    args.steps = args.epochs * train_set_size / (args.batch_size * num_devices)
    print(f'>>>>>>> #steps: {args.steps}, #epochs: {args.epochs}, batch_size: {args.batch_size * num_devices} <<<<<<<')

num_devices:  1
train_set_size:  16
>>>>>>> #steps: 52.0, #epochs: 26, batch_size: 8 <<<<<<<


In [59]:
### To install apex ### 
#     !git clone https://github.com/NVIDIA/apex
#     os.chdir("/xdisk/msurdeanu/fanluo/hotpotQA/apex/")
#     !module load cuda101/neuralnet/7/7.6.4
#     !module load cuda10.1/toolkit/10.1.243 
#     !conda install -c conda-forge cudatoolkit-dev --yes
#     !conda install -c conda-forge/label/cf201901 cudatoolkit-dev --yes
#     !conda install -c conda-forge/label/cf202003 cudatoolkit-dev --yes
#     !which nvcc
#     !python -m pip install -v --no-cache-dir ./
#     os.chdir("/xdisk/msurdeanu/fanluo/hotpotQA/")

In [113]:
    # Use the 'ddp' backend only when more than one GPU id was supplied.
    multi_gpu = bool(args.gpus) and len(args.gpus) > 1
    # Logging and checkpointing are switched off together by --disable_checkpointing.
    checkpointing_enabled = not args.disable_checkpointing

    trainer = pl.Trainer(
        gpus=args.gpus,
        distributed_backend='ddp' if multi_gpu else None,
        track_grad_norm=-1,
        max_epochs=args.epochs,
        early_stop_callback=None,
        accumulate_grad_batches=args.batch_size,
        val_check_interval=args.val_every,
        # The same fraction is used for both the validation and the test split.
        val_percent_check=args.val_percent_check,
        test_percent_check=args.val_percent_check,
        logger=logger if checkpointing_enabled else False,
        checkpoint_callback=checkpoint_callback if checkpointing_enabled else False,
        show_progress_bar=not args.no_progress_bar,
        # Mixed precision (apex opt level 'O2') unless --fp32 is set.
        use_amp=not args.fp32,
        amp_level='O2',
    )
    
    

INFO:root:gpu available: True, used: True
INFO:root:VISIBLE GPUS: 0
INFO:root:using 16bit precision


In [114]:
# Run training unconditionally: the `args.test` guard below is commented out,
# so `fit` is called even when a test-only run was requested.
#     if not args.test:
    trainer.fit(model)

INFO:root:
                                       Name               Type Params
0                                     model         Longformer  148 M
1                          model.embeddings  RobertaEmbeddings   41 M
2          model.embeddings.word_embeddings          Embedding   38 M
3      model.embeddings.position_embeddings          Embedding    3 M
4    model.embeddings.token_type_embeddings          Embedding  768  
..                                      ...                ...    ...
242                             linear_type             Linear    2 K
243                           dense_sp_sent             Linear  590 K
244                          linear_sp_sent             Linear  769  
245                           dense_sp_para             Linear  590 K
246                          linear_sp_para             Linear  769  

[247 rows x 3 columns]


Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
reading file: small.json
['<pad>', '<mask>', '<s>', '<q>', '</q>', '</s>', '<s>', '<cls>', '</s>', '<p>', '<unk>']
[1, 50264, 0, 50267, 50268, 2, 0, 50265, 2, 50266, 3]
reading file: small_dev.json
['<pad>', '<mask>', '<s>', '<q>', '</q>', '</s>', '<s>', '<cls>', '</s>', '<p>', '<unk>']
[1, 50264, 0, 

INFO:root:model and trainer restored from checkpoint: hotpotqa/hotpotqa-longformer/checkpoints/_ckpt_epoch_23_v51.ckpt
Validation sanity check:   0%|          | 0/5 [00:00<?, ?batch/s]

validation_step
size of input_ids: torch.Size([1, 1662])
size of attention_mask: torch.Size([1, 1662])
size of segment_ids: torch.Size([1, 1662])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  15,  126,  231,  459,  649,  767,  940, 1114, 1365, 1545],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1662, 768])
size of logits: torch.Size([1, 1662, 2])
size of start_logits: torch.Size([1, 1662, 1])
size of start_logits after squeeze: torch.Size([1, 1662])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: tor

Validation sanity check:  20%|██        | 1/5 [00:01<00:04,  1.12s/batch]

sorted_answers: [{'start': tensor(660, device='cuda:0'), 'end': tensor(660, device='cuda:0'), 'start_logit': 0.990234375, 'end_logit': 0.0794677734375}, {'start': tensor(655, device='cuda:0'), 'end': tensor(660, device='cuda:0'), 'start_logit': 0.970703125, 'end_logit': 0.0794677734375}, {'start': tensor(255, device='cuda:0'), 'end': tensor(255, device='cuda:0'), 'start_logit': 0.91357421875, 'end_logit': 0.134765625}, {'start': tensor(655, device='cuda:0'), 'end': tensor(655, device='cuda:0'), 'start_logit': 0.970703125, 'end_logit': 0.0330810546875}, {'start': tensor(145, device='cuda:0'), 'end': tensor(145, device='cuda:0'), 'start_logit': 0.94384765625, 'end_logit': 0.04742431640625}, {'start': tensor(255, device='cuda:0'), 'end': tensor(259, device='cuda:0'), 'start_logit': 0.91357421875, 'end_logit': 0.05194091796875}, {'start': tensor(255, device='cuda:0'), 'end': tensor(261, device='cuda:0'), 'start_logit': 0.91357421875, 'end_logit': 0.043212890625}, {'start': tensor(255, devi

Validation sanity check:  40%|████      | 2/5 [00:01<00:02,  1.11batch/s]

size of sequence_output: torch.Size([1, 1660, 768])
size of logits: torch.Size([1, 1660, 2])
size of start_logits: torch.Size([1, 1660, 1])
size of start_logits after squeeze: torch.Size([1, 1660])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_output_t after tanh: torch.Size([1, 10, 768])
size of sp_para_output_t: torch.Size([1, 10, 1])
size of sp_para_output_aux: torch.Size([1, 10, 1])
size of predict_support_para: torch.Size([1, 10, 2])
size of sent_indexes: torch.Size([57])
sent_indexes:  tensor([  24,   50,   81,  118,  130,  161,  201,  237,  269,  297,  356,  389,
         415,  462,  480,  500,  520,  540,  562,  591,  620,  662,  678,  706,
         754,  773,  782,  813,  839,  854,  883,  923,  955,  

size of s_index: torch.Size([27])
p_index: tensor([  29,  168,  438,  601,  692,  765,  953, 1068, 1127, 1208],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1320, 768])
size of logits: torch.Size([1, 1320, 2])
size of start_logits: torch.Size([1, 1320, 1])
size of start_logits after squeeze: torch.Size([1, 1320])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_output_t after tanh: torch.Size([1, 10, 768])
size of sp_para_output_t: torch.Size([1, 10, 1])
size of sp_para_output_aux: torch.Size([1, 10, 1])
size of predict_support_para: torch.Size([1, 10, 2])
size of sent_indexes: torch.Size([37])
sent_indexes:  tensor([  29,   60,  100,  151,  168,  237,  329,  376,  438,  475,  516,  546,
       

Validation sanity check:  60%|██████    | 3/5 [00:01<00:01,  1.37batch/s]

gathered_logits: tensor([[0.0072]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[0.0072]], device='cuda:0', dtype=torch.float16)
size of gathered_logits after view: torch.Size([1, 1])
size of logits after view: torch.Size([1, 3])
log_score: tensor([0.0072], device='cuda:0', dtype=torch.float16)
log_norm: tensor([0.9370], device='cuda:0', dtype=torch.float16)
loss: tensor([0.9297], device='cuda:0', dtype=torch.float16)
final loss: tensor(0.9297, device='cuda:0', dtype=torch.float16)
total_loss: tensor(9.9197, device='cuda:0')
loss: tensor(9.9197, device='cuda:0')
decode
question_end_index:  tensor([28], device='cuda:0')
start_logits_indices:  tensor([611, 173, 775, 129, 191, 548, 667, 550, 300, 787, 791, 211,  41, 643,
        644, 519, 169, 671, 669, 439], device='cuda:0')
type_logits:  tensor([[0.5020, 0.4502, 0.4207]], device='cuda:0')
p_type:  0
p_type_score:  0.501953125
sorted_answers: [{'start': tensor(611, device='cuda:0'), 'end': tensor(611, device

Validation sanity check:  80%|████████  | 4/5 [00:02<00:00,  1.65batch/s]

size of sequence_output: torch.Size([1, 1649, 768])
size of logits: torch.Size([1, 1649, 2])
size of start_logits: torch.Size([1, 1649, 1])
size of start_logits after squeeze: torch.Size([1, 1649])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_output_t after tanh: torch.Size([1, 10, 768])
size of sp_para_output_t: torch.Size([1, 10, 1])
size of sp_para_output_aux: torch.Size([1, 10, 1])
size of predict_support_para: torch.Size([1, 10, 2])
size of sent_indexes: torch.Size([53])
sent_indexes:  tensor([  25,  148,  197,  240,  268,  323,  341,  352,  366,  401,  428,  458,
         476,  495,  523,  548,  584,  642,  661,  670,  701,  755,  794,  828,
         884,  931,  960,  973, 1003, 1026, 1042, 1082, 1104, 1

Validation sanity check: 100%|██████████| 5/5 [00:02<00:00,  1.96batch/s]

size of sequence_output: torch.Size([1, 1400, 768])
size of logits: torch.Size([1, 1400, 2])
size of start_logits: torch.Size([1, 1400, 1])
size of start_logits after squeeze: torch.Size([1, 1400])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_output_t after tanh: torch.Size([1, 10, 768])
size of sp_para_output_t: torch.Size([1, 10, 1])
size of sp_para_output_aux: torch.Size([1, 10, 1])
size of predict_support_para: torch.Size([1, 10, 2])
size of sent_indexes: torch.Size([41])
sent_indexes:  tensor([  21,  109,  128,  158,  182,  219,  243,  290,  322,  363,  389,  445,
         480,  514,  553,  563,  572,  602,  612,  628,  661,  672,  691,  703,
         745,  780,  833,  868,  967, 1072, 1116, 1132, 1152, 1

                                                                         

validation_end
before sync --> sizes: 5, 5, 5, 5
after sync --> sizes: 5, 5, 5, 5
answer_scores:  [0.5210367838541666, 0.7449544270833334, 0.6941731770833334, 0.7137044270833334, 0.7957356770833334]
f1_scores:  [0, 0, 0, 0, 0]
em_scores:  [0, 0, 0, 0, 0]
len(f1_scores):  5
avg_val_f1:  0.0
len(em_scores):  5
avg_val_em:  0.0
len(prec_scores):  5
avg_val_prec:  0.0
len(recall_scores):  5
avg_val_recall:  0.0
avg_val_sp_sent_f1:  0.0912548822875981
avg_val_sp_sent_em:  0.0
avg_val_sp_sent_prec:  0.04808419725948245
avg_val_sp_sent_recall:  1.0
avg_val_joint_f1:  0.0
avg_val_joint_em:  0.0
avg_val_joint_prec:  0.0
avg_val_joint_recall:  0.0
avg_loss:  tensor(8.1542, device='cuda:0')
Epoch 24:   0%|          | 0/81 [00:00<?, ?batch/s]



size of input_ids: torch.Size([1, 1548])
size of attention_mask: torch.Size([1, 1548])
size of segment_ids: torch.Size([1, 1548])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 43])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([33])
p_index: tensor([  25,  322,  395,  568,  718,  863,  991, 1090, 1207, 1322],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1548, 768])
size of logits: torch.Size([1, 1548, 2])
size of start_logits: torch.Size([1, 1548, 1])
size of start_logits after squeeze: torch.Size([1, 1548])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 

Epoch 24:   1%|          | 1/81 [00:01<01:49,  1.37s/batch, batch_idx=0, gpu=0, loss=0.000, v_num=0]size of input_ids: torch.Size([1, 1314])
size of attention_mask: torch.Size([1, 1314])
size of segment_ids: torch.Size([1, 1314])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 51])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([41])
p_index: tensor([  23,  142,  212,  402,  481,  676,  794,  899,  947, 1129],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1314, 768])
size of logits: torch.Size([1, 1314, 2])
size of start_logits: torch.Size([1, 1314, 1])
size of start_logits after squeeze: torch.Size([1, 1314])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size o

Epoch 24:   2%|▏         | 2/81 [00:02<01:32,  1.17s/batch, batch_idx=1, gpu=0, loss=0.000, v_num=0]size of input_ids: torch.Size([1, 1459])
size of attention_mask: torch.Size([1, 1459])
size of segment_ids: torch.Size([1, 1459])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  26,  145,  282,  567,  597,  908, 1029, 1157, 1234, 1320],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1459, 768])
size of logits: torch.Size([1, 1459, 2])
size of start_logits: torch.Size([1, 1459, 1])
size of start_logits after squeeze: torch.Size([1, 1459])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size o

final loss: tensor(2.0605, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward0>)
end_positions: tensor([[1281,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
end_loss: tensor(2.0605, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward0>)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.0571]], device='cuda:0', dtype=torch.float16,
       grad_fn=<GatherBackward>)
gathered

Epoch 24:   5%|▍         | 4/81 [00:03<01:16,  1.01batch/s, batch_idx=2, gpu=0, loss=0.000, v_num=0]validation_step
size of input_ids: torch.Size([1, 1011])
size of attention_mask: torch.Size([1, 1011])
size of segment_ids: torch.Size([1, 1011])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([ 52, 167, 321, 453, 521, 572, 697, 747, 808, 900], device='cuda:0')
size of sequence_output: torch.Size([1, 1011, 768])
size of logits: torch.Size([1, 1011, 2])
size of start_logits: torch.Size([1, 1011, 1])
size of start_logits after squeeze: torch.Size([1, 1011])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of

sorted_answers: [{'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9931640625, 'end_logit': 0.77490234375}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99267578125, 'end_logit': 0.77490234375}, {'start': tensor(237, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9453125, 'end_logit': 0.77490234375}, {'start': tensor(262, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9404296875, 'end_logit': 0.77490234375}, {'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.6630859375}, {'start': tensor(642, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.94873046875, 'end_logit': 0.6630859375}, {'start': tensor(634, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.93603515625, 'end_logit': 0.6630859375}, {'start': tensor(264, device='cuda:0'), 

Epoch 24:   6%|▌         | 5/81 [00:03<00:59,  1.28batch/s, batch_idx=2, gpu=0, loss=0.000, v_num=0]validation_step
size of input_ids: torch.Size([1, 933])
size of attention_mask: torch.Size([1, 933])
size of segment_ids: torch.Size([1, 933])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([ 21, 190, 263, 282, 398, 474, 554, 645, 757, 849], device='cuda:0')
size of sequence_output: torch.Size([1, 933, 768])
size of logits: torch.Size([1, 933, 2])
size of start_logits: torch.Size([1, 933, 1])
size of start_logits after squeeze: torch.Size([1, 933])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_par

sorted_answers: [{'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.931640625, 'end_logit': 0.9755859375}, {'start': tensor(758, device='cuda:0'), 'end': tensor(759, device='cuda:0'), 'start_logit': 0.8798828125, 'end_logit': 0.48583984375}, {'start': tensor(646, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 0.9931640625, 'end_logit': 0.33837890625}, {'start': tensor(642, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 0.98193359375, 'end_logit': 0.33837890625}, {'start': tensor(779, device='cuda:0'), 'end': tensor(791, device='cuda:0'), 'start_logit': 0.98583984375, 'end_logit': 0.320556640625}, {'start': tensor(779, device='cuda:0'), 'end': tensor(781, device='cuda:0'), 'start_logit': 0.98583984375, 'end_logit': 0.307861328125}, {'start': tensor(777, device='cuda:0'), 'end': tensor(777, device='cuda:0'), 'start_logit': 0.94677734375, 'end_logit': 0.323974609375}, {'start': tensor(777, device='cud

sorted_answers: [{'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.92138671875}, {'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.994140625, 'end_logit': 0.92138671875}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.8251953125}, {'start': tensor(136, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.94580078125, 'end_logit': 0.8251953125}, {'start': tensor(149, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.9189453125, 'end_logit': 0.8251953125}, {'start': tensor(151, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.876953125, 'end_logit': 0.8251953125}, {'start': tensor(143, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.87060546875, 'end_logit': 0.8251953125}, {'start': tensor(401, device='cuda:0'), 'e

size of start_logits: torch.Size([1, 1649, 1])
size of start_logits after squeeze: torch.Size([1, 1649])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_output_t after tanh: torch.Size([1, 10, 768])
size of sp_para_output_t: torch.Size([1, 10, 1])
size of sp_para_output_aux: torch.Size([1, 10, 1])
size of predict_support_para: torch.Size([1, 10, 2])
size of sent_indexes: torch.Size([53])
sent_indexes:  tensor([  25,  148,  197,  240,  268,  323,  341,  352,  366,  401,  428,  458,
         476,  495,  523,  548,  584,  642,  661,  670,  701,  755,  794,  828,
         884,  931,  960,  973, 1003, 1026, 1042, 1082, 1104, 1127, 1163, 1182,
        1205, 1232, 1259, 1278, 1297, 1325, 1347, 1365, 1418, 1489, 1509, 15

Epoch 24:  10%|▉         | 8/81 [00:04<00:34,  2.10batch/s, batch_idx=2, gpu=0, loss=0.000, v_num=0]validation_step
size of input_ids: torch.Size([1, 851])
size of attention_mask: torch.Size([1, 851])
size of segment_ids: torch.Size([1, 851])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 35])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([25])
p_index: tensor([ 13, 116, 151, 295, 374, 455, 501, 590, 683, 720], device='cuda:0')
size of sequence_output: torch.Size([1, 851, 768])
size of logits: torch.Size([1, 851, 2])
size of start_logits: torch.Size([1, 851, 1])
size of start_logits after squeeze: torch.Size([1, 851])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_par

sorted_answers: [{'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.744140625}, {'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.744140625}, {'start': tensor(153, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.744140625}, {'start': tensor(144, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.98681640625, 'end_logit': 0.744140625}, {'start': tensor(145, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.95751953125, 'end_logit': 0.744140625}, {'start': tensor(147, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9462890625, 'end_logit': 0.744140625}, {'start': tensor(149, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9208984375, 'end_logit': 0.744140625}, {'start': tensor(117, device='cuda:0'), 'end': te

Epoch 24:  11%|█         | 9/81 [00:05<00:29,  2.42batch/s, batch_idx=2, gpu=0, loss=0.000, v_num=0]validation_step
size of input_ids: torch.Size([1, 1320])
size of attention_mask: torch.Size([1, 1320])
size of segment_ids: torch.Size([1, 1320])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([  29,  168,  438,  601,  692,  765,  953, 1068, 1127, 1208],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1320, 768])
size of logits: torch.Size([1, 1320, 2])
size of start_logits: torch.Size([1, 1320, 1])
size of start_logits after squeeze: torch.Size([1, 1320])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Siz

total_loss: tensor(9.9197, device='cuda:0')
loss: tensor(9.9197, device='cuda:0')
decode
question_end_index:  tensor([28], device='cuda:0')
start_logits_indices:  tensor([611, 173, 775, 129, 191, 548, 667, 550, 300, 787, 791, 211,  41, 643,
        644, 519, 169, 671, 669, 439], device='cuda:0')
type_logits:  tensor([[0.5020, 0.4502, 0.4207]], device='cuda:0')
p_type:  0
p_type_score:  0.501953125
sorted_answers: [{'start': tensor(611, device='cuda:0'), 'end': tensor(611, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.5830078125}, {'start': tensor(173, device='cuda:0'), 'end': tensor(173, device='cuda:0'), 'start_logit': 0.98876953125, 'end_logit': 0.341796875}, {'start': tensor(548, device='cuda:0'), 'end': tensor(549, device='cuda:0'), 'start_logit': 0.93603515625, 'end_logit': 0.345703125}, {'start': tensor(667, device='cuda:0'), 'end': tensor(667, device='cuda:0'), 'start_logit': 0.8984375, 'end_logit': 0.274658203125}, {'start': tensor(775, device='cuda:0'), 'end':

p_type:  0
p_type_score:  0.509765625
sorted_answers: [{'start': tensor(130, device='cuda:0'), 'end': tensor(133, device='cuda:0'), 'start_logit': 0.9609375, 'end_logit': 0.91650390625}, {'start': tensor(131, device='cuda:0'), 'end': tensor(133, device='cuda:0'), 'start_logit': 0.8740234375, 'end_logit': 0.91650390625}, {'start': tensor(22, device='cuda:0'), 'end': tensor(29, device='cuda:0'), 'start_logit': 0.986328125, 'end_logit': 0.6279296875}, {'start': tensor(364, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.60400390625}, {'start': tensor(365, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 0.95654296875, 'end_logit': 0.60400390625}, {'start': tensor(673, device='cuda:0'), 'end': tensor(674, device='cuda:0'), 'start_logit': 0.916015625, 'end_logit': 0.60107421875}, {'start': tensor(159, device='cuda:0'), 'end': tensor(161, device='cuda:0'), 'start_logit': 0.98095703125, 'end_logit': 0.42041015625}, {'sta

target: tensor([[678,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
masked_target: tensor([[678,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 64])
gathered_logits: tensor([[-0.8154, -6.2578, -6.2578, -6.2578, -6.2578, -6.2578, -6.2578, -6.2578,
         -6.2578, -6.2578, -6.2578, -6.2578, -6.2578, -6.2578, -6.

Epoch 24:  15%|█▍        | 12/81 [00:06<00:24,  2.83batch/s, batch_idx=2, gpu=0, loss=0.000, v_num=0]validation_step
size of input_ids: torch.Size([1, 1310])
size of attention_mask: torch.Size([1, 1310])
size of segment_ids: torch.Size([1, 1310])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 40])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([30])
p_index: tensor([  34,   91,  222,  316,  347,  479,  651,  711, 1092, 1277],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1310, 768])
size of logits: torch.Size([1, 1310, 2])
size of start_logits: torch.Size([1, 1310, 1])
size of start_logits after squeeze: torch.Size([1, 1310])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Si

sorted_answers: [{'start': tensor(1110, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.57275390625}, {'start': tensor(1132, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.9482421875, 'end_logit': 0.57275390625}, {'start': tensor(1110, device='cuda:0'), 'end': tensor(1110, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.491943359375}, {'start': tensor(117, device='cuda:0'), 'end': tensor(117, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.481689453125}, {'start': tensor(1093, device='cuda:0'), 'end': tensor(1110, device='cuda:0'), 'start_logit': 0.974609375, 'end_logit': 0.491943359375}, {'start': tensor(1118, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.8818359375, 'end_logit': 0.57275390625}, {'start': tensor(1122, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.8642578125, 'end_logit': 0.57275390625}, {'start': tensor(92, 

loss: tensor(8.6741, device='cuda:0')
decode
question_end_index:  tensor([30], device='cuda:0')
start_logits_indices:  tensor([ 559,  290,  817,   47,  423,  231,  145,  574,  268,  232,  291, 1075,
         634,  489,  179,  246,  724,  483,  946,  351], device='cuda:0')
type_logits:  tensor([[0.5073, 0.4521, 0.4097]], device='cuda:0')
p_type:  0
p_type_score:  0.50732421875
sorted_answers: [{'start': tensor(559, device='cuda:0'), 'end': tensor(561, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.78564453125}, {'start': tensor(47, device='cuda:0'), 'end': tensor(47, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.52197265625}, {'start': tensor(634, device='cuda:0'), 'end': tensor(637, device='cuda:0'), 'start_logit': 0.96728515625, 'end_logit': 0.4990234375}, {'start': tensor(290, device='cuda:0'), 'end': tensor(297, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.3955078125}, {'start': tensor(268, device='cuda:0'), 'end': tensor(297, device=

log_score: tensor([1.0781], device='cuda:0', dtype=torch.float16)
log_norm: tensor([7.0352], device='cuda:0', dtype=torch.float16)
loss: tensor([5.9570], device='cuda:0', dtype=torch.float16)
final loss: tensor(5.9570, device='cuda:0', dtype=torch.float16)
start_positions: tensor([[ 132,  161,  184,  516,  590,  639,  998, 1093, 1620,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
start_loss: tensor(5.9570, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 1849])
size of target: torch.Size([1, 64])
target: tensor([[ 134,  164,  187,  519,  592,  642, 1001, 1096, 1623,   -1,   -1,   -1,
   

Epoch 24:  19%|█▊        | 15/81 [00:07<00:22,  2.95batch/s, batch_idx=2, gpu=0, loss=0.000, v_num=0]validation_step
size of input_ids: torch.Size([1, 1660])
size of attention_mask: torch.Size([1, 1660])
size of segment_ids: torch.Size([1, 1660])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 57])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([47])
p_index: tensor([  24,  161,  462,  591,  620,  754,  955, 1154, 1325, 1524],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1660, 768])
size of logits: torch.Size([1, 1660, 2])
size of start_logits: torch.Size([1, 1660, 1])
size of start_logits after squeeze: torch.Size([1, 1660])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Si

final loss: tensor(7.0547, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[ 766, 1176,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
end_loss: tensor(7.0547, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[-0.0037]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[-0.0037]], device='cuda:0', dtype=torch.float16)
si

Epoch 24:  20%|█▉        | 16/81 [00:07<00:21,  2.98batch/s, batch_idx=2, gpu=0, loss=0.000, v_num=0]validation_end
before sync --> sizes: 13, 13, 13, 13
after sync --> sizes: 13, 13, 13, 13
answer_scores:  [0.5210367838541666, 0.7549641927083334, 0.8089192708333334, 0.8089192708333334, 0.7137044270833334, 0.7493489583333334, 0.6941731770833334, 0.7957356770833334, 0.5677083333333334, 0.6891276041666666, 0.763671875, 0.7268880208333334, 0.7449544270833334]
f1_scores:  [0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
em_scores:  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
len(f1_scores):  13
avg_val_f1:  0.07692307692307693
len(em_scores):  13
avg_val_em:  0.07692307692307693
len(prec_scores):  13
avg_val_prec:  0.07692307692307693
len(recall_scores):  13
avg_val_recall:  0.07692307692307693
avg_val_sp_sent_f1:  0.10193237402064405
avg_val_sp_sent_em:  0.0
avg_val_sp_sent_prec:  0.05390114969987609
avg_val_sp_sent_recall:  1.0
avg_val_joint_f1:  0.007155635062611807
avg_val_joint_em:  0.0
avg_val_

INFO:root:
Epoch 00023: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 1872])
size of attention_mask: torch.Size([1, 1872])
size of segment_ids: torch.Size([1, 1872])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 61])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([51])
p_index: tensor([  20,  304,  712,  785,  944, 1044, 1110, 1397, 1664, 1795],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1872, 768])
size of logits: torch.Size([1, 1872, 2])
size of start_logits: torch.Size([1, 1872, 1])
size of start_logits after squeeze: torch.Size([1, 1872])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 

Epoch 24:  21%|██        | 17/81 [00:08<00:32,  1.95batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=3, gpu=0, loss=0.000, v_num=0, val_loss=7.73]size of input_ids: torch.Size([1, 1376])
size of attention_mask: torch.Size([1, 1376])
size of segment_ids: torch.Size([1, 1376])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 35])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([25])
p_index: tensor([  22,  114,  207,  322,  430,  553,  858,  978, 1129, 1217],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1376, 768])
size of logits: torch.Size([1, 1376, 2])
size 

returned loss:  tensor(5.1063, device='cuda:0', grad_fn=<AddBackward0>)
self.trainer.optimizers[0].param_groups[0]['lr']:  2.57e-05
loss:  tensor(5.1063, device='cuda:0', grad_fn=<AddBackward0>)
lr:  tensor([2.5700e-05], device='cuda:0')
Epoch 24:  22%|██▏       | 18/81 [00:08<00:32,  1.95batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=4, gpu=0, loss=0.000, v_num=0, val_loss=7.73]size of input_ids: torch.Size([1, 1316])
size of attention_mask: torch.Size([1, 1316])
size of segment_ids: torch.Size([1, 1316])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 38])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([1

Epoch 24:  23%|██▎       | 19/81 [00:09<00:33,  1.87batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=5, gpu=0, loss=0.000, v_num=0, val_loss=7.73]
Validating:   0%|          | 0/13 [00:00<?, ?batch/s][Avalidation_step
size of input_ids: torch.Size([1, 1649])
size of attention_mask: torch.Size([1, 1649])
size of segment_ids: torch.Size([1, 1649])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  25,  197,  495,  584,  670,  828, 1003, 1325, 1489, 1570],
       device='cuda:0')
size of sequence_output: 

Epoch 24:  25%|██▍       | 20/81 [00:10<00:40,  1.52batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=5, gpu=0, loss=0.000, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 851])
size of attention_mask: torch.Size([1, 851])
size of segment_ids: torch.Size([1, 851])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 35])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([25])
p_index: tensor([ 13, 116, 151, 295, 374, 455, 501, 590, 683, 720], device='cuda:0')
size of sequence_output: torch.Size([1, 851, 768])
size of logits: torch.Size([1, 851, 2])
size of sta

sorted_answers: [{'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.744140625}, {'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.744140625}, {'start': tensor(153, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.744140625}, {'start': tensor(144, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.98681640625, 'end_logit': 0.744140625}, {'start': tensor(145, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.95751953125, 'end_logit': 0.744140625}, {'start': tensor(147, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9462890625, 'end_logit': 0.744140625}, {'start': tensor(149, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9208984375, 'end_logit': 0.744140625}, {'start': tensor(117, device='cuda:0'), 'end': te

Epoch 24:  26%|██▌       | 21/81 [00:10<00:32,  1.85batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=5, gpu=0, loss=0.000, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 943])
size of attention_mask: torch.Size([1, 943])
size of segment_ids: torch.Size([1, 943])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 27])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([17])
p_index: tensor([ 17,  70, 148, 248, 385, 430, 556, 634, 672, 877], device='cuda:0')
size of sequence_output: torch.Size([1, 943, 768])
size of logits: torch.Size([1, 943, 2])
size of sta

sorted_answers: [{'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.92138671875}, {'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.994140625, 'end_logit': 0.92138671875}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.8251953125}, {'start': tensor(136, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.94580078125, 'end_logit': 0.8251953125}, {'start': tensor(149, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.9189453125, 'end_logit': 0.8251953125}, {'start': tensor(151, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.876953125, 'end_logit': 0.8251953125}, {'start': tensor(143, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.87060546875, 'end_logit': 0.8251953125}, {'start': tensor(401, device='cuda:0'), 'e

Epoch 24:  27%|██▋       | 22/81 [00:10<00:27,  2.18batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=5, gpu=0, loss=0.000, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1849])
size of attention_mask: torch.Size([1, 1849])
size of segment_ids: torch.Size([1, 1849])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 51])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([41])
p_index: tensor([  18,  171,  225,  495,  652,  970, 1256, 1442, 1596, 1696],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1849, 768])
size of logits: torch.Size([1,

Epoch 24:  28%|██▊       | 23/81 [00:11<00:24,  2.33batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=5, gpu=0, loss=0.000, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1660])
size of attention_mask: torch.Size([1, 1660])
size of segment_ids: torch.Size([1, 1660])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 57])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([47])
p_index: tensor([  24,  161,  462,  591,  620,  754,  955, 1154, 1325, 1524],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1660, 768])
size of logits: torch.Size([1,

Epoch 24:  30%|██▉       | 24/81 [00:11<00:23,  2.40batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=5, gpu=0, loss=0.000, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 2018])
size of attention_mask: torch.Size([1, 2018])
size of segment_ids: torch.Size([1, 2018])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 58])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([48])
p_index: tensor([  25,  354,  470,  654,  801, 1279, 1398, 1416, 1547, 1879],
       device='cuda:0')
size of sequence_output: torch.Size([1, 2018, 768])
size of logits: torch.Size([1,

Epoch 24:  31%|███       | 25/81 [00:11<00:21,  2.55batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=5, gpu=0, loss=0.000, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 933])
size of attention_mask: torch.Size([1, 933])
size of segment_ids: torch.Size([1, 933])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([ 21, 190, 263, 282, 398, 474, 554, 645, 757, 849], device='cuda:0')
size of sequence_output: torch.Size([1, 933, 768])
size of logits: torch.Size([1, 933, 2])
size of sta

sorted_answers: [{'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.931640625, 'end_logit': 0.9755859375}, {'start': tensor(758, device='cuda:0'), 'end': tensor(759, device='cuda:0'), 'start_logit': 0.8798828125, 'end_logit': 0.48583984375}, {'start': tensor(646, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 0.9931640625, 'end_logit': 0.33837890625}, {'start': tensor(642, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 0.98193359375, 'end_logit': 0.33837890625}, {'start': tensor(779, device='cuda:0'), 'end': tensor(791, device='cuda:0'), 'start_logit': 0.98583984375, 'end_logit': 0.320556640625}, {'start': tensor(779, device='cuda:0'), 'end': tensor(781, device='cuda:0'), 'start_logit': 0.98583984375, 'end_logit': 0.307861328125}, {'start': tensor(777, device='cuda:0'), 'end': tensor(777, device='cuda:0'), 'start_logit': 0.94677734375, 'end_logit': 0.323974609375}, {'start': tensor(777, device='cud

sorted_answers: [{'start': tensor(1110, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.57275390625}, {'start': tensor(1132, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.9482421875, 'end_logit': 0.57275390625}, {'start': tensor(1110, device='cuda:0'), 'end': tensor(1110, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.491943359375}, {'start': tensor(117, device='cuda:0'), 'end': tensor(117, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.481689453125}, {'start': tensor(1093, device='cuda:0'), 'end': tensor(1110, device='cuda:0'), 'start_logit': 0.974609375, 'end_logit': 0.491943359375}, {'start': tensor(1118, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.8818359375, 'end_logit': 0.57275390625}, {'start': tensor(1122, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.8642578125, 'end_logit': 0.57275390625}, {'start': tensor(92, 

sorted_answers: [{'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9931640625, 'end_logit': 0.77490234375}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99267578125, 'end_logit': 0.77490234375}, {'start': tensor(237, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9453125, 'end_logit': 0.77490234375}, {'start': tensor(262, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9404296875, 'end_logit': 0.77490234375}, {'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.6630859375}, {'start': tensor(642, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.94873046875, 'end_logit': 0.6630859375}, {'start': tensor(634, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.93603515625, 'end_logit': 0.6630859375}, {'start': tensor(264, device='cuda:0'), 

Epoch 24:  35%|███▍      | 28/81 [00:12<00:16,  3.19batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=5, gpu=0, loss=0.000, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1187])
size of attention_mask: torch.Size([1, 1187])
size of segment_ids: torch.Size([1, 1187])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([  31,   85,  144,  230,  289,  428,  466,  558,  816, 1111],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1187, 768])
size of logits: torch.Size([1,

sorted_answers: [{'start': tensor(559, device='cuda:0'), 'end': tensor(561, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.78564453125}, {'start': tensor(47, device='cuda:0'), 'end': tensor(47, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.52197265625}, {'start': tensor(634, device='cuda:0'), 'end': tensor(637, device='cuda:0'), 'start_logit': 0.96728515625, 'end_logit': 0.4990234375}, {'start': tensor(290, device='cuda:0'), 'end': tensor(297, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.3955078125}, {'start': tensor(268, device='cuda:0'), 'end': tensor(297, device='cuda:0'), 'start_logit': 0.98388671875, 'end_logit': 0.3955078125}, {'start': tensor(290, device='cuda:0'), 'end': tensor(291, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.378173828125}, {'start': tensor(291, device='cuda:0'), 'end': tensor(297, device='cuda:0'), 'start_logit': 0.97900390625, 'end_logit': 0.3955078125}, {'start': tensor(268, device='cuda:0')

question_end_index:  tensor([28], device='cuda:0')
start_logits_indices:  tensor([611, 173, 775, 129, 191, 548, 667, 550, 300, 787, 791, 211,  41, 643,
        644, 519, 169, 671, 669, 439], device='cuda:0')
type_logits:  tensor([[0.5020, 0.4502, 0.4207]], device='cuda:0')
p_type:  0
p_type_score:  0.501953125
sorted_answers: [{'start': tensor(611, device='cuda:0'), 'end': tensor(611, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.5830078125}, {'start': tensor(173, device='cuda:0'), 'end': tensor(173, device='cuda:0'), 'start_logit': 0.98876953125, 'end_logit': 0.341796875}, {'start': tensor(548, device='cuda:0'), 'end': tensor(549, device='cuda:0'), 'start_logit': 0.93603515625, 'end_logit': 0.345703125}, {'start': tensor(667, device='cuda:0'), 'end': tensor(667, device='cuda:0'), 'start_logit': 0.8984375, 'end_logit': 0.274658203125}, {'start': tensor(775, device='cuda:0'), 'end': tensor(793, device='cuda:0'), 'start_logit': 0.96728515625, 'end_logit': 0.2012939453125

masked_target: tensor([[678,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 64])
gathered_logits: tensor([[-4.5547, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680,
         -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680,
         -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680,
         -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680,
         -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680,
         -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680, -6.6680,
         -6.6680, -6.6680, -6.6680, -6.6680, -6.6680,

Epoch 24:  38%|███▊      | 31/81 [00:13<00:15,  3.22batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=5, gpu=0, loss=0.000, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1400])
size of attention_mask: torch.Size([1, 1400])
size of segment_ids: torch.Size([1, 1400])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([  21,  128,  158,  363,  572,  672,  868,  967, 1132, 1198],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1400, 768])
size of logits: torch.Size([1,

sorted_answers: [{'start': tensor(130, device='cuda:0'), 'end': tensor(133, device='cuda:0'), 'start_logit': 0.9609375, 'end_logit': 0.91650390625}, {'start': tensor(131, device='cuda:0'), 'end': tensor(133, device='cuda:0'), 'start_logit': 0.8740234375, 'end_logit': 0.91650390625}, {'start': tensor(22, device='cuda:0'), 'end': tensor(29, device='cuda:0'), 'start_logit': 0.986328125, 'end_logit': 0.6279296875}, {'start': tensor(364, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.60400390625}, {'start': tensor(365, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 0.95654296875, 'end_logit': 0.60400390625}, {'start': tensor(673, device='cuda:0'), 'end': tensor(674, device='cuda:0'), 'start_logit': 0.916015625, 'end_logit': 0.60107421875}, {'start': tensor(159, device='cuda:0'), 'end': tensor(161, device='cuda:0'), 'start_logit': 0.98095703125, 'end_logit': 0.42041015625}, {'start': tensor(364, device='cuda:0'), 'en

Epoch 24:  40%|███▉      | 32/81 [00:14<00:14,  3.29batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=5, gpu=0, loss=0.000, v_num=0, val_loss=7.73]
                                                              [A

INFO:root:
Epoch 00023: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 878])
size of attention_mask: torch.Size([1, 878])
size of segment_ids: torch.Size([1, 878])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 32])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([22])
p_index: tensor([ 20, 187, 232, 324, 454, 552, 604, 677, 720, 821], device='cuda:0')
size of sequence_output: torch.Size([1, 878, 768])
size of logits: torch.Size([1, 878, 2])
size of start_logits: torch.Size([1, 878, 1])
size of start_logits after squeeze: torch.Size([1, 878])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_ou

Epoch 24:  41%|████      | 33/81 [00:14<00:20,  2.36batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=6, gpu=0, loss=0.000, v_num=0, val_loss=7.73]size of input_ids: torch.Size([1, 1079])
size of attention_mask: torch.Size([1, 1079])
size of segment_ids: torch.Size([1, 1079])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 39])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([29])
p_index: tensor([ 22, 119, 144, 322, 457, 521, 742, 801, 887, 931], device='cuda:0')
size of sequence_output: torch.Size([1, 1079, 768])
size of logits: torch.Size([1, 1079, 2])
size of start_logits: 

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Epoch 24:  42%|████▏     | 34/81 [00:15<00:22,  2.08batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=7, gpu=0, loss=5.470, v_num=0, val_loss=7.73]size of input_ids: torch.Size([1, 1597])
size of attention_mask: torch.Size([1, 1597])
size of segment_ids: torch.Size([1, 1597])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 50])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([40])
p_index: tensor([  17,  196,  277,  474,  653,  968, 1072, 1223, 1353, 1463],
       device='cuda:0')
size of sequence_o



size of logits: torch.Size([1, 1597, 2])
size of start_logits: torch.Size([1, 1597, 1])
size of start_logits after squeeze: torch.Size([1, 1597])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_output_t after tanh: torch.Size([1, 10, 768])
size of sp_para_output_t: torch.Size([1, 10, 1])
size of sp_para_output_aux: torch.Size([1, 10, 1])
size of predict_support_para: torch.Size([1, 10, 2])
size of sent_indexes: torch.Size([50])
sent_indexes:  tensor([  17,   60,   78,  123,  138,  175,  196,  227,  236,  255,  277,  327,
         370,  393,  421,  474,  497,  524,  540,  557,  577,  599,  629,  653,
         694,  747,  783,  804,  865,  909,  937,  968,  998, 1036, 1072, 1126,
        1141, 1169, 1197, 1223, 128

Epoch 24:  43%|████▎     | 35/81 [00:16<00:25,  1.80batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=8, gpu=0, loss=5.470, v_num=0, val_loss=7.73]
Validating:   0%|          | 0/13 [00:00<?, ?batch/s][Avalidation_step
size of input_ids: torch.Size([1, 1011])
size of attention_mask: torch.Size([1, 1011])
size of segment_ids: torch.Size([1, 1011])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([ 52, 167, 321, 453, 521, 572, 697, 747, 808, 900], device='cuda:0')
size of sequence_output: torch.Size([1, 10

sorted_answers: [{'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9931640625, 'end_logit': 0.77490234375}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99267578125, 'end_logit': 0.77490234375}, {'start': tensor(237, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9453125, 'end_logit': 0.77490234375}, {'start': tensor(262, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9404296875, 'end_logit': 0.77490234375}, {'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.6630859375}, {'start': tensor(642, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.94873046875, 'end_logit': 0.6630859375}, {'start': tensor(634, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.93603515625, 'end_logit': 0.6630859375}, {'start': tensor(264, device='cuda:0'), 

Epoch 24:  44%|████▍     | 36/81 [00:16<00:28,  1.60batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=8, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 943])
size of attention_mask: torch.Size([1, 943])
size of segment_ids: torch.Size([1, 943])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 27])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([17])
p_index: tensor([ 17,  70, 148, 248, 385, 430, 556, 634, 672, 877], device='cuda:0')
size of sequence_output: torch.Size([1, 943, 768])
size of logits: torch.Size([1, 943, 2])
size of sta

sorted_answers: [{'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.92138671875}, {'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.994140625, 'end_logit': 0.92138671875}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.8251953125}, {'start': tensor(136, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.94580078125, 'end_logit': 0.8251953125}, {'start': tensor(149, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.9189453125, 'end_logit': 0.8251953125}, {'start': tensor(151, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.876953125, 'end_logit': 0.8251953125}, {'start': tensor(143, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.87060546875, 'end_logit': 0.8251953125}, {'start': tensor(401, device='cuda:0'), 'e

Epoch 24:  46%|████▌     | 37/81 [00:17<00:22,  1.95batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=8, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1400])
size of attention_mask: torch.Size([1, 1400])
size of segment_ids: torch.Size([1, 1400])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([  21,  128,  158,  363,  572,  672,  868,  967, 1132, 1198],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1400, 768])
size of logits: torch.Size([1,

sorted_answers: [{'start': tensor(130, device='cuda:0'), 'end': tensor(133, device='cuda:0'), 'start_logit': 0.9609375, 'end_logit': 0.91650390625}, {'start': tensor(131, device='cuda:0'), 'end': tensor(133, device='cuda:0'), 'start_logit': 0.8740234375, 'end_logit': 0.91650390625}, {'start': tensor(22, device='cuda:0'), 'end': tensor(29, device='cuda:0'), 'start_logit': 0.986328125, 'end_logit': 0.6279296875}, {'start': tensor(364, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.60400390625}, {'start': tensor(365, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 0.95654296875, 'end_logit': 0.60400390625}, {'start': tensor(673, device='cuda:0'), 'end': tensor(674, device='cuda:0'), 'start_logit': 0.916015625, 'end_logit': 0.60107421875}, {'start': tensor(159, device='cuda:0'), 'end': tensor(161, device='cuda:0'), 'start_logit': 0.98095703125, 'end_logit': 0.42041015625}, {'start': tensor(364, device='cuda:0'), 'en

size of sp_sent_output: torch.Size([1, 58, 768])
size of sp_sent_output_t after dense: torch.Size([1, 58, 768])
size of sp_sent_output_t after tanh: torch.Size([1, 58, 768])
size of sp_sent_output_t: torch.Size([1, 58, 1])
size of sp_sent_output_aux: torch.Size([1, 58, 1])
size of predict_support_sent: torch.Size([1, 58, 2])
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 2018])
size of target: torch.Size([1, 64])
target: tensor([[678,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
masked_target: tensor([[678,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  

Epoch 24:  48%|████▊     | 39/81 [00:17<00:17,  2.46batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=8, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1320])
size of attention_mask: torch.Size([1, 1320])
size of segment_ids: torch.Size([1, 1320])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([  29,  168,  438,  601,  692,  765,  953, 1068, 1127, 1208],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1320, 768])
size of logits: torch.Size([1,

sorted_answers: [{'start': tensor(611, device='cuda:0'), 'end': tensor(611, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.5830078125}, {'start': tensor(173, device='cuda:0'), 'end': tensor(173, device='cuda:0'), 'start_logit': 0.98876953125, 'end_logit': 0.341796875}, {'start': tensor(548, device='cuda:0'), 'end': tensor(549, device='cuda:0'), 'start_logit': 0.93603515625, 'end_logit': 0.345703125}, {'start': tensor(667, device='cuda:0'), 'end': tensor(667, device='cuda:0'), 'start_logit': 0.8984375, 'end_logit': 0.274658203125}, {'start': tensor(775, device='cuda:0'), 'end': tensor(793, device='cuda:0'), 'start_logit': 0.96728515625, 'end_logit': 0.2012939453125}, {'start': tensor(775, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.96728515625, 'end_logit': 0.167236328125}, {'start': tensor(775, device='cuda:0'), 'end': tensor(775, device='cuda:0'), 'start_logit': 0.96728515625, 'end_logit': 0.155029296875}, {'start': tensor(775, device='cuda:

target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.0301]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[0.0301]], device='cuda:0', dtype=torch.float16)
size of gathered_logits after view: torch.Size([1, 1])
size of logits after view: torch.Size([1, 3])
log_score: tensor([0.0301], device='cuda:0', dtype=torch.float16)
log_norm: tensor([0.9365], device='cuda:0', dtype=torch.float16)
loss: tensor([0.9062], device='cuda:0', dtype=torch.float16)
final loss: tensor(0.9062, device='cuda:0', dtype=torch.float16)
total_loss: tensor(8.6741, device='cuda:0')
loss: tensor(8.6741, device='cuda:0')
decode
question_end_index:  tensor([30], device='cuda:0')
start_logits_indices:  tensor([ 559,  290,  817,   47,  423,  231,  145,  574,  268,  232,  291, 1075,
         634,  489,  179,  246,  724,  483,  946,  351], device='cuda:0')
type_logits:  tensor([[0.5073, 0.4521

gathered_logits after -inf: tensor([[-2.9375,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf]],
       device='cuda:0', dtype=torch.float16)
size of gathered_logits after view: torch.Size([1, 64])
size of logits after view: torch.Size([1, 1662])
log_score: tensor([-2.9375], device='cuda:0', dtype=torch.float16)
log_norm: tensor([5.4922], device='cuda:0', dtype=torch.float16)
loss: tensor([8.4297], device='cuda:0', 

Epoch 24:  52%|█████▏    | 42/81 [00:18<00:13,  2.85batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=8, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1310])
size of attention_mask: torch.Size([1, 1310])
size of segment_ids: torch.Size([1, 1310])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 40])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([30])
p_index: tensor([  34,   91,  222,  316,  347,  479,  651,  711, 1092, 1277],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1310, 768])
size of logits: torch.Size([1,

sorted_answers: [{'start': tensor(1110, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.57275390625}, {'start': tensor(1132, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.9482421875, 'end_logit': 0.57275390625}, {'start': tensor(1110, device='cuda:0'), 'end': tensor(1110, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.491943359375}, {'start': tensor(117, device='cuda:0'), 'end': tensor(117, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.481689453125}, {'start': tensor(1093, device='cuda:0'), 'end': tensor(1110, device='cuda:0'), 'start_logit': 0.974609375, 'end_logit': 0.491943359375}, {'start': tensor(1118, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.8818359375, 'end_logit': 0.57275390625}, {'start': tensor(1122, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.8642578125, 'end_logit': 0.57275390625}, {'start': tensor(92, 

sorted_answers: [{'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.744140625}, {'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.744140625}, {'start': tensor(153, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.744140625}, {'start': tensor(144, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.98681640625, 'end_logit': 0.744140625}, {'start': tensor(145, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.95751953125, 'end_logit': 0.744140625}, {'start': tensor(147, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9462890625, 'end_logit': 0.744140625}, {'start': tensor(149, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9208984375, 'end_logit': 0.744140625}, {'start': tensor(117, device='cuda:0'), 'end': te

Epoch 24:  54%|█████▍    | 44/81 [00:19<00:12,  3.08batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=8, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 933])
size of attention_mask: torch.Size([1, 933])
size of segment_ids: torch.Size([1, 933])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([ 21, 190, 263, 282, 398, 474, 554, 645, 757, 849], device='cuda:0')
size of sequence_output: torch.Size([1, 933, 768])
size of logits: torch.Size([1, 933, 2])
size of sta

sorted_answers: [{'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.931640625, 'end_logit': 0.9755859375}, {'start': tensor(758, device='cuda:0'), 'end': tensor(759, device='cuda:0'), 'start_logit': 0.8798828125, 'end_logit': 0.48583984375}, {'start': tensor(646, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 0.9931640625, 'end_logit': 0.33837890625}, {'start': tensor(642, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 0.98193359375, 'end_logit': 0.33837890625}, {'start': tensor(779, device='cuda:0'), 'end': tensor(791, device='cuda:0'), 'start_logit': 0.98583984375, 'end_logit': 0.320556640625}, {'start': tensor(779, device='cuda:0'), 'end': tensor(781, device='cuda:0'), 'start_logit': 0.98583984375, 'end_logit': 0.307861328125}, {'start': tensor(777, device='cuda:0'), 'end': tensor(777, device='cuda:0'), 'start_logit': 0.94677734375, 'end_logit': 0.323974609375}, {'start': tensor(777, device='cud

gathered_logits: tensor([[-0.7866, -1.4951, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633,
         -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633,
         -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633,
         -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633,
         -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633,
         -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633,
         -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633,
         -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633, -6.3633]],
       device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[-0.7866, -1.4951,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,

Epoch 24:  57%|█████▋    | 46/81 [00:19<00:11,  3.16batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=8, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1849])
size of attention_mask: torch.Size([1, 1849])
size of segment_ids: torch.Size([1, 1849])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 51])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([41])
p_index: tensor([  18,  171,  225,  495,  652,  970, 1256, 1442, 1596, 1696],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1849, 768])
size of logits: torch.Size([1,

Epoch 24:  58%|█████▊    | 47/81 [00:20<00:11,  3.03batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=8, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1649])
size of attention_mask: torch.Size([1, 1649])
size of segment_ids: torch.Size([1, 1649])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  25,  197,  495,  584,  670,  828, 1003, 1325, 1489, 1570],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1649, 768])
size of logits: torch.Size([1,

Epoch 24:  59%|█████▉    | 48/81 [00:20<00:11,  2.98batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=8, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_end
before sync --> sizes: 13, 13, 13, 13
after sync --> sizes: 13, 13, 13, 13
answer_scores:  [0.7549641927083334, 0.8089192708333334, 0.7957356770833334, 0.5677083333333334, 0.6941731770833334, 0.763671875, 0.5210367838541666, 0.6891276041666666, 0.7493489583333334, 0.8089192708333334, 0.7449544270833334, 0.7268880208333334, 0.7137044270833334]
f1_scores:  [1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
em_scores:  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
len(f1_scores):  13
avg_val_f1:  0.07692307692307693
len(em_scores):  13
avg_val_em:  0.07692307692307693
len(prec_scores):  13
av

INFO:root:
Epoch 00023: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 1285])
size of attention_mask: torch.Size([1, 1285])
size of segment_ids: torch.Size([1, 1285])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 48])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([38])
p_index: tensor([  25,  135,  251,  343,  431,  569,  636,  903, 1045, 1131],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1285, 768])
size of logits: torch.Size([1, 1285, 2])
size of start_logits: torch.Size([1, 1285, 1])
size of start_logits after squeeze: torch.Size([1, 1285])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 

Epoch 24:  60%|██████    | 49/81 [00:21<00:15,  2.08batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=9, gpu=0, loss=5.470, v_num=0, val_loss=7.73]size of input_ids: torch.Size([1, 746])
size of attention_mask: torch.Size([1, 746])
size of segment_ids: torch.Size([1, 746])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 23])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([13])
p_index: tensor([ 12, 107, 169, 348, 382, 430, 490, 634, 663, 721], device='cuda:0')
size of sequence_output: torch.Size([1, 746, 768])
size of logits: torch.Size([1, 746, 2])
size of start_logits: torch

Epoch 24:  62%|██████▏   | 50/81 [00:21<00:14,  2.10batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=10, gpu=0, loss=5.470, v_num=0, val_loss=7.73]size of input_ids: torch.Size([1, 1345])
size of attention_mask: torch.Size([1, 1345])
size of segment_ids: torch.Size([1, 1345])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 40])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([30])
p_index: tensor([  37,  229,  410,  523,  666,  803,  895, 1137, 1189, 1240],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1345, 768])
size of logits: torch.Size([1, 1345, 2])
size

Epoch 24:  63%|██████▎   | 51/81 [00:22<00:15,  1.91batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=11, gpu=0, loss=5.470, v_num=0, val_loss=7.73]
Validating:   0%|          | 0/13 [00:00<?, ?batch/s][Avalidation_step
size of input_ids: torch.Size([1, 943])
size of attention_mask: torch.Size([1, 943])
size of segment_ids: torch.Size([1, 943])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 27])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([17])
p_index: tensor([ 17,  70, 148, 248, 385, 430, 556, 634, 672, 877], device='cuda:0')
size of sequence_output: torch.Size([1, 943,

sorted_answers: [{'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.92138671875}, {'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.994140625, 'end_logit': 0.92138671875}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.8251953125}, {'start': tensor(136, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.94580078125, 'end_logit': 0.8251953125}, {'start': tensor(149, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.9189453125, 'end_logit': 0.8251953125}, {'start': tensor(151, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.876953125, 'end_logit': 0.8251953125}, {'start': tensor(143, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.87060546875, 'end_logit': 0.8251953125}, {'start': tensor(401, device='cuda:0'), 'e

Epoch 24:  64%|██████▍   | 52/81 [00:23<00:17,  1.67batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=11, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1400])
size of attention_mask: torch.Size([1, 1400])
size of segment_ids: torch.Size([1, 1400])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([  21,  128,  158,  363,  572,  672,  868,  967, 1132, 1198],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1400, 768])
size of logits: torch.Size([1

question_end_index:  tensor([20], device='cuda:0')
start_logits_indices:  tensor([1199,  364,  585,   22,  573,  159,  381,  291, 1133,  130,  365,  673,
        1169,  295,  518,  131,  869,  840, 1148, 1134], device='cuda:0')
type_logits:  tensor([[0.5098, 0.4390, 0.4111]], device='cuda:0')
p_type:  0
p_type_score:  0.509765625
sorted_answers: [{'start': tensor(130, device='cuda:0'), 'end': tensor(133, device='cuda:0'), 'start_logit': 0.9609375, 'end_logit': 0.91650390625}, {'start': tensor(131, device='cuda:0'), 'end': tensor(133, device='cuda:0'), 'start_logit': 0.8740234375, 'end_logit': 0.91650390625}, {'start': tensor(22, device='cuda:0'), 'end': tensor(29, device='cuda:0'), 'start_logit': 0.986328125, 'end_logit': 0.6279296875}, {'start': tensor(364, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.60400390625}, {'start': tensor(365, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 0.95654296875, 'end_logit

final loss: tensor(9.6406, device='cuda:0', dtype=torch.float16)
start_positions: tensor([[693,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
start_loss: tensor(9.6406, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 1320])
size of target: torch.Size([1, 64])
target: tensor([[696,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,

sp_sent_output tensor([[[0.4568],
         [0.4656],
         [0.4512],
         [0.4504],
         [0.4712],
         [0.4590],
         [0.4600],
         [0.4717],
         [0.4617],
         [0.4617],
         [0.4631],
         [0.4521],
         [0.4575],
         [0.4651],
         [0.4600],
         [0.4512],
         [0.4709],
         [0.4531],
         [0.4648],
         [0.4595],
         [0.4761],
         [0.4697],
         [0.4602],
         [0.4685],
         [0.4624],
         [0.4626],
         [0.4753],
         [0.4526],
         [0.4634],
         [0.4736],
         [0.4651],
         [0.4619],
         [0.4512],
         [0.4780],
         [0.4514],
         [0.4565],
         [0.4585]]], device='cuda:0')
sp_para_output tensor([[[0.4551],
         [0.4663],
         [0.4790],
         [0.4768],
         [0.4724],
         [0.4670],
         [0.4761],
         [0.4707],
         [0.4727],
         [0.4819]]], device='cuda:0')
sp_sent_pred: {0, 1, 2, 3, 4, 5, 6, 7, 

Epoch 24:  68%|██████▊   | 55/81 [00:24<00:11,  2.24batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=11, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1011])
size of attention_mask: torch.Size([1, 1011])
size of segment_ids: torch.Size([1, 1011])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([ 52, 167, 321, 453, 521, 572, 697, 747, 808, 900], device='cuda:0')
size of sequence_output: torch.Size([1, 1011, 768])
size of logits: torch.Size([1, 1011, 2])
size 

sorted_answers: [{'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9931640625, 'end_logit': 0.77490234375}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99267578125, 'end_logit': 0.77490234375}, {'start': tensor(237, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9453125, 'end_logit': 0.77490234375}, {'start': tensor(262, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9404296875, 'end_logit': 0.77490234375}, {'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.6630859375}, {'start': tensor(642, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.94873046875, 'end_logit': 0.6630859375}, {'start': tensor(634, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.93603515625, 'end_logit': 0.6630859375}, {'start': tensor(264, device='cuda:0'), 

Epoch 24:  69%|██████▉   | 56/81 [00:24<00:09,  2.52batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=11, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 851])
size of attention_mask: torch.Size([1, 851])
size of segment_ids: torch.Size([1, 851])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 35])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([25])
p_index: tensor([ 13, 116, 151, 295, 374, 455, 501, 590, 683, 720], device='cuda:0')
size of sequence_output: torch.Size([1, 851, 768])
size of logits: torch.Size([1, 851, 2])
size of st

sorted_answers: [{'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.744140625}, {'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.744140625}, {'start': tensor(153, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.744140625}, {'start': tensor(144, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.98681640625, 'end_logit': 0.744140625}, {'start': tensor(145, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.95751953125, 'end_logit': 0.744140625}, {'start': tensor(147, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9462890625, 'end_logit': 0.744140625}, {'start': tensor(149, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9208984375, 'end_logit': 0.744140625}, {'start': tensor(117, device='cuda:0'), 'end': te

Epoch 24:  70%|███████   | 57/81 [00:24<00:08,  2.83batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=11, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1649])
size of attention_mask: torch.Size([1, 1649])
size of segment_ids: torch.Size([1, 1649])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  25,  197,  495,  584,  670,  828, 1003, 1325, 1489, 1570],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1649, 768])
size of logits: torch.Size([1

Epoch 24:  72%|███████▏  | 58/81 [00:25<00:08,  2.84batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=11, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 933])
size of attention_mask: torch.Size([1, 933])
size of segment_ids: torch.Size([1, 933])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([ 21, 190, 263, 282, 398, 474, 554, 645, 757, 849], device='cuda:0')
size of sequence_output: torch.Size([1, 933, 768])
size of logits: torch.Size([1, 933, 2])
size of st

sorted_answers: [{'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.931640625, 'end_logit': 0.9755859375}, {'start': tensor(758, device='cuda:0'), 'end': tensor(759, device='cuda:0'), 'start_logit': 0.8798828125, 'end_logit': 0.48583984375}, {'start': tensor(646, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 0.9931640625, 'end_logit': 0.33837890625}, {'start': tensor(642, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 0.98193359375, 'end_logit': 0.33837890625}, {'start': tensor(779, device='cuda:0'), 'end': tensor(791, device='cuda:0'), 'start_logit': 0.98583984375, 'end_logit': 0.320556640625}, {'start': tensor(779, device='cuda:0'), 'end': tensor(781, device='cuda:0'), 'start_logit': 0.98583984375, 'end_logit': 0.307861328125}, {'start': tensor(777, device='cuda:0'), 'end': tensor(777, device='cuda:0'), 'start_logit': 0.94677734375, 'end_logit': 0.323974609375}, {'start': tensor(777, device='cud

loss: tensor(8.6741, device='cuda:0')
decode
question_end_index:  tensor([30], device='cuda:0')
start_logits_indices:  tensor([ 559,  290,  817,   47,  423,  231,  145,  574,  268,  232,  291, 1075,
         634,  489,  179,  246,  724,  483,  946,  351], device='cuda:0')
type_logits:  tensor([[0.5073, 0.4521, 0.4097]], device='cuda:0')
p_type:  0
p_type_score:  0.50732421875
sorted_answers: [{'start': tensor(559, device='cuda:0'), 'end': tensor(561, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.78564453125}, {'start': tensor(47, device='cuda:0'), 'end': tensor(47, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.52197265625}, {'start': tensor(634, device='cuda:0'), 'end': tensor(637, device='cuda:0'), 'start_logit': 0.96728515625, 'end_logit': 0.4990234375}, {'start': tensor(290, device='cuda:0'), 'end': tensor(297, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.3955078125}, {'start': tensor(268, device='cuda:0'), 'end': tensor(297, device=

end_positions: tensor([[293, 318,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
end_loss: tensor(3.3438, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.0007]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[0.0007]], device='cuda:0', dtype=torch.float16)
size of gathered_logits after view: torch.Size([1, 1])
size of logits after view: torch.Size([1, 3])
log_score: tensor([0.0007], device='cuda:

Epoch 24:  75%|███████▌  | 61/81 [00:26<00:06,  3.23batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=11, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 2018])
size of attention_mask: torch.Size([1, 2018])
size of segment_ids: torch.Size([1, 2018])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 58])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([48])
p_index: tensor([  25,  354,  470,  654,  801, 1279, 1398, 1416, 1547, 1879],
       device='cuda:0')
size of sequence_output: torch.Size([1, 2018, 768])
size of logits: torch.Size([1

Epoch 24:  77%|███████▋  | 62/81 [00:26<00:06,  3.15batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=11, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1660])
size of attention_mask: torch.Size([1, 1660])
size of segment_ids: torch.Size([1, 1660])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 57])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([47])
p_index: tensor([  24,  161,  462,  591,  620,  754,  955, 1154, 1325, 1524],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1660, 768])
size of logits: torch.Size([1

log_score: tensor([-4.3477], device='cuda:0', dtype=torch.float16)
log_norm: tensor([2.7070], device='cuda:0', dtype=torch.float16)
loss: tensor([7.0547], device='cuda:0', dtype=torch.float16)
final loss: tensor(7.0547, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[ 766, 1176,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
end_loss: tensor(7.0547, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gather

Epoch 24:  78%|███████▊  | 63/81 [00:26<00:05,  3.11batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=11, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1662])
size of attention_mask: torch.Size([1, 1662])
size of segment_ids: torch.Size([1, 1662])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  15,  126,  231,  459,  649,  767,  940, 1114, 1365, 1545],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1662, 768])
size of logits: torch.Size([1

Epoch 24:  79%|███████▉  | 64/81 [00:27<00:05,  3.07batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=11, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_end
before sync --> sizes: 13, 13, 13, 13
after sync --> sizes: 13, 13, 13, 13
answer_scores:  [0.8089192708333334, 0.7957356770833334, 0.6941731770833334, 0.7268880208333334, 0.7549641927083334, 0.7493489583333334, 0.7137044270833334, 0.8089192708333334, 0.763671875, 0.6891276041666666, 0.5677083333333334, 0.7449544270833334, 0.5210367838541666]
f1_scores:  [0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0]
em_scores:  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
len(f1_scores):  13
avg_val_f1:  0.07692307692307693
len(em_scores):  13
avg_val_em:  0.07692307692307693
len(prec_scores):  13
a

INFO:root:
Epoch 00023: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 994])
size of attention_mask: torch.Size([1, 994])
size of segment_ids: torch.Size([1, 994])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 29])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([19])
p_index: tensor([ 24, 143, 208, 354, 408, 509, 585, 731, 816, 910], device='cuda:0')
size of sequence_output: torch.Size([1, 994, 768])
size of logits: torch.Size([1, 994, 2])
size of start_logits: torch.Size([1, 994, 1])
size of start_logits after squeeze: torch.Size([1, 994])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_ou

Epoch 24:  80%|████████  | 65/81 [00:27<00:06,  2.42batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=12, gpu=0, loss=5.470, v_num=0, val_loss=7.73]size of input_ids: torch.Size([1, 2078])
size of attention_mask: torch.Size([1, 2078])
size of segment_ids: torch.Size([1, 2078])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 68])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([58])
p_index: tensor([  34,  209,  391,  583,  766,  834, 1210, 1396, 1694, 1905],
       device='cuda:0')
size of sequence_output: torch.Size([1, 2078, 768])
size of logits: torch.Size([1, 2078, 2])
size

Epoch 24:  81%|████████▏ | 66/81 [00:28<00:07,  1.88batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=13, gpu=0, loss=5.470, v_num=0, val_loss=7.73]size of input_ids: torch.Size([1, 1494])
size of attention_mask: torch.Size([1, 1494])
size of segment_ids: torch.Size([1, 1494])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 40])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([30])
p_index: tensor([  47,  208,  366,  466,  614,  767,  886,  982, 1165, 1222],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1494, 768])
size of logits: torch.Size([1, 1494, 2])
size

Epoch 24:  83%|████████▎ | 67/81 [00:29<00:07,  1.85batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=14, gpu=0, loss=5.470, v_num=0, val_loss=7.73]
Validating:   0%|          | 0/13 [00:00<?, ?batch/s][Avalidation_step
size of input_ids: torch.Size([1, 1011])
size of attention_mask: torch.Size([1, 1011])
size of segment_ids: torch.Size([1, 1011])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([ 52, 167, 321, 453, 521, 572, 697, 747, 808, 900], device='cuda:0')
size of sequence_output: torch.Size([1, 1

sorted_answers: [{'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9931640625, 'end_logit': 0.77490234375}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99267578125, 'end_logit': 0.77490234375}, {'start': tensor(237, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9453125, 'end_logit': 0.77490234375}, {'start': tensor(262, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9404296875, 'end_logit': 0.77490234375}, {'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.6630859375}, {'start': tensor(642, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.94873046875, 'end_logit': 0.6630859375}, {'start': tensor(634, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.93603515625, 'end_logit': 0.6630859375}, {'start': tensor(264, device='cuda:0'), 

Epoch 24:  84%|████████▍ | 68/81 [00:29<00:08,  1.53batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=14, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1649])
size of attention_mask: torch.Size([1, 1649])
size of segment_ids: torch.Size([1, 1649])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  25,  197,  495,  584,  670,  828, 1003, 1325, 1489, 1570],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1649, 768])
size of logits: torch.Size([1

loss: tensor([6.8867], device='cuda:0', dtype=torch.float16)
final loss: tensor(6.8867, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[ 696, 1010, 1245,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
end_loss: tensor(6.8867, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.0284]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: 

Epoch 24:  85%|████████▌ | 69/81 [00:30<00:06,  1.80batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=14, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1849])
size of attention_mask: torch.Size([1, 1849])
size of segment_ids: torch.Size([1, 1849])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 51])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([41])
p_index: tensor([  18,  171,  225,  495,  652,  970, 1256, 1442, 1596, 1696],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1849, 768])
size of logits: torch.Size([1

Epoch 24:  86%|████████▋ | 70/81 [00:30<00:05,  2.04batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=14, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1187])
size of attention_mask: torch.Size([1, 1187])
size of segment_ids: torch.Size([1, 1187])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([  31,   85,  144,  230,  289,  428,  466,  558,  816, 1111],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1187, 768])
size of logits: torch.Size([1

sorted_answers: [{'start': tensor(559, device='cuda:0'), 'end': tensor(561, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.78564453125}, {'start': tensor(47, device='cuda:0'), 'end': tensor(47, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.52197265625}, {'start': tensor(634, device='cuda:0'), 'end': tensor(637, device='cuda:0'), 'start_logit': 0.96728515625, 'end_logit': 0.4990234375}, {'start': tensor(290, device='cuda:0'), 'end': tensor(297, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.3955078125}, {'start': tensor(268, device='cuda:0'), 'end': tensor(297, device='cuda:0'), 'start_logit': 0.98388671875, 'end_logit': 0.3955078125}, {'start': tensor(290, device='cuda:0'), 'end': tensor(291, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.378173828125}, {'start': tensor(291, device='cuda:0'), 'end': tensor(297, device='cuda:0'), 'start_logit': 0.97900390625, 'end_logit': 0.3955078125}, {'start': tensor(268, device='cuda:0')

size of gathered_logits after view: torch.Size([1, 64])
size of logits after view: torch.Size([1, 2018])
log_score: tensor([-4.2539], device='cuda:0', dtype=torch.float16)
log_norm: tensor([3.0781], device='cuda:0', dtype=torch.float16)
loss: tensor([7.3320], device='cuda:0', dtype=torch.float16)
final loss: tensor(7.3320, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[680,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
end_loss: tensor(7.3320, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]],

start_positions: tensor([[ 766, 1176,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
start_loss: tensor(7.3867, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 1660])
size of target: torch.Size([1, 64])
target: tensor([[ 766, 1176,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   

Epoch 24:  90%|█████████ | 73/81 [00:31<00:03,  2.64batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=14, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 943])
size of attention_mask: torch.Size([1, 943])
size of segment_ids: torch.Size([1, 943])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 27])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([17])
p_index: tensor([ 17,  70, 148, 248, 385, 430, 556, 634, 672, 877], device='cuda:0')
size of sequence_output: torch.Size([1, 943, 768])
size of logits: torch.Size([1, 943, 2])
size of st

sorted_answers: [{'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.92138671875}, {'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.994140625, 'end_logit': 0.92138671875}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.8251953125}, {'start': tensor(136, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.94580078125, 'end_logit': 0.8251953125}, {'start': tensor(149, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.9189453125, 'end_logit': 0.8251953125}, {'start': tensor(151, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.876953125, 'end_logit': 0.8251953125}, {'start': tensor(143, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.87060546875, 'end_logit': 0.8251953125}, {'start': tensor(401, device='cuda:0'), 'e

Epoch 24:  91%|█████████▏| 74/81 [00:31<00:02,  2.94batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=14, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1662])
size of attention_mask: torch.Size([1, 1662])
size of segment_ids: torch.Size([1, 1662])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  15,  126,  231,  459,  649,  767,  940, 1114, 1365, 1545],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1662, 768])
size of logits: torch.Size([1

loss: tensor([6.6094], device='cuda:0', dtype=torch.float16)
final loss: tensor(6.6094, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[678,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
end_loss: tensor(6.6094, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[-0.0268]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[-0.0268]], device='cuda:0', dtype=torch.float16)
size of gather

Epoch 24:  93%|█████████▎| 75/81 [00:32<00:02,  2.97batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=14, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1320])
size of attention_mask: torch.Size([1, 1320])
size of segment_ids: torch.Size([1, 1320])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([  29,  168,  438,  601,  692,  765,  953, 1068, 1127, 1208],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1320, 768])
size of logits: torch.Size([1

loss: tensor([0.9297], device='cuda:0', dtype=torch.float16)
final loss: tensor(0.9297, device='cuda:0', dtype=torch.float16)
total_loss: tensor(9.9197, device='cuda:0')
loss: tensor(9.9197, device='cuda:0')
decode
question_end_index:  tensor([28], device='cuda:0')
start_logits_indices:  tensor([611, 173, 775, 129, 191, 548, 667, 550, 300, 787, 791, 211,  41, 643,
        644, 519, 169, 671, 669, 439], device='cuda:0')
type_logits:  tensor([[0.5020, 0.4502, 0.4207]], device='cuda:0')
p_type:  0
p_type_score:  0.501953125
sorted_answers: [{'start': tensor(611, device='cuda:0'), 'end': tensor(611, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.5830078125}, {'start': tensor(173, device='cuda:0'), 'end': tensor(173, device='cuda:0'), 'start_logit': 0.98876953125, 'end_logit': 0.341796875}, {'start': tensor(548, device='cuda:0'), 'end': tensor(549, device='cuda:0'), 'start_logit': 0.93603515625, 'end_logit': 0.345703125}, {'start': tensor(667, device='cuda:0'), 'end': tensor

sorted_answers: [{'start': tensor(130, device='cuda:0'), 'end': tensor(133, device='cuda:0'), 'start_logit': 0.9609375, 'end_logit': 0.91650390625}, {'start': tensor(131, device='cuda:0'), 'end': tensor(133, device='cuda:0'), 'start_logit': 0.8740234375, 'end_logit': 0.91650390625}, {'start': tensor(22, device='cuda:0'), 'end': tensor(29, device='cuda:0'), 'start_logit': 0.986328125, 'end_logit': 0.6279296875}, {'start': tensor(364, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.60400390625}, {'start': tensor(365, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 0.95654296875, 'end_logit': 0.60400390625}, {'start': tensor(673, device='cuda:0'), 'end': tensor(674, device='cuda:0'), 'start_logit': 0.916015625, 'end_logit': 0.60107421875}, {'start': tensor(159, device='cuda:0'), 'end': tensor(161, device='cuda:0'), 'start_logit': 0.98095703125, 'end_logit': 0.42041015625}, {'start': tensor(364, device='cuda:0'), 'en

loss: tensor(7.7081, device='cuda:0')
decode
question_end_index:  tensor([33], device='cuda:0')
start_logits_indices:  tensor([ 117,  657, 1110, 1278,   67, 1007,  790,  712, 1093,   50,  539,   92,
        1132,  680,  673,  328, 1118, 1122,  124,  547], device='cuda:0')
type_logits:  tensor([[0.5000, 0.4639, 0.4199]], device='cuda:0')
p_type:  0
p_type_score:  0.5
sorted_answers: [{'start': tensor(1110, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.57275390625}, {'start': tensor(1132, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.9482421875, 'end_logit': 0.57275390625}, {'start': tensor(1110, device='cuda:0'), 'end': tensor(1110, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.491943359375}, {'start': tensor(117, device='cuda:0'), 'end': tensor(117, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.481689453125}, {'start': tensor(1093, device='cuda:0'), 'end': tensor(1110, de

log_norm: tensor([0.9307], device='cuda:0', dtype=torch.float16)
loss: tensor([0.9043], device='cuda:0', dtype=torch.float16)
final loss: tensor(0.9043, device='cuda:0', dtype=torch.float16)
total_loss: tensor(9.7550, device='cuda:0')
loss: tensor(9.7550, device='cuda:0')
decode
question_end_index:  tensor([12], device='cuda:0')
start_logits_indices:  tensor([117, 141, 136, 153, 785, 167, 144, 654, 172, 684, 692, 623, 630,  23,
        552, 145, 147, 118, 149, 610], device='cuda:0')
type_logits:  tensor([[0.5063, 0.4543, 0.4041]], device='cuda:0')
p_type:  0
p_type_score:  0.50634765625
sorted_answers: [{'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.744140625}, {'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.744140625}, {'start': tensor(153, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.7

Epoch 24:  98%|█████████▊| 79/81 [00:33<00:00,  3.27batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=14, gpu=0, loss=5.470, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 933])
size of attention_mask: torch.Size([1, 933])
size of segment_ids: torch.Size([1, 933])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([ 21, 190, 263, 282, 398, 474, 554, 645, 757, 849], device='cuda:0')
size of sequence_output: torch.Size([1, 933, 768])
size of logits: torch.Size([1, 933, 2])
size of st

sorted_answers: [{'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.931640625, 'end_logit': 0.9755859375}, {'start': tensor(758, device='cuda:0'), 'end': tensor(759, device='cuda:0'), 'start_logit': 0.8798828125, 'end_logit': 0.48583984375}, {'start': tensor(646, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 0.9931640625, 'end_logit': 0.33837890625}, {'start': tensor(642, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 0.98193359375, 'end_logit': 0.33837890625}, {'start': tensor(779, device='cuda:0'), 'end': tensor(791, device='cuda:0'), 'start_logit': 0.98583984375, 'end_logit': 0.320556640625}, {'start': tensor(779, device='cuda:0'), 'end': tensor(781, device='cuda:0'), 'start_logit': 0.98583984375, 'end_logit': 0.307861328125}, {'start': tensor(777, device='cuda:0'), 'end': tensor(777, device='cuda:0'), 'start_logit': 0.94677734375, 'end_logit': 0.323974609375}, {'start': tensor(777, device='cud

Epoch 24:  99%|█████████▉| 80/81 [00:33<00:00,  3.44batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=14, gpu=0, loss=5.470, v_num=0, val_loss=7.73]
                                                              [A

INFO:root:
Epoch 00023: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 1118])
size of attention_mask: torch.Size([1, 1118])
size of segment_ids: torch.Size([1, 1118])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 46])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([36])
p_index: tensor([  16,  192,  327,  526,  638,  666,  750,  868,  927, 1015],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1118, 768])
size of logits: torch.Size([1, 1118, 2])
size of start_logits: torch.Size([1, 1118, 1])
size of start_logits after squeeze: torch.Size([1, 1118])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 

end_loss: tensor(0.0156, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward0>)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.0019]], device='cuda:0', dtype=torch.float16,
       grad_fn=<GatherBackward>)
gathered_logits after -inf: tensor([[0.0019]], device='cuda:0', dtype=torch.float16,
       grad_fn=<IndexPutBackward>)
size of gathered_logits after view: torch.Size([1, 1])
size of logits after view: torch.Size([1, 3])
log_score: tensor([0.0019], device='cuda:0', dtype=torch.float16,
       grad_fn=<LogsumexpBackward>)
log_norm: tensor([0.9316], device='cuda:0', dtype=torch.float16,
       grad_fn=<LogsumexpBackward>)
loss: tensor([0.9297], device='cuda:0', dtype=torch.float16, grad_fn=<NegBackward>)
final loss: tensor(0.9297, device='cuda:0', dtype=torch.float

Epoch 25:   1%|          | 1/81 [00:00<00:46,  1.73batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=0, gpu=0, loss=4.732, v_num=0, val_loss=7.73] size of input_ids: torch.Size([1, 1314])
size of attention_mask: torch.Size([1, 1314])
size of segment_ids: torch.Size([1, 1314])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 51])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([41])
p_index: tensor([  23,  142,  212,  402,  481,  676,  794,  899,  947, 1129],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1314, 768])
size of logits: torch.Size([1, 1314, 2])
size 

Epoch 25:   2%|▏         | 2/81 [00:01<00:48,  1.64batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=1, gpu=0, loss=4.732, v_num=0, val_loss=7.73]size of input_ids: torch.Size([1, 2078])
size of attention_mask: torch.Size([1, 2078])
size of segment_ids: torch.Size([1, 2078])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 68])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([58])
p_index: tensor([  34,  209,  391,  583,  766,  834, 1210, 1396, 1694, 1905],
       device='cuda:0')
size of sequence_output: torch.Size([1, 2078, 768])
size of logits: torch.Size([1, 2078, 2])
size o

Epoch 25:   4%|▎         | 3/81 [00:02<00:52,  1.48batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=2, gpu=0, loss=4.732, v_num=0, val_loss=7.73]
Validating:   0%|          | 0/13 [00:00<?, ?batch/s][Avalidation_step
size of input_ids: torch.Size([1, 1011])
size of attention_mask: torch.Size([1, 1011])
size of segment_ids: torch.Size([1, 1011])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([ 52, 167, 321, 453, 521, 572, 697, 747, 808, 900], device='cuda:0')
size of sequence_output: torch.Size([1, 101

sorted_answers: [{'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99755859375}, {'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99560546875}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99853515625, 'end_logit': 0.99560546875}, {'start': tensor(611, device='cuda:0'), 'end': tensor(626, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.990234375}, {'start': tensor(118, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.98828125}, {'start': tensor(221, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98486328125}, {'start': tensor(215, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.98486328125}, {'start': tensor(118, device='cuda:0'), '

loss: tensor([4.2266], device='cuda:0', dtype=torch.float16)
final loss: tensor(4.2266, device='cuda:0', dtype=torch.float16)
start_positions: tensor([[ 694, 1008, 1243,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
start_loss: tensor(4.2266, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 1649])
size of target: torch.Size([1, 64])
target: tensor([[ 696, 1010, 1245,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   

Epoch 25:   6%|▌         | 5/81 [00:03<00:45,  1.68batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=2, gpu=0, loss=4.732, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 851])
size of attention_mask: torch.Size([1, 851])
size of segment_ids: torch.Size([1, 851])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 35])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([25])
p_index: tensor([ 13, 116, 151, 295, 374, 455, 501, 590, 683, 720], device='cuda:0')
size of sequence_output: torch.Size([1, 851, 768])
size of logits: torch.Size([1, 851, 2])
size of star

sorted_answers: [{'start': tensor(144, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(149, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99609375}, {'start': tensor(109, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(117, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(697, device='cuda:0'), 'end': tensor(726, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9921875}, {'start': tensor(721, device='cuda:0'), 'end': t

loss: tensor([0.9341], device='cuda:0', dtype=torch.float16)
final loss: tensor(0.9341, device='cuda:0', dtype=torch.float16)
total_loss: tensor(8.8241, device='cuda:0')
loss: tensor(8.8241, device='cuda:0')
decode
question_end_index:  tensor([28], device='cuda:0')
start_logits_indices:  tensor([ 439,  191,  611,  563,  775,  715,  173,  211,  129, 1077,  787, 1225,
         300,  192,  964,  453,  871,  591,  217,   30], device='cuda:0')
type_logits:  tensor([[0.5337, 0.4883, 0.4487]], device='cuda:0')
p_type:  0
p_type_score:  0.53369140625
sorted_answers: [{'start': tensor(775, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98583984375}, {'start': tensor(787, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98583984375}, {'start': tensor(129, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.96435546875}, {'start': tensor(173, devic

start_positions: tensor([[258, 283, 332, 365, 392, 429,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
start_loss: tensor(4.2266, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 1187])
size of target: torch.Size([1, 64])
target: tensor([[259, 283, 333, 365, 392, 429,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
masked_target: tensor([[259, 2

Epoch 25:  10%|▉         | 8/81 [00:04<00:30,  2.38batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=2, gpu=0, loss=4.732, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 943])
size of attention_mask: torch.Size([1, 943])
size of segment_ids: torch.Size([1, 943])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 27])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([17])
p_index: tensor([ 17,  70, 148, 248, 385, 430, 556, 634, 672, 877], device='cuda:0')
size of sequence_output: torch.Size([1, 943, 768])
size of logits: torch.Size([1, 943, 2])
size of star

sorted_answers: [{'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(557, device='cuda:0'), 'end': tensor(559, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(71, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(66, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99853515625}, {'start': tensor(58, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.99853515625}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(401, device='cuda:0'), 'end': ten

target: tensor([[ 134,  164,  187,  519,  592,  642, 1001, 1096, 1623,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
masked_target: tensor([[ 134,  164,  187,  519,  592,  642, 1001, 1096, 1623,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 64])
gathered_logits: t

Epoch 25:  12%|█▏        | 10/81 [00:05<00:25,  2.81batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=2, gpu=0, loss=4.732, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 933])
size of attention_mask: torch.Size([1, 933])
size of segment_ids: torch.Size([1, 933])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([ 21, 190, 263, 282, 398, 474, 554, 645, 757, 849], device='cuda:0')
size of sequence_output: torch.Size([1, 933, 768])
size of logits: torch.Size([1, 933, 2])
size of sta

sorted_answers: [{'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(642, device='cuda:0'), 'end': tensor(647, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.9970703125}, {'start': tensor(779, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(642, device='cuda:0'), 'end': tensor(655, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99169921875}, {'start': tensor(642, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99072265625}, {'start': tensor(450, device='cuda:0'), 'end': tensor(476, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.982421875}, {'start': tensor(779, device='cuda:0'), 'end': tensor(790, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.97705078125}, {'start': tensor(642, device='cuda:0'), 'end': tensor(643, device

log_norm: tensor([8.4375], device='cuda:0', dtype=torch.float16)
loss: tensor([inf], device='cuda:0', dtype=torch.float16)
final loss: tensor(0., device='cuda:0', dtype=torch.float16)
end_positions: tensor([[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]], device='cuda:0')
end_loss: tensor(0., device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[2]], device='cuda:0')
masked_target: tensor([[2]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[-0.2452]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[-0.2452]], device='cuda:0', dtype=torch.float16)
size of gathered_logits after vie

or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.1389]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[0.1389]], device='cuda:0', dtype=torch.float16)
size of gathered_logits after view: torch.Size([1, 1])
size of logits after view: torch.Size([1, 3])
log_score: tensor([0.1389], device='cuda:0', dtype=torch.float16)
log_norm: tensor([1.0742], device='cuda:0', dtype=torch.float16)
loss: tensor([0.9355], device='cuda:0', dtype=torch.float16)
final loss: tensor(0.9355, device='cuda:0', dtype=torch.float16)
total_loss: tensor(5.7289, device='cuda:0')
loss: tensor(5.7289, device='cuda:0')
decode
question_end_index:  tensor([33], device='cuda:0')
start_logits_indices:  tensor([  67, 1110, 1093,   50, 1278, 1295,  848, 1111,   88,  117, 1122,  657,
   

gathered_logits after -inf: tensor([[3.1523,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf]], device='cuda:0', dtype=torch.float16)
size of gathered_logits after view: torch.Size([1, 64])
size of logits after view: torch.Size([1, 1662])
log_score: tensor([3.1523], device='cuda:0', dtype=torch.float16)
log_norm: tensor([9.9922], device='cuda:0', dtype=torch.float16)
loss: tensor([6.8398], device='cuda:0', dtype=torch.float16)
final loss: tensor(6.8398, device='cuda:0', dtype=t

Epoch 25:  17%|█▋        | 14/81 [00:06<00:21,  3.14batch/s, avg_val_em=0.0769, avg_val_f1=0.0769, avg_val_joint_em=0, avg_val_joint_f1=0.00716, avg_val_joint_prec=0.00375, avg_val_joint_recall=0.0769, avg_val_prec=0.0769, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0.102, avg_val_sp_sent_prec=0.0539, avg_val_sp_sent_recall=1, batch_idx=2, gpu=0, loss=4.732, v_num=0, val_loss=7.73]validation_step
size of input_ids: torch.Size([1, 1660])
size of attention_mask: torch.Size([1, 1660])
size of segment_ids: torch.Size([1, 1660])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 57])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([47])
p_index: tensor([  24,  161,  462,  591,  620,  754,  955, 1154, 1325, 1524],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1660, 768])
size of logits: torch.Size([1,

s_to_p_map: [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9]
sp_sent_output tensor([[[0.4055],
         [0.4270],
         [0.4265],
         [0.4312],
         [0.4309],
         [0.4143],
         [0.4268],
         [0.4287],
         [0.4280],
         [0.4270],
         [0.4258],
         [0.4292],
         [0.4299],
         [0.4248],
         [0.4287],
         [0.4285],
         [0.4275],
         [0.4282],
         [0.4280],
         [0.4050],
         [0.4148],
         [0.4314],
         [0.4282],
         [0.4236],
         [0.4177],
         [0.4282],
         [0.4282],
         [0.4253],
         [0.4260],
         [0.4260],
         [0.4275],
         [0.4287],
         [0.4094],
         [0.4270],
         [0.4280],
         [0.4282],
         [0.4253],
         [0.4282],
         [0.4270],
         [0.4182],
         [0.4258],
         [0.4258],
   

sp_sent_output tensor([[[0.3933],
         [0.3921],
         [0.3926],
         [0.3967],
         [0.3943],
         [0.3943],
         [0.3914],
         [0.3916],
         [0.3960],
         [0.3940],
         [0.4070],
         [0.3914],
         [0.3945],
         [0.3977],
         [0.4102],
         [0.3977],
         [0.3938],
         [0.3938],
         [0.4160],
         [0.3921],
         [0.3931],
         [0.3931],
         [0.4065],
         [0.3977],
         [0.3950],
         [0.4004],
         [0.3928],
         [0.3914],
         [0.3938],
         [0.3943],
         [0.3943],
         [0.3894],
         [0.3906],
         [0.3936],
         [0.3904],
         [0.3901],
         [0.3923],
         [0.4277],
         [0.3923],
         [0.3987],
         [0.3962],
         [0.4080],
         [0.4009],
         [0.3943],
         [0.3945],
         [0.3972],
         [0.3926],
         [0.3906],
         [0.3950],
         [0.3982],
         [0.3933],
         [0.3948

INFO:root:
Epoch 00024: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 1079])
size of attention_mask: torch.Size([1, 1079])
size of segment_ids: torch.Size([1, 1079])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 39])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([29])
p_index: tensor([ 22, 119, 144, 322, 457, 521, 742, 801, 887, 931], device='cuda:0')
size of sequence_output: torch.Size([1, 1079, 768])
size of logits: torch.Size([1, 1079, 2])
size of start_logits: torch.Size([1, 1079, 1])
size of start_logits after squeeze: torch.Size([1, 1079])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_

or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.1377]], device='cuda:0', dtype=torch.float16,
       grad_fn=<GatherBackward>)
gathered_logits after -inf: tensor([[0.1377]], device='cuda:0', dtype=torch.float16,
       grad_fn=<IndexPutBackward>)
size of gathered_logits after view: torch.Size([1, 1])
size of logits after view: torch.Size([1, 3])
log_score: tensor([0.1377], device='cuda:0', dtype=torch.float16,
       grad_fn=<LogsumexpBackward>)
log_norm: tensor([1.1230], device='cuda:0', dtype=torch.float16,
       grad_fn=<LogsumexpBackward>)
loss: tensor([0.9854], device='cuda:0', dtype=torch.float16, grad_fn=<NegBackward>)
final loss: tensor(0.9854, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward0>)
total_loss: tensor(7.2942, device='cuda:0', grad_fn=<AddBac

end_positions: tensor([[582,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
end_loss: tensor(0.7109, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward0>)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.0634]], device='cuda:0', dtype=torch.float16,
       grad_fn=<GatherBackward>)
gathered_logits after -inf: tensor([[0.0634]], device='cuda:0', dtype=torch.float16,
       grad_fn=<IndexPutBackward>)
size of gathered_logits after view: torch.Size([1,

loss: tensor([0.9370], device='cuda:0', dtype=torch.float16, grad_fn=<NegBackward>)
final loss: tensor(0.9370, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward0>)
total_loss: tensor(5.0810, device='cuda:0', grad_fn=<AddBackward0>)
returned loss:  tensor(5.0810, device='cuda:0', grad_fn=<AddBackward0>)
self.trainer.optimizers[0].param_groups[0]['lr']:  2.6800000000000004e-05
loss:  tensor(5.0810, device='cuda:0', grad_fn=<AddBackward0>)
lr:  tensor([2.6800e-05], device='cuda:0')
Epoch 25:  23%|██▎       | 19/81 [00:08<00:33,  1.88batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=5, gpu=0, loss=4.732, v_num=0, val_loss=6.84]
Validating:   0%|          | 0/13 [00:00<?, ?batch/s][Avalidation_step
size of input_ids: torch.Size([1, 1400])
size of attention_ma

sorted_answers: [{'start': tensor(364, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(22, device='cuda:0'), 'end': tensor(29, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9951171875}, {'start': tensor(364, device='cuda:0'), 'end': tensor(381, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.98779296875}, {'start': tensor(381, device='cuda:0'), 'end': tensor(381, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(381, device='cuda:0'), 'end': tensor(402, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98583984375}, {'start': tensor(186, device='cuda:0'), 'end': tensor(189, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98291015625}, {'start': tensor(541, device='cuda:0'), 'end': tensor(550, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98193359375}, {'start': tensor(22, device='cuda:0'), 'end': tensor(2

sorted_answers: [{'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(557, device='cuda:0'), 'end': tensor(559, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(71, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(66, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99853515625}, {'start': tensor(58, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.99853515625}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(401, device='cuda:0'), 'end': ten

question_end_index:  tensor([33], device='cuda:0')
start_logits_indices:  tensor([  67, 1110, 1093,   50, 1278, 1295,  848, 1111,   88,  117, 1122,  657,
         218,  539, 1007,  790,  311,  328,  680,  712], device='cuda:0')
type_logits:  tensor([[0.5347, 0.4858, 0.4551]], device='cuda:0')
p_type:  0
p_type_score:  0.53466796875
sorted_answers: [{'start': tensor(311, device='cuda:0'), 'end': tensor(318, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99853515625}, {'start': tensor(712, device='cuda:0'), 'end': tensor(715, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99267578125}, {'start': tensor(1122, device='cuda:0'), 'end': tensor(1143, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.982421875}, {'start': tensor(1110, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.9794921875}, {'start': tensor(67, device='cuda:0'), 'end': tensor(94, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.979003

gathered_logits: tensor([[ 3.1523, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238,
         -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238,
         -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238,
         -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238,
         -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238,
         -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238,
         -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238,
         -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238, -3.4238]],
       device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[3.1523,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf, 

Epoch 25:  28%|██▊       | 23/81 [00:10<00:23,  2.46batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=5, gpu=0, loss=4.732, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1011])
size of attention_mask: torch.Size([1, 1011])
size of segment_ids: torch.Size([1, 1011])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([ 52, 167, 321, 453, 521, 572, 697, 747, 808, 900], device='cuda:0')
size of sequence_output: torch.Size([1, 1011, 768])
size of logits: torch.Size([1, 1011, 2])
size of start_logits: torch.Size([1, 

sorted_answers: [{'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99755859375}, {'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99560546875}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99853515625, 'end_logit': 0.99560546875}, {'start': tensor(611, device='cuda:0'), 'end': tensor(626, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.990234375}, {'start': tensor(118, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.98828125}, {'start': tensor(221, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98486328125}, {'start': tensor(215, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.98486328125}, {'start': tensor(118, device='cuda:0'), '

gathered_logits after -inf: tensor([[5.3594, 4.6484, 2.6914, 0.5166, 4.6953, 0.4883, 2.8672, 2.8516, 1.2988,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf]], device='cuda:0', dtype=torch.float16)
size of gathered_logits after view: torch.Size([1, 64])
size of logits after view: torch.Size([1, 1849])
log_score: tensor([6.1797], device='cuda:0', dtype=torch.float16)
log_norm: tensor([10.6562], device='cuda:0', dtype=torch.float16)
loss: tensor([4.4766], device='cuda:0', dtype=torch.float16)
final loss: tensor(4.4766, device='cuda:0', dtype=

Epoch 25:  31%|███       | 25/81 [00:11<00:20,  2.70batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=5, gpu=0, loss=4.732, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 851])
size of attention_mask: torch.Size([1, 851])
size of segment_ids: torch.Size([1, 851])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 35])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([25])
p_index: tensor([ 13, 116, 151, 295, 374, 455, 501, 590, 683, 720], device='cuda:0')
size of sequence_output: torch.Size([1, 851, 768])
size of logits: torch.Size([1, 851, 2])
size of start_logits: torch.Size([1, 851, 

sorted_answers: [{'start': tensor(144, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(149, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99609375}, {'start': tensor(109, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(117, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(697, device='cuda:0'), 'end': tensor(726, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9921875}, {'start': tensor(721, device='cuda:0'), 'end': t

target: tensor([[680,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
masked_target: tensor([[680,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 64])
gathered_logits: tensor([[-3.5352, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836,
         -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.

Epoch 25:  33%|███▎      | 27/81 [00:11<00:18,  2.97batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=5, gpu=0, loss=4.732, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1649])
size of attention_mask: torch.Size([1, 1649])
size of segment_ids: torch.Size([1, 1649])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  25,  197,  495,  584,  670,  828, 1003, 1325, 1489, 1570],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1649, 768])
size of logits: torch.Size([1, 1649, 2])
size of start_logits

Epoch 25:  35%|███▍      | 28/81 [00:12<00:17,  2.98batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=5, gpu=0, loss=4.732, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1660])
size of attention_mask: torch.Size([1, 1660])
size of segment_ids: torch.Size([1, 1660])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 57])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([47])
p_index: tensor([  24,  161,  462,  591,  620,  754,  955, 1154, 1325, 1524],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1660, 768])
size of logits: torch.Size([1, 1660, 2])
size of start_logits

loss: tensor([6.3359], device='cuda:0', dtype=torch.float16)
final loss: tensor(6.3359, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[ 766, 1176,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
end_loss: tensor(6.3359, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.0872]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: 

size of sequence_output: torch.Size([1, 933, 768])
size of logits: torch.Size([1, 933, 2])
size of start_logits: torch.Size([1, 933, 1])
size of start_logits after squeeze: torch.Size([1, 933])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_output_t after tanh: torch.Size([1, 10, 768])
size of sp_para_output_t: torch.Size([1, 10, 1])
size of sp_para_output_aux: torch.Size([1, 10, 1])
size of predict_support_para: torch.Size([1, 10, 2])
size of sent_indexes: torch.Size([37])
sent_indexes:  tensor([ 21,  54,  68,  97, 157, 176, 190, 216, 263, 282, 315, 362, 398, 417,
        441, 456, 474, 488, 510, 523, 543, 554, 578, 599, 616, 645, 675, 696,
        709, 745, 757, 772, 805, 829, 849, 871, 913], device='cuda:0')


sorted_answers: [{'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(642, device='cuda:0'), 'end': tensor(647, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.9970703125}, {'start': tensor(779, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(642, device='cuda:0'), 'end': tensor(655, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99169921875}, {'start': tensor(642, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99072265625}, {'start': tensor(450, device='cuda:0'), 'end': tensor(476, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.982421875}, {'start': tensor(779, device='cuda:0'), 'end': tensor(790, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.97705078125}, {'start': tensor(642, device='cuda:0'), 'end': tensor(643, device

final loss: tensor(4.2070, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[259, 283, 333, 365, 392, 429,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
end_loss: tensor(4.2070, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.1490]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[0.1490]], device='cuda:0', dtype=torch.float16)
size of gathered_logits after view: torch.Size([1, 1])
size of logits after v

log_norm: tensor([1.0693], device='cuda:0', dtype=torch.float16)
loss: tensor([0.9341], device='cuda:0', dtype=torch.float16)
final loss: tensor(0.9341, device='cuda:0', dtype=torch.float16)
total_loss: tensor(8.8241, device='cuda:0')
loss: tensor(8.8241, device='cuda:0')
decode
question_end_index:  tensor([28], device='cuda:0')
start_logits_indices:  tensor([ 439,  191,  611,  563,  775,  715,  173,  211,  129, 1077,  787, 1225,
         300,  192,  964,  453,  871,  591,  217,   30], device='cuda:0')
type_logits:  tensor([[0.5337, 0.4883, 0.4487]], device='cuda:0')
p_type:  0
p_type_score:  0.53369140625
sorted_answers: [{'start': tensor(775, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98583984375}, {'start': tensor(787, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98583984375}, {'start': tensor(129, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.9995

INFO:root:
Epoch 00024: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 1459])
size of attention_mask: torch.Size([1, 1459])
size of segment_ids: torch.Size([1, 1459])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  26,  145,  282,  567,  597,  908, 1029, 1157, 1234, 1320],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1459, 768])
size of logits: torch.Size([1, 1459, 2])
size of start_logits: torch.Size([1, 1459, 1])
size of start_logits after squeeze: torch.Size([1, 1459])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 

end_loss: tensor(0.7695, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward0>)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.0333]], device='cuda:0', dtype=torch.float16,
       grad_fn=<GatherBackward>)
gathered_logits after -inf: tensor([[0.0333]], device='cuda:0', dtype=torch.float16,
       grad_fn=<IndexPutBackward>)
size of gathered_logits after view: torch.Size([1, 1])
size of logits after view: torch.Size([1, 3])
log_score: tensor([0.0333], device='cuda:0', dtype=torch.float16,
       grad_fn=<LogsumexpBackward>)
log_norm: tensor([1.0107], device='cuda:0', dtype=torch.float16,
       grad_fn=<LogsumexpBackward>)
loss: tensor([0.9775], device='cuda:0', dtype=torch.float16, grad_fn=<NegBackward>)
final loss: tensor(0.9775, device='cuda:0', dtype=torch.float

loss: tensor([3.0898], device='cuda:0', dtype=torch.float16, grad_fn=<NegBackward>)
final loss: tensor(3.0898, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward0>)
end_positions: tensor([[117,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
end_loss: tensor(3.0898, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward0>)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.0194]], device='cuda:0', dtype=torch.float16,
       grad_fn=<GatherBackward

masked_target: tensor([[1046,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 64])
gathered_logits: tensor([[ 1.9844, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578,
         -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578,
         -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578,
         -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578,
         -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578,
         -3.2578, -3.2578, -3.2578, -3.2578, -3.2578, -3.2578

sorted_answers: [{'start': tensor(144, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(149, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99609375}, {'start': tensor(109, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(117, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(697, device='cuda:0'), 'end': tensor(726, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9921875}, {'start': tensor(721, device='cuda:0'), 'end': t

loss: tensor([6.8398], device='cuda:0', dtype=torch.float16)
final loss: tensor(6.8398, device='cuda:0', dtype=torch.float16)
start_positions: tensor([[674,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
start_loss: tensor(6.8398, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 1662])
size of target: torch.Size([1, 64])
target: tensor([[678,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -

Epoch 25:  46%|████▌     | 37/81 [00:16<00:22,  1.97batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=8, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1849])
size of attention_mask: torch.Size([1, 1849])
size of segment_ids: torch.Size([1, 1849])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 51])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([41])
p_index: tensor([  18,  171,  225,  495,  652,  970, 1256, 1442, 1596, 1696],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1849, 768])
size of logits: torch.Size([1, 1849, 2])
size of start_logits

Epoch 25:  47%|████▋     | 38/81 [00:16<00:19,  2.16batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=8, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1187])
size of attention_mask: torch.Size([1, 1187])
size of segment_ids: torch.Size([1, 1187])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([  31,   85,  144,  230,  289,  428,  466,  558,  816, 1111],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1187, 768])
size of logits: torch.Size([1, 1187, 2])
size of start_logits

sorted_answers: [{'start': tensor(559, device='cuda:0'), 'end': tensor(561, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.9990234375}, {'start': tensor(290, device='cuda:0'), 'end': tensor(297, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9921875}, {'start': tensor(608, device='cuda:0'), 'end': tensor(636, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(608, device='cuda:0'), 'end': tensor(615, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(69, device='cuda:0'), 'end': tensor(90, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.986328125}, {'start': tensor(87, device='cuda:0'), 'end': tensor(90, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.986328125}, {'start': tensor(290, device='cuda:0'), 'end': tensor(291, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98095703125}, {'start': tensor(326, device='cuda:0'), 'end': tensor(

sorted_answers: [{'start': tensor(364, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(22, device='cuda:0'), 'end': tensor(29, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9951171875}, {'start': tensor(364, device='cuda:0'), 'end': tensor(381, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.98779296875}, {'start': tensor(381, device='cuda:0'), 'end': tensor(381, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(381, device='cuda:0'), 'end': tensor(402, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98583984375}, {'start': tensor(186, device='cuda:0'), 'end': tensor(189, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98291015625}, {'start': tensor(541, device='cuda:0'), 'end': tensor(550, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98193359375}, {'start': tensor(22, device='cuda:0'), 'end': tensor(2

sorted_answers: [{'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(557, device='cuda:0'), 'end': tensor(559, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(71, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(66, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99853515625}, {'start': tensor(58, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.99853515625}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(401, device='cuda:0'), 'end': ten

sorted_answers: [{'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(642, device='cuda:0'), 'end': tensor(647, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.9970703125}, {'start': tensor(779, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(642, device='cuda:0'), 'end': tensor(655, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99169921875}, {'start': tensor(642, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99072265625}, {'start': tensor(450, device='cuda:0'), 'end': tensor(476, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.982421875}, {'start': tensor(779, device='cuda:0'), 'end': tensor(790, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.97705078125}, {'start': tensor(642, device='cuda:0'), 'end': tensor(643, device

loss: tensor([0.9341], device='cuda:0', dtype=torch.float16)
final loss: tensor(0.9341, device='cuda:0', dtype=torch.float16)
total_loss: tensor(8.8241, device='cuda:0')
loss: tensor(8.8241, device='cuda:0')
decode
question_end_index:  tensor([28], device='cuda:0')
start_logits_indices:  tensor([ 439,  191,  611,  563,  775,  715,  173,  211,  129, 1077,  787, 1225,
         300,  192,  964,  453,  871,  591,  217,   30], device='cuda:0')
type_logits:  tensor([[0.5337, 0.4883, 0.4487]], device='cuda:0')
p_type:  0
p_type_score:  0.53369140625
sorted_answers: [{'start': tensor(775, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98583984375}, {'start': tensor(787, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98583984375}, {'start': tensor(129, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.96435546875}, {'start': tensor(173, devic

size of gathered_logits after view: torch.Size([1, 1])
size of logits after view: torch.Size([1, 3])
log_score: tensor([0.1389], device='cuda:0', dtype=torch.float16)
log_norm: tensor([1.0742], device='cuda:0', dtype=torch.float16)
loss: tensor([0.9355], device='cuda:0', dtype=torch.float16)
final loss: tensor(0.9355, device='cuda:0', dtype=torch.float16)
total_loss: tensor(5.7289, device='cuda:0')
loss: tensor(5.7289, device='cuda:0')
decode
question_end_index:  tensor([33], device='cuda:0')
start_logits_indices:  tensor([  67, 1110, 1093,   50, 1278, 1295,  848, 1111,   88,  117, 1122,  657,
         218,  539, 1007,  790,  311,  328,  680,  712], device='cuda:0')
type_logits:  tensor([[0.5347, 0.4858, 0.4551]], device='cuda:0')
p_type:  0
p_type_score:  0.53466796875
sorted_answers: [{'start': tensor(311, device='cuda:0'), 'end': tensor(318, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99853515625}, {'start': tensor(712, device='cuda:0'), 'end': tensor(715, device='

final loss: tensor(4.5391, device='cuda:0', dtype=torch.float16)
start_positions: tensor([[ 766, 1176,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
start_loss: tensor(4.5391, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 1660])
size of target: torch.Size([1, 64])
target: tensor([[ 766, 1176,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1, 

Epoch 25:  56%|█████▌    | 45/81 [00:18<00:11,  3.16batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=8, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 2018])
size of attention_mask: torch.Size([1, 2018])
size of segment_ids: torch.Size([1, 2018])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 58])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([48])
p_index: tensor([  25,  354,  470,  654,  801, 1279, 1398, 1416, 1547, 1879],
       device='cuda:0')
size of sequence_output: torch.Size([1, 2018, 768])
size of logits: torch.Size([1, 2018, 2])
size of start_logits

Epoch 25:  57%|█████▋    | 46/81 [00:19<00:11,  3.15batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=8, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1011])
size of attention_mask: torch.Size([1, 1011])
size of segment_ids: torch.Size([1, 1011])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([ 52, 167, 321, 453, 521, 572, 697, 747, 808, 900], device='cuda:0')
size of sequence_output: torch.Size([1, 1011, 768])
size of logits: torch.Size([1, 1011, 2])
size of start_logits: torch.Size([1, 

sorted_answers: [{'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99755859375}, {'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99560546875}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99853515625, 'end_logit': 0.99560546875}, {'start': tensor(611, device='cuda:0'), 'end': tensor(626, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.990234375}, {'start': tensor(118, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.98828125}, {'start': tensor(221, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98486328125}, {'start': tensor(215, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.98486328125}, {'start': tensor(118, device='cuda:0'), '

target: tensor([[ 696, 1010, 1245,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
masked_target: tensor([[ 696, 1010, 1245,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 64])
gathered_logits: t

Epoch 25:  59%|█████▉    | 48/81 [00:19<00:10,  3.16batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=8, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_end
before sync --> sizes: 13, 13, 13, 13
after sync --> sizes: 13, 13, 13, 13
answer_scores:  [0.8444010416666666, 0.83740234375, 0.8465169270833334, 0.8453776041666666, 0.8474934895833334, 0.8426106770833334, 0.8473307291666666, 0.8396809895833334, 0.8440755208333334, 0.8323567708333334, 0.8429361979166666, 0.83837890625, 0.8445638020833334]
f1_scores:  [0, 0, 0, 0, 0, 0, 0, 0, 0.6666666666666666, 0, 0, 0, 0]
em_scores:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
len(f1_scores):  13
avg_val_f1:  0.05128205128205128
len(em_scores):  13
avg_val_em:  0.0
len(prec_scores):  13
avg_val_prec:  0.038461538461538464
l

INFO:root:
Epoch 00024: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 1316])
size of attention_mask: torch.Size([1, 1316])
size of segment_ids: torch.Size([1, 1316])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 38])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([28])
p_index: tensor([  17,  310,  402,  562,  616,  652,  755,  928,  998, 1149],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1316, 768])
size of logits: torch.Size([1, 1316, 2])
size of start_logits: torch.Size([1, 1316, 1])
size of start_logits after squeeze: torch.Size([1, 1316])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 

Epoch 25:  60%|██████    | 49/81 [00:20<00:14,  2.27batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=9, gpu=0, loss=4.781, v_num=0, val_loss=6.84]size of input_ids: torch.Size([1, 1548])
size of attention_mask: torch.Size([1, 1548])
size of segment_ids: torch.Size([1, 1548])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 43])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([33])
p_index: tensor([  25,  322,  395,  568,  718,  863,  991, 1090, 1207, 1322],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1548, 768])
size of logits: torch.Size([1, 1548, 2])
size of start_logits: torch.Size([1,

end_positions: tensor([[609,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
end_loss: tensor(7.2070, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward0>)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.1604]], device='cuda:0', dtype=torch.float16,
       grad_fn=<GatherBackward>)
gathered_logits after -inf: tensor([[0.1604]], device='cuda:0', dtype=torch.float16,
       grad_fn=<IndexPutBackward>)
size of gathered_logits after view: torch.Size([1,

Epoch 25:  63%|██████▎   | 51/81 [00:21<00:15,  1.88batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=11, gpu=0, loss=4.781, v_num=0, val_loss=6.84]
Validating:   0%|          | 0/13 [00:00<?, ?batch/s][Avalidation_step
size of input_ids: torch.Size([1, 1649])
size of attention_mask: torch.Size([1, 1649])
size of segment_ids: torch.Size([1, 1649])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  25,  197,  495,  584,  670,  828, 1003, 1325, 1489, 1570],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1649, 768])
siz

Epoch 25:  64%|██████▍   | 52/81 [00:22<00:18,  1.58batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=11, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1011])
size of attention_mask: torch.Size([1, 1011])
size of segment_ids: torch.Size([1, 1011])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([ 52, 167, 321, 453, 521, 572, 697, 747, 808, 900], device='cuda:0')
size of sequence_output: torch.Size([1, 1011, 768])
size of logits: torch.Size([1, 1011, 2])
size of start_logits: torch.Size([1,

sorted_answers: [{'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99755859375}, {'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99560546875}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99853515625, 'end_logit': 0.99560546875}, {'start': tensor(611, device='cuda:0'), 'end': tensor(626, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.990234375}, {'start': tensor(118, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.98828125}, {'start': tensor(221, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98486328125}, {'start': tensor(215, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.98486328125}, {'start': tensor(118, device='cuda:0'), '

sorted_answers: [{'start': tensor(144, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(149, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99609375}, {'start': tensor(109, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(117, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(697, device='cuda:0'), 'end': tensor(726, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9921875}, {'start': tensor(721, device='cuda:0'), 'end': t

log_score: tensor([5.0117], device='cuda:0', dtype=torch.float16)
log_norm: tensor([11.2266], device='cuda:0', dtype=torch.float16)
loss: tensor([6.2148], device='cuda:0', dtype=torch.float16)
final loss: tensor(6.2148, device='cuda:0', dtype=torch.float16)
start_positions: tensor([[291, 317,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
start_loss: tensor(6.2148, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 1310])
size of target: torch.Size([1, 64])
target: tensor([[293, 318,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1

Epoch 25:  68%|██████▊   | 55/81 [00:23<00:11,  2.33batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=11, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 943])
size of attention_mask: torch.Size([1, 943])
size of segment_ids: torch.Size([1, 943])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 27])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([17])
p_index: tensor([ 17,  70, 148, 248, 385, 430, 556, 634, 672, 877], device='cuda:0')
size of sequence_output: torch.Size([1, 943, 768])
size of logits: torch.Size([1, 943, 2])
size of start_logits: torch.Size([1, 943,

sorted_answers: [{'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(557, device='cuda:0'), 'end': tensor(559, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(71, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(66, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99853515625}, {'start': tensor(58, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.99853515625}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(401, device='cuda:0'), 'end': ten

loss: tensor([4.2070], device='cuda:0', dtype=torch.float16)
final loss: tensor(4.2070, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[259, 283, 333, 365, 392, 429,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
end_loss: tensor(4.2070, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.1490]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[0.1490]], device='cuda:0', dtype=torch.float16)
size of gathered

target: tensor([[ 132,  161,  184,  516,  590,  639,  998, 1093, 1620,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
masked_target: tensor([[ 132,  161,  184,  516,  590,  639,  998, 1093, 1620,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 64])
gathered_logits: t

Epoch 25:  72%|███████▏  | 58/81 [00:24<00:08,  2.75batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=11, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 933])
size of attention_mask: torch.Size([1, 933])
size of segment_ids: torch.Size([1, 933])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([ 21, 190, 263, 282, 398, 474, 554, 645, 757, 849], device='cuda:0')
size of sequence_output: torch.Size([1, 933, 768])
size of logits: torch.Size([1, 933, 2])
size of start_logits: torch.Size([1, 933,

sorted_answers: [{'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(642, device='cuda:0'), 'end': tensor(647, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.9970703125}, {'start': tensor(779, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(642, device='cuda:0'), 'end': tensor(655, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99169921875}, {'start': tensor(642, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99072265625}, {'start': tensor(450, device='cuda:0'), 'end': tensor(476, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.982421875}, {'start': tensor(779, device='cuda:0'), 'end': tensor(790, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.97705078125}, {'start': tensor(642, device='cuda:0'), 'end': tensor(643, device

type_logits:  tensor([[0.5337, 0.4883, 0.4487]], device='cuda:0')
p_type:  0
p_type_score:  0.53369140625
sorted_answers: [{'start': tensor(775, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98583984375}, {'start': tensor(787, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98583984375}, {'start': tensor(129, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.96435546875}, {'start': tensor(173, device='cuda:0'), 'end': tensor(173, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9599609375}, {'start': tensor(611, device='cuda:0'), 'end': tensor(611, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.95458984375}, {'start': tensor(1225, device='cuda:0'), 'end': tensor(1225, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.955078125}, {'start': tensor(591, device='cuda:0'), 'end': tensor(611, device='cuda:0'

target: tensor([[ 766, 1176,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
masked_target: tensor([[ 766, 1176,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 64])
gathered_logits: t

Epoch 25:  75%|███████▌  | 61/81 [00:25<00:06,  3.09batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=11, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1400])
size of attention_mask: torch.Size([1, 1400])
size of segment_ids: torch.Size([1, 1400])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([  21,  128,  158,  363,  572,  672,  868,  967, 1132, 1198],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1400, 768])
size of logits: torch.Size([1, 1400, 2])
size of start_logit

Epoch 25:  77%|███████▋  | 62/81 [00:25<00:06,  3.02batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=11, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 2018])
size of attention_mask: torch.Size([1, 2018])
size of segment_ids: torch.Size([1, 2018])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 58])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([48])
p_index: tensor([  25,  354,  470,  654,  801, 1279, 1398, 1416, 1547, 1879],
       device='cuda:0')
size of sequence_output: torch.Size([1, 2018, 768])
size of logits: torch.Size([1, 2018, 2])
size of start_logit

Epoch 25:  78%|███████▊  | 63/81 [00:26<00:06,  2.95batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=11, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1662])
size of attention_mask: torch.Size([1, 1662])
size of segment_ids: torch.Size([1, 1662])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  15,  126,  231,  459,  649,  767,  940, 1114, 1365, 1545],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1662, 768])
size of logits: torch.Size([1, 1662, 2])
size of start_logit

Epoch 25:  79%|███████▉  | 64/81 [00:26<00:05,  2.97batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=11, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_end
before sync --> sizes: 13, 13, 13, 13
after sync --> sizes: 13, 13, 13, 13
answer_scores:  [0.8445638020833334, 0.83837890625, 0.8444010416666666, 0.8440755208333334, 0.8426106770833334, 0.8453776041666666, 0.8465169270833334, 0.8473307291666666, 0.8396809895833334, 0.8323567708333334, 0.8474934895833334, 0.8429361979166666, 0.83740234375]
f1_scores:  [0, 0, 0, 0.6666666666666666, 0, 0, 0, 0, 0, 0, 0, 0, 0]
em_scores:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
len(f1_scores):  13
avg_val_f1:  0.05128205128205128
len(em_scores):  13
avg_val_em:  0.0
len(prec_scores):  13
avg_val_prec:  0.038461538461538464


INFO:root:
Epoch 00024: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 1494])
size of attention_mask: torch.Size([1, 1494])
size of segment_ids: torch.Size([1, 1494])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 40])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([30])
p_index: tensor([  47,  208,  366,  466,  614,  767,  886,  982, 1165, 1222],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1494, 768])
size of logits: torch.Size([1, 1494, 2])
size of start_logits: torch.Size([1, 1494, 1])
size of start_logits after squeeze: torch.Size([1, 1494])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 

total_loss: tensor(5.4862, device='cuda:0', grad_fn=<AddBackward0>)
returned loss:  tensor(5.4862, device='cuda:0', grad_fn=<AddBackward0>)
self.trainer.optimizers[0].param_groups[0]['lr']:  2.7200000000000004e-05
loss:  tensor(5.4862, device='cuda:0', grad_fn=<AddBackward0>)
lr:  tensor([2.7200e-05], device='cuda:0')
Epoch 25:  80%|████████  | 65/81 [00:27<00:07,  2.18batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=12, gpu=0, loss=4.781, v_num=0, val_loss=6.84]size of input_ids: torch.Size([1, 994])
size of attention_mask: torch.Size([1, 994])
size of segment_ids: torch.Size([1, 994])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 29])
size of sp_para: t

Epoch 25:  81%|████████▏ | 66/81 [00:27<00:06,  2.18batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=13, gpu=0, loss=4.781, v_num=0, val_loss=6.84]size of input_ids: torch.Size([1, 878])
size of attention_mask: torch.Size([1, 878])
size of segment_ids: torch.Size([1, 878])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 32])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([22])
p_index: tensor([ 20, 187, 232, 324, 454, 552, 604, 677, 720, 821], device='cuda:0')
size of sequence_output: torch.Size([1, 878, 768])
size of logits: torch.Size([1, 878, 2])
size of start_logits: torch.Size([1, 878, 1])
size of sta

Epoch 25:  83%|████████▎ | 67/81 [00:28<00:06,  2.17batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=14, gpu=0, loss=4.781, v_num=0, val_loss=6.84]
Validating:   0%|          | 0/13 [00:00<?, ?batch/s][Avalidation_step
size of input_ids: torch.Size([1, 1187])
size of attention_mask: torch.Size([1, 1187])
size of segment_ids: torch.Size([1, 1187])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([  31,   85,  144,  230,  289,  428,  466,  558,  816, 1111],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1187, 768])
siz

total_loss: tensor(6.2761, device='cuda:0')
loss: tensor(6.2761, device='cuda:0')
decode
question_end_index:  tensor([30], device='cuda:0')
start_logits_indices:  tensor([ 423, 1075,  574,  984,  467,  231,  559,  608,  448,  489,  449,  483,
         313,  290,  326,   69,  207,  196,   87,  145], device='cuda:0')
type_logits:  tensor([[0.5371, 0.4810, 0.4426]], device='cuda:0')
p_type:  0
p_type_score:  0.537109375
sorted_answers: [{'start': tensor(559, device='cuda:0'), 'end': tensor(561, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.9990234375}, {'start': tensor(290, device='cuda:0'), 'end': tensor(297, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9921875}, {'start': tensor(608, device='cuda:0'), 'end': tensor(636, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(608, device='cuda:0'), 'end': tensor(615, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(69, device='cuda

sorted_answers: [{'start': tensor(311, device='cuda:0'), 'end': tensor(318, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99853515625}, {'start': tensor(712, device='cuda:0'), 'end': tensor(715, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99267578125}, {'start': tensor(1122, device='cuda:0'), 'end': tensor(1143, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.982421875}, {'start': tensor(1110, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.9794921875}, {'start': tensor(67, device='cuda:0'), 'end': tensor(94, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.97900390625}, {'start': tensor(1111, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9794921875}, {'start': tensor(1122, device='cuda:0'), 'end': tensor(1133, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9794921875}, {'start': tensor(88, device='cuda:0'), 'end': tensor

masked_target: tensor([[696,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 64])
gathered_logits: tensor([[ 2.5742, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824,
         -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824,
         -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824,
         -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824,
         -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824,
         -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824, -2.9824,
         -2.9824, -2.9824, -2.9824, -2.9824, -2.9824,

Epoch 25:  86%|████████▋ | 70/81 [00:29<00:04,  2.21batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=14, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 943])
size of attention_mask: torch.Size([1, 943])
size of segment_ids: torch.Size([1, 943])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 27])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([17])
p_index: tensor([ 17,  70, 148, 248, 385, 430, 556, 634, 672, 877], device='cuda:0')
size of sequence_output: torch.Size([1, 943, 768])
size of logits: torch.Size([1, 943, 2])
size of start_logits: torch.Size([1, 943,

sorted_answers: [{'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(557, device='cuda:0'), 'end': tensor(559, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(71, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(66, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99853515625}, {'start': tensor(58, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.99853515625}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(401, device='cuda:0'), 'end': ten

sorted_answers: [{'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(642, device='cuda:0'), 'end': tensor(647, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.9970703125}, {'start': tensor(779, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(642, device='cuda:0'), 'end': tensor(655, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99169921875}, {'start': tensor(642, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99072265625}, {'start': tensor(450, device='cuda:0'), 'end': tensor(476, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.982421875}, {'start': tensor(779, device='cuda:0'), 'end': tensor(790, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.97705078125}, {'start': tensor(642, device='cuda:0'), 'end': tensor(643, device

start_positions: tensor([[ 694, 1008, 1243,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
start_loss: tensor(4.2266, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 1649])
size of target: torch.Size([1, 64])
target: tensor([[ 696, 1010, 1245,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   

Epoch 25:  90%|█████████ | 73/81 [00:30<00:02,  2.84batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=14, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1662])
size of attention_mask: torch.Size([1, 1662])
size of segment_ids: torch.Size([1, 1662])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  15,  126,  231,  459,  649,  767,  940, 1114, 1365, 1545],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1662, 768])
size of logits: torch.Size([1, 1662, 2])
size of start_logit

sp_para_output tensor([[[0.4314],
         [0.4167],
         [0.4241],
         [0.4224],
         [0.4221],
         [0.4197],
         [0.4211],
         [0.4177],
         [0.4236],
         [0.4265]]], device='cuda:0')
sp_sent_pred: set()
sp_para_pred: set()
answers_pred: [{'text': ' 2006', 'score': 0.83740234375}]
answer_score: 0.83740234375
answer_text:  2006
answer_gold_token_ids: tensor([545,  12, 180,  12, 279], device='cuda:0')
answer_gold_tokens: ['Ġ16', '-', 'year', '-', 'old']
answer_gold:  16-year-old
f1: 0
prec: 0
recall: 0
em: 0

Epoch 25:  91%|█████████▏| 74/81 [00:30<00:02,  2.76batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=14, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 2018])
size of attentio

Epoch 25:  93%|█████████▎| 75/81 [00:31<00:02,  2.80batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=14, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1400])
size of attention_mask: torch.Size([1, 1400])
size of segment_ids: torch.Size([1, 1400])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([  21,  128,  158,  363,  572,  672,  868,  967, 1132, 1198],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1400, 768])
size of logits: torch.Size([1, 1400, 2])
size of start_logit

sorted_answers: [{'start': tensor(364, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(22, device='cuda:0'), 'end': tensor(29, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9951171875}, {'start': tensor(364, device='cuda:0'), 'end': tensor(381, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.98779296875}, {'start': tensor(381, device='cuda:0'), 'end': tensor(381, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(381, device='cuda:0'), 'end': tensor(402, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98583984375}, {'start': tensor(186, device='cuda:0'), 'end': tensor(189, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98291015625}, {'start': tensor(541, device='cuda:0'), 'end': tensor(550, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98193359375}, {'start': tensor(22, device='cuda:0'), 'end': tensor(2

sorted_answers: [{'start': tensor(144, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(149, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99609375}, {'start': tensor(109, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(117, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(697, device='cuda:0'), 'end': tensor(726, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9921875}, {'start': tensor(721, device='cuda:0'), 'end': t

sorted_answers: [{'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99755859375}, {'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99560546875}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99853515625, 'end_logit': 0.99560546875}, {'start': tensor(611, device='cuda:0'), 'end': tensor(626, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.990234375}, {'start': tensor(118, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.98828125}, {'start': tensor(221, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98486328125}, {'start': tensor(215, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.98486328125}, {'start': tensor(118, device='cuda:0'), '

start_positions: tensor([[ 132,  161,  184,  516,  590,  639,  998, 1093, 1620,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
start_loss: tensor(4.4766, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 1849])
size of target: torch.Size([1, 64])
target: tensor([[ 134,  164,  187,  519,  592,  642, 1001, 1096, 1623,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   

Epoch 25:  98%|█████████▊| 79/81 [00:32<00:00,  3.10batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=14, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1660])
size of attention_mask: torch.Size([1, 1660])
size of segment_ids: torch.Size([1, 1660])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 57])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([47])
p_index: tensor([  24,  161,  462,  591,  620,  754,  955, 1154, 1325, 1524],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1660, 768])
size of logits: torch.Size([1, 1660, 2])
size of start_logit

Epoch 25:  99%|█████████▉| 80/81 [00:32<00:00,  3.04batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=14, gpu=0, loss=4.781, v_num=0, val_loss=6.84]validation_end
before sync --> sizes: 13, 13, 13, 13
after sync --> sizes: 13, 13, 13, 13
answer_scores:  [0.8453776041666666, 0.8440755208333334, 0.8396809895833334, 0.8426106770833334, 0.8473307291666666, 0.8445638020833334, 0.83740234375, 0.8429361979166666, 0.8474934895833334, 0.8444010416666666, 0.83837890625, 0.8465169270833334, 0.8323567708333334]
f1_scores:  [0, 0.6666666666666666, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
em_scores:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
len(f1_scores):  13
avg_val_f1:  0.05128205128205128
len(em_scores):  13
avg_val_em:  0.0
len(prec_scores):  13
avg_val_prec:  0.038461538461538464


INFO:root:
Epoch 00024: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 1597])
size of attention_mask: torch.Size([1, 1597])
size of segment_ids: torch.Size([1, 1597])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 50])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([40])
p_index: tensor([  17,  196,  277,  474,  653,  968, 1072, 1223, 1353, 1463],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1597, 768])
size of logits: torch.Size([1, 1597, 2])
size of start_logits: torch.Size([1, 1597, 1])
size of start_logits after squeeze: torch.Size([1, 1597])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Epoch 26:   0%|          | 0/81 [00:00<00:40,  2.00batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=15, gpu=0, loss=4.983, v_num=0, val_loss=6.84] size of input_ids: torch.Size([1, 1459])
size of attention_mask: torch.Size([1, 1459])
size of segment_ids: torch.Size([1, 1459])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  26,  145,  282,  567,  597,  908, 1029, 1157, 1234, 1320],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1459, 768

gathered_logits after -inf: tensor([[0.2119]], device='cuda:0', dtype=torch.float16,
       grad_fn=<IndexPutBackward>)
size of gathered_logits after view: torch.Size([1, 1])
size of logits after view: torch.Size([1, 3])
log_score: tensor([0.2119], device='cuda:0', dtype=torch.float16,
       grad_fn=<LogsumexpBackward>)
log_norm: tensor([1.0508], device='cuda:0', dtype=torch.float16,
       grad_fn=<LogsumexpBackward>)
loss: tensor([0.8389], device='cuda:0', dtype=torch.float16, grad_fn=<NegBackward>)
final loss: tensor(0.8389, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward0>)
total_loss: tensor(3.7243, device='cuda:0', grad_fn=<AddBackward0>)
returned loss:  tensor(3.7243, device='cuda:0', grad_fn=<AddBackward0>)
self.trainer.optimizers[0].param_groups[0]['lr']:  2.7600000000000003e-05
loss:  tensor(3.7243, device='cuda:0', grad_fn=<AddBackward0>)
lr:  tensor([2.7600e-05], device='cuda:0')
Epoch 26:   1%|          | 1/81 [00:01<00:53,  1.50batch/s, avg_val_em=0, avg_val_f

lr:  tensor([2.7600e-05], device='cuda:0')
Epoch 26:   2%|▏         | 2/81 [00:01<00:48,  1.63batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=1, gpu=0, loss=4.983, v_num=0, val_loss=6.84]size of input_ids: torch.Size([1, 1345])
size of attention_mask: torch.Size([1, 1345])
size of segment_ids: torch.Size([1, 1345])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 40])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([30])
p_index: tensor([  37,  229,  410,  523,  666,  803,  895, 1137, 1189, 1240],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1345, 768])
size of logits: torch.Size([1, 1345

loss:  tensor(5.5273, device='cuda:0', grad_fn=<AddBackward0>)
lr:  tensor([2.7600e-05], device='cuda:0')
Epoch 26:   4%|▎         | 3/81 [00:02<00:46,  1.66batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=2, gpu=0, loss=4.983, v_num=0, val_loss=6.84]
Validating:   0%|          | 0/13 [00:00<?, ?batch/s][Avalidation_step
size of input_ids: torch.Size([1, 1400])
size of attention_mask: torch.Size([1, 1400])
size of segment_ids: torch.Size([1, 1400])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([  21,  128,  158,  363,  572,  672, 

sorted_answers: [{'start': tensor(364, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(22, device='cuda:0'), 'end': tensor(29, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9951171875}, {'start': tensor(364, device='cuda:0'), 'end': tensor(381, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.98779296875}, {'start': tensor(381, device='cuda:0'), 'end': tensor(381, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(381, device='cuda:0'), 'end': tensor(402, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98583984375}, {'start': tensor(186, device='cuda:0'), 'end': tensor(189, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98291015625}, {'start': tensor(541, device='cuda:0'), 'end': tensor(550, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98193359375}, {'start': tensor(22, device='cuda:0'), 'end': tensor(2

log_score: tensor([-3.5352], device='cuda:0', dtype=torch.float16)
log_norm: tensor([7.3086], device='cuda:0', dtype=torch.float16)
loss: tensor([10.8438], device='cuda:0', dtype=torch.float16)
final loss: tensor(10.8438, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[680,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
end_loss: tensor(10.8438, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.1471]], devi

start_positions: tensor([[ 766, 1176,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
start_loss: tensor(4.5391, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 1660])
size of target: torch.Size([1, 64])
target: tensor([[ 766, 1176,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   

Epoch 26:   7%|▋         | 6/81 [00:03<00:36,  2.04batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=2, gpu=0, loss=4.983, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1649])
size of attention_mask: torch.Size([1, 1649])
size of segment_ids: torch.Size([1, 1649])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  25,  197,  495,  584,  670,  828, 1003, 1325, 1489, 1570],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1649, 768])
size of logits: torch.Size([1, 1649, 2])
size of start_logits:

Epoch 26:   9%|▊         | 7/81 [00:03<00:32,  2.28batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=2, gpu=0, loss=4.983, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 943])
size of attention_mask: torch.Size([1, 943])
size of segment_ids: torch.Size([1, 943])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 27])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([17])
p_index: tensor([ 17,  70, 148, 248, 385, 430, 556, 634, 672, 877], device='cuda:0')
size of sequence_output: torch.Size([1, 943, 768])
size of logits: torch.Size([1, 943, 2])
size of start_logits: torch.Size([1, 943, 1

sorted_answers: [{'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(557, device='cuda:0'), 'end': tensor(559, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(71, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(66, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99853515625}, {'start': tensor(58, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.99853515625}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(401, device='cuda:0'), 'end': ten

loss: tensor([4.2070], device='cuda:0', dtype=torch.float16)
final loss: tensor(4.2070, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[259, 283, 333, 365, 392, 429,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
end_loss: tensor(4.2070, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.1490]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[0.1490]], device='cuda:0', dtype=torch.float16)
size of gathered

masked_target: tensor([[ 134,  164,  187,  519,  592,  642, 1001, 1096, 1623,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 64])
gathered_logits: tensor([[ 2.1992,  0.1064, -0.4421, -1.4922, -0.7490, -4.1719, -0.3457,  0.1851,
         -1.1787, -3.0469, -3.0469, -3.0469, -3.0469, -3.0469, -3.0469, -3.0469,
         -3.0469, -3.0469, -3.0469, -3.0469, -3.0469, -3.0469, -3.0469, -3.0469,
         -3.0469, -3.0469, -3.0469, -3.0469, -3.0469, -3.0469, -3.0469, -3.0469,
         -3.0469, -3.0469, -3.0469, -3.0469, -3.0469, -3.0469, -3.0469, -3.0469,
         -3.0469, -3.0469, -3.0469, -3.0469, -3.0469, -3.0469

Epoch 26:  12%|█▏        | 10/81 [00:04<00:24,  2.85batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=2, gpu=0, loss=4.983, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 933])
size of attention_mask: torch.Size([1, 933])
size of segment_ids: torch.Size([1, 933])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([ 21, 190, 263, 282, 398, 474, 554, 645, 757, 849], device='cuda:0')
size of sequence_output: torch.Size([1, 933, 768])
size of logits: torch.Size([1, 933, 2])
size of start_logits: torch.Size([1, 933, 

sorted_answers: [{'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(642, device='cuda:0'), 'end': tensor(647, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.9970703125}, {'start': tensor(779, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(642, device='cuda:0'), 'end': tensor(655, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99169921875}, {'start': tensor(642, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99072265625}, {'start': tensor(450, device='cuda:0'), 'end': tensor(476, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.982421875}, {'start': tensor(779, device='cuda:0'), 'end': tensor(790, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.97705078125}, {'start': tensor(642, device='cuda:0'), 'end': tensor(643, device

sorted_answers: [{'start': tensor(144, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(149, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99609375}, {'start': tensor(109, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(117, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(697, device='cuda:0'), 'end': tensor(726, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9921875}, {'start': tensor(721, device='cuda:0'), 'end': t

size of gathered_logits after view: torch.Size([1, 64])
size of logits after view: torch.Size([1, 1662])
log_score: tensor([3.1523], device='cuda:0', dtype=torch.float16)
log_norm: tensor([9.9922], device='cuda:0', dtype=torch.float16)
loss: tensor([6.8398], device='cuda:0', dtype=torch.float16)
final loss: tensor(6.8398, device='cuda:0', dtype=torch.float16)
start_positions: tensor([[674,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
start_loss: tensor(6.8398, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 1662])
size of target: torch.Size([1, 64])
target: tensor([[678,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1

Epoch 26:  16%|█▌        | 13/81 [00:05<00:20,  3.33batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=2, gpu=0, loss=4.983, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1320])
size of attention_mask: torch.Size([1, 1320])
size of segment_ids: torch.Size([1, 1320])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([  29,  168,  438,  601,  692,  765,  953, 1068, 1127, 1208],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1320, 768])
size of logits: torch.Size([1, 1320, 2])
size of start_logits

sorted_answers: [{'start': tensor(775, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98583984375}, {'start': tensor(787, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98583984375}, {'start': tensor(129, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.96435546875}, {'start': tensor(173, device='cuda:0'), 'end': tensor(173, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9599609375}, {'start': tensor(611, device='cuda:0'), 'end': tensor(611, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.95458984375}, {'start': tensor(1225, device='cuda:0'), 'end': tensor(1225, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.955078125}, {'start': tensor(591, device='cuda:0'), 'end': tensor(611, device='cuda:0'), 'start_logit': 0.99853515625, 'end_logit': 0.95458984375}, {'start': tensor(191, device='cuda:0'), 'end

log_norm: tensor([7.6094], device='cuda:0', dtype=torch.float16)
loss: tensor([1.1641], device='cuda:0', dtype=torch.float16)
final loss: tensor(1.1641, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[293, 318,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
end_loss: tensor(1.1641, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.1389]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([

size of sequence_output: torch.Size([1, 1011, 768])
size of logits: torch.Size([1, 1011, 2])
size of start_logits: torch.Size([1, 1011, 1])
size of start_logits after squeeze: torch.Size([1, 1011])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_output_t after tanh: torch.Size([1, 10, 768])
size of sp_para_output_t: torch.Size([1, 10, 1])
size of sp_para_output_aux: torch.Size([1, 10, 1])
size of predict_support_para: torch.Size([1, 10, 2])
size of sent_indexes: torch.Size([41])
sent_indexes:  tensor([ 52,  78, 111, 127, 144, 167, 192, 213, 234, 261, 276, 293, 321, 345,
        369, 394, 428, 453, 480, 505, 521, 558, 572, 604, 620, 670, 697, 718,
        736, 747, 766, 779, 783, 808, 834, 861, 885, 900, 928, 963,

sorted_answers: [{'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99755859375}, {'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99560546875}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99853515625, 'end_logit': 0.99560546875}, {'start': tensor(611, device='cuda:0'), 'end': tensor(626, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.990234375}, {'start': tensor(118, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.98828125}, {'start': tensor(221, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98486328125}, {'start': tensor(215, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.98486328125}, {'start': tensor(118, device='cuda:0'), '

INFO:root:
Epoch 00025: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 2078])
size of attention_mask: torch.Size([1, 2078])
size of segment_ids: torch.Size([1, 2078])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 68])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([58])
p_index: tensor([  34,  209,  391,  583,  766,  834, 1210, 1396, 1694, 1905],
       device='cuda:0')
size of sequence_output: torch.Size([1, 2078, 768])
size of logits: torch.Size([1, 2078, 2])
size of start_logits: torch.Size([1, 2078, 1])
size of start_logits after squeeze: torch.Size([1, 2078])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 

Epoch 26:  21%|██        | 17/81 [00:07<00:32,  1.95batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=3, gpu=0, loss=4.983, v_num=0, val_loss=6.84]size of input_ids: torch.Size([1, 1079])
size of attention_mask: torch.Size([1, 1079])
size of segment_ids: torch.Size([1, 1079])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 39])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([29])
p_index: tensor([ 22, 119, 144, 322, 457, 521, 742, 801, 887, 931], device='cuda:0')
size of sequence_output: torch.Size([1, 1079, 768])
size of logits: torch.Size([1, 1079, 2])
size of start_logits: torch.Size([1, 1079, 1])
size o

Epoch 26:  22%|██▏       | 18/81 [00:08<00:32,  1.92batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=4, gpu=0, loss=4.983, v_num=0, val_loss=6.84]size of input_ids: torch.Size([1, 1118])
size of attention_mask: torch.Size([1, 1118])
size of segment_ids: torch.Size([1, 1118])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 46])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([36])
p_index: tensor([  16,  192,  327,  526,  638,  666,  750,  868,  927, 1015],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1118, 768])
size of logits: torch.Size([1, 1118, 2])
size of start_logits: torch.Size([1,

log_norm: tensor([1.1270], device='cuda:0', dtype=torch.float16,
       grad_fn=<LogsumexpBackward>)
loss: tensor([0.8467], device='cuda:0', dtype=torch.float16, grad_fn=<NegBackward>)
final loss: tensor(0.8467, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward0>)
total_loss: tensor(3.5904, device='cuda:0', grad_fn=<AddBackward0>)
returned loss:  tensor(3.5904, device='cuda:0', grad_fn=<AddBackward0>)
self.trainer.optimizers[0].param_groups[0]['lr']:  2.7600000000000003e-05
loss:  tensor(3.5904, device='cuda:0', grad_fn=<AddBackward0>)
lr:  tensor([2.7600e-05], device='cuda:0')
Epoch 26:  23%|██▎       | 19/81 [00:08<00:33,  1.86batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=5, gpu=0, loss=4.983, v_num=0, val_loss=6.84]
Validating:   0%|          | 0/1

Epoch 26:  25%|██▍       | 20/81 [00:09<00:38,  1.60batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=5, gpu=0, loss=4.983, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1849])
size of attention_mask: torch.Size([1, 1849])
size of segment_ids: torch.Size([1, 1849])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 51])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([41])
p_index: tensor([  18,  171,  225,  495,  652,  970, 1256, 1442, 1596, 1696],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1849, 768])
size of logits: torch.Size([1, 1849, 2])
size of start_logits

size of gathered_logits after view: torch.Size([1, 64])
size of logits after view: torch.Size([1, 1849])
log_score: tensor([2.6172], device='cuda:0', dtype=torch.float16)
log_norm: tensor([7.3203], device='cuda:0', dtype=torch.float16)
loss: tensor([4.7031], device='cuda:0', dtype=torch.float16)
final loss: tensor(4.7031, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[ 134,  164,  187,  519,  592,  642, 1001, 1096, 1623,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
end_loss: tensor(4.7031, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1,

sorted_answers: [{'start': tensor(775, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98583984375}, {'start': tensor(787, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98583984375}, {'start': tensor(129, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.96435546875}, {'start': tensor(173, device='cuda:0'), 'end': tensor(173, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9599609375}, {'start': tensor(611, device='cuda:0'), 'end': tensor(611, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.95458984375}, {'start': tensor(1225, device='cuda:0'), 'end': tensor(1225, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.955078125}, {'start': tensor(591, device='cuda:0'), 'end': tensor(611, device='cuda:0'), 'start_logit': 0.99853515625, 'end_logit': 0.95458984375}, {'start': tensor(191, device='cuda:0'), 'end

log_norm: tensor([1.0957], device='cuda:0', dtype=torch.float16)
loss: tensor([1.3408], device='cuda:0', dtype=torch.float16)
final loss: tensor(1.3408, device='cuda:0', dtype=torch.float16)
total_loss: tensor(2.4370, device='cuda:0')
loss: tensor(2.4370, device='cuda:0')
decode
question_end_index:  tensor([20], device='cuda:0')
start_logits_indices:  tensor([1199,  364,  325, 1206,   22,  969,  840,  869,  578,  573,  581,  295,
         585,  202,  143,  186,  541,  381,  291,  255], device='cuda:0')
type_logits:  tensor([[0.5439, 0.5044, 0.4390]], device='cuda:0')
p_type:  0
p_type_score:  0.5439453125
sorted_answers: [{'start': tensor(364, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(22, device='cuda:0'), 'end': tensor(29, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9951171875}, {'start': tensor(364, device='cuda:0'), 'end': tensor(381, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0

sorted_answers: [{'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(557, device='cuda:0'), 'end': tensor(559, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(71, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(66, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99853515625}, {'start': tensor(58, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.99853515625}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(401, device='cuda:0'), 'end': ten

loss: tensor([1.1641], device='cuda:0', dtype=torch.float16)
final loss: tensor(1.1641, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[293, 318,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
end_loss: tensor(1.1641, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.1389]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[0.1389]], device='cuda:0', dtype=torch.float16)
size of gathered

size of sequence_output: torch.Size([1, 933, 768])
size of logits: torch.Size([1, 933, 2])
size of start_logits: torch.Size([1, 933, 1])
size of start_logits after squeeze: torch.Size([1, 933])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_output_t after tanh: torch.Size([1, 10, 768])
size of sp_para_output_t: torch.Size([1, 10, 1])
size of sp_para_output_aux: torch.Size([1, 10, 1])
size of predict_support_para: torch.Size([1, 10, 2])
size of sent_indexes: torch.Size([37])
sent_indexes:  tensor([ 21,  54,  68,  97, 157, 176, 190, 216, 263, 282, 315, 362, 398, 417,
        441, 456, 474, 488, 510, 523, 543, 554, 578, 599, 616, 645, 675, 696,
        709, 745, 757, 772, 805, 829, 849, 871, 913], device='cuda:0')


sorted_answers: [{'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(642, device='cuda:0'), 'end': tensor(647, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.9970703125}, {'start': tensor(779, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(642, device='cuda:0'), 'end': tensor(655, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99169921875}, {'start': tensor(642, device='cuda:0'), 'end': tensor(654, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99072265625}, {'start': tensor(450, device='cuda:0'), 'end': tensor(476, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.982421875}, {'start': tensor(779, device='cuda:0'), 'end': tensor(790, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.97705078125}, {'start': tensor(642, device='cuda:0'), 'end': tensor(643, device

size of gathered_logits: torch.Size([1, 64])
gathered_logits: tensor([[-3.5352, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836,
         -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836,
         -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836,
         -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836,
         -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836,
         -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836,
         -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836,
         -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836, -2.6836]],
       device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[-3.5352,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,

size of sequence_output: torch.Size([1, 1660, 768])
size of logits: torch.Size([1, 1660, 2])
size of start_logits: torch.Size([1, 1660, 1])
size of start_logits after squeeze: torch.Size([1, 1660])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_output_t after tanh: torch.Size([1, 10, 768])
size of sp_para_output_t: torch.Size([1, 10, 1])
size of sp_para_output_aux: torch.Size([1, 10, 1])
size of predict_support_para: torch.Size([1, 10, 2])
size of sent_indexes: torch.Size([57])
sent_indexes:  tensor([  24,   50,   81,  118,  130,  161,  201,  237,  269,  297,  356,  389,
         415,  462,  480,  500,  520,  540,  562,  591,  620,  662,  678,  706,
         754,  773,  782,  813,  839,  854,  883,  923,  955,  

end_loss: tensor(6.3359, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.0872]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[0.0872]], device='cuda:0', dtype=torch.float16)
size of gathered_logits after view: torch.Size([1, 1])
size of logits after view: torch.Size([1, 3])
log_score: tensor([0.0872], device='cuda:0', dtype=torch.float16)
log_norm: tensor([1.0322], device='cuda:0', dtype=torch.float16)
loss: tensor([0.9448], device='cuda:0', dtype=torch.float16)
final loss: tensor(0.9448, device='cuda:0', dtype=torch.float16)
total_loss: tensor(7.5488, device='cuda:0')
loss: tensor(7.5488, device='cuda:0')
decode
question_end_index:  tensor([23], device='cuda:0')
start_logits_indices:  tensor([1203, 1418, 

gathered_logits: tensor([[-0.4348, -1.8271, -2.1836, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004,
         -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004,
         -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004,
         -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004,
         -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004,
         -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004,
         -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004,
         -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004, -2.9004]],
       device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[-0.4348, -1.8271, -2.1836,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,

Epoch 26:  36%|███▌      | 29/81 [00:12<00:16,  3.08batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=5, gpu=0, loss=4.983, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 851])
size of attention_mask: torch.Size([1, 851])
size of segment_ids: torch.Size([1, 851])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 35])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([25])
p_index: tensor([ 13, 116, 151, 295, 374, 455, 501, 590, 683, 720], device='cuda:0')
size of sequence_output: torch.Size([1, 851, 768])
size of logits: torch.Size([1, 851, 2])
size of start_logits: torch.Size([1, 851, 

sorted_answers: [{'start': tensor(144, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99609375}, {'start': tensor(149, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99609375}, {'start': tensor(109, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(117, device='cuda:0'), 'end': tensor(122, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99365234375}, {'start': tensor(697, device='cuda:0'), 'end': tensor(726, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9921875}, {'start': tensor(721, device='cuda:0'), 'end': t

sorted_answers: [{'start': tensor(559, device='cuda:0'), 'end': tensor(561, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.9990234375}, {'start': tensor(290, device='cuda:0'), 'end': tensor(297, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9921875}, {'start': tensor(608, device='cuda:0'), 'end': tensor(636, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(608, device='cuda:0'), 'end': tensor(615, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(69, device='cuda:0'), 'end': tensor(90, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.986328125}, {'start': tensor(87, device='cuda:0'), 'end': tensor(90, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.986328125}, {'start': tensor(290, device='cuda:0'), 'end': tensor(291, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98095703125}, {'start': tensor(326, device='cuda:0'), 'end': tensor(

sorted_answers: [{'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99755859375}, {'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99560546875}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99853515625, 'end_logit': 0.99560546875}, {'start': tensor(611, device='cuda:0'), 'end': tensor(626, device='cuda:0'), 'start_logit': 0.998046875, 'end_logit': 0.990234375}, {'start': tensor(118, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.98828125}, {'start': tensor(221, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98486328125}, {'start': tensor(215, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.98486328125}, {'start': tensor(118, device='cuda:0'), '

INFO:root:
Epoch 00025: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 1316])
size of attention_mask: torch.Size([1, 1316])
size of segment_ids: torch.Size([1, 1316])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 38])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([28])
p_index: tensor([  17,  310,  402,  562,  616,  652,  755,  928,  998, 1149],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1316, 768])
size of logits: torch.Size([1, 1316, 2])
size of start_logits: torch.Size([1, 1316, 1])
size of start_logits after squeeze: torch.Size([1, 1316])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 

Epoch 26:  41%|████      | 33/81 [00:13<00:21,  2.23batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=6, gpu=0, loss=4.983, v_num=0, val_loss=6.84]size of input_ids: torch.Size([1, 1314])
size of attention_mask: torch.Size([1, 1314])
size of segment_ids: torch.Size([1, 1314])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 51])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([41])
p_index: tensor([  23,  142,  212,  402,  481,  676,  794,  899,  947, 1129],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1314, 768])
size of logits: torch.Size([1, 1314, 2])
size of start_logits: torch.Size([1,

lr:  tensor([2.7600e-05], device='cuda:0')
Epoch 26:  42%|████▏     | 34/81 [00:14<00:23,  2.00batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=7, gpu=0, loss=5.005, v_num=0, val_loss=6.84]size of input_ids: torch.Size([1, 1494])
size of attention_mask: torch.Size([1, 1494])
size of segment_ids: torch.Size([1, 1494])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 40])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([30])
p_index: tensor([  47,  208,  366,  466,  614,  767,  886,  982, 1165, 1222],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1494, 768])
size of logits: torch.Size([1, 149

Epoch 26:  43%|████▎     | 35/81 [00:14<00:23,  1.97batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=8, gpu=0, loss=5.005, v_num=0, val_loss=6.84]
Validating:   0%|          | 0/13 [00:00<?, ?batch/s][Avalidation_step
size of input_ids: torch.Size([1, 933])
size of attention_mask: torch.Size([1, 933])
size of segment_ids: torch.Size([1, 933])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([ 21, 190, 263, 282, 398, 474, 554, 645, 757, 849], device='cuda:0')
size of sequence_output: torch.Size([1, 933, 768])
size of logits: torch.Siz

sorted_answers: [{'start': tensor(642, device='cuda:0'), 'end': tensor(647, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99560546875}, {'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.9990234375}, {'start': tensor(779, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99267578125}, {'start': tensor(758, device='cuda:0'), 'end': tensor(777, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.99267578125}, {'start': tensor(758, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.99267578125}, {'start': tensor(752, device='cuda:0'), 'end': tensor(777, device='cuda:0'), 'start_logit': 0.99560546875, 'end_logit': 0.99267578125}, {'start': tensor(752, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.99560546875, 'end_logit': 0.99267578125}, {'start': tensor(758, device='cuda:0'), 'en

masked_target: tensor([[ 766, 1176,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 64])
gathered_logits: tensor([[ 0.4070,  1.9980, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500,
         -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500,
         -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500,
         -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500,
         -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500,
         -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500

Epoch 26:  46%|████▌     | 37/81 [00:15<00:22,  1.98batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=8, gpu=0, loss=5.005, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1320])
size of attention_mask: torch.Size([1, 1320])
size of segment_ids: torch.Size([1, 1320])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([  29,  168,  438,  601,  692,  765,  953, 1068, 1127, 1208],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1320, 768])
size of logits: torch.Size([1, 1320, 2])
size of start_logits

sorted_answers: [{'start': tensor(1209, device='cuda:0'), 'end': tensor(1216, device='cuda:0'), 'start_logit': 0.9931640625, 'end_logit': 0.974609375}, {'start': tensor(129, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9580078125}, {'start': tensor(775, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.95947265625}, {'start': tensor(787, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.9951171875, 'end_logit': 0.95947265625}, {'start': tensor(173, device='cuda:0'), 'end': tensor(173, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.955078125}, {'start': tensor(611, device='cuda:0'), 'end': tensor(611, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9443359375}, {'start': tensor(591, device='cuda:0'), 'end': tensor(611, device='cuda:0'), 'start_logit': 0.9931640625, 'end_logit': 0.9443359375}, {'start': tensor(1225, device='cuda:0'), 

masked_target: tensor([[678,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 64])
gathered_logits: tensor([[-2.5176, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938,
         -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938,
         -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938,
         -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938,
         -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938,
         -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938, -4.5938,
         -4.5938, -4.5938, -4.5938, -4.5938, -4.5938,

Epoch 26:  48%|████▊     | 39/81 [00:16<00:16,  2.47batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=8, gpu=0, loss=5.005, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1400])
size of attention_mask: torch.Size([1, 1400])
size of segment_ids: torch.Size([1, 1400])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([2], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([  21,  128,  158,  363,  572,  672,  868,  967, 1132, 1198],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1400, 768])
size of logits: torch.Size([1, 1400, 2])
size of start_logits

sorted_answers: [{'start': tensor(364, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99560546875}, {'start': tensor(1133, device='cuda:0'), 'end': tensor(1134, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.998046875}, {'start': tensor(22, device='cuda:0'), 'end': tensor(29, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(364, device='cuda:0'), 'end': tensor(381, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.966796875}, {'start': tensor(573, device='cuda:0'), 'end': tensor(585, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.96435546875}, {'start': tensor(585, device='cuda:0'), 'end': tensor(585, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.96435546875}, {'start': tensor(578, device='cuda:0'), 'end': tensor(585, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.96435546875}, {'start': tensor(381, device='cuda:0

start_loss: tensor(6.1055, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 1849])
size of target: torch.Size([1, 64])
target: tensor([[ 134,  164,  187,  519,  592,  642, 1001, 1096, 1623,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
masked_target: tensor([[ 134,  164,  187,  519,  592,  642, 1001, 1096, 1623,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0

Epoch 26:  51%|█████     | 41/81 [00:17<00:14,  2.77batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=8, gpu=0, loss=5.005, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 2018])
size of attention_mask: torch.Size([1, 2018])
size of segment_ids: torch.Size([1, 2018])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 58])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([48])
p_index: tensor([  25,  354,  470,  654,  801, 1279, 1398, 1416, 1547, 1879],
       device='cuda:0')
size of sequence_output: torch.Size([1, 2018, 768])
size of logits: torch.Size([1, 2018, 2])
size of start_logits

Epoch 26:  52%|█████▏    | 42/81 [00:17<00:13,  2.87batch/s, avg_val_em=0, avg_val_f1=0.0513, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0.0385, avg_val_recall=0.0769, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=8, gpu=0, loss=5.005, v_num=0, val_loss=6.84]validation_step
size of input_ids: torch.Size([1, 1649])
size of attention_mask: torch.Size([1, 1649])
size of segment_ids: torch.Size([1, 1649])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  25,  197,  495,  584,  670,  828, 1003, 1325, 1489, 1570],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1649, 768])
size of logits: torch.Size([1, 1649, 2])
size of start_logits

size of gathered_logits after view: torch.Size([1, 64])
size of logits after view: torch.Size([1, 1649])
log_score: tensor([-1.7988], device='cuda:0', dtype=torch.float16)
log_norm: tensor([6.8242], device='cuda:0', dtype=torch.float16)
loss: tensor([8.6250], device='cuda:0', dtype=torch.float16)
final loss: tensor(8.6250, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[ 696, 1010, 1245,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
end_loss: tensor(8.6250, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1

size of sequence_output: torch.Size([1, 851, 768])
size of logits: torch.Size([1, 851, 2])
size of start_logits: torch.Size([1, 851, 1])
size of start_logits after squeeze: torch.Size([1, 851])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_output_t after tanh: torch.Size([1, 10, 768])
size of sp_para_output_t: torch.Size([1, 10, 1])
size of sp_para_output_aux: torch.Size([1, 10, 1])
size of predict_support_para: torch.Size([1, 10, 2])
size of sent_indexes: torch.Size([35])
sent_indexes:  tensor([ 13,  34,  61,  75,  97, 116, 151, 185, 214, 237, 258, 295, 316, 353,
        374, 385, 414, 433, 455, 501, 522, 550, 572, 590, 618, 645, 657, 683,
        706, 720, 747, 761, 775, 797, 826], device='cuda:0')
size of sp

sorted_answers: [{'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99462890625}, {'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99462890625}, {'start': tensor(144, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99462890625}, {'start': tensor(153, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99853515625, 'end_logit': 0.99462890625}, {'start': tensor(721, device='cuda:0'), 'end': tensor(726, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.9833984375}, {'start': tensor(697, device='cuda:0'), 'end': tensor(726, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.9833984375}, {'start': tensor(610, device='cuda:0'), 'end': tensor(623, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.97802734375}, {'start': tensor(623, device='cuda:

size of sequence_output: torch.Size([1, 1310, 768])
size of logits: torch.Size([1, 1310, 2])
size of start_logits: torch.Size([1, 1310, 1])
size of start_logits after squeeze: torch.Size([1, 1310])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_output_t after tanh: torch.Size([1, 10, 768])
size of sp_para_output_t: torch.Size([1, 10, 1])
size of sp_para_output_aux: torch.Size([1, 10, 1])
size of predict_support_para: torch.Size([1, 10, 2])
size of sent_indexes: torch.Size([40])
sent_indexes:  tensor([  34,   60,   91,  127,  199,  211,  222,  240,  263,  316,  347,  381,
         398,  453,  471,  479,  516,  579,  615,  651,  685,  711,  750,  785,
         804,  837,  871,  932,  979,  997, 1027, 1048, 1069, 1

sorted_answers: [{'start': tensor(1110, device='cuda:0'), 'end': tensor(1110, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.98095703125}, {'start': tensor(1093, device='cuda:0'), 'end': tensor(1110, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.98095703125}, {'start': tensor(712, device='cuda:0'), 'end': tensor(715, device='cuda:0'), 'start_logit': 0.99560546875, 'end_logit': 0.98193359375}, {'start': tensor(657, device='cuda:0'), 'end': tensor(657, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.97265625}, {'start': tensor(67, device='cuda:0'), 'end': tensor(94, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.966796875}, {'start': tensor(88, device='cuda:0'), 'end': tensor(94, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.966796875}, {'start': tensor(92, device='cuda:0'), 'end': tensor(94, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.966796875}, {'start': tensor(67, device='cuda:0'), 'end': tensor(68, device='cuda:0'

sorted_answers: [{'start': tensor(559, device='cuda:0'), 'end': tensor(561, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(1102, device='cuda:0'), 'end': tensor(1113, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.990234375}, {'start': tensor(817, device='cuda:0'), 'end': tensor(833, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98876953125}, {'start': tensor(290, device='cuda:0'), 'end': tensor(291, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.982421875}, {'start': tensor(87, device='cuda:0'), 'end': tensor(90, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.97021484375}, {'start': tensor(69, device='cuda:0'), 'end': tensor(90, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.97021484375}, {'start': tensor(290, device='cuda:0'), 'end': tensor(307, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.96240234375}, {'start': tensor(290, device='cuda:0'), 'end': ten

sorted_answers: [{'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99755859375}, {'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.9931640625}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99609375, 'end_logit': 0.9931640625}, {'start': tensor(237, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.98974609375, 'end_logit': 0.9931640625}, {'start': tensor(264, device='cuda:0'), 'end': tensor(264, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98095703125}, {'start': tensor(221, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.98193359375}, {'start': tensor(118, device='cuda:0'), 'end': tensor(118, device='cuda:0'), 'start_logit': 0.9912109375, 'end_logit': 0.986328125}, {'start': tensor(263, device='cuda:0'), 'e

sorted_answers: [{'start': tensor(557, device='cuda:0'), 'end': tensor(559, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9990234375}, {'start': tensor(71, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9970703125}, {'start': tensor(66, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.99853515625}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99462890625}, {'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.9970703125}, {'start': tensor(67, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.9921875, 'end_logit': 0.99853515625}, {'start': tensor(401, device='cuda:0'), 'end

INFO:root:
Epoch 00025: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 1872])
size of attention_mask: torch.Size([1, 1872])
size of segment_ids: torch.Size([1, 1872])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 61])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([51])
p_index: tensor([  20,  304,  712,  785,  944, 1044, 1110, 1397, 1664, 1795],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1872, 768])
size of logits: torch.Size([1, 1872, 2])
size of start_logits: torch.Size([1, 1872, 1])
size of start_logits after squeeze: torch.Size([1, 1872])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 

Epoch 26:  60%|██████    | 49/81 [00:20<00:15,  2.13batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=9, gpu=0, loss=5.005, v_num=0, val_loss=7.53]size of input_ids: torch.Size([1, 1285])
size of attention_mask: torch.Size([1, 1285])
size of segment_ids: torch.Size([1, 1285])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 48])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([38])
p_index: tensor([  25,  135,  251,  343,  431,  569,  636,  903, 1045, 1131],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1285, 768])
size of logits: torch.Size([1, 1285, 2])
size of start_logits: torch.Size([1, 1285, 1])
size

Epoch 26:  62%|██████▏   | 50/81 [00:20<00:15,  2.01batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=10, gpu=0, loss=5.005, v_num=0, val_loss=7.53]size of input_ids: torch.Size([1, 1376])
size of attention_mask: torch.Size([1, 1376])
size of segment_ids: torch.Size([1, 1376])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 35])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([25])
p_index: tensor([  22,  114,  207,  322,  430,  553,  858,  978, 1129, 1217],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1376, 768])
size of logits: torch.Size([1, 1376, 2])
size of start_logits: torch.Size([1, 1376, 1])
siz

returned loss:  tensor(6.2417, device='cuda:0', grad_fn=<AddBackward0>)
self.trainer.optimizers[0].param_groups[0]['lr']:  2.8000000000000003e-05
loss:  tensor(6.2417, device='cuda:0', grad_fn=<AddBackward0>)
lr:  tensor([2.8000e-05], device='cuda:0')
Epoch 26:  63%|██████▎   | 51/81 [00:21<00:16,  1.84batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=11, gpu=0, loss=5.005, v_num=0, val_loss=7.53]
Validating:   0%|          | 0/13 [00:00<?, ?batch/s][Avalidation_step
size of input_ids: torch.Size([1, 1849])
size of attention_mask: torch.Size([1, 1849])
size of segment_ids: torch.Size([1, 1849])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 51])
size of sp_para: torch.Si

Epoch 26:  64%|██████▍   | 52/81 [00:22<00:18,  1.61batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=11, gpu=0, loss=5.005, v_num=0, val_loss=7.53]validation_step
size of input_ids: torch.Size([1, 1662])
size of attention_mask: torch.Size([1, 1662])
size of segment_ids: torch.Size([1, 1662])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  15,  126,  231,  459,  649,  767,  940, 1114, 1365, 1545],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1662, 768])
size of logits: torch.Size([1, 1662, 2])
size of start_logits: torch.Size([

Epoch 26:  65%|██████▌   | 53/81 [00:22<00:14,  1.87batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=11, gpu=0, loss=5.005, v_num=0, val_loss=7.53]validation_step
size of input_ids: torch.Size([1, 2018])
size of attention_mask: torch.Size([1, 2018])
size of segment_ids: torch.Size([1, 2018])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 58])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([48])
p_index: tensor([  25,  354,  470,  654,  801, 1279, 1398, 1416, 1547, 1879],
       device='cuda:0')
size of sequence_output: torch.Size([1, 2018, 768])
size of logits: torch.Size([1, 2018, 2])
size of start_logits: torch.Size([

final loss: tensor(10.3750, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[680,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
end_loss: tensor(10.3750, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.3379]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[0.3379]], device='cuda:0', dtype=torch.float16)
size of gathered_logits after view: torch.Size([1, 1])
size of logits after

sorted_answers: [{'start': tensor(559, device='cuda:0'), 'end': tensor(561, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(1102, device='cuda:0'), 'end': tensor(1113, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.990234375}, {'start': tensor(817, device='cuda:0'), 'end': tensor(833, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98876953125}, {'start': tensor(290, device='cuda:0'), 'end': tensor(291, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.982421875}, {'start': tensor(87, device='cuda:0'), 'end': tensor(90, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.97021484375}, {'start': tensor(69, device='cuda:0'), 'end': tensor(90, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.97021484375}, {'start': tensor(290, device='cuda:0'), 'end': tensor(307, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.96240234375}, {'start': tensor(290, device='cuda:0'), 'end': ten

final loss: tensor(3.9668, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[696,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
end_loss: tensor(3.9668, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1, 1])
target: tensor([[0]], device='cuda:0')
masked_target: tensor([[0]], device='cuda:0')
size of gathered_logits: torch.Size([1, 1])
gathered_logits: tensor([[0.3684]], device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[0.3684]], device='cuda:0', dtype=torch.float16)
size of gathered_logits after view: torch.Size([1, 1])
size of logits after v

sorted_answers: [{'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99755859375}, {'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.9931640625}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99609375, 'end_logit': 0.9931640625}, {'start': tensor(237, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.98974609375, 'end_logit': 0.9931640625}, {'start': tensor(264, device='cuda:0'), 'end': tensor(264, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98095703125}, {'start': tensor(221, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.98193359375}, {'start': tensor(118, device='cuda:0'), 'end': tensor(118, device='cuda:0'), 'start_logit': 0.9912109375, 'end_logit': 0.986328125}, {'start': tensor(263, device='cuda:0'), 'e

sorted_answers: [{'start': tensor(364, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99560546875}, {'start': tensor(1133, device='cuda:0'), 'end': tensor(1134, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.998046875}, {'start': tensor(22, device='cuda:0'), 'end': tensor(29, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(364, device='cuda:0'), 'end': tensor(381, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.966796875}, {'start': tensor(573, device='cuda:0'), 'end': tensor(585, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.96435546875}, {'start': tensor(585, device='cuda:0'), 'end': tensor(585, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.96435546875}, {'start': tensor(578, device='cuda:0'), 'end': tensor(585, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.96435546875}, {'start': tensor(381, device='cuda:0

sorted_answers: [{'start': tensor(1110, device='cuda:0'), 'end': tensor(1110, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.98095703125}, {'start': tensor(1093, device='cuda:0'), 'end': tensor(1110, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.98095703125}, {'start': tensor(712, device='cuda:0'), 'end': tensor(715, device='cuda:0'), 'start_logit': 0.99560546875, 'end_logit': 0.98193359375}, {'start': tensor(657, device='cuda:0'), 'end': tensor(657, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.97265625}, {'start': tensor(67, device='cuda:0'), 'end': tensor(94, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.966796875}, {'start': tensor(88, device='cuda:0'), 'end': tensor(94, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.966796875}, {'start': tensor(92, device='cuda:0'), 'end': tensor(94, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.966796875}, {'start': tensor(67, device='cuda:0'), 'end': tensor(68, device='cuda:0'

sorted_answers: [{'start': tensor(557, device='cuda:0'), 'end': tensor(559, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9990234375}, {'start': tensor(71, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9970703125}, {'start': tensor(66, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.99853515625}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99462890625}, {'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.9970703125}, {'start': tensor(67, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.9921875, 'end_logit': 0.99853515625}, {'start': tensor(401, device='cuda:0'), 'end

gathered_logits: tensor([[ 0.4070,  1.9980, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500,
         -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500,
         -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500,
         -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500,
         -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500,
         -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500,
         -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500,
         -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500, -4.2500]],
       device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[0.4070, 1.9980,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,
           -inf, 

Epoch 26:  75%|███████▌  | 61/81 [00:24<00:06,  3.30batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=11, gpu=0, loss=5.005, v_num=0, val_loss=7.53]validation_step
size of input_ids: torch.Size([1, 1649])
size of attention_mask: torch.Size([1, 1649])
size of segment_ids: torch.Size([1, 1649])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  25,  197,  495,  584,  670,  828, 1003, 1325, 1489, 1570],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1649, 768])
size of logits: torch.Size([1, 1649, 2])
size of start_logits: torch.Size([

Epoch 26:  77%|███████▋  | 62/81 [00:25<00:05,  3.19batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=11, gpu=0, loss=5.005, v_num=0, val_loss=7.53]validation_step
size of input_ids: torch.Size([1, 851])
size of attention_mask: torch.Size([1, 851])
size of segment_ids: torch.Size([1, 851])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 35])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([25])
p_index: tensor([ 13, 116, 151, 295, 374, 455, 501, 590, 683, 720], device='cuda:0')
size of sequence_output: torch.Size([1, 851, 768])
size of logits: torch.Size([1, 851, 2])
size of start_logits: torch.Size([1, 851, 1])
size of st

sorted_answers: [{'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99462890625}, {'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99462890625}, {'start': tensor(144, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99462890625}, {'start': tensor(153, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99853515625, 'end_logit': 0.99462890625}, {'start': tensor(721, device='cuda:0'), 'end': tensor(726, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.9833984375}, {'start': tensor(697, device='cuda:0'), 'end': tensor(726, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.9833984375}, {'start': tensor(610, device='cuda:0'), 'end': tensor(623, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.97802734375}, {'start': tensor(623, device='cuda:

size of sequence_output: torch.Size([1, 933, 768])
size of logits: torch.Size([1, 933, 2])
size of start_logits: torch.Size([1, 933, 1])
size of start_logits after squeeze: torch.Size([1, 933])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_output_t after tanh: torch.Size([1, 10, 768])
size of sp_para_output_t: torch.Size([1, 10, 1])
size of sp_para_output_aux: torch.Size([1, 10, 1])
size of predict_support_para: torch.Size([1, 10, 2])
size of sent_indexes: torch.Size([37])
sent_indexes:  tensor([ 21,  54,  68,  97, 157, 176, 190, 216, 263, 282, 315, 362, 398, 417,
        441, 456, 474, 488, 510, 523, 543, 554, 578, 599, 616, 645, 675, 696,
        709, 745, 757, 772, 805, 829, 849, 871, 913], device='cuda:0')


sorted_answers: [{'start': tensor(642, device='cuda:0'), 'end': tensor(647, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99560546875}, {'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.9990234375}, {'start': tensor(779, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99267578125}, {'start': tensor(758, device='cuda:0'), 'end': tensor(777, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.99267578125}, {'start': tensor(758, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.99267578125}, {'start': tensor(752, device='cuda:0'), 'end': tensor(777, device='cuda:0'), 'start_logit': 0.99560546875, 'end_logit': 0.99267578125}, {'start': tensor(752, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.99560546875, 'end_logit': 0.99267578125}, {'start': tensor(758, device='cuda:0'), 'en

INFO:root:
Epoch 00025: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 1597])
size of attention_mask: torch.Size([1, 1597])
size of segment_ids: torch.Size([1, 1597])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 50])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([40])
p_index: tensor([  17,  196,  277,  474,  653,  968, 1072, 1223, 1353, 1463],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1597, 768])
size of logits: torch.Size([1, 1597, 2])
size of start_logits: torch.Size([1, 1597, 1])
size of start_logits after squeeze: torch.Size([1, 1597])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 

Epoch 26:  80%|████████  | 65/81 [00:26<00:07,  2.23batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=12, gpu=0, loss=5.005, v_num=0, val_loss=7.53]size of input_ids: torch.Size([1, 994])
size of attention_mask: torch.Size([1, 994])
size of segment_ids: torch.Size([1, 994])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 29])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([19])
p_index: tensor([ 24, 143, 208, 354, 408, 509, 585, 731, 816, 910], device='cuda:0')
size of sequence_output: torch.Size([1, 994, 768])
size of logits: torch.Size([1, 994, 2])
size of start_logits: torch.Size([1, 994, 1])
size of start_logits after

Epoch 26:  81%|████████▏ | 66/81 [00:26<00:06,  2.24batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=13, gpu=0, loss=5.005, v_num=0, val_loss=7.53]size of input_ids: torch.Size([1, 1548])
size of attention_mask: torch.Size([1, 1548])
size of segment_ids: torch.Size([1, 1548])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 43])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([33])
p_index: tensor([  25,  322,  395,  568,  718,  863,  991, 1090, 1207, 1322],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1548, 768])
size of logits: torch.Size([1, 1548, 2])
size of start_logits: torch.Size([1, 1548, 1])
siz

Epoch 26:  83%|████████▎ | 67/81 [00:27<00:06,  2.01batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=14, gpu=0, loss=5.005, v_num=0, val_loss=7.53]
Validating:   0%|          | 0/13 [00:00<?, ?batch/s][Avalidation_step
size of input_ids: torch.Size([1, 1660])
size of attention_mask: torch.Size([1, 1660])
size of segment_ids: torch.Size([1, 1660])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 57])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([47])
p_index: tensor([  24,  161,  462,  591,  620,  754,  955, 1154, 1325, 1524],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1660, 768])
size of logits: to

size of gathered_logits after view: torch.Size([1, 64])
size of logits after view: torch.Size([1, 1660])
log_score: tensor([-1.8232], device='cuda:0', dtype=torch.float16)
log_norm: tensor([5.2891], device='cuda:0', dtype=torch.float16)
loss: tensor([7.1133], device='cuda:0', dtype=torch.float16)
final loss: tensor(7.1133, device='cuda:0', dtype=torch.float16)
end_positions: tensor([[ 766, 1176,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
           -1,   -1,   -1,   -1]], device='cuda:0')
end_loss: tensor(7.1133, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 3])
size of target: torch.Size([1

Epoch 26:  84%|████████▍ | 68/81 [00:28<00:08,  1.59batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=14, gpu=0, loss=5.005, v_num=0, val_loss=7.53]validation_step
size of input_ids: torch.Size([1, 1662])
size of attention_mask: torch.Size([1, 1662])
size of segment_ids: torch.Size([1, 1662])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 53])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([43])
p_index: tensor([  15,  126,  231,  459,  649,  767,  940, 1114, 1365, 1545],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1662, 768])
size of logits: torch.Size([1, 1662, 2])
size of start_logits: torch.Size([

Epoch 26:  85%|████████▌ | 69/81 [00:28<00:06,  1.83batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=14, gpu=0, loss=5.005, v_num=0, val_loss=7.53]validation_step
size of input_ids: torch.Size([1, 1320])
size of attention_mask: torch.Size([1, 1320])
size of segment_ids: torch.Size([1, 1320])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 37])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([27])
p_index: tensor([  29,  168,  438,  601,  692,  765,  953, 1068, 1127, 1208],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1320, 768])
size of logits: torch.Size([1, 1320, 2])
size of start_logits: torch.Size([

sorted_answers: [{'start': tensor(1209, device='cuda:0'), 'end': tensor(1216, device='cuda:0'), 'start_logit': 0.9931640625, 'end_logit': 0.974609375}, {'start': tensor(129, device='cuda:0'), 'end': tensor(129, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9580078125}, {'start': tensor(775, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.9970703125, 'end_logit': 0.95947265625}, {'start': tensor(787, device='cuda:0'), 'end': tensor(789, device='cuda:0'), 'start_logit': 0.9951171875, 'end_logit': 0.95947265625}, {'start': tensor(173, device='cuda:0'), 'end': tensor(173, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.955078125}, {'start': tensor(611, device='cuda:0'), 'end': tensor(611, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9443359375}, {'start': tensor(591, device='cuda:0'), 'end': tensor(611, device='cuda:0'), 'start_logit': 0.9931640625, 'end_logit': 0.9443359375}, {'start': tensor(1225, device='cuda:0'), 

start_positions: tensor([[678,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
start_loss: tensor(5.1875, device='cuda:0', dtype=torch.float16)
or_softmax_cross_entropy_loss_one_doc
size of logits: torch.Size([1, 2018])
size of target: torch.Size([1, 64])
target: tensor([[680,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]], device='cuda:0')
masked_target: tensor([[680,  

Epoch 26:  88%|████████▊ | 71/81 [00:29<00:04,  2.31batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=14, gpu=0, loss=5.005, v_num=0, val_loss=7.53]validation_step
size of input_ids: torch.Size([1, 1011])
size of attention_mask: torch.Size([1, 1011])
size of segment_ids: torch.Size([1, 1011])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 41])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([31])
p_index: tensor([ 52, 167, 321, 453, 521, 572, 697, 747, 808, 900], device='cuda:0')
size of sequence_output: torch.Size([1, 1011, 768])
size of logits: torch.Size([1, 1011, 2])
size of start_logits: torch.Size([1, 1011, 1])
size

sorted_answers: [{'start': tensor(641, device='cuda:0'), 'end': tensor(642, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99755859375}, {'start': tensor(264, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.9931640625}, {'start': tensor(263, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.99609375, 'end_logit': 0.9931640625}, {'start': tensor(237, device='cuda:0'), 'end': tensor(265, device='cuda:0'), 'start_logit': 0.98974609375, 'end_logit': 0.9931640625}, {'start': tensor(264, device='cuda:0'), 'end': tensor(264, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.98095703125}, {'start': tensor(221, device='cuda:0'), 'end': tensor(237, device='cuda:0'), 'start_logit': 0.99755859375, 'end_logit': 0.98193359375}, {'start': tensor(118, device='cuda:0'), 'end': tensor(118, device='cuda:0'), 'start_logit': 0.9912109375, 'end_logit': 0.986328125}, {'start': tensor(263, device='cuda:0'), 'e

sorted_answers: [{'start': tensor(557, device='cuda:0'), 'end': tensor(559, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9990234375}, {'start': tensor(71, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99853515625}, {'start': tensor(632, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.9970703125}, {'start': tensor(66, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.99853515625}, {'start': tensor(154, device='cuda:0'), 'end': tensor(154, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99462890625}, {'start': tensor(635, device='cuda:0'), 'end': tensor(638, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.9970703125}, {'start': tensor(67, device='cuda:0'), 'end': tensor(73, device='cuda:0'), 'start_logit': 0.9921875, 'end_logit': 0.99853515625}, {'start': tensor(401, device='cuda:0'), 'end

sorted_answers: [{'start': tensor(559, device='cuda:0'), 'end': tensor(561, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99853515625}, {'start': tensor(1102, device='cuda:0'), 'end': tensor(1113, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.990234375}, {'start': tensor(817, device='cuda:0'), 'end': tensor(833, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98876953125}, {'start': tensor(290, device='cuda:0'), 'end': tensor(291, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.982421875}, {'start': tensor(87, device='cuda:0'), 'end': tensor(90, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.97021484375}, {'start': tensor(69, device='cuda:0'), 'end': tensor(90, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.97021484375}, {'start': tensor(290, device='cuda:0'), 'end': tensor(307, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.96240234375}, {'start': tensor(290, device='cuda:0'), 'end': ten

sorted_answers: [{'start': tensor(136, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99462890625}, {'start': tensor(141, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99462890625}, {'start': tensor(144, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99462890625}, {'start': tensor(153, device='cuda:0'), 'end': tensor(157, device='cuda:0'), 'start_logit': 0.99853515625, 'end_logit': 0.99462890625}, {'start': tensor(721, device='cuda:0'), 'end': tensor(726, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.9833984375}, {'start': tensor(697, device='cuda:0'), 'end': tensor(726, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.9833984375}, {'start': tensor(610, device='cuda:0'), 'end': tensor(623, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.97802734375}, {'start': tensor(623, device='cuda:

size of sequence_output: torch.Size([1, 933, 768])
size of logits: torch.Size([1, 933, 2])
size of start_logits: torch.Size([1, 933, 1])
size of start_logits after squeeze: torch.Size([1, 933])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_output_t after tanh: torch.Size([1, 10, 768])
size of sp_para_output_t: torch.Size([1, 10, 1])
size of sp_para_output_aux: torch.Size([1, 10, 1])
size of predict_support_para: torch.Size([1, 10, 2])
size of sent_indexes: torch.Size([37])
sent_indexes:  tensor([ 21,  54,  68,  97, 157, 176, 190, 216, 263, 282, 315, 362, 398, 417,
        441, 456, 474, 488, 510, 523, 543, 554, 578, 599, 616, 645, 675, 696,
        709, 745, 757, 772, 805, 829, 849, 871, 913], device='cuda:0')


sorted_answers: [{'start': tensor(642, device='cuda:0'), 'end': tensor(647, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.99560546875}, {'start': tensor(251, device='cuda:0'), 'end': tensor(266, device='cuda:0'), 'start_logit': 0.99462890625, 'end_logit': 0.9990234375}, {'start': tensor(779, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.99267578125}, {'start': tensor(758, device='cuda:0'), 'end': tensor(777, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.99267578125}, {'start': tensor(758, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.99267578125}, {'start': tensor(752, device='cuda:0'), 'end': tensor(777, device='cuda:0'), 'start_logit': 0.99560546875, 'end_logit': 0.99267578125}, {'start': tensor(752, device='cuda:0'), 'end': tensor(780, device='cuda:0'), 'start_logit': 0.99560546875, 'end_logit': 0.99267578125}, {'start': tensor(758, device='cuda:0'), 'en

sorted_answers: [{'start': tensor(364, device='cuda:0'), 'end': tensor(366, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.99560546875}, {'start': tensor(1133, device='cuda:0'), 'end': tensor(1134, device='cuda:0'), 'start_logit': 0.99658203125, 'end_logit': 0.998046875}, {'start': tensor(22, device='cuda:0'), 'end': tensor(29, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.98779296875}, {'start': tensor(364, device='cuda:0'), 'end': tensor(381, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.966796875}, {'start': tensor(573, device='cuda:0'), 'end': tensor(585, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.96435546875}, {'start': tensor(585, device='cuda:0'), 'end': tensor(585, device='cuda:0'), 'start_logit': 0.99951171875, 'end_logit': 0.96435546875}, {'start': tensor(578, device='cuda:0'), 'end': tensor(585, device='cuda:0'), 'start_logit': 0.9990234375, 'end_logit': 0.96435546875}, {'start': tensor(381, device='cuda:0

size of gathered_logits: torch.Size([1, 64])
gathered_logits: tensor([[-2.1094, -3.7871, -3.8398, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984,
         -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984,
         -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984,
         -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984,
         -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984,
         -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984,
         -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984,
         -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984, -3.8984]],
       device='cuda:0', dtype=torch.float16)
gathered_logits after -inf: tensor([[-2.1094, -3.7871, -3.8398,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,

Epoch 26:  96%|█████████▋| 78/81 [00:31<00:00,  3.28batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=14, gpu=0, loss=5.005, v_num=0, val_loss=7.53]validation_step
size of input_ids: torch.Size([1, 1849])
size of attention_mask: torch.Size([1, 1849])
size of segment_ids: torch.Size([1, 1849])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 51])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([41])
p_index: tensor([  18,  171,  225,  495,  652,  970, 1256, 1442, 1596, 1696],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1849, 768])
size of logits: torch.Size([1, 1849, 2])
size of start_logits: torch.Size([

Epoch 26:  98%|█████████▊| 79/81 [00:31<00:00,  3.09batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=14, gpu=0, loss=5.005, v_num=0, val_loss=7.53]validation_step
size of input_ids: torch.Size([1, 1310])
size of attention_mask: torch.Size([1, 1310])
size of segment_ids: torch.Size([1, 1310])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 40])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([30])
p_index: tensor([  34,   91,  222,  316,  347,  479,  651,  711, 1092, 1277],
       device='cuda:0')
size of sequence_output: torch.Size([1, 1310, 768])
size of logits: torch.Size([1, 1310, 2])
size of start_logits: torch.Size([

final loss: tensor(0.7271, device='cuda:0', dtype=torch.float16)
total_loss: tensor(6.3314, device='cuda:0')
loss: tensor(6.3314, device='cuda:0')
decode
question_end_index:  tensor([33], device='cuda:0')
start_logits_indices:  tensor([1110,   67, 1093, 1278,  657,   88,  218,   50, 1118, 1111,  790,  848,
         680,   92,  117,  362,  712, 1151, 1190, 1007], device='cuda:0')
type_logits:  tensor([[0.5859, 0.4365, 0.4255]], device='cuda:0')
p_type:  0
p_type_score:  0.5859375
sorted_answers: [{'start': tensor(1110, device='cuda:0'), 'end': tensor(1110, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.98095703125}, {'start': tensor(1093, device='cuda:0'), 'end': tensor(1110, device='cuda:0'), 'start_logit': 1.0, 'end_logit': 0.98095703125}, {'start': tensor(712, device='cuda:0'), 'end': tensor(715, device='cuda:0'), 'start_logit': 0.99560546875, 'end_logit': 0.98193359375}, {'start': tensor(657, device='cuda:0'), 'end': tensor(657, device='cuda:0'), 'start_logit': 0.99951171875, 

Epoch 26:  99%|█████████▉| 80/81 [00:32<00:00,  3.10batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f1=0, avg_val_joint_prec=0, avg_val_joint_recall=0, avg_val_prec=0, avg_val_recall=0, avg_val_sp_sent_em=0, avg_val_sp_sent_f1=0, avg_val_sp_sent_prec=0, avg_val_sp_sent_recall=0, batch_idx=14, gpu=0, loss=5.005, v_num=0, val_loss=7.53]
                                                              [A

INFO:root:
Epoch 00025: avg_val_f1 was not in top 5


size of input_ids: torch.Size([1, 746])
size of attention_mask: torch.Size([1, 746])
size of segment_ids: torch.Size([1, 746])
size of start_positions: torch.Size([1, 64])
size of end_positions:torch.Size([1, 64])
q_type: tensor([0], device='cuda:0')
size of sp_sent: torch.Size([1, 23])
size of sp_para: torch.Size([1, 10])
size of p_index: torch.Size([10])
size of s_index: torch.Size([13])
p_index: tensor([ 12, 107, 169, 348, 382, 430, 490, 634, 663, 721], device='cuda:0')
size of sequence_output: torch.Size([1, 746, 768])
size of logits: torch.Size([1, 746, 2])
size of start_logits: torch.Size([1, 746, 1])
size of start_logits after squeeze: torch.Size([1, 746])
size of sequence_output[:,0]: torch.Size([1, 768])
size of type_logits after dense: torch.Size([1, 768])
size of type_logits after tanh: torch.Size([1, 768])
size of type_logits: torch.Size([1, 3])
size of sp_para_output: torch.Size([1, 10, 768])
size of sp_para_output_t after dense: torch.Size([1, 10, 768])
size of sp_para_ou

size of gathered_logits after view: torch.Size([1, 1])
size of logits after view: torch.Size([1, 3])
log_score: tensor([0.3616], device='cuda:0', dtype=torch.float16,
       grad_fn=<LogsumexpBackward>)
log_norm: tensor([1.1055], device='cuda:0', dtype=torch.float16,
       grad_fn=<LogsumexpBackward>)
loss: tensor([0.7441], device='cuda:0', dtype=torch.float16, grad_fn=<NegBackward>)
final loss: tensor(0.7441, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward0>)
total_loss: tensor(3.5913, device='cuda:0', grad_fn=<AddBackward0>)
returned loss:  tensor(3.5913, device='cuda:0', grad_fn=<AddBackward0>)
self.trainer.optimizers[0].param_groups[0]['lr']:  2.8000000000000003e-05
loss:  tensor(3.5913, device='cuda:0', grad_fn=<AddBackward0>)
lr:  tensor([2.8000e-05], device='cuda:0')
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0
Epoch 26: 100%|██████████| 81/81 [00:32<00:00,  2.46batch/s, avg_val_em=0, avg_val_f1=0, avg_val_joint_em=0, avg_val_joint_f

1

In [None]:
    # Run the trainer's test routine on `model`. Both `trainer` and `model`
    # are expected to exist already from earlier cells (not defined here).
    # NOTE(review): presumably a PyTorch Lightning Trainer -- confirm.
    trainer.test(model)

In [111]:
# Debug cell: emulate a command-line invocation from inside the notebook.
# Builds the argument string, parses it with the model's argument parser,
# prints the resulting namespace, and then calls main() with it.
import shlex
# The backslash-newline pairs inside the quoted literal are line
# continuations, so argString is a single logical line; the extra runs of
# spaces are harmless because shlex.split() splits on whitespace.
argString ='--train_dataset small.json --dev_dataset small_dev.json  \
    --gpus 0  --num_workers 4 \
    --max_seq_len 4096 --doc_stride -1  \
    --save_prefix hotpotqa-longformer  --model_path /Users/fan/Downloads/longformer-base-4096 --test '
# Alternative dataset file name kept for reference:
# hotpot_dev_distractor_v1.json
# NOTE(review): presumably the full HotpotQA distractor dev set -- confirm.

import argparse
if __name__ == "__main__":
    main_arg_parser = argparse.ArgumentParser(description="hotpotqa")
    # NOTE(review): `hotpotqa`, `main` and `os` are not defined/imported in
    # this cell; they must already be in scope from earlier cells.
    parser = hotpotqa.add_model_specific_args(main_arg_parser, os.getcwd())
    # shlex.split() tokenizes the string the way a POSIX shell would, giving
    # parse_args() the same list it would normally receive from sys.argv[1:].
    args = parser.parse_args(shlex.split(argString))
    print(args)  # show the parsed values (including defaults) before running
    main(args)

Namespace(attention_mode='sliding_chunks', batch_size=8, dev_dataset='small_dev.json', disable_checkpointing=False, doc_stride=-1, epochs=26, fp32=False, gpus='0', ignore_seq_with_no_answers=False, lr=5e-05, max_answer_length=30, max_doc_len=4096, max_num_answers=64, max_question_len=55, max_seq_len=4096, model_path='/Users/fan/Downloads/longformer-base-4096', n_best_size=20, no_progress_bar=False, num_workers=4, regular_softmax_loss=False, save_dir='hotpotqa', save_prefix='hotpotqa-longformer', seed=1234, sp_para_th=0.45, sp_sent_th=0.4, test=True, train_dataset='small.json', val_every=0.2, val_percent_check=1.0, warmup=1000)
