In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


importing libraries

In [5]:
#!pip3 install transformers
#!pip3 install sentencepiece

In [6]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [7]:

import pandas as pd
#from datasets import load_dataset
from datasets import load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


from tqdm import tqdm
import torch


In [8]:
!nvidia-smi

/bin/bash: nvidia-smi: command not found


In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [10]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]


In [47]:
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, 
                               batch_size=8, device=device, 
                               column_text="article", 
                               column_summary="highlights"):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):
        
        inputs = tokenizer(article_batch, max_length=1024,  truncation=True, 
                        padding="max_length", return_tensors="pt")
        
        # Filter out examples with a shorter sequence length
        #mask = inputs['attention_mask'].sum(dim=1) > 0
        #inputs = {k: v[mask] for k, v in inputs.items()}
        #target_batch = [t for t, m in zip(target_batch, mask) if m]
        
        print("input_ids shape:", inputs["input_ids"].shape)
        print("max input_ids value:", inputs["input_ids"].max())
        print("vocabulary size:", tokenizer.vocab_size)

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device), 
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''
       
        print(summaries.shape)
        print(inputs["attention_mask"].shape)
        # Finally, we decode the generated texts, 
        # replace the <n> token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
                                clean_up_tokenization_spaces=True) 
               for s in summaries]      
        
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
              
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
        
    #  Finally compute and return the ROUGE scores.
    score = metric.compute(use_stemmer=True)
    return score

In [12]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

rouge_metric = load_metric('rouge')

  rouge_metric = load_metric('rouge')


In [13]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments


In [14]:
class PegasusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

In [15]:
def prepare_data(model_name, 
                 train_texts, train_labels, 
                 val_texts, val_labels, 
                 test_texts=None, test_labels=None):
  """
  Prepare input data for model fine-tuning
  """
  tokenizer = PegasusTokenizer.from_pretrained(model_name)

  prepare_val = False if val_texts is None or val_labels is None else True
  prepare_test = False if test_texts is None or test_labels is None else True

  def tokenize_data(texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True)
    decodings = tokenizer(labels, truncation=True, padding=True)
    dataset_tokenized = PegasusDataset(encodings, decodings)
    return dataset_tokenized

  train_dataset = tokenize_data(train_texts, train_labels)
  val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
  test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

  return train_dataset, val_dataset, test_dataset

In [44]:
def prepare_fine_tuning(model_name, train_dataset, val_dataset, freeze_encoder=False, output_dir='./results'):
  """
  Prepare configurations and base model for fine-tuning
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  if val_dataset is not None:
    training_args = TrainingArguments(
      output_dir=output_dir,           
      adafactor=True,                  
      num_train_epochs=20,           
      per_device_train_batch_size=2,   
      per_device_eval_batch_size=2,   
      save_steps=5,                  
      save_total_limit=5,              
      evaluation_strategy='steps',     
      eval_steps=5,                  
      warmup_steps=500,  
      learning_rate=0.0002,              
      weight_decay=0.01,              
      logging_dir='./logs',            
      logging_steps=10,
      gradient_accumulation_steps=4
    )

    trainer = Trainer(
      model=model,                         
      args=training_args,                  
      train_dataset=train_dataset,         
      eval_dataset=val_dataset             
    )

  else:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      adafactor=True,                  # use adafactor instead of AdamW
      num_train_epochs=1,           # total number of training epochs
      per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
      save_steps=100,                  # number of updates steps before checkpoint saves
      save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               
      logging_dir='./logs',            
      logging_steps=10,
    )

    trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
    )

  return trainer

In [17]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/final/augmented_train3.csv')

In [18]:
df.shape

(480289, 2)

In [19]:
#importing the cleaned and preprocessed test csv file exported from jupyter notebook
test_df = pd.read_csv('/content/drive/MyDrive/final/cleaned_test.csv')

In [20]:
test_df.shape

(15441, 2)

In [21]:
#importing the cleaned and preprocessed test csv file exported from jupyter notebook
val_df = pd.read_csv('/content/drive/MyDrive/final/cleaned_val.csv')

In [22]:
val_df.shape

(15442, 2)

In [23]:
#shuffling all the rows so that original and augmented data rows gets mixed
train_df = df.sample(frac=1, random_state=42)

In [24]:
# reset index
train_df.reset_index(drop=True, inplace=True)

FOR ORIGINAL DATA WITHOUT AUGMENTATION

In [25]:
from datasets import load_dataset

In [26]:
dataset_cnn = load_dataset('cnn_dailymail','3.0.0')
split_lengths = [len(dataset_cnn[split])for split in dataset_cnn]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_cnn['train'].column_names}")



  0%|          | 0/3 [00:00<?, ?it/s]

Split lengths: [287113, 13368, 11490]
Features: ['article', 'highlights', 'id']


In [27]:
!pip install --upgrade accelerate
from functools import partial



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [28]:
!pip install transformers==4.28.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [34]:
# use cnn dataset as example, with first original as training data

train_texts, train_labels = dataset_cnn['train'][:30000]['article'], dataset_cnn['train']['highlights'][:30000]
#train_texts, train_labels = dataset['train']['document'], dataset['train']['summary']
val_texts, val_labels = dataset_cnn['validation']['article'][:2000], dataset_cnn['validation']['highlights'][:2000]

In [30]:
model_checkpoint = 'google/pegasus-multi_news'
# about 8 mins to finish for the whole dataset
train_dataset, val_dataset, _ = prepare_data(model_checkpoint, train_texts, train_labels, val_texts, val_labels)


In [45]:
%%time
# only about 1 minute
trainer = prepare_fine_tuning(model_checkpoint, train_dataset, val_dataset)

CPU times: user 15.1 s, sys: 4.2 s, total: 19.3 s
Wall time: 13.9 s




In [46]:
trainer.train()

Step,Training Loss,Validation Loss
5,No log,8.673236
10,1.851900,8.652978
15,1.851900,8.62141
20,1.813200,8.564925


TrainOutput(global_step=20, training_loss=1.8325450897216797, metrics={'train_runtime': 195.9177, 'train_samples_per_second': 0.204, 'train_steps_per_second': 0.102, 'total_flos': 115578576568320.0, 'train_loss': 1.8325450897216797, 'epoch': 20.0})

In [35]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

rouge_metric = load_metric('rouge')

In [37]:
pegtokenizer = PegasusTokenizer.from_pretrained(model_checkpoint)

In [48]:
score = calculate_metric_on_test_ds(
    dataset_cnn['test'][:500], rouge_metric, trainer.model, pegtokenizer, batch_size = 3, column_text = 'article', column_summary= 'highlights'
)

rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'pegasus'] )

  0%|          | 0/167 [00:00<?, ?it/s]

input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95399)
vocabulary size: 96103


  1%|          | 1/167 [00:24<1:07:06, 24.26s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(88480)
vocabulary size: 96103


  1%|          | 2/167 [00:47<1:05:07, 23.68s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95967)
vocabulary size: 96103


  2%|▏         | 3/167 [01:10<1:04:13, 23.50s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(87575)
vocabulary size: 96103


  2%|▏         | 4/167 [01:33<1:02:39, 23.07s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(88699)
vocabulary size: 96103


  3%|▎         | 5/167 [01:56<1:02:03, 22.99s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92531)
vocabulary size: 96103


  4%|▎         | 6/167 [02:19<1:02:27, 23.27s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93359)
vocabulary size: 96103


  4%|▍         | 7/167 [02:42<1:01:10, 22.94s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94811)
vocabulary size: 96103


  5%|▍         | 8/167 [03:04<1:00:38, 22.89s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95607)
vocabulary size: 96103


  5%|▌         | 9/167 [03:28<1:01:04, 23.19s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94494)
vocabulary size: 96103


  6%|▌         | 10/167 [03:51<1:00:04, 22.96s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95534)
vocabulary size: 96103


  7%|▋         | 11/167 [04:13<59:31, 22.89s/it]  

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(91755)
vocabulary size: 96103


  7%|▋         | 12/167 [04:38<1:00:19, 23.35s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93454)
vocabulary size: 96103


  8%|▊         | 13/167 [05:02<1:00:09, 23.44s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95460)
vocabulary size: 96103


  8%|▊         | 14/167 [05:24<59:17, 23.25s/it]  

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(91955)
vocabulary size: 96103


  9%|▉         | 15/167 [05:48<59:01, 23.30s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92172)
vocabulary size: 96103


 10%|▉         | 16/167 [06:10<58:03, 23.07s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93338)
vocabulary size: 96103


 10%|█         | 17/167 [06:33<57:32, 23.02s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92610)
vocabulary size: 96103


 11%|█         | 18/167 [06:57<57:35, 23.19s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(87023)
vocabulary size: 96103


 11%|█▏        | 19/167 [07:20<56:56, 23.09s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(91223)
vocabulary size: 96103


 12%|█▏        | 20/167 [07:42<56:05, 22.90s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95379)
vocabulary size: 96103


 13%|█▎        | 21/167 [08:06<56:12, 23.10s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95722)
vocabulary size: 96103


 13%|█▎        | 22/167 [08:28<55:37, 23.02s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95775)
vocabulary size: 96103


 14%|█▍        | 23/167 [08:51<55:02, 22.93s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92627)
vocabulary size: 96103


 14%|█▍        | 24/167 [09:14<54:20, 22.80s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94709)
vocabulary size: 96103


 15%|█▍        | 25/167 [09:38<54:47, 23.15s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92818)
vocabulary size: 96103


 16%|█▌        | 26/167 [10:01<54:47, 23.31s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94872)
vocabulary size: 96103


 16%|█▌        | 27/167 [10:24<53:52, 23.09s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95616)
vocabulary size: 96103


 17%|█▋        | 28/167 [10:47<53:48, 23.23s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(90423)
vocabulary size: 96103


 17%|█▋        | 29/167 [11:10<53:13, 23.14s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(89875)
vocabulary size: 96103


 18%|█▊        | 30/167 [11:33<52:47, 23.12s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(88737)
vocabulary size: 96103


 19%|█▊        | 31/167 [11:56<52:12, 23.03s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95659)
vocabulary size: 96103


 19%|█▉        | 32/167 [12:19<51:30, 22.89s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94712)
vocabulary size: 96103


 20%|█▉        | 33/167 [12:41<50:55, 22.80s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(89286)
vocabulary size: 96103


 20%|██        | 34/167 [13:05<51:17, 23.14s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95642)
vocabulary size: 96103


 21%|██        | 35/167 [13:28<50:41, 23.04s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94229)
vocabulary size: 96103


 22%|██▏       | 36/167 [13:51<49:52, 22.84s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92614)
vocabulary size: 96103


 22%|██▏       | 37/167 [14:19<53:07, 24.52s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95341)
vocabulary size: 96103


 23%|██▎       | 38/167 [14:42<51:54, 24.14s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(89703)
vocabulary size: 96103


 23%|██▎       | 39/167 [15:05<50:32, 23.69s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(89786)
vocabulary size: 96103


 24%|██▍       | 40/167 [15:28<49:47, 23.52s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92025)
vocabulary size: 96103


 25%|██▍       | 41/167 [15:51<49:14, 23.45s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94429)
vocabulary size: 96103


 25%|██▌       | 42/167 [16:13<47:59, 23.04s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92771)
vocabulary size: 96103


 26%|██▌       | 43/167 [16:36<47:10, 22.83s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95386)
vocabulary size: 96103


 26%|██▋       | 44/167 [16:59<47:16, 23.06s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(90981)
vocabulary size: 96103


 27%|██▋       | 45/167 [17:22<46:48, 23.02s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93338)
vocabulary size: 96103


 28%|██▊       | 46/167 [17:45<46:15, 22.94s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92106)
vocabulary size: 96103


 28%|██▊       | 47/167 [18:09<46:43, 23.37s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92325)
vocabulary size: 96103


 29%|██▊       | 48/167 [18:31<45:27, 22.92s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94815)
vocabulary size: 96103


 29%|██▉       | 49/167 [18:54<44:58, 22.87s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(90091)
vocabulary size: 96103


 30%|██▉       | 50/167 [19:18<45:20, 23.25s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92726)
vocabulary size: 96103


 31%|███       | 51/167 [19:41<44:31, 23.03s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93262)
vocabulary size: 96103


 31%|███       | 52/167 [20:04<44:16, 23.10s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95775)
vocabulary size: 96103


 32%|███▏      | 53/167 [20:27<44:00, 23.17s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94136)
vocabulary size: 96103


 32%|███▏      | 54/167 [20:50<43:17, 22.98s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95134)
vocabulary size: 96103


 33%|███▎      | 55/167 [21:13<42:51, 22.96s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95103)
vocabulary size: 96103


 34%|███▎      | 56/167 [21:35<42:17, 22.86s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93872)
vocabulary size: 96103


 34%|███▍      | 57/167 [21:59<42:11, 23.02s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(90919)
vocabulary size: 96103


 35%|███▍      | 58/167 [22:22<41:44, 22.98s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(90955)
vocabulary size: 96103


 35%|███▌      | 59/167 [22:45<41:25, 23.01s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94393)
vocabulary size: 96103


 36%|███▌      | 60/167 [23:07<40:54, 22.93s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95134)
vocabulary size: 96103


 37%|███▋      | 61/167 [23:30<40:20, 22.83s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(89792)
vocabulary size: 96103


 37%|███▋      | 62/167 [23:53<40:11, 22.97s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(90254)
vocabulary size: 96103


 38%|███▊      | 63/167 [24:16<39:37, 22.86s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(89872)
vocabulary size: 96103


 38%|███▊      | 64/167 [24:38<39:00, 22.72s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93615)
vocabulary size: 96103


 39%|███▉      | 65/167 [25:02<38:51, 22.86s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92542)
vocabulary size: 96103


 40%|███▉      | 66/167 [25:25<38:38, 22.95s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94471)
vocabulary size: 96103


 40%|████      | 67/167 [25:47<38:04, 22.84s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(88064)
vocabulary size: 96103


 41%|████      | 68/167 [26:10<37:49, 22.92s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95693)
vocabulary size: 96103


 41%|████▏     | 69/167 [26:33<37:26, 22.92s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94815)
vocabulary size: 96103


 42%|████▏     | 70/167 [26:56<37:11, 23.01s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(91808)
vocabulary size: 96103


 43%|████▎     | 71/167 [27:20<37:03, 23.16s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93301)
vocabulary size: 96103


 43%|████▎     | 72/167 [27:42<36:13, 22.88s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95338)
vocabulary size: 96103


 44%|████▎     | 73/167 [28:05<35:55, 22.93s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(91134)
vocabulary size: 96103


 44%|████▍     | 74/167 [28:28<35:36, 22.97s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95452)
vocabulary size: 96103


 45%|████▍     | 75/167 [28:51<35:05, 22.89s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93003)
vocabulary size: 96103


 46%|████▌     | 76/167 [29:15<34:58, 23.06s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(90561)
vocabulary size: 96103


 46%|████▌     | 77/167 [29:37<34:26, 22.97s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(88903)
vocabulary size: 96103


 47%|████▋     | 78/167 [30:01<34:14, 23.09s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95193)
vocabulary size: 96103


 47%|████▋     | 79/167 [30:24<34:05, 23.24s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95608)
vocabulary size: 96103


 48%|████▊     | 80/167 [30:46<33:05, 22.82s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95720)
vocabulary size: 96103


 49%|████▊     | 81/167 [31:09<32:32, 22.71s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92819)
vocabulary size: 96103


 49%|████▉     | 82/167 [31:32<32:25, 22.89s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(90214)
vocabulary size: 96103


 50%|████▉     | 83/167 [31:54<31:53, 22.77s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95921)
vocabulary size: 96103


 50%|█████     | 84/167 [32:17<31:30, 22.78s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94385)
vocabulary size: 96103


 51%|█████     | 85/167 [32:41<31:25, 22.99s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94815)
vocabulary size: 96103


 51%|█████▏    | 86/167 [33:03<30:57, 22.93s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94226)
vocabulary size: 96103


 52%|█████▏    | 87/167 [33:27<30:49, 23.12s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95987)
vocabulary size: 96103


 53%|█████▎    | 88/167 [33:50<30:21, 23.06s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(81449)
vocabulary size: 96103


 53%|█████▎    | 89/167 [34:13<29:54, 23.01s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(83415)
vocabulary size: 96103


 54%|█████▍    | 90/167 [34:38<30:20, 23.64s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95194)
vocabulary size: 96103


 54%|█████▍    | 91/167 [35:01<29:41, 23.44s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(84958)
vocabulary size: 96103


 55%|█████▌    | 92/167 [35:23<28:54, 23.13s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(96092)
vocabulary size: 96103


 56%|█████▌    | 93/167 [35:47<28:37, 23.21s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95905)
vocabulary size: 96103


 56%|█████▋    | 94/167 [36:10<28:06, 23.10s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93338)
vocabulary size: 96103


 57%|█████▋    | 95/167 [36:32<27:34, 22.98s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93566)
vocabulary size: 96103


 57%|█████▋    | 96/167 [36:56<27:26, 23.19s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(91223)
vocabulary size: 96103


 58%|█████▊    | 97/167 [37:18<26:47, 22.96s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(84650)
vocabulary size: 96103


 59%|█████▊    | 98/167 [37:42<26:37, 23.16s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94961)
vocabulary size: 96103


 59%|█████▉    | 99/167 [38:05<26:13, 23.14s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(87371)
vocabulary size: 96103


 60%|█████▉    | 100/167 [38:28<25:39, 22.98s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94726)
vocabulary size: 96103


 60%|██████    | 101/167 [38:51<25:33, 23.24s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(88950)
vocabulary size: 96103


 61%|██████    | 102/167 [39:14<25:02, 23.11s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(85360)
vocabulary size: 96103


 62%|██████▏   | 103/167 [39:38<24:55, 23.37s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94542)
vocabulary size: 96103


 62%|██████▏   | 104/167 [40:02<24:32, 23.37s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(89872)
vocabulary size: 96103


 63%|██████▎   | 105/167 [40:25<24:07, 23.34s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(91438)
vocabulary size: 96103


 63%|██████▎   | 106/167 [40:48<23:39, 23.26s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94694)
vocabulary size: 96103


 64%|██████▍   | 107/167 [41:11<23:09, 23.17s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(80894)
vocabulary size: 96103


 65%|██████▍   | 108/167 [41:34<22:50, 23.24s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92609)
vocabulary size: 96103


 65%|██████▌   | 109/167 [41:57<22:21, 23.12s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93266)
vocabulary size: 96103


 66%|██████▌   | 110/167 [42:20<21:59, 23.15s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94864)
vocabulary size: 96103


 66%|██████▋   | 111/167 [42:44<21:51, 23.42s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95790)
vocabulary size: 96103


 67%|██████▋   | 112/167 [43:07<21:12, 23.14s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95607)
vocabulary size: 96103


 68%|██████▊   | 113/167 [43:32<21:18, 23.67s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95369)
vocabulary size: 96103


 68%|██████▊   | 114/167 [43:55<20:42, 23.45s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95722)
vocabulary size: 96103


 69%|██████▉   | 115/167 [44:19<20:27, 23.60s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95698)
vocabulary size: 96103


 69%|██████▉   | 116/167 [44:42<19:57, 23.48s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95616)
vocabulary size: 96103


 70%|███████   | 117/167 [45:05<19:28, 23.38s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94183)
vocabulary size: 96103


 71%|███████   | 118/167 [45:28<19:00, 23.27s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92781)
vocabulary size: 96103


 71%|███████▏  | 119/167 [45:51<18:31, 23.16s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92329)
vocabulary size: 96103


 72%|███████▏  | 120/167 [46:15<18:16, 23.32s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92396)
vocabulary size: 96103


 72%|███████▏  | 121/167 [46:38<17:49, 23.24s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93768)
vocabulary size: 96103


 73%|███████▎  | 122/167 [47:01<17:22, 23.18s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95239)
vocabulary size: 96103


 74%|███████▎  | 123/167 [47:24<16:57, 23.13s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(96004)
vocabulary size: 96103


 74%|███████▍  | 124/167 [47:46<16:24, 22.89s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(96092)
vocabulary size: 96103


 75%|███████▍  | 125/167 [48:09<15:58, 22.83s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(89328)
vocabulary size: 96103


 75%|███████▌  | 126/167 [48:32<15:40, 22.94s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95192)
vocabulary size: 96103


 76%|███████▌  | 127/167 [48:55<15:17, 22.93s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95452)
vocabulary size: 96103


 77%|███████▋  | 128/167 [49:19<15:04, 23.20s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95023)
vocabulary size: 96103


 77%|███████▋  | 129/167 [49:41<14:33, 22.99s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95616)
vocabulary size: 96103


 78%|███████▊  | 130/167 [50:05<14:15, 23.11s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95338)
vocabulary size: 96103


 78%|███████▊  | 131/167 [50:28<13:52, 23.13s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94462)
vocabulary size: 96103


 79%|███████▉  | 132/167 [50:50<13:21, 22.89s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(90353)
vocabulary size: 96103


 80%|███████▉  | 133/167 [51:13<13:01, 23.00s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95192)
vocabulary size: 96103


 80%|████████  | 134/167 [51:36<12:36, 22.92s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95853)
vocabulary size: 96103


 81%|████████  | 135/167 [51:59<12:09, 22.78s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93733)
vocabulary size: 96103


 81%|████████▏ | 136/167 [52:22<11:53, 23.00s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(88263)
vocabulary size: 96103


 82%|████████▏ | 137/167 [52:45<11:26, 22.88s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95059)
vocabulary size: 96103


 83%|████████▎ | 138/167 [53:07<10:57, 22.68s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93768)
vocabulary size: 96103


 83%|████████▎ | 139/167 [53:30<10:39, 22.86s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95702)
vocabulary size: 96103


 84%|████████▍ | 140/167 [53:54<10:21, 23.03s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95806)
vocabulary size: 96103


 84%|████████▍ | 141/167 [54:16<09:53, 22.83s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(90851)
vocabulary size: 96103


 85%|████████▌ | 142/167 [54:41<09:49, 23.59s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94571)
vocabulary size: 96103


 86%|████████▌ | 143/167 [55:04<09:21, 23.39s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(88275)
vocabulary size: 96103


 86%|████████▌ | 144/167 [55:28<09:00, 23.49s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94710)
vocabulary size: 96103


 87%|████████▋ | 145/167 [55:50<08:29, 23.18s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(96037)
vocabulary size: 96103


 87%|████████▋ | 146/167 [56:13<08:01, 22.93s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95916)
vocabulary size: 96103


 88%|████████▊ | 147/167 [56:36<07:42, 23.11s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(96000)
vocabulary size: 96103


 89%|████████▊ | 148/167 [56:59<07:18, 23.08s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95469)
vocabulary size: 96103


 89%|████████▉ | 149/167 [57:22<06:51, 22.85s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(83548)
vocabulary size: 96103


 90%|████████▉ | 150/167 [57:45<06:31, 23.06s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93630)
vocabulary size: 96103


 90%|█████████ | 151/167 [58:08<06:09, 23.09s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92614)
vocabulary size: 96103


 91%|█████████ | 152/167 [58:32<05:47, 23.17s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(92285)
vocabulary size: 96103


 92%|█████████▏| 153/167 [58:55<05:23, 23.10s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95335)
vocabulary size: 96103


 92%|█████████▏| 154/167 [59:17<04:57, 22.88s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93262)
vocabulary size: 96103


 93%|█████████▎| 155/167 [59:41<04:37, 23.13s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93781)
vocabulary size: 96103


 93%|█████████▎| 156/167 [1:00:04<04:15, 23.21s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95616)
vocabulary size: 96103


 94%|█████████▍| 157/167 [1:00:27<03:50, 23.08s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95932)
vocabulary size: 96103


 95%|█████████▍| 158/167 [1:00:50<03:27, 23.05s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95702)
vocabulary size: 96103


 95%|█████████▌| 159/167 [1:01:13<03:03, 22.95s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94872)
vocabulary size: 96103


 96%|█████████▌| 160/167 [1:01:36<02:40, 23.00s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95864)
vocabulary size: 96103


 96%|█████████▋| 161/167 [1:01:59<02:17, 22.97s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95835)
vocabulary size: 96103


 97%|█████████▋| 162/167 [1:02:21<01:54, 22.90s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(95814)
vocabulary size: 96103


 98%|█████████▊| 163/167 [1:02:45<01:32, 23.17s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(94771)
vocabulary size: 96103


 98%|█████████▊| 164/167 [1:03:08<01:09, 23.14s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93386)
vocabulary size: 96103


 99%|█████████▉| 165/167 [1:03:32<00:46, 23.23s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([3, 1024])
max input_ids value: tensor(93736)
vocabulary size: 96103


 99%|█████████▉| 166/167 [1:03:55<00:23, 23.15s/it]

torch.Size([3, 128])
torch.Size([3, 1024])
input_ids shape: torch.Size([2, 1024])
max input_ids value: tensor(94693)
vocabulary size: 96103


100%|██████████| 167/167 [1:04:12<00:00, 23.07s/it]

torch.Size([2, 128])
torch.Size([2, 1024])





Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.27176,0.096749,0.180235,0.213639
