## Package Installation

In [1]:
!pip install transformers datasets evaluate
!pip install --upgrade accelerate

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


## Arguments

### Model Arguments

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')
# project_folder_path = "/content/drive/MyDrive/nlp_project"

In [3]:
project_folder_path = "."

In [4]:
# model_name_or_path = project_folder_path + '/output/preTrainedModels/model_1'
model_name_or_path = 't5-small'
model_random_weights = True

### Data Arguments

In [5]:
data_folder_path = project_folder_path + "/data"

data_args = {
    'train_file': data_folder_path + '/pretraining/preprocessed_data' + '/polyai-bank_label_denoising.json', # mention the path for train data
    'validation_file': data_folder_path + '/evaluation/preprocessed_data' + '/SNIPS_label_denoising.json', # mention the path for validation data
    'max_target_length': 128,
    'max_source_length': 512,
    'ignore_pad_token_for_loss': True,
    }

### Training Arguments

In [6]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    project_folder_path + '/output/preTrainedModels/model_3', # mention the path where you want model to be saved
    predict_with_generate= False,
    do_train= True,
    do_eval= True,
    per_device_train_batch_size= 2,
    per_device_eval_batch_size= 1,
    gradient_accumulation_steps= 4,
    learning_rate= 5e-4,
    evaluation_strategy= 'epoch',
    num_train_epochs= 5,
    save_total_limit= 2,
    save_strategy= 'epoch',
    load_best_model_at_end= True,
    logging_strategy='epoch',
    seed= 42
    )

  from .autonotebook import tqdm as notebook_tqdm


## Setting up the seed

In [7]:
from transformers import set_seed

set_seed(42)

## Load Dataset

In [8]:
from datasets import load_dataset

data_files = { 'train': data_args['train_file'], 'validation': data_args['validation_file'] }
datasets = load_dataset('json', data_files=data_files)

Found cached dataset json (/work/pi_adrozdov_umass_edu/vpamidimukka_umass_edu/hf_cache/datasets/json/default-842a59ebcb976eea/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 2/2 [00:00<00:00, 314.90it/s]


In [9]:
datasets

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 13083
    })
    validation: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 14484
    })
})

## Load pretrained model and tokenizer

In [10]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM

config = AutoConfig.from_pretrained(model_name_or_path)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast = True)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name_or_path,
    config = config,
    from_tf=bool('.ckpt' in model_name_or_path)
)

if model_random_weights:
    model.init_weights()

## Tokenize the inputs and targets

### Tokenizing Datasets

In [11]:
def preprocess_function(examples):
    inputs = [ex for ex in examples['inputs']]
    targets = [ex for ex in examples['targets']]
    model_inputs = tokenizer(inputs, max_length= data_args['max_source_length'], padding = False, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length = data_args['max_target_length'], padding = False, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [12]:
train_dataset, eval_dataset = datasets['train'], datasets['validation']

## Note: Due to memory out of bound issue: Restricting the validation to 1000 examples
eval_dataset.shuffle(42)
eval_dataset = eval_dataset.select(range(500))

print("train dataset", train_dataset)
print("eval dataset", eval_dataset)

Loading cached shuffled indices for dataset at /work/pi_adrozdov_umass_edu/vpamidimukka_umass_edu/hf_cache/datasets/json/default-842a59ebcb976eea/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-7bba45a32481dbe3.arrow


train dataset Dataset({
    features: ['inputs', 'targets'],
    num_rows: 13083
})
eval dataset Dataset({
    features: ['inputs', 'targets'],
    num_rows: 500
})


In [13]:
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
    load_from_cache_file=True,
)

eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=eval_dataset.column_names,
    load_from_cache_file=True,
)

Loading cached processed dataset at /work/pi_adrozdov_umass_edu/vpamidimukka_umass_edu/hf_cache/datasets/json/default-842a59ebcb976eea/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-47a9628863b3e29f.arrow
Loading cached processed dataset at /work/pi_adrozdov_umass_edu/vpamidimukka_umass_edu/hf_cache/datasets/json/default-842a59ebcb976eea/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-3a2e25feeaf2c78e.arrow


### Data Collator

In [14]:
eval_dataset[0]

{'input_ids': [2334,
  1008,
  11,
  4488,
  21301,
  12,
  82,
  8067,
  1734,
  12,
  17112,
  13,
  7913,
  23424,
  5,
  32099,
  1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [32099, 2334, 304, 2911, 3350, 5, 1, 1]}

In [15]:
from transformers import DataCollatorForSeq2Seq

label_pad_token_id = -100 if data_args['ignore_pad_token_for_loss'] else tokenizer.pad_token_id

data_collator = DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=label_pad_token_id)

### Initialise Trainer

### Metrics

In [16]:
from transformers import TrainerCallback, TrainerState
import math
import pandas as pd
import numpy as np

class AccCallback(TrainerCallback):
    def __init__(self, trainer) -> None:
      super().__init__()
      self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
      print("Calculating Accuracy on eval Dataset: START")
      model = self._trainer.model
      eval_dataloader = self._trainer.get_eval_dataloader()

      exact_match_acc = []
      first_word_acc = []
      for steps, inputs in enumerate(eval_dataloader):
        input_ids = inputs['input_ids'].to('cuda:0')
        attention_mask = inputs['attention_mask'].to('cuda:0')
        labels = inputs['labels'].to('cuda:0')
        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids, max_new_tokens = data_args['max_target_length'])
  
        pred_labels = np.array([tokenizer.decode(masked, skip_special_tokens=True) for masked in outputs])
        gold_labels = np.array([tokenizer.decode(masked, skip_special_tokens=True) for masked in labels])
        
        # remove space before period/question mark
        gold_labels = np.array([word.replace(' ?', '?').replace(' .', '.').replace(' ,', ',') for word in gold_labels])
        
        # total masked tokens
        total_masked_labels = len(pred_labels)
        
        exact_match_acc.append(np.sum(pred_labels == gold_labels)/ total_masked_labels)
        
        # first word accuracy
        pred_labels = np.array([word.split()[0] if len(word.split()) else '' for word in pred_labels])
        gold_labels = np.array([word.split()[0] if len(word.split()) else '' for word in gold_labels])
        first_word_acc.append(np.sum(pred_labels == gold_labels)/ total_masked_labels)
    
      exact_match_acc_mean = np.mean(exact_match_acc)
      first_word_acc_mean = np.mean(first_word_acc)
        
      print(f"Epoch {state.epoch} Exact words match accuracy: {exact_match_acc_mean}")
      print(f"Epoch {state.epoch} First word match accuracy: {first_word_acc_mean}")
      print("Calculating Accuracy on eval Dataset: END")

class PerplexityCallback(TrainerCallback):
    def __init__(self, trainer) -> None:
      super().__init__()
      self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
      print("Calculating Perplexity on eval Dataset: START")
      model = self._trainer.model
      eval_dataloader = self._trainer.get_eval_dataloader()
      
      loss = []
      for steps, inputs in enumerate(eval_dataloader):
        input_ids = inputs['input_ids'].to('cuda:0')
        attention_mask = inputs['attention_mask'].to('cuda:0')
        labels = inputs['labels'].to('cuda:0')
        with torch.no_grad():
          outputs = model(input_ids=input_ids, labels=labels)
        loss.append(outputs.loss.item())

      loss_mean = np.mean(loss)
      print(f"Epoch {state.epoch} loss_mean:", loss_mean)
      print(f"Epoch {state.epoch} perplexity:", np.exp(loss_mean))
      print("Calculating Perplexity on eval Dataset: END")
        
class PrintLossCallback(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        history = state.log_history
        df = pd.DataFrame(history)
        
        # Filter rows for the last epoch
        last_epoch = df[df['epoch'] == df['epoch'].max()]

        # Select the columns of interest
        losses = last_epoch[['loss', 'eval_loss']]

        eval_loss = losses[['eval_loss']].dropna().iloc[0,0]
        train_loss = losses[['loss']].dropna().iloc[0,0]
        print(f"Epoch {state.epoch} Train loss:", train_loss)
        print(f"Epoch {state.epoch} Validation loss:", eval_loss)
        try:
            train_perplexity = math.exp(train_loss)
        except OverflowError:
            train_perplexity = math.inf
        try:
            eval_perplexity = math.exp(eval_loss)
        except OverflowError:
            eval_perplexity = math.inf
        print(f'Epoch {state.epoch} Train Perplexity:', train_perplexity)
        print(f'Epoch {state.epoch} Validation Perplexity:', eval_perplexity)

In [17]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.add_callback(PrintLossCallback())
trainer.add_callback(AccCallback(trainer))
trainer.add_callback(PerplexityCallback(trainer))

In [18]:
import torch
torch.cuda.is_available()

True

In [19]:
model.to('cuda:0')

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

## Training

In [None]:
import os

train_result = trainer.train()
trainer.save_model()

output_train_file = os.path.join(training_args.output_dir, 'train_results.txt')
if trainer.is_world_process_zero():
    with open(output_train_file, 'w') as writer:
        for key, value in sorted(train_result.metrics.items()):
            writer.write(f'{key} = {value}\n')

    # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
    trainer.state.save_to_json(os.path.join(training_args.output_dir, 'trainer_state.json'))

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,0.2779,2.07517
2,0.0638,2.917077


Calculating Accuracy on eval Dataset: START
Epoch 0.9996942830938551 Exact words match accuracy: 0.0
Epoch 0.9996942830938551 First word match accuracy: 0.036
Calculating Accuracy on eval Dataset: END
Calculating Perplexity on eval Dataset: START
Epoch 0.9996942830938551 loss_mean: 2.708416027545929
Epoch 0.9996942830938551 perplexity: 15.00548840049559
Calculating Perplexity on eval Dataset: END
Epoch 0.9996942830938551 Train loss: 0.2779
Epoch 0.9996942830938551 Validation loss: 2.0751700401306152
Epoch 0.9996942830938551 Train Perplexity: 1.3203541551915854
Epoch 0.9996942830938551 Validation Perplexity: 7.965900866703523
Calculating Accuracy on eval Dataset: START
Epoch 2.0 Exact words match accuracy: 0.0
Epoch 2.0 First word match accuracy: 0.014
Calculating Accuracy on eval Dataset: END
Calculating Perplexity on eval Dataset: START
Epoch 2.0 loss_mean: 3.2316864681243898
Epoch 2.0 perplexity: 25.322326276738725
Calculating Perplexity on eval Dataset: END
Epoch 2.0 Train loss: 0.0