## W&B Logging (Kaggle)

In [1]:
# from kaggle_secrets import UserSecretsClient
# import wandb

# user_secrets = UserSecretsClient()

# my_secret = user_secrets.get_secret("wandb_api") 

# wandb.login(key=my_secret)

## Libraries

In [2]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch
from datasets import load_dataset

## Dataset

In [3]:
class PegasusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels['input_ids'])  # len(self.labels)

In [4]:
def prepare_data(model_name, 
                 train_texts, train_labels, 
                 val_texts=None, val_labels=None, 
                 test_texts=None, test_labels=None):
    """
    Prepare input data for model fine-tuning
    """
    tokenizer = PegasusTokenizer.from_pretrained(model_name)

    prepare_val = False if val_texts is None or val_labels is None else True
    prepare_test = False if test_texts is None or test_labels is None else True

    def tokenize_data(texts, labels):
        encodings = tokenizer(texts, truncation=True, padding=True)
        decodings = tokenizer(labels, truncation=True, padding=True)
        dataset_tokenized = PegasusDataset(encodings, decodings)
        return dataset_tokenized

    train_dataset = tokenize_data(train_texts, train_labels)
    val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
    test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

    return train_dataset, val_dataset, test_dataset, tokenizer

## Finetune

In [5]:
def fine_tune(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
    """
    Prepare config and base model for fine-tuning
    """
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    if freeze_encoder:
        for param in model.model.encoder.parameters():
            param.requires_grad = False

    training_args = TrainingArguments(
        output_dir=output_dir,           # output directory
        num_train_epochs=1,              # total number of training epochs
        per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
        save_steps=500,                  # number of updates steps before checkpoint saves
        save_total_limit=3,              # limit the total amount of checkpoints and deletes the older checkpoints
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=200,
    )

    trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        tokenizer=tokenizer
    )

    return trainer

In [6]:
# Load TTE Dataset
ds_tte = load_dataset("nickmuchi/trade-the-event-finance")
# Use first 1k data points
train_texts, train_labels = ds_tte['train']['text'][:1000], ds_tte['train']['title'][:1000]

# Fine Tune F-Sum Pegasus Large
model_name = 'human-centered-summarization/financial-summarization-pegasus'
train, __, __, tokenizer = prepare_data(model_name, train_texts, train_labels)
trainer = fine_tune(model_name, tokenizer, train)

trainer.train()

Downloading:   0%|          | 0.00/868 [00:00<?, ?B/s]

Downloading and preparing dataset json/default (download: 785.12 MiB, generated: 1.40 GiB, post-processed: Unknown size, total: 2.17 GiB) to /root/.cache/huggingface/datasets/parquet/nickmuchi--trade-the-event-finance-6bcad652637af166/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/124M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/233M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/nickmuchi--trade-the-event-finance-6bcad652637af166/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

***** Running training *****
  Num examples = 1000
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 1000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mmaze508[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
200,1.8477
400,0.3097
600,0.2845
800,0.2563
1000,0.2462


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1000, training_loss=0.5888946800231933, metrics={'train_runtime': 521.1961, 'train_samples_per_second': 1.919, 'train_steps_per_second': 1.919, 'total_flos': 1444732207104000.0, 'train_loss': 0.5888946800231933, 'epoch': 1.0})