In [64]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import transformers
from torch.optim import Adam
from tqdm import tqdm

In [65]:
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast

In [66]:
from utils import FinQA_Dataset

### Declarations

In [67]:
BATCH_SIZE = 4
NUM_EPOCHS = 2
TESTING = True

In [68]:
TOKENIZER = T5Tokenizer.from_pretrained("t5-base")
MODEL = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)
OPTIMIZER = Adam(MODEL.parameters(), lr=0.0001)


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


### Read Data and create DataLoader

In [69]:
finqa_train = torch.load('../finqa_dataset/finqa_train.pth')
finqa_valid = torch.load('../finqa_dataset/finqa_valid.pth')
# torch.save(finqa_test, '../finqa_dataset/finqa_test.pth')

In [70]:
if TESTING:
    train_loader = DataLoader(Subset(finqa_train, range(100)), batch_size=BATCH_SIZE)
    val_loader = DataLoader(Subset(finqa_valid, range(10)), batch_size=BATCH_SIZE)
else:
    train_loader = DataLoader(finqa_train, batch_size=BATCH_SIZE)
    val_loader = DataLoader(finqa_valid, batch_size=BATCH_SIZE)

### Train Loop

In [71]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cpu')

In [72]:
train_loss = 0
val_loss = 0
b=0
train_hist=[]
val_hist = []
num_batches = len(train_loader)
print_every=num_batches//10

for epoch in range(NUM_EPOCHS):
    MODEL.train()
    for batch in tqdm(train_loader, desc="Training batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)
        outputs = MODEL(input_ids=input_ids,attention_mask=attention_mask,
                          labels=labels,decoder_attention_mask=decoder_attention_mask)
        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        train_loss += outputs.loss.item()
        b+=1
        if b%(print_every)==0:
            print(f"Batch {b+1}/{num_batches} -> Train loss: {train_loss/b:.4f}")
    train_hist.append(train_loss / len(train_loader))
    #Evaluation
    MODEL.eval()
    for batch in tqdm(val_loader, desc="Validation batches"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)
        with torch.no_grad():
            outputs = MODEL(input_ids=input_ids, attention_mask=attention_mask,
                          labels=labels, decoder_attention_mask=decoder_attention_mask)
        val_loss += outputs.loss.item()
    val_hist.append(val_loss / len(val_loader))
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} -> Train loss: {train_hist[-1]:.4f}\tValidation loss: {val_hist[-1]:.4f}")

Training batches:   0%|          | 0/25 [00:00<?, ?it/s]

Training batches:   8%|▊         | 2/25 [01:16<14:42, 38.36s/it]

Batch 3/25 -> Train loss: 6.2316


Training batches:  16%|█▌        | 4/25 [02:33<13:25, 38.37s/it]

Batch 5/25 -> Train loss: 4.2467


Training batches:  24%|██▍       | 6/25 [03:51<12:18, 38.89s/it]

Batch 7/25 -> Train loss: 3.2732


Training batches:  28%|██▊       | 7/25 [04:32<11:54, 39.68s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training batches:  32%|███▏      | 8/25 [05:14<11:22, 40.14s/it]

Batch 9/25 -> Train loss: 2.8531


Training batches:  40%|████      | 10/25 [06:37<10:18, 41.25s/it]

Batch 11/25 -> Train loss: 2.5550


Training batches:  48%|████▊     | 12/25 [08:06<09:17, 42.89s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Batch 13/25 -> Train loss: 2.3315


Training batches:  56%|█████▌    | 14/25 [09:35<08:03, 43.91s/it]

Batch 15/25 -> Train loss: 2.2208


Training batches:  64%|██████▍   | 16/25 [11:16<07:10, 47.87s/it]

Batch 17/25 -> Train loss: 2.0881


Training batches:  72%|███████▏  | 18/25 [13:02<05:51, 50.17s/it]

Batch 19/25 -> Train loss: 1.9609


Training batches:  80%|████████  | 20/25 [14:36<04:02, 48.52s/it]

Batch 21/25 -> Train loss: 1.8866


Training batches:  88%|████████▊ | 22/25 [16:03<02:19, 46.38s/it]

Batch 23/25 -> Train loss: 1.7716


Training batches:  96%|█████████▌| 24/25 [17:46<00:48, 48.87s/it]

Batch 25/25 -> Train loss: 1.6757


Training batches: 100%|██████████| 25/25 [18:35<00:00, 44.63s/it]
Validation batches: 100%|██████████| 3/3 [00:22<00:00,  7.41s/it]


Epoch 1/2 -> Train loss: 1.6383	Validation loss: 0.4996


Training batches:   8%|▊         | 2/25 [01:31<17:29, 45.61s/it]

Batch 3/25 -> Train loss: 21.3276


Training batches:  12%|█▏        | 3/25 [02:20<17:12, 46.95s/it]

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2,)
axes[0].plot(train_hist)
axes[0].set_title('Training Loss')

axes[1].plot(val_hist)
axes[1].set_title('Validation Loss')

In [None]:
MODEL.save_pretrained("./finqa_finetune_t5.pth")