## This notebook enables fine-tuning of the T5 model using your own .csv files
Other than the conversion that enables .csv documents to be read, this notebook leans heavily on Venelin Valkov's excellent tutorial on YouTube: https://www.youtube.com/watch?v=r6XY80Z9eSA

As a true text to text transformer, this model expects the following .csv file column headers and data:
- 'context' - This is the text which you are trying to extract the answer from
- 'question' - This is the question being asked
- 'answer_text' - This is the answer to the question. NOTE, the answer does not have to be in the 'context', as this is a true language model using both the Encoder and Decoder of the Transformer

Note: this model becomes extremely accurate on your own data once it has been fine-tuned. By the same token, it loses generality very quickly if you are not careful.

## Train

In [1]:
# Load libraries
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset, load_metric
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
from pytorch_lightning import Trainer, LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint
from pdfminer.high_level import extract_text

In [2]:
from transformers import(AdamW, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup)

In [3]:
# Load the question/context/answer training data; swap the path for your own CSV.
df = pd.read_csv('DEMO 6D master_clauses_clean_updated2503.csv')

In [4]:
# Drop the columns the model does not need, discard incomplete rows and
# normalise the column names to 'question' / 'context' / 'answer_text'.
# errors='ignore' keeps this cell safe to re-run and lets it accept CSVs that
# do not carry the demo file's extra 'Unnamed: 0' / 'Filename' columns.
df = (
    df.drop(columns=['Unnamed: 0', 'Filename'], errors='ignore')
    .dropna()
    .rename(columns={'Question': 'question', 'Context': 'context', 'Answer': 'answer_text'})
)

In [5]:
# Name of the pre-trained T5 checkpoint to fine-tune, in this case t5-base
MODEL_NAME = 't5-base'

In [6]:
# Load the T5 tokenizer that belongs to MODEL_NAME
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [7]:
class QADataset(Dataset):
    """Torch dataset that tokenizes question/context/answer rows.

    Each item is built from one row of ``data``, which must provide the
    columns 'question', 'context' and 'answer_text'.

    Args:
        data: Frame holding one example per row.
        tokenizer: Tokenizer used for both the source and the target text.
        source_max_token_len: Length the question+context encoding is padded
            or truncated to.
        target_max_token_len: Length the answer encoding is padded or
            truncated to.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        source_max_token_len: int = 1000,
        target_max_token_len: int = 1000
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        """Return the number of rows (examples) in the dataset."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Tokenize the row at position ``index``.

        Returns:
            dict with the raw 'question', 'context' and 'answer_text' values
            plus 1-D 'input_ids', 'attention_mask' and 'labels' tensors.
        """
        data_row = self.data.iloc[index]

        # Use the tokenizer handed to this dataset rather than whatever
        # `tokenizer` happens to be bound at notebook level.
        source_encoding = self.tokenizer(
            data_row['question'],
            data_row['context'],
            max_length=self.source_max_token_len,
            padding='max_length',
            # Only the context (second sequence) may be shortened.
            truncation='only_second',
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        target_encoding = self.tokenizer(
            data_row['answer_text'],
            max_length=self.target_max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        # Replace padding positions in the target with -100, the label value
        # the Hugging Face T5 loss ignores. The pad id is read from the
        # tokenizer (0 for T5) instead of being hard-coded.
        labels = target_encoding['input_ids']
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            question=data_row['question'],
            context=data_row['context'],
            answer_text=data_row['answer_text'],
            input_ids=source_encoding['input_ids'].flatten(),
            attention_mask=source_encoding['attention_mask'].flatten(),
            labels=labels.flatten()
        )

In [8]:
# Hold out 5% of the rows for validation. A fixed random_state keeps the split
# (and therefore any later positional lookup into val_df) reproducible across
# kernel restarts.
train_df, val_df = train_test_split(df, test_size=0.05, random_state=42)
print(train_df.shape, val_df.shape)

(6213, 3) (327, 3)


In [9]:
class QADataModule(pl.LightningDataModule):
    """Lightning data module that wraps train/held-out frames in QADataset.

    Args:
        train_df: Rows used for training.
        test_df: Held-out rows; served by both the validation and the test
            dataloader.
        tokenizer: Tokenizer forwarded to every QADataset.
        batch_size: Batch size of the training dataloader.
        source_max_token_len: Forwarded to QADataset.
        target_max_token_len: Forwarded to QADataset.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        source_max_token_len: int = 1000,
        target_max_token_len: int = 1000
    ):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def setup(self, stage=None):
        """Build the train and held-out datasets.

        ``stage`` is accepted (and ignored) so the hook matches the signature
        Lightning uses when it calls ``setup`` itself; calling ``setup()``
        with no argument keeps working.
        """
        self.train_dataset = QADataset(
            self.train_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )
        self.test_dataset = QADataset(
            self.test_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size = self.batch_size,
            shuffle=True,
            num_workers = 4
        )

    def val_dataloader(self):
        """Loader over the held-out dataset, one example per batch."""
        return DataLoader(
            self.test_dataset,
            batch_size = 1,
            num_workers = 4
        )

    def test_dataloader(self):
        """Loader over the same held-out dataset used for validation."""
        return DataLoader(
            self.test_dataset,
            batch_size = 1,
            num_workers = 4
        )

In [10]:
# Training hyperparameters.
N_EPOCHS = 6
BATCH_SIZE = 1

# Wrap the two frames in a data module and build its datasets right away.
data_module = QADataModule(
    train_df=train_df,
    test_df=val_df,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
)
data_module.setup()

In [11]:
# The bare T5 model that used to be loaded in this cell was never used:
# `model` is rebound to a QAModel (which loads its own T5 weights) before
# training starts, so the extra load only cost time and memory.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [12]:
##Run this one
class QAModel(pl.LightningModule):
    """LightningModule that fine-tunes the pre-trained T5 model MODEL_NAME."""

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped T5 model and return ``(loss, logits)``."""
        output = self.model(
            input_ids = input_ids,
            attention_mask = attention_mask,
            labels=labels
        )

        return output.loss, output.logits

    def _shared_step(self, batch, metric_name):
        """Compute the loss for one batch and log it under ``metric_name``."""
        loss, _ = self(batch['input_ids'], batch['attention_mask'], batch['labels'])
        self.log(metric_name, loss,  logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss, logged as 'train_loss'."""
        return self._shared_step(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        """Return the validation loss, logged as 'val_loss'."""
        return self._shared_step(batch, 'val_loss')

    def test_step(self, batch, batch_idx):
        """Return the test loss, logged as 'test_loss'."""
        return self._shared_step(batch, 'test_loss')

    def configure_optimizers(self):
        """Optimise all parameters with AdamW at a fixed learning rate."""
        return AdamW(self.parameters(), lr=0.0001)

In [13]:
# Instantiate the Lightning wrapper that will be fine-tuned.
model = QAModel()

In [14]:
# Keep only the single best checkpoint, judged by the lowest logged 'val_loss'.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    dirpath='checkpoints',
    filename='best-checkpoint',
    verbose=True,
)

In [15]:
# Register the checkpoint callback through `callbacks`; handing a
# ModelCheckpoint instance to `checkpoint_callback=` is deprecated in Lightning.
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=N_EPOCHS,
    gpus=1,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [16]:
#%load_ext tensorboard

In [17]:
#%tensorboard --logdir ./lightning_logs

In [18]:
# Fine-tune the model using the loaders provided by the data module.
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

RuntimeError: CUDA out of memory. Tried to allocate 46.00 MiB (GPU 0; 10.76 GiB total capacity; 8.37 GiB already allocated; 126.12 MiB free; 8.68 GiB reserved in total by PyTorch)

In [None]:
# Evaluate on the test loader.
# NOTE(review): called without arguments, so this presumably relies on the
# trainer still holding the model and data module from fit() — confirm for the
# installed Lightning version.
trainer.test()

In [None]:
# Reload the saved checkpoint and freeze it for inference.
# NOTE(review): the path is hard-coded to mirror the dirpath/filename given to
# the ModelCheckpoint callback; keep the two in sync.
trained_model = QAModel.load_from_checkpoint('checkpoints/best-checkpoint.ckpt')
trained_model.freeze()

## Predict

In [None]:
from transformers import(AdamW, T5ForConditionalGeneration, T5TokenizerFast as T5Tokenizer)

In [None]:
def generate_answer(question):
    """Generate an answer for one question/context pair.

    Relies on the notebook-level ``tokenizer`` and ``trained_model``.

    Args:
        question: Mapping-like row (for example a row of ``val_df``) with
            entries under 'question' and 'context'.

    Returns:
        The decoded generations joined into a single string.
    """
    source_encoding = tokenizer(
        question['question'],
        question['context'],
        max_length = 1000,
        padding='max_length',
        # Only the context (second sequence) may be shortened.
        truncation='only_second',
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    generate_ids = trained_model.model.generate(
        input_ids=source_encoding['input_ids'],
        attention_mask=source_encoding['attention_mask'],
        num_beams=1,
        max_length=1000,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True
    )

    preds = [
        tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for generated_id in generate_ids
    ]

    return "".join(preds)

In [None]:
# Pick one held-out row (by position in val_df) to try the model on.
sample_question = val_df.iloc[200]

In [None]:
# Context passage of the sample row.
sample_question['context']

In [None]:
# Question asked about that passage.
sample_question['question']

In [None]:
# Reference answer stored in the dataset.
sample_question['answer_text']

In [None]:
# Answer generated by the fine-tuned model, for comparison with the reference.
generate_answer(sample_question)