### Fine Tuning on SGH News Data for Abstractive Text Summarization using T5

##### 1. Install Libraries

In [None]:
!pip install transformers 
!pip install SentencePiece 
!nvidia-smi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 14.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 51.8 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 69.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SentencePiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K 

In [None]:
# Importing required libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration


In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


##### 2. Create custom dataset for the dataloader

In [None]:
# Defining classes to read/load the dataframe and pass it to a neural network for model finetuning

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes (Article, Summary) pairs on access.

    Args:
        dataframe: table exposing ``Article`` and ``Summary`` columns
            (a pandas DataFrame in this notebook).
        tokenizer: tokenizer exposing ``batch_encode_plus``.
        source_len: length every encoded article is padded/truncated to.
        summ_len: length every encoded summary is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.Summary = self.data.Summary
        self.Article = self.data.Article

    def __len__(self):
        """Return the number of (Article, Summary) rows."""
        return len(self.Summary)

    def _encode(self, text, max_length):
        """Collapse whitespace runs in ``text`` and encode it to exactly ``max_length`` tokens."""
        normalized = ' '.join(str(text).split())
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        return self.tokenizer.batch_encode_plus(
            [normalized],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        """Return the encoded tensors for the row at *position* ``index``.

        Returns:
            dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_ids_y`` (the latter two hold the same summary token ids),
            all as ``torch.long`` tensors.
        """
        # Positional lookup: a DataLoader hands out positions 0..len-1, which only
        # coincide with index labels when the frame's index has been reset.
        source = self._encode(self.Article.iloc[index], self.source_len)
        target = self._encode(self.Summary.iloc[index], self.summ_len)

        # Drop only the leading batch axis of size 1 added by batch_encode_plus.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

##### 3. Defining the training and validation functions 

In [None]:
# Training routine, called once per epoch by the main training loop.
# It puts the model in train mode and runs one optimisation step per batch of the loader.

def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one pass over ``loader``, updating ``model`` with ``optimizer``.

    Args:
        epoch: epoch number, used only in the progress message.
        tokenizer: supplies ``pad_token_id`` so padding can be masked out of the loss.
        model: seq2seq model called with ``input_ids``, ``attention_mask`` and ``labels``;
            the first element of its output is taken as the loss.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``source_ids``, ``source_mask`` and ``target_ids``.
        optimizer: optimizer stepping the model parameters.
    """
    model.train()
    for step, data in enumerate(loader):
        ids = data['source_ids'].to(device, dtype=torch.long)
        mask = data['source_mask'].to(device, dtype=torch.long)

        # Use the full target sequence as labels. When only `labels` is given, T5
        # builds the decoder inputs itself by shifting the labels right behind its
        # start token. Shifting by hand (y[:, :-1] / y[:, 1:]) drops the first
        # summary token from the loss and never trains the decoder from the start
        # token that generate() begins with.
        # clone() keeps the batch tensor untouched, since .to() may return the same object.
        lm_labels = data['target_ids'].to(device, dtype=torch.long).clone()
        # -100 marks padding positions so they are ignored by the loss.
        lm_labels[lm_labels == tokenizer.pad_token_id] = -100

        outputs = model(input_ids=ids, attention_mask=mask, labels=lm_labels)
        loss = outputs[0]

        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


In [None]:
def validate(epoch, tokenizer, model, device, loader):
    """Generate summaries for every batch in ``loader`` and decode them to text.

    Args:
        epoch: accepted alongside the other arguments; not used inside the function.
        tokenizer: supplies ``decode`` for turning token ids back into strings.
        model: model exposing ``eval`` and ``generate``.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``source_ids``, ``source_mask`` and ``target_ids``.

    Returns:
        (predictions, actuals): two lists of strings, generated and reference
        summaries, in loader order.
    """
    def _to_text(token_ids):
        # Same decode settings for generated and reference sequences.
        return tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            reference_ids = batch['target_ids'].to(device, dtype=torch.long)
            input_ids = batch['source_ids'].to(device, dtype=torch.long)
            attention_mask = batch['source_mask'].to(device, dtype=torch.long)

            # Beam search with 2 beams, capped at 150 generated tokens.
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )
            batch_preds = list(map(_to_text, generated_ids))
            batch_refs = list(map(_to_text, reference_ids))

            # Progress message every 100 batches.
            if batch_idx % 100 == 0:
                print(f'Completed {batch_idx}')

            predictions += batch_preds
            actuals += batch_refs
    return predictions, actuals

##### 4. Setting the training parameters

In [None]:
# Hyperparameters and sequence-length limits shared by the cells below
MAX_LEN = 512            # token length each source article is padded/truncated to
SUMMARY_LEN = 150        # token length each target summary is padded/truncated to
TRAIN_BATCH_SIZE = 8     # batch size of the training DataLoader
VALID_BATCH_SIZE = 8     # batch size of the validation DataLoader
TRAIN_EPOCHS = 2         # number of passes over the training data
VAL_EPOCHS = 1           # number of passes of the evaluation loop
LEARNING_RATE = 1e-4     # learning rate handed to the optimizer (0.0001)
SEED = 123               # seed for the random number generators and the train/validation split

In [None]:
# Seed the random number generators and request deterministic cuDNN behaviour for reproducibility
torch.manual_seed(SEED) # pytorch random seed
np.random.seed(SEED) # numpy random seed
torch.backends.cudnn.deterministic = True
# Also switch off cuDNN auto-tuning: in benchmark mode cuDNN may select different
# algorithms from run to run, which undermines the deterministic flag above.
torch.backends.cudnn.benchmark = False

# tokenizer for encoding the text
tokenizer = T5Tokenizer.from_pretrained("t5-base")

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive/.
# NOTE(review): later cells use relative 'drive/My Drive/...' paths, which presumably
# rely on the working directory being /content — confirm when running elsewhere.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


##### 5. Load dataset

In [None]:
# Read the media-report spreadsheet from the mounted Drive folder and preview its first three rows
df = pd.read_excel(
    'drive/My Drive/Colab_Notebooks/PLP_Project/MediaReportExcel.xlsx'
)
df.head(3)

Unnamed: 0,Article,Summary,Title,Url
0,​SINGAPORE - A prescription for innovative hea...,A five-year Memorandum Of Understanding (MOU) ...,"SingHealth, SIT sign agreement to team up to p...",https://www.singhealth.com.sg/news/tomorrows-m...
1,SINGAPORE - Diagnosed with severe asthma four ...,People riddled with severe asthma are set to b...,Data registry set up to help Singapore patient...,https://www.singhealth.com.sg/news/tomorrows-m...
2,Mention chimeric antigen receptor (CAR) T-cell...,"Dr Francesa Lorraine Lim, Senior Consultant, D...",SGH doctor on types of blood cancers and CAR-T...,https://www.singhealth.com.sg/news/singapore-h...


In [None]:
# 'Article' is the original text (source), and 'Summary' is the summary (target).
# .copy() makes an independent frame, so the assignment below never writes into a
# slice of the full sheet.
df = df[['Summary', 'Article']].copy()
# Add the task prefix "summarize: " to the input. Only rows that do not already
# carry it are touched, so re-running this cell does not stack prefixes.
missing_prefix = ~df['Article'].astype(str).str.startswith('summarize: ')
df.loc[missing_prefix, 'Article'] = 'summarize: ' + df.loc[missing_prefix, 'Article']

In [None]:
# Split the data: a seeded random 80% sample is used for training and the
# remaining rows for validation (testing). Both frames get a fresh 0..n-1 index.
train_size = 0.8
sampled_rows = df.sample(frac=train_size, random_state=SEED)
val_dataset = df.drop(index=sampled_rows.index).reset_index(drop=True)
train_dataset = sampled_rows.reset_index(drop=True)

# Report the resulting shapes
print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))

FULL Dataset: (101, 2)
TRAIN Dataset: (81, 2)
TEST Dataset: (20, 2)


In [None]:
# Wrap the training and validation frames in CustomDataset so a DataLoader can batch them
training_set = CustomDataset(
    dataframe=train_dataset,
    tokenizer=tokenizer,
    source_len=MAX_LEN,
    summ_len=SUMMARY_LEN,
)
val_set = CustomDataset(
    dataframe=val_dataset,
    tokenizer=tokenizer,
    source_len=MAX_LEN,
    summ_len=SUMMARY_LEN,
)

In [None]:
# DataLoader settings: training batches are shuffled, validation batches keep their order
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
val_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

# DataLoaders for training and validation, used by the train and validate stages below
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

##### 6. Load and fine-tune T5 model on training data

In [None]:
# Load the pretrained t5-base checkpoint with its generation head and move it
# to the selected device.
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

# Adam optimizer over all model parameters, used to update the weights during fine-tuning
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
# Fine-tune for TRAIN_EPOCHS passes over the training loader, then save the weights
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(TRAIN_EPOCHS):
    train(
        epoch=epoch,
        tokenizer=tokenizer,
        model=model,
        device=device,
        loader=training_loader,
        optimizer=optimizer,
    )

# Export the model into a separate folder for subsequent deployment as a Streamlit app
model.save_pretrained(
    "drive/My Drive/Colab_Notebooks/PLP_Project/SGH_FinetunedT5Model"
)

Initiating Fine-Tuning for the model on our dataset




Epoch: 0, Loss:  8.157137870788574
Epoch: 1, Loss:  2.333050012588501


##### 7. Test and evaluate finetuned model on SGH News Reports

In [None]:
# Test finetuned model and save resulting file with predictions and actuals in a dataframe.
# Using the Recall-Oriented Understudy for Gisting Evaluation (ROUGE) library to evaluate finetuned T5 text summarization model.
# Saving the dataframe as SGHNews_predictions.csv

!pip install rouge
from rouge import Rouge
rouge = Rouge()

print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    rouge_scores = rouge.get_scores(predictions, actuals, avg=True)
    loss_result = model.eval_model(val_loader)
    final_df.to_csv('drive/My Drive/Colab_Notebooks/PLP_Project/SGHNews_predictions.csv')
    print('Output Files generated for review')
    print('ROUGE Score:', rouge_scores)
    print(loss_result)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe




Completed 0


AttributeError: ignored

In [None]:
# Print one line per ROUGE variant. The loop variable is deliberately not named
# `rouge`: that name holds the Rouge() scorer, and rebinding it to a string
# would break any later call to rouge.get_scores.
for metric_name, scores in rouge_scores.items():
    print('{} {}'.format(metric_name, scores))

# Check final dataframe
final_df.head(3)

rouge-1 {'r': 0.4010367974397024, 'p': 0.4445771854797381, 'f': 0.41559732773692853}
rouge-2 {'r': 0.25107926067922126, 'p': 0.28943749493649806, 'f': 0.26310721584631824}
rouge-l {'r': 0.3832424559037979, 'p': 0.4247891317058996, 'f': 0.3969730916466443}
