# **Finetuning Bart-Large-CNN Model** 
### Trained on SGH Dataset
Adapted from Source: 
https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb

## 1. Installing packages

In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece
! pip install rouge_score
! pip install huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
import numpy as np
import datasets

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from tabulate import tabulate
import nltk
from datetime import datetime

In [None]:
# Authenticate with the Hugging Face Hub. The training arguments further down
# set push_to_hub=True, so a Hub login is needed before the trainer is created.
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


## 2. Loading data

In [None]:
# Mount Google Drive under /content/drive; the dataset CSV is read from there
# and the fine-tuned model is saved back to it at the end of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the article/summary pairs from the CSV stored on the mounted Drive.
import pandas as pd
dfall = pd.read_csv("/content/drive/MyDrive/SGH Project/SGH_combined100.csv", encoding = 'utf_8')

In [None]:
# Rename the CSV columns to 'document' and 'summary', the keys that
# batch_tokenize_preprocess reads from each batch.
dfall = dfall.rename(columns = {'Article': 'document', 'Summary': 'summary'})

In [None]:
# Keep only the model input ('document') and the target ('summary'); any other
# CSV columns are discarded.
dfall = dfall[['document','summary']]

In [None]:
# Replace the current index with a fresh 0..n-1 index; drop=True discards the
# old index instead of inserting it as a new column.
dfall = dfall.reset_index(drop=True)

In [None]:
# Preview the first five rows to check the two remaining columns.
dfall.head(5)

Unnamed: 0,document,summary
0,​SINGAPORE - A prescription for innovative hea...,A five-year Memorandum Of Understanding (MOU) ...
1,SINGAPORE - Diagnosed with severe asthma four ...,People riddled with severe asthma are set to b...
2,Mention chimeric antigen receptor (CAR) T-cell...,"Dr Francesa Lorraine Lim, Senior Consultant, D..."
3,​SINGAPORE - Wheeled robots carrying cargo are...,SGH is using 13 robots in SingHealth Tower to ...
4,SINGAPORE - A new digital tool to ensure healt...,SGH would deploy within the next two months a ...


In [None]:
from datasets import load_dataset, load_metric, Dataset, DatasetDict

# 90% for train, 10% for validation.
# A fixed seed makes the shuffle deterministic, so the same articles land in the
# validation set (and the reported ROUGE is comparable) across kernel restarts.
dataset_splits = Dataset.from_pandas(dfall).train_test_split(test_size=0.1, seed=42)
# Select the splits by name rather than relying on the order of .values().
train_data_txt = dataset_splits["train"]
validation_data_txt = dataset_splits["test"]

## 3. Data preprocessing

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Maximum token lengths for the articles (encoder input) and the summaries
# (decoder labels); both are passed to batch_tokenize_preprocess.
encoder_max_length = 1024
decoder_max_length = 512

# Padding and truncation are requested on every tokenizer call inside
# batch_tokenize_preprocess, so they are not passed to from_pretrained
# ("encoder_max_length" is not a valid padding strategy in any case).
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

In [None]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of documents and summaries into model features.

    Args:
        batch: mapping with a "document" and a "summary" entry, each a list of texts.
        tokenizer: callable tokenizer exposing ``pad_token_id``.
        max_source_length: length the tokenized documents are padded/truncated to.
        max_target_length: length the tokenized summaries are padded/truncated to.

    Returns:
        A dict holding every field the tokenizer produced for the documents,
        plus "labels": the summary token ids with padding replaced by -100.
    """
    documents = batch["document"]
    summaries = batch["summary"]

    def encode(texts, max_length):
        # Both sides are padded to a fixed length and truncated when too long.
        return tokenizer(
            texts, padding="max_length", truncation="only_first", max_length=max_length
        )

    encoded_documents = encode(documents, max_source_length)
    encoded_summaries = encode(summaries, max_target_length)

    features = dict(encoded_documents.items())

    # -100 marks positions the loss should ignore, so padding does not count.
    pad_id = tokenizer.pad_token_id
    features["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in encoded_summaries["input_ids"]
    ]
    return features


def _tokenize_batch(batch):
    """Tokenize one batch with the notebook's tokenizer and length limits."""
    return batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    )


# Tokenize both splits in batches; the raw text columns are removed so only the
# tokenized features remain.
train_data = train_data_txt.map(
    _tokenize_batch,
    batched=True,
    remove_columns=train_data_txt.column_names,
)

validation_data = validation_data_txt.map(
    _tokenize_batch,
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## 4. Loading model and metrics

In [None]:
# Load the pretrained facebook/bart-large-cnn checkpoint; this is the model
# that is passed to the trainer below and fine-tuned.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

In [None]:
# Borrowed from https://github.com/huggingface/transformers/blob/master/examples/seq2seq/run_summarization.py

# Sentence-tokenizer data needed by nltk.sent_tokenize in postprocess_text.
nltk.download("punkt", quiet=True)

# ROUGE metric object used by compute_metrics.
# NOTE(review): datasets.load_metric is deprecated in newer `datasets` releases
# in favour of the separate `evaluate` package — confirm the installed version
# still provides it.
metric = datasets.load_metric("rouge")


def postprocess_text(preds, labels):
    """Prepare decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace and then rewritten with one
    sentence per line, because rougeLSum expects a newline after each sentence.

    Args:
        preds: list of decoded prediction strings.
        labels: list of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple of lists holding the processed strings.
    """

    def one_sentence_per_line(texts):
        return ["\n".join(nltk.sent_tokenize(text)) for text in texts]

    stripped_preds = [text.strip() for text in preds]
    stripped_labels = [text.strip() for text in labels]

    return one_sentence_per_line(stripped_preds), one_sentence_per_line(stripped_labels)


def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            be a tuple, in which case its first element holds the token ids.

    Returns:
        dict mapping every key returned by ``metric.compute`` to its mid
        F-measure scaled to 0-100, plus "gen_len" (mean number of non-padding
        tokens per prediction). All values are rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 cannot be decoded, so swap it for the pad token before decoding.
    # Labels always carry it (padding was masked for the loss); predictions can
    # carry it too when the evaluation loop pads gathered batches, which would
    # otherwise break batch_decode and inflate gen_len.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Keep the mid F-measure of each ROUGE variant, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Generated length = number of non-padding tokens in each prediction
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

  """


## 5. Model training, evaluation and deployment

In [None]:
# Defining training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="bart_large_summarise_v2",
    num_train_epochs=10,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # learning_rate is not set, so the library default is used.
    # Warm up over the first 10% of the optimisation steps. The previous fixed
    # warmup_steps=500 was longer than the whole run (230 steps on this
    # dataset), so the learning rate never finished ramping up.
    warmup_ratio=0.1,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    # Evaluate with generate() so compute_metrics receives generated summaries.
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=10,
    save_total_limit=3,
    push_to_hub=True
)

# Collates tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Pass training arguments, model, tokenizer, datasets and compute_metrics function to the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

/content/bart_large_summarise_v2 is already a clone of https://huggingface.co/debbiesoon/bart_large_summarise_v2. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
# Fine-tune the model on train_data using the arguments configured above;
# the training loss is logged every `logging_steps` steps.
trainer.train()

***** Running training *****
  Num examples = 90
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 230


Step,Training Loss
10,2.8012
20,2.7467
30,2.5887
40,2.5139
50,2.5287
60,2.3569
70,2.3889
80,2.3745
90,2.1716
100,2.2537




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=230, training_loss=2.1379932569420856, metrics={'train_runtime': 154.7058, 'train_samples_per_second': 5.817, 'train_steps_per_second': 1.487, 'total_flos': 1950394141900800.0, 'train_loss': 2.1379932569420856, 'epoch': 10.0})

In [None]:
# Evaluate on validation_data; the returned dict holds the evaluation loss and
# the values produced by compute_metrics, prefixed with "eval_".
results = trainer.evaluate()

***** Running Evaluation *****
  Num examples = 11
  Batch size = 4


In [None]:
# Show the evaluation loss, ROUGE scores and generated length.
print(results)

{'eval_loss': 2.738941192626953, 'eval_rouge1': 52.9676, 'eval_rouge2': 36.0168, 'eval_rougeL': 39.6093, 'eval_rougeLsum': 48.2053, 'eval_gen_len': 137.9091, 'eval_runtime': 9.7588, 'eval_samples_per_second': 1.127, 'eval_steps_per_second': 0.307, 'epoch': 10.0}


In [None]:
# Upload the fine-tuned model and tokenizer files to the Hugging Face Hub
# (requires the notebook_login() performed at the top of the notebook).
trainer.push_to_hub()

Saving model checkpoint to bart_large_summarise
Configuration saved in bart_large_summarise/config.json
Model weights saved in bart_large_summarise/pytorch_model.bin
tokenizer config file saved in bart_large_summarise/tokenizer_config.json
Special tokens file saved in bart_large_summarise/special_tokens_map.json


Upload file pytorch_model.bin:   0%|          | 3.33k/1.51G [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/debbiesoon/bart_large_summarise
   242906c..fff3ab1  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/debbiesoon/bart_large_summarise
   242906c..fff3ab1  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'metrics': [{'name': 'Rouge1', 'type': 'rouge', 'value': 52.9676}]}
To https://huggingface.co/debbiesoon/bart_large_summarise
   fff3ab1..263e39f  main -> main

   fff3ab1..263e39f  main -> main



'https://huggingface.co/debbiesoon/bart_large_summarise/commit/fff3ab1ccb07a7c565e25948e16d35b8b635e345'

In [None]:
# Save model to Google Drive
# NOTE(review): this directory is named bart_large_summarise while the training
# output_dir is "bart_large_summarise_v2" — confirm the target is intended.
trainer.save_model('/content/drive/MyDrive/SGH Project/bart_large_summarise/')

Saving model checkpoint to /content/drive/MyDrive/SGH Project/bart_large_summarise/
Configuration saved in /content/drive/MyDrive/SGH Project/bart_large_summarise/config.json
Model weights saved in /content/drive/MyDrive/SGH Project/bart_large_summarise/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/SGH Project/bart_large_summarise/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/SGH Project/bart_large_summarise/special_tokens_map.json
Saving model checkpoint to bart_large_summarise
Configuration saved in bart_large_summarise/config.json
Model weights saved in bart_large_summarise/pytorch_model.bin
tokenizer config file saved in bart_large_summarise/tokenizer_config.json
Special tokens file saved in bart_large_summarise/special_tokens_map.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'metrics': [{'name': 'Rouge