In [None]:
!pip install datasets
! pip install -U accelerate
! pip install -U transformers

In [None]:
from datasets import Dataset
import datasets
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
import json


Task 1 Part 1: The dataset I used has text data from BBC UK News. As my input, I used the full article and as my output, I used the summary of the article. The idea is to use the LLM for summarization.
Link to dataset: https://data.world/opensnippets/bbc-uk-news-dataset

In [None]:
with open('bbc_news_list_uk.json') as f:
  data = json.load(f)
df = pd.DataFrame(data)

In [None]:
print(df.head)

<bound method NDFrame.head of                                                     tags  \
0                                                          
1      North Korea, Kim Jong-un, Kim Jong-il, Kim Jon...   
2                                                          
3                                                          
4                                                          
...                                                  ...   
15820                                                      
15821                                                      
15822                                                      
15823                                                      
15824                                                      

                                                   title  \
0      Why quitting heroin substitute methadone is 'v...   
1      North Korea leader's eldest son 'opposes dynasty'   
2      Paintings by gangster Reggie Kray go on sale i...   
3       B

In [None]:
new_df = df.filter(['content','short_description'])

In [None]:
print(new_df.head)

In [None]:
dataset = Dataset.from_pandas(new_df)
dataset = dataset.train_test_split(test_size = .10)

Task 1 Part 2: Loading model and tokenizer below.

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")
tokenizer = tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
prefix = "summarize: "


def preprocessor(dataset):
  max_input_length = 1024
  inputs = [prefix + text for text in dataset["content"]]
 # print(len(inputs))

  model_inputs = tokenizer(inputs, max_length = max_input_length,truncation = True)
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(dataset["short_description"], max_length = max_input_length)
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocessor,batched = True)

Map:   0%|          | 0/13451 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Map:   0%|          | 0/2374 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
!pip install accelerate>=0.20.3 transformers
!pip install evaluate
!pip install rouge-score

In [None]:
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Finetune and training below.

In [None]:
from transformers import BartForConditionalGeneration, Trainer, TrainingArguments

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")

training_args = Seq2SeqTrainingArguments(
    output_dir="model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=False,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.9746,0.996865,0.6465,0.5254,0.6176,0.6175,19.9014
2,0.7439,0.927171,0.6516,0.5291,0.6221,0.6216,19.8652
3,0.5953,0.943195,0.6533,0.5293,0.6231,0.6228,19.8943
4,0.4824,0.965913,0.6514,0.5278,0.6214,0.6209,19.9082


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams

TrainOutput(global_step=13452, training_loss=0.7135493493739459, metrics={'train_runtime': 20914.3212, 'train_samples_per_second': 2.573, 'train_steps_per_second': 0.643, 'total_flos': 7.107115834871808e+16, 'train_loss': 0.7135493493739459, 'epoch': 4.0})

Task 1 Part 3: I was only able to find out how to calculate one metric at a time and could not run this code again due to the cost of Compute Units. My Rouge 1 and Rouge 2 scores increased over 4 epoches which is a good sign. This is because this means there is similarity between the outputs and what the model generated.
Rouge 1 Epoch 1: 0.646500
Rouge 1 Epoch 2: 0.651600
Rouge 1 Epoch 3: 0.653300
Rouge 1 Epoch 4: 0.651400. My loss also decreased while training.


In [None]:
trainer.push_to_hub()

In [None]:
import torch

In [None]:
prefix = 'summarize: '
end = 0
for j in dataset['train']['content']:
  inputs = tokenizer(prefix+ j,return_tensors = 'pt').to(torch.device('cuda:0'))

  outputs = model.generate(**inputs)
  print(tokenizer.batch_decode(outputs,skip_special_tokens=True))
  end+= 1
  if end == 3:
    break


end = 0
for j in dataset['train']['short_description']:
  print(j)
  end+=1
  if end == 3:
    break




['EDF Energy faces a £2m penalty over its handling of a three-day']
['A paramedic denies murdering his three-month-old son who died from a brain']
['The US will be able to scrutinise European bank transactions again next month after Euro MPs']
EDF Energy may have to pay a £2m penalty over its handling of a three-day power cut which affected 94,000 Kent and London homes.
A paramedic denies murdering his son, who died from a brain injury, and harming the baby's twin sister.
The EU has agreed to let US anti-terror investigators see European bank data - so what does it mean for EU citizens?


Task 1 Part 4: One thing I had to experiment with in terms of hyper parameters is smaller batch sizes. This is due to limitations in RAM and this introduces a common problem when it comes to training LLMs. Introducing weight decay is also important so that your model doesn't overfit to your training data. I believe that the choice of LLM plays a large part in the results. This is because BART is limited by how many tokens it can take in. It also has far less parameters than other popular models of today. Result wise, the model had some problems completing some full summaries. This is due to the limits of the RAM as well as the max tokens of the model. Overall, the model did a decent job of summarizing given the limitation.