## Installing dependencies

In [89]:
!pip install datasets evaluate rouge_score -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [33]:
import wandb
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
my_secret = user_secrets.get_secret("wandb_api_key") 
wandb.login(key=my_secret)



True

# Importing necessary tools and packages

In [34]:
import re
import nltk
from nltk.corpus import stopwords
import random
import gc
import torch
import pandas as pd
import evaluate
import numpy as np
import seaborn as sns
from nltk.tokenize import sent_tokenize
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, concatenate_datasets
from evaluate import load
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer


import nltk
nltk.download('punkt')
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Reading data and preprocessing

In [35]:
df = pd.read_csv('/kaggle/input/cnn-news-dataset/cnn_dataset.csv')

In [36]:
df.head()

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [37]:
df = df.rename(columns={'highlights': 'summary'})

In [38]:
dataset = Dataset.from_pandas(df)

In [39]:
dataset

Dataset({
    features: ['id', 'article', 'summary'],
    num_rows: 11490
})

In [40]:
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

In [41]:
test_valid_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

In [42]:
# Combine the splits into a single DatasetDict
dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': test_valid_split['train'],
    'test': test_valid_split['test']
})

## Taking a look at any 5 pair of articles and their summaries

In [43]:
for _ in range(5):
    idx = random.randint(0, 9191)
    sample = dataset["train"][idx]

    print("")
    print("="*100)
    print(sample["article"])
    print("-"*100)
    print(sample["summary"])
    print("")


Gone are the days when Eileen Dover was considered an unfortunate name, with celebrity culture booming like never before, sharing a name with a star can be pretty unlucky, too. Something that New York resident Beyoncé knows about all too well. No, not the legendary singer but another woman who says having the same name as the wife of Jay-Z has caused her grief her entire life. Humans of New York photographer shared this image of a NY resident whose name is Beyoncé. The photo was accompanied by a statement from her describing the inconvenience of sharing a celebrity name . In a post that has now gone viral on Facebook, Beyonce says: 'Sometimes I hate my name because it always draws attention to me, and I'm not a very social person. 'My family moved this year from Pennsylvania. I was so scared the first day of school that someone would notice me. I wouldn't even adjust my seat because I thought it would make a noise. One time I really had to cough, but I held it in. This was followed wi

## Loading tokenizer for T5 model

In [44]:
model_checkpoint = "t5-small"

In [45]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [46]:
tokenizer("Good morning, everyone!")

{'input_ids': [1804, 1379, 6, 921, 55, 1], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [47]:
max_dialogue_length = 1024
max_summary_length = 128
prefix = "summarize: "

## Preprocess function to tokenize input data as well as labels

In [48]:
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["article"],
        max_length=max_dialogue_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["summary"], max_length=max_summary_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [49]:
dataset["train"][0]

{'id': 'c6dcbe7607940f308e40689944c3d0920e22eb6c',
 'article': "BBC presenter Charlie Stayt has been blasted after he misspelt the word 'education' during a live report on the issue. Viewers\xa0took to Twitter to complain after he missed out the letter 'c' when he scrawled the word on a whiteboard at a primary school in Southampton, Hampshire. Viewer Hardy Lion tweeted: '#bbcbreakfast ha ha spelt education wrong!!! Back to school!' Back to school: Eagle-eyed viewers spotted Charlie Stayt's embarrassing spelling error during a live broadcast . The presenter left out the letter 'c' when he wrote the word for the third time on the whiteboard this morning . The presenter later described the gaffe as 'one of those things,' after being alerted  to it by colleagues . Whilst Mike Knowles tweeted: 'If you're presenting a live report about education on #BBCBreakfast maybe double check how you spell education.' Viewer John Morley added: 'Fantastic, #bbcbreakfast presenter can't spell education.' 

In [50]:
preprocess_function(dataset["train"][0])

{'input_ids': [9938, 915, 49, 12707, 8026, 17, 65, 118, 3, 115, 19054, 227, 3, 88, 3041, 4343, 17, 8, 1448, 3, 31, 29117, 31, 383, 3, 9, 619, 934, 30, 8, 962, 5, 4197, 277, 808, 12, 3046, 12, 15524, 227, 3, 88, 4785, 91, 8, 2068, 3, 31, 75, 31, 116, 3, 88, 14667, 9, 210, 1361, 8, 1448, 30, 3, 9, 872, 1976, 44, 3, 9, 2329, 496, 16, 28165, 6, 17944, 5, 4197, 49, 6424, 63, 10371, 27975, 10, 3, 31, 4663, 115, 115, 75, 14577, 11584, 4244, 4244, 3, 7, 4343, 17, 1073, 1786, 3158, 3195, 12, 496, 55, 31, 3195, 12, 496, 10, 10341, 18, 15, 10093, 13569, 3, 16972, 12707, 8026, 17, 31, 7, 27445, 19590, 3505, 383, 3, 9, 619, 6878, 3, 5, 37, 915, 49, 646, 91, 8, 2068, 3, 31, 75, 31, 116, 3, 88, 2832, 8, 1448, 21, 8, 1025, 97, 30, 8, 872, 1976, 48, 1379, 3, 5, 37, 915, 49, 865, 3028, 8, 7922, 7398, 38, 3, 31, 782, 13, 273, 378, 6, 31, 227, 271, 5685, 15, 26, 12, 34, 57, 6976, 3, 5, 549, 11414, 4794, 8900, 965, 27975, 10, 3, 31, 5801, 25, 31, 60, 3, 12072, 3, 9, 619, 934, 81, 1073, 30, 1713, 7640, 254,

In [51]:
final_data = dataset.map(preprocess_function, batched=True,
                     remove_columns=['id', 'article', 'summary'])

Map:   0%|          | 0/9192 [00:00<?, ? examples/s]

Map:   0%|          | 0/1149 [00:00<?, ? examples/s]

Map:   0%|          | 0/1149 [00:00<?, ? examples/s]

In [52]:
final_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9192
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1149
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1149
    })
})

In [53]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

## ROUGE compute metric to evaluate predictions

In [54]:
rouge_score = evaluate.load("rouge")

In [62]:
metric = load("rouge")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key : value * 100 for key, value in result.items()}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v,4) for k,v in result.items()}

**Note**:- Since mT5 is an encoder-decoder Transformer model, one subtlety with preparing our batches is that during decoding we need to shift the labels to the right by one. This is required to ensure that the decoder only sees the previous ground truth labels and not the current or future ones, which would be easy for the model to memorize. 

In [63]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

## Setting-up training arguments

In [64]:
from transformers import Seq2SeqTrainingArguments

num_train_epochs=3
batch_size = 8

logging_steps = len(final_data["train"]) // batch_size
model_name = f"{model_checkpoint}-news-summarizer"

args = Seq2SeqTrainingArguments(
    output_dir=f"/kaggle/working/{model_name}-finetuned-t5",  # Directory to save the model and checkpoints
    evaluation_strategy="epoch",  # Evaluate the model at the end of each epoch
    learning_rate=2e-5,  # Learning rate for the optimizer
    per_device_train_batch_size=batch_size,  # Batch size per device for training
    per_device_eval_batch_size=batch_size,  # Batch size per device for evaluation
    weight_decay=0.01,  # Weight decay for regularization to prevent overfitting
    save_total_limit=3,  # Limit the total number of checkpoints saved, only keep the 3 most recent
    num_train_epochs=num_train_epochs,  # Total number of training epochs
    predict_with_generate=True,  # When set to `True`, the model generates the output sequences during evaluation (e.g., at the end of each epoch or during validation) and prediction, rather than just outputting raw logits or token IDs.
    logging_steps=logging_steps,  # Log training metrics every 'logging_steps' steps
    fp16=True, 
)

In [65]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=final_data["train"],
    eval_dataset=final_data["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [66]:
gc.collect()

6618

In [67]:
torch.cuda.empty_cache()

## Finetuning model

In [68]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,1.727893,24.5008,11.6302,20.3195,23.0931,18.9956
2,1.905700,1.7157,24.4917,11.7517,20.3784,23.1519,18.9956
3,1.905700,1.712374,24.6681,11.8971,20.515,23.3033,18.9956


TrainOutput(global_step=1725, training_loss=1.90033203125, metrics={'train_runtime': 1722.1798, 'train_samples_per_second': 16.012, 'train_steps_per_second': 1.002, 'total_flos': 7464206086963200.0, 'train_loss': 1.90033203125, 'epoch': 3.0})

## Saving model


In [69]:
tokenizer.save_pretrained("/kaggle/working/mymodel")

('/kaggle/working/mymodel/tokenizer_config.json',
 '/kaggle/working/mymodel/special_tokens_map.json',
 '/kaggle/working/mymodel/spiece.model',
 '/kaggle/working/mymodel/added_tokens.json',
 '/kaggle/working/mymodel/tokenizer.json')

In [70]:
model.save_pretrained("/kaggle/working/mymodel")

## Inferencing model

In [71]:
from transformers import pipeline

In [85]:
 model_checkpoint = "/kaggle/working/mymodel"

In [81]:
pipe = pipeline("summarization", model=model_checkpoint, tokenizer=model_checkpoint)

In [82]:
result = pipe("""
Virat Kohli (Hindi pronunciation: [ʋɪˈɾɑːʈ ˈkoːɦli] ⓘ; born 5 November 1988) is an Indian international cricketer and the former captain of the Indian national cricket team. He is a right-handed batsman and an occasional medium-fast bowler. He currently represents Royal Challengers Bengaluru in the IPL and Delhi in domestic cricket. Kohli is widely regarded as one of the greatest batsmen of all time.[4] He holds the record as the highest run-scorer in IPL, ranks second in T20I, third in ODI, and stands as the fourth-highest in international cricket.[5] He also holds the record for scoring the most centuries in ODI cricket and stands second in the list of most international centuries scored. Kohli was a key member of the Indian team that won the 2011 Cricket World Cup, 2013 ICC Champions Trophy, and captained India to win the ICC Test mace three consecutive times in 2017, 2018, and 2019.[6]
""")

In [83]:
result[0]['summary_text']

'Virat Kohli is an Indian international cricketer and former captain of the Indian national cricket team . He holds the record as the highest run-scorer in IPL, ranks second in T20I, third in ODI, and stands as the fourth-highest in international cricket .'

## Measuring model size

In [78]:
import torch

# ################ monkey patch for quanto
def named_module_tensors(module, recurse=False):
    for named_parameter in module.named_parameters(recurse=recurse):
      name, val = named_parameter
      flag = True
      if hasattr(val,"_data") or hasattr(val,"_scale"):
        if hasattr(val,"_data"):
          yield name + "._data", val._data
        if hasattr(val,"_scale"):
          yield name + "._scale", val._scale
      else:
        yield named_parameter

    for named_buffer in module.named_buffers(recurse=recurse):
      yield named_buffer

def dtype_byte_size(dtype):
    """
    Returns the size (in bytes) occupied by one parameter of type `dtype`.
    """
    import re
    if dtype == torch.bool:
        return 1 / 8
    bit_search = re.search(r"[^\d](\d+)$", str(dtype))
    if bit_search is None:
        raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
    bit_size = int(bit_search.groups()[0])
    return bit_size // 8

def compute_module_sizes(model):
    """
    Compute the size of each submodule of a given model.
    """
    from collections import defaultdict
    module_sizes = defaultdict(int)
    for name, tensor in named_module_tensors(model, recurse=True):
      size = tensor.numel() * dtype_byte_size(tensor.dtype)
      name_parts = name.split(".")
      for idx in range(len(name_parts) + 1):
        module_sizes[".".join(name_parts[:idx])] += size

    return module_sizes

In [86]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
module_sizes = compute_module_sizes(model)
print(f"The model size is {module_sizes[''] * 1e-9} GB")

The model size is 0.242026496 GB


In [93]:
!zip -r file.zip /kaggle/working/mymodel

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/mymodel/ (stored 0%)
  adding: kaggle/working/mymodel/model.safetensors (deflated 10%)
  adding: kaggle/working/mymodel/tokenizer_config.json (deflated 95%)
  adding: kaggle/working/mymodel/tokenizer.json (deflated 74%)
  adding: kaggle/working/mymodel/config.json (deflated 62%)
  adding: kaggle/working/mymodel/generation_config.json (deflated 29%)
  adding: kaggle/working/mymodel/spiece.model (deflated 48%)
  adding: kaggle/working/mymodel/special_tokens_map.json (deflated 85%)


In [94]:
from IPython.display import FileLink
FileLink(r'file.zip')