In [1]:
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score




In [2]:
import pprint
from pathlib import Path

import datasets
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer

In [3]:
# Shared pretty-printer; used in the inference section to display generated summaries.
pp = pprint.PrettyPrinter()


# Loading and preparing the dataset.
For this summarization task, we will use the BBC News Summary dataset because it covers a wide range of text across different domains, e.g., entertainment, politics, and sports.

In [4]:
# Fetch the 'train' split of the BBC News Summary dataset and view it as a DataFrame.
dataset = load_dataset('gopalkalpande/bbc-news-summary', split='train')
df = pd.DataFrame(dataset)

# Report (rows, columns), leave some vertical space, then preview the first rows.
frame_shape = df.shape
print('size of our data : ', frame_shape)
print('\n')
df.head()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


size of our data :  (2224, 3)




Unnamed: 0,File_path,Articles,Summaries
0,politics,Budget to set scene for election..Gordon Brown...,- Increase in the stamp duty threshold from £6...
1,politics,Army chiefs in regiments decision..Military ch...,"""They are very much not for the good and will ..."
2,politics,Howard denies split over ID cards..Michael How...,Michael Howard has denied his shadow cabinet w...
3,politics,Observers to monitor UK election..Ministers wi...,The report said individual registration should...
4,politics,Kilroy names election seat target..Ex-chat sho...,"UKIP's leader, Roger Knapman, has said he is g..."


In [5]:
# Count missing values per column.
df.isna().sum()

File_path    0
Articles     0
Summaries    0
dtype: int64

In [6]:
def get_avg_text_length(data):
    """Return the mean whitespace-delimited word count of articles and summaries.

    Args:
        data: DataFrame whose column at position 1 holds the article texts and
            whose column at position 2 holds the summary texts (the positional
            layout the original implementation relied on).

    Returns:
        A tuple ``(avg_article_words, avg_summary_words)``. An empty frame
        yields ``(0.0, 0.0)`` instead of raising ``ZeroDivisionError``.
    """
    size = data.shape[0]
    if size == 0:
        return 0.0, 0.0

    # Vectorised word counts: split on whitespace, then take each list's length.
    article_words = data.iloc[:, 1].str.split().str.len().sum()
    summary_words = data.iloc[:, 2].str.split().str.len().sum()
    return article_words / size, summary_words / size

# Compute both averages in a single pass over the frame, then report them truncated to whole words.
avg_article_len, avg_summary_len = get_avg_text_length(df)
print(f"Average article length : {int(avg_article_len)} words\nAverage summary length : {int(avg_summary_len)} words")

Average article length : 379 words
Average summary length : 165 words


In [7]:
def find_longest_length(dataset, thresholds=(4000, 2000, 1000, 500)):
    """Summarise the word-length distribution of a collection of texts.

    Word counts are based on whitespace splitting. The result is used to get a
    feel for how long articles and summaries are before choosing a
    tokenization length.

    Args:
        dataset: Iterable of strings.
        thresholds: Word-count limits; for each one, the number of texts
            strictly longer than the limit is counted. The default reproduces
            the original 4000/2000/1000/500 buckets.

    Returns:
        A tuple ``(max_length, count_0, count_1, ...)`` with one count per
        entry of ``thresholds``, in the same order. With the default
        thresholds this is the 5-tuple
        ``(max_length, counter_4k, counter_2k, counter_1k, counter_500)``.
    """
    max_length = 0
    counters = [0] * len(thresholds)
    for text in dataset:
        n_words = len(text.split())
        max_length = max(max_length, n_words)
        for position, limit in enumerate(thresholds):
            if n_words > limit:
                counters[position] += 1
    return (max_length, *counters)

def _report_lengths(singular, plural, texts):
    """Print the longest text and the over-threshold counts for one column.

    Args:
        singular: Label used in the "Longest ... length" line (e.g. "article").
        plural: Label used in the "... larger than N words" lines (e.g. "Articles").
        texts: Iterable of strings passed to ``find_longest_length``.

    Returns:
        The longest word count found in ``texts``.
    """
    longest, over_4k, over_2k, over_1k, over_500 = find_longest_length(texts)
    print(f"Longest {singular} length: {longest} words")
    for limit, count in ((4000, over_4k), (2000, over_2k), (1000, over_1k), (500, over_500)):
        print(f"{plural} larger than {limit} words: {count}")
    return longest


# One report per text column; each column keeps its own result instead of
# sharing (and overwriting) a single set of counter variables.
longest_article_length = _report_lengths("article", "Articles", df['Articles'])
longest_summary_length = _report_lengths("summary", "Summaries", df['Summaries'])

Longest article length: 4377 words
Artciles larger than 4000 words: 1
Artciles larger than 2000 words: 7
Artciles larger than 1000 words: 21
Artciles larger than 500 words: 441
Longest summary length: 2073 words
Summaries larger than 4000 words: 0
Summaries larger than 2000 words: 1
Summaries larger than 1000 words: 7
Summaries larger than 500 words: 16


In [8]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the shuffle
# (and therefore every downstream metric) reproducible across kernel restarts.
trainset, testset = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)
print('Size of the training data : ',trainset.shape)
print('Size of the testing data : ',testset.shape)


Size of the training data :  (1779, 3)
Size of the testing data :  (445, 3)


# Configuration

In [23]:
MODEL = 't5-base'  # checkpoint name passed to the tokenizer and model `from_pretrained` calls
BATCH_SIZE = 4  # per-device batch size for both training and evaluation
NUM_PROCS = 4  # worker processes used by `Dataset.map` during tokenization
EPOCHS = 10  # number of training epochs
OUT_DIR = 'results_t5base'  # output directory for checkpoints, logs and the saved tokenizer
MAX_LENGTH = 512  # token limit applied to both the article inputs and the summary labels

In [10]:
# Load the tokenizer that matches the MODEL checkpoint.
tokenizer = T5Tokenizer.from_pretrained(MODEL)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
def preprocess_function(data):
    """Convert a batch of raw text into model inputs and target labels.

    T5 handles several text-to-text tasks and expects a task prefix on each
    input, so every article is prefixed with ``'summarize: '``.

    Args:
        data: Batch with an ``'Articles'`` and a ``'Summaries'`` sequence of
            strings (as supplied by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the prefixed articles with an additional
        ``"labels"`` entry holding the token ids of the summaries. Inputs and
        labels are both truncated and padded to ``MAX_LENGTH``.
    """
    inputs = [f"summarize: {article}" for article in data['Articles']]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # Tokenize the targets via `text_target=`, the supported replacement for
    # the deprecated `tokenizer.as_target_tokenizer()` context manager.
    targets = list(data['Summaries'])
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # NOTE(review): padded label positions keep the pad token id rather than
    # -100, so they presumably count towards the loss — confirm this is intended.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [12]:
# Both splits go through the same batched, multi-process tokenization step.
map_options = {"batched": True, "num_proc": NUM_PROCS}

train_ds = datasets.Dataset.from_pandas(trainset)
trainset_tok = train_ds.map(preprocess_function, **map_options)

test_ds = datasets.Dataset.from_pandas(testset)
testset_tok = test_ds.map(preprocess_function, **map_options)

Map (num_proc=4):   0%|          | 0/1779 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/445 [00:00<?, ? examples/s]



In [13]:
# Load the pretrained checkpoint and place it on the GPU when one is available.
model = T5ForConditionalGeneration.from_pretrained(MODEL)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Report the overall parameter count and how many of those receive gradients.
param_info = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in param_info)
total_trainable_params = sum(count for count, trainable in param_info if trainable)
print(f"{total_params:,} total parameters.")
print(f"{total_trainable_params:,} training parameters.")

222,903,552 total parameters.
222,903,552 training parameters.


# ROUGE metric
We will use the ROUGE metric to automatically evaluate our model's text summarization.

In [14]:
# Load the ROUGE metric; `compute_metrics` calls `rouge.compute` on decoded text.
rouge = evaluate.load("rouge")

In [15]:
def compute_metrics(eval_pred):
    """Compute ROUGE-1/2/L and the mean prediction length for an evaluation run.

    Args:
        eval_pred: Object with ``predictions`` and ``label_ids`` attributes.
            ``predictions`` may be an array of token ids or a tuple/list whose
            first element is that array (as produced when
            ``preprocess_logits_for_metrics`` returns ``(pred_ids, labels)``).

    Returns:
        Dict with ``rouge1``, ``rouge2``, ``rougeL`` and ``gen_len``, each
        rounded to four decimals.
    """
    predictions, labels = eval_pred.predictions, eval_pred.label_ids
    # Only unwrap a tuple/list; indexing a bare array would pick the first example.
    if isinstance(predictions, (tuple, list)):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and is not a valid token id, so map
    # it to the pad token before decoding (for predictions as well as labels).
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )

    # Mean number of non-pad tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Training the model

In [16]:
# Training configuration: evaluate every 200 steps, checkpoint once per epoch
# (keeping only the two most recent checkpoints) and log to TensorBoard in OUT_DIR.
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument, which recent transformers releases no longer accept.
    eval_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=0.0001,
    dataloader_num_workers=4
)

def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw logits to predicted token ids before they are accumulated.

    Keeping only the arg-max ids avoids storing the full vocabulary-sized
    logits for every evaluation batch.

    Args:
        logits: Either the logits tensor itself or a tuple/list whose first
            element is the logits tensor.
        labels: Label ids for the batch; returned unchanged.

    Returns:
        Tuple ``(pred_ids, labels)`` where ``pred_ids`` is the arg-max over the
        last dimension of the logits.
    """
    # Only unwrap containers: indexing a bare tensor with [0] would silently
    # select the first batch element instead of the logits.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

In [20]:
# Select the compute device and report which one is in use.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")
print(f"Using GPU: {torch.cuda.get_device_name(0)}" if gpu_available else "Using CPU")

Using GPU: Tesla T4


In [17]:
# Trainer takes care of optimization (gradient descent), checkpointing, logging and evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset_tok,
    eval_dataset=testset_tok,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Launch fine-tuning; the value returned by train() is kept in `logs`.
logs = trainer.train()




Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
200,0.3968,0.414524,0.8945,0.8204,0.8752,233.0382
400,0.4126,0.355332,0.9013,0.8305,0.8836,233.9888
600,0.3704,0.344368,0.9049,0.835,0.8871,233.9888
800,0.4298,0.332507,0.9069,0.8384,0.8897,233.9888
1000,0.3227,0.329268,0.9082,0.8408,0.8915,233.9888
1200,0.2427,0.323858,0.9092,0.8415,0.8925,233.9888
1400,0.2535,0.322292,0.9099,0.8438,0.8938,233.9888
1600,0.2518,0.320511,0.9109,0.8445,0.8942,233.9888
1800,0.2382,0.318759,0.9117,0.8464,0.8953,233.9888
2000,0.3116,0.316957,0.9116,0.8458,0.8952,233.991




In [24]:
# Store the tokenizer files in OUT_DIR; the inference section reloads the tokenizer from there.
tokenizer.save_pretrained(OUT_DIR)


('results_t5base/tokenizer_config.json',
 'results_t5base/special_tokens_map.json',
 'results_t5base/spiece.model',
 'results_t5base/added_tokens.json')

In [25]:
# Recursively archive the output directory (checkpoints, event logs, tokenizer files).
!zip -r {OUT_DIR} {OUT_DIR}


  adding: results_t5base/ (stored 0%)
  adding: results_t5base/events.out.tfevents.1705251078.374b6640745f.841.1 (deflated 63%)
  adding: results_t5base/checkpoint-4450/ (stored 0%)
  adding: results_t5base/checkpoint-4450/generation_config.json (deflated 27%)
  adding: results_t5base/checkpoint-4450/scheduler.pt (deflated 56%)
  adding: results_t5base/checkpoint-4450/model.safetensors (deflated 8%)
  adding: results_t5base/checkpoint-4450/optimizer.pt (deflated 8%)
  adding: results_t5base/checkpoint-4450/trainer_state.json (deflated 87%)
  adding: results_t5base/checkpoint-4450/config.json (deflated 63%)
  adding: results_t5base/checkpoint-4450/rng_state.pth (deflated 25%)
  adding: results_t5base/checkpoint-4450/training_args.bin (deflated 51%)
  adding: results_t5base/checkpoint-4005/ (stored 0%)
  adding: results_t5base/checkpoint-4005/generation_config.json (deflated 27%)
  adding: results_t5base/checkpoint-4005/scheduler.pt (deflated 55%)
  adding: results_t5base/checkpoint-4005

# Inference (yay!)

In [26]:
# Locate the most recent checkpoint instead of hard-coding its step number,
# which changes with the dataset size, batch size and number of epochs.
checkpoint_dirs = sorted(
    (
        path for path in Path(OUT_DIR).glob("checkpoint-*")
        if path.is_dir() and path.name.rsplit("-", 1)[-1].isdigit()
    ),
    key=lambda path: int(path.name.rsplit("-", 1)[-1]),
)
if not checkpoint_dirs:
    raise FileNotFoundError(
        f"No 'checkpoint-<step>' directory found in {OUT_DIR!r}; run training first."
    )

model_path = str(checkpoint_dirs[-1])  # highest step number = latest saved model
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(OUT_DIR)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [28]:
def summarize_text(text, model, tokenizer, max_length=512, num_beams=5, summary_max_length=50):
    """Generate a summary of ``text`` with a sequence-to-sequence model.

    Args:
        text: Document to summarise; it is prefixed with ``"summarize: "``.
        model: Model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        max_length: Maximum number of *input* tokens; longer inputs are truncated.
        num_beams: Number of beams passed to ``model.generate``.
        summary_max_length: Maximum length passed to ``model.generate`` for the
            generated summary. Defaults to 50, the value that was previously
            hard-coded; raise it to avoid summaries being cut off mid-sentence.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Preprocess the text
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=max_length,
        truncation=True
    )
    # Input ids must live on the same device as the model weights.
    inputs = inputs.to(model.device)

    # Generate the summary
    summary_ids = model.generate(
        inputs,
        max_length=summary_max_length,
        num_beams=num_beams,
    )

    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [29]:
# Sample news passages used as inputs for the inference loop below.
text1 = """New York
CNN
 —
The artificial intelligence industry was shaken by tectonic shifts over the weekend.

Sam Altman — the leader of one of the world’s most influential AI companies, OpenAI, and perhaps the most visible figure in the space — was fired Friday night by the startup’s board in a surprise move. Within about 48 hours, he’d been hired to run a new division at Microsoft where he’ll be arguably even more powerful, with the resources of one of the world’s biggest tech companies and the direct backing of its chief executive.

And as for OpenAI, the future feels deeply uncertain, with the board apparently having lost the trust of the company’s staff and employees threatening to head for the exits en masse.

The debacle unfolded just over a week after OpenAI held its first-ever developer conference, where it laid out new, commercialized versions of its technology, including the option to customize its ChatGPT AI chatbot.

If you’re just catching up, here’s what you missed from a weekend that could fundamentally change the AI development arms race:"""

text2 = """Microsoft has hired Sam Altman to power up its innovation in artificial intelligence after the co-founder of OpenAI was ousted as CEO in a chaotic boardroom coup on Friday. Meanwhile, the ChatGPT company will get its third CEO in three days.

It’s another major shakeup to the balance of power over artificial intelligence, the most significant new technology in decades.

Greg Brockman, another co-founder of OpenAI, is also joining Microsoft (MSFT) — the startup’s biggest financial backer. Brockmann quit as OpenAI president after Altman was fired.

Emmett Shear, the former CEO of Amazon’s streaming service Twitch, will join OpenAI as interim CEO. He replaces Mira Murati, who was named interim CEO when Altman was fired. She will return to her role as OpenAI’s chief technology officer.

“We look forward to getting to know Emmett Shear,” Microsoft CEO Satya Nadella said in a post on X, formerly known as Twitter. “And we’re extremely excited to share the news that Sam Altman and Greg Brockman, together with colleagues, will be joining Microsoft to lead a new advanced AI research team.”"""



In [32]:
# Summarise each sample passage and separate the results with a row of asterisks.
for passage in (text1, text2):
    generated_summary = summarize_text(passage, model, tokenizer)
    pp.pprint(generated_summary)
    print('\n', '*' * 80)


('Sam Altman — the leader of one of the world’s most influential AI companies, '
 'OpenAI, and perhaps the most visible figure in the space — was fired Friday '
 'night by the startup’s board in a surprise move.')

 ********************************************************************************
('Microsoft has hired Sam Altman to power up its innovation in artificial '
 'intelligence after the co-founder of OpenAI was ousted as CEO in a chaotic '
 'boardroom coup on Friday. Brockmann quit as OpenAI president after Altman '
 'was fired')

 ********************************************************************************
