# Pretrained Model on CNN_dailymail

In [None]:
%%capture
!pip install transformers
!pip install datasets
!pip install rouge_score

In [None]:
import gc

import numpy as np
import torch
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AdamW
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import T5Tokenizer, T5Model
from transformers import get_scheduler

In [None]:
# Load the "3.0.0" configuration of the cnn_dailymail dataset from the Hugging Face Hub.
dataset = load_dataset("cnn_dailymail", "3.0.0")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
# Draw small random subsets for fine-tuning and evaluation.
# The fixed seed makes the shuffle (and therefore the subsets) repeatable.
shuffled_train = dataset["train"].shuffle(seed=42)
shuffled_test = dataset["test"].shuffle(seed=42)

train_dataset = shuffled_train.select(range(1000))  # Adjust the range as needed
test_dataset = shuffled_test.select(range(100))

In [None]:
# Load the tokenizer published with the "facebook/bart-large-cnn" checkpoint.
# It is used below both to encode articles/highlights and to decode generated ids.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

In [None]:
# Tokenize the data
def process_data_to_model_inputs(batch):
    """Tokenize a batch of articles and highlights into model input fields.

    Uses the module-level ``tokenizer``. Articles are padded/truncated to
    1024 tokens and highlights to 142 tokens.

    Args:
        batch: Mapping with an "article" and a "highlights" entry.

    Returns:
        The same ``batch`` object, with "input_ids" and "attention_mask"
        (from the articles) and "labels" (from the highlights) added.
    """
    shared_options = {"padding": "max_length", "truncation": True}
    article_encoding = tokenizer(batch["article"], max_length=1024, **shared_options)
    summary_encoding = tokenizer(batch["highlights"], max_length=142, **shared_options)

    batch["input_ids"] = article_encoding.input_ids
    batch["attention_mask"] = article_encoding.attention_mask
    batch["labels"] = summary_encoding.input_ids
    return batch

In [None]:
# Tokenize both sampled splits in batches (train first, then test).
train_dataset, test_dataset = (
    split.map(process_data_to_model_inputs, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Make both splits return torch tensors, exposing only the columns the model consumes.
for split in (train_dataset, test_dataset):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# Batch both splits; only the training loader reshuffles its samples.
loader_batch_size = 4
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size)

In [None]:
# Load the "facebook/bart-large-cnn" checkpoint and move it to the GPU.
# train_model and evaluate_model below operate on this module-level `model`.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to("cuda")

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [None]:
def train_model(train_loader, num_epochs=3, learning_rate=5e-5):
  """Fine-tune the module-level ``model`` on the batches from ``train_loader``.

  Each batch must be a mapping of tensors that ``model`` accepts as keyword
  arguments (here "input_ids", "attention_mask" and "labels"). The learning
  rate decays linearly to zero over all steps, with no warmup.

  Args:
      train_loader: Sized iterable of training batches.
      num_epochs: Number of passes over ``train_loader``.
      learning_rate: Initial learning rate for AdamW.

  Returns:
      None. ``model`` is updated in place and left in training mode.
  """
  # torch.optim.AdamW is the implementation Transformers recommends in place of its
  # own deprecated AdamW; weight_decay=0.0 keeps that class's no-decay default.
  optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.0)

  num_training_steps = num_epochs * len(train_loader)
  lr_scheduler = get_scheduler(
      "linear",
      optimizer=optimizer,
      num_warmup_steps=0,
      num_training_steps=num_training_steps
  )

  # Follow the model's own device instead of hard-coding "cuda".
  device = model.device
  pad_token_id = model.config.pad_token_id

  model.train()
  # The context manager closes the progress bar even if a step raises.
  with tqdm(total=num_training_steps) as progress_bar:
      for _ in range(num_epochs):
          for batch in train_loader:
              batch = {k: v.to(device) for k, v in batch.items()}

              # Labels are padded to a fixed length; -100 is the index the loss
              # ignores, so padding positions no longer contribute to the loss.
              if pad_token_id is not None and "labels" in batch:
                  labels = batch["labels"]
                  batch["labels"] = labels.masked_fill(labels == pad_token_id, -100)

              outputs = model(**batch)
              loss = outputs.loss
              loss.backward()

              optimizer.step()
              lr_scheduler.step()
              optimizer.zero_grad()
              progress_bar.update(1)

In [None]:
# Collect unreachable Python objects first so any CUDA tensors they hold are released,
# then hand the now-unused cached blocks back to the GPU driver.
gc.collect()
torch.cuda.empty_cache()

109

In [None]:
# Fine-tune the model on the sampled CNN/DailyMail training subset.
train_model(train_loader)



  0%|          | 0/750 [00:00<?, ?it/s]

In [None]:
def evaluate_model(data_loader):
    """Generate summaries with the module-level ``model`` and score them with ROUGE.

    Args:
        data_loader: Sized iterable of batches holding "input_ids",
            "attention_mask" and "labels" tensors.

    Returns:
        A tuple ``(predictions, references, rouge_scores)``: the decoded
        generated summaries, the decoded label summaries, and the result of
        the "rouge" metric computed over them.

    Note:
        Leaves ``model`` in evaluation mode.
    """
    rouge = load_metric("rouge")
    # Follow the model's own device instead of hard-coding "cuda".
    device = model.device
    pad_token_id = tokenizer.pad_token_id

    predictions = []
    references = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            # Only the encoder inputs are needed on the model's device;
            # labels are just decoded, so they stay where they are.
            generated = model.generate(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )

            # Negative ids (e.g. the -100 loss-ignore index) cannot be decoded;
            # map them back to padding, which skip_special_tokens then drops.
            labels = batch["labels"]
            labels = labels.masked_fill(labels < 0, pad_token_id)

            predictions.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
            references.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))

    # Compute and return ROUGE scores
    rouge_scores = rouge.compute(predictions=predictions, references=references)
    return predictions, references, rouge_scores

In [None]:
# Score the model on the sampled CNN/DailyMail test subset; only the ROUGE scores are kept.
_,_,final_scores = evaluate_model(test_loader)

  0%|          | 0/25 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
# import pandas as pd
# df = pd.DataFrame({'prediction': prediction, 'reference': reference})

In [None]:
# Display the ROUGE scores for the CNN/DailyMail test subset.
final_scores

{'rouge1': AggregateScore(low=Score(precision=0.3416840075172186, recall=0.488797140923519, fmeasure=0.3947156710976033), mid=Score(precision=0.36590389624447006, recall=0.5136637899513505, fmeasure=0.41730383646832026), high=Score(precision=0.38966684125539924, recall=0.5380518650178778, fmeasure=0.439130125140267)),
 'rouge2': AggregateScore(low=Score(precision=0.14136973294427563, recall=0.1998722341532475, fmeasure=0.16219489699448525), mid=Score(precision=0.16019284466090877, recall=0.22214900814507166, fmeasure=0.18167056117804092), high=Score(precision=0.1793391038865266, recall=0.24581935265672028, fmeasure=0.2005057591078439)),
 'rougeL': AggregateScore(low=Score(precision=0.22967836721494103, recall=0.32569762895243354, fmeasure=0.26417674060666263), mid=Score(precision=0.24646997249623165, recall=0.34663035210870974, fmeasure=0.28137471589999097), high=Score(precision=0.26523991262762714, recall=0.36957630830617977, fmeasure=0.29860568409146365)),
 'rougeLsum': AggregateScor

# Current News Data


In [None]:
import uuid
from datasets import Dataset, DatasetDict
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load our news dataset and give it the same schema and preprocessing as the CNN/DailyMail subsets

df = pd.read_csv('2024-02-13.csv')

# Rename to the column names process_data_to_model_inputs reads.
df = df.rename(columns={'content': 'article', 'summary': 'highlights'})

# Fail early with a clear message instead of a KeyError deep inside Dataset.map.
missing_columns = {'article', 'highlights'} - set(df.columns)
if missing_columns:
    raise KeyError(f"2024-02-13.csv is missing required column(s): {sorted(missing_columns)}")

# Rows with missing text load as NaN (a float), which the tokenizer cannot encode.
df = df.dropna(subset=['article', 'highlights']).reset_index(drop=True)

# Give every remaining row a unique string id.
df['id'] = [str(uuid.uuid4()) for _ in range(len(df))]
new_train_df, new_test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert the splits to Dataset objects; preserve_index=False keeps the shuffled
# pandas index from being carried over as an extra "__index_level_0__" column.
new_train_dataset = Dataset.from_pandas(new_train_df, preserve_index=False)
new_test_dataset = Dataset.from_pandas(new_test_df, preserve_index=False)

# Process datasets with your existing function
new_train_dataset = new_train_dataset.map(process_data_to_model_inputs, batched=True)
new_test_dataset = new_test_dataset.map(process_data_to_model_inputs, batched=True)

new_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
new_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Create DataLoaders for the new datasets
new_train_loader = DataLoader(new_train_dataset, batch_size=4, shuffle=True)
new_test_loader = DataLoader(new_test_dataset, batch_size=4)

In [None]:
# Baseline: score the model on the news test split before fine-tuning on the news data.
_,_,pre_train_new_scores = evaluate_model(new_test_loader)

In [None]:
# Display the baseline ROUGE scores on the news test split.
pre_train_new_scores

In [None]:
# Collect unreachable Python objects first so any CUDA tensors they hold are released,
# then hand the now-unused cached blocks back to the GPU driver.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Continue fine-tuning the same model on the news training split.
train_model(new_train_loader)

In [None]:
# Score the model on the news test split after fine-tuning on the news data.
_,_,trained_new_scores = evaluate_model(new_test_loader)

In [None]:
# Re-score the CNN/DailyMail test subset with the news-fine-tuned model,
# for comparison against final_scores.
_,_,trained_old_scores = evaluate_model(test_loader)

In [None]:
# Display the ROUGE scores on the news test split after fine-tuning.
trained_new_scores

In [None]:
# Display the ROUGE scores on the CNN/DailyMail test subset after fine-tuning on news data.
trained_old_scores

# Saving Model