# Pretrained Model on CNN_dailymail

In [2]:
!pip install transformers[torch] accelerate -U
!pip install datasets
!pip install rouge_score

Collecting transformers[torch]
  Downloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using

In [3]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, load_metric
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_scheduler
import numpy as np
from tqdm.auto import tqdm
import gc
import nltk
import string

In [4]:
metric = load_metric("rouge")
nltk.download('punkt')

  metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
dataset = load_dataset("cnn_dailymail", "3.0.0")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [6]:
dataset_filtered = dataset.filter(
    lambda example: (len(example['article']) >= 500)
)

Filter:   0%|          | 0/287113 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13368 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [7]:
# Randomly sample a portion of the data for fine-tuning
train_dataset = dataset["train"].shuffle(seed=42).select(range(4000)) # Adjust the range as needed
test_dataset = dataset["test"].shuffle(seed=42).select(range(400))

In [55]:
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to("cuda")

In [56]:
prefix = "summarize: "
max_input_length = 512
max_target_length = 64

In [57]:
def clean_text(text):
    sentences = nltk.sent_tokenize(text.strip())
    sentences_cleaned = [s for sent in sentences for s in sent.split("\n")]
    sentences_cleaned_no_titles = [sent for sent in sentences_cleaned
                                    if len(sent) > 0 and
                                    sent[-1] in string.punctuation]
    text_cleaned = "\n".join(sentences_cleaned_no_titles)
    return text_cleaned

In [58]:
def preprocess_data(examples):
    texts_cleaned = [clean_text(text) for text in examples["article"]]
    inputs = [prefix + text for text in texts_cleaned]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=max_target_length,
                            truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [59]:
tokenized_train_dataset = train_dataset.map(preprocess_data, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]



In [60]:
batch_size = 1

In [61]:
model_name = "t5-small-news-summarization"
model_dir = f"./Models/{model_name}"

In [62]:
args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=4000,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=4000,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)

In [63]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [64]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [65]:
def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [66]:
trainer = Seq2SeqTrainer(
#   model_init=model_init,
    model = model,
    args=args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [67]:
torch.cuda.empty_cache()
gc.collect()

1233

In [68]:
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
4000,2.1059,1.870503,28.3993,14.1566,23.8014,26.905,18.995


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=4000, training_loss=2.095926528930664, metrics={'train_runtime': 492.7754, 'train_samples_per_second': 8.117, 'train_steps_per_second': 8.117, 'total_flos': 522645628256256.0, 'train_loss': 2.095926528930664, 'epoch': 1.0})

In [None]:
text = """
CNN — Forget soccer’s transfer deadline day on Thursday. The only switch most sports fans were talking about was the announcement that seven-time Formula One champion Lewis Hamilton would be leaving Mercedes at the end of the this season to join Ferrari in 2025, replacing Carlos Sainz at the Italian outfit.
“No two ways about it. For me it’s the single biggest driver transfer in the history of the sport. An incredible story,” F1 broadcaster Will Buxton said on social media.
Despite signing an extension with Mercedes until 2025 in August last year, the team said in a statement that Hamilton had “activated a release option in the contract” and would depart the Silver Arrows at the conclusion of the upcoming campaign.
“I have had an amazing 11 years with this team and I’m so proud of what we have achieved together … making the decision to leave was one of the hardest decisions I have ever had to make,” Hamilton said in the statement. “But the time is right for me to take this step and I’m excited to be taking on a new challenge.”
Ferrari announced shortly after that it had signed Hamilton to “a multi-year contract.”
Prior to Red Bull’s recent dominance in F1, the Hamilton-Mercedes partnership has become synonymous with success, with the German manufacturer securing eight World Constructors’ championships and Hamilton winning the drivers’ title six times.
For Ferrari, it’s a remarkable coup, having lured a driver that many consider to be the greatest of all time away from the team that he won six championships with. For Mercedes, it is a devastating loss, unable to retain a generational talent and the face of its entire motorsport brand.
Is the grass greener?
There is jeopardy in Hamilton’s decision to swap the Silver Arrows for the Prancing Horse. He has been so successful with Mercedes that it’s almost hard to imagine the 39-year-old Briton racing for any other team.
Hamilton holds the all-time record for F1 wins (103), achieving 82 of those with Mercedes and 21 with McLaren. His association with the German constructor goes back over 25 years to when he was signed as junior driver by McLaren – who used Mercedes engines – at 13 years old.
Hamilton will depart a team that he knows well – leaving behind mechanics and engineers that he has forged relationships with – to step into an unknown environment.
The Scuderia is statistically the most successful F1 team of all time, although it has not claimed a constructors’ crown since 2008, the year that Hamilton won his first F1 title with McLaren, and has not produced a world champion driver since 2007.
Though Ferrari mounted a challenge with Fernando Alonso and Sebastian Vettel in 2012 and 2017 respectively, it was unable to make a dent in Red Bull’s era of dominance between 2010 and 2013 and fell victim to the Mercedes dynasty during the remainder of that decade.
Now, Red Bull, led by three-time world champion Max Verstappen, is again on top and Ferrari have been unable to get close.
Charles Leclerc briefly flirted with challenging Verstappen at the start of the 2022 season, though both Ferrari drivers finished behind Hamilton in 2023 and Mercedes beat the Scuderia to second in the constructors’ race.
Success in red is far from a guarantee for Hamilton, who will also have to contend with new teammate Leclerc, who is well-established at the team and has long been touted as Ferrari’s next drivers’ champion.
Despite Hamilton’s apparent ability to defy the effects of time, age could also be a factor in how much he achieves with the Maranello-based team.
“[Hamilton] will actually be 40 when he starts racing for Ferrari, which is remarkable in itself that he’s starting this new chapter so late in his career,” Phil Duncan, who is PA Media’s F1 correspondent, told CNN Sport.
The oldest man to win an F1 world title is Juan Manuel Fangio at the age of 46 in 1956 and no driver has won a championship in their forties since Jack Brabham in 1966.
Winning a title in what will be his 19th year in the sport would mean that Hamilton would be doing something unprecedented in modern times.
For the best?
Perhaps the move, however, is a masterstroke.
“A change of scenery can completely rejuvenate you,” said CNN’s Coy Wire on World Sport. “Going into a new building, getting new teammates, no more staleness.”
Wire pointed to the NFL as an example, explaining that Tom Brady’s decision to end a 20-year tenure with the New England Patriots and sign with the Tampa Bay Buccaneers “brought him to life,” with the quarterback winning a Super Bowl with the franchise in 2021.
Hamilton’s move could reinvigorate the seven-time world champion and give him another chance to compete at the top of the sport. It also reunites him with Ferrari team principal Fred Vasseur, who Hamilton raced for at junior level.
Hamilton is without a win since December 2021, having watched Verstappen claim the world title in the final race of that season and then lap the field in 2022 and 2023.
Much of this can be attributed to Mercedes essentially falling off a cliff after 2021. New technical regulations were introduced in 2022, and while Red Bull swam, Mercedes sunk.
The team has looked uncompetitive, finishing hundreds of points behind Red Bull and losing out on podiums to the likes of Ferrari, McLaren and Aston Martin. Last season, a Mercedes driver failed to win a race across a whole season for the first time since 2011.
The last couple of seasons have been a bumpy ride for Hamilton, pictured here colliding with Fernando Alonso in 2022.
The last couple of seasons have been a bumpy ride for Hamilton, pictured here colliding with Fernando Alonso in 2022. John Thys/AFP via Getty Images
A damning indictment of Mercedes’ car development came over the airwaves from team principal Toto Wolff in Austria last year: “Lewis, the car is bad, we know. Please drive it.”
It is uncertain whether Hamilton will have better fortunes on the track at Ferrari, but with Mercedes’ process stagnating, it may well be worth taking the leap to racing’s most famous team.
Aside from all else, the move holds significance on a personal level for Hamilton. He has previously described driving for Ferrari as a “dream” and was unsure about why a move there never came to fruition. Now, as Duncan put it, it is a dream, “fulfilled.”
"""

inputs = ["summarize: " + text]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt").to("cuda")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=10, max_length=64)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_summary = nltk.sent_tokenize(decoded_output.strip())[0]

print(predicted_summary)

Lewis Hamilton will leave Mercedes at the end of the season to join Ferrari in 2025.


In [None]:
# torch.save(model.state_dict(), 'model_state_dict.pth')

#Current News Data


In [35]:
import uuid
from datasets import Dataset, DatasetDict
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
#Load our news dataset and do certain preprocessing

df = pd.read_csv('2024-02-13.csv')

df = df.dropna(subset=['summary'])
df = df[df['content'].apply(lambda x: len(x) >= 500)]
df['id'] = [str(uuid.uuid4()) for _ in range(len(df))]
df.rename(columns={'content': 'article', 'summary': 'highlights'}, inplace=True)
new_train_df, new_test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert the splits to Dataset objects
new_train_dataset = Dataset.from_pandas(new_train_df)
new_test_dataset = Dataset.from_pandas(new_test_df)
new_train_size = len(new_train_dataset)

In [None]:
# Process datasets with your existing function
new_train_dataset = new_train_dataset.map(preprocess_data, batched=True)
new_test_dataset = new_test_dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/339 [00:00<?, ? examples/s]



Map:   0%|          | 0/85 [00:00<?, ? examples/s]

In [None]:
args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=new_train_size,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=new_train_size,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)

In [None]:
trainer = Seq2SeqTrainer(
#   model_init=model_init,
    model = model,
    args=args,
    train_dataset=new_train_dataset,
    eval_dataset=new_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
torch.cuda.empty_cache()
gc.collect()

691

In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
339,1.9576,1.929674,29.5056,15.7484,25.5068,25.7468,18.6235


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=339, training_loss=1.9684211708451442, metrics={'train_runtime': 59.7405, 'train_samples_per_second': 5.675, 'train_steps_per_second': 5.675, 'total_flos': 42780697559040.0, 'train_loss': 1.9684211708451442, 'epoch': 1.0})

In [None]:
trainer.evaluate(tokenized_test_dataset)



{'eval_loss': 2.1147475242614746,
 'eval_rouge1': 28.1347,
 'eval_rouge2': 12.8993,
 'eval_rougeL': 22.8619,
 'eval_rougeLsum': 26.0546,
 'eval_gen_len': 18.92,
 'eval_runtime': 27.3425,
 'eval_samples_per_second': 3.657,
 'eval_steps_per_second': 3.657,
 'epoch': 1.0}

# Pipeline for retraining

In [35]:
past_test = [tokenized_test_dataset]

In [69]:
def retrain(fileName):
  df = pd.read_csv(fileName)

  #preprocessing/cleaning data
  df = df.dropna(subset=['summary'])
  df = df[df['content'].apply(lambda x: len(x) >= 500)]

  df['id'] = [str(uuid.uuid4()) for _ in range(len(df))]
  df.rename(columns={'content': 'article', 'summary': 'highlights'}, inplace=True)
  new_train_df, new_test_df = train_test_split(df, test_size=0.2, random_state=42)

  # Convert the splits to Dataset objects
  new_train_dataset = Dataset.from_pandas(new_train_df)
  new_test_dataset = Dataset.from_pandas(new_test_df)
  new_train_size = len(new_train_dataset)

  # Process datasets with your existing function
  new_train_dataset = new_train_dataset.map(preprocess_data, batched=True)
  new_test_dataset = new_test_dataset.map(preprocess_data, batched=True)

  # Save the new test_dataset to a set of past test data
  past_test.append(new_test_dataset)

  args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=new_train_size,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=new_train_size,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
  )

  trainer = Seq2SeqTrainer(
  #   model_init=model_init,
      model = model,
      args=args,
      train_dataset=new_train_dataset,
      eval_dataset=new_test_dataset,
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics
  )

  torch.cuda.empty_cache()
  gc.collect()

  trainer.train()

In [70]:
def reevaluate(past_test):
  for df in past_test:
    results = trainer.evaluate(df)
    print(results)

#Experiments with several datasets


In [39]:
retrain('2024-03-17.csv')

Map:   0%|          | 0/276 [00:00<?, ? examples/s]



Map:   0%|          | 0/70 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
276,2.0476,1.74684,31.2059,18.2337,26.6908,26.9099,18.8143


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


In [40]:
reevaluate(past_test)



{'eval_loss': 1.9650659561157227, 'eval_rouge1': 27.3551, 'eval_rouge2': 13.4916, 'eval_rougeL': 22.5196, 'eval_rougeLsum': 25.7461, 'eval_gen_len': 18.99, 'eval_runtime': 118.2167, 'eval_samples_per_second': 3.384, 'eval_steps_per_second': 3.384, 'epoch': 1.0}
{'eval_loss': 1.7468398809432983, 'eval_rouge1': 31.2059, 'eval_rouge2': 18.2337, 'eval_rougeL': 26.6908, 'eval_rougeLsum': 26.9099, 'eval_gen_len': 18.8143, 'eval_runtime': 20.0126, 'eval_samples_per_second': 3.498, 'eval_steps_per_second': 3.498, 'epoch': 1.0}


In [41]:
retrain('2024-03-18.csv')
reevaluate(past_test)

Map:   0%|          | 0/349 [00:00<?, ? examples/s]



Map:   0%|          | 0/88 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
349,1.9301,1.390487,38.9099,26.5159,35.6809,36.1292,18.7955


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'eval_loss': 2.035022020339966, 'eval_rouge1': 26.5532, 'eval_rouge2': 12.3384, 'eval_rougeL': 21.4924, 'eval_rougeLsum': 24.65, 'eval_gen_len': 18.9475, 'eval_runtime': 118.5675, 'eval_samples_per_second': 3.374, 'eval_steps_per_second': 3.374, 'epoch': 1.0}
{'eval_loss': 1.735974907875061, 'eval_rouge1': 34.517, 'eval_rouge2': 21.3186, 'eval_rougeL': 29.6719, 'eval_rougeLsum': 29.845, 'eval_gen_len': 18.7714, 'eval_runtime': 20.7757, 'eval_samples_per_second': 3.369, 'eval_steps_per_second': 3.369, 'epoch': 1.0}
{'eval_loss': 1.3904874324798584, 'eval_rouge1': 38.9099, 'eval_rouge2': 26.5159, 'eval_rougeL': 35.6809, 'eval_rougeLsum': 36.1292, 'eval_gen_len': 18.7955, 'eval_runtime': 25.822, 'eval_samples_per_second': 3.408, 'eval_steps_per_second': 3.408, 'epoch': 1.0}


In [42]:
retrain('2024-03-19.csv')
reevaluate(past_test)

Map:   0%|          | 0/347 [00:00<?, ? examples/s]



Map:   0%|          | 0/87 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
347,1.7663,1.8125,36.1121,24.4806,33.4611,33.9694,18.5402


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'eval_loss': 2.055870771408081, 'eval_rouge1': 26.1831, 'eval_rouge2': 11.5467, 'eval_rougeL': 21.1064, 'eval_rougeLsum': 24.2008, 'eval_gen_len': 18.9425, 'eval_runtime': 121.7062, 'eval_samples_per_second': 3.287, 'eval_steps_per_second': 3.287, 'epoch': 1.0}
{'eval_loss': 1.723526120185852, 'eval_rouge1': 34.6343, 'eval_rouge2': 22.2462, 'eval_rougeL': 30.2237, 'eval_rougeLsum': 30.3487, 'eval_gen_len': 18.7143, 'eval_runtime': 21.1548, 'eval_samples_per_second': 3.309, 'eval_steps_per_second': 3.309, 'epoch': 1.0}
{'eval_loss': 1.3753613233566284, 'eval_rouge1': 39.4239, 'eval_rouge2': 27.1186, 'eval_rougeL': 36.4687, 'eval_rougeLsum': 36.4813, 'eval_gen_len': 18.6705, 'eval_runtime': 26.61, 'eval_samples_per_second': 3.307, 'eval_steps_per_second': 3.307, 'epoch': 1.0}
{'eval_loss': 1.8125004768371582, 'eval_rouge1': 36.1121, 'eval_rouge2': 24.4806, 'eval_rougeL': 33.4611, 'eval_rougeLsum': 33.9694, 'eval_gen_len': 18.5402, 'eval_runtime': 26.0111, 'eval_samples_per_second': 3.34

# Rehearsal Retraining

In [71]:
from datasets import concatenate_datasets
past_train = tokenized_train_dataset
past_test = [tokenized_test_dataset]

In [72]:
def retrain_rehearsal(fileName):
  df = pd.read_csv(fileName)

  #preprocessing/cleaning data
  df = df.dropna(subset=['summary'])
  df = df[df['content'].apply(lambda x: len(x) >= 500)]

  df['id'] = [str(uuid.uuid4()) for _ in range(len(df))]
  df.rename(columns={'content': 'article', 'summary': 'highlights'}, inplace=True)
  new_train_df, new_test_df = train_test_split(df, test_size=0.2, random_state=42)

  # Convert the splits to Dataset objects
  new_train_dataset = Dataset.from_pandas(new_train_df)
  new_test_dataset = Dataset.from_pandas(new_test_df)

  # Process datasets with your existing function
  new_train_dataset = new_train_dataset.map(preprocess_data, batched=True)
  new_test_dataset = new_test_dataset.map(preprocess_data, batched=True)

  # Save the new test_dataset to a set of past test data

  new_past = concatenate_datasets([new_train_dataset, past_train])
  new_train_dataset = concatenate_datasets([new_train_dataset, past_train.shuffle(seed=42).select(range(100))])
  past_test.append(new_test_dataset)
  new_train_size = len(new_train_dataset)

  args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=new_train_size,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=new_train_size,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
  )

  trainer = Seq2SeqTrainer(
  #   model_init=model_init,
      model = model,
      args=args,
      train_dataset=new_train_dataset,
      eval_dataset=new_test_dataset,
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics
  )

  torch.cuda.empty_cache()
  gc.collect()

  trainer.train()

  return new_past

In [73]:
past_train = retrain_rehearsal('2024-03-17.csv')

Map:   0%|          | 0/276 [00:00<?, ? examples/s]



Map:   0%|          | 0/70 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
376,2.0569,1.765622,31.2201,18.4068,26.5998,26.6062,18.6857


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


In [74]:
reevaluate(past_test)



{'eval_loss': 1.92219877243042, 'eval_rouge1': 27.4251, 'eval_rouge2': 13.3685, 'eval_rougeL': 22.6429, 'eval_rougeLsum': 25.7593, 'eval_gen_len': 18.995, 'eval_runtime': 115.5883, 'eval_samples_per_second': 3.461, 'eval_steps_per_second': 3.461, 'epoch': 1.0}
{'eval_loss': 1.7656217813491821, 'eval_rouge1': 31.2201, 'eval_rouge2': 18.4068, 'eval_rougeL': 26.5998, 'eval_rougeLsum': 26.6062, 'eval_gen_len': 18.6857, 'eval_runtime': 20.3141, 'eval_samples_per_second': 3.446, 'eval_steps_per_second': 3.446, 'epoch': 1.0}


In [75]:
past_train = retrain_rehearsal('2024-03-18.csv')
reevaluate(past_test)

Map:   0%|          | 0/349 [00:00<?, ? examples/s]



Map:   0%|          | 0/88 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
449,1.9661,1.386014,38.2552,25.8238,35.1426,35.3012,18.6932


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'eval_loss': 1.913529634475708, 'eval_rouge1': 27.3721, 'eval_rouge2': 13.2682, 'eval_rougeL': 22.4525, 'eval_rougeLsum': 25.6158, 'eval_gen_len': 18.9375, 'eval_runtime': 116.0795, 'eval_samples_per_second': 3.446, 'eval_steps_per_second': 3.446, 'epoch': 1.0}
{'eval_loss': 1.7216171026229858, 'eval_rouge1': 34.737, 'eval_rouge2': 22.4744, 'eval_rougeL': 30.1899, 'eval_rougeLsum': 30.142, 'eval_gen_len': 18.6286, 'eval_runtime': 20.3324, 'eval_samples_per_second': 3.443, 'eval_steps_per_second': 3.443, 'epoch': 1.0}
{'eval_loss': 1.3860137462615967, 'eval_rouge1': 38.2552, 'eval_rouge2': 25.8238, 'eval_rougeL': 35.1426, 'eval_rougeLsum': 35.3012, 'eval_gen_len': 18.6932, 'eval_runtime': 25.2904, 'eval_samples_per_second': 3.48, 'eval_steps_per_second': 3.48, 'epoch': 1.0}


In [77]:
past_train = retrain_rehearsal('2024-03-19.csv')
reevaluate(past_test)

Map:   0%|          | 0/347 [00:00<?, ? examples/s]



Map:   0%|          | 0/87 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
447,1.7482,1.808954,36.1595,24.2773,33.7156,34.1126,18.6437


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'eval_loss': 1.9222761392593384, 'eval_rouge1': 27.0582, 'eval_rouge2': 12.815, 'eval_rougeL': 22.1374, 'eval_rougeLsum': 25.2559, 'eval_gen_len': 18.9525, 'eval_runtime': 117.8936, 'eval_samples_per_second': 3.393, 'eval_steps_per_second': 3.393, 'epoch': 1.0}
{'eval_loss': 1.7229385375976562, 'eval_rouge1': 37.0328, 'eval_rouge2': 24.4671, 'eval_rougeL': 32.3948, 'eval_rougeLsum': 32.5364, 'eval_gen_len': 18.7857, 'eval_runtime': 20.47, 'eval_samples_per_second': 3.42, 'eval_steps_per_second': 3.42, 'epoch': 1.0}
{'eval_loss': 1.376887321472168, 'eval_rouge1': 39.0697, 'eval_rouge2': 26.6126, 'eval_rougeL': 35.7564, 'eval_rougeLsum': 35.9192, 'eval_gen_len': 18.6932, 'eval_runtime': 25.3091, 'eval_samples_per_second': 3.477, 'eval_steps_per_second': 3.477, 'epoch': 1.0}
{'eval_loss': 1.808953881263733, 'eval_rouge1': 36.1595, 'eval_rouge2': 24.2773, 'eval_rougeL': 33.7156, 'eval_rougeLsum': 34.1126, 'eval_gen_len': 18.6437, 'eval_runtime': 24.7702, 'eval_samples_per_second': 3.512, 