In [1]:
import os
from pathlib import Path

import pandas as pd
import torch
from rouge import Rouge
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, AdamW, get_linear_schedule_with_warmup

In [2]:
class SummarizationDataset(Dataset):
    """Map-style dataset turning (article, highlights) rows into seq2seq features.

    Args:
        data: DataFrame-like object indexed with ``.iloc`` whose rows expose
            the ``"article"`` and ``"highlights"`` columns.
        tokenizer: object providing ``encode(text, max_length=..., truncation=...,
            padding=...)`` and, optionally, a ``pad_token_id`` attribute
            (0 is assumed when it is missing or ``None``).
        max_input_length: fixed length the article is truncated/padded to.
        max_output_length: fixed length the summary is truncated/padded to.

    Each item is a dict of plain Python lists with the keys ``input_ids``,
    ``attention_mask``, ``decoder_input_ids``, ``decoder_attention_mask`` and
    ``labels``. The three decoder-side lists all have ``max_output_length``
    entries.
    """

    def __init__(self, data, tokenizer, max_input_length=512, max_output_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize row ``index`` and build teacher-forcing inputs and labels."""
        row = self.data.iloc[index]

        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0  # previous behaviour: padding was assumed to be id 0

        input_ids = self.tokenizer.encode(
            row["article"],
            max_length=self.max_input_length,
            truncation=True,
            padding="max_length",
        )
        target_ids = self.tokenizer.encode(
            row["highlights"],
            max_length=self.max_output_length,
            truncation=True,
            padding="max_length",
        )

        # The tokenizer emits no start token, so the decoder input is the
        # target shifted right behind the pad id -- the same start token the
        # generation cell passes as decoder_start_token_id. Slicing the target
        # itself ([:-1] / [1:]) would never train the first summary token.
        shifted_targets = target_ids[:-1]
        decoder_input_ids = [pad_id] + shifted_targets
        # Position 0 holds the start token and must stay visible even though
        # it shares its id with padding.
        decoder_attention_mask = [1] + [int(token_id != pad_id) for token_id in shifted_targets]
        # -100 is the ignore index of the loss, so padding does not contribute.
        labels = [token_id if token_id != pad_id else -100 for token_id in target_ids]

        return {
            "input_ids": input_ids,
            "attention_mask": [int(token_id != pad_id) for token_id in input_ids],
            "decoder_input_ids": decoder_input_ids,
            "decoder_attention_mask": decoder_attention_mask,
            "labels": labels,
        }

In [3]:
# Location of the CNN/DailyMail CSV splits. Override it without editing the
# notebook by setting the CNN_DAILYMAIL_DIR environment variable; the fallback
# is the directory this notebook was originally written against.
DATA_DIR = Path(os.environ.get("CNN_DAILYMAIL_DIR", "/Users/HP/Documents/PYTHONCODES/DATASETS/cnn_dailymail"))
if not DATA_DIR.is_dir():
    # Fail early with an actionable message instead of a bare pandas error.
    raise FileNotFoundError(
        f"Dataset directory not found: {DATA_DIR}. "
        "Set CNN_DAILYMAIL_DIR to the folder containing train.csv, test.csv and validation.csv."
    )

train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")
val_df = pd.read_csv(DATA_DIR / "validation.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/HP/Documents/PYTHONCODES/DATASETS/cnn_dailymail/train.csv'

In [None]:
# Sanity check: (rows, columns) of the training split read from train.csv.
train_df.shape

In [None]:
# Tokenizer and weights are loaded from the same checkpoint name so that the
# vocabulary and the model always match.
PRETRAINED_CHECKPOINT = "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = PegasusForConditionalGeneration.from_pretrained(PRETRAINED_CHECKPOINT)

In [None]:
# Wrap the training and validation frames with the shared tokenizer, using the
# dataset's default input/output lengths.
train_dataset, val_dataset = (
    SummarizationDataset(split_df, tokenizer) for split_df in (train_df, val_df)
)

In [None]:
def collate_fn(batch):
    """Merge a list of feature dicts into one dict of right-padded tensors.

    Encoder-side fields are padded to the longest ``input_ids`` in the batch;
    decoder-side fields (including ``labels``) are padded to the longest
    ``decoder_input_ids``. Ids and masks are filled with 0, labels with -100.
    """

    def stack_padded(field, width, fill):
        # Right-pad every sample's list for `field` to `width`, then stack.
        rows = []
        for sample in batch:
            sequence = sample[field]
            rows.append(sequence + [fill] * (width - len(sequence)))
        return torch.tensor(rows)

    encoder_width = max(len(sample["input_ids"]) for sample in batch)
    decoder_width = max(len(sample["decoder_input_ids"]) for sample in batch)

    return {
        "input_ids": stack_padded("input_ids", encoder_width, 0),
        "attention_mask": stack_padded("attention_mask", encoder_width, 0),
        "decoder_input_ids": stack_padded("decoder_input_ids", decoder_width, 0),
        "decoder_attention_mask": stack_padded("decoder_attention_mask", decoder_width, 0),
        "labels": stack_padded("labels", decoder_width, -100),
    }

In [None]:
# Both loaders share batch size and collation; only the training loader
# reshuffles its samples.
shared_loader_args = {"batch_size": 2, "collate_fn": collate_fn}
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_args)
val_loader = DataLoader(val_dataset, **shared_loader_args)

In [None]:
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to mirror the defaults of the
# deprecated class, so the update rule stays as close as possible to before.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# NOTE(review): num_training_steps is a fixed budget, not derived from
# len(train_loader) * epochs; the linear schedule reaches a learning rate of 0
# after 10000 optimizer steps. Revisit when training for longer.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=10000)

In [None]:
# Number of batches per pass over the training and validation loaders.
len(train_loader),len(val_loader)

In [None]:
# Demo-sized budget: a full pass over the data takes very long, so every
# phase is capped. Set a cap to None to consume the whole loader, and raise
# NUM_EPOCHS for real training.
NUM_EPOCHS = 1
MAX_TRAIN_STEPS = 300
MAX_VAL_STEPS = 100

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(NUM_EPOCHS):
    # ---- training phase ----
    model.train()
    train_loss_total, train_batches = 0.0, 0
    # tqdm wraps the loader (not enumerate) so the bar knows the total length.
    for step, batch in enumerate(tqdm(train_loader, desc=f"epoch {epoch + 1} train")):
        batch = {k: v.to(device) for k, v in batch.items()}
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        # Accumulate before the cap check so the last batch is counted too.
        train_loss_total += loss.item()
        train_batches += 1
        if MAX_TRAIN_STEPS is not None and train_batches >= MAX_TRAIN_STEPS:
            print("Step-{},Train Loss-{}".format(step, loss.item()))
            break
    # Average over the batches actually processed, not len(train_loader),
    # otherwise an early stop makes the reported loss far too small.
    train_loss = train_loss_total / max(train_batches, 1)

    # ---- validation phase ----
    model.eval()
    val_loss_total, val_batches = 0.0, 0
    with torch.no_grad():
        for step, batch in enumerate(tqdm(val_loader, desc=f"epoch {epoch + 1} val")):
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            val_loss_total += loss.item()
            val_batches += 1
            if MAX_VAL_STEPS is not None and val_batches >= MAX_VAL_STEPS:
                print("Step-{},Val Loss-{}".format(step, loss.item()))
                break
    val_loss = val_loss_total / max(val_batches, 1)

    # Report after both phases; previously an unconditional break above this
    # line meant the summary was never printed.
    print(f"Epoch {epoch+1} train loss: {train_loss:.4f} val loss: {val_loss:.4f}")

In [None]:
# Weights and tokenizer files go into the same directory so the checkpoint
# can be reloaded from a single path.
FINE_TUNED_DIR = "fine_tuned_pegasus"
model.save_pretrained(FINE_TUNED_DIR)
tokenizer.save_pretrained(FINE_TUNED_DIR)

In [None]:
# Reload model and tokenizer from disk so evaluation runs on exactly what was
# saved, not on in-memory state.
FINE_TUNED_DIR = "fine_tuned_pegasus"
model = PegasusForConditionalGeneration.from_pretrained(FINE_TUNED_DIR)
tokenizer = PegasusTokenizer.from_pretrained(FINE_TUNED_DIR)

In [None]:
# Wrap the test split with whatever `tokenizer` currently refers to
# (the reloaded fine-tuned tokenizer when the cells are run in order).
test_dataset = SummarizationDataset(test_df, tokenizer)

In [None]:
# One article per batch and no shuffling, so the i-th batch corresponds to the
# i-th row of test_df; the cell that stores predictions relies on that order.
test_loader = DataLoader(test_dataset, batch_size=1,collate_fn=collate_fn)

In [None]:
# Number of test batches (equals the number of test rows, since batch_size=1).
len(test_loader)

In [None]:
# Generate summaries for the first test articles only; running the whole test
# split would take very long.
model.to(device)
model.eval()
predictions = []
last_step = 100  # loop index of the final batch: indices 0..100 give 101 summaries
with torch.no_grad():
    for step, batch in tqdm(enumerate(test_loader)):
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}
        generated = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=128,
            decoder_start_token_id=tokenizer.pad_token_id,
        )
        # One decoded string per generated sequence in the batch.
        for sequence in generated:
            predictions.append(tokenizer.decode(sequence, skip_special_tokens=True))
        if step == last_step:
            break

In [None]:
# How many summaries were generated by the loop above.
len(predictions)

In [None]:
# Save the predictions to a CSV file
test_df = test_df[:101]# for 100 predicitons only
print(len(test_df))
test_df["predictions"] = predictions
test_df.to_csv("test_predictions.csv", index=False)

In [None]:
# Display the test frame, which now carries the "predictions" column.
test_df

In [None]:
# Full text of the first test article, for reading next to its summaries.
print("Article:", test_df.iloc[0]['article'])

In [None]:
# Reference summary and generated summary for the same first test row.
first_example = test_df.iloc[0]
print("Highlights:", first_example['highlights'])
print("Predictions:", first_example['predictions'])

In [None]:
# Calculate the ROUGE scores between the predicted summaries and the actual summaries
rouge = Rouge()
# Align references with the predictions that exist: predictions were produced
# for the first rows of the (unshuffled) test split.
references = test_df["highlights"].tolist()[:len(predictions)]
# ignore_empty drops pairs with an empty hypothesis or reference; without it a
# single empty generated summary makes get_scores raise.
scores = rouge.get_scores(predictions, references, avg=True, ignore_empty=True)

# Print the ROUGE scores
print(f"ROUGE-1: {scores['rouge-1']['f']:.4f}")
print(f"ROUGE-2: {scores['rouge-2']['f']:.4f}")
print(f"ROUGE-L: {scores['rouge-l']['f']:.4f}")

# Prediction on a custom input text

In [None]:
# Load the model and tokenizer
model = PegasusForConditionalGeneration.from_pretrained("fine_tuned_pegasus")
tokenizer = PegasusTokenizer.from_pretrained("fine_tuned_pegasus")

# Set the device (CPU or GPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# Define the input text for prediction
input_text = "Do they Colorado Buffaloes play this weekend? If you're like us, that's the first thing you think about with each new week of the 2023 NCAA college football season. The 4-3 Buffaloes, led by NFL and MLB great Deion 'Coach Prime' Sanders, are the biggest thing to hit college football this year.Coach Prime's Buffaloes, led by Sander's stellar quarterback and son Shadeur Sanders, came into the 2023 college football season with a shocking upset win over TCU. Ever since then, the Buffaloes and Coach Prime have been ruffling feathers, winning games and turning one football fan after the next into Colorado Buffaloes fans.Keep reading for how and when to watch the next Colorado Buffaloes next game.Do the Colorado Buffaloes play this weekend?The Buffaloes have a bye week this week, which gives them plenty of time to recover from that ugly Week 7 loss to the Stanford Cardinals. The Buffaloes return to face off against the UCLA Bruins on Saturday, Oct. 28 at 7:30 p.m. ET (4:30 p.m. PT). The game will be broadcast on ABC.Next weekend's game will be held at the Rose Bowl in Pasadena, CA. The game is completely sold out, with as many as 67,000 people expected to attend live."

# Tokenize the input text. truncation=True clips the text to the tokenizer's
# configured maximum length, so an arbitrarily long article cannot exceed what
# the model is set up for; the attention mask is passed explicitly instead of
# being left for generate() to infer.
encoded = tokenizer(input_text, return_tensors="pt", truncation=True).to(device)
input_ids = encoded["input_ids"]

# Generate predictions for the input text
with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        attention_mask=encoded["attention_mask"],
        max_length=100,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
    )

# Decode the output tokens to get the predicted text
predicted_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the predicted text
print(predicted_text)