In [None]:
!pip install datasets
!pip install py7zr

In [None]:
def save_model(model, optimizer, scheduler, epoch, train_losses, eval_losses, filepath):
    """Write a training checkpoint to ``filepath`` using ``torch.save``.

    The checkpoint is a plain dict with the keys ``epoch``,
    ``model_state_dict``, ``optimizer_state_dict``, ``scheduler_state_dict``,
    ``train_losses`` and ``eval_losses``.

    Args:
        model: Object exposing ``state_dict()`` (the model being trained).
        optimizer: Object exposing ``state_dict()`` (the optimizer).
        scheduler: Object exposing ``state_dict()`` (the LR scheduler).
        epoch: Epoch value stored as-is under ``'epoch'``.
        train_losses: Training-loss history stored as-is.
        eval_losses: Validation-loss history stored as-is.
        filepath: Destination passed to ``torch.save``. When it is a path,
            missing parent directories are created first.
    """
    import os  # local import so the function does not depend on a notebook-level import

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'train_losses': train_losses,
        'eval_losses': eval_losses,
    }

    # torch.save does not create directories; make sure the target folder
    # exists so a finished epoch is never lost to a missing path.
    # File-like destinations are passed through untouched.
    if isinstance(filepath, (str, os.PathLike)):
        parent_dir = os.path.dirname(filepath)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    torch.save(checkpoint, filepath)
    print(f"Checkpoint saved at {filepath}")

In [None]:
from datasets import load_dataset

# Load the dataset registered under the name "samsum". Later cells index the
# result by split name ("train"/"validation" after tokenization, "test" and
# "validation" for generation) and read its "dialogue" and "summary" columns.
dataset = load_dataset("samsum")

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive. The tokenized-dataset directory and
# the checkpoint file used further down both live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Checkpoint identifier handed to both `from_pretrained` calls, so the
# tokenizer and the seq2seq model always come from the same FLAN-T5 base release.
model_name = "google/flan-t5-base"

model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of dialogues and their reference summaries.

    Each dialogue is prefixed with ``"summarize: "`` and tokenized to exactly
    512 tokens (truncated / padded). Each summary is tokenized to exactly 128
    tokens and stored under ``"labels"``.

    Padding positions in the labels are replaced with ``-100`` so they are
    excluded from the training loss; leaving the pad id in place would make
    the model spend most of its loss on predicting padding.

    Args:
        examples: Batch mapping with ``"dialogue"`` and ``"summary"`` lists.

    Returns:
        The tokenizer output for the dialogues with an added ``"labels"`` entry.
    """
    inputs = [f"summarize: {dialogue}" for dialogue in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize the summaries (labels)
    labels = tokenizer(examples["summary"], max_length=128, truncation=True, padding="max_length")

    # Mask label padding so it does not contribute to the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]

    return model_inputs

# Run the preprocessing over the dataset in batches; the raw text columns are
# dropped so only the tokenizer outputs and labels remain.
raw_columns = ["dialogue", "summary", "id"]
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

In [None]:
# Persist the tokenized splits to Google Drive so tokenization does not have
# to be repeated in a later session.
save_directory = "/content/drive/MyDrive/LLM Models/FLAN-T5-base x SAMsum/tokenized_samsum"
tokenized_datasets.save_to_disk(save_directory)

In [None]:
from datasets import load_from_disk

# Reload the tokenized SAMSum splits from the directory this notebook saves
# them to. The previous path pointed at a tokenized CNN/DailyMail dataset from
# a different (FLAN-T5-small) experiment, which silently replaced the SAMSum
# data that the rest of the notebook trains and evaluates on.
tokenized_datasets = load_from_disk("/content/drive/MyDrive/LLM Models/FLAN-T5-base x SAMsum/tokenized_samsum")

In [None]:
def collate_fn(batch):
    """Turn a list of tokenized examples into a dict of batched tensors.

    Every example must provide ``"input_ids"``, ``"attention_mask"`` and
    ``"labels"``; for each of those fields the per-example values are converted
    to tensors and stacked along a new leading dimension.
    """
    def _stack(field):
        # One tensor per example, then stack them into a single batch tensor.
        return torch.stack([torch.tensor(example[field]) for example in batch])

    return {field: _stack(field) for field in ("input_ids", "attention_mask", "labels")}

In [None]:
from torch.utils.data import DataLoader

# Pick the two splits used for fine-tuning and for per-epoch validation.
train_dataset, eval_dataset = (
    tokenized_datasets[split] for split in ("train", "validation")
)

# Training batches are reshuffled every epoch; validation order is left as-is.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
)
eval_dataloader = DataLoader(
    dataset=eval_dataset,
    batch_size=8,
    collate_fn=collate_fn,
)

In [None]:
import torch
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Number of passes over the training set that the LR schedule is sized for.
# Keep this equal to the `epochs` value used by the training loop, otherwise
# the linear decay reaches zero too early or never finishes.
epochs = 10

# Move model to GPU if available (done before the optimizer is built).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)  # assignment also keeps the cell from echoing the full model repr

# `transformers.AdamW` is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the defaults the transformers
# class used, so the optimizer behaves the same as before.
optimizer = AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)

# Linear warmup over the first 10% of all optimizer steps, then linear decay.
total_steps = len(train_dataloader) * epochs
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
# Fine-tune for `epochs` passes over the training set. After every epoch the
# model is scored on the validation set, and a checkpoint is written whenever
# the validation loss improves on the best value seen so far.
epochs = 10
CHECKPOINT_PATH = r'/content/drive/MyDrive/LLM Models/FLAN-T5-base x SAMsum/best_model.pth'

train_losses = []   # mean training loss per epoch
eval_losses = []    # mean validation loss per epoch
best_eval_loss = float('inf')

for epoch in range(epochs):
    # Release cached GPU blocks once per phase. Calling empty_cache() after
    # every batch (as before) slows each step down without giving PyTorch any
    # additional memory to work with.
    torch.cuda.empty_cache()

    # ---- training pass ----
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()

        # collate_fn yields exactly input_ids / attention_mask / labels.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is defined per optimizer step, not per epoch

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_train_loss}")

    # ---- validation pass ----
    torch.cuda.empty_cache()
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in eval_dataloader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            eval_loss += model(**batch).loss.item()

    avg_eval_loss = eval_loss / len(eval_dataloader)
    eval_losses.append(avg_eval_loss)

    # Save the best model based on validation loss
    if avg_eval_loss < best_eval_loss:
        best_eval_loss = avg_eval_loss
        save_model(
            epoch=epoch,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            train_losses=train_losses,
            eval_losses=eval_losses,
            filepath=CHECKPOINT_PATH,
        )
        print(f"Best model saved with eval loss {avg_eval_loss:.4f} at epoch {epoch+1}")

    print(f"Epoch {epoch+1}/{epochs}, Validation Loss: {avg_eval_loss}")

In [None]:
# Testing

# Pull one example (index 10) out of the raw test split for a manual check.
single_data = dataset["test"][10]

# Show the source dialogue followed by its reference summary.
print(
    "Dialogue:",
    single_data["dialogue"],
    "\nGround Truth Summary:",
    single_data["summary"],
    sep="\n",
)

In [None]:
# Build the prompt with the same "summarize: " prefix used during
# preprocessing and tokenize it into PyTorch tensors (truncated to 512 tokens).
input_text = f"summarize: {single_data['dialogue']}"
encoded = tokenizer(input_text, truncation=True, max_length=512, return_tensors="pt")

# Put every tensor on the same device as the model.
inputs = dict((name, tensor.to(device)) for name, tensor in encoded.items())

In [None]:
# Beam-search settings for decoding the summary.
generation_kwargs = dict(max_length=150, num_beams=4, early_stopping=True)
summary_ids = model.generate(inputs["input_ids"], **generation_kwargs)

# Turn the first (only) generated sequence back into text, dropping special tokens.
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("\nGenerated Summary:", generated_summary, sep="\n")

In [None]:
from datasets import load_metric

# ROUGE scorer, reused by the full validation run further down.
# NOTE(review): `load_metric` is deprecated in `datasets`; confirm the
# installed release still ships it before relying on this cell.
rouge = load_metric("rouge")

# Score the single generated summary against its reference.
single_prediction = [generated_summary]
single_reference = [single_data["summary"]]
scores = rouge.compute(predictions=single_prediction, references=single_reference)

print("\nROUGE Scores:", scores, sep="\n")

In [None]:
!pip install rouge-score

In [None]:
# Generated summaries and their references, collected in matching order.
predictions, references = [], []

# Summarize every dialogue in the raw validation split, one example at a time.
for example in dataset["validation"]:
    input_text = f"summarize: {example['dialogue']}"
    encoded = tokenizer(input_text, truncation=True, max_length=512, return_tensors="pt")

    # Put every tensor on the same device as the model.
    inputs = dict((name, tensor.to(device)) for name, tensor in encoded.items())

    # Beam-search decode, then convert the first sequence back to text.
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=150,
        num_beams=4,
        early_stopping=True,
    )
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    predictions += [generated_summary]
    references += [example["summary"]]

# Aggregate ROUGE over all collected prediction/reference pairs.
scores = rouge.compute(predictions=predictions, references=references)

# Report each ROUGE variant on its own line.
print("ROUGE Scores for the entire evaluation dataset:")
for rouge_key in ("rouge1", "rouge2", "rougeL", "rougeLsum"):
    print(scores[rouge_key])