# This notebook contains the steps for model initialisation, training and testing



In [1]:
# Enable IPython's autoreload extension so that edits to imported .py modules
# are picked up without restarting the kernel. Mode 2 re-imports all modules
# before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
!pip install transformers
!pip install tokenizer
!pip install datasets
!pip install rouge_score
!pip install sentencepiece
!pip install rouge
!pip install transformers[torch]

Looking in indexes: https://pypi.org/simple, https://intel.github.io/custom-aarch64-wheels/pytorch/
Looking in indexes: https://pypi.org/simple, https://intel.github.io/custom-aarch64-wheels/pytorch/
Looking in indexes: https://pypi.org/simple, https://intel.github.io/custom-aarch64-wheels/pytorch/
Looking in indexes: https://pypi.org/simple, https://intel.github.io/custom-aarch64-wheels/pytorch/
Looking in indexes: https://pypi.org/simple, https://intel.github.io/custom-aarch64-wheels/pytorch/
Looking in indexes: https://pypi.org/simple, https://intel.github.io/custom-aarch64-wheels/pytorch/
Looking in indexes: https://pypi.org/simple, https://intel.github.io/custom-aarch64-wheels/pytorch/


# Run Model Trainer

In [6]:
# Run the packaged training routine: import train_model from the local
# ModelSummarizerFix module and call it with the checkpoint name below.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt and its
# execution count (6) is out of order with the cells that follow - confirm
# whether this cell is still meant to run as part of "Restart & Run All".
# NOTE(review): the later cells import from `ModelSummarizer`, not
# `ModelSummarizerFix` - verify which module is the intended one.
from ModelSummarizerFix import train_model

# Model checkpoint identifier handed to train_model
MODEL_NAME = 'allenai/led-large-16384-arxiv'

train_model(MODEL_NAME)

KeyboardInterrupt: 

Load the datasets

In [2]:
import os
import json
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from ModelSummarizer import SummarizationModel
from ModelSummarizer import load_data

# Locations of the ground-truth JSON files, all under one dataset directory
dataset_dir = '../dataset/'
train_file_path = os.path.join(dataset_dir, 'dataset_ground_truth.json')       # 100 pdfs
test_file_path = os.path.join(dataset_dir, 'dataset_test_ground_truth.json')   # 20 pdfs
val_file_path = os.path.join(dataset_dir, 'dataset_eval_ground_truth.json')    # 20 pdfs

# Build the summarizer wrapper and keep a direct handle on its underlying model
model_name = "allenai/led-large-16384-arxiv"
summarizer = SummarizationModel(model_name)
model = summarizer.model

# Read the three splits (training, testing, validation) from disk
train_data = load_data(train_file_path)
test_data = load_data(test_file_path)
val_data = load_data(val_file_path)

# Sequence length of the model
seq_length = 1024



  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Model Training

In [None]:

# Fine-tune the summarizer: only the lm_head and the last decoder layer are
# trainable. After every epoch the model is validated, metrics are logged, and
# the weights are checkpointed whenever the validation loss improves.

# Per-epoch metric histories (the losses are plotted at the end of this cell)
train_losses = []
val_losses = []
rouge_scores = []

# Checkpoint location. The directory is created up front so torch.save cannot
# fail on a fresh checkout where 'Checkpoints/' does not exist yet.
checkpoint_filename = "model_checkpoint.pt"
checkpoint_dir = 'Checkpoints/'
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_path = os.path.join(checkpoint_dir, checkpoint_filename)

# Training / early-stopping state
best_val_loss = float('inf')
epochs_no_improve = 0
num_epochs = 2
# NOTE(review): patience (3) is larger than num_epochs (2), so early stopping
# can never trigger with these settings - confirm the intended values.
patience = 3

train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
# Validation metrics are averaged over the whole set, so shuffling is unnecessary
val_loader = DataLoader(val_data, batch_size=1, shuffle=False)

# Fail early with a clear message instead of a ZeroDivisionError mid-training
if len(train_loader) == 0 or len(val_loader) == 0:
    raise ValueError("Training and validation datasets must both be non-empty")

# Freeze everything except the lm_head module and the last decoder layer
# (layer index 11). A single pass is enough: every parameter is assigned here.
for name, param in summarizer.model.named_parameters():
    param.requires_grad = (
        'lm_head' in name or name.startswith('led.decoder.layers.11')
    )

for epoch in range(num_epochs):
    # train_model returns the summed loss; divide by the batch count for the mean
    train_loss = summarizer.train_model(train_loader)
    avg_train_loss = train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Validate the model
    (val_loss,
     total_rouge1_f1,
     total_rouge2_f1,
     total_rougeL_f1,
     num_samples) = summarizer.validate_model(val_loader)

    # Guard against a validation pass that scored no samples
    rouge_denominator = max(num_samples, 1)
    avg_rouge1_f1 = total_rouge1_f1 / rouge_denominator
    avg_rouge2_f1 = total_rouge2_f1 / rouge_denominator
    avg_rougeL_f1 = total_rougeL_f1 / rouge_denominator
    rouge_scores.append((avg_rouge1_f1, avg_rouge2_f1, avg_rougeL_f1))

    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    # CosineAnnealing LR to adjust the learning rate
    summarizer.scheduler.step()

    # Log metrics to file
    summarizer.log_metrics(epoch, avg_train_loss, avg_val_loss, (avg_rouge1_f1, avg_rouge2_f1, avg_rougeL_f1))

    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

    # Save a checkpoint when the validation loss improves; otherwise count the
    # stagnant epoch towards early stopping.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        # Save from summarizer.model directly, not via an alias from another cell
        torch.save(summarizer.model.state_dict(), checkpoint_path)
    else:
        epochs_no_improve += 1
        # '>=' keeps the stop condition robust if the counter ever overshoots
        if epochs_no_improve >= patience:
            print("Early stopping triggered")
            break

# Plot the per-epoch training and validation losses
fig, ax = plt.subplots()
ax.plot(train_losses, label='Training Loss')
ax.plot(val_losses, label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
plt.show()


Model Testing using Test dataset

In [None]:

# Evaluate the best checkpoint on the test set: restore the saved weights, run
# summarizer.test_model on every test item, report the average ROUGE F1 scores
# and write the per-item results to a JSON file.

# Load the weights from the checkpoint. map_location='cpu' lets a checkpoint
# saved on a GPU be read on a CPU-only machine; load_state_dict then copies the
# tensors onto whatever device the model's parameters live on.
checkpoint_file_path = os.path.join('Checkpoints', 'model_checkpoint.pt')
state_dict = torch.load(checkpoint_file_path, map_location='cpu')
model.load_state_dict(state_dict)
# Switch to evaluation mode so layers such as dropout are disabled for testing
model.eval()

all_results = []
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

for data in test_loader:
    section_result = summarizer.test_model(data, model)
    all_results.append(section_result)

# Calculate average ROUGE scores across all sections
total_sections = len(all_results)
print("total sections", total_sections)
total_rouge1 = sum(result.get("ROUGE-1 F1", 0) for result in all_results)
total_rouge2 = sum(result.get("ROUGE-2 F1", 0) for result in all_results)
total_rougeL = sum(result.get("ROUGE-L F1", 0) for result in all_results)

# Avoid a ZeroDivisionError when the test set produced no results; the totals
# are all 0 in that case, so the averages come out as 0 as well.
section_divisor = max(total_sections, 1)
average_rouge1 = total_rouge1 / section_divisor
average_rouge2 = total_rouge2 / section_divisor
average_rougeL = total_rougeL / section_divisor

print("Average ROUGE-1 F1:", average_rouge1)
print("Average ROUGE-2 F1:", average_rouge2)
print("Average ROUGE-L F1:", average_rougeL)

# Save results to JSON file
with open("summary_results_with_rouge.json", "w") as json_file:
    json.dump(all_results, json_file, indent=4)
