# Install libraries

In [1]:
# Mount Google Drive at /content/drive so that files stored there can be
# addressed with ordinary filesystem paths (Colab-only dependency).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# !pip install textstat
# !pip install pytorch-lightning
# !pip install keybert
# !git clone https://github.com/feralvam/easse.git
# !pip install -e ./easse

In [3]:
# Project folder on the mounted Google Drive.
# NOTE(review): presumably the root under which the datasets live - confirm against the cells that use it.
datasets_base_dir = "/content/drive/MyDrive/NLP-Sem-3 Project"

# Code

In [4]:
# Fetch the NLTK 'punkt_tab' tokenizer resource (a no-op when it is already up to date).
# NOTE(review): presumably needed by nltk.tokenize.word_tokenize, which is imported further down - confirm.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
# from utils.utils
# Import relevant libraries
import time
import json
from pathlib import Path
import time
from tqdm import tqdm

def create_experiment_dir(repo_dir, model_config):
    """
    Create a uniquely numbered experiment directory named after the model configuration.

    The directory name starts with a running number (one more than the highest
    numeric prefix found among the existing sub-directories of ``repo_dir``)
    followed by selected configuration values.

    Args:
        repo_dir (str or Path): Directory that holds all experiment directories.
            It is created if it does not exist yet.
        model_config (dict): Configuration dictionary. The keys 'model_name',
            'dataset', 'num_train_epochs', 'train_batch_size', 'valid_batch_size',
            'lambda_', 'prompting_strategy', 'div_score', 'top_keywords' and
            'test_sample_size' are read.

    Returns:
        Path: The created experiment directory path.

    Raises:
        KeyError: If one of the configuration keys listed above is missing.
    """
    repo_dir = Path(repo_dir)
    # iterdir() raises FileNotFoundError on a missing directory, so make sure it exists first.
    repo_dir.mkdir(parents=True, exist_ok=True)

    # Find the next folder number based on the numeric prefixes of existing directories.
    # isdecimal() (rather than isdigit()) only accepts characters that int() can parse.
    prefixes = (d.name.split('_')[0] for d in repo_dir.iterdir() if d.is_dir())
    next_number = 1 + max((int(p) for p in prefixes if p.isdecimal()), default=0)

    # Construct a directory name with model configuration values
    dir_name = (
        f"{next_number}_"
        f"{model_config['model_name']}_"
        f"{model_config['dataset']}_"
        f"epochs-{model_config['num_train_epochs']}_"
        f"batch-{model_config['train_batch_size']}_"
        f"val_batch-{model_config['valid_batch_size']}_"
        f"lambda-{model_config['lambda_']}_"
        f"prompt-{model_config['prompting_strategy']}_"
        f"div-{model_config['div_score']}_"
        f"keywords-{model_config['top_keywords']}_"
        f"test_sample-{model_config['test_sample_size']}"
    )
    # A '/' inside a value (e.g. a Hub id such as 'org/model') would split the
    # name into nested directories and hide the numeric prefix from later calls.
    # save_log() applies the same '/' -> '-' replacement to model names.
    dir_name = dir_name.replace("/", "-")
    print(dir_name)
    path = repo_dir / dir_name
    path.mkdir(parents=True, exist_ok=True)
    return path

def log_parameters(filepath, parameters):
    """
    Log parameters to a JSON file.

    Every value is converted with ``str`` before it is written, so the file
    holds string values only. An existing file is overwritten.

    Args:
        filepath (str or Path): Path to the JSON file. Missing parent
            directories are created.
        parameters (dict): Parameters to log.
    """
    # Accept plain strings as well as Path objects, like the other file helpers in this notebook.
    filepath = Path(filepath)
    filepath.parent.mkdir(parents=True, exist_ok=True)
    with filepath.open('w') as f:
        json.dump({k: str(v) for k, v in parameters.items()}, f, indent=4)


def save_log(output_dir, model_name, epoch, loss=None, sari=None, data_type='train'):
    """
    Append one CSV row to the training or validation log of a model.

    Training rows have the form ``epoch,loss``; validation rows have the form
    ``epoch,loss,sari``. A value that is ``None`` is written as an empty cell.
    Any other ``data_type`` writes nothing.

    Args:
        output_dir: Directory that contains the log files.
        model_name (str): Model name used in the log file name ('/' is replaced by '-').
        epoch (int): Current epoch.
        loss (float): Loss value (optional).
        sari (float): SARI score (optional, validation rows only).
        data_type (str): Either 'train' or 'validation'.
    """
    if data_type == 'train':
        name_template, optional_cells = '{}/{}_training_log.csv', (loss,)
    elif data_type == 'validation':
        name_template, optional_cells = '{}/{}_validation_log.csv', (loss, sari)
    else:
        return

    log_path = name_template.format(output_dir, model_name.replace("/", "-"))
    with open(log_path, 'a') as log_file:
        cells = [f"{epoch}"]
        for value in optional_cells:
            cells.append(f"{'' if value is None else value}")
        log_file.write(",".join(cells) + "\n")


In [6]:
# from preprocessing

# Import required libraries
from pathlib import Path
import hashlib


def yield_lines(filepath):
    """
    Lazily iterate over the lines of a text file.

    Args:
        filepath (str or Path): Path to the file.

    Yields:
        str: One line at a time, with trailing whitespace (including the
        newline) removed.
    """
    with Path(filepath).open('r') as handle:
        yield from (raw_line.rstrip() for raw_line in handle)


def read_lines(filepath):
    """
    Read a whole text file into memory, one list entry per line.

    Args:
        filepath (str or Path): Path to the file.

    Returns:
        list: The lines of the file in order, each without trailing whitespace.
    """
    collected = []
    for entry in yield_lines(filepath):
        collected.append(entry.rstrip())
    return collected


def get_data_filepath(data_set_dir, dataset, phase, data_type, i=None):
    """
    Build the path ``<data_set_dir>/<dataset>/<dataset>.<phase>.<data_type>[.<i>]``.

    Args:
        data_set_dir (str or Path): Directory containing datasets.
        dataset (str): Name of the dataset (used as sub-directory and file prefix).
        phase (str): Phase of the data (e.g., 'train' or 'valid').
        data_type (str): Type of data (e.g., 'complex' or 'simple').
        i (int, optional): Index appended as an extra ``.<i>`` suffix when it is not None.

    Returns:
        Path: Constructed file path as a Path object.
    """
    filename = f'{dataset}.{phase}.{data_type}'
    if i is not None:
        filename += f'.{i}'
    return Path(data_set_dir).joinpath(dataset, filename)


def generate_hash(data):
    """
    Return the hexadecimal MD5 digest of ``str(data)``.

    Args:
        data: Any object; its ``str`` representation is encoded and hashed.

    Returns:
        str: Digest as a string of 32 hexadecimal characters.
    """
    return hashlib.md5(str(data).encode()).hexdigest()


def count_line(filepath):
    """
    Count the lines of a text file without loading it into memory at once.

    Args:
        filepath (str or Path): Path to the file.

    Returns:
        int: Number of lines in the file (0 for an empty file).
    """
    with Path(filepath).open("r") as handle:
        return sum(1 for _ in handle)


def write_lines(lines, filepath):
    """
    Write an iterable of strings to a file, one per line.

    Missing parent directories are created and an existing file is overwritten.

    Args:
        lines: Iterable of strings; a newline is appended to each of them.
        filepath (str or Path): Destination file.
    """
    target = Path(filepath)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w") as sink:
        sink.writelines(entry + '\n' for entry in lines)


In [7]:

# from util.train_valid_data_generation

# Import libraries
from torch.utils.data import Dataset
# from util.processing.preprocessor import (
#     yield_lines, read_lines, get_data_filepath
# )


class TrainDataset(Dataset):
    """
    Training dataset of aligned complex (source) and simple (target) texts.

    Line ``k`` of the 'complex' file is paired with line ``k`` of the 'simple'
    file; both files are read completely when the dataset is constructed.
    """

    def __init__(self, data_set_dir, dataset, tokenizer, max_len=256, sample_size=1):
        """
        Initializes the training dataset.

        Args:
            data_set_dir: Path to data
            dataset: Name of the dataset.
            tokenizer: Tokenizer object to tokenize the data.
            max_len (int): Maximum length of the tokenized sequences.
            sample_size (float): Fraction of the dataset to sample.
        """
        self.sample_size = sample_size
        self.max_len = max_len
        self.tokenizer = tokenizer

        print("Initializing TrainDataset...")
        self.source_filepath = get_data_filepath(data_set_dir, dataset, 'train', 'complex')
        self.target_filepath = get_data_filepath(data_set_dir, dataset, 'train', 'simple')
        print("Dataset paths initialized.")

        self._load_data()

    def _load_data(self):
        """Loads the source and target data."""
        self.inputs = read_lines(self.source_filepath)
        self.targets = read_lines(self.target_filepath)

    def __len__(self):
        """Returns the number of usable items: the first ``sample_size`` fraction of the source lines."""
        return int(len(self.inputs) * self.sample_size)

    def _encode(self, text):
        """Tokenizes one text, truncated/padded to ``max_len``, as a batch of one."""
        return self.tokenizer(
            [text],
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors="pt"
        )

    def __getitem__(self, index):
        """
        Fetches a single item from the dataset.

        Args:
            index (int): Position of the source/target pair.

        Returns:
            dict: Token ids and attention masks for source and target
            ('source_ids', 'source_mask', 'target_ids', 'target_mask') together
            with the raw texts ('source'/'sources' and 'target'/'targets';
            'targets' is a one-element list).
        """
        source = self.inputs[index]
        target = self.targets[index]

        tokenized_inputs = self._encode(source)
        tokenized_targets = self._encode(target)

        # Drop only the leading batch dimension. A bare squeeze() would also
        # remove the sequence dimension whenever max_len == 1.
        # assumes the tokenizer returns (1, max_len) tensors - TODO confirm
        source_ids = tokenized_inputs["input_ids"].squeeze(0)
        target_ids = tokenized_targets["input_ids"].squeeze(0)
        src_mask = tokenized_inputs["attention_mask"].squeeze(0)
        target_mask = tokenized_targets["attention_mask"].squeeze(0)

        return {
            "source_ids": source_ids,
            "source_mask": src_mask,
            "target_ids": target_ids,
            "target_mask": target_mask,
            "sources": source,
            "targets": [target],
            "source": source,
            "target": target
        }


class ValDataset(Dataset):
    """Validation dataset that serves raw (untokenized) source/reference text pairs."""

    def __init__(self, data_set_dir, dataset, tokenizer, max_len=256, sample_size=1):
        """
        Set up the validation dataset and read both text files.

        Args:
            data_set_dir: Path to data
            dataset: Name of the dataset.
            tokenizer: Tokenizer object; stored on the instance.
            max_len (int): Maximum length of tokenized sequences; stored on the instance.
            sample_size (float): Fraction of the dataset to expose through ``len()``.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.sample_size = sample_size

        print("Initializing ValDataset...")
        phase = 'valid'
        self.source_filepath = get_data_filepath(data_set_dir, dataset, phase, 'complex')
        self.target_filepaths = get_data_filepath(data_set_dir, dataset, phase, 'simple')
        print("Dataset paths initialized.")

        self._load_data()

    def _load_data(self):
        """Read every source and target line into memory."""
        self.inputs = list(yield_lines(self.source_filepath))
        self.targets = list(yield_lines(self.target_filepaths))

    def __len__(self):
        """Number of source lines scaled by ``sample_size`` and truncated to an int."""
        total_lines = len(self.inputs)
        return int(total_lines * self.sample_size)

    def __getitem__(self, index):
        """Return the source text and its reference text at ``index`` as a dict."""
        source_text = self.inputs[index]
        reference_text = self.targets[index]
        return dict(source=source_text, targets=reference_text)


In [8]:
# from evaluate_model.evaluation metrics

# Import necessary libraries
from nltk.tokenize import word_tokenize
from pathlib import Path
import textstat
# from util.processing.preprocessor import get_data_filepath
from easse.sari import corpus_sari as easse_corpus_sari
from easse.fkgl import corpus_fkgl as easse_corpus_fkgl
import random
import pandas as pd


def load_dataset_validation(dataset_dir, dataset_name, phase='test', percentage=1.0, seed=None):
    """
    Load a random subset of an aligned complex/simple dataset for evaluation.

    The same randomly chosen line indices are taken from both files, so the
    returned lists stay aligned pair by pair.

    Args:
        dataset_dir (str or Path): Path to the dataset directory.
        dataset_name (str): Name of the dataset (e.g., 'dwiki' or 'wiki_doc').
        phase (str): Dataset phase to load ('train', 'valid', 'test').
        percentage (float): Fraction of the data to use. Values outside
            [0.0, 1.0] are clamped to that range.
        seed (int, optional): Seed for the subset selection. With a seed the
            same subset (in the same order) is returned on every call. With
            None the module-level ``random`` state is used, as before.

    Returns:
        tuple: (list of complex sentences, list of simple sentences)

    Raises:
        ValueError: If the 'simple' file has fewer lines than the 'complex' file.
    """
    complex_filepath = get_data_filepath(dataset_dir, dataset_name, phase, 'complex')
    simple_filepath = get_data_filepath(dataset_dir, dataset_name, phase, 'simple')

    # Read lines from files
    complex_sents = Path(complex_filepath).read_text().splitlines()
    simple_sents = Path(simple_filepath).read_text().splitlines()

    # Fail early and clearly instead of with an IndexError that only shows up
    # when the shuffle happens to pick one of the missing target lines.
    if len(simple_sents) < len(complex_sents):
        raise ValueError(
            f"{simple_filepath} has {len(simple_sents)} lines but "
            f"{complex_filepath} has {len(complex_sents)}; the files are not aligned."
        )

    # Use the specified fraction of the data. A negative value would otherwise
    # turn into a negative slice bound and select an arbitrary subset.
    data_size = len(complex_sents)
    fraction = min(1.0, max(0.0, percentage))
    selected_size = int(data_size * fraction)

    # Randomly select the data subset; a dedicated generator keeps seeded
    # calls reproducible without touching the global random state.
    rng = random if seed is None else random.Random(seed)
    indices = list(range(data_size))
    rng.shuffle(indices)
    selected_indices = indices[:selected_size]

    complex_sents = [complex_sents[i] for i in selected_indices]
    simple_sents = [simple_sents[i] for i in selected_indices]

    return complex_sents, simple_sents



class BartModelEvaluator:
    """
    Evaluate a sequence-to-sequence simplification/summarization model with
    SARI, D-SARI and FKGL style metrics.

    Args:
        model_config (dict): Configuration dictionary. The keys 'device'
            (e.g. "cuda" or "cpu"), 'max_seq_length' and 'output_dir' are read.
        model: Model to evaluate (e.g. BartForConditionalGeneration). It is
            moved to ``model_config['device']`` and must provide ``generate``.
        tokenizer: Tokenizer for the model.
    """
    def __init__(self, model_config, model, tokenizer):
        self.model = model.to(model_config['device'])
        self.tokenizer = tokenizer
        self.device = model_config['device']
        self.max_seq_length = model_config['max_seq_length']
        self.output_location = model_config['output_dir']

    def generate_summary(self, sentence, max_length=256):
        """
        Generate a summary for a given input sentence using beam search (5 beams).

        Args:
            sentence (str): Input sentence to be summarized. It is truncated
                and padded to ``max_seq_length`` tokens.
            max_length (int): Maximum length of the generated summary.

        Returns:
            str: Generated summary with special tokens removed.
        """
        inputs = self.tokenizer(
            sentence,
            return_tensors="pt",
            max_length=self.max_seq_length,
            truncation=True,
            padding="max_length"
        ).to(self.device)

        summary_ids = self.model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=5,
            early_stopping=True
        )

        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    @staticmethod
    def calculate_sari_and_d_sari(source_sent, predicted_sent, references):
        """
        Calculate simplified, token-set based SARI and D-SARI style scores.

        NOTE: this is a set-overlap heuristic on the unique tokens of each
        text. It is not the n-gram based SARI computed by EASSE, which
        ``evaluate`` reports separately under the 'EASSE SARI' key.

        Args:
            source_sent (str): Source sentence.
            predicted_sent (str): Predicted simplified sentence.
            references (list): List of reference simplified sentences.

        Returns:
            tuple: (SARI score, D-SARI score), both in [0, 1]. The D-SARI value
            is the deletion component only.
        """
        source_tokens = set(word_tokenize(source_sent))
        predicted_tokens = set(word_tokenize(predicted_sent))
        reference_tokens = [set(word_tokenize(ref)) for ref in references]

        # Per reference: share of predicted tokens absent from the reference
        # ("add") and share of reference tokens found in the prediction
        # ("keep"). max(1, ...) guards against empty token sets.
        add_scores = [
            len(predicted_tokens - ref) / max(1, len(predicted_tokens))
            for ref in reference_tokens
        ]
        keep_scores = [
            len(predicted_tokens & ref) / max(1, len(ref))
            for ref in reference_tokens
        ]
        # Share of source tokens that no longer appear in the prediction.
        delete_score = len(source_tokens - predicted_tokens) / max(1, len(source_tokens))

        # Plain mean over all add scores, all keep scores and the single delete score.
        sari = (sum(add_scores) + sum(keep_scores) + delete_score) / (len(add_scores) + len(keep_scores) + 1)
        d_sari = delete_score  # D-SARI focuses specifically on the deletion component

        return sari, d_sari

    def calculate_fkgl(self, text):
        """
        Calculate the Flesch-Kincaid Grade Level (FKGL) score with textstat.

        Args:
            text (str): Input text.

        Returns:
            float: FKGL score.
        """
        return textstat.flesch_kincaid_grade(text)

    def evaluate(self, source_sentences, reference_sentences):
        """
        Evaluate a set of source and reference sentences using SARI, D-SARI, and FKGL metrics.

        For every source a prediction is generated and scored; the per-sample
        metrics are written to ``<output_dir>/evaluation_metrics_baseline.csv``.
        A failing generation falls back to an empty prediction, and a failing
        EASSE computation is reported on stdout and scored as 0.

        Args:
            source_sentences (list): List of source sentences to be simplified.
            reference_sentences (list): List of corresponding reference
                sentences (one reference per source, aligned by position).

        Returns:
            tuple: ``(scores, df)`` where ``scores`` is a dict with the keys
            'SARI', 'D-SARI', 'FKGL' (averages over all samples), 'EASSE SARI'
            and 'EASSE FKGL' (corpus level), and ``df`` is a pandas DataFrame
            with one row of metrics per sample.

        Raises:
            ValueError: If ``source_sentences`` is empty or there are fewer
                references than sources.
        """
        n_samples = len(source_sentences)
        # Validate up front: an empty input used to end in ZeroDivisionError and
        # missing references in an IndexError after generation had already run.
        if n_samples == 0:
            raise ValueError("source_sentences is empty; there is nothing to evaluate.")
        if len(reference_sentences) < n_samples:
            raise ValueError(
                f"Got {n_samples} source sentences but only "
                f"{len(reference_sentences)} reference sentences."
            )

        total_sari, total_d_sari, total_fkgl = 0, 0, 0
        predictions = []
        metrics = []

        for i, source_sent in enumerate(tqdm(source_sentences, desc="Evaluating sentences")):
            try:
                predicted_sent = self.generate_summary(source_sent)
            except Exception as e:
                print(f"Error generating summary for sample {i}: {e}")
                predicted_sent = ""  # Fallback to an empty prediction

            predictions.append(predicted_sent)
            references = [reference_sentences[i]]  # Assuming one reference per source

            # Calculate SARI and D-SARI scores
            sari, d_sari = self.calculate_sari_and_d_sari(source_sent, predicted_sent, references)
            total_sari += sari
            total_d_sari += d_sari

            # Calculate FKGL score
            fkgl = self.calculate_fkgl(predicted_sent)
            total_fkgl += fkgl

            # Calculate EASSE SARI and FKGL for this sample
            try:
                easse_sari = easse_corpus_sari(orig_sents=[source_sent], sys_sents=[predicted_sent],
                                               refs_sents=[references])
                easse_fkgl = easse_corpus_fkgl([predicted_sent])
            except Exception as e:
                print(f"Error calculating EASSE metrics for sample {i}: {e}")
                easse_sari = 0
                easse_fkgl = 0

            # Store metrics in a dictionary for each sample
            metrics.append({
                'Sample': i + 1,
                'Source': source_sent,
                'Predicted': predicted_sent,
                'Reference': references[0],
                'SARI': sari,
                'D-SARI': d_sari,
                'FKGL': fkgl,
                'EASSE SARI': easse_sari,
                'EASSE FKGL': easse_fkgl
            })

        # Calculate average scores
        avg_sari = total_sari / n_samples
        avg_d_sari = total_d_sari / n_samples
        avg_fkgl = total_fkgl / n_samples

        # Calculate EASSE SARI and FKGL scores for all predictions
        try:
            easse_sari = easse_corpus_sari(orig_sents=source_sentences, sys_sents=predictions,
                                           refs_sents=[reference_sentences])
            easse_fkgl = easse_corpus_fkgl(predictions)
        except Exception as e:
            print(f"Error calculating EASSE metrics for all predictions: {e}")
            easse_sari = 0
            easse_fkgl = 0

        print(f"Average SARI: {avg_sari:.2f}")
        print(f"Average D-SARI: {avg_d_sari:.2f}")
        print(f"Average FKGL: {avg_fkgl:.2f}")
        print(f"EASSE SARI: {easse_sari:.2f}")
        print(f"EASSE FKGL: {easse_fkgl:.2f}")

        # Save metrics to a CSV file using pandas
        df = pd.DataFrame(metrics)
        df.to_csv('{}/evaluation_metrics_baseline.csv'.format(self.output_location), index=False)

        return {
            "SARI": avg_sari,
            "D-SARI": avg_d_sari,
            "FKGL": avg_fkgl,
            "EASSE SARI": easse_sari,
            "EASSE FKGL": easse_fkgl
        }, df



In [9]:
# baseline_models.baseline_model

# Import necessary libraries
from torch.utils.data import DataLoader
# from util.train_valid_data_generation import TrainDataset, ValDataset
import pytorch_lightning as pl
from transformers import (
    AdamW,
    AutoModelForSeq2SeqLM, AutoTokenizer,
    get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
)
from easse.sari import corpus_sari
# from util.utils import save_log


class Seq2SeqFineTunedModel(pl.LightningModule):
    """
    A generic PyTorch Lightning module for fine-tuning a sequence-to-sequence model for summarization or text simplification.

    Args:
        training_parameters (dict): Dictionary of training parameters.
            Required keys: 'device', 'model_name', 'train_batch_size',
            'valid_batch_size', 'learning_rate', 'max_seq_length',
            'adam_epsilon', 'weight_decay', 'warmup_steps',
            'train_sample_size', 'valid_sample_size', 'num_train_epochs',
            'gradient_accumulation_steps', 'output_dir', 'dataset' and
            'data_location'. Optional keys: 'custom_loss' (default False) and
            'scheduler_type' (default 'linear'; 'cosine' selects the cosine
            schedule). 'output_dir' must already exist and must support the
            ``/`` operator (e.g. a ``pathlib.Path``).
        model_name (str): Only used to name the directory written by
            ``save_core_model``. The checkpoint that is actually loaded is
            ``training_parameters['model_name']``.
    """
    def __init__(self, training_parameters, model_name='t5-base'):
        super().__init__()

        # Store hyperparameters and initialize model and tokenizer
        self.save_hyperparameters()
        self.training_parameters = training_parameters
        self.device_name = training_parameters['device']

        # Initialize parameters from the training dictionary
        self.model_name = training_parameters['model_name']
        self.train_batch_size = training_parameters['train_batch_size']
        self.valid_batch_size = training_parameters['valid_batch_size']
        self.learning_rate = training_parameters['learning_rate']
        self.max_seq_length = training_parameters['max_seq_length']
        self.adam_epsilon = training_parameters['adam_epsilon']
        self.weight_decay = training_parameters['weight_decay']
        self.warmup_steps = training_parameters['warmup_steps']
        self.train_sample_size = training_parameters['train_sample_size']
        self.valid_sample_size = training_parameters['valid_sample_size']
        self.num_train_epochs = training_parameters['num_train_epochs']
        self.gradient_accumulation_steps = training_parameters['gradient_accumulation_steps']
        self.custom_loss = training_parameters.get('custom_loss', False)
        self.scheduler_type = training_parameters.get('scheduler_type', 'linear')

        # (Re)create both CSV logs with their header row; save_log() appends to them later.
        with open('{}/{}_training_log.csv'.format(
                self.training_parameters['output_dir'],
                self.training_parameters['model_name'].replace("/", "-")
        ), 'w') as f:
            f.write('epoch,loss\n')
        with open('{}/{}_validation_log.csv'.format(
                training_parameters['output_dir'],
                self.training_parameters['model_name'].replace("/", "-")
        ), 'w') as f:
            f.write('epoch,loss,sari\n')

        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name).to(self.device_name)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Data and output paths
        self.dataset = self.training_parameters['dataset']
        self.data_location = self.training_parameters['data_location']
        # NOTE(review): the directory name comes from the `model_name` argument
        # (default 't5-base'), not from training_parameters['model_name'], and a
        # '/' in it creates a nested directory - confirm this is intended.
        self.model_store_path = training_parameters['output_dir'] / (model_name + '_fine_tuned')

    def is_logger(self):
        """
        Returns True if this is the first rank (for distributed training), False otherwise.
        """
        return self.trainer.global_rank <= 0

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None,
                decoder_attention_mask=None, labels=None):
        """
        Defines the forward pass of the model.

        Args:
            input_ids (tensor): Input tensor containing tokenized input IDs.
            attention_mask (tensor): Attention mask for the input.
            decoder_input_ids (tensor): Decoder input IDs for sequence generation.
            decoder_attention_mask (tensor): Attention mask for the decoder.
            labels (tensor): Target labels for training.

        Returns:
            ModelOutput: Model's output, including loss if labels are provided.
        """
        return self.model(input_ids=input_ids,
                          attention_mask=attention_mask,
                          decoder_input_ids=decoder_input_ids,
                          decoder_attention_mask=decoder_attention_mask,
                          labels=labels)

    def training_step(self, batch, batch_idx):
        """
        Performs a training step, computes loss, and logs the results.

        The loss is logged through Lightning ('train_loss') and one row is
        appended to the training CSV log per step.

        Args:
            batch (dict): Batch of training data with the keys 'source_ids',
                'source_mask', 'target_ids' and 'target_mask'.
            batch_idx (int): Index of the current batch.

        Returns:
            Tensor: Loss value for the current batch.
        """
        target_ids = batch['target_ids']

        # Ignore padding tokens in loss calculation. masked_fill returns a new
        # tensor, so the batch's own 'target_ids' is left unmodified.
        labels = target_ids.masked_fill(target_ids == self.tokenizer.pad_token_id, -100)

        outputs = self(input_ids=batch["source_ids"],
                       attention_mask=batch["source_mask"],
                       labels=labels,
                       decoder_attention_mask=batch["target_mask"])

        loss = outputs.loss
        self.log('train_loss', loss, on_step=True, prog_bar=True, logger=True)
        save_log(
            self.training_parameters['output_dir'],
            self.training_parameters['model_name'],
            self.current_epoch,
            loss=loss.item(),
            data_type='train'
        )
        return loss

    def validation_step(self, batch, batch_idx):
        """
        Performs a validation step and logs the SARI-based loss as 'val_loss'.

        Args:
            batch (dict): Batch of validation data.
            batch_idx (int): Index of the current batch.

        Returns:
            float: ``1 - SARI / 100`` for the current batch.
        """
        loss = self.sari_validation_step(batch)
        self.log('val_loss', loss, batch_size=self.valid_batch_size)
        return loss

    def sari_validation_step(self, batch):
        """
        Generates a prediction for every source text of the batch, scores the
        predictions with EASSE ``corpus_sari`` and turns the score into a loss.

        One row (epoch, loss, SARI) is appended to the validation CSV log.

        Args:
            batch (dict): Batch of validation data with the keys 'source'
                (source texts) and 'targets' (one reference text per source).

        Returns:
            float: ``1 - SARI / 100`` for the batch, so a lower value means a higher SARI.
        """

        def generate(sentence):
            encoding = self.tokenizer(
                [sentence],
                max_length=self.max_seq_length,
                truncation=True,
                padding='max_length',
                return_tensors='pt'
            ).to(self.device)

            input_ids = encoding['input_ids']
            attention_mask = encoding['attention_mask']

            # NOTE(review): do_sample=True makes generation stochastic, so the
            # validation score is not reproducible unless the RNG is seeded.
            beam_outputs = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                do_sample=True,
                max_length=256,
                num_beams=5,
                top_k=120,
                top_p=0.95,
                early_stopping=True,
                num_return_sequences=1
            ).to(self.device)

            return self.tokenizer.decode(beam_outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

        pred_sents = [generate(source) for source in batch["source"]]
        score = corpus_sari(batch["source"], pred_sents, [batch["targets"]])
        loss = 1 - score / 100
        save_log(
            self.training_parameters['output_dir'],
            self.training_parameters['model_name'],
            self.current_epoch,
            loss=loss,
            sari=score,
            data_type='validation'
        )

        return loss

    def configure_optimizers(self):
        """
        Configures the optimizer and learning rate scheduler.

        Weight decay is applied to every parameter except those whose name
        contains 'bias' or 'LayerNorm.weight'.

        Returns:
            tuple: ``([optimizer], [scheduler_config])`` where the scheduler is
            stepped after every optimizer step.
        """
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)

        # Calculate the total training steps.
        # Note: train_dataloader() builds a fresh TrainDataset (re-reading the
        # files) just to obtain the dataset length.
        t_total = (
                (
                        len(self.train_dataloader().dataset) // self.train_batch_size
                ) // self.gradient_accumulation_steps
                * float(self.num_train_epochs)
        )

        if self.scheduler_type == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=self.warmup_steps, num_training_steps=t_total
            )
        else:
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=self.warmup_steps, num_training_steps=t_total
            )

        return [optimizer], [{'scheduler': scheduler, 'interval': 'step', 'frequency': 1}]

    def save_core_model(self):
        """
        Saves the fine-tuned model and tokenizer to ``self.model_store_path``.
        """
        self.model.save_pretrained(self.model_store_path)
        self.tokenizer.save_pretrained(self.model_store_path)

    def train_dataloader(self):
        """
        Returns the training DataLoader (shuffled, incomplete last batch dropped).

        Returns:
            DataLoader: The training DataLoader.
        """
        train_dataset = TrainDataset(
            data_set_dir=self.data_location,
            dataset=self.dataset,
            tokenizer=self.tokenizer,
            max_len=self.max_seq_length,
            sample_size=self.train_sample_size,
        )
        dataloader = DataLoader(
            train_dataset,
            batch_size=self.train_batch_size,
            drop_last=True,
            shuffle=True,
            pin_memory=True,
            num_workers=0
        )
        return dataloader

    def val_dataloader(self):
        """
        Returns the validation DataLoader.

        Returns:
            DataLoader: The validation DataLoader.
        """
        val_dataset = ValDataset(
            data_set_dir=self.data_location,
            dataset=self.dataset,
            tokenizer=self.tokenizer,
            max_len=self.max_seq_length,
            sample_size=self.valid_sample_size
        )
        return DataLoader(
            val_dataset,
            batch_size=self.valid_batch_size
        )


In [10]:
# from keyword_prompting

from keybert import KeyBERT

# Initialize the KeyBERT model once at module level (constructed with its
# default arguments); the keyword helper functions in this cell reuse this instance.
keybert_model = KeyBERT()


# Function to extract keywords with KeyBERT
def extract_keywords(text, top_n=5, diversity=0.5, use_mmr=True):
    """
    Extracts keywords from a text using KeyBERT.

    Args:
        text (str): The input text.
        top_n (int): Number of top keywords to extract.
        diversity (float): Controls the diversity of keywords (0 = low diversity, 1 = high diversity).
            Only takes effect while ``use_mmr`` is True.
        use_mmr (bool): Select keywords with Maximal Marginal Relevance.
            KeyBERT ignores ``diversity`` unless this is enabled, so it
            defaults to True; pass False for plain similarity ranking.

    Returns:
        list: List of tuples containing keywords and their scores.
    """
    return keybert_model.extract_keywords(text, top_n=top_n, use_mmr=use_mmr, diversity=diversity)


# Function to create prompts using the kw_score strategy
def create_kw_score_prompt(text, top_n=5, diversity=0.5):
    """
    Build a kw_score prompt: ``keyword:score`` pairs followed by the text.

    Each extracted keyword is rendered as ``<keyword>:<score>`` with the score
    rounded to two decimals; the pairs are joined by single spaces and placed
    in front of the original text, separated from it by one space.

    Args:
        text (str): The input text.
        top_n (int): Number of keywords to extract.
        diversity (float): Keyword diversity passed on to ``extract_keywords``.

    Returns:
        str: The generated prompt.
    """
    tagged_keywords = []
    for entry in extract_keywords(text, top_n, diversity):
        tagged_keywords.append(f"{entry[0]}:{entry[1]:.2f}")
    prefix = " ".join(tagged_keywords)
    return f"{prefix} {text}"


# Function to create prompts using the kw_sep strategy
def create_kw_sep_prompt(text, top_n=5, diversity=0.5):
    """
    Build a kw_sep prompt: keywords delimited by ``</s>`` followed by the text.

    The extracted keywords (scores are dropped) are joined with `` </s> ``, a
    closing `` </s>`` is appended, and the result is placed in front of the
    original text, separated from it by one space.

    Args:
        text (str): The input text.
        top_n (int): Number of keywords to extract.
        diversity (float): Keyword diversity passed on to ``extract_keywords``.

    Returns:
        str: The generated prompt.
    """
    separator = " </s> "
    keyword_terms = []
    for entry in extract_keywords(text, top_n, diversity):
        keyword_terms.append(entry[0])
    prefix = separator.join(keyword_terms) + " </s>"
    return f"{prefix} {text}"


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
# from evaluate_model.simsum_evaluator

# Import necessary libraries
from nltk.tokenize import word_tokenize
from pathlib import Path
import textstat
# from util.processing.preprocessor import get_data_filepath
# from util.simsum_models.keyword_prompting import create_kw_sep_prompt, create_kw_score_prompt
from easse.sari import corpus_sari as easse_corpus_sari
from easse.fkgl import corpus_fkgl as easse_corpus_fkgl
import pandas as pd


def load_dataset(dataset_dir, dataset_name, phase='test'):
    """
    Load the parallel complex/simple text files of a dataset split.

    The file locations are resolved with ``get_data_filepath``; each file is
    decoded as UTF-8 and split into lines.

    Args:
        dataset_dir (str or Path): Path to the dataset directory.
        dataset_name (str): Name of the dataset (e.g., 'dwiki' or 'wiki_doc').
        phase (str): Dataset phase to load ('train', 'valid', 'test').

    Returns:
        tuple: (list of complex sentences, list of simple sentences)
    """
    complex_filepath = get_data_filepath(dataset_dir, dataset_name, phase, 'complex')
    simple_filepath = get_data_filepath(dataset_dir, dataset_name, phase, 'simple')

    # Decode explicitly as UTF-8: without an encoding argument read_text() falls
    # back to the platform locale, which can garble or reject non-ASCII text.
    complex_sents = Path(complex_filepath).read_text(encoding="utf-8").splitlines()
    simple_sents = Path(simple_filepath).read_text(encoding="utf-8").splitlines()

    return complex_sents, simple_sents


class SumSimEvaluator:
    """
    Evaluates a two-stage SumSim pipeline (summarizer followed by simplifier) using
    SARI, D-SARI, and FKGL metrics.

    Args:
        model_config (dict): Configuration dictionary. The keys 'device', 'max_seq_length'
            and 'output_dir' are required; 'prompting_strategy' is optional and defaults
            to 'kw_sep'.
        summarizer (AutoModelForSeq2SeqLM): Pre-trained summarization model.
        simplifier (AutoModelForSeq2SeqLM): Pre-trained simplification model.
        summarizer_tokenizer (AutoTokenizer): Tokenizer for the summarization model.
        simplifier_tokenizer (AutoTokenizer): Tokenizer for the simplification model.
    """

    # Column order of the per-sample metrics table produced by evaluate().
    METRIC_COLUMNS = [
        'Sample', 'Source', 'Predicted', 'Reference',
        'SARI', 'D-SARI', 'FKGL', 'EASSE SARI', 'EASSE FKGL'
    ]

    def __init__(self, model_config, summarizer, simplifier, summarizer_tokenizer, simplifier_tokenizer):
        self.summarizer = summarizer.to(model_config['device'])
        self.simplifier = simplifier.to(model_config['device'])
        self.summarizer_tokenizer = summarizer_tokenizer
        self.simplifier_tokenizer = simplifier_tokenizer
        self.device = model_config['device']
        self.max_seq_length = model_config['max_seq_length']
        self.prompting_strategy = model_config.get('prompting_strategy', 'kw_sep')
        self.output_location = model_config['output_dir']

    def _generate(self, model, tokenizer, text, max_length):
        """
        Tokenize ``text``, run 5-beam generation on ``model`` and decode the first sequence.

        Args:
            model: Model exposing ``generate``.
            tokenizer: Tokenizer paired with ``model``.
            text (str): Input text.
            max_length (int): Maximum length of the generated sequence.

        Returns:
            str: Decoded text with special tokens removed.
        """
        inputs = tokenizer(
            text,
            return_tensors="pt",
            max_length=self.max_seq_length,
            truncation=True,
            padding="max_length"
        ).to(self.device)

        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=5,
            early_stopping=True
        )

        return tokenizer.decode(output_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    def generate_summary(self, sentence, max_length=256):
        """
        Generate a summary for a given input sentence using the summarizer.

        Args:
            sentence (str): Input sentence to be summarized.
            max_length (int): Maximum length of the generated summary.

        Returns:
            str: Generated summary.
        """
        return self._generate(self.summarizer, self.summarizer_tokenizer, sentence, max_length)

    def generate_simplified_text(self, source_sent):
        """
        Generate simplified text using the SumSim model with keyword prompting.

        The source is optionally prefixed with keywords (according to
        ``self.prompting_strategy``), summarized, and the summary is then
        passed through the simplifier.

        Args:
            source_sent (str): Source sentence to be simplified.

        Returns:
            str: Simplified text.
        """
        # Apply keyword prompting based on strategy; any other value means no prompt.
        if self.prompting_strategy == 'kw_score':
            prompt_text = create_kw_score_prompt(source_sent)
        elif self.prompting_strategy == 'kw_sep':
            prompt_text = create_kw_sep_prompt(source_sent)
        else:
            prompt_text = source_sent

        # Stage 1: summarize, stage 2: simplify the summary.
        summary = self.generate_summary(prompt_text)
        return self._generate(self.simplifier, self.simplifier_tokenizer, summary, 256)

    @staticmethod
    def calculate_sari_and_d_sari(source_sent, predicted_sent, references):
        """
        Calculate set-based SARI and D-SARI style scores for text simplification.

        These are token-set overlap ratios computed locally; the EASSE values
        reported by ``evaluate`` come from the EASSE library instead.

        Args:
            source_sent (str): Source sentence.
            predicted_sent (str): Predicted simplified sentence.
            references (list): List of reference simplified sentences.

        Returns:
            tuple: (SARI score, D-SARI score)
        """
        source_tokens = set(word_tokenize(source_sent))
        predicted_tokens = set(word_tokenize(predicted_sent))
        reference_tokens = [set(word_tokenize(ref)) for ref in references]

        # Per reference: share of predicted tokens absent from the reference.
        add_scores = [
            len(predicted_tokens - ref) / max(1, len(predicted_tokens))
            for ref in reference_tokens
        ]
        # Per reference: share of reference tokens present in the prediction.
        keep_scores = [
            len(predicted_tokens & ref) / max(1, len(ref))
            for ref in reference_tokens
        ]
        # Share of source tokens that no longer appear in the prediction.
        delete_score = len(source_tokens - predicted_tokens) / max(1, len(source_tokens))

        sari = (sum(add_scores) + sum(keep_scores) + delete_score) / (len(add_scores) + len(keep_scores) + 1)
        d_sari = delete_score  # D-SARI focuses specifically on the deletion component

        return sari, d_sari

    def calculate_fkgl(self, text):
        """
        Calculate the Flesch-Kincaid Grade Level (FKGL) score.

        Args:
            text (str): Input text.

        Returns:
            float: FKGL score.
        """
        return textstat.flesch_kincaid_grade(text)

    def evaluate(self, source_sentences, reference_sentences):
        """
        Evaluate a set of source and reference sentences using SARI, D-SARI, and FKGL metrics.

        Per-sample metrics are also written to
        ``<output_dir>/evaluation_metrics_simsum.csv``.

        Args:
            source_sentences (list): List of source sentences to be simplified.
            reference_sentences (list): List of corresponding reference sentences
                (one reference per source, matched by position).

        Returns:
            tuple: (dict with the average SARI, D-SARI, FKGL and the corpus-level
            EASSE SARI / EASSE FKGL scores, DataFrame with one row per sample).
            For an empty input every score is 0 and the DataFrame has no rows.

        Raises:
            ValueError: If fewer references than source sentences are supplied.
        """
        num_samples = len(source_sentences)
        # Fail before any (expensive) generation instead of with an IndexError mid-loop.
        if len(reference_sentences) < num_samples:
            raise ValueError(
                f"Expected one reference per source sentence, got {num_samples} sources "
                f"and {len(reference_sentences)} references."
            )

        total_sari, total_d_sari, total_fkgl = 0, 0, 0
        predictions = []
        metrics = []

        for i, source_sent in enumerate(tqdm(source_sentences, desc="Evaluating sentences")):
            try:
                predicted_sent = self.generate_simplified_text(source_sent)
            except Exception as e:
                # Best effort: a failed sample is scored as an empty prediction.
                print(f"Error generating simplified text for sample {i}: {e}")
                predicted_sent = ""  # Fallback to an empty prediction

            predictions.append(predicted_sent)
            references = [reference_sentences[i]]  # Assuming one reference per source

            # Calculate SARI and D-SARI scores
            sari, d_sari = self.calculate_sari_and_d_sari(source_sent, predicted_sent, references)
            total_sari += sari
            total_d_sari += d_sari

            # Calculate FKGL score
            fkgl = self.calculate_fkgl(predicted_sent)
            total_fkgl += fkgl

            # Calculate EASSE SARI and FKGL for this sample
            try:
                easse_sari = easse_corpus_sari(orig_sents=[source_sent], sys_sents=[predicted_sent],
                                               refs_sents=[references])
                easse_fkgl = easse_corpus_fkgl([predicted_sent])
            except Exception as e:
                print(f"Error calculating EASSE metrics for sample {i}: {e}")
                easse_sari = 0
                easse_fkgl = 0

            # Store metrics in a dictionary
            metrics.append({
                'Sample': i + 1,
                'Source': source_sent,
                'Predicted': predicted_sent,
                'Reference': references[0],
                'SARI': sari,
                'D-SARI': d_sari,
                'FKGL': fkgl,
                'EASSE SARI': easse_sari,
                'EASSE FKGL': easse_fkgl
            })

        # Calculate average scores; the max() guard turns an empty input into
        # zeros instead of a ZeroDivisionError.
        denominator = max(1, num_samples)
        avg_sari = total_sari / denominator
        avg_d_sari = total_d_sari / denominator
        avg_fkgl = total_fkgl / denominator

        # Calculate EASSE SARI and FKGL scores for all predictions
        easse_sari = 0
        easse_fkgl = 0
        if num_samples > 0:
            try:
                easse_sari = easse_corpus_sari(orig_sents=source_sentences, sys_sents=predictions,
                                               refs_sents=[reference_sentences])
                easse_fkgl = easse_corpus_fkgl(predictions)
            except Exception as e:
                print(f"Error calculating EASSE metrics for all predictions: {e}")
                easse_sari = 0
                easse_fkgl = 0

        print(f"Average SARI: {avg_sari:.2f}")
        print(f"Average D-SARI: {avg_d_sari:.2f}")
        print(f"Average FKGL: {avg_fkgl:.2f}")
        print(f"EASSE SARI: {easse_sari:.2f}")
        print(f"EASSE FKGL: {easse_fkgl:.2f}")

        # Save metrics to a CSV file using pandas. Explicit columns keep the
        # header present even when there are no rows.
        df = pd.DataFrame(metrics, columns=self.METRIC_COLUMNS)
        df.to_csv('{}/evaluation_metrics_simsum.csv'.format(self.output_location), index=False)

        return {
            "SARI": avg_sari,
            "D-SARI": avg_d_sari,
            "FKGL": avg_fkgl,
            "EASSE SARI": easse_sari,
            "EASSE FKGL": easse_fkgl
        }, df



In [12]:
# functions from utils.train
# Import relevant libraries
import os
import logging
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
# from util.baseline_models.baseline_model import Seq2SeqFineTunedModel

# Module-level logger used by LoggingCallback to report validation and test metrics.
logger = logging.getLogger(__name__)


class LoggingCallback(pl.Callback):
    """
    Lightning callback that reports ``trainer.callback_metrics`` after
    validation and testing.

    Metrics are only reported when the module exposes ``is_logger()`` and it
    returns a truthy value.
    """

    # Entries of callback_metrics that are never reported.
    _SKIPPED_KEYS = ("log", "progress_bar")

    @staticmethod
    def _should_report(pl_module):
        """Tell whether ``pl_module`` has an ``is_logger`` method that returns a truthy value."""
        return hasattr(pl_module, "is_logger") and pl_module.is_logger()

    def on_validation_end(self, trainer, pl_module):
        """
        Log and print every callback metric (sorted by key) when validation ends.
        """
        logger.info("***** Validation results *****")
        if not self._should_report(pl_module):
            return

        metrics = trainer.callback_metrics
        for key in sorted(metrics):
            if key in self._SKIPPED_KEYS:
                continue
            logger.info(f"{key} = {metrics[key]}\n")
            print(f"{key}: {metrics[key]}")

    def on_test_end(self, trainer, pl_module):
        """
        Log every callback metric (sorted by key) when testing ends and write
        them to ``test_results.txt`` inside ``pl_module.args.output_dir``.
        """
        logger.info("***** Test results *****")
        if not self._should_report(pl_module):
            return

        metrics = trainer.callback_metrics
        output_file = os.path.join(pl_module.args.output_dir, "test_results.txt")
        with open(output_file, "w") as writer:
            for key in sorted(metrics):
                if key in self._SKIPPED_KEYS:
                    continue
                logger.info(f"{key} = {metrics[key]}\n")
                writer.write(f"{key} = {metrics[key]}\n")


def train(model_config, model_instance=None):
    """
    Fit a model with PyTorch Lightning and save the resulting weights and tokenizers.

    When the configured model name contains "simsum", the summarizer and the
    simplifier are saved to separate ``-summarizer-final`` / ``-simplifier-final``
    folders; otherwise ``model.model`` and ``model.tokenizer`` are saved to a
    single ``-final`` folder.

    Args:
        model_config: Dictionary containing model configurations. 'output_dir' is
            required; 'seed' (42), 'gradient_accumulation_steps' (1) and
            'num_train_epochs' (5) fall back to the defaults in parentheses.
        model_instance: Instance of the model to be trained (optional). When omitted,
            a ``Seq2SeqFineTunedModel`` is built from ``model_config``.

    Returns:
        tuple: (trained model, output directory)
    """
    # Seed everything up front for reproducibility.
    pl.seed_everything(model_config.get('seed', 42))

    model_name = model_config.get('model_name')

    # Keep only the single best checkpoint as measured by the lowest val_loss.
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        dirpath=model_config['output_dir'],
        filename=f"{model_name}-checkpoint-{{epoch}}",
        monitor="val_loss",
        verbose=True,
        mode="min",
        save_top_k=1
    )
    bar_callback = pl.callbacks.TQDMProgressBar(refresh_rate=1)

    # Trainer ingredients are assembled before the model is created.
    grad_accumulation = model_config.get('gradient_accumulation_steps', 1)
    epoch_limit = model_config.get('num_train_epochs', 5)
    callbacks = [LoggingCallback(), checkpoint_callback, bar_callback]
    tb_logger = TensorBoardLogger(f"{model_config['output_dir']}/logs")

    # Use the supplied model, or fall back to the baseline.
    if model_instance is not None:
        model = model_instance
    else:
        print("Initializing baseline model...")
        model = Seq2SeqFineTunedModel(model_config)

    trainer = pl.Trainer(
        accumulate_grad_batches=grad_accumulation,
        max_epochs=epoch_limit,
        callbacks=callbacks,
        logger=tb_logger,
        num_sanity_val_steps=0
    )
    print("Starting training...")
    trainer.fit(model)
    print("Training finished.")

    output_dir = model_config['output_dir']

    # Single-network models: one folder holding model and tokenizer.
    if "simsum" not in model_name:
        model_save_path = os.path.join(output_dir, f"{model_name}-final")
        print(f"Saving model to {model_save_path}...")
        model.model.save_pretrained(model_save_path)
        model.tokenizer.save_pretrained(model_save_path)
        print(f"Model saved at {model_save_path}.")
        return model, output_dir

    # Two-stage models: summarizer and simplifier go to separate folders.
    summarizer_save_path = os.path.join(output_dir, f"{model_name}-summarizer-final")
    simplifier_save_path = os.path.join(output_dir, f"{model_name}-simplifier-final")

    print(f"Saving summarizer to {summarizer_save_path}...")
    model.summarizer.save_pretrained(summarizer_save_path)
    model.summarizer_tokenizer.save_pretrained(summarizer_save_path)
    print(f"Summarizer saved at {summarizer_save_path}.")

    print(f"Saving simplifier to {simplifier_save_path}...")
    model.simplifier.save_pretrained(simplifier_save_path)
    model.simplifier_tokenizer.save_pretrained(simplifier_save_path)
    print(f"Simplifier saved at {simplifier_save_path}.")

    return model, output_dir


In [13]:
# from generate_plots

# Import relevant libraries
import pandas as pd
import matplotlib.pyplot as plt
import os


def identify_files(folder_path):
    """
    Identifies and categorizes files in a specified folder based on the
    substrings "_training_log", "_validation_log", and "evaluation_metrics".

    Each file name is assigned to the first substring it contains, checked in
    the order listed above. Entries are visited in sorted name order, so when
    several files match the same category the alphabetically last one is
    reported (``os.listdir`` alone returns entries in arbitrary order).

    Parameters:
    - folder_path (str): Path to the folder containing files.

    Returns:
    - dict: A dictionary with categorized files, with keys:
      'training_log', 'validation_log', and 'evaluation_metrics'. A value is
      the joined path of the matching file, or None when nothing matched.
    """
    categorized_files = {
        'training_log': None,
        'validation_log': None,
        'evaluation_metrics': None
    }

    # (substring to look for, result key), in priority order.
    markers = (
        ("_training_log", 'training_log'),
        ("_validation_log", 'validation_log'),
        ("evaluation_metrics", 'evaluation_metrics'),
    )

    # Sort for a deterministic result across platforms and file systems.
    for file_name in sorted(os.listdir(folder_path)):
        for marker, category in markers:
            if marker in file_name:
                categorized_files[category] = os.path.join(folder_path, file_name)
                break

    return categorized_files


def plot_average_loss(output_dir, training_log_path, validation_log_path, output_file='average_loss.csv'):
    """
    Average the logged loss per epoch, save the averages to a CSV file and
    plot the training curve.

    The CSV contains both the training and the validation averages (columns
    'epoch', 'average_loss', 'data_type'); the figure saved as ``loss.png``
    shows the training averages only.

    Parameters:
    - output_dir: Output directory
    - training_log_path (str): Path to the training log CSV file.
    - validation_log_path (str): Path to the validation log CSV file.
    - output_file (str): Name of the CSV file written inside ``output_dir``.
    """
    training_log = pd.read_csv(training_log_path)
    validation_log = pd.read_csv(validation_log_path)

    def _epoch_means(log_frame, label):
        # Mean of the 'loss' column per epoch, tagged with the split it came from.
        means = log_frame.groupby('epoch')['loss'].mean().reset_index()
        means['data_type'] = label
        return means

    avg_training_loss = _epoch_means(training_log, 'training')
    avg_validation_loss = _epoch_means(validation_log, 'validation')

    # Stack both splits and give the loss column its export name.
    combined_loss = pd.concat([avg_training_loss, avg_validation_loss], axis=0)
    combined_loss.columns = ['epoch', 'average_loss', 'data_type']
    combined_loss.to_csv("{}/{}".format(output_dir, output_file), index=False)

    # Draw the training curve.
    plt.figure(figsize=(10, 5))
    plt.plot(
        avg_training_loss['epoch'],
        avg_training_loss['loss'],
        marker='o',
        linestyle='-',
        color='b',
        label='Average Training Loss'
    )
    plt.xlabel('Epoch')
    plt.ylabel('Average Loss')
    plt.title('Epoch vs Average Training Loss')
    plt.legend()
    plt.grid(True)
    plt.savefig("{}/loss.png".format(output_dir))


def plot_metric_distributions(output_dir, evaluation_metrics_path, output_file='average_metrics.csv'):
    """
    Save the mean of every evaluation metric to a CSV file and write one
    histogram image per metric.

    The metrics read are 'SARI', 'D-SARI', 'FKGL', 'EASSE SARI' and
    'EASSE FKGL'. Each histogram is saved as ``metrics_<metric>.png`` inside
    ``output_dir``.

    Parameters:
    - output_dir: Output directory
    - evaluation_metrics_path (str): Path to the evaluation metrics CSV file.
    - output_file (str): Name of the CSV file written inside ``output_dir``.
    """
    evaluation_metrics = pd.read_csv(evaluation_metrics_path)
    metrics_columns = ['SARI', 'D-SARI', 'FKGL', 'EASSE SARI', 'EASSE FKGL']

    # One row per metric: its name, its mean over all samples, and a fixed tag.
    avg_metrics = (
        evaluation_metrics[metrics_columns]
        .mean()
        .rename_axis('metric')
        .reset_index(name='average_value')
        .assign(data_type='evaluation')
    )
    avg_metrics.to_csv("{}/{}".format(output_dir, output_file), index=False)

    # One histogram figure per metric.
    for metric in metrics_columns:
        metric_values = evaluation_metrics[metric]
        plt.figure(figsize=(8, 5))
        metric_values.plot(kind='hist', bins=30, alpha=0.7, color='teal', edgecolor='black')
        plt.xlabel(metric)
        plt.ylabel('Frequency')
        plt.title(f'Distribution of {metric}')
        plt.grid(True)
        plt.savefig("{}/metrics_{}.png".format(output_dir, metric))


In [14]:
# Import necessary libraries
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from transformers import (
    AdamW, AutoModelForSeq2SeqLM, AutoTokenizer,
    get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
)
from easse.sari import corpus_sari
import torch
import torch.nn as nn

class SumSimModel(pl.LightningModule):
    """
    A PyTorch Lightning model for fine-tuning summarization and simplification using a sequence-to-sequence model
    with keyword prompting and custom loss.

    The source text is (optionally) prefixed with keywords, summarized by the
    summarizer, and the generated summary is fed to the simplifier, whose loss
    against the target drives training.

    Args:
        training_parameters (dict): Dictionary of training parameters.
        summarizer_model_name (str): Pre-trained summarization model.
        simplifier_model_name (str): Pre-trained simplification model.
    """
    def __init__(self, training_parameters, summarizer_model_name='t5-base', simplifier_model_name='t5-base'):
        super(SumSimModel, self).__init__()

        # Store hyperparameters and initialize model and tokenizer
        self.save_hyperparameters()
        self.training_parameters = training_parameters
        self.device_name = training_parameters['device']

        # Initialize parameters from the training dictionary
        self.summarizer_model_name = training_parameters.get('summarizer_model_name', summarizer_model_name)
        self.simplifier_model_name = training_parameters.get('simplifier_model_name', simplifier_model_name)
        self.train_batch_size = training_parameters['train_batch_size']
        self.valid_batch_size = training_parameters['valid_batch_size']
        self.learning_rate = training_parameters['learning_rate']
        self.max_seq_length = training_parameters['max_seq_length']
        self.adam_epsilon = training_parameters['adam_epsilon']
        self.weight_decay = training_parameters['weight_decay']
        self.warmup_steps = training_parameters['warmup_steps']
        self.train_sample_size = training_parameters['train_sample_size']
        self.valid_sample_size = training_parameters['valid_sample_size']
        self.num_train_epochs = training_parameters['num_train_epochs']
        self.gradient_accumulation_steps = training_parameters['gradient_accumulation_steps']
        self.custom_loss = training_parameters.get('custom_loss', False)
        self.lambda_ = training_parameters.get('lambda_', 1)
        self.hidden_size = training_parameters.get('hidden_size', 1)
        self.w1 = training_parameters.get('w1', 1)
        self.top_keywords = training_parameters['top_keywords']
        self.div_score = training_parameters['div_score']
        self.prompting_strategy = training_parameters.get('prompting_strategy', 'no_prompting')

        # (Re)create the per-run CSV logs with their header rows.
        with open('{}/{}_training_log.csv'.format(
                self.training_parameters['output_dir'],
                self.training_parameters['model_name']
        ), 'w') as f: f.write('epoch,loss\n')
        with open('{}/{}_validation_log.csv'.format(
                self.training_parameters['output_dir'],
                self.training_parameters['model_name']
        ), 'w') as f: f.write('epoch,loss,sari\n')

        # Initialize summarizer and simplifier
        self.summarizer = AutoModelForSeq2SeqLM.from_pretrained(self.summarizer_model_name).to(self.device_name)
        self.summarizer_tokenizer = AutoTokenizer.from_pretrained(self.summarizer_model_name)

        self.simplifier = AutoModelForSeq2SeqLM.from_pretrained(self.simplifier_model_name).to(self.device_name)
        self.simplifier_tokenizer = AutoTokenizer.from_pretrained(self.simplifier_model_name)

        # Custom weight matrix for embedding similarity. It multiplies the
        # simplifier's encoder states, so its first dimension follows the
        # simplifier's hidden size (768 when the config does not expose one).
        # W is a plain tensor, not an nn.Parameter: it is handed to the
        # optimizer explicitly in configure_optimizers.
        encoder_width = getattr(self.simplifier.config, 'hidden_size', 768)
        self.W = torch.randn((encoder_width, int(self.hidden_size)), requires_grad=True, device=self.device_name)
        self.CosSim = nn.CosineSimilarity(dim=2, eps=1e-6)
        self.relu = nn.ReLU()

        # Data and output paths
        self.dataset = self.training_parameters['dataset']
        self.data_location = self.training_parameters['data_location']
        # Uses the "/" operator, so 'output_dir' must be a pathlib.Path-like object.
        self.model_store_path = training_parameters['output_dir'] / (training_parameters['model_name'] + '_fine_tuned')

    def is_logger(self):
        """
        Returns True if this is the first rank (for distributed training), False otherwise.
        """
        return self.trainer.global_rank <= 0

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None):
        """
        Forward pass of the simplifier model.
        """
        return self.simplifier(input_ids=input_ids,
                               attention_mask=attention_mask,
                               decoder_input_ids=decoder_input_ids,
                               decoder_attention_mask=decoder_attention_mask,
                               labels=labels)

    def training_step(self, batch, batch_idx):
        """
        Performs a training step: generates a summary of the (prompted) source, feeds it to the
        simplifier, computes the loss and logs the results.

        With ``custom_loss`` enabled the loss is ``w1 * simplifier_loss - lambda_ * similarity``,
        where similarity is the mean cosine similarity between projected simplifier encoder
        states of the target text and of the generated summary. Otherwise the loss is the
        simplifier loss alone.

        Args:
            batch (dict): Batch of training data. Reads 'source', 'target', 'target_ids'
                and 'target_mask'. 'target_ids' is modified in place.
            batch_idx (int): Index of the current batch.

        Returns:
            Tensor: Loss value for the current batch.
        """
        source = batch["source"]
        labels = batch['target_ids']
        targets = batch['target']
        # Replace padding ids by -100 (in place) so they are excluded from the loss.
        labels[labels[:, :] == self.simplifier_tokenizer.pad_token_id] = -100

        # Select the keyword prompting strategy based on training parameters
        strategy = self.training_parameters.get('prompting_strategy')
        if strategy == 'kw_score':
            prompt_source = [create_kw_score_prompt(text, self.top_keywords, self.div_score) for text in source]
        elif strategy == 'kw_sep':
            prompt_source = [create_kw_sep_prompt(text, self.top_keywords, self.div_score) for text in source]
        else:
            prompt_source = source

        use_custom_loss = self.training_parameters.get('custom_loss')

        # The target-side encoder states are only needed by the similarity
        # term, so this extra forward pass is skipped for the plain loss.
        H_sim = None
        if use_custom_loss:
            # Tokenize targets for the simplifier
            targets_encoding = self.simplifier_tokenizer(
                targets,
                max_length=256,
                truncation=True,
                padding='max_length',
                return_tensors='pt'
            )
            tgt_ids = targets_encoding['input_ids'].to(self.device_name)
            tgt_mask = targets_encoding['attention_mask'].to(self.device_name)

            # Forward pass through the simplifier
            tgt_output = self.simplifier(
                input_ids=tgt_ids,
                attention_mask=tgt_mask,
                labels=labels,
                decoder_attention_mask=batch['target_mask']
            )
            H_sim = tgt_output.encoder_last_hidden_state

        # Summarizer stage
        inputs = self.summarizer_tokenizer(
            prompt_source,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        src_ids = inputs['input_ids'].to(self.device_name)
        src_mask = inputs['attention_mask'].to(self.device_name)

        # Generate summary. The attention mask is passed explicitly instead of
        # relying on generate() to infer it from padding tokens. No separate
        # teacher-forced summarizer forward pass is run here: its output was
        # never part of the loss.
        summary_ids = self.summarizer.generate(
            src_ids,
            attention_mask=src_mask,
            do_sample=True,
            num_beams=5,
            min_length=10,
            max_length=256
        ).to(self.device_name)

        # Pad summaries for simplifier input
        padded_summary_ids = torch.zeros((summary_ids.shape[0], 256), dtype=torch.long).fill_(
            self.simplifier_tokenizer.pad_token_id).to(self.device_name)
        for i, summary_id in enumerate(summary_ids):
            padded_summary_ids[i, :summary_id.shape[0]] = summary_id

        summary_attention_mask = torch.ones(padded_summary_ids.shape).to(self.device_name)
        summary_attention_mask[padded_summary_ids[:, :] == self.simplifier_tokenizer.pad_token_id] = 0

        # Forward pass through the simplifier with summaries
        sim_outputs = self(
            input_ids=padded_summary_ids,
            attention_mask=summary_attention_mask,
            labels=labels,
            decoder_attention_mask=batch['target_mask']
        )

        # Custom loss logic
        if use_custom_loss:
            H2 = sim_outputs.encoder_last_hidden_state

            # Compute similarity between projected target and summary encodings
            Rep1 = torch.matmul(H_sim, self.W)
            Rep2 = torch.matmul(H2, self.W)
            Rep1 = self.relu(Rep1)
            Rep2 = self.relu(Rep2)
            sim_score = self.CosSim(Rep1, Rep2)

            loss = sim_outputs.loss * self.training_parameters['w1']
            loss += (-self.training_parameters['lambda_'] * (sim_score.mean(dim=1).mean(dim=0)))
            # The logged value is the raw simplifier loss, without the similarity term.
            self.log('train_loss', sim_outputs.loss, on_step=True, prog_bar=True, logger=True)
        else:
            loss = sim_outputs.loss
            self.log('train_loss', loss, on_step=True, prog_bar=True, logger=True)

        # Save loss for each data point
        save_log(
            self.training_parameters['output_dir'],
            self.training_parameters['model_name'],
            self.current_epoch,
            loss=loss.item(),
            data_type='train'
        )

        return loss

    def validation_step(self, batch, batch_idx):
        """
        Performs a validation step: computes the SARI-based loss for the batch and logs it
        as 'val_loss'.

        Args:
            batch (dict): Batch of validation data.
            batch_idx (int): Index of the current batch.

        Returns:
            Tensor: The batch loss (1 - SARI / 100) as a tensor.
        """
        loss = self.sari_validation_step(batch)
        self.log('val_loss', loss, batch_size=self.training_parameters['valid_batch_size'])

        return torch.tensor(loss, dtype=float)

    def sari_validation_step(self, batch):
        """
        Calculates a loss from the SARI score (System output Against References and against
        the Input sentence) of the validation batch.

        Every source is summarized and simplified, the predictions are scored with
        ``corpus_sari`` and the result is logged through ``save_log``.

        NOTE(review): sources are prefixed with "summarize: " here, whereas training_step
        feeds the keyword-prompted source to the summarizer - confirm this is intended.

        Args:
            batch (dict): Batch of validation data. Reads 'source' and 'targets'.

        Returns:
            float: 1 - SARI / 100 for the batch.
        """
        def generate(sentence):
            inputs = self.summarizer_tokenizer(
                ["summarize: " + sentence],
                max_length=512,
                truncation=True,
                padding='max_length',
                return_tensors='pt'
            )
            summary_ids = self.summarizer.generate(
                inputs['input_ids'].to(self.device_name),
                num_beams=15,
                max_length=256,
                top_k=130,
                top_p=0.95
            ).to(self.device_name)

            summary_attention_mask = torch.ones(summary_ids.shape).to(self.device_name)
            summary_attention_mask[summary_ids[:, :] == self.summarizer_tokenizer.pad_token_id] = 0

            beam_outputs = self.simplifier.generate(
                input_ids=summary_ids,
                attention_mask=summary_attention_mask,
                do_sample=True,
                max_length=256,
                num_beams=2,
                top_k=80,
                top_p=0.90,
                early_stopping=True,
                num_return_sequences=1
            ).to(self.device_name)
            return self.simplifier_tokenizer.decode(
                beam_outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True
            )

        pred_sents = [generate(source) for source in batch["source"]]
        score = corpus_sari(batch["source"], pred_sents, [batch["targets"]])
        # Turn the score into a quantity to minimise (monitored as val_loss).
        loss = 1 - score / 100
        save_log(
            self.training_parameters['output_dir'],
            self.training_parameters['model_name'],
            self.current_epoch,
            loss=loss,
            sari=score,
            data_type='validation'
        )
        return loss

    def configure_optimizers(self):
        """
        Configures the optimizer and learning rate scheduler.

        Parameters of both networks are split into weight-decayed and non-decayed
        groups (biases and LayerNorm weights are not decayed); the similarity
        matrix ``W`` forms its own group.

        Returns:
            tuple: ([optimizer], [scheduler configuration stepping every optimizer step]).
        """
        model1 = self.summarizer
        model2 = self.simplifier
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model2.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.training_parameters['weight_decay'],
            },
            {
                "params": [p for n, p in model2.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
            {
                "params": [p for n, p in model1.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.training_parameters['weight_decay'],
            },
            {
                "params": [p for n, p in model1.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
            {
                "params": self.W
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.training_parameters['learning_rate'], eps=self.training_parameters['adam_epsilon'])
        # Total number of optimizer steps over the whole run.
        t_total = (
                (
                        len(self.train_dataloader().dataset) // self.training_parameters['train_batch_size']
                ) // self.training_parameters['gradient_accumulation_steps']
                * float(self.training_parameters['num_train_epochs'])
        )

        if self.training_parameters['scheduler_type'] == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=self.training_parameters['warmup_steps'], num_training_steps=t_total
            )
        else:
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=self.training_parameters['warmup_steps'], num_training_steps=t_total
            )

        return [optimizer], [{'scheduler': scheduler, 'interval': 'step', 'frequency': 1}]

    def save_core_model(self):
        """
        Saves the fine-tuned summarizer and simplifier, each with its tokenizer, to the
        'summarizer' and 'simplifier' sub-folders of ``self.model_store_path``.
        """
        # This class has no single `model` / `tokenizer` attribute; both
        # networks are persisted separately.
        summarizer_path = self.model_store_path / 'summarizer'
        simplifier_path = self.model_store_path / 'simplifier'

        self.summarizer.save_pretrained(summarizer_path)
        self.summarizer_tokenizer.save_pretrained(summarizer_path)

        self.simplifier.save_pretrained(simplifier_path)
        self.simplifier_tokenizer.save_pretrained(simplifier_path)

    def train_dataloader(self):
        """
        Returns the training DataLoader.

        Returns:
            DataLoader: The training DataLoader (shuffled, incomplete last batch dropped).
        """
        train_dataset = TrainDataset(
            data_set_dir=self.data_location,
            dataset=self.dataset,
            tokenizer=self.simplifier_tokenizer,
            max_len=self.max_seq_length,
            sample_size=self.train_sample_size,
        )
        dataloader = DataLoader(
            train_dataset,
            batch_size=self.train_batch_size,
            drop_last=True,
            shuffle=True,
            pin_memory=True,
            num_workers=0
        )
        return dataloader

    def val_dataloader(self):
        """
        Returns the validation DataLoader.

        Returns:
            DataLoader: The validation DataLoader.
        """
        val_dataset = ValDataset(
            data_set_dir=self.data_location,
            dataset=self.dataset,
            tokenizer=self.simplifier_tokenizer,
            max_len=self.max_seq_length,
            sample_size=self.valid_sample_size
        )
        return DataLoader(
            val_dataset,
            batch_size=self.valid_batch_size
        )


In [15]:
# Import libraries
from pathlib import Path
import sys
import torch

# Import user-defined libraries
# from util.utils import create_experiment_dir, log_parameters
# from util.train import train
# from util.evaluate_model.simsum_evaluator import SumSimEvaluator
# from util.evaluate_model.evaluation_metrics import BartModelEvaluator, load_dataset
# from util.simsum_models.simsum_model import SumSimModel
# from util.baseline_models.baseline_model import Seq2SeqFineTunedModel
# from util.generate_plots import plot_average_loss, plot_metric_distributions, identify_files


class ModelRunner:
    """
    Drives a single experiment: resolves the model preset for the requested
    model name, prepares the experiment output directory, trains the model
    and evaluates it.

    Supported model names (case-insensitive): 'bart-baseline', 't5-baseline',
    'bart-simsum' and 't5-simsum'.
    """

    # Settings merged into the run configuration for each supported model name.
    # Key order inside each preset matches the order in which the settings are
    # written into the configuration.
    _MODEL_PRESETS = {
        'bart-baseline': {
            'model_name': 'facebook/bart-base',  # alternative: 'Yale-LILY/brio-cnndm-uncased'
            'scheduler_type': 'linear',
        },
        't5-baseline': {
            'model_name': 't5-base',
            'scheduler_type': 'cosine',
        },
        'bart-simsum': {
            'summarizer_model_name': 'ainize/bart-base-cnn',
            'simplifier_model_name': 'facebook/bart-base',
            'scheduler_type': 'cosine',
        },
        't5-simsum': {
            'summarizer_model_name': 't5-base',
            'simplifier_model_name': 't5-base',
            'scheduler_type': 'cosine',
        },
    }

    # Model names that use the two-stage summarizer + simplifier pipeline.
    _SIMSUM_MODELS = ('bart-simsum', 't5-simsum')

    def __init__(self, configuration):
        """
        Initialize the runner from a model configuration.

        The model name is validated before anything touches the file system,
        so a mistyped name does not leave an experiment directory behind.

        Args:
            configuration: The dictionary containing model configurations.
                It is copied; the caller's dictionary is not modified.

        Raises:
            ValueError: If configuration['model_name'] is not a supported model.
        """
        self.model_name = configuration['model_name'].lower()
        self._check_model_name()

        # Initialise the output directory
        self.repo_dir = datasets_base_dir
        self.exp_dir = os.path.join(self.repo_dir, 'outputs')
        os.makedirs(self.exp_dir, exist_ok=True)

        # Work on a copy so the caller's configuration stays untouched
        self.model_config = configuration.copy()

        # Store the model locations. The experiment directory is created while
        # 'model_name' still holds the user-facing name (e.g. 't5-baseline');
        # select_model() may replace it with the checkpoint name afterwards.
        self.model_config['output_dir'] = create_experiment_dir(self.exp_dir, self.model_config)
        self.model_config['data_location'] = os.path.join(self.repo_dir, 'datasets')
        self.model_config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model_save_path = None
        self.model = None
        self.model_details = None
        self.select_model()

    def _check_model_name(self):
        """Raise ValueError when self.model_name has no preset."""
        if self.model_name not in self._MODEL_PRESETS:
            raise ValueError("Invalid model name. Use 'bart-baseline', 't5-baseline', 'bart-simsum', or 't5-simsum'.")

    def select_model(self):
        """
        Apply the preset for self.model_name to the configuration and, for the
        SimSum variants, instantiate the model.

        Baseline models are not instantiated here; train_model() creates them.

        Raises:
            ValueError: If self.model_name is not a supported model.
        """
        self._check_model_name()
        self.model_config.update(self._MODEL_PRESETS[self.model_name])

        if self.model_name in self._SIMSUM_MODELS:
            self.model = SumSimModel(
                self.model_config,
                summarizer_model_name=self.model_config['summarizer_model_name'],
                simplifier_model_name=self.model_config['simplifier_model_name']
            )

    def train_model(self):
        """
        Run the training process.

        Writes the run parameters to params.json in the experiment directory,
        builds the baseline model if none exists yet, and stores the model and
        save path returned by train().
        """
        # Log training arguments
        log_parameters(self.model_config['output_dir'] / "params.json", self.model_config)

        # Start training
        print(
            f"Starting training with {self.model_name.upper()} model on dataset: {self.model_config['dataset']}"
        )
        if self.model is None:
            # Initialize model if not already set (for baseline models)
            self.model = Seq2SeqFineTunedModel(self.model_config)
        self.model, self.model_save_path = train(self.model_config, self.model)

    def evaluate_model(self):
        """
        Evaluate the model on the configured dataset and generate plots.

        Raises:
            RuntimeError: If no model has been created yet (baseline models
                are only instantiated by train_model()).
        """
        if self.model is None:
            # Without this guard the failure is an opaque AttributeError on None.
            raise RuntimeError(
                "No model is available for evaluation; call train_model() before evaluate_model()."
            )

        print("Starting evaluation of models")
        if self.model_name in self._SIMSUM_MODELS:
            evaluator = SumSimEvaluator(self.model_config, self.model.summarizer, self.model.simplifier,
                                        self.model.summarizer_tokenizer, self.model.simplifier_tokenizer)
        else:
            # Use standard evaluator for baseline models
            evaluator = BartModelEvaluator(self.model_config, self.model.model, self.model.tokenizer)

        # Load the evaluation sentences of the configured dataset
        dataset_dir = self.model_config['data_location']
        dataset_name = self.model_config['dataset']

        print(f"Evaluating on {dataset_name}")
        complex_sents, simple_sents = load_dataset_validation(
            dataset_dir, dataset_name, percentage=self.model_config['test_sample_size']
        )
        # The second element returned by evaluate() is not used here.
        scores, _score_table = evaluator.evaluate(complex_sents, simple_sents)
        print(f"Results for {dataset_name}: {scores}")

        # Generate plots
        files = identify_files(self.model_config['output_dir'])
        plot_average_loss(self.model_config['output_dir'], files['training_log'], files['validation_log'])
        plot_metric_distributions(self.model_config['output_dir'], files['evaluation_metrics'])



# Run code

In [None]:
# # Initialise user defined libraries
# from model_runner import ModelRunner
from datetime import datetime

if __name__ == "__main__":
    # Entry point: configure one experiment, then train and evaluate the model.

    # Report when the run starts
    current_time = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    print("Current time:", current_time)

    # Hyperparameters that normally stay the same between experiments
    fixed_hyperparameters = {
        'seed': 0,
        'gradient_accumulation_steps': 1,
        'learning_rate': 1e-5,
        'max_seq_length': 256,
        'adam_epsilon': 1e-8,
        'weight_decay': 0.0001,
        'warmup_steps': 5,
        'custom_loss': True,
        'hidden_size': 1,
        'w1': 1,
        'num_train_epochs': 10,
    }

    # Settings to edit for each experiment
    experiment_settings = {
        'model_name': 't5-baseline',
        'dataset': 'D_wiki',
        'prompting_strategy': 'no_prompting',
        'train_batch_size': 8,
        'valid_batch_size': 8,
        'train_sample_size': 0.02,
        'valid_sample_size': 0.02,
        'test_sample_size': 0.1,
        'lambda_': 0.001,
        'div_score': 0.5,
        'top_keywords': 5,
    }

    # Merge both groups; key order is fixed hyperparameters first, then settings
    configuration = {**fixed_hyperparameters, **experiment_settings}

    # Build the runner, then train and evaluate
    model = ModelRunner(configuration)
    model.train_model()
    model.evaluate_model()

    # Report when the run finishes
    current_time = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    print("Current time:", current_time)

Current time: 2024-11-20 04:48:38
6_t5-baseline_D_wiki_epochs-10_batch-8_val_batch-8_lambda-0.001_prompt-no_prompting_div-0.5_keywords-5_test_sample-0.1
Starting training with T5-BASELINE model on dataset: D_wiki


INFO:lightning_fabric.utilities.seed:Seed set to 0
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /content/drive/MyDrive/NLP-Sem-3 Project/outputs/6_t5-baseline_D_wiki_epochs-10_batch-8_val_batch-8_lambda-0.001_prompt-no_prompting_div-0.5_keywords-5_test_sample-0.1 exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Starting training...
Initializing TrainDataset...
Dataset paths initialized.


INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M  | eval
------------------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)
0         Modules in train mode
541       Modules in eval mode


Initializing TrainDataset...
Dataset paths initialized.
Initializing ValDataset...
Dataset paths initialized.


/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
