In [1]:
!pip install pytorch_lightning
!pip install transformers
!pip install torchmetrics



In [None]:
import torch
import pandas as pd
import pytorch_lightning as pl
import gc
import torchmetrics
from transformers import BartForConditionalGeneration, BartTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning import seed_everything

# Constants
MAX_LENGTH = 512
SUMMARY_LENGTH = 150
TRAIN_BATCH_SIZE = 8
VAL_BATCH_SIZE = 32
TEST_BATCH_SIZE = 8
NUM_EPOCHS = 1
LEARNING_RATE = 3e-5

# Initialize random seed
seed_everything(1270)

# Load data
# Change to this for Kaggle "/kaggle/input/dataset_name"
path_dataset = "/content/sample_data"
train_df = pd.read_parquet(f"{path_dataset}/train.parquet")
val_df = pd.read_parquet(f"{path_dataset}/validation.parquet")
test_df = pd.read_parquet(f"{path_dataset}/test.parquet")

# Custom Dataset
class SummaryDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.texts = df['article'].tolist()
        self.summaries = df['highlights'].tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=MAX_LENGTH, return_tensors='pt')
        encoding_summary = self.tokenizer(summary, truncation=True, padding='max_length', max_length=SUMMARY_LENGTH, return_tensors='pt')
        return {'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'labels': encoding_summary['input_ids'].squeeze(0)}

# Initialize the tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Create datasets
train_dataset = SummaryDataset(train_df, tokenizer)
val_dataset = SummaryDataset(val_df, tokenizer)
test_dataset = SummaryDataset(test_df, tokenizer)

# Define Model
class SummaryGenerator(pl.LightningModule):
    def __init__(self, lr):
        super().__init__()
        self.model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
        self.lr = lr
        self.test_outputs = []

    def forward(self, inputs):
        return self.model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'],
                                   max_length=SUMMARY_LENGTH, num_beams=4, early_stopping=True)

    def training_step(self, batch, batch_idx):
        outputs = self.model(**batch)
        loss = outputs.loss
        self.log('train_loss', loss, prog_bar=True)
        return {'loss': loss}

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.lr)
        total_steps = (len(train_dataset) // TRAIN_BATCH_SIZE) * NUM_EPOCHS
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=150, num_training_steps=total_steps)
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

    def validation_step(self, batch, batch_idx):
        outputs = self.model(**batch)
        loss = outputs.loss
        self.log('val_loss', loss, prog_bar=True)
        return {'val_loss': loss}

    def test_step(self, batch, batch_idx):
        generated_ids = self.forward(batch)
        generated_ids = generated_ids.cpu().numpy()  # Convert tensor to numpy array if it's a tensor
        generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        labels = batch['labels'].cpu().numpy().tolist()  # Convert tensor to numpy array and then to list

        self.test_outputs.append({'generated_text': generated_text, 'labels': labels})  # Store the outputs
        return {'generated_text': generated_text, 'labels': labels}

    def on_test_epoch_end(self):
        rouge_metric = torchmetrics.text.ROUGEScore(rouge_keys=("rouge1", "rouge2", "rougeL", "rougeLsum"))

        # Extract generated and reference texts from the stored test outputs
        generated_texts = [x['generated_text'][0] for x in self.test_outputs]
        reference_texts = [tokenizer.decode(x[0], skip_special_tokens=True) for x in [item['labels'] for item in self.test_outputs]]

        # Debugging: Print first few generated and reference texts
        print("Sample generated_texts:", generated_texts[:3])
        print("Sample reference_texts:", reference_texts[:3])

        # Check if the lists are empty or contain only empty strings
        if not generated_texts or not reference_texts or all(not text for text in generated_texts) or all(not text for text in reference_texts):
            print("Skipping ROUGE calculation: No texts to compare.")
            return

        # Calculate ROUGE scores
        rouge_scores = rouge_metric(
            preds=generated_texts,
            target=reference_texts
        )
        self.log_dict({
            'eval_rouge1': rouge_scores['rouge1_fmeasure'].item() * 100,
            'eval_rouge2': rouge_scores['rouge2_fmeasure'].item() * 100,
            'eval_rougeL': rouge_scores['rougeL_fmeasure'].item() * 100,
            'eval_rougeLsum': rouge_scores['rougeLsum_fmeasure'].item() * 100
        })

        # Clear some memory
        self.test_outputs = []
        gc.collect()

# Initialize the model
model = SummaryGenerator(lr=LEARNING_RATE)

# Initialize Trainer
trainer = pl.Trainer(max_epochs=NUM_EPOCHS, accelerator="auto")

INFO:lightning_fabric.utilities.seed:Global seed set to 1270
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [3]:
# Train the model
print("Training the model...")
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=VAL_BATCH_SIZE)
trainer.fit(model, train_loader, val_loader)

Training the model...


INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                         | Params
-------------------------------------------------------
0 | model | BartForConditionalGeneration | 406 M 
-------------------------------------------------------
406 M     Trainable params
0         Non-trainable params
406 M     Total params
1,625.162 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [4]:
# Evaluate the model
print("Testing the model...")
test_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE)
trainer.test(model, test_loader)

# Clear remaining data to free up memory
del test_loader, test_dataset
gc.collect()

Testing the model...


INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

Sample generated_texts: ['Palestinian Authority officially becomes 123rd member of the International Criminal Court.\nThe formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based.\nPalestinian Foreign Minister Riad al-Malki: "Today brings us closer to our shared goals of justice and peace"', "Maysak gained super typhoon status a few days ago, but has since lost steam.\nIt's now classified as a tropical storm, according to the Philippine national weather service.\nAuthorities took preemptive steps to keep people safe.\nTourists who arrive Saturday in and around the coastal town of Aurora will not be accepted.", 'Amnesty International releases its annual review of the death penalty worldwide.\nDeath sentences recorded in 2014 -- up more than 500 on the previous year -- can also be attributed to governments using death penalty as a political tool.\nIn Pakistan, the government lifted a six-year moratorium on the execution of civilians in the wa

0

In [5]:
# Save the model
print("Saving the model...")
# Change to "/kaggle/working/" for Kaggle
path = "/content/sample_data"
model.model.save_pretrained(path)

# Save the tokenizer
tokenizer_save_path = path
tokenizer.save_pretrained(tokenizer_save_path)

training_args_dict = {
    "_n_gpu": 1,
    "adafactor": False,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "adam_epsilon": 1e-08,
    "auto_find_batch_size": False,
    "bf16": False,
    "bf16_full_eval": False,
    "data_seed": None,
    "dataloader_drop_last": False,
    "dataloader_num_workers": 0,
    "dataloader_pin_memory": True,
    "ddp_backend": None,
    "ddp_broadcast_buffers": None,
    "ddp_bucket_cap_mb": None,
    "ddp_find_unused_parameters": None,
    "ddp_timeout": 1800,
    "debug": [],
    "deepspeed": None,
    "disable_tqdm": False,
    "dispatch_batches": None,
    "do_eval": True,
    "do_predict": False,
    "do_train": True,
    "eval_accumulation_steps": None,
    "eval_delay": 0,
    "eval_steps": None,
    "evaluation_strategy": "epoch",
    "fp16": False,
    "fp16_backend": "auto",
    "fp16_full_eval": False,
    "fp16_opt_level": "O1",
    "fsdp": "",
    "fsdp_config": None,
    "fsdp_min_num_params": 0,
    "fsdp_transformer_layer_cls_to_wrap": None,
    "full_determinism": False,
    "gradient_accumulation_steps": 1,
    "gradient_checkpointing": False,
    "greater_is_better": True,
    "group_by_length": False,
    "half_precision_backend": "auto",
    "hub_always_push": False,
    "hub_model_id": "bart-summarization",
    "hub_private_repo": False,
    "hub_strategy": "every_save",
    "hub_token": "<HUB_TOKEN>",
    "ignore_data_skip": False,
    "include_inputs_for_metrics": False,
    "jit_mode_eval": False,
    "label_names": None,
    "label_smoothing_factor": 0.0,
    "learning_rate": 3e-05,
    "length_column_name": "length",
    "load_best_model_at_end": True,
    "local_rank": -1,
    "log_level": -1,
    "log_level_replica": -1,
    "log_on_each_node": True,
    "logging_dir": "/opt/ml/output/data/logs",
    "logging_first_step": False,
    "logging_nan_inf_filter": True,
    "logging_steps": 500,
    "logging_strategy": "steps",
    "lr_scheduler_type": "linear",
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "metric_for_best_model": "accuracy",
    "mp_parameters": "",
    "no_cuda": False,
    "num_train_epochs": 1,
    "optim": "adamw_torch",
    "optim_args": None,
    "output_dir": "/opt/ml/model",
    "overwrite_output_dir": False,
    "past_index": -1,
    "per_device_eval_batch_size": 32,
    "per_device_train_batch_size": 8,
    "prediction_loss_only": False,
    "push_to_hub": False,
    "push_to_hub_model_id": None,
    "push_to_hub_organization": None,
    "push_to_hub_token": "<PUSH_TO_HUB_TOKEN>",
    "ray_scope": "last",
    "remove_unused_columns": True,
    "report_to": ["tensorboard"],
    "resume_from_checkpoint": None,
    "run_name": "/opt/ml/model",
    "save_on_each_node": False,
    "save_safetensors": False,
    "save_steps": 500,
    "save_strategy": "epoch",
    "save_total_limit": 2,
    "seed": 1270,
    "sharded_ddp": [],
    "skip_memory_metrics": True,
    "tf32": None,
    "torch_compile": False,
    "torch_compile_backend": None,
    "torch_compile_mode": None,
    "torchdynamo": None,
    "tpu_metrics_debug": False,
    "tpu_num_cores": None,
    "use_cpu": False,
    "use_ipex": False,
    "use_legacy_prediction_loop": False,
    "use_mps_device": False,
    "warmup_ratio": 0.0,
    "warmup_steps": 150,
    "weight_decay": 0.01,
}

torch.save(training_args_dict, f"{path}/training_args.bin")

Saving the model...


In [6]:
# Step 1: Create a reference summary
reference_article = "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct."

# Step 2: Tokenize the test article
test_encoding = tokenizer(reference_article, truncation=True, padding='max_length', max_length=MAX_LENGTH, return_tensors='pt')

# Step 3: Generate the summary
with torch.no_grad():
    test_input = {'input_ids': test_encoding['input_ids'], 'attention_mask': test_encoding['attention_mask']}
    generated_ids = model.forward(test_input)
    generated_summary = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(f"Summary: {generated_summary}\n")

# Step 4: Evaluate with ROUGE
rouge_metric = torchmetrics.text.ROUGEScore(rouge_keys=("rouge1", "rouge2", "rougeL", "rougeLsum"))
rouge_scores = rouge_metric(
    preds=generated_summary,
    target=reference_article
)
print(f"ROUGE-1: {rouge_scores['rouge1_fmeasure'].item() * 100:.2f}")
print(f"ROUGE-2: {rouge_scores['rouge2_fmeasure'].item() * 100:.2f}")
print(f"ROUGE-L: {rouge_scores['rougeL_fmeasure'].item() * 100:.2f}")
print(f"ROUGE-Lsum: {rouge_scores['rougeLsum_fmeasure'].item() * 100:.2f}")

Summary: The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world.

ROUGE-1: 59.32
ROUGE-2: 58.91
ROUGE-L: 59.23
ROUGE-Lsum: 59.01


In [None]:
# Google Colab only - move the model to Drive
from google.colab import drive
# This will prompt for authorization
drive.mount('/content/gdrive')