In [None]:
!pip install --upgrade transformers

# **1. Install Necessary Libraries**

In [None]:
%%capture
!pip install --upgrade transformers accelerate
!pip install datasets -q
!pip install rouge-score -q
!pip install evaluate -q

In [None]:
import os
# Send signal 9 to this notebook's own process, which terminates the kernel.
# NOTE(review): presumably used to force a runtime restart so the packages
# upgraded above are the ones that get imported; execution cannot continue past
# this line in the same session, so the following cells must be run afterwards.
os.kill(os.getpid(), 9)

# **2. Import Libraries and Set Up Device**

In [None]:
import torch
from transformers import EncoderDecoderModel, BertTokenizer, MBart50TokenizerFast, MBartForConditionalGeneration

# from datasets import load_dataset, load_metric
import datasets

# Import necessary libraries
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import numpy as np
import nltk
import evaluate
from rouge_score import rouge_scorer
import os

# Fetch the NLTK "punkt" data package (sentence splitting)
nltk.download('punkt')

# Choose where computation runs: the CUDA GPU when one is visible, else the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Using device: cuda


In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The checkpoint, log and result paths used later in this notebook all live
# under /content/drive/MyDrive, so this cell has to run before those cells.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# **3. Load and Prepare the Dataset**


In [None]:
# NOTE(review): load_from_disk is imported here but not used by this cell.
from datasets import load_from_disk

# Load the "azzedine/Goud-sum_v2" dataset. Later cells read its 'article',
# 'headline' and 'categories' columns and its 'train'/'validation'/'test' splits.
dataset = load_dataset("azzedine/Goud-sum_v2")

# Display the loaded object as the cell output.
dataset

# **5. Preprocessing Function**

In [None]:

# Token budgets: articles (encoder side) and headlines (decoder side)
encoder_max_length = 256
decoder_max_length = 32
# Aliases used by the preprocessing and generation code
max_input_length, max_target_length = encoder_max_length, decoder_max_length
def preprocess_function(examples):
    """Tokenize a batch of articles and headlines into model inputs and labels.

    Args:
        examples: Batch mapping with an 'article' list (source texts) and a
            'headline' list (target texts).

    Returns:
        The tokenizer output for the articles (truncated/padded to
        ``max_input_length``) with an added 'labels' entry: the headline token
        ids truncated/padded to ``max_target_length``, where every padding id
        has been replaced by -100.
    """
    inputs = examples['article']
    targets = examples['headline']
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding='max_length')
    # Encode the headlines as *targets* so the tokenizer applies its target-side
    # settings (tgt_lang) instead of silently reusing the source-side ones.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True, padding='max_length')

    # Swap padding ids for -100, the value conventionally skipped by the loss,
    # so padded label positions do not contribute to training.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [(token_id if token_id != pad_id else -100) for token_id in label_seq]
        for label_seq in labels["input_ids"]
    ]
    return model_inputs

# **6. Load mBART-50 Model and Tokenizer**

In [None]:
# Pretrained checkpoint to fine-tune: multilingual mBART-50
model_name = "facebook/mbart-large-50"

# Load the tokenizer with Arabic ("ar_AR") as both source and target language,
# since articles and headlines are tokenized with the same tokenizer.
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="ar_AR", tgt_lang="ar_AR")

# Load the model and move it to the computation device (GPU or CPU) once
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)

# **7. Tokenize the Datasets**

In [None]:
# Apply the preprocessing function to all splits (train, validation, test).
# batched=True hands preprocess_function a batch of examples per call, which is
# why it indexes examples['article'] / examples['headline'] as lists.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['article', 'headline', 'categories'],  # Remove original columns after tokenization
)
# Display the tokenized splits as the cell output.
tokenized_datasets

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 233421
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9497
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9497
    })
})

# **8. Define Compute Metrics Function**

In [None]:
import evaluate
from rouge_score import rouge_scorer

# Initialize ROUGE
# NOTE(review): `rouge` is loaded here but the visible metric code only uses `r_scorer`.
rouge = evaluate.load('rouge')
# ROUGE-1/2/L scorer that tokenizes with the model tokenizer; `tokenizer` must
# already be defined (the model-loading cell) before this cell is run.
r_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'],  tokenizer=tokenizer)


def compute_metrics(pred):
    """Score generated headlines against the references during evaluation.

    Args:
        pred: Evaluation output exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids in which -100 marks
            positions that carry no real token).

    Returns:
        dict: Mean ROUGE F-measures under 'rouge1', 'rouge2' and 'rougeL', plus
        'ref_len' and 'gen_len', the mean whitespace-token count of the
        reference and of the generated summaries respectively.
    """
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    # Defensive: if the predictions come as a tuple, the token ids are its first item.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # -100 is not a vocabulary id, so map it back to the padding id before
    # decoding. np.where builds new arrays, leaving the caller's arrays untouched.
    pad_id = tokenizer.pad_token_id
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.where(labels_ids != -100, labels_ids, pad_id)

    candidate_summaries = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    reference_summaries = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    print(f"candidate_summaries = {candidate_summaries}")
    print(f"reference_summaries = {reference_summaries}")

    # Per-example F-measures, averaged afterwards
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for ref, gen in zip(reference_summaries, candidate_summaries):
        # RougeScorer.score expects (target, prediction)
        score = r_scorer.score(ref, gen)
        for metric in scores:
            scores[metric].append(score[metric].fmeasure)

    mean_scores = {metric: np.mean(values) for metric, values in scores.items()}

    # Average length (whitespace tokens) of the reference summaries
    reference_lens = [len(ref.split()) for ref in reference_summaries]
    mean_scores['ref_len'] = np.mean(reference_lens)

    # Average length (whitespace tokens) of the generated summaries
    prediction_lens = [len(gen.split()) for gen in candidate_summaries]
    mean_scores['gen_len'] = np.mean(prediction_lens)
    print(f"scores = {mean_scores}")
    return mean_scores

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

NameError: name 'tokenizer' is not defined

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Per-device batch sizes for training and evaluation
train_batch_size = 10 # 23
val_batch_size = 4 # 2

# Configure the training arguments.
# With gradient_accumulation_steps=8, gradients from 8 batches of
# train_batch_size examples are accumulated before each optimizer update.
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/MBART/checkpoints",    # Output directory
    eval_strategy="epoch",             # Evaluate every epoch
    save_strategy="epoch",             # Save checkpoint every epoch
    learning_rate=2e-5,                # Learning rate
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=val_batch_size,
    weight_decay=0.01,                 # Weight decay for regularization
    gradient_accumulation_steps=8,
    num_train_epochs=20, #20
    predict_with_generate=True,        # Evaluate on generated sequences so compute_metrics can decode them
    logging_dir='/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/MBART/MBART_logs',
    fp16=torch.cuda.is_available(),    # Mixed precision only when a CUDA GPU is present
    # Adjusting the save limit
    save_total_limit=None,              # None Save all checkpoints (no limit)
    load_best_model_at_end=True,
    # compute_metrics returns a 'rouge1' key; presumably reported by the Trainer
    # with an 'eval_' prefix, hence the name below — confirm against the logs.
    metric_for_best_model='eval_rouge1',
    greater_is_better=True,
    report_to="none"
)

# **10. Initialize the Trainer**

In [None]:
# Initialize the Seq2SeqTrainer with the model, the tokenized train/validation
# splits and the ROUGE-based compute_metrics function.
# For a quick smoke test, the splits can be shrunk with .select(range(n)).
trainer = Seq2SeqTrainer(
    model=model,                             # The instantiated 🤗 Transformers model to be trained
    args=training_args,                      # Training arguments
    train_dataset=tokenized_datasets['train'],    # Training dataset ['train'] tokenized_datasets['train'].select(range(10))
    eval_dataset=tokenized_datasets['validation'],# Evaluation dataset ['validation'] tokenized_datasets['validation']select(range(4))
    tokenizer=tokenizer,                     # Tokenizer
    compute_metrics=compute_metrics,         # Function to compute metrics
)

  trainer = Seq2SeqTrainer(


# **11. Train the Model**

In [None]:
# Run fine-tuning with the settings in training_args (evaluation and a
# checkpoint save after every epoch, 20 epochs in total).
trainer.train()

In [None]:
# Reload tokenizer and model from a saved checkpoint directory so generation
# can be run without repeating the training cell.
# NOTE(review): the checkpoint step (32089) is hard-coded; presumably the best
# checkpoint of a previous training run — confirm before reusing.
best_checkpoint = "/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/MBART/checkpoints/checkpoint-32089"
tokenizer = AutoTokenizer.from_pretrained(best_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(best_checkpoint)
# Re-derive the device and move the reloaded model onto it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [None]:
# Generate Summaries Function
def generate_summary(example):
    """Generate a headline for one example and store it on that example.

    Args:
        example: Mapping with an 'article' entry holding the source text.

    Returns:
        The same ``example`` object, with the decoded output stored under
        'generated_summary_mbart50'.
    """
    # Encode the article, truncated/padded to the encoder length
    encoded = tokenizer(
        example['article'],
        max_length=max_input_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )

    # Put every encoded tensor on the computation device
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    # Beam search (5 beams), stopping early, capped at the decoder length
    generated = model.generate(
        input_ids=on_device['input_ids'],
        attention_mask=on_device['attention_mask'],
        max_length=max_target_length,
        num_beams=5,
        early_stopping=True,
    )

    # Decode the first returned sequence and attach it under a model-specific key
    example["generated_summary_mbart50"] = tokenizer.decode(generated[0], skip_special_tokens=True)
    return example

In [None]:
# Apply the generate_summary function to each example in the test set.
# map is called without batched=True, so generate_summary receives one example
# at a time and adds the 'generated_summary_mbart50' column.
generate_dataset = dataset['test'].map(generate_summary)

Map:   0%|          | 0/9497 [00:00<?, ? examples/s]

In [None]:
# Persist the test set together with its generated summaries to Google Drive
generate_dataset.save_to_disk('/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/MBART/model_result')



Saving the dataset (0/1 shards):   0%|          | 0/9497 [00:00<?, ? examples/s]

In [None]:
# Convert the dataset to a pandas DataFrame
df_generated = generate_dataset.to_pandas()

# Save the DataFrame to a CSV file (relative path, without the index column).
# NOTE(review): the file name says "MT5" while the summaries in this notebook
# are stored under 'generated_summary_mbart50' — confirm the intended name.
df_generated.to_csv('MT5_goud_testset_generated_summaries.csv', index=False)

In [None]:
# Initialize ROUGE
# NOTE(review): `rouge` is loaded here but the visible metric code only uses `r_scorer`.
rouge = evaluate.load('rouge')
# Rebuild the ROUGE-1/2/L scorer with the tokenizer currently in scope
# (the one reloaded from the checkpoint when that cell has been run).
r_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'],  tokenizer=tokenizer)


def compute_metrics_testset(reference_summaries, candidate_summaries):
    """Compute mean ROUGE scores and mean lengths for paired summaries.

    Args:
        reference_summaries: Sequence of reference (gold) summary strings.
        candidate_summaries: Sequence of generated summary strings, aligned
            with ``reference_summaries``.

    Returns:
        dict: Mean ROUGE F-measures under 'rouge1', 'rouge2' and 'rougeL', plus
        'ref_len' and 'gen_len', the mean whitespace-token count of the
        reference and of the generated summaries respectively.
    """
    # Per-example F-measures, averaged afterwards
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for ref, gen in zip(reference_summaries, candidate_summaries):
        # RougeScorer.score expects (target, prediction)
        score = r_scorer.score(ref, gen)
        for metric in scores:
            scores[metric].append(score[metric].fmeasure)

    mean_scores = {metric: np.mean(values) for metric, values in scores.items()}

    # Average length (whitespace tokens) of the reference summaries
    reference_lens = [len(ref.split()) for ref in reference_summaries]
    mean_scores['ref_len'] = np.mean(reference_lens)

    # Average length (whitespace tokens) of the generated summaries
    prediction_lens = [len(gen.split()) for gen in candidate_summaries]
    mean_scores['gen_len'] = np.mean(prediction_lens)
    print(f"scores = {mean_scores}")
    return mean_scores

In [None]:

# Score the generated test-set summaries against the gold headlines
candidate_summaries = generate_dataset["generated_summary_mbart50"]
reference_summaries = generate_dataset["headline"]
final_scores_ = compute_metrics_testset(reference_summaries, candidate_summaries)
# Display the resulting metric dictionary as the cell output
final_scores_

scores = {'rouge1': 0.3675926467612289, 'rouge2': 0.2441157193233993, 'rougeL': 0.3280699392014489, 'ref_len': 10.760661261450984, 'gen_len': 13.518058334210803}


{'rouge1': 0.3675926467612289,
 'rouge2': 0.2441157193233993,
 'rougeL': 0.3280699392014489,
 'ref_len': 10.760661261450984,
 'gen_len': 13.518058334210803}