**Installing Libraries**

In [1]:

%%capture
!pip install --upgrade transformers accelerate
!pip install datasets -q
!pip install rouge-score -q
!pip install evaluate -q
!pip install  bert-score -q

In [2]:
%%capture
!pip install googletrans

In [3]:
# Import Files from Google Drive to Colab
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
from datasets import load_from_disk
from bert_score import BERTScorer

import torch
from transformers import EncoderDecoderModel, BertTokenizer

# from datasets import load_dataset, load_metric
import datasets

# Import necessary libraries
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import numpy as np
import nltk
import evaluate
from rouge_score import rouge_scorer
import os



# Download the NLTK Punkt tokenizer for sentence splitting
nltk.download('punkt')

# Set up the device for computation (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Using device: cpu


In [5]:



from transformers import AutoTokenizer
from rouge_score import rouge_scorer

# model_name = 'aubmindlab/bert-base-arabertv2'
model_name = 'asafaya/bert-base-arabic'
tokenizer = AutoTokenizer.from_pretrained(model_name) #huggingface model


# Initialize ROUGE
rouge = evaluate.load('rouge')
r_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'],  tokenizer=tokenizer)


def compute_metrics_testset(reference_summaries, candidate_summaries):

    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    # Calculate scores for the list of texts
    for ref, gen in zip(reference_summaries, candidate_summaries):
      score = r_scorer.score(gen, ref)
      # Append scores to respective lists
      scores['rouge1'].append(score['rouge1'].fmeasure)
      scores['rouge2'].append(score['rouge2'].fmeasure)
      scores['rougeL'].append(score['rougeL'].fmeasure)

    # Calculate mean scores
    mean_scores = {metric: np.mean(values) for metric, values in scores.items()}


    # Calculate the average length of the reference summaries
    reference_lens = [len(label.split()) for label in candidate_summaries]
    mean_scores['ref_len'] = np.mean(reference_lens)

    # Calculate the average length of the generated summaries
    prediction_lens = [len(pred.split()) for pred in reference_summaries]
    mean_scores['gen_len'] = np.mean(prediction_lens)
    print(f"scores = {mean_scores}")
    return mean_scores

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/334k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Load the Arabic BERT model
model_name = "asafaya/bert-base-arabic"
#  model_name = 'aubmindlab/bert-base-arabertv2'

def calculate_bertscore(references, candidates):
    # Ensure CUDA is available (if you're using a GPU)
    device = "cuda" if torch.cuda.is_available() else "cpu"


    scorer = BERTScorer(model_type=model_name, num_layers=12, rescale_with_baseline=False)

    # Calculate BERTScore
    P, R, F1 = scorer.score(candidates, references)

    # Print results
    # print("\nBERTScore results:")
    # for i, (p, r, f1) in enumerate(zip(P, R, F1)):
    #     print(f"Sentence pair {i+1}:")
    #     print(f"  Reference: {references[i]}")
    #     print(f"  Candidate: {candidates[i]}")
    #     print(f"  Precision: {p.item():.4f}")
    #     print(f"  Recall: {r.item():.4f}")
    #     print(f"  F1: {f1.item():.4f}")
    #     print()

    # Calculate and print average scores
    avg_P = P.mean().item()
    avg_R = R.mean().item()
    avg_F1 = F1.mean().item()

    print("Average scores:")
    print(f"  Precision: {avg_P:.4f}")
    print(f"  Recall: {avg_R:.4f}")
    print(f"  F1: {avg_F1:.4f}")

    return avg_P, avg_R, avg_F1

# **GPT4 Score Calculation**

In [6]:
from datasets import load_from_disk
gpt4_path = "/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/gpt4/model_resultgpt4V2"
gpt4_dataset = load_from_disk(gpt4_path)
gpt4_dataset

Dataset({
    features: ['article', 'headline', 'categories', 'gpt_4_summary'],
    num_rows: 9497
})

In [None]:
reference_summaries = gpt4_dataset['headline']
candidate_summaries = gpt4_dataset['gpt_4_summary']
compute_metrics_testset(reference_summaries, candidate_summaries)

scores = {'rouge1': 0.18047567175570373, 'rouge2': 0.07758235502210707, 'rougeL': 0.15951473102853175, 'ref_len': 12.14846793724334, 'gen_len': 13.518058334210803}


{'rouge1': 0.18047567175570373,
 'rouge2': 0.07758235502210707,
 'rougeL': 0.15951473102853175,
 'ref_len': 12.14846793724334,
 'gen_len': 13.518058334210803}

In [None]:
avg_P, avg_R, avg_F1 = calculate_bertscore(reference_summaries, candidate_summaries)

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Average scores:
  Precision: 0.6060
  Recall: 0.5976
  F1: 0.6007


In [None]:
avg_P, avg_R, avg_F1 = calculate_bertscore(reference_summaries, candidate_summaries)

Average scores:
  Precision: 0.6060
  Recall: 0.5976
  F1: 0.6007


In [None]:
avg_F1

0.6007148623466492

# **Mistral, LLAMA3, GEMMA Score Calculation**

In [None]:
from datasets import load_dataset, concatenate_datasets
path1 = "/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/mistral/split_datasets/goud_test_part_llama_sum_part_1"
path2 = "/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/mistral/split_datasets/goud_test_part_llama_sum_part_2"
path3 = "/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/mistral/split_datasets/goud_test_part_llama_sum_part_3"
path4 = "/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/mistral/split_datasets/goud_test_part_gemma_sum_part_4"



llm_dataset = combined_dataset = concatenate_datasets([load_from_disk(path1), load_from_disk(path2), load_from_disk(path3), load_from_disk(path4)])
llm_dataset

Dataset({
    features: ['article', 'headline', 'categories', 'text', 'mistral_generated_summary', 'llama_generated_summary', 'gemma_generated_summary'],
    num_rows: 9497
})

In [None]:
reference_summaries = llm_dataset['headline']
candidate_summaries = llm_dataset['llama_generated_summary']
compute_metrics_testset(reference_summaries, candidate_summaries)

scores = {'rouge1': 0.16595715325596, 'rouge2': 0.07170393849923103, 'rougeL': 0.14438752748324177, 'ref_len': 13.77255975571233, 'gen_len': 13.518058334210803}


{'rouge1': 0.16595715325596,
 'rouge2': 0.07170393849923103,
 'rougeL': 0.14438752748324177,
 'ref_len': 13.77255975571233,
 'gen_len': 13.518058334210803}

In [None]:

avg_P, avg_R, avg_F1 = calculate_bertscore(reference_summaries, candidate_summaries)

Average scores:
  Precision: 0.5922
  Recall: 0.5816
  F1: 0.5852


In [None]:
avg_F1

0.5852301120758057

In [None]:
import re
cleaned_text = re.sub(r'[</s>\n,]|Descartado|Descarté:', '', summary)
cleaned_text

'  el Real Madrid para la final de la Champion League'

In [None]:
!pip install googletrans==3.1.0a0


Collecting googletrans==3.1.0a0
  Downloading googletrans-3.1.0a0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: googletrans
  Building wheel for googletrans (setup.py) ... [?25l[?25hdone
  Created wheel for googletrans: filename=googletrans-3.1.0a0-py3-none-any.whl size=16353 sha256=f5df327b0d316c210b34268ad2af564f92953ce7ff654eca516fb524410787d7
  Stored in directory: /root/.cache/pip/wheels/50/5d/3c/8477d0af4ca2b8b1308812c09f1930863caeebc762fe265a95
Successfully built googletrans
Installing collected packages: googletrans
  Attempting uninstall: googletrans
    Found existing installation: googletrans 3.0.0
    Uninstalling googletrans-3.0.0:
      Successfully uninstalled googletrans-3.0.0
Successfully installed googletrans-3.1.0a0


In [None]:
from googletrans import Translator

translator = Translator()


ريال مدريد لنهائي دوري أبطال أوروبا


In [None]:
type(translated_text.text)

str

In [None]:
import re
def preprocess_mistral_summary(summary):
  summary = summary.split("### Title:")[2]
  summary = summary.split("### Text:")[0]
  cleaned_text = re.sub(r'[</s>\n,]|Descartado|Descarté:', '', summary)
  translated_text = translator.translate(cleaned_text, dest='ar')
  cleaned_summary = translated_text.text
  return cleaned_summary

In [None]:
from tqdm import tqdm
mistral_generated_summaries = []
for i in tqdm(range(len(llm_dataset))):
  mistral_summary = llm_dataset[i]["mistral_generated_summary"]
  cleaned_summary = preprocess_mistral_summary(mistral_summary)
  mistral_generated_summaries.append(cleaned_summary)

100%|██████████| 9497/9497 [25:31<00:00,  6.20it/s]


In [None]:
llm_dataset

Dataset({
    features: ['article', 'headline', 'categories', 'text', 'mistral_generated_summary', 'llama_generated_summary', 'gemma_generated_summary'],
    num_rows: 9497
})

In [None]:
# Add the new column to the dataset
llm_dataset_new = llm_dataset.add_column("mistral_generated_summary_ar", mistral_generated_summaries)
output_dir="/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/mistral"
split_dir = f"{output_dir}/full_dataset"
llm_dataset_new.save_to_disk(split_dir)

Saving the dataset (0/1 shards):   0%|          | 0/9497 [00:00<?, ? examples/s]

In [None]:
path = "/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/mistral/full_dataset"
llm_dataset_new = load_from_disk(path)
llm_dataset_new

Dataset({
    features: ['article', 'headline', 'categories', 'text', 'mistral_generated_summary', 'llama_generated_summary', 'mistral_generated_summary_ar', 'gemma_generated_summary'],
    num_rows: 9497
})

In [None]:
mistral_generated_summaries = []
for i in range(len(llm_dataset_new)):
  abs_summaries = llm_dataset_new[i]["mistral_generated_summary_ar"][0]
  mistral_generated_summaries.append(abs_summaries)

In [None]:
reference_summaries = llm_dataset['headline']
candidate_summaries = mistral_generated_summaries
compute_metrics_testset(reference_summaries, candidate_summaries)


scores = {'rouge1': 0.07057926270038548, 'rouge2': 0.02111807546797575, 'rougeL': 0.06404359402321676, 'ref_len': 8.014215015267979, 'gen_len': 13.518058334210803}


{'rouge1': 0.07057926270038548,
 'rouge2': 0.02111807546797575,
 'rougeL': 0.06404359402321676,
 'ref_len': 8.014215015267979,
 'gen_len': 13.518058334210803}

In [None]:
avg_P, avg_R, avg_F1 = calculate_bertscore(reference_summaries, candidate_summaries)

Average scores:
  Precision: 0.5621
  Recall: 0.5278
  F1: 0.5431


In [None]:
avg_F1

0.5430681109428406

In [None]:
0.5430681109428406 * 100

54.30681109428406

# **Seq2Seq Score Calculation**

**AraBart**

In [None]:
path = "/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/AraBART/model_result"
d2 = load_from_disk(path)
d2

Dataset({
    features: ['article', 'headline', 'categories', 'generated_summary_arabart'],
    num_rows: 9497
})

In [None]:
reference_summaries = d2['headline']
candidate_summaries = d2['generated_summary_arabart']
compute_metrics_testset(reference_summaries, candidate_summaries)


scores = {'rouge1': 0.31510330139891235, 'rouge2': 0.19238852973326376, 'rougeL': 0.2897650763185098, 'ref_len': 10.807834052858798, 'gen_len': 13.518058334210803}


{'rouge1': 0.31510330139891235,
 'rouge2': 0.19238852973326376,
 'rougeL': 0.2897650763185098,
 'ref_len': 10.807834052858798,
 'gen_len': 13.518058334210803}

In [None]:
avg_P, avg_R, avg_F1 = calculate_bertscore(reference_summaries, candidate_summaries)

Average scores:
  Precision: 0.6742
  Recall: 0.6528
  F1: 0.6621


In [None]:
avg_F1

0.66206294298172

**AraT5**

In [None]:
path = "/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/AraT5/model_result"
d2 = load_from_disk(path)
d2

Dataset({
    features: ['article', 'headline', 'categories', 'generated_summary_Arat5'],
    num_rows: 9497
})

In [None]:
reference_summaries = d2['headline']
candidate_summaries = d2['generated_summary_Arat5']
compute_metrics_testset(reference_summaries, candidate_summaries)


scores = {'rouge1': 0.32628692463738523, 'rouge2': 0.20155623674791948, 'rougeL': 0.29974322397678776, 'ref_len': 11.02716647362325, 'gen_len': 13.518058334210803}


{'rouge1': 0.32628692463738523,
 'rouge2': 0.20155623674791948,
 'rougeL': 0.29974322397678776,
 'ref_len': 11.02716647362325,
 'gen_len': 13.518058334210803}

In [None]:
avg_P, avg_R, avg_F1 = calculate_bertscore(reference_summaries, candidate_summaries)
avg_F1

Average scores:
  Precision: 0.6761
  Recall: 0.6564
  F1: 0.6648


0.6648486256599426

**MBart**

In [None]:
path = "/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/MBART/model_result"
d2 = load_from_disk(path)
d2

Dataset({
    features: ['article', 'headline', 'categories', 'generated_summary_mbart50'],
    num_rows: 9497
})

In [None]:
reference_summaries = d2['headline']
candidate_summaries = d2['generated_summary_mbart50']
compute_metrics_testset(reference_summaries, candidate_summaries)


scores = {'rouge1': 0.33555079238855895, 'rouge2': 0.21563513728851177, 'rougeL': 0.30865661859165544, 'ref_len': 10.760661261450984, 'gen_len': 13.518058334210803}


{'rouge1': 0.33555079238855895,
 'rouge2': 0.21563513728851177,
 'rougeL': 0.30865661859165544,
 'ref_len': 10.760661261450984,
 'gen_len': 13.518058334210803}

In [None]:
avg_P, avg_R, avg_F1 = calculate_bertscore(reference_summaries, candidate_summaries)
avg_F1

Average scores:
  Precision: 0.6842
  Recall: 0.6628
  F1: 0.6722


0.6721636652946472

# **Encoder Only Score Calculation**

**DZBert**

In [None]:
path = "/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/dziriBERT/model_result"
d2 = load_from_disk(path)
d2

Dataset({
    features: ['article', 'headline', 'categories', 'generated_summary_dzbert_BERT2BERT'],
    num_rows: 9497
})

In [None]:
reference_summaries = d2['headline']
candidate_summaries = d2['generated_summary_dzbert_BERT2BERT']
compute_metrics_testset(reference_summaries, candidate_summaries)


scores = {'rouge1': 0.2416220861295027, 'rouge2': 0.12428194962945674, 'rougeL': 0.21890651364408484, 'ref_len': 10.437822470253764, 'gen_len': 13.518058334210803}


{'rouge1': 0.2416220861295027,
 'rouge2': 0.12428194962945674,
 'rougeL': 0.21890651364408484,
 'ref_len': 10.437822470253764,
 'gen_len': 13.518058334210803}

In [None]:
avg_P, avg_R, avg_F1 = calculate_bertscore(reference_summaries, candidate_summaries)
avg_F1

Average scores:
  Precision: 0.6469
  Recall: 0.6180
  F1: 0.6311


0.6310778260231018

**DarijaBERT**

In [None]:
path = "/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/models/darijaBERT/model_result"
d2 = load_from_disk(path)
d2

Dataset({
    features: ['article', 'headline', 'categories', 'generated_summary_darija_BERT2BERT'],
    num_rows: 9497
})

In [None]:
reference_summaries = d2['headline']
candidate_summaries = d2['generated_summary_darija_BERT2BERT']
compute_metrics_testset(reference_summaries, candidate_summaries)


scores = {'rouge1': 0.27795476769231986, 'rouge2': 0.1553129957800718, 'rougeL': 0.24796309565931382, 'ref_len': 13.908602716647362, 'gen_len': 13.518058334210803}


{'rouge1': 0.27795476769231986,
 'rouge2': 0.1553129957800718,
 'rougeL': 0.24796309565931382,
 'ref_len': 13.908602716647362,
 'gen_len': 13.518058334210803}

In [None]:
avg_P, avg_R, avg_F1 = calculate_bertscore(reference_summaries, candidate_summaries)
avg_F1

Average scores:
  Precision: 0.6481
  Recall: 0.6391
  F1: 0.6425


0.6424856781959534

**AraBERT**

In [None]:
path = "/content/drive/MyDrive/WACL4_COLING_2025/comparaison_with_papers/model_result"
d2 = load_from_disk(path)
d2

Dataset({
    features: ['article', 'headline', 'categories', 'generated_summary_BERT2BERT'],
    num_rows: 9497
})

In [None]:
reference_summaries = d2['headline']
candidate_summaries = d2['generated_summary_BERT2BERT']
compute_metrics_testset(reference_summaries, candidate_summaries)


scores = {'rouge1': 0.28466028929198955, 'rouge2': 0.15777117050575573, 'rougeL': 0.2547489775774109, 'ref_len': 12.806570495946088, 'gen_len': 13.518058334210803}


{'rouge1': 0.28466028929198955,
 'rouge2': 0.15777117050575573,
 'rougeL': 0.2547489775774109,
 'ref_len': 12.806570495946088,
 'gen_len': 13.518058334210803}

In [None]:
avg_P, avg_R, avg_F1 = calculate_bertscore(reference_summaries, candidate_summaries)
avg_F1

Average scores:
  Precision: 0.6522
  Recall: 0.6458
  F1: 0.6479


0.6478858590126038