In [1]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.1.1 sacrebleu-2.5.1


In [2]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.48.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.48.2-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m81.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.47.0
    Uninstalling transformers-4.47.0:
      Successfully uninstalled transformers-4.47.0
Successfully installed transformers-4.48.2


In [3]:
# Cell 1: Imports
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import sacrebleu
from sklearn.model_selection import train_test_split

In [4]:
# Charger et préprocesser les données
df = pd.read_csv('/kaggle/input/cleaned-darija-dataset/cleaned_darija_dataset.csv')


In [5]:
# Split the data into train, validation, and test sets
train_texts, temp_texts, train_translations, temp_translations = train_test_split(
    df['darija'], df['english'], test_size=0.2, random_state=42
)

val_texts, test_texts, val_translations, test_translations = train_test_split(
    temp_texts, temp_translations, test_size=0.5, random_state=42
)

In [6]:
# Create datasets
train_dataset = Dataset.from_dict({
    'source': train_texts.tolist(),
    'target': train_translations.tolist()
})
val_dataset = Dataset.from_dict({
    'source': val_texts.tolist(),
    'target': val_translations.tolist()
})
test_dataset = Dataset.from_dict({
    'source': test_texts.tolist(),
    'target': test_translations.tolist()
})

# Combine into DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [7]:
# Cell 3: Tokenization
model_checkpoint = "facebook/mbart-large-cc25"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [8]:
# Set source and target language codes
tokenizer.src_lang = "en_XX"  # English
tokenizer.tgt_lang = "ar_AR"  # Arabic (adjust if needed)

In [9]:
def preprocess_function(examples):
    inputs = examples['source']
    targets = examples['target']
    
    model_inputs = tokenizer(
        inputs, 
        max_length=128, 
        truncation=True, 
        padding='max_length'
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, 
            max_length=128, 
            truncation=True, 
            padding='max_length'
        )
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_datasets = dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=dataset['train'].column_names
)

Map:   0%|          | 0/75037 [00:00<?, ? examples/s]



Map:   0%|          | 0/9380 [00:00<?, ? examples/s]

Map:   0%|          | 0/9380 [00:00<?, ? examples/s]

In [10]:
# Cell 4: Model Preparation
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model, 
    return_tensors="pt"
)

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

In [11]:
# Cell 5: Evaluation Metric (BLEU Score)
def compute_bleu(eval_preds):
    preds, labels = eval_preds
    
    if isinstance(preds, tuple):
        preds = preds[0]
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    bleu_scores = [
        sacrebleu.sentence_bleu(pred, [ref]).score 
        for pred, ref in zip(decoded_preds, decoded_labels)
    ]
    
    return {
        'bleu_score': np.mean(bleu_scores)
    }

In [12]:
batch_size = 8
gradient_accumulation_step = 2 #needs to be a batch size that can fit into memory 


training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/mbart_darija_translation",
    num_train_epochs=1,
    learning_rate=5e-4,
    gradient_accumulation_steps = gradient_accumulation_step, 
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,  # Moins d'évaluations
    save_strategy="no",
    #save_steps=500,  # Moins de sauvegardes
    #save_total_limit=1,
    load_best_model_at_end=False,
    metric_for_best_model="bleu_score",
    push_to_hub=False,
    fp16=False,  # Activation demi-précision
    predict_with_generate=True,  # Désactivation de la génération
    generation_max_length=64,  # Limitation des séquences générées
    #gradient_accumulation_steps=4,  # Accumulates gradients for 4 steps before updating weights.
    report_to = "none",
    gradient_checkpointing=True
)




In [13]:
# Cell 7: Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_bleu
)


In [14]:
import os
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Cell 8: Training
# Train the model

trainer.train()

Step,Training Loss,Validation Loss,Bleu Score
500,0.215,0.19783,2.910495
1000,0.1698,0.158156,8.946827
1500,0.1599,0.154144,11.077651
2000,0.1512,0.14576,14.141069
2500,0.131,0.137811,16.755393
3000,0.1379,0.131435,20.756764


In [None]:
import matplotlib.pyplot as plt

train_loss = [entry['loss'] for entry in trainer.state.log_history if 'loss' in entry]
epochs = [entry['epoch'] for entry in trainer.state.log_history if 'loss' in entry]

eval_loss = [entry['eval_loss'] for entry in trainer.state.log_history if 'eval_loss' in entry]
eval_epochs = [entry['epoch'] for entry in trainer.state.log_history if 'eval_loss' in entry]

plt.plot(epochs, train_loss, label="Training Loss")
plt.plot(eval_epochs, eval_loss, label="Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.show()


In [None]:
# Cell 9: Final Evaluation
test_results = trainer.evaluate(
    eval_dataset=tokenized_datasets['test'], 
    metric_key_prefix="test"
)
print("Test Results:", test_results)


In [None]:
# Optional: Save the model
trainer.save_model("./final_mbert_darija_translation_model")

In [None]:
# Cell 10: Prediction Function
def translate_text(text, model, tokenizer, max_length=128):
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        max_length=max_length, 
        truncation=True, 
        padding=True
    )
    
    # Specify target language for generation
    inputs["decoder_start_token_id"] = tokenizer.lang_code_to_id["ar_AR"]
    
    outputs = model.generate(
        inputs['input_ids'], 
        max_length=max_length, 
        num_beams=4, 
        early_stopping=True
    )
    
    translation = tokenizer.decode(
        outputs[0], 
        skip_special_tokens=True
    )
    
    return translation

# Load saved model and tokenizer
saved_model = AutoModelForSeq2SeqLM.from_pretrained("./final_mbart_darija_translation_model")
saved_tokenizer = AutoTokenizer.from_pretrained("./final_mbart_darija_translation_model")

# Example translations
example_texts = [
    "who are you",
    "can you help me"
]

for text in example_texts:
    translation = translate_text(text, saved_model, saved_tokenizer)
    print(f"Original: {text}")
    print(f"Translation: {translation}")
    print("-" * 50)