<a href="https://www.kaggle.com/code/miranacareneandrisoa/notebooke3ded57777?scriptVersionId=238128470" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
from datasets import Dataset, DatasetDict,load_from_disk
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, logging
import evaluate
import numpy as np
import os
import torch
import shutil
import re
from typing import Optional
import types

# Disable the Weights & Biases integration before any Trainer is created.
os.environ["WANDB_DISABLED"] = "true"
# logging.set_verbosity_info()

# Only touch the CUDA allocator when a GPU is actually present: the rest of the
# script supports a CPU fallback, and CUDA housekeeping calls can fail on a
# machine without a usable CUDA runtime.
if torch.cuda.is_available():
    torch.cuda.empty_cache()   # release unused cached allocator memory
    torch.cuda.ipc_collect()   # collect memory released through CUDA IPC

# Locations of the line-aligned English (source) / Malagasy (target) corpora.
train_src_path = "../input/translation-model-en-mg/train_clean.en"
train_tgt_path = "../input/translation-model-en-mg/train_clean.mg"
valid_src_path = "../input/translation-model-en-mg/valid_clean.en"
valid_tgt_path = "../input/translation-model-en-mg/valid_clean.mg"
def delete_folder_if_exists(dir_path):
    """
    Delete the given directory tree when it is present.

    A message is printed in both cases (deleted / nothing to delete).

    Args:
        dir_path (str): Path of the directory to remove.
    """
    # Guard clause: nothing on disk, so just report it and leave.
    if not os.path.exists(dir_path):
        print(f"Le dossier {dir_path} n'existe pas.")
        return

    shutil.rmtree(dir_path)  # removes the directory together with its contents
    print(f"Le dossier {dir_path} a été supprimé.")
# 🔄 Fonction de chargement depuis fichiers alignés

# delete_folder_if_exists("./cached_en_mg_tokenized")
def load_translation_data(src_file, tgt_file, src_lang, tgt_lang):
    """
    Load a line-aligned parallel corpus into the ``{"translation": [...]}`` layout.

    Line *i* of ``src_file`` is paired with line *i* of ``tgt_file``. A pair is
    dropped when either side is blank, so a blank line in only one file can no
    longer shift every following pair (which happened when each file was
    filtered on its own). Trailing blank lines are ignored.

    Args:
        src_file (str): Path to the UTF-8 source-language file.
        tgt_file (str): Path to the UTF-8 target-language file.
        src_lang (str): Key used for the source sentence in each pair.
        tgt_lang (str): Key used for the target sentence in each pair.

    Returns:
        dict: ``{"translation": [{src_lang: ..., tgt_lang: ...}, ...]}``.

    Raises:
        AssertionError: If the two files do not have the same number of lines
            (trailing blank lines excluded).
    """

    def _read_stripped(path):
        # Keep blank lines in place: their position carries the alignment.
        with open(path, "r", encoding="utf-8") as f:
            lines = [line.strip() for line in f]
        # A blank tail is a formatting artefact, not a sentence slot.
        while lines and not lines[-1]:
            lines.pop()
        return lines

    src_lines = _read_stripped(src_file)
    tgt_lines = _read_stripped(tgt_file)

    src_count = sum(1 for line in src_lines if line)
    tgt_count = sum(1 for line in tgt_lines if line)
    print(f"{src_file} contient {src_count} lignes non vides.")
    print(f"{tgt_file} contient {tgt_count} lignes non vides.")

    # Explicit raise (not `assert`) so the check survives `python -O`; the
    # exception type is kept as AssertionError for backward compatibility.
    if len(src_lines) != len(tgt_lines):
        raise AssertionError(
            "Les fichiers source et cible doivent avoir le même nombre de lignes."
        )

    return {
        "translation": [
            {src_lang: src, tgt_lang: tgt}
            for src, tgt in zip(src_lines, tgt_lines)
            if src and tgt
        ]
    }


# 📦 Création des datasets
# Read each split from its pair of aligned files (English -> Malagasy).
train_data = load_translation_data(
    src_file=train_src_path, tgt_file=train_tgt_path, src_lang="en", tgt_lang="mg"
)
valid_data = load_translation_data(
    src_file=valid_src_path, tgt_file=valid_tgt_path, src_lang="en", tgt_lang="mg"
)

# Assemble the splits into one DatasetDict, train first, then validation.
dataset = DatasetDict()
dataset["train"] = Dataset.from_dict(train_data)
dataset["validation"] = Dataset.from_dict(valid_data)




# ⚙️ Chargement du tokenizer et modèle pré-entraîné
# Pre-trained English -> Malagasy Marian checkpoint used as the starting point.
model_name = "Helsinki-NLP/opus-mt-en-mg"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Pick the compute device; on GPU also clear the allocator cache first.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print("CUDA n'est pas disponible. Aucun cache GPU à vider.")
else:
    device = torch.device("cuda")
    torch.cuda.empty_cache()
    print("Cache GPU vidé.")

model = model.to(device)
# ✂️ Prétraitement
def preprocess_function(examples):
    """
    Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples (dict): Batch with a ``"translation"`` column whose items are
            dicts holding the ``"en"`` source and ``"mg"`` target sentences.

    Returns:
        The tokenizer output for the sources, with a ``"labels"`` entry built
        from the targets. Everything is truncated/padded to 64 tokens, and the
        padding positions in ``labels`` are replaced by -100.

    Note:
        Because labels are padded to ``max_length`` here, their padding would
        otherwise be the tokenizer's pad id and count towards the training
        loss. -100 is the label value ``compute_metrics`` in this file already
        treats as "ignore". A tokenized dataset cached on disk before this
        change must be regenerated to benefit from it.
    """
    inputs = [example['en'] for example in examples['translation']]
    targets = [example['mg'] for example in examples['translation']]

    model_inputs = tokenizer(
        inputs, max_length=64, truncation=True, padding="max_length", text_target=targets
    )

    # Mask label padding so it is ignored by the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in label_ids]
        for label_ids in model_inputs["labels"]
    ]

    return model_inputs


# print(dataset['train'][0]) 
# exit()
# 🧠 Tokenize the dataset
cache_dir = "cached_en_mg_tokenized"

# Reuse the tokenized dataset from disk when possible; otherwise build it.
# NOTE: the cache is not invalidated when `preprocess_function` changes --
# delete `cache_dir` (see `delete_folder_if_exists`) after editing it.
try:
    tokenized_dataset = load_from_disk(cache_dir)
    print("Loaded tokenized dataset from cache.")
except Exception as exc:
    # `Exception` (not a bare `except`) keeps the "any cache problem -> rebuild"
    # behaviour while letting KeyboardInterrupt/SystemExit propagate. The
    # reason is printed so a corrupt cache is not hidden.
    print(f"Tokenized cache not usable ({type(exc).__name__}: {exc}); rebuilding.")
    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        # num_proc=3,  # Use 3 CPU cores for faster mapping
        load_from_cache_file=False  # Always recompute: we are (re)building the cache
    )
    # Save the tokenized dataset for future runs
    tokenized_dataset.save_to_disk(cache_dir)
    print(f"Tokenized dataset saved to {cache_dir}.")

# tokenized_dataset = dataset.map(preprocess_function, batched=True,num_proc=3)

# 🧠 Paramètres d'entraînement
training_args = Seq2SeqTrainingArguments(
    output_dir="./en_to_mg_model",
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=4000,         # write a checkpoint every 4000 steps
    eval_steps=4000,         # evaluate on the same schedule as the saves
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_strategy="steps",
    load_best_model_at_end=True,
    # `resume_from_checkpoint` is intentionally not set here: resuming is done
    # by passing the checkpoint path to `trainer.train(...)` further down.
    # Mixed precision only when a GPU is present, so the CPU fallback selected
    # earlier in the script keeps working.
    fp16=torch.cuda.is_available(),
)

# 🔁 Préparation du Trainer
# SacreBLEU scorer used by compute_metrics.
metric = evaluate.load("sacrebleu")
# Batch collator for the seq2seq model, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

def compute_metrics(eval_pred):
    """
    Score generated translations against the references with SacreBLEU.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also arrive as a tuple whose first element
            holds the generated ids.

    Returns:
        The dict returned by ``metric.compute`` (SacreBLEU results).
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # must be swapped for the pad id before decoding -- for predictions as well
    # as for labels.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # SacreBLEU expects a list of reference lists (one list per prediction).
    return metric.compute(
        predictions=decoded_preds,
        references=[[reference] for reference in decoded_labels],
    )


# Wire the model, data and evaluation hooks into a single Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    # what to train and how
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    # batching
    data_collator=data_collator,
    # data splits
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # evaluation metric callback
    compute_metrics=compute_metrics,
)
# Custom function to load RNG state correctly
def custom_load_rng_state(self, checkpoint):
    """
    Restore random-number-generator states from ``<checkpoint>/rng_state.pth``.

    Best-effort replacement for the trainer's RNG loader: every state that is
    present in the file and applicable on this machine is restored, anything
    else is skipped with a message instead of failing the resume.

    Args:
        self: The trainer instance this function is bound to (unused).
        checkpoint (str | None): Checkpoint directory. ``None`` is a no-op.

    Notes:
        * The file is read with ``weights_only=False`` because it holds
          non-tensor objects. That unpickles arbitrary Python objects, so only
          load checkpoints you produced yourself.
        * The ``"cuda"`` entry is handled both as a single state tensor and as
          a list with one state per device. NOTE(review): which form is written
          depends on how the checkpoint was saved -- confirm against the
          installed transformers version.
    """
    import random  # local import: `random` is not imported at file level

    if checkpoint is None:
        return

    rng_file = os.path.join(checkpoint, 'rng_state.pth')
    if not os.path.exists(rng_file):
        print(f"RNG state file not found: {rng_file}")
        return

    # SECURITY: full unpickling -- trusted, self-produced checkpoints only.
    rng_state = torch.load(rng_file, weights_only=False)

    # Host-side generators.
    if "python" in rng_state:
        random.setstate(rng_state["python"])
    if "numpy" in rng_state:
        np.random.set_state(rng_state["numpy"])
    if "cpu" in rng_state:
        torch.set_rng_state(rng_state["cpu"])

    # Device-side generators.
    if not torch.cuda.is_available():
        print("CUDA is not available, skipping CUDA RNG state loading.")
        return
    if "cuda" not in rng_state:
        print("Skipping CUDA RNG state loading: no 'cuda' state in checkpoint.")
        return

    cuda_state = rng_state["cuda"]
    try:
        if torch.is_tensor(cuda_state):
            # Single state tensor: applies to the current device. Taking
            # len() of it and comparing with the device count (as was done
            # before) always reported a mismatch.
            torch.cuda.set_rng_state(cuda_state)
        elif len(cuda_state) == torch.cuda.device_count():
            torch.cuda.set_rng_state_all(cuda_state)
        else:
            print("Skipping CUDA RNG state loading: mismatched size.")
    except RuntimeError as exc:
        # e.g. a state saved on different hardware/software; keep resuming.
        print(f"Skipping CUDA RNG state loading: {exc}")

# Bind custom_load_rng_state to this trainer instance as its `_load_rng_state`
# attribute, so the instance uses the custom loader instead of the class's own.
# Only this one instance is affected; the Trainer class itself is untouched.
trainer._load_rng_state = types.MethodType(custom_load_rng_state, trainer)


def get_last_checkpoint(checkpoint_root: str) -> Optional[str]:
    """
    Return the path of the highest-numbered checkpoint directory.

    Only sub-directories of ``checkpoint_root`` named ``checkpoint-<digits>``
    are considered; a stray file with such a name is ignored. Ordering is
    numeric, so ``checkpoint-10`` ranks above ``checkpoint-2``.

    Args:
        checkpoint_root: Directory that contains the ``checkpoint-*`` folders.

    Returns:
        ``os.path.join(checkpoint_root, <name>)`` for the latest checkpoint, or
        ``None`` when ``checkpoint_root`` is not a directory or holds no
        matching checkpoint directory.
    """
    if not os.path.isdir(checkpoint_root):
        return None

    pattern = re.compile(r"checkpoint-(\d+)")
    latest_step = -1
    latest_name = None
    for entry in os.listdir(checkpoint_root):
        found = pattern.fullmatch(entry)
        if found is None:
            continue
        # A checkpoint is a directory; skip files that merely share the name.
        if not os.path.isdir(os.path.join(checkpoint_root, entry)):
            continue
        step = int(found.group(1))
        if step >= latest_step:
            latest_step, latest_name = step, entry

    if latest_name is None:
        return None
    return os.path.join(checkpoint_root, latest_name)

# Utilisation
# Checkpoints and the final model share the same directory.
checkpoint_root = "./en_to_mg_model"
last_checkpoint = get_last_checkpoint(checkpoint_root)

# Training: start fresh when there is nothing to resume from, otherwise
# continue from the most recent checkpoint.
if not last_checkpoint:
    print("Aucun checkpoint trouvé. Démarrage depuis zéro.")
    trainer.train()
else:
    print(f"Reprise depuis le checkpoint : {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)

# Persist the trained model and its tokenizer side by side.
trainer.save_model(checkpoint_root)
tokenizer.save_pretrained(checkpoint_root)


../input/translation-model-en-mg/train_clean.en contient 2101160 lignes non vides.
../input/translation-model-en-mg/train_clean.mg contient 2101160 lignes non vides.
../input/translation-model-en-mg/valid_clean.en contient 233463 lignes non vides.
../input/translation-model-en-mg/valid_clean.mg contient 233463 lignes non vides.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/796k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/291M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Cache GPU vidé.


model.safetensors:   0%|          | 0.00/291M [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Loaded tokenized dataset from cache.


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


Reprise depuis le checkpoint : ./en_to_mg_model/checkpoint-8000
Skipping CUDA RNG state loading: mismatched size.




Step,Training Loss,Validation Loss
