In [4]:
# Cellule 1 : Configuration et chemins locaux
import sys
from pathlib import Path
import pandas as pd
import torch

# 1. Configuration du chemin racine
ROOT_DIR = Path("..").resolve()
if str(ROOT_DIR) not in sys.path:
    sys.path.append(str(ROOT_DIR))

# 2. Rechargement auto
%load_ext autoreload
%autoreload 2

# 3. Dossiers
MODELS_DIR = ROOT_DIR / 'models' / 'nllb_local'
MODELS_DIR.mkdir(parents=True, exist_ok=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Racine : {ROOT_DIR}")
print(f"Device : {device}")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\benic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\benic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\benic\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Racine : C:\Users\benic\Documents\Projet_DL_Translation
Device : cpu


In [2]:
# Cellule 2 : Initialisation
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hub identifier of the pretrained checkpoint that gets fine-tuned
MODEL_CHECKPOINT = "facebook/nllb-200-distilled-600M"

# Tokenizer configured for the French -> English direction
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_CHECKPOINT,
    src_lang="fra_Latn",
    tgt_lang="eng_Latn",
)

# Load the seq2seq weights, then move them onto the device selected during setup
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
model = model.to(device)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 512/512 [00:01<00:00, 428.66it/s, Materializing param=model.shared.weight]                                   


In [5]:
# Cellule 3 : Ingestion et Sous-echantillonnage 10%
from datasets import load_dataset
from src.hf_utils import preprocess_function

# Location of the preprocessed CSV splits
processed_dir = ROOT_DIR / 'data' / 'processed'
data_files = {
    "train": str(processed_dir / 'train_nmt_fr_en.csv'),
    "validation": str(processed_dir / 'valid_nmt_fr_en.csv'),
    "test": str(processed_dir / 'test_nmt_fr_en.csv'),
}

# Full load of the three splits
raw_datasets = load_dataset("csv", data_files=data_files)

# --- Keep 10% of train/validation for the local test run ---
# The test split is left untouched. Shuffling uses a fixed seed so the same
# subset is selected on every run.
for split in ("train", "validation"):
    shuffled = raw_datasets[split].shuffle(seed=42)
    kept_rows = int(len(shuffled) * 0.1)
    raw_datasets[split] = shuffled.select(range(kept_rows))
    print(f"Taille du set {split} reduite a : {len(raw_datasets[split])} phrases.")

# Tokenization: batched map over every split; the original columns are dropped
# so only what preprocess_function returns remains.
original_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    lambda batch: preprocess_function(batch, tokenizer),
    batched=True,
    remove_columns=original_columns,
)

Taille du set train reduite a : 3005 phrases.
Taille du set validation reduite a : 103 phrases.


Map: 100%|██████████| 3005/3005 [00:00<00:00, 12241.72 examples/s]
Map: 100%|██████████| 103/103 [00:00<00:00, 5649.52 examples/s]
Map: 100%|██████████| 1058/1058 [00:00<00:00, 13284.72 examples/s]


In [None]:
# Cellule 4 : Entrainement local avec contournement du bug tokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from src.hf_utils import compute_metrics

# The collator is built from the tokenizer (and the model); it is the component
# responsible for padding each batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

args = Seq2SeqTrainingArguments(
    output_dir=str(MODELS_DIR),
    # Optimisation settings sized for a short local run
    num_train_epochs=1,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    # Evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
    # Best-checkpoint selection.
    # NOTE(review): presumably compute_metrics returns a "bleu" key - verify in src.hf_utils.
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    report_to="none",
)


def _metrics_with_tokenizer(eval_pred):
    """Forward the evaluation predictions to compute_metrics together with the tokenizer."""
    return compute_metrics(eval_pred, tokenizer)


# The 'tokenizer' argument is deliberately not passed to the trainer: it causes
# problems in this environment.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=_metrics_with_tokenizer,
)

print("Demarrage du Fine-Tuning local (test 10%)...")
trainer.train()

# The trainer was not given the tokenizer, so it is saved by hand next to the model
final_model_dir = str(MODELS_DIR / "final_model")
tokenizer.save_pretrained(final_model_dir)
trainer.save_model(final_model_dir)

Demarrage du Fine-Tuning local (test 10%)...




Epoch,Training Loss,Validation Loss


In [None]:
# Cellule 5 : Evaluation finale (Inference)
from tqdm import tqdm

model.eval()

# Local smoke test: translate at most 50 test sentences.
# random_state makes the subset reproducible across runs, and min() keeps
# .sample() from raising when the test file has fewer than 50 rows.
test_frame = pd.read_csv(data_files["test"])
df_test = test_frame.sample(n=min(50, len(test_frame)), random_state=42)

# Token id of the target language code, used to force English as the first
# generated token. convert_tokens_to_ids replaces tokenizer.lang_code_to_id,
# which is deprecated/removed on NLLB tokenizers in recent transformers releases.
forced_bos_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")

predictions = []
for text in tqdm(df_test['text_fr'].tolist()):
    # One sentence at a time; tensors are moved to the same device as the model
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():  # inference only, no gradients needed
        generated_tokens = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id, max_length=128)
    # batch_decode returns one string per generated sequence; there is exactly one here
    predictions.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])

df_test['prediction'] = predictions

In [None]:
# Cell 6: side-by-side comparison table
# Re-run this cell as often as you like to look at a different set of examples
comparison_columns = ['text_fr', 'text_en', 'prediction']
pd.set_option('display.max_colwidth', None)  # show full sentences, no truncation
df_test[comparison_columns].sample(10)

In [None]:
# Cell 7: translation quality as a function of source sentence length
def _word_count(sentence):
    """Number of whitespace-separated tokens in the sentence (coerced to str first)."""
    return len(str(sentence).split())


# Length of every source sentence, in words
df_test['src_len'] = df_test['text_fr'].apply(_word_count)

# Two buckets: strictly fewer than 10 words, and strictly more than 25 words
is_short = df_test['src_len'] < 10
is_long = df_test['src_len'] > 25
df_short = df_test[is_short]
df_long = df_test[is_long]

def compute_bleu(df):
    """Corpus-level BLEU of the 'prediction' column against the 'text_en' column.

    Args:
        df: DataFrame with a 'prediction' column (hypotheses) and a 'text_en'
            column (one reference per hypothesis).

    Returns:
        The sacreBLEU corpus score as a float, or 0.0 when df has no rows.
        A 0.0 returned for an empty frame is a placeholder, not a measured score.
    """
    if len(df) == 0:
        return 0.0

    # sacrebleu is not imported anywhere else in this notebook; importing it here
    # (after the cheap empty-frame guard) fixes the NameError the bare reference raised.
    import sacrebleu

    hypotheses = df['prediction'].tolist()
    # sacreBLEU expects a list of reference streams, hence the extra list level
    references = [df['text_en'].tolist()]
    return sacrebleu.corpus_bleu(hypotheses, references).score

# Score each subset once so the printed figures and the conclusion are based
# on the same numbers.
bleu_global = compute_bleu(df_test)
bleu_short = compute_bleu(df_short)
bleu_long = compute_bleu(df_long)

print("--- ANALYSE DE ROBUSTESSE ---")
print(f"BLEU Global : {bleu_global:.2f}")
print(f"BLEU Phrases Courtes (<10 mots) : {bleu_short:.2f} (Support: {len(df_short)})")
print(f"BLEU Phrases Longues (>25 mots) : {bleu_long:.2f} (Support: {len(df_long)})")

# Only state the conclusion when the data backs it: both buckets must be
# non-empty (compute_bleu returns a 0.0 placeholder for an empty bucket, which
# would otherwise read as a collapse) and the long-sentence score must be lower.
# NOTE(review): the "single context vector" explanation describes a vanilla
# encoder-decoder; confirm it is the intended interpretation for the model
# evaluated in this notebook.
if len(df_short) > 0 and len(df_long) > 0 and bleu_long < bleu_short:
    print("\nConclusion: L'effondrement du score sur les longues phrases demontre la perte d'information dans le vecteur de contexte unique.")
else:
    print("\nConclusion: non concluant (support insuffisant ou pas de degradation observee sur les longues phrases).")