In [None]:
from google.colab import userdata

# Display only whether the HF_TOKEN secret is readable. Leaving the bare
# `userdata.get(...)` call as the last expression of the cell would echo the
# raw token into the saved cell output.
userdata.get('HF_TOKEN') is not None

In [None]:
# Placeholder left by the author; the French text reads
# "Put the Hugging Face token here".
# NOTE(review): avoid pasting a real token into a cell — confirm a secrets
# store is used instead.
'Mettre le tokken ici de Hugging Face'

In [1]:
# Authenticate with Hugging Face when HF_TOKEN is defined in Kaggle Secrets.
# hf_token is bound up front so that later code can test `hf_token is not None`
# even when the lookup below fails.
# NOTE(review): UserSecretsClient and HfFolder are not imported in the visible
# cells; if they are missing, the resulting NameError is caught below and
# authentication is skipped.
hf_token = None
try:
    hf_token = UserSecretsClient().get_secret("HF_TOKEN")
    HfFolder.save_token(hf_token)
except Exception as err:
    # Best-effort step: report why it was skipped instead of hiding it.
    # Only the exception type is printed so no secret can end up in the output.
    print(f"Hugging Face authentication skipped ({type(err).__name__})")

# Configuration
# Checkpoint passed to from_pretrained for both tokenizer and model.
MODEL = "Helsinki-NLP/opus-mt-fr-en"
# Keys read from each translation pair: source side, then target side.
SRC = "fr"
TGT = "en"
# Output directory name, also used as the suffix of the Hub model id.
OUTPUT = "opus-mt-fr-en-colab"
# Per-device batch size (training and evaluation) and number of epochs.
BATCH = 32
EPOCHS = 3

# Load the dataset and split it: 5% of the original train split is held out as
# the test set, then 5% of what remains becomes the validation set.
ds = load_dataset("opus_books", "en-fr")
holdout_split = ds["train"].train_test_split(test_size=0.05, seed=42)
dev_split = holdout_split["train"].train_test_split(test_size=0.05, seed=42)
raw = DatasetDict(
    {
        "train": dev_split["train"],
        "validation": dev_split["test"],
        "test": holdout_split["test"],
    }
)

# Tokenizer
tok = AutoTokenizer.from_pretrained(MODEL)


def preprocess(ex):
    """Tokenize a batch of translation pairs.

    Reads the SRC and TGT entries of every item in ex["translation"],
    tokenizes both sides (truncation enabled with max_length=128, no padding)
    and returns the source encoding with the target token ids stored under
    the "labels" key.
    """
    source_texts, target_texts = [], []
    for pair in ex["translation"]:
        source_texts.append(pair[SRC])
        target_texts.append(pair[TGT])

    # Same tokenizer options for both sides of the pair.
    tokenizer_opts = dict(max_length=128, truncation=True, padding=False)
    encoded = tok(source_texts, **tokenizer_opts)
    target_encoded = tok(text_target=target_texts, **tokenizer_opts)

    encoded["labels"] = target_encoded["input_ids"]
    return encoded

# Apply preprocess to every split in batches, removing the columns listed for
# the train split.
tokenized = raw.map(
    preprocess,
    batched=True,
    remove_columns=raw["train"].column_names,
)

# Model + data collator + metric
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL)
model = model.to(device)
dc = DataCollatorForSeq2Seq(tok, model=model)
sacrebleu = evaluate.load("sacrebleu")

def compute_metrics(p):
    """Compute the sacreBLEU score for one evaluation pass.

    `p` exposes `predictions` and `label_ids`. When the predictions come as a
    tuple, its first element is used. Entries equal to -100 are replaced by
    the tokenizer's pad token id before decoding, and special tokens are
    skipped while decoding.

    Returns a dict with a single "bleu" entry holding the "score" field of
    the metric result.
    """
    generated = p.predictions
    if isinstance(generated, tuple):
        generated = generated[0]
    reference_ids = p.label_ids

    def decode(token_ids):
        # -100 is not a valid token id, so swap it for padding before decoding.
        cleaned = np.where(token_ids != -100, token_ids, tok.pad_token_id)
        return tok.batch_decode(cleaned, skip_special_tokens=True)

    hypotheses = decode(generated)
    # The metric is given one list of references per prediction.
    references = [[text] for text in decode(reference_ids)]
    result = sacrebleu.compute(predictions=hypotheses, references=references)
    return {"bleu": result["score"]}

# Training arguments.
# transformers renamed the `evaluation_strategy` keyword to `eval_strategy`,
# and recent releases reject the old spelling. Pick whichever keyword the
# installed Seq2SeqTrainingArguments accepts so this cell runs on both.
import inspect

_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT,
    # Evaluate and checkpoint once per epoch; the two strategies are kept
    # identical alongside load_best_model_at_end.
    **{_eval_strategy_key: "epoch"},
    save_strategy="epoch",
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    learning_rate=5e-5,
    num_train_epochs=EPOCHS,
    predict_with_generate=True,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    # Push to the Hub only when a token was obtained.
    push_to_hub=hf_token is not None,
    hub_model_id="USERNAME/" + OUTPUT  # replace USERNAME with your Hub user name
)

# Wire the model, training arguments, tokenized splits, collator and metric
# function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=dc,
    tokenizer=tok,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    compute_metrics=compute_metrics,
)

# Run: train, save the model, then report the metrics on the test split
# (metric keys are prefixed with "test").
trainer.train()
trainer.save_model()
test_output = trainer.predict(tokenized["test"], metric_key_prefix="test")
print(test_output.metrics)

# Inference: translate a few sample sentences with beam search (4 beams).
samples = ["Bonjour le monde", "J'espère BLEU ~40", "Bonne traduction !"]
for sentence in samples:
    batch = tok(sentence, return_tensors="pt", truncation=True).to(device)
    generated = model.generate(**batch, max_length=128, num_beams=4)
    translation = tok.decode(generated[0], skip_special_tokens=True)
    print(f"{sentence} → {translation}")


NameError: name 'load_dataset' is not defined

In [3]:
!pip install --upgrade --force-reinstall datasets huggingface_hub fsspec

Collecting datasets
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface_hub
  Using cached huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting filelock (from datasets)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.17 (from datasets)
  Using cached numpy-2.3.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-20.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Using cached pandas-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting tqdm>=4.66.3 (fro

In [4]:
from datasets import load_dataset, DatasetDict