### Install 

In [1]:
# === Cell 1 — Install dependencies for cleaning + Marian training (from scratch) ===
# - transformers/sentencepiece/accelerate: model + tokenizer + mixed precision
# - datasets: dataset wrappers + Trainer input
# - sacrebleu: BLEU/chrF evaluation
# - nltk/jieba: tokenization, stemming/lemmatization (assignment requirement)
# - scikit-learn: TF-IDF (assignment feature extraction requirement)
# - matplotlib: simple plots for training curves
!pip install -q transformers sentencepiece accelerate datasets sacrebleu nltk jieba scikit-learn matplotlib

# minimal NLTK resources for tokenization/lemmatization demos
import nltk
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)


True

### Imports & global configuration

In [2]:
# === Cell 2 — Imports & global configuration (from-scratch Marian training) ===
from pathlib import Path
import os, re, json, unicodedata, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
from datasets import Dataset, DatasetDict
import sacrebleu  # for BLEU/chrF later

# ---- Paths ----
DATA_PATH = Path("/AI2/dataset_CN_EN.txt")   # your CN<TAB>EN file
RUN_DIR   = Path("marian_zh_en_scratch_run")
RUN_DIR.mkdir(parents=True, exist_ok=True)

# ---- Reproducibility ----
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# ---- Device ----
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# ---- Training hyperparameters (scratch needs more steps; adjust if needed) ----
EPOCHS         = 1          # increase if dataset is small
LR             = 3e-4        # higher than fine-tune learning rate
BSZ            = 16          # lower if OOM; you can use 8/4
GRAD_ACCUM     = 1           # raise if you reduce BSZ
WARMUP_RATIO   = 0.08
WEIGHT_DECAY   = 1e-2
LABEL_SMOOTH   = 0.1
MAX_SRC        = 128         # truncation lengths (raise if VRAM allows)
MAX_TGT        = 128
LOG_STEPS      = 50

# Persist a small run-config for reference
cfg = {
    "seed": SEED, "device": DEVICE, "epochs": EPOCHS, "lr": LR, "batch_size": BSZ,
    "grad_accum": GRAD_ACCUM, "warmup_ratio": WARMUP_RATIO, "weight_decay": WEIGHT_DECAY,
    "label_smooth": LABEL_SMOOTH, "max_src": MAX_SRC, "max_tgt": MAX_TGT,
    "data_path": str(DATA_PATH), "run_dir": str(RUN_DIR),
}
(RUN_DIR / "config.json").write_text(json.dumps(cfg, indent=2), encoding="utf-8")
print("Config saved to:", RUN_DIR / "config.json")


  from .autonotebook import tqdm as notebook_tqdm


Device: cuda
Config saved to: marian_zh_en_scratch_run\config.json


### Load raw dataset & BEFORE preview

In [3]:
# === Cell 3 — Load raw dataset & BEFORE preview ===
from IPython.display import display

assert DATA_PATH.exists(), f"Missing dataset: {DATA_PATH}"
df_raw = pd.read_csv(DATA_PATH, sep="\t", header=None, names=["cn","en"], dtype=str)

print("RAW shape:", df_raw.shape)
display(df_raw.head(10))  # BEFORE cleaning preview


RAW shape: (20289, 2)


Unnamed: 0,cn,en
0,嗨。,Hi.
1,你好。,Hi.
2,跑。,Run.
3,等等！,Wait!
4,你好。,Hello!
5,让我来。,I try.
6,我赢了。,I won!
7,不会吧。,Oh no!
8,乾杯!,Cheers!
9,他跑了。,He ran.


### Cleaning

In [4]:
# === Cell 4 — Clean text + BEFORE/AFTER view (assignment: text cleaning) ===
import re, unicodedata
from IPython.display import display

# zero-width chars set
ZWSP = "".join(["\u200b","\u200c","\u200d","\ufeff"])

def normalize(s: str) -> str:
    if not isinstance(s, str):
        s = "" if s is None else str(s)
    s = unicodedata.normalize("NFKC", s)     # Unicode normalize
    s = re.sub(f"[{ZWSP}]", "", s)           # remove zero-width chars
    s = re.sub(r"\s+", " ", s).strip()       # collapse whitespace
    return s

df = df_raw.copy()
df["cn_clean"] = df["cn"].map(normalize)
df["en_clean"] = df["en"].map(normalize)

before_after = pd.DataFrame({
    "cn_before": df_raw["cn"], "cn_after": df["cn_clean"],
    "en_before": df_raw["en"], "en_after": df["en_clean"]
})
changed = int(((before_after.cn_before != before_after.cn_after) |
               (before_after.en_before != before_after.en_after)).sum())
print("Rows changed by cleaning:", changed)

display(before_after.head(10))


Rows changed by cleaning: 3694


Unnamed: 0,cn_before,cn_after,en_before,en_after
0,嗨。,嗨。,Hi.,Hi.
1,你好。,你好。,Hi.,Hi.
2,跑。,跑。,Run.,Run.
3,等等！,等等!,Wait!,Wait!
4,你好。,你好。,Hello!,Hello!
5,让我来。,让我来。,I try.,I try.
6,我赢了。,我赢了。,I won!,I won!
7,不会吧。,不会吧。,Oh no!,Oh no!
8,乾杯!,乾杯!,Cheers!,Cheers!
9,他跑了。,他跑了。,He ran.,He ran.


### Drop empties/duplicates & split (80/10/10)

In [5]:
# === Cell 5 — Drop empties/duplicates + deterministic 80/10/10 split ===
import numpy as np
from IPython.display import display

n0 = len(df)
df2 = df.replace({"": np.nan}).dropna(subset=["cn_clean","en_clean"])
dup_count = int(df2.duplicated(subset=["cn_clean","en_clean"]).sum())
df2 = df2.drop_duplicates(subset=["cn_clean","en_clean"]).reset_index(drop=True)
print(f"After clean: {len(df2)} rows (removed {n0-len(df2)} empties + {dup_count} duplicates)")

# Deterministic shuffle & split
rng = np.random.default_rng(SEED)
idx = np.arange(len(df2)); rng.shuffle(idx)
n = len(idx)
i_train = idx[: int(0.8*n)]
i_valid = idx[int(0.8*n): int(0.9*n)]
i_test  = idx[int(0.9*n):]

train_df = df2.iloc[i_train][["cn_clean","en_clean"]].rename(columns={"cn_clean":"cn","en_clean":"en"}).reset_index(drop=True)
valid_df = df2.iloc[i_valid][["cn_clean","en_clean"]].rename(columns={"cn_clean":"cn","en_clean":"en"}).reset_index(drop=True)
test_df  = df2.iloc[i_test ][["cn_clean","en_clean"]].rename(columns={"cn_clean":"cn","en_clean":"en"}).reset_index(drop=True)

print(f"train={len(train_df)}  valid={len(valid_df)}  test={len(test_df)}")
display(train_df.head(5)); display(valid_df.head(5))


After clean: 20285 rows (removed 4 empties + 4 duplicates)
train=16228  valid=2028  test=2029


Unnamed: 0,cn,en
0,麻煩您稍待一下。,Will you wait a moment?
1,你怎麼什麼話也沒說?,How come you didn't say anything?
2,我會告訴我的妻子。,I'll tell my wife.
3,你認為它是陷阱嗎?,Do you think it's a trap?
4,"我们那时在谈论事情,但我不记得是什么了。","We were talking about something at that time, ..."


Unnamed: 0,cn,en
0,她總是穿著黑色的衣服。,She always wears black.
1,湯姆很可能遲到。,Tom is quite likely to be late.
2,汤姆找不到玛丽。,Tom can't find Mary.
3,"我的手抖得太厉害,没法穿针了。",My hands were shaking too much to thread the n...
4,我担心他的健康。,I was worried about his health.


### Build Hugging Face Datasets & tokenize to token IDs (for training)

In [6]:
# === Cell 6 — No HF Dataset: build PyTorch Datasets directly from your data ===
from torch.utils.data import Dataset
from transformers import AutoTokenizer

MODEL_NAME = "Helsinki-NLP/opus-mt-zh-en"   # tokenizer only; model will be from-scratch next
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class MarianPairDataset(Dataset):
    def __init__(self, df, tokenizer, max_src=MAX_SRC, max_tgt=MAX_TGT):
        self.cn = df["cn"].tolist()
        self.en = df["en"].tolist()
        self.tok = tokenizer
        self.max_src = max_src
        self.max_tgt = max_tgt

    def __len__(self):
        return len(self.cn)

    def __getitem__(self, idx):
        src = self.cn[idx]; tgt = self.en[idx]
        # Same-call style
        pack = tokenizer(
            src,
            text_target=tgt,
            max_length=MAX_SRC,
            truncation=True
        )
        # If you need different lengths, do two calls as in the comment above
        return {
            "input_ids": torch.tensor(pack["input_ids"]),
            "attention_mask": torch.tensor(pack["attention_mask"]),
            "labels": torch.tensor(pack["labels"]),
        }

train_ds = MarianPairDataset(train_df, tokenizer)
valid_ds = MarianPairDataset(valid_df, tokenizer)
test_ds  = MarianPairDataset(test_df,  tokenizer)

print("PyTorch datasets:", len(train_ds), len(valid_ds), len(test_ds))


PyTorch datasets: 16228 2028 2029




##### Version 2 using 

In [7]:
# # === Cell 6 (fixed) — Build HF Datasets & tokenize for Marian (prep for training) ===
# from transformers import AutoTokenizer
# from datasets import Dataset, DatasetDict

# MODEL_NAME = "Helsinki-NLP/opus-mt-zh-en"   # tokenizer only; model will be from-scratch next
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# # Preprocess: CN → input_ids, EN → labels (both truncated to MAX_SRC/MAX_TGT)
# def preprocess(batch):
#     # One call: inputs + labels via `text_target`
#     model_inputs = tokenizer(
#         batch["cn"],
#         text_target=batch["en"],
#         max_length=MAX_SRC,
#         truncation=True
#     )
#     # If you want a different max length for targets, split into two calls:
#     model_inputs = tokenizer(batch["cn"], max_length=MAX_SRC, truncation=True)
#     labels = tokenizer(text_target=batch["en"], max_length=MAX_TGT, truncation=True)
#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs


# # Wrap pandas splits into HF DatasetDict
# hf_ds = DatasetDict({
#     "train": Dataset.from_pandas(train_df),
#     "validation": Dataset.from_pandas(valid_df),
#     "test": Dataset.from_pandas(test_df),
# })

# # Remove only columns that actually exist (some pandas versions don't add __index_level_0__)
# cols_in = hf_ds["train"].column_names
# to_remove = [c for c in ["cn", "en", "__index_level_0__", "index"] if c in cols_in]

# tok_ds = hf_ds.map(preprocess, batched=True, remove_columns=to_remove)
# print(tok_ds)
# print("Columns removed:", to_remove)
# print("Example tokenized keys:", tok_ds["train"][0].keys())


### Create Marian Transformer *from scratch* + collator

In [8]:
# === Cell 7 — Create Marian Transformer *from scratch* + collator ===
from transformers import AutoConfig, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import math

MODEL_NAME = "Helsinki-NLP/opus-mt-zh-en"  # use Marian architecture config only

# Load architecture hyperparameters (no weights), then build random-initialized model
config = AutoConfig.from_pretrained(MODEL_NAME)
model  = AutoModelForSeq2SeqLM.from_config(config)

# Make sure embedding sizes match the tokenizer (safe even if already equal)
model.resize_token_embeddings(len(tokenizer))

# Move to GPU/CPU
model.to(DEVICE)

# Collator handles dynamic padding for batches
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# Quick model info
num_params = sum(p.numel() for p in model.parameters())
print(f"Marian (from scratch) initialized. Parameters: {num_params/1e6:.1f}M | Vocab size: {len(tokenizer)} | Device: {DEVICE}")

Marian (from scratch) initialized. Parameters: 77.9M | Vocab size: 65001 | Device: cuda


### BLEU

In [9]:
# === Cell 8 — Metrics: BLEU only ===
import numpy as np
import sacrebleu

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    ref_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    bleu = sacrebleu.corpus_bleu(pred_texts, [ref_texts]).score
    return {"bleu": bleu}


### Trainer Setup + Train + Save

In [10]:
# === Cell 9 — Unified Trainer (works with HF tok_ds OR PyTorch train_ds/valid_ds) ===
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import math, inspect, shutil
from pathlib import Path

# ---- pick the datasets you actually created ----
if 'tok_ds' in globals():                               # HF DatasetDict path (Cell 6)
    train_dataset = tok_ds["train"]
    eval_dataset  = tok_ds["validation"]
elif 'train_ds' in globals() and 'valid_ds' in globals(): # Option B PyTorch path
    train_dataset = train_ds
    eval_dataset  = valid_ds
else:
    raise RuntimeError("No datasets found. Run Cell 6 (HF) or Cell 6 Option B (PyTorch) first.")

train_len = len(train_dataset)

ckpt_dir = RUN_DIR / "checkpoints"
best_dir = RUN_DIR / "best_model"

# ---- HARD RESET to avoid incompatible old checkpoints (set False to resume later) ----
CLEAN_START = False
if CLEAN_START:
    shutil.rmtree(ckpt_dir, ignore_errors=True)
    shutil.rmtree(best_dir, ignore_errors=True)
ckpt_dir.mkdir(parents=True, exist_ok=True)

# ---- step estimates for legacy fallbacks ----
steps_per_epoch = max(1, math.ceil(train_len / max(1, BSZ)) // max(1, GRAD_ACCUM))
total_steps     = steps_per_epoch * EPOCHS
warmup_steps    = int(WARMUP_RATIO * total_steps)

# ---- build TrainingArguments compatibly with your transformers version ----
sig = set(inspect.signature(Seq2SeqTrainingArguments.__init__).parameters.keys())
kw = dict(
    output_dir=str(ckpt_dir),
    overwrite_output_dir=True,          # ensure clean start
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BSZ,
    per_device_eval_batch_size=BSZ,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    fp16=(DEVICE == "cuda"),
    logging_steps=LOG_STEPS,
    seed=SEED,
)

# warmup
if "warmup_ratio" in sig: kw["warmup_ratio"] = WARMUP_RATIO
elif "warmup_steps" in sig: kw["warmup_steps"] = warmup_steps

# label smoothing
if "label_smoothing_factor" in sig: kw["label_smoothing_factor"] = LABEL_SMOOTH

# epoch-based strategies when available; otherwise step fallbacks
has_eval_strategy = ("evaluation_strategy" in sig) or ("eval_strategy" in sig)
has_save_strategy = ("save_strategy" in sig)

if has_eval_strategy:
    if "evaluation_strategy" in sig: kw["evaluation_strategy"] = "epoch"
    else: kw["eval_strategy"] = "epoch"
if has_save_strategy:
    kw["save_strategy"] = "epoch"
if "save_total_limit" in sig: kw["save_total_limit"] = 3
if "predict_with_generate" in sig: kw["predict_with_generate"] = True
if "report_to" in sig: kw["report_to"] = "none"

if not has_eval_strategy and "eval_steps" in sig: kw["eval_steps"] = steps_per_epoch
if not has_save_strategy and "save_steps" in sig: kw["save_steps"] = steps_per_epoch

# enable "load_best_model_at_end" only if both strategies exist (prevents mismatch error)
if (("load_best_model_at_end" in sig) and ("metric_for_best_model" in sig)
    and ("greater_is_better" in sig) and has_eval_strategy and has_save_strategy):
    kw["load_best_model_at_end"] = True
    kw["metric_for_best_model"] = "bleu"
    kw["greater_is_better"] = True

args = Seq2SeqTrainingArguments(**kw)


In [None]:
# ---- Trainer ----
try:
    from transformers import EarlyStoppingCallback
    callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
except Exception:
    callbacks = []

trainer = Seq2SeqTrainer(
    model=model,                      # from-scratch Marian (Cell 7)
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # BLEU-only (Cell 8)
    callbacks=callbacks,
)

# ---- Train (NO RESUME this run) ----
trainer.train(resume_from_checkpoint=None)

# ---- Save model/tokenizer ----
trainer.save_model(best_dir)
tokenizer.save_pretrained(best_dir)

best_ckpt  = getattr(trainer.state, "best_model_checkpoint", None)
best_metric= getattr(trainer.state, "best_metric", None)
print("Best checkpoint:", best_ckpt)
print("Best BLEU:", best_metric)
print("Saved to:", best_dir)


  trainer = Seq2SeqTrainer(
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


In [None]:
# === Cell 12A — Classification-style metrics (token-level) + BLEU ===
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import sacrebleu

def compute_metrics_cls(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):  # some Trainer versions return (logits, ...)
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # ----- token-level classification view -----
    # Flatten and ignore PAD in labels
    y_true = np.where(labels != -100, labels, pad_id).reshape(-1)
    y_pred = preds.reshape(-1)
    keep = (y_true != pad_id)
    y_true_m = y_true[keep]
    y_pred_m = y_pred[keep]

    out = {}
    if y_true_m.size > 0:
        out["tok_acc"]            = accuracy_score(y_true_m, y_pred_m)
        out["tok_f1_micro"]       = f1_score(y_true_m, y_pred_m, average="micro", zero_division=0)
        out["tok_f1_macro"]       = f1_score(y_true_m, y_pred_m, average="macro", zero_division=0)
        out["tok_precision_macro"]= precision_score(y_true_m, y_pred_m, average="macro", zero_division=0)
        out["tok_recall_macro"]   = recall_score(y_true_m, y_pred_m, average="macro", zero_division=0)

    # ----- BLEU (text-level) -----
    pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels_for_decode = np.where(labels != -100, labels, pad_id)
    ref_texts  = tokenizer.batch_decode(labels_for_decode, skip_special_tokens=True)
    out["bleu"] = sacrebleu.corpus_bleu(pred_texts, [ref_texts]).score
    return out


# For the current run (no retrain), attach and evaluate VALID once:
trainer.compute_metrics = compute_metrics_cls
valid_results = trainer.evaluate(eval_dataset=eval_dataset)  # uses the same eval_dataset as Cell 9
import pandas as pd; display(pd.DataFrame([valid_results]).round(4))

In [None]:
# === Cell 12C — Overall classification metrics on TEST ===
import torch
from tqdm.auto import tqdm

@torch.no_grad()
def translate_texts(texts, batch_size=16, num_beams=4, max_new_tokens=128,
                    no_repeat_ngram_size=3, length_penalty=1.0):
    mdl = trainer.model.to(DEVICE).eval() if 'trainer' in globals() else model.to(DEVICE).eval()
    outs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Translating TEST"):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True,
                        max_length=MAX_SRC).to(DEVICE)
        gen = mdl.generate(
            **enc,
            num_beams=num_beams,
            max_new_tokens=max_new_tokens,
            no_repeat_ngram_size=no_repeat_ngram_size,
            length_penalty=length_penalty,
        )
        outs.extend(tokenizer.batch_decode(gen, skip_special_tokens=True))
    return outs

# 1) Translate TEST
test_src = test_df["cn"].tolist()
test_ref = test_df["en"].tolist()
test_pred = translate_texts(test_src, batch_size=16)

# 2) Token-level multiclass metrics (re-encode refs & preds) — FIXED equal-length pad
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

MAX_TGT_EVAL = int(globals().get("MAX_TGT", 128))  # fallback if MAX_TGT not defined
pad_id = tokenizer.pad_token_id

# Use identical width for both sides to avoid shape mismatch
enc_ref  = tokenizer(
    test_ref, padding='max_length', truncation=True, max_length=MAX_TGT_EVAL, return_tensors="np"
)
enc_pred = tokenizer(
    test_pred, padding='max_length', truncation=True, max_length=MAX_TGT_EVAL, return_tensors="np"
)

# Defensive: enforce same width (in case you change one side later)
L = min(enc_ref["input_ids"].shape[1], enc_pred["input_ids"].shape[1])
y_true = enc_ref["input_ids"][:, :L].reshape(-1)
y_pred = enc_pred["input_ids"][:, :L].reshape(-1)

# Ignore PAD positions in the reference
keep = (y_true != pad_id)
y_true_m = y_true[keep]
y_pred_m = y_pred[keep]

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import sacrebleu, pandas as pd
overall = {
    "bleu": sacrebleu.corpus_bleu(test_pred, [test_ref]).score,
    "tok_acc": accuracy_score(y_true_m, y_pred_m),
    "tok_f1_micro": f1_score(y_true_m, y_pred_m, average="micro", zero_division=0),
    "tok_f1_macro": f1_score(y_true_m, y_pred_m, average="macro", zero_division=0),
    "tok_precision_macro": precision_score(y_true_m, y_pred_m, average="macro", zero_division=0),
    "tok_recall_macro": recall_score(y_true_m, y_pred_m, average="macro", zero_division=0),
}
display(pd.DataFrame([overall]).round(4))

# 3) Save artifacts
EVAL_DIR = RUN_DIR / "eval"
EVAL_DIR.mkdir(parents=True, exist_ok=True)
pd.DataFrame({"cn": test_src, "ref_en": test_ref, "pred_en": test_pred}).to_csv(EVAL_DIR / "test_preds.csv", index=False)
pd.DataFrame([overall]).to_csv(EVAL_DIR / "test_classification_metrics.csv", index=False)
print("Saved:", EVAL_DIR / "test_preds.csv")
print("Saved:", EVAL_DIR / "test_classification_metrics.csv")


### Evaluate & Testing

In [None]:
# === Cell 10 — Evaluate on VALID & TEST + sample translations (BLEU/chrF) ===
from transformers import AutoModelForSeq2SeqLM
import json

# 1) Load best saved model if present, else use the current model in memory
best_dir = RUN_DIR / "best_model"
eval_model = None
try:
    if best_dir.exists():
        eval_model = AutoModelForSeq2SeqLM.from_pretrained(best_dir).to(DEVICE)
        print("Loaded best model from:", best_dir)
except Exception as e:
    print("Could not load best model from disk, using in-memory model. Reason:", e)

if eval_model is None:
    eval_model = model  # fallback to the model you just trained

# 2) Translate helper (batched) with beam search
@torch.no_grad()
def translate_marian(texts, beams=4, max_new_tokens=128, bs=64):
    outs = []
    eval_model.eval()
    for i in range(0, len(texts), bs):
        batch = texts[i:i+bs]
        enc = tokenizer(batch, return_tensors="pt", padding=True,
                        truncation=True, max_length=MAX_SRC).to(DEVICE)
        gen = eval_model.generate(
            **enc,
            num_beams=beams,
            max_new_tokens=max_new_tokens,
            no_repeat_ngram_size=3,
            length_penalty=1.0,
        )
        outs += tokenizer.batch_decode(gen, skip_special_tokens=True)
    return outs

# 3) Evaluate on VALID
valid_pred = translate_marian(valid_df["cn"].tolist(), beams=4, max_new_tokens=128, bs=48)
valid_bleu = sacrebleu.corpus_bleu(valid_pred, [valid_df["en"].tolist()]).score
valid_chrf = sacrebleu.corpus_chrf(valid_pred, [valid_df["en"].tolist()]).score

# 4) Evaluate on TEST
test_pred  = translate_marian(test_df["cn"].tolist(),  beams=4, max_new_tokens=128, bs=48)
test_bleu  = sacrebleu.corpus_bleu(test_pred,  [test_df["en"].tolist()]).score
test_chrf  = sacrebleu.corpus_chrf(test_pred,  [test_df["en"].tolist()]).score

print(f"VALID  | BLEU {valid_bleu:.2f}  chrF {valid_chrf:.2f}")
print(f"TEST   | BLEU {test_bleu:.2f}  chrF {test_chrf:.2f}")

# 5) Show a small sample table for the report
display(pd.DataFrame({
    "CN": valid_df["cn"].head(10).tolist(),
    "REF_EN": valid_df["en"].head(10).tolist(),
    "PRED_EN": valid_pred[:10]
}))

# 6) Save metrics and sample predictions
metrics = {
    "valid_bleu": float(valid_bleu),
    "valid_chrf": float(valid_chrf),
    "test_bleu": float(test_bleu),
    "test_chrf": float(test_chrf),
}
(RUN_DIR / "eval").mkdir(parents=True, exist_ok=True)
(RUN_DIR / "eval" / "metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8")
pd.DataFrame({
    "cn": valid_df["cn"],
    "ref_en": valid_df["en"],
    "pred_en": valid_pred
}).to_csv(RUN_DIR / "eval" / "valid_preds.csv", index=False)
pd.DataFrame({
    "cn": test_df["cn"],
    "ref_en": test_df["en"],
    "pred_en": test_pred
}).to_csv(RUN_DIR / "eval" / "test_preds.csv", index=False)
print("Saved metrics & preds to:", RUN_DIR / "eval")


In [None]:
# === Cell 11 — Plot training curves (BLEU/chrF over epochs) & save artifacts (no LR) ===
import pandas as pd
import matplotlib.pyplot as plt

logs = getattr(trainer.state, "log_history", None)
if not logs:
    print("No log history found on this Trainer instance.")
else:
    df_logs = pd.DataFrame(logs)

    # Keep rows that contain any eval metric we care about
    wanted = ["eval_bleu", "eval_chrf", "eval_loss"]
    mask = False
    for c in wanted:
        if c in df_logs.columns:
            mask = mask | df_logs[c].notna()
    if mask is False:
        print("No evaluation metrics recorded.")
        display(df_logs.tail(15))
    else:
        cols = ["epoch"] + [c for c in wanted if c in df_logs.columns]
        eval_df = df_logs.loc[mask, cols].reset_index(drop=True)

        # Drop any columns that are all-NaN (e.g., if chrF isn’t logged)
        eval_df = eval_df.dropna(axis=1, how="all")

        if len(eval_df) == 0 or "epoch" not in eval_df.columns:
            print("No evaluation metrics recorded.")
            display(df_logs.tail(15))
        else:
            display(eval_df)

            # Plot BLEU/chrF if present; otherwise plot eval_loss
            plt.figure()
            plotted = False
            if "eval_bleu" in eval_df.columns:
                plt.plot(eval_df["epoch"], eval_df["eval_bleu"], marker="o", label="BLEU (valid)")
                plotted = True
            if "eval_chrf" in eval_df.columns:
                plt.plot(eval_df["epoch"], eval_df["eval_chrf"], marker="o", label="chrF (valid)")
                plotted = True
            if not plotted and "eval_loss" in eval_df.columns:
                plt.plot(eval_df["epoch"], eval_df["eval_loss"], marker="o", label="Loss (valid)")

            plt.xlabel("Epoch")
            plt.ylabel("Score")
            plt.title("Validation metrics over training")
            plt.legend()
            plt.grid(True, linestyle="--", alpha=0.4)

            (RUN_DIR / "eval").mkdir(parents=True, exist_ok=True)
            fig_path = RUN_DIR / "eval" / "training_curves.png"
            plt.savefig(fig_path, bbox_inches="tight")
            plt.show()
            print("Saved plot to:", fig_path)

            # Save metrics table
            csv_path = RUN_DIR / "eval" / "training_metrics.csv"
            eval_df.to_csv(csv_path, index=False)
            print("Saved metrics CSV to:", csv_path)

In [None]:
# # === UI Cell — Easy translator (Gradio) ===
# # If you don't have Gradio yet, uncomment the next line:
# !pip install -q gradio

# import time, torch
# import gradio as gr
# from pathlib import Path
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# # ---- Paths & device ----
# BEST_DIR = RUN_DIR / "best_model" if 'RUN_DIR' in globals() else Path("marian_zh_en_scratch_run/best_model")
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# # ---- Load tokenizer (prefer your saved one; else Marian base) ----
# TOK_NAME = str(BEST_DIR) if BEST_DIR.exists() else "Helsinki-NLP/opus-mt-zh-en"
# tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

# # ---- Get / load model once ----
# _loaded_model = None
# def get_model():
#     global _loaded_model
#     if _loaded_model is not None:
#         return _loaded_model
#     # 1) Prefer your saved best model
#     if BEST_DIR.exists():
#         try:
#             _loaded_model = AutoModelForSeq2SeqLM.from_pretrained(BEST_DIR).to(DEVICE).eval()
#             return _loaded_model
#         except Exception as e:
#             print("Warning: could not load best_model from disk:", e)
#     # 2) If you trained a model in this session, reuse it
#     if 'model' in globals():
#         _loaded_model = model.to(DEVICE).eval()
#         return _loaded_model
#     # 3) Fallback to base Marian (not fine-tuned)
#     _loaded_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en").to(DEVICE).eval()
#     return _loaded_model

# # ---- Translator function for Gradio ----
# @torch.no_grad()
# def translate_ui(text, beams, max_new_tokens, no_repeat_ngram, length_penalty):
#     text = (text or "").strip()
#     if not text:
#         return "", "Please enter Chinese text."

#     mdl = get_model()
#     t0 = time.time()

#     enc = tokenizer([text], return_tensors="pt", padding=True, truncation=True,
#                     max_length=globals().get("MAX_SRC", 128)).to(DEVICE)

#     gen = mdl.generate(
#         **enc,
#         num_beams=int(beams),
#         max_new_tokens=int(max_new_tokens),
#         no_repeat_ngram_size=int(no_repeat_ngram),
#         length_penalty=float(length_penalty),
#     )
#     out = tokenizer.batch_decode(gen, skip_special_tokens=True)[0]
#     dt = time.time() - t0
#     info = f"Device: {DEVICE} • Beam: {beams} • Max tokens: {max_new_tokens} • Time: {dt*1000:.0f} ms"
#     return out, info

# # ---- Build UI ----
# with gr.Blocks(title="CN → EN Translator") as demo:
#     gr.Markdown("## 🇨🇳 ➜ 🇬🇧 CN → EN Translator (Marian)\n"
#                 "- Uses your **best_model** if found\n"
#                 "- GPU auto-used if available\n"
#                 "- Tweak beam size & max tokens as needed")

#     with gr.Row():
#         with gr.Column():
#             inp = gr.Textbox(label="Chinese input", lines=5, placeholder="输入中文句子…")
#             beams = gr.Slider(1, 8, value=4, step=1, label="Beam size")
#             max_new = gr.Slider(16, 256, value=128, step=8, label="Max new tokens")
#             no_rep = gr.Slider(0, 5, value=3, step=1, label="No-repeat n-gram size")
#             lp = gr.Slider(-1.0, 2.0, value=1.0, step=0.1, label="Length penalty")
#             btn = gr.Button("Translate 🚀")
#         with gr.Column():
#             out = gr.Textbox(label="English translation", lines=5)
#             meta = gr.Markdown()

#     btn.click(fn=translate_ui, inputs=[inp, beams, max_new, no_rep, lp], outputs=[out, meta])

# demo.launch(share=True)
