# Ko→En QE (CPU, VS Code, local, `- b.txt` pairing)
- `X.txt`(ko) ↔ `X - b.txt`(en)
- COMET Kiwi(gated) → public wmt21 automatic fallback
- CPU-only execution

In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## 0) (Optional) Install

In [2]:
# %pip install pandas==2.2.2 pyarrow>=14,<20 scikit-learn==1.5.2 sentencepiece evaluate==0.4.2
# %pip install transformers==4.44.2 datasets==2.21.0 accelerate==0.34.2 unbabel-comet==2.2.3
# %pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cpu
print("Hello World!")

Hello World!


## 1) Paths

In [3]:
import os, re, glob, json
DATA_DIR = "./data"
OUT_DIR  = "./qe_runs"
os.makedirs(OUT_DIR, exist_ok=True)
print("DATA_DIR =", os.path.abspath(DATA_DIR))
print("OUT_DIR  =", os.path.abspath(OUT_DIR))


DATA_DIR = c:\Users\pockg\OneDrive\Desktop\Hojumoney\DEV\v14\making_model\data
OUT_DIR  = c:\Users\pockg\OneDrive\Desktop\Hojumoney\DEV\v14\making_model\qe_runs


## 2) Find pairs (`X.txt` ↔ `X - b.txt`)

In [4]:
def find_pairs(data_dir):
    txts = glob.glob(os.path.join(data_dir, "*.txt"))
    base_map = {}
    for p in txts:
        name = os.path.basename(p)
        m = re.match(r"^(.*)\.txt$", name)
        if not m: 
            continue
        stem = m.group(1)
        if stem.endswith(" b"):
            continue
        ko_path = p
        en_path = os.path.join(data_dir, f"{stem} b.txt")
        if os.path.exists(en_path):
            base_map[stem] = (ko_path, en_path)
    return base_map

pairs = find_pairs(DATA_DIR)
print("Found pairs:", len(pairs))
for k, v in list(pairs.items())[:10]:
    print(k, "->", v)


Found pairs: 21
Accommodation - Easy -> ('./data\\Accommodation - Easy.txt', './data\\Accommodation - Easy b.txt')
Accommodation - Hard -> ('./data\\Accommodation - Hard.txt', './data\\Accommodation - Hard b.txt')
Accommodation - Medium -> ('./data\\Accommodation - Medium.txt', './data\\Accommodation - Medium b.txt')
Bank - Easy -> ('./data\\Bank - Easy.txt', './data\\Bank - Easy b.txt')
Bank - Hard -> ('./data\\Bank - Hard.txt', './data\\Bank - Hard b.txt')
Bank - Medium -> ('./data\\Bank - Medium.txt', './data\\Bank - Medium b.txt')
Culture - Easy -> ('./data\\Culture - Easy.txt', './data\\Culture - Easy b.txt')
Culture - Hard -> ('./data\\Culture - Hard.txt', './data\\Culture - Hard b.txt')
Culture - Medium -> ('./data\\Culture - Medium.txt', './data\\Culture - Medium b.txt')
Developer - Easy -> ('./data\\Developer - Easy.txt', './data\\Developer - Easy b.txt')


## 3) Load & align

In [5]:
def load_lines(path):
    with open(path, "r", encoding="utf-8-sig") as f:
        lines = [ln.strip() for ln in f.readlines()]
    return lines

def build_examples(pair_map):
    rows = []
    for stem, (ko_path, en_path) in pair_map.items():
        ko_lines = load_lines(ko_path)
        en_lines = load_lines(en_path)
        n = min(len(ko_lines), len(en_lines))
        if len(ko_lines) != len(en_lines):
            print(f"[WARN] mismatch '{stem}': ko={len(ko_lines)} en={len(en_lines)} -> {n}")
        for i in range(n):
            ko = ko_lines[i]; en = en_lines[i]
            if not ko or not en: 
                continue
            rows.append({"file": stem, "idx": i, "ko": ko, "en": en})
    return rows

raw_rows = build_examples(pairs)
print("Total aligned rows:", len(raw_rows))
raw_rows[:3]


Total aligned rows: 21000


[{'file': 'Accommodation - Easy',
  'idx': 0,
  'ko': '게스트하우스에 냉장고가 있나요?',
  'en': 'Is there a refrigerator in the guesthouse?'},
 {'file': 'Accommodation - Easy',
  'idx': 1,
  'ko': '레지던스에서 짐 보관 가능할까요?',
  'en': "Do you think it'll be possible to keep your things in Regance?"},
 {'file': 'Accommodation - Easy',
  'idx': 2,
  'ko': '모텔까지 셔틀로 얼마나 걸리나요?',
  'en': 'How long will it take to get to the motel?'}]

## 4) COMET QE scoring (CPU) — Kiwi→wmt21 fallback

In [7]:
# === COMET scoring (robust fallback, no internal API) ===
import torch, pandas as pd
from comet import download_model, load_from_checkpoint

# 1) Candidates: Open QE first → (last) Kiwi (Gated: HF login/approval required)
CANDIDATES = [
    "wmt20-comet-qe-da", # ✅ Open QE (requires src+mt only)
    "wmt21-comet-qe-da", #PublicQE (not depending on environment)
    "wmt22-cometkiwi-da", # 🔒 gated (Hugging Face login + Access consent required)
]

selected_id, qe_model = None, None
last_err = None

for mid in CANDIDATES:
    try:
        print(f"[COMET] Trying: {mid}")
        mpath = download_model(mid)
        model = load_from_checkpoint(mpath)
        model.eval()
        selected_id, qe_model = mid, model
        print(f"[COMET] Loaded: {mid}")
        break
    except Exception as e:
        print(f"[COMET] Fail: {mid} -> {e}")
        last_err = e

if qe_model is None:
    raise RuntimeError(
        "I haven't received the COMET model available. Please copy the error log above and send it to me."
        "To write Kiwi, you need access consent from the model page after 'huggingface-cli login'."
    ) from last_err

is_qe = ("qe" in selected_id) or ("kiwi" in selected_id)

# 2) Placement scoring (QE: src+mt/DA: src+mt+ref required)
batch_size = 32
scores = []
for i in range(0, len(raw_rows), batch_size):
    batch = raw_rows[i:i+batch_size]
    if is_qe:
        data = [{"src": r["ko"], "mt": r["en"]} for r in batch]
    else:
        # DA models require a reference (ref). If there is no ref, the meaning falls.
        data = [{"src": r["ko"], "mt": r["en"], "ref": r["en"]} for r in batch]

    with torch.no_grad():
        out = qe_model.predict(data, batch_size=min(16, len(data)), gpus=0)  # CPU

    for s in out["scores"]:
        # Rescale COMET score (-1..1) to 0..100
        score_0_100 = float(max(0.0, min(1.0, (s + 1) / 2.0))) * 100.0
        scores.append(score_0_100)

df = pd.DataFrame(raw_rows)
df["score"] = scores
display({"model": selected_id})
display(df.head()); display(df["score"].describe())

  from .autonotebook import tqdm as notebook_tqdm


[COMET] Trying: wmt20-comet-qe-da


wmt20-comet-qe-da is already in cache.
Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\pockg\.cache\torch\unbabel_comet\wmt20-comet-qe-da\checkpoints\model.ckpt`
Encoder model frozen.
c:\Users\pockg\OneDrive\Desktop\Hojumoney\DEV\v14\.venv\Lib\site-packages\pytorch_lightning\core\saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


[COMET] Loaded: wmt20-comet-qe-da


Predicting DataLoader 0: 100%|██████████| 2/2 [00:02<00:00,  1.39s/it]
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs

{'model': 'wmt20-comet-qe-da'}

Unnamed: 0,file,idx,ko,en,score
0,Accommodation - Easy,0,게스트하우스에 냉장고가 있나요?,Is there a refrigerator in the guesthouse?,97.72217
1,Accommodation - Easy,1,레지던스에서 짐 보관 가능할까요?,Do you think it'll be possible to keep your th...,50.002044
2,Accommodation - Easy,2,모텔까지 셔틀로 얼마나 걸리나요?,How long will it take to get to the motel?,50.592725
3,Accommodation - Easy,3,패밀리룸은 있나요?,Is there a family room?,85.914856
4,Accommodation - Easy,4,조식은 포함되어 있나요?,Do you include the eclipse?,50.002301


count    21000.000000
mean        62.736878
std         13.393798
min         50.002017
25%         50.003369
50%         59.942552
75%         70.215014
max         99.010512
Name: score, dtype: float64

## 5) Split & save parquet

In [8]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
train_path = os.path.join(OUT_DIR, "train.parquet")
val_path   = os.path.join(OUT_DIR, "val.parquet")
train_df.to_parquet(train_path, index=False)
val_df.to_parquet(val_path, index=False)
print(train_path, val_path)


./qe_runs\train.parquet ./qe_runs\val.parquet


## 6) Tokenize (XLM-R)

In [None]:
# === XLM-R QE (Regression) — load parquet → tokenize → train ===
import os, numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)

# 0) Paths
OUT_DIR = globals().get("OUT_DIR", "./qe_runs")
os.makedirs(OUT_DIR, exist_ok=True)
train_path = os.path.join(OUT_DIR, "train.parquet")
val_path   = os.path.join(OUT_DIR, "val.parquet")

# 1) Load data (expects columns: 'ko','en','score' at minimum)
data = load_dataset(
    "parquet",
    data_files={"train": train_path, "validation": val_path},
)

# 2) Build input text: [KO] ... [EN] ...
def build_text(batch):
    ko = batch.get("ko", [""]*len(batch[next(iter(batch))]))
    en = batch.get("en", [""]*len(batch[next(iter(batch))]))
    return {"text": [f"[KO] {k} [EN] {e}" for k, e in zip(ko, en)]}

data = data.map(build_text, batched=True, desc="Build input text [KO]/[EN]")

# 3) Scale labels: score 50~100 → labels 0~1
def map_labels(batch):
    if "labels" in batch:
        return {}
    s = np.array(batch["score"], dtype=float)
    return {"labels": ((s - 50.0) / 50.0).astype("float32")}

data = data.map(map_labels, batched=True, desc="Scale labels 50~100 → 0~1")

# 4) Tokenizer / Model
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=1)
model.config.problem_type = "regression"

# 5) Tokenize
def tok(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=256,
    )

data = data.map(tok, batched=True, remove_columns=[c for c in data["train"].column_names if c not in ("labels") and c not in ("input_ids","attention_mask")],
                desc="Tokenize")

# 6) Metrics
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = preds.reshape(-1)
    labels = np.array(labels).reshape(-1)
    mae = float(np.mean(np.abs(preds - labels)))
    rmse = float(np.sqrt(np.mean((preds - labels) ** 2)))
    pearson = 0.0 if (preds.std() == 0 or labels.std() == 0) else float(np.corrcoef(preds, labels)[0, 1])
    return {"mae": mae, "rmse": rmse, "pearson": pearson}

# 7) Training args (CPU-friendly)
args = TrainingArguments(
    output_dir=os.path.join(OUT_DIR, "xlmr_qe"),
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    max_grad_norm=1.0,
    gradient_accumulation_steps=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
    greater_is_better=True,
    dataloader_num_workers=4,
    dataloader_pin_memory=False,
    remove_unused_columns=False,
    report_to="none",
    seed=42,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=5e-4)],
)

trainer.train()
save_dir = os.path.join(OUT_DIR, "xlmr_qe", "best")
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print("Saved to:", save_dir)


PyTorch version 2.8.0 available.
Generating train split: 18900 examples [00:00, 1244796.03 examples/s]
Generating validation split: 2100 examples [00:00, 906736.50 examples/s]
Build input text [KO]/[EN]: 100%|██████████| 18900/18900 [00:00<00:00, 189979.00 examples/s]
Build input text [KO]/[EN]: 100%|██████████| 2100/2100 [00:00<00:00, 59000.31 examples/s]
Scale labels 50~100 → 0~1: 100%|██████████| 18900/18900 [00:00<00:00, 571464.02 examples/s]
Scale labels 50~100 → 0~1: 100%|██████████| 2100/2100 [00:00<00:00, 97641.43 examples/s]
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Tokenize: 100%|██████████| 18900/18900 [00:00<00:00, 37156.58 examples/s]
Tokenize: 100%|███

{'loss': 0.095, 'grad_norm': 2.3956806659698486, 'learning_rate': 2.347417840375587e-06, 'epoch': 0.04}


  3%|▎         | 100/3543 [08:05<4:15:50,  4.46s/it]

{'loss': 0.0811, 'grad_norm': 1.6544015407562256, 'learning_rate': 4.694835680751174e-06, 'epoch': 0.08}


  4%|▍         | 150/3543 [11:55<4:17:50,  4.56s/it]

{'loss': 0.0734, 'grad_norm': 4.621305465698242, 'learning_rate': 7.042253521126761e-06, 'epoch': 0.13}


  6%|▌         | 200/3543 [15:39<4:10:52,  4.50s/it]

{'loss': 0.0578, 'grad_norm': 3.0482685565948486, 'learning_rate': 9.389671361502349e-06, 'epoch': 0.17}


                                                    
  6%|▌         | 200/3543 [17:39<4:10:52,  4.50s/it]

{'eval_loss': 0.04781968519091606, 'eval_mae': 0.18548400700092316, 'eval_rmse': 0.2186771184206009, 'eval_pearson': 0.6861154986696428, 'eval_runtime': 119.8565, 'eval_samples_per_second': 17.521, 'eval_steps_per_second': 1.101, 'epoch': 0.17}


  7%|▋         | 250/3543 [21:08<3:48:43,  4.17s/it] 

{'loss': 0.0576, 'grad_norm': 3.520082712173462, 'learning_rate': 9.88888888888889e-06, 'epoch': 0.21}


  8%|▊         | 300/3543 [24:51<3:57:45,  4.40s/it]

{'loss': 0.0415, 'grad_norm': 1.2669076919555664, 'learning_rate': 9.73873873873874e-06, 'epoch': 0.25}


 10%|▉         | 350/3543 [28:39<4:17:17,  4.83s/it]

{'loss': 0.0363, 'grad_norm': 3.6409904956817627, 'learning_rate': 9.58858858858859e-06, 'epoch': 0.3}


 11%|█▏        | 400/3543 [32:29<3:50:03,  4.39s/it]

{'loss': 0.0376, 'grad_norm': 3.2664783000946045, 'learning_rate': 9.43843843843844e-06, 'epoch': 0.34}


                                                    
 11%|█▏        | 400/3543 [34:38<3:50:03,  4.39s/it]

{'eval_loss': 0.018399447202682495, 'eval_mae': 0.10373105108737946, 'eval_rmse': 0.1356445550918579, 'eval_pearson': 0.8832194164251226, 'eval_runtime': 129.1496, 'eval_samples_per_second': 16.26, 'eval_steps_per_second': 1.022, 'epoch': 0.34}


 13%|█▎        | 450/3543 [38:38<4:00:15,  4.66s/it] 

{'loss': 0.0247, 'grad_norm': 1.5395036935806274, 'learning_rate': 9.288288288288288e-06, 'epoch': 0.38}


 14%|█▍        | 500/3543 [42:25<3:31:52,  4.18s/it]

{'loss': 0.026, 'grad_norm': 2.1179592609405518, 'learning_rate': 9.138138138138138e-06, 'epoch': 0.42}


 16%|█▌        | 550/3543 [46:14<3:53:48,  4.69s/it]

{'loss': 0.0311, 'grad_norm': 2.19636607170105, 'learning_rate': 8.987987987987988e-06, 'epoch': 0.47}


 17%|█▋        | 600/3543 [50:04<3:46:16,  4.61s/it]

{'loss': 0.0266, 'grad_norm': 1.6811832189559937, 'learning_rate': 8.837837837837839e-06, 'epoch': 0.51}


                                                    
 17%|█▋        | 600/3543 [52:15<3:46:16,  4.61s/it]

{'eval_loss': 0.034030571579933167, 'eval_mae': 0.13901157677173615, 'eval_rmse': 0.18447376787662506, 'eval_pearson': 0.895992017099653, 'eval_runtime': 131.7775, 'eval_samples_per_second': 15.936, 'eval_steps_per_second': 1.002, 'epoch': 0.51}


 18%|█▊        | 650/3543 [56:08<3:52:12,  4.82s/it] 

{'loss': 0.0248, 'grad_norm': 1.6538894176483154, 'learning_rate': 8.687687687687689e-06, 'epoch': 0.55}


 20%|█▉        | 700/3543 [1:00:02<3:53:05,  4.92s/it]

{'loss': 0.0203, 'grad_norm': 0.8361111283302307, 'learning_rate': 8.537537537537537e-06, 'epoch': 0.59}


 21%|██        | 750/3543 [1:03:51<3:23:29,  4.37s/it]

{'loss': 0.0192, 'grad_norm': 0.817671000957489, 'learning_rate': 8.387387387387388e-06, 'epoch': 0.63}


 23%|██▎       | 800/3543 [1:07:40<3:34:11,  4.69s/it]

{'loss': 0.0183, 'grad_norm': 1.0593206882476807, 'learning_rate': 8.237237237237238e-06, 'epoch': 0.68}


                                                      
 23%|██▎       | 800/3543 [1:09:48<3:34:11,  4.69s/it]

{'eval_loss': 0.017383605241775513, 'eval_mae': 0.10142093896865845, 'eval_rmse': 0.13184690475463867, 'eval_pearson': 0.9345008108938058, 'eval_runtime': 128.7268, 'eval_samples_per_second': 16.314, 'eval_steps_per_second': 1.025, 'epoch': 0.68}


 24%|██▍       | 850/3543 [1:13:41<3:15:02,  4.35s/it] 

{'loss': 0.0171, 'grad_norm': 0.7423282265663147, 'learning_rate': 8.087087087087088e-06, 'epoch': 0.72}


 25%|██▌       | 900/3543 [1:17:32<3:23:20,  4.62s/it]

{'loss': 0.0187, 'grad_norm': 1.1725208759307861, 'learning_rate': 7.936936936936938e-06, 'epoch': 0.76}


 27%|██▋       | 950/3543 [1:21:26<3:29:27,  4.85s/it]

{'loss': 0.0166, 'grad_norm': 1.181476354598999, 'learning_rate': 7.786786786786787e-06, 'epoch': 0.8}


 28%|██▊       | 1000/3543 [1:25:13<3:25:29,  4.85s/it]

{'loss': 0.0188, 'grad_norm': 2.0987889766693115, 'learning_rate': 7.636636636636637e-06, 'epoch': 0.85}


                                                       
 28%|██▊       | 1000/3543 [1:27:25<3:25:29,  4.85s/it]

{'eval_loss': 0.010971881449222565, 'eval_mae': 0.08027192950248718, 'eval_rmse': 0.1047467514872551, 'eval_pearson': 0.9528286154055957, 'eval_runtime': 131.7514, 'eval_samples_per_second': 15.939, 'eval_steps_per_second': 1.002, 'epoch': 0.85}


 30%|██▉       | 1050/3543 [1:31:13<3:03:39,  4.42s/it] 

{'loss': 0.0163, 'grad_norm': 0.8718230128288269, 'learning_rate': 7.486486486486487e-06, 'epoch': 0.89}


 31%|███       | 1100/3543 [1:35:00<3:01:35,  4.46s/it]

{'loss': 0.0157, 'grad_norm': 1.427654504776001, 'learning_rate': 7.336336336336337e-06, 'epoch': 0.93}


 32%|███▏      | 1150/3543 [1:38:52<3:08:29,  4.73s/it]

{'loss': 0.0134, 'grad_norm': 1.0077760219573975, 'learning_rate': 7.186186186186187e-06, 'epoch': 0.97}


 34%|███▍      | 1200/3543 [1:43:00<3:00:31,  4.62s/it]

{'loss': 0.0127, 'grad_norm': 1.094111680984497, 'learning_rate': 7.036036036036037e-06, 'epoch': 1.02}


                                                       
 34%|███▍      | 1200/3543 [1:45:11<3:00:31,  4.62s/it]

{'eval_loss': 0.009300928562879562, 'eval_mae': 0.07240773737430573, 'eval_rmse': 0.09644132107496262, 'eval_pearson': 0.9596085670380995, 'eval_runtime': 130.8066, 'eval_samples_per_second': 16.054, 'eval_steps_per_second': 1.009, 'epoch': 1.02}


 35%|███▌      | 1250/3543 [1:49:04<2:52:38,  4.52s/it] 

{'loss': 0.012, 'grad_norm': 0.7971405386924744, 'learning_rate': 6.885885885885887e-06, 'epoch': 1.06}


 37%|███▋      | 1300/3543 [1:52:55<2:44:09,  4.39s/it]

{'loss': 0.0113, 'grad_norm': 0.6860129237174988, 'learning_rate': 6.7357357357357365e-06, 'epoch': 1.1}


 38%|███▊      | 1350/3543 [1:56:44<2:51:03,  4.68s/it]

{'loss': 0.012, 'grad_norm': 1.4688844680786133, 'learning_rate': 6.585585585585587e-06, 'epoch': 1.14}


 40%|███▉      | 1400/3543 [2:00:26<2:31:07,  4.23s/it]

{'loss': 0.0114, 'grad_norm': 1.4276487827301025, 'learning_rate': 6.435435435435436e-06, 'epoch': 1.18}


                                                       
 40%|███▉      | 1400/3543 [2:02:37<2:31:07,  4.23s/it]

{'eval_loss': 0.008649259805679321, 'eval_mae': 0.06961597502231598, 'eval_rmse': 0.09300139546394348, 'eval_pearson': 0.9641413977788554, 'eval_runtime': 130.9329, 'eval_samples_per_second': 16.039, 'eval_steps_per_second': 1.008, 'epoch': 1.18}


 41%|████      | 1450/3543 [2:06:30<2:39:43,  4.58s/it] 

{'loss': 0.0108, 'grad_norm': 0.5514762997627258, 'learning_rate': 6.2852852852852854e-06, 'epoch': 1.23}


 42%|████▏     | 1500/3543 [2:10:22<2:49:41,  4.98s/it]

{'loss': 0.0115, 'grad_norm': 1.0069580078125, 'learning_rate': 6.135135135135135e-06, 'epoch': 1.27}


 44%|████▎     | 1550/3543 [2:13:54<2:07:20,  3.83s/it]

{'loss': 0.0107, 'grad_norm': 1.4993482828140259, 'learning_rate': 5.984984984984985e-06, 'epoch': 1.31}


 45%|████▌     | 1600/3543 [2:17:33<2:32:57,  4.72s/it]

{'loss': 0.01, 'grad_norm': 0.7552378177642822, 'learning_rate': 5.834834834834835e-06, 'epoch': 1.35}


                                                       
 45%|████▌     | 1600/3543 [2:19:44<2:32:57,  4.72s/it]

{'eval_loss': 0.021309854462742805, 'eval_mae': 0.11346408724784851, 'eval_rmse': 0.14597895741462708, 'eval_pearson': 0.9518952534478657, 'eval_runtime': 130.786, 'eval_samples_per_second': 16.057, 'eval_steps_per_second': 1.009, 'epoch': 1.35}


 47%|████▋     | 1650/3543 [2:23:31<2:19:49,  4.43s/it] 

{'loss': 0.0108, 'grad_norm': 1.9768646955490112, 'learning_rate': 5.6846846846846846e-06, 'epoch': 1.4}


 48%|████▊     | 1700/3543 [2:27:26<2:32:43,  4.97s/it]

{'loss': 0.0105, 'grad_norm': 0.5105684995651245, 'learning_rate': 5.534534534534535e-06, 'epoch': 1.44}


 49%|████▉     | 1750/3543 [2:31:16<2:21:36,  4.74s/it]

{'loss': 0.009, 'grad_norm': 0.8321210145950317, 'learning_rate': 5.384384384384385e-06, 'epoch': 1.48}


 51%|█████     | 1800/3543 [2:35:03<2:07:41,  4.40s/it]

{'loss': 0.0085, 'grad_norm': 0.7189128398895264, 'learning_rate': 5.234234234234234e-06, 'epoch': 1.52}


                                                       
 51%|█████     | 1800/3543 [2:37:11<2:07:41,  4.40s/it]

{'eval_loss': 0.009978754445910454, 'eval_mae': 0.07524481415748596, 'eval_rmse': 0.099893718957901, 'eval_pearson': 0.9650074219085265, 'eval_runtime': 128.0936, 'eval_samples_per_second': 16.394, 'eval_steps_per_second': 1.03, 'epoch': 1.52}


 52%|█████▏    | 1850/3543 [2:41:09<2:15:39,  4.81s/it] 

{'loss': 0.0098, 'grad_norm': 1.7148815393447876, 'learning_rate': 5.0840840840840846e-06, 'epoch': 1.57}


 54%|█████▎    | 1900/3543 [2:45:03<2:02:54,  4.49s/it]

{'loss': 0.0093, 'grad_norm': 1.3266218900680542, 'learning_rate': 4.933933933933934e-06, 'epoch': 1.61}


 55%|█████▌    | 1950/3543 [2:48:48<1:54:27,  4.31s/it]

{'loss': 0.0091, 'grad_norm': 0.5103517770767212, 'learning_rate': 4.783783783783784e-06, 'epoch': 1.65}


 56%|█████▋    | 2000/3543 [2:52:38<1:50:20,  4.29s/it]

{'loss': 0.0093, 'grad_norm': 1.212234616279602, 'learning_rate': 4.633633633633634e-06, 'epoch': 1.69}


                                                       
 56%|█████▋    | 2000/3543 [2:54:49<1:50:20,  4.29s/it]

{'eval_loss': 0.016960345208644867, 'eval_mae': 0.10657697170972824, 'eval_rmse': 0.13023188710212708, 'eval_pearson': 0.9667412213509629, 'eval_runtime': 131.3079, 'eval_samples_per_second': 15.993, 'eval_steps_per_second': 1.005, 'epoch': 1.69}


 58%|█████▊    | 2050/3543 [2:58:43<1:53:04,  4.54s/it] 

{'loss': 0.0088, 'grad_norm': 0.786963939666748, 'learning_rate': 4.483483483483484e-06, 'epoch': 1.74}


 59%|█████▉    | 2100/3543 [3:02:31<1:50:40,  4.60s/it]

{'loss': 0.0085, 'grad_norm': 0.9937750101089478, 'learning_rate': 4.333333333333334e-06, 'epoch': 1.78}


 61%|██████    | 2150/3543 [3:06:16<1:51:19,  4.79s/it]

{'loss': 0.0076, 'grad_norm': 0.7112743854522705, 'learning_rate': 4.183183183183184e-06, 'epoch': 1.82}


 62%|██████▏   | 2200/3543 [3:10:07<1:35:47,  4.28s/it]

{'loss': 0.0094, 'grad_norm': 0.80937659740448, 'learning_rate': 4.0330330330330335e-06, 'epoch': 1.86}


                                                       
 62%|██████▏   | 2200/3543 [3:12:14<1:35:47,  4.28s/it]

{'eval_loss': 0.006472351960837841, 'eval_mae': 0.060498230159282684, 'eval_rmse': 0.08045092970132828, 'eval_pearson': 0.975255235490526, 'eval_runtime': 127.1927, 'eval_samples_per_second': 16.51, 'eval_steps_per_second': 1.038, 'epoch': 1.86}


 64%|██████▎   | 2250/3543 [3:16:12<1:30:51,  4.22s/it] 

{'loss': 0.0079, 'grad_norm': 1.3113926649093628, 'learning_rate': 3.882882882882883e-06, 'epoch': 1.9}


 65%|██████▍   | 2300/3543 [3:20:01<1:34:03,  4.54s/it]

{'loss': 0.0077, 'grad_norm': 0.7021466493606567, 'learning_rate': 3.732732732732733e-06, 'epoch': 1.95}


 66%|██████▋   | 2350/3543 [3:23:48<1:31:04,  4.58s/it]

{'loss': 0.0085, 'grad_norm': 0.9811916947364807, 'learning_rate': 3.582582582582583e-06, 'epoch': 1.99}


 68%|██████▊   | 2400/3543 [3:27:55<1:26:34,  4.54s/it]

{'loss': 0.0086, 'grad_norm': 1.0223369598388672, 'learning_rate': 3.4324324324324326e-06, 'epoch': 2.03}


                                                       
 68%|██████▊   | 2400/3543 [3:30:05<1:26:34,  4.54s/it]

{'eval_loss': 0.012783190235495567, 'eval_mae': 0.0887199342250824, 'eval_rmse': 0.1130627691745758, 'eval_pearson': 0.9776746732642234, 'eval_runtime': 129.5102, 'eval_samples_per_second': 16.215, 'eval_steps_per_second': 1.019, 'epoch': 2.03}


 69%|██████▉   | 2450/3543 [3:33:59<1:23:16,  4.57s/it] 

{'loss': 0.008, 'grad_norm': 1.8441613912582397, 'learning_rate': 3.2822822822822824e-06, 'epoch': 2.07}


 71%|███████   | 2500/3543 [3:37:48<1:20:46,  4.65s/it]

{'loss': 0.0071, 'grad_norm': 0.8917406797409058, 'learning_rate': 3.132132132132132e-06, 'epoch': 2.12}


 72%|███████▏  | 2550/3543 [3:41:40<1:16:43,  4.64s/it]

{'loss': 0.0071, 'grad_norm': 0.5281912684440613, 'learning_rate': 2.9819819819819824e-06, 'epoch': 2.16}


 73%|███████▎  | 2600/3543 [3:45:29<1:11:33,  4.55s/it]

{'loss': 0.0076, 'grad_norm': 1.212501883506775, 'learning_rate': 2.831831831831832e-06, 'epoch': 2.2}


                                                       
 73%|███████▎  | 2600/3543 [3:47:40<1:11:33,  4.55s/it]

{'eval_loss': 0.00932096317410469, 'eval_mae': 0.07593948394060135, 'eval_rmse': 0.09654513746500015, 'eval_pearson': 0.9748809476119462, 'eval_runtime': 131.1703, 'eval_samples_per_second': 16.01, 'eval_steps_per_second': 1.006, 'epoch': 2.2}


 75%|███████▍  | 2650/3543 [3:51:32<1:11:42,  4.82s/it] 

{'loss': 0.007, 'grad_norm': 1.14625883102417, 'learning_rate': 2.681681681681682e-06, 'epoch': 2.24}


 76%|███████▌  | 2700/3543 [3:55:18<1:01:56,  4.41s/it]

{'loss': 0.0071, 'grad_norm': 0.8392284512519836, 'learning_rate': 2.5315315315315318e-06, 'epoch': 2.29}


 78%|███████▊  | 2750/3543 [3:59:11<1:00:16,  4.56s/it]

{'loss': 0.0069, 'grad_norm': 0.6873724460601807, 'learning_rate': 2.3813813813813815e-06, 'epoch': 2.33}


 79%|███████▉  | 2800/3543 [4:02:54<53:20,  4.31s/it]  

{'loss': 0.0064, 'grad_norm': 0.4049142301082611, 'learning_rate': 2.2312312312312313e-06, 'epoch': 2.37}


                                                     
 79%|███████▉  | 2800/3543 [4:05:05<53:20,  4.31s/it]

{'eval_loss': 0.007206866052001715, 'eval_mae': 0.06610659509897232, 'eval_rmse': 0.08489326387643814, 'eval_pearson': 0.9772918190721258, 'eval_runtime': 131.7249, 'eval_samples_per_second': 15.942, 'eval_steps_per_second': 1.002, 'epoch': 2.37}


 80%|████████  | 2850/3543 [4:08:58<51:10,  4.43s/it]  

{'loss': 0.0069, 'grad_norm': 0.36788976192474365, 'learning_rate': 2.0810810810810815e-06, 'epoch': 2.41}


 82%|████████▏ | 2900/3543 [4:12:52<48:34,  4.53s/it]

{'loss': 0.0068, 'grad_norm': 1.181626796722412, 'learning_rate': 1.930930930930931e-06, 'epoch': 2.45}


 83%|████████▎ | 2950/3543 [4:16:45<45:39,  4.62s/it]

{'loss': 0.0065, 'grad_norm': 0.5558580756187439, 'learning_rate': 1.780780780780781e-06, 'epoch': 2.5}


 85%|████████▍ | 3000/3543 [4:20:38<43:00,  4.75s/it]

{'loss': 0.0063, 'grad_norm': 0.5732165575027466, 'learning_rate': 1.6306306306306307e-06, 'epoch': 2.54}


                                                     
 85%|████████▍ | 3000/3543 [4:22:47<43:00,  4.75s/it]

{'eval_loss': 0.009687596932053566, 'eval_mae': 0.07655856013298035, 'eval_rmse': 0.09842558950185776, 'eval_pearson': 0.9800601100238653, 'eval_runtime': 129.2129, 'eval_samples_per_second': 16.252, 'eval_steps_per_second': 1.022, 'epoch': 2.54}


 86%|████████▌ | 3050/3543 [4:26:38<37:44,  4.59s/it]  

{'loss': 0.0069, 'grad_norm': 0.8735827207565308, 'learning_rate': 1.4804804804804807e-06, 'epoch': 2.58}


 87%|████████▋ | 3100/3543 [4:30:28<34:16,  4.64s/it]

{'loss': 0.0065, 'grad_norm': 0.658913791179657, 'learning_rate': 1.3303303303303305e-06, 'epoch': 2.62}


 89%|████████▉ | 3150/3543 [4:34:17<28:34,  4.36s/it]

{'loss': 0.0069, 'grad_norm': 1.5011385679244995, 'learning_rate': 1.1801801801801803e-06, 'epoch': 2.67}


 90%|█████████ | 3200/3543 [4:38:04<25:37,  4.48s/it]

{'loss': 0.0065, 'grad_norm': 0.8874229788780212, 'learning_rate': 1.03003003003003e-06, 'epoch': 2.71}


                                                     
 90%|█████████ | 3200/3543 [4:40:13<25:37,  4.48s/it]

{'eval_loss': 0.007519801612943411, 'eval_mae': 0.06723403930664062, 'eval_rmse': 0.08671678602695465, 'eval_pearson': 0.9798776999825821, 'eval_runtime': 129.2208, 'eval_samples_per_second': 16.251, 'eval_steps_per_second': 1.022, 'epoch': 2.71}


 92%|█████████▏| 3250/3543 [4:44:04<22:59,  4.71s/it]  

{'loss': 0.007, 'grad_norm': 0.6422489285469055, 'learning_rate': 8.798798798798799e-07, 'epoch': 2.75}


 93%|█████████▎| 3300/3543 [4:47:59<18:27,  4.56s/it]

{'loss': 0.0058, 'grad_norm': 0.4724055230617523, 'learning_rate': 7.297297297297298e-07, 'epoch': 2.79}


 95%|█████████▍| 3350/3543 [4:51:49<15:54,  4.94s/it]

{'loss': 0.0064, 'grad_norm': 0.8149694204330444, 'learning_rate': 5.795795795795796e-07, 'epoch': 2.84}


 96%|█████████▌| 3400/3543 [4:55:33<11:04,  4.65s/it]

{'loss': 0.0068, 'grad_norm': 0.4528793692588806, 'learning_rate': 4.294294294294295e-07, 'epoch': 2.88}


                                                     
 96%|█████████▌| 3400/3543 [4:57:42<11:04,  4.65s/it]

{'eval_loss': 0.009799043647944927, 'eval_mae': 0.077755406498909, 'eval_rmse': 0.09899011999368668, 'eval_pearson': 0.9791246978950741, 'eval_runtime': 128.6702, 'eval_samples_per_second': 16.321, 'eval_steps_per_second': 1.026, 'epoch': 2.88}


 97%|█████████▋| 3450/3543 [5:01:44<07:26,  4.80s/it]  

{'loss': 0.0061, 'grad_norm': 0.4236988127231598, 'learning_rate': 2.792792792792793e-07, 'epoch': 2.92}


 99%|█████████▉| 3500/3543 [5:05:36<03:31,  4.91s/it]

{'loss': 0.0061, 'grad_norm': 0.2775803506374359, 'learning_rate': 1.2912912912912912e-07, 'epoch': 2.96}


100%|██████████| 3543/3543 [5:09:00<00:00,  5.23s/it]


{'train_runtime': 18540.3684, 'train_samples_per_second': 3.058, 'train_steps_per_second': 0.191, 'train_loss': 0.016756799727081075, 'epoch': 3.0}
Saved to: ./qe_runs\xlmr_qe\best


## 7) Train XLM-R regression (CPU)

In [None]:
import os
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)

# 모델/토크나이저 (이미 로드돼 있으면 이 두 줄은 생략해도 됨)
tokenizer = tokenizer if "tokenizer" in globals() else AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=1,
)

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    # HF가 (logits,) 형태로 줄 때를 대비
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = preds.reshape(-1)
    labels = np.array(labels).reshape(-1)
    mae = float(np.mean(np.abs(preds - labels)))
    rmse = float(np.sqrt(np.mean((preds - labels) ** 2)))
    # 분산 0 방지
    pearson = 0.0 if (preds.std() == 0 or labels.std() == 0) else float(np.corrcoef(preds, labels)[0, 1])
    return {"mae": mae, "rmse": rmse, "pearson": pearson}

args = TrainingArguments(
    output_dir=os.path.join(OUT_DIR, "xlmr_qe"),
    # 안정화 & 속도 관련 핵심 튜닝
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    max_grad_norm=1.0,
    gradient_accumulation_steps=2,

    # 배치/에폭
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,

    # 로깅/평가/체크포인트
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
    greater_is_better=True,

    # CPU 환경 가정 (GPU 없으면 pin_memory 경고 회피)
    dataloader_num_workers=4,
    dataloader_pin_memory=False,

    # 기타
    report_to="none",
    seed=42,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=5e-4)],
)

# 빠른 맛보기만 하고 싶으면 아래 주석 해제
# trainer.train(resume_from_checkpoint=False, max_steps=400)

trainer.train()
trainer.save_model(os.path.join(OUT_DIR, "xlmr_qe/best"))
tokenizer.save_pretrained(os.path.join(OUT_DIR, "xlmr_qe/best"))


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'OUT_DIR' is not defined

## 8) Inference & batch scoring

In [None]:
import torch, csv
from transformers import AutoTokenizer, AutoModelForSequenceClassification

CKPT = os.path.join(OUT_DIR, "xlmr_qe/best")
tok = AutoTokenizer.from_pretrained(CKPT)
mdl = AutoModelForSequenceClassification.from_pretrained(CKPT)
mdl.eval()

def score_translation(ko, en):
    text = f"{ko} </s> {en}"
    enc = tok(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    with torch.no_grad():
        out = mdl(**enc).logits.squeeze().item()
    return float(max(0.0, min(100.0, out)))

def score_file_pair(ko_path, en_path, out_csv):
    def load_lines(path):
        with open(path, "r", encoding="utf-8-sig") as f:
            lines = [ln.strip() for ln in f.readlines()]
        return lines
    ko_lines = load_lines(ko_path)
    en_lines = load_lines(en_path)
    n = min(len(ko_lines), len(en_lines))
    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["idx","ko","en","score"])
        for i in range(n):
            s = score_translation(ko_lines[i], en_lines[i])
            w.writerow([i, ko_lines[i], en_lines[i], f"{s:.2f}"])

print (score_translation ("Please fix the code now", "Please fix the code now"))
# Example of use:
# stem, (ko_p, en_p) = next(iter(pairs.items()))
# out_csv = os.path.join(OUT_DIR, f"{stem}_scores.csv")
# score_file_pair(ko_p, en_p, out_csv)
# print("Saved:", out_csv)


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'os' is not defined