In [18]:
import os
import argparse
import random
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import GroupShuffleSplit
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)


In [None]:
parser = argparse.ArgumentParser()
parser.add_argument("--csv", default="dataset.csv")
parser.add_argument("--model", default="xlm-roberta-base")
parser.add_argument("--epochs", type=int, default=3)
parser.add_argument("--batch_size", type=int, default=8)
parser.add_argument("--lr", type=float, default=1e-5)
parser.add_argument("--max_length", type=int, default=256)
parser.add_argument("--out_dir", default="./qe-model")
parser.add_argument("--seed", type=int, default=42)

args, _ = parser.parse_known_args()
args

Namespace(csv='dataset.csv', model='xlm-roberta-base', epochs=3, batch_size=8, lr=1e-05, max_length=256, out_dir='./qe-model', seed=42)

In [None]:
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def load_df(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    needed = {"SOURCE", "TARGET", "SCORE"}
    if not needed.issubset(df.columns):
        raise ValueError(f"dataset.csv에는 {needed} 컬럼이 필요함")
    df = df[["SOURCE", "TARGET", "SCORE"]].dropna().reset_index(drop=True)
    return df

def split_by_source(df, test_size=0.1, seed=42):
    gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
    groups = df["SOURCE"].values
    idx_train, idx_valid = next(gss.split(df, groups=groups))
    return (
        df.iloc[idx_train].reset_index(drop=True),
        df.iloc[idx_valid].reset_index(drop=True),
    )

set_seed(args.seed)
print(f"[INFO] loading csv: {args.csv}")
df = load_df(args.csv)

print(f"[INFO] rows: {len(df)}")
df_train, df_valid = split_by_source(df)
print(f"[INFO] train: {len(df_train)}, valid: {len(df_valid)}")

tokenizer = AutoTokenizer.from_pretrained(args.model)
print('tokenizer loaded.')

[INFO] loading csv: dataset.csv
[INFO] rows: 50
[INFO] train: 45, valid: 5
tokenizer loaded.


In [24]:
class QEDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_length=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        src, tgt = str(row["SOURCE"]), str(row["TARGET"])
        label = np.clip(float(row["SCORE"]) / 100.0, 0.01, 0.99)
        text = f"ko: {src} en: {tgt}"
        enc = self.tokenizer(
            text, max_length=self.max_length, truncation=True
        )
        enc["labels"] = torch.tensor(label, dtype=torch.float)
        return enc

model = AutoModelForSequenceClassification.from_pretrained(
    args.model, num_labels=1, problem_type="regression"
)
print(f"[INFO] Loaded tokenizer and model '{args.model}' for QE regression task.")

train_ds = QEDataset(df_train, tokenizer, args.max_length)
valid_ds = QEDataset(df_valid, tokenizer, args.max_length)
print(f"[INFO] Dataset ready: {len(train_ds)} training samples, {len(valid_ds)} validation samples.")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"[INFO] Using device: {device}")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[INFO] Loaded tokenizer and model 'xlm-roberta-base' for QE regression task.
[INFO] Dataset ready: 45 training samples, 5 validation samples.
[INFO] Using device: cpu


In [25]:
def compute_metrics_builder():
    def compute_metrics(eval_pred):
        preds, labels = eval_pred
        preds = preds.reshape(-1)
        preds = np.clip(preds, 0.0, 1.0) * 100
        labels = np.clip(labels, 0.0, 1.0) * 100
        mae = float(np.mean(np.abs(preds - labels)))
        rmse = float(np.sqrt(np.mean((preds - labels) ** 2)))
        pearson = (
            float(np.corrcoef(preds, labels)[0, 1])
            if np.std(preds) and np.std(labels)
            else 0.0
        )
        return {"mae": mae, "rmse": rmse, "pearson": pearson}

    return compute_metrics


training_args = TrainingArguments(
    output_dir=args.out_dir,
    learning_rate=args.lr,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size * 2,
    num_train_epochs=args.epochs,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="mae",
    greater_is_better=False,
    logging_steps=100,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_builder(),
)

print("[INFO] start training ...")
trainer.train()
print("[INFO] training done")


  trainer = Trainer(


[INFO] start training ...




Epoch,Training Loss,Validation Loss,Mae,Rmse,Pearson
1,No log,0.152909,33.995209,39.10358,0.691513


[INFO] training done


In [None]:
# -----------------------
# Quick sanity check
# -----------------------
ex = df.sample(5, random_state=args.seed)
model.eval()
with torch.no_grad():
    for _, row in ex.iterrows():
        text = f"ko: {row['SOURCE']} en: {row['TARGET']}"
        enc = tokenizer(text, return_tensors="pt", truncation=True).to(device)
        out = model(**enc)
        pred = torch.sigmoid(out.logits).squeeze().item()
        pred = np.clip(pred, 0.0, 1.0) * 100
        print("----")
        print("SRC :", row['SOURCE'])
        print("TGT :", row['TARGET'])
        print("GT  :", row['SCORE'])
        print("PRED:", round(pred, 2))

print(f"[INFO] best model saved under: {args.out_dir}")

----
SRC : 나는 클라이언트 피드백을 반영해서 다시 올리겠습니다.
TGT : I'd like to receive a copy of my bank statement in English.
GT  : 0
PRED: 56.52
----
SRC : 그녀는 음식을 포장해 달라고 했어요.
TGT : Can I get a refund?
GT  : 0
PRED: 56.22
----
SRC : 그는 주로 집에서 일해.
TGT : When is your flight?
GT  : 0
PRED: 56.27
----
SRC : 너는 팀장님께 먼저 여쭤봤니?
TGT : He turned on the navigation.
GT  : 0
PRED: 56.0
----
SRC : 나는 예산안을 다시 검토해야 합니다.
TGT : Are you free this week?
GT  : 100
PRED: 56.24
[INFO] best model saved under: ./qe-test
