In [1]:
# 충돌나는 패키지들을 먼저 제거
!pip -q uninstall -y transformers datasets accelerate numpy

# NumPy 1.x 기준으로 호환 버전 설치
!pip -q install "numpy<2.0" transformers==4.44.2 datasets==2.19.0 \
                accelerate==0.30.1 sentencepiece==0.2.0 \
                sacrebleu==2.4.0 rouge-score==0.1.2

  Preparing metadata (setup.py) ... [?25l[?25hdone
[33m  DEPRECATION: Building 'rouge-score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge-score'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 2.0.8 requires opencv-python-headless>=4.9.0.80, which is not installed.
albucore 0.0.24 requires opencv-python-headless>=4.9.0.80, which is not installed.
spacy 3.8.7 requires thinc<8.4.0,>=8.3.4, which

In [5]:
# 0) (선택) 지금 충돌 유발 가능성이 있는 패키지들 확인
!pip list | egrep -i "numpy|opencv|albume|spacy|thinc|gcsfs|fsspec|jax|jaxlib|dopamine" || true


albumentations                        2.0.8
dopamine_rl                           4.1.2
fsspec                                2024.3.1
gcsfs                                 2025.3.0
jax                                   0.5.3
jax-cuda12-pjrt                       0.5.3
jax-cuda12-plugin                     0.5.3
jaxlib                                0.5.3
numpy                                 1.26.4
spacy                                 3.8.7
spacy-legacy                          3.0.12
spacy-loggers                         1.0.5


In [6]:
# 1) 우리 작업에 불필요하고 충돌을 유발하는 패키지들 제거
!pip -q uninstall -y opencv-python opencv-python-headless opencv-contrib-python \
  albumentations albucore spacy thinc gcsfs dopamine-rl jax jaxlib || true


[0m

In [7]:
# 2) 우리가 쓸 스택과 충돌할 수 있는 것들 제거
!pip -q uninstall -y transformers datasets accelerate numpy fsspec || true


In [8]:
# 3) 설치 기반 정리(캐시/업그레이드)
!pip -q install --no-cache-dir -U pip setuptools wheel


In [9]:
# 4) 먼저 핵심 하위 의존성부터 고정 (numpy, fsspec)
!pip -q install --no-cache-dir "numpy==1.26.4" "fsspec==2024.3.1"


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchtune 0.6.1 requires datasets, which is not installed.
orbax-checkpoint 0.11.24 requires jax>=0.5.0, which is not installed.
bigframes 2.21.0 requires gcsfs!=2025.5.0,>=2023.3.0, which is not installed.
flax 0.10.6 requires jax>=0.5.1, which is not installed.
optax 0.2.6 requires jax>=0.5.3, which is not installed.
optax 0.2.6 requires jaxlib>=0.5.3, which is not installed.
chex 0.1.90 requires jax>=0.4.27, which is not installed.
chex 0.1.90 requires jaxlib>=0.4.27, which is not installed.
fastai 2.8.4 requires spacy<4, which is not installed.
sentence-transformers 5.1.0 requires transformers<5.0.0,>=4.41.0, which is not installed.
peft 0.17.1 requires accelerate>=0.21.0, which is not installed.
peft 0.17.1 requires transformers, which is not installed.[0m[31m
[0m

In [10]:
# 5) 번역 파이프라인 패키지 설치 (서로 호환 검증된 조합)
!pip -q install --no-cache-dir transformers==4.44.2 datasets==2.19.0 \
  accelerate==0.30.1 sentencepiece==0.2.0 sacrebleu==2.4.0 rouge-score==0.1.2


In [11]:
# 6) 버전 확인
import numpy, transformers, datasets, accelerate, sentencepiece, sacrebleu, rouge_score, fsspec
print("numpy:", numpy.__version__)
print("transformers:", transformers.__version__)
print("datasets:", datasets.__version__)
print("accelerate:", accelerate.__version__)
print("sentencepiece:", sentencepiece.__version__)
print("sacrebleu:", sacrebleu.__version__)
print("rouge-score:", rouge_score.__version__)
print("fsspec:", fsspec.__version__)


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [1]:

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# 방법 B: 드라이브의 폴더에 복사해 두었다면 경로만 지정
train_path = "/content/drive/MyDrive/train.jsonl"
test_path  = "/content/drive/MyDrive/test.jsonl"


In [6]:
from datasets import load_dataset

ds = load_dataset("json", data_files={"train": train_path, "validation": test_path})
ds


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tgt', 'src'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tgt', 'src'],
        num_rows: 300
    })
})

In [7]:
from transformers import AutoTokenizer

MODEL_NAME = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

prefix = "fix: "
max_input_len = 96
max_target_len = 96

def preprocess(batch):
    inputs = [prefix + x for x in batch["src"]]
    model_inputs = tokenizer(inputs, max_length=max_input_len, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["tgt"], max_length=max_target_len, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)
tokenized


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Map:   0%|          | 0/300 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})

In [9]:
!pip install -q evaluate

In [10]:
import numpy as np
import evaluate

bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
    bleu_score = bleu.compute(predictions=pred_str, references=[[l] for l in label_str])["score"]
    rouge_l = rouge.compute(predictions=pred_str, references=label_str)["rougeL"]
    return {"sacrebleu": bleu_score, "rougeL": rouge_l}


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [3]:
# accelerate를 호환 버전으로 업그레이드
!pip -q install -U accelerate==0.33.0
# (문제 지속 시 최신 소폭 상향)
# !pip -q install -U accelerate==0.34.2


In [10]:
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, AutoTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainerCallback
from datasets import load_dataset
import numpy as np
import evaluate
import os

bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
    bleu_score = bleu.compute(predictions=pred_str, references=[[l] for l in label_str])["score"]
    rouge_l = rouge.compute(predictions=pred_str, references=label_str)["rougeL"]
    return {"sacrebleu": bleu_score, "rougeL": rouge_l}


MODEL_NAME = "google/mt5-small" # Define MODEL_NAME here
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Load and preprocess the dataset
train_path = "/content/drive/MyDrive/train.jsonl"
test_path  = "/content/drive/MyDrive/test.jsonl"

ds = load_dataset("json", data_files={"train": train_path, "validation": test_path})

prefix = "fix: "
max_input_len = 96
max_target_len = 96

def preprocess(batch):
    inputs = [prefix + x for x in batch["src"]]
    model_inputs = tokenizer(inputs, max_length=max_input_len, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["tgt"], max_length=max_target_len, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)


model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Explicitly make model parameters contiguous
for param in model.parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()


training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/context_trans_ckpt",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,              # 소량 데이터 → 약간 높게
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,              # 3~5 사이에서 시작 권장
    weight_decay=0.01,
    warmup_ratio=0.03,                # 추가
    label_smoothing_factor=0.1,       # 추가
    logging_steps=50,
    predict_with_generate=True,
    generation_max_length=128,
    fp16=torch.cuda.is_available(),
    save_total_limit=2,
    report_to="none",
)

# Removed the custom SaveStateDictCallback as we are trying to fix the contiguity issue before training

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    compute_metrics=compute_metrics,
    # Removed callbacks=[SaveStateDictCallback()]
)

trainer.train()



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Map:   0%|          | 0/300 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Sacrebleu,Rougel
1,0.0,,0.0,0.0
2,0.0,,0.0,0.0
3,0.0,,0.0,0.0
4,0.0,,0.0,0.0


TrainOutput(global_step=252, training_loss=0.0, metrics={'train_runtime': 200.6178, 'train_samples_per_second': 19.938, 'train_steps_per_second': 1.256, 'total_flos': 113895988101120.0, 'train_loss': 0.0, 'epoch': 4.0})

In [11]:
#모델저
save_dir = "/content/drive/MyDrive/context_trans_model_v1"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print("Saved to", save_dir)


Saved to /content/drive/MyDrive/context_trans_model_v1


In [13]:
#추론(테스트 셋 몇개 확)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = "cuda" if torch.cuda.is_available() else "cpu"
tok = AutoTokenizer.from_pretrained(save_dir)
mdl = AutoModelForSeq2SeqLM.from_pretrained(save_dir).to(device)

def rewrite(text, num_beams=4, max_len=96):
    inp = tok.encode("fix: " + text, return_tensors="pt", truncation=True, max_length=96).to(device)
    out = mdl.generate(inp, num_beams=num_beams, max_length=max_len, no_repeat_ngram_size=3)
    return tok.decode(out[0], skip_special_tokens=True)

# 테스트셋 상위 5개 비교 출력
raw_valid = ds["validation"]
for i in range(5):
    s, t = raw_valid[i]["src"], raw_valid[i]["tgt"]
    pred = rewrite(s)
    print(f"[{i}]")
    print("SRC :", s)
    print("PRED:", pred)
    print("TGT :", t)
    print("-"*60)

[0]
SRC : 퇴근 후 마트에 가서약 찾자. 어디에서 볼까?
PRED: <extra_id_0>.
TGT : 퇴근 후 마트에 가서 약 찾자. 어디에서 볼까?
------------------------------------------------------------
[1]
SRC : 버스가좀 늦어. 그리고아침에 영화 카페에서 보자.
PRED: <extra_id_0>.
TGT : 버스가 좀 늦어. 그리고 아침에 카페에서 영화 보자.
------------------------------------------------------------
[2]
SRC : 내일 은행에 갈 건데 올래? 길 안 막히겠지? 수빈도
PRED: <extra_id_0>
TGT : 내일 은행에 갈 건데 수빈도 올래? 길 안 막히겠지?
------------------------------------------------------------
[3]
SRC : 수업 끝나고 공원에서 잠깐 이야기하자.
PRED: <extra_id_0>
TGT : 수업 끝나고 공원에서 잠깐 이야기하자.
------------------------------------------------------------
[4]
SRC : 다음 민수랑 장 주집에서 보자. 도움 필요해?
PRED: <extra_id_0>
TGT : 다음 주 집에서 민수랑 장 보자. 도움 필요해?
------------------------------------------------------------


In [14]:
#테스트셋 로드
from datasets import load_dataset

ds_test = load_dataset("json", data_files={"test": test_path})["test"]
print(ds_test)
print("Examples:", ds_test[:2])


Generating test split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['tgt', 'src'],
    num_rows: 300
})
Examples: {'tgt': ['퇴근 후 마트에 가서 약 찾자. 어디에서 볼까?', '버스가 좀 늦어. 그리고 아침에 카페에서 영화 보자.'], 'src': ['퇴근 후 마트에 가서약 찾자. 어디에서 볼까?', '버스가좀 늦어. 그리고아침에 영화 카페에서 보자.']}


In [24]:
#모델/토크나이저 로드&배치추론
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from tqdm.auto import tqdm
from datasets import load_dataset # Import load_dataset

prefix = "fix: " # Define prefix here
max_gen_len = 96 # Define max_gen_len here
num_beams = 4 # Define num_beams here
batch_size = 16 # Define batch_size here
save_dir = "/content/drive/MyDrive/context_trans_model_v1" # Define save_dir here

device = "cuda" if torch.cuda.is_available() else "cpu" # Define device here

# Load the test dataset
test_path  = "/content/drive/MyDrive/test.jsonl" # Define test_path
ds_test = load_dataset("json", data_files={"test": test_path})["test"] # Define ds_test


tok = AutoTokenizer.from_pretrained(save_dir, use_fast=True)
mdl = AutoModelForSeq2SeqLM.from_pretrained(save_dir).to(device)
mdl.eval()

def generate_batch_texts(batch_src):
    inputs = tok([prefix + s for s in batch_src],
                 return_tensors="pt",
                 padding=True,
                 truncation=True,
                 max_length=max_gen_len).to(device)
    with torch.no_grad():
        out = mdl.generate(**inputs,
                           num_beams=num_beams,
                           max_length=max_gen_len,
                           no_repeat_ngram_size=3)
    return tok.batch_decode(out, skip_special_tokens=True)

preds = []
for i in tqdm(range(0, len(ds_test), batch_size)):
    batch = ds_test[i:i+batch_size]
    batch_src = batch["src"]
    batch_pred = generate_batch_texts(batch_src)
    preds.extend(batch_pred)

refs = ds_test["tgt"]
assert len(preds) == len(refs)
print("Sample\nSRC:", ds_test[0]["src"], "\nPRED:", preds[0], "\nTGT:", refs[0])



  0%|          | 0/19 [00:00<?, ?it/s]

Sample
SRC: 퇴근 후 마트에 가서약 찾자. 어디에서 볼까? 
PRED: <extra_id_0>. 
TGT: 퇴근 후 마트에 가서 약 찾자. 어디에서 볼까?


In [25]:
#3) 표준 번역 지표 (SacreBLEU / chrF++ / ROUGE-L / TER)
import evaluate

metric_bleu = evaluate.load("sacrebleu")
metric_chrf = evaluate.load("chrf")
metric_rouge = evaluate.load("rouge")
metric_ter  = evaluate.load("ter")

# BLEU/TER은 ref를 리스트의 리스트로 받습니다.
bleu = metric_bleu.compute(predictions=preds, references=[[r] for r in refs])["score"]
chrf = metric_chrf.compute(predictions=preds, references=refs)["score"]
rougeL = metric_rouge.compute(predictions=preds, references=refs)["rougeL"]
ter = metric_ter.compute(predictions=preds, references=refs)["score"]

print(f"SacreBLEU : {bleu:.2f}")
print(f"chrF++    : {chrf:.2f}")
print(f"ROUGE-L   : {rougeL:.4f}")
print(f"TER       : {ter:.2f} (낮을수록 좋음)")


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

SacreBLEU : 0.06
chrF++    : 0.76
ROUGE-L   : 0.0000
TER       : 99.87 (낮을수록 좋음)


In [26]:
#추가 진단 지표 (정확일치/길이/다양도)
import numpy as np

# 정확히 동일한 문장 비율(엄격)
exact_match = np.mean([int(p.strip()==r.strip()) for p, r in zip(preds, refs)])

# 길이 비율(예측/정답), 길이가 너무 늘어나거나 줄어드는지 확인
len_pred = np.array([len(p) for p in preds])
len_ref  = np.array([len(r) for r in refs])
len_ratio = (len_pred / np.maximum(1, len_ref)).mean()

# distinct-1/2 (uni/bi-gram의 고유 비율 → 반복/복붙 경향 확인)
def distinct_ngram_ratio(texts, n=1):
    total = 0
    uniq = 0
    for t in texts:
        tokens = t.split()
        ngrams = [' '.join(tokens[i:i+n]) for i in range(0, max(0, len(tokens)-n+1))]
        total += len(ngrams)
        uniq  += len(set(ngrams))
    return (uniq / total) if total>0 else 0.0

distinct1 = distinct_ngram_ratio(preds, n=1)
distinct2 = distinct_ngram_ratio(preds, n=2)

print(f"Exact Match      : {exact_match*100:.2f}%")
print(f"Length Ratio     : {len_ratio:.3f}  (예측/정답, 1.0에 가까울수록 적절)")
print(f"Distinct-1       : {distinct1:.3f}")
print(f"Distinct-2       : {distinct2:.3f}")


Exact Match      : 0.00%
Length Ratio     : 0.478  (예측/정답, 1.0에 가까울수록 적절)
Distinct-1       : 1.000
Distinct-2       : 1.000


In [27]:
#길이 구간별(Bucket) 성능 분석
import pandas as pd

df = pd.DataFrame({
    "src": ds_test["src"],
    "tgt": refs,
    "pred": preds,
    "len_tgt": [len(r) for r in refs],
})

# 길이에 따라 5개 구간으로 나눔(원하면 10개도 가능)
df["len_bucket"] = pd.qcut(df["len_tgt"], q=5, duplicates="drop")

bucket_rows = []
for bucket, sub in df.groupby("len_bucket"):
    sb_preds = sub["pred"].tolist()
    sb_refs  = sub["tgt"].tolist()
    b_bleu = metric_bleu.compute(predictions=sb_preds, references=[[r] for r in sb_refs])["score"]
    b_chrf = metric_chrf.compute(predictions=sb_preds, references=sb_refs)["score"]
    b_rougeL = metric_rouge.compute(predictions=sb_preds, references=sb_refs)["rougeL"]
    b_ter  = metric_ter.compute(predictions=sb_preds, references=sb_refs)["score"]
    bucket_rows.append([str(bucket), len(sub), b_bleu, b_chrf, b_rougeL, b_ter])

df_bucket = pd.DataFrame(bucket_rows, columns=["len_bucket","n","BLEU","chrF++","ROUGE-L","TER"]).sort_values("len_bucket")
df_bucket


  for bucket, sub in df.groupby("len_bucket"):


Unnamed: 0,len_bucket,n,BLEU,chrF++,ROUGE-L,TER
0,"(14.999, 25.0]",80,0.192376,0.659783,0.0,100.0
1,"(25.0, 27.0]",54,0.216032,0.589266,0.0,100.0
2,"(27.0, 29.0]",52,0.211061,0.528135,0.0,100.0
3,"(29.0, 32.0]",74,0.137967,0.704295,0.0,99.847561
4,"(32.0, 41.0]",40,0.215529,1.404821,0.0,99.456522


In [28]:
# 간단히 "문자 레벨" 차이를 기준으로 한 유사도 점수(0~1) → 낮을수록 차이 큼 (빠른 예비 확인용)
import difflib

def quick_similarity(a, b):
    return difflib.SequenceMatcher(None, a, b).ratio()

df["sim"] = [quick_similarity(p, r) for p, r in zip(df["pred"], df["tgt"])]
df_bad = df.sort_values("sim").head(20)[["src","pred","tgt","sim","len_tgt"]]
df_bad.reset_index(drop=True, inplace=True)
df_bad.head(10)


Unnamed: 0,src,pred,tgt,sim,len_tgt
0,네일회사에 갈 건데지민도 올래? 준비 다 됐어?,<extra_id_0>,내일 회사에 갈 건데 지민도 올래? 준비 다 됐어?,0.0,28
1,점심에 마트에 갈 건데 지아도 올래? 너는 어떻게생각해?,<extra_id_0>,점심에 마트에 갈 건데 지아도 올래? 너는 어떻게 생각해?,0.0,32
2,"도윤야, 점심에 집에서 보자. 준비 다 됐어?",<extra_id_0>,"도윤야, 점심에 집에서 보자. 준비 다 됐어?",0.0,25
3,이번 주말 영수증 찾자. 준비 다 회사에 가서 됐어?,<extra_id_0>,이번 주말 회사에 가서 영수증 찾자. 준비 다 됐어?,0.0,29
4,퇴근 후 회사에서 지민랑 영화 보자. 길 안 막히겠지?,<extra_id_0>,퇴근 후 회사에서 지민랑 영화 보자. 길 안 막히겠지?,0.0,30
5,모레 마트에서 예린랑 장 보자. 길 안 막히겠지?,<extra_id_0>,모레 마트에서 예린랑 장 보자. 길 안 막히겠지?,0.0,27
6,수업 끝나고 찾자. 공원에 가서도장 필요한 거 있어?,<extra_id_0>,수업 끝나고 공원에 가서 도장 찾자. 필요한 거 있어?,0.0,30
7,점심에 마트에 갈 건데 서연도 올래? 길 안 막히겠지?,<extra_id_0>,점심에 마트에 갈 건데 서연도 올래? 길 안 막히겠지?,0.0,30
8,오늘 회사에서 예린랑밥 먹자. 예약했어?,<extra_id_0>,오늘 회사에서 예린랑 밥 먹자. 예약했어?,0.0,23
9,업무 모레 지하철역에서하준랑 은행 보자. 시간 괜찬아?,<extra_id_0>,모레 지하철역에서 하준랑 은행 업무 보자. 시간 괜찮아?,0.0,31


In [29]:
out_dir = "/content/drive/MyDrive/context_eval_v1"
import os, json
os.makedirs(out_dir, exist_ok=True)

# 전체 예측 테이블
df.to_csv(f"{out_dir}/predictions.csv", index=False)

# 요약 메트릭 저장
summary = {
    "SacreBLEU": round(bleu, 2),
    "chrF++": round(chrf, 2),
    "ROUGE-L": round(rougeL, 4),
    "TER": round(ter, 2),
    "ExactMatch": round(float(exact_match), 4),
    "LengthRatio": round(float(len_ratio), 4),
    "Distinct1": round(float(distinct1), 4),
    "Distinct2": round(float(distinct2), 4),
}
with open(f"{out_dir}/metrics.json", "w", encoding="utf-8") as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)

# 길이구간 리포트
df_bucket.to_csv(f"{out_dir}/bucket_report.csv", index=False)

print("Saved:\n", out_dir)
summary


Saved:
 /content/drive/MyDrive/context_eval_v1


{'SacreBLEU': 0.06,
 'chrF++': 0.76,
 'ROUGE-L': 0.0,
 'TER': 99.87,
 'ExactMatch': 0.0,
 'LengthRatio': 0.4781,
 'Distinct1': 1.0,
 'Distinct2': 1.0}