In [18]:
import os, math, json, random, re, itertools
from pathlib import Path
from typing import List, Dict, Any, Optional
from dataclasses import dataclass

import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)
import evaluate

In [None]:
PROJECT_ROOT = Path("..").resolve()
CLEAN_DIR    = PROJECT_ROOT / "data" / "clean"

TRAIN_JSONL  = CLEAN_DIR / "train.jsonl"
VAL_JSONL    = CLEAN_DIR / "val.jsonl"
TEST_JSONL   = CLEAN_DIR / "test.jsonl"   

OUTPUT_DIR   = PROJECT_ROOT / "saved_models" / "distilgpt2-npc"

MODEL_NAME        = "distilgpt2"
EPOCHS            = 3
BLOCK_SIZE        = 384      
PER_DEVICE_BSZ    = 2
GRAD_ACCUM        = 2
LEARNING_RATE     = 2e-5
WEIGHT_DECAY      = 0.01
WARMUP_STEPS      = 200
LOG_STEPS         = 50
EVAL_STEPS        = 200
SAVE_STEPS        = 200
SEED              = 42


MAX_NEW_TOKENS    = 64
GENERATION_TOP_P  = 0.9
GENERATION_TOP_K  = 50
GENERATION_TEMP   = 0.8
NO_REPEAT_NGRAM   = 3

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA:", torch.cuda.is_available(), "| Device:", DEVICE)
set_seed(SEED)
torch.backends.cuda.matmul.allow_tf32 = True

CUDA: True | Device: cuda


In [None]:
SPECIAL_TOKENS = {
    "additional_special_tokens": ["<CONTEXT>", "<PLAYER>", "<NPC>", "<END>"]
}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
added = tokenizer.add_special_tokens(SPECIAL_TOKENS)

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
if added > 0:
    model.resize_token_embeddings(len(tokenizer))
model.to(DEVICE)

print(f"Added special tokens: {added} | Vocab size: {len(tokenizer)}")


Added special tokens: 4 | Vocab size: 50261


In [None]:
def load_jsonl(path: Path):
    items = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            items.append(json.loads(line))
    return items

train_items = load_jsonl(TRAIN_JSONL)
val_items   = load_jsonl(VAL_JSONL)
test_items  = load_jsonl(TEST_JSONL) if TEST_JSONL.exists() else []

print(f"JSONL counts -> train={len(train_items)} | val={len(val_items)} | test={len(test_items)}")
assert len(train_items) > 0 and len(val_items) > 0, "Train/Val JSONL must not be empty."


JSONL counts -> train=4609 | val=255 | test=255


In [None]:
from datasets import Dataset

def to_hf_dataset(items):
    rows = []
    for it in items:
        rows.append({
            "prompt": it["prompt"],
            "target": it["target"],             
            "reference": it.get("reference", ""),
            "npc_name": it.get("npc_name", ""),
        })
    return Dataset.from_list(rows)

hf_train = to_hf_dataset(train_items)
hf_val   = to_hf_dataset(val_items)
hf_test  = to_hf_dataset(test_items) if test_items else None

ds = DatasetDict({
    "train": hf_train,
    "validation": hf_val,
    **({"test": hf_test} if hf_test is not None else {}),
})
ds


DatasetDict({
    train: Dataset({
        features: ['prompt', 'target', 'reference', 'npc_name'],
        num_rows: 4609
    })
    validation: Dataset({
        features: ['prompt', 'target', 'reference', 'npc_name'],
        num_rows: 255
    })
    test: Dataset({
        features: ['prompt', 'target', 'reference', 'npc_name'],
        num_rows: 255
    })
})

In [None]:
def build_inputs(examples):
    input_ids_list, attention_list, labels_list = [], [], []
    prompts = examples["prompt"]
    targets = examples["target"]

    for p, t in zip(prompts, targets):
        p_enc = tokenizer(p, add_special_tokens=False)
        t_enc = tokenizer(t, add_special_tokens=False)

        p_ids, p_att = p_enc["input_ids"], p_enc["attention_mask"]
        t_ids, t_att = t_enc["input_ids"], t_enc["attention_mask"]

        if len(p_ids) + len(t_ids) <= BLOCK_SIZE:
            kept_p_ids, kept_p_att = p_ids, p_att
            kept_t_ids, kept_t_att = t_ids, t_att
        else:
            min_target = 8
            max_prompt_len = max(0, BLOCK_SIZE - min_target)
            if len(p_ids) > max_prompt_len:
                kept_p_ids = p_ids[-max_prompt_len:]
                kept_p_att = p_att[-max_prompt_len:]
                kept_t_ids = t_ids[:min_target]
                kept_t_att = t_att[:min_target]
            else:
                room_for_target = BLOCK_SIZE - len(p_ids)
                kept_p_ids, kept_p_att = p_ids, p_att
                kept_t_ids = t_ids[:room_for_target]
                kept_t_att = t_att[:room_for_target]

        ids = kept_p_ids + kept_t_ids
        att = kept_p_att + kept_t_att
        labels = ([-100] * len(kept_p_ids)) + kept_t_ids

        input_ids_list.append(ids)
        attention_list.append(att)
        labels_list.append(labels)

    return {"input_ids": input_ids_list, "attention_mask": attention_list, "labels": labels_list}

tokenized = ds.map(build_inputs, batched=True, remove_columns=ds["train"].column_names)

print(tokenized)
print("Example lengths:",
      len(tokenized["train"][0]["input_ids"]),
      len(tokenized["train"][0]["labels"]))


Map:   0%|          | 0/4609 [00:00<?, ? examples/s]

Map:   0%|          | 0/255 [00:00<?, ? examples/s]

Map:   0%|          | 0/255 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4609
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 255
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 255
    })
})
Example lengths: 68 68


In [None]:
@dataclass
class DataCollatorForCausalSFT:
    tokenizer: AutoTokenizer
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        labels = [f["labels"] for f in features]
        batch_inputs = {
            "input_ids": [f["input_ids"] for f in features],
            "attention_mask": [f["attention_mask"] for f in features],
        }
        batch = self.tokenizer.pad(
            batch_inputs,
            padding=True,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        max_len = batch["input_ids"].shape[1]
        padded_labels = []
        for l in labels:
            if len(l) < max_len:
                l = l + [self.label_pad_token_id] * (max_len - len(l))
            else:
                l = l[:max_len]
            padded_labels.append(l)
        batch["labels"] = torch.tensor(padded_labels, dtype=torch.long)
        return batch

pad_collator = DataCollatorForCausalSFT(
    tokenizer=tokenizer,
    pad_to_multiple_of=8 if torch.cuda.is_available() else None,
    label_pad_token_id=-100,
)

ex = [tokenized["train"][i] for i in range(min(4, len(tokenized["train"])))]
b = pad_collator(ex)
for k, v in b.items():
    print(k, tuple(v.shape))


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids (4, 72)
attention_mask (4, 72)
labels (4, 72)


In [None]:
training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    overwrite_output_dir=True,

    num_train_epochs=EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BSZ,
    per_device_eval_batch_size=PER_DEVICE_BSZ,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,

    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=2,

    logging_strategy="steps",
    logging_steps=LOG_STEPS,

    fp16=torch.cuda.is_available(),  
    fp16_full_eval=False,            
    bf16=False,

    gradient_checkpointing=True,     
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    group_by_length=True,

    save_safetensors=True,
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=pad_collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
)

train_output = trainer.train()
train_output


Step,Training Loss,Validation Loss
200,1.4471,1.093547
400,0.4437,0.665516
600,0.4649,0.441094
800,0.2239,0.404773
1000,0.2105,0.377015
1200,0.0962,0.357304
1400,0.1294,0.347935
1600,0.2104,0.339929
1800,0.0437,0.344236
2000,0.0707,0.349208


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=3459, training_loss=0.36564804210151747, metrics={'train_runtime': 460.111, 'train_samples_per_second': 30.051, 'train_steps_per_second': 7.518, 'total_flos': 224882598739968.0, 'train_loss': 0.36564804210151747, 'epoch': 3.0})

In [27]:
eval_metrics = trainer.evaluate()
eval_loss = eval_metrics.get("eval_loss")
perplexity = math.exp(eval_loss) if (eval_loss is not None and eval_loss < 20) else float("inf")
print({**eval_metrics, "perplexity": perplexity})

{'eval_loss': 0.29438525438308716, 'eval_runtime': 1.9639, 'eval_samples_per_second': 129.841, 'eval_steps_per_second': 65.175, 'epoch': 3.0, 'perplexity': 1.3423009313842171}


In [28]:
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
trainer.save_model(str(OUTPUT_DIR))
tokenizer.save_pretrained(str(OUTPUT_DIR))
print(f"Saved to {OUTPUT_DIR}")

Saved to D:\Game\Backend\saved_models\distilgpt2-npc


In [29]:
rouge = evaluate.load("rouge")
bleu  = evaluate.load("bleu")

def distinct_n(corpus: List[str], n: int = 2) -> float:
    total = 0
    uniq = set()
    for text in corpus:
        toks = text.strip().split()
        if len(toks) < n:
            continue
        grams = list(zip(*[toks[i:] for i in range(n)]))
        total += len(grams)
        uniq.update(grams)
    return (len(uniq) / total) if total else 0.0

def simple_repetition_rate(text: str, window: int = 5) -> float:
    toks = text.split()
    if not toks:
        return 0.0
    reps, total = 0, 0
    for n in range(1, window + 1):
        if len(toks) < n:
            continue
        grams = list(zip(*[toks[i:] for i in range(n)]))
        total += len(grams)
        seen = set()
        for g in grams:
            reps += int(g in seen)
            seen.add(g)
    return reps / total if total else 0.0


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

In [None]:
@torch.no_grad()
def generate_from_prompts(prompts: List[str]) -> List[str]:
    model.eval()
    outs = []
    for i in range(0, len(prompts), 8): 
        batch = prompts[i:i+8]
        enc = tokenizer(batch, return_tensors="pt", padding=True).to(DEVICE)
        gen = model.generate(
            **enc,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            top_p=GENERATION_TOP_P,
            top_k=GENERATION_TOP_K,
            temperature=GENERATION_TEMP,
            no_repeat_ngram_size=NO_REPEAT_NGRAM,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
        for j, seq in enumerate(gen):
            full = tokenizer.decode(seq, skip_special_tokens=False)
            prompt_txt = tokenizer.decode(enc["input_ids"][j], skip_special_tokens=False)
            tail = full[len(prompt_txt):] if full.startswith(prompt_txt) else full
            tail = tail.split("\n<END>")[0].strip()
            outs.append(tail)
    return outs

def strip_tags(s: str) -> str:
    s = s.replace("<END>", "")
    s = re.sub(r"</?[^>]+>", "", s)
    return s.strip()


In [32]:
def evaluate_jsonl(path: Path, max_samples: int = 1000) -> Dict[str, float]:
    if not path.exists():
        print(f"[skip] {path} not found")
        return {}
    with path.open("r", encoding="utf-8") as f:
        items = [json.loads(line) for line in f]
    if not items:
        print(f"[skip] {path} empty")
        return {}

    if len(items) > max_samples:
        random.seed(SEED)
        items = random.sample(items, k=max_samples)

    prompts = [it["prompt"] for it in items]
    refs    = [it["target"].lstrip() for it in items]

    preds = generate_from_prompts(prompts)
    preds_clean = [strip_tags(p) for p in preds]
    refs_clean  = [strip_tags(r) for r in refs]

    rouge_res = rouge.compute(predictions=preds_clean, references=refs_clean)
    bleu_res  = bleu.compute(predictions=preds_clean, references=[[r] for r in refs_clean])

    d1 = distinct_n(preds_clean, 1)
    d2 = distinct_n(preds_clean, 2)
    avg_len = float(np.mean([len(p.split()) for p in preds_clean])) if preds_clean else 0.0
    rep_rate = float(np.mean([simple_repetition_rate(p) for p in preds_clean])) if preds_clean else 0.0

    return {
        "rouge1": rouge_res.get("rouge1", 0.0),
        "rouge2": rouge_res.get("rouge2", 0.0),
        "rougeL": rouge_res.get("rougeL", 0.0),
        "bleu": bleu_res.get("bleu", 0.0),
        "distinct1": d1,
        "distinct2": d2,
        "avg_gen_len": avg_len,
        "repetition_rate": rep_rate,
        "samples_eval": len(preds_clean),
    }

val_gen_metrics  = evaluate_jsonl(VAL_JSONL,  max_samples=1000)
test_gen_metrics = evaluate_jsonl(TEST_JSONL, max_samples=1000)
print("VAL generation metrics:", val_gen_metrics)
print("TEST generation metrics:", test_gen_metrics)


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

VAL generation metrics: {'rouge1': np.float64(0.18992833196080205), 'rouge2': np.float64(0.12974840563654816), 'rougeL': np.float64(0.17688138681439566), 'bleu': 0.08514192771301035, 'distinct1': 0.09169741697416973, 'distinct2': 0.22683042040623524, 'avg_gen_len': 42.509803921568626, 'repetition_rate': 0.04115911108250407, 'samples_eval': 255}
TEST generation metrics: {'rouge1': np.float64(0.195643084823659), 'rouge2': np.float64(0.14085847828871867), 'rougeL': np.float64(0.18454455561326194), 'bleu': 0.09263656757440378, 'distinct1': 0.08961321953876884, 'distinct2': 0.22213960405242122, 'avg_gen_len': 43.1921568627451, 'repetition_rate': 0.03966016412193102, 'samples_eval': 255}


In [33]:
report = {
    "eval_loss": float(eval_loss) if eval_loss is not None else None,
    "perplexity": float(perplexity) if perplexity not in (None, float("inf")) else None,
    "val_generation_metrics": val_gen_metrics,
    "test_generation_metrics": test_gen_metrics,
    "config": {
        "epochs": EPOCHS,
        "block_size": BLOCK_SIZE,
        "per_device_bsz": PER_DEVICE_BSZ,
        "grad_accum": GRAD_ACCUM,
        "learning_rate": LEARNING_RATE,
        "weight_decay": WEIGHT_DECAY,
        "warmup_steps": WARMUP_STEPS,
        "generation": {
            "max_new_tokens": MAX_NEW_TOKENS,
            "top_p": GENERATION_TOP_P,
            "top_k": GENERATION_TOP_K,
            "temperature": GENERATION_TEMP,
            "no_repeat_ngram": NO_REPEAT_NGRAM,
        },
        "special_tokens": SPECIAL_TOKENS["additional_special_tokens"],
    }
}
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
(OUTPUT_DIR / "eval_report.json").write_text(json.dumps(report, indent=2), encoding="utf-8")
print("Wrote:", OUTPUT_DIR / "eval_report.json")

def build_npc_prompt(context: str, player_input: str, npc_name: str) -> str:
    ctx = context.strip() if context else "No major events. Player is on good terms with everyone."
    player = player_input.strip()
    name = npc_name.strip()
    return f"<CONTEXT> {ctx}\n<PLAYER> {player}\n<NPC>({name}) "

sample_prompt = build_npc_prompt(
    "Player apologized to Father Jacob at the church door and was heard by visitors.",
    "Jacky, think the rumor will settle now?",
    "Jacky"
)
print(sample_prompt)
print("---")
print(generate_from_prompts([sample_prompt])[0])


Wrote: D:\Game\Backend\saved_models\distilgpt2-npc\eval_report.json
<CONTEXT> Player apologized to Father Jacob at the church door and was heard by visitors.
<PLAYER> Jacky, think the rumor will settle now?
<NPC>(Jacky) 
---
ive heard about the good news. Keep it steady and we’ll be fine. Keep choosing patience; it bears fruit. Keep coming back. Stay clear of the hedge and keep your word. Stay calm. Stay true to your word and keep choosing patience. Stay healthy. Stay strong. Keep believing that every step
