In [2]:
import os
import math
import random
import numpy as np
import torch

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Paths
TRAIN_TXT = os.path.join('..', 'data', 'game_dialogue_10k_train.txt')
VAL_TXT   = os.path.join('..', 'data', 'game_dialogue_10k_val.txt')
OUTPUT_DIR = os.path.join('..', 'saved_models', 'distilgpt2-finetuned')

# Training config
EPOCHS = 3
BLOCK_SIZE = 256        # fits 6GB VRAM well; raise to 384 if comfortable
BATCH_SIZE = 2          # per device batch size
LR = 2e-5
GRAD_ACCUM = 2          # effective batch = 2 * 2 = 4
FP16 = torch.cuda.is_available()
SEED = 42

# Reproducibility
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


CUDA available: True
GPU: NVIDIA GeForce RTX 3060 Laptop GPU


In [3]:
SPECIAL_TOKENS = {
    "additional_special_tokens": ["<CONTEXT>", "<PLAYER>", "<NPC>", "<END>"]
}

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# GPT-2 has no pad by default; set pad to eos
tokenizer.pad_token = tokenizer.eos_token

# Add our special tokens
num_added = tokenizer.add_special_tokens(SPECIAL_TOKENS)

model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Resize embeddings if we added tokens
if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model loaded on: {device}. Added tokens: {num_added}")


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Model loaded on: cuda. Added tokens: 4


In [4]:
raw = load_dataset(
    "text",
    data_files={"train": TRAIN_TXT, "validation": VAL_TXT}
)
print(raw)
print("Train sample:", raw["train"][0])
print("Val sample:", raw["validation"][0])

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 22404
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1179
    })
})
Train sample: {'text': '<CONTEXT> Someone claimed the well water tasted sweeter today; folks nodded.'}
Val sample: {'text': '<CONTEXT> Player apologized to Tom near the scaffolding; laborers passed the word along. Talk spread about a fox sighting near Sarah’s fields.'}


In [5]:
def tokenize_function(example):
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=BLOCK_SIZE,
        padding="max_length",   # uses tokenizer.pad_token (set to eos)
    )

tokenized = raw.map(
    tokenize_function,
    batched=False,
    remove_columns=["text"]
)

print(tokenized)
print("One tokenized example length:", len(tokenized["train"][0]["input_ids"]))


Map:   0%|          | 0/1179 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 22404
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1179
    })
})
One tokenized example length: 256


In [20]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False   # causal LM
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,

    # core training
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    warmup_steps=200,
    weight_decay=0.01,

    # evaluation & saving (note: eval_strategy, not evaluation_strategy)
    do_eval=True,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,

    # logging
    logging_strategy="steps",
    logging_steps=50,

    # precision
    fp16=FP16,
    bf16=False,  # set True only if your hardware/torch supports it

    # misc
    report_to="none",  
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # memory saver
    gradient_checkpointing=True,
)

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
)

train_output = trainer.train()
train_output


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
200,3.0604,
400,1.2553,
600,0.8477,
800,0.6368,
1000,0.5299,
1200,0.5872,
1400,0.5074,
1600,0.426,
1800,0.4155,
2000,0.4708,


TrainOutput(global_step=16803, training_loss=0.36936671954637795, metrics={'train_runtime': 3263.7297, 'train_samples_per_second': 20.594, 'train_steps_per_second': 5.148, 'total_flos': 4390569300197376.0, 'train_loss': 0.36936671954637795, 'epoch': 3.0})

In [22]:
metrics = trainer.evaluate()
eval_loss = metrics.get("eval_loss", None)
ppl = math.exp(eval_loss) if eval_loss is not None else None
print("Eval loss:", eval_loss)
print("Perplexity:", ppl)


Eval loss: nan
Perplexity: nan


In [23]:
trainer.save_model(OUTPUT_DIR)   # saves model + tokenizer config
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Saved final model to: {OUTPUT_DIR}")

Saved final model to: ..\saved_models\distilgpt2-finetuned


In [33]:
def quick_generate(
    prompt: str,
    max_new_tokens: int = 60,
    temperature: float = 0.7,
    top_p: float = 0.9,
    top_k: int = 0,                   # use nucleus only; set to 40 if you prefer both
):
    # Use your custom END as EOS
    END_ID = tokenizer.convert_tokens_to_ids("<END>")
    if END_ID is None or END_ID == tokenizer.unk_token_id:
        raise ValueError("'<END>' not found in tokenizer vocab. Make sure you added it during training and saved the tokenizer.")

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Block meta-tags and short loops
    bad_words_ids = tokenizer(
        ["<CONTEXT>", "<PLAYER>", "<NPC>", "<END>"], add_special_tokens=False
    ).input_ids

    output_ids = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=1.15,
        no_repeat_ngram_size=3,
        eos_token_id=END_ID,                 # stop exactly at <END>
        pad_token_id=tokenizer.eos_token_id,
        bad_words_ids=bad_words_ids,
    )

    # Decode only the NEW tokens (after the prompt), don't skip specials yet
    new_tokens = output_ids[0, input_ids.size(-1):]
    text = tokenizer.decode(new_tokens, skip_special_tokens=False)

    # Trim at <END> if present
    stop = text.find("<END>")
    if stop != -1:
        text = text[:stop]

    return text.lstrip("\n ").rstrip()


In [48]:
test_prompt = (
    "<CONTEXT> Player apologized to Sarah under the balcony; her laughter lightened the mood. "
    "Player’s name is Acool.\n"
    "<PLAYER> Jacky, where’s the busiest spot today? Is the west gate still squeaking?\n"
    "<NPC>(Jacky) "
)
print(quick_generate(test_prompt))


ills warmer than usual. A rumor said a merchant wagon might arrive tomorrow. Talk spread about a fox sighting near Sarah‘s fields. Some say the west wall hinges squeaked louder than usual; some talk of a fox incident near Sarah's fields. They whispered that the bell rope frayed again


In [1]:
import os
import math
import random
import numpy as np
import torch

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, PeftModel
import evaluate

# Paths
TRAIN_TXT = os.path.join('..', 'data', 'game_dialogue_train.txt')  # use your cleaned 5119 train
VAL_TXT   = os.path.join('..', 'data', 'game_dialogue_val.txt')
OUTPUT_DIR = os.path.join('..', 'saved_models', 'distilgpt2-lora')

# Training config
EPOCHS = 3
BLOCK_SIZE = 256
BATCH_SIZE = 2
LR = 2e-5
GRAD_ACCUM = 2
FP16 = torch.cuda.is_available()
SEED = 42

# Reproducibility
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


CUDA available: True
GPU: NVIDIA GeForce RTX 3060 Laptop GPU
