In [13]:
# ---- Cell 1: Imports & Paths
import os
import math
import random
import numpy as np
import torch

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    __version__ as hf_version,
)

from peft import LoraConfig, get_peft_model, PeftModel
import evaluate

print("Transformers:", hf_version)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

# Paths (adapted to your cleaning outputs)
TRAIN_TXT = os.path.join('..', 'data', 'clean', 'train.txt')
VAL_TXT   = os.path.join('..', 'data', 'clean', 'val.txt')
OUTPUT_DIR = os.path.join('..', 'saved_models', 'distilgpt2-lora')

for p in [TRAIN_TXT, VAL_TXT]:
    assert os.path.exists(p), f"Missing file: {p}"
print("Data OK.")


Transformers: 4.55.0
CUDA available: True
GPU: NVIDIA GeForce RTX 3060 Laptop GPU
Data OK.


In [14]:
# ---- Cell 2: Reproducibility & basic config
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Training hyperparams (kept conservative for single-GPU)
EPOCHS = 3
BLOCK_SIZE = 256
BATCH_SIZE = 2
GRAD_ACCUM = 2
LR = 2e-5
FP16 = torch.cuda.is_available()

SPECIAL_TOKENS = {
    "additional_special_tokens": ["<CONTEXT>", "<PLAYER>", "<NPC>", "<END>"]
}


In [15]:
# ---- Cell 3: Tokenizer & Base Model
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# GPT-2 family has no pad token by default -> set to EOS
tokenizer.pad_token = tokenizer.eos_token

# Add domain tags
num_added = tokenizer.add_special_tokens(SPECIAL_TOKENS)

base_model = AutoModelForCausalLM.from_pretrained("distilgpt2")
if num_added > 0:
    base_model.resize_token_embeddings(len(tokenizer))
print(f"Added special tokens: {num_added}")


Added special tokens: 4


In [16]:
# ---- Cell 4: LoRA wrap (do this ONCE per kernel)
# DistilGPT-2 uses GPT-2-style attention/projection names
lora_targets = ["c_attn", "c_proj"]

peft_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_targets,
    task_type="CAUSAL_LM",
)

# Wrap once
model = get_peft_model(base_model, peft_cfg)

# Disable cache during training to avoid checkpointing incompat warnings
model.config.use_cache = False

# (Optional) keep it simple: no gradient checkpointing
# (If you enable it, keep use_cache=False as above)
# model.gradient_checkpointing_enable()

model.print_trainable_parameters()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


trainable params: 405,504 || all params: 82,321,152 || trainable%: 0.4926


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50261, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-5): 6 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
            

In [17]:
# ---- Cell 5: Dataset loading (Hugging Face "text" loader)
raw = load_dataset(
    "text",
    data_files={"train": TRAIN_TXT, "validation": VAL_TXT}
)
print(raw)
print("Train sample:", raw["train"][0])
print("Val sample:", raw["validation"][0])


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 23044
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1274
    })
})
Train sample: {'text': '<CONTEXT> No major events; only light chatter about bread smells near Sarah’s porch. People said Mr Dawson might sponsor fresh paint for the fence.'}
Val sample: {'text': '<CONTEXT> Morning sun felt warmer than usual; everyone kept to their chores. Some said the west gate hinges squeaked louder than usual.'}


In [18]:
# ---- Cell 6: Tokenization (no packing, keep simple fixed-length blocks)
def tok_fn(example):
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=BLOCK_SIZE,
        padding="max_length",  # pad to BLOCK_SIZE for efficient batches
    )

tokenized = raw.map(tok_fn, batched=False, remove_columns=["text"])
print(tokenized)
print("One tokenized length:", len(tokenized["train"][0]["input_ids"]))

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


Map:   0%|          | 0/23044 [00:00<?, ? examples/s]

Map:   0%|          | 0/1274 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 23044
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1274
    })
})
One tokenized length: 256


In [20]:
# ---- Cell 7: TrainingArguments
# Some older/newer HF versions differ on 'evaluation_strategy' name.
# Your 4.55.0 supports 'evaluation_strategy'.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,

    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    warmup_steps=100,
    weight_decay=0.01,

    do_eval=True,
    eval_strategy="steps",   # evaluate during training
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,

    logging_strategy="steps",
    logging_steps=50,

    fp16=FP16,          # use fp16 on CUDA
    bf16=False,         # leave False unless you know your GPU supports it

    report_to="none",   # no wandb/etc
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


In [21]:
# ---- Cell 8: Trainer & train
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
)

train_output = trainer.train()
train_output


Step,Training Loss,Validation Loss
200,4.8034,
400,4.5497,
600,4.0513,
800,3.892,
1000,3.7406,
1200,3.2644,
1400,3.2802,
1600,3.2153,
1800,2.9891,
2000,3.0944,




KeyboardInterrupt: 