# Re-Training: Emphasize Stop Tokens for Better Ending

**Goal**: Re-train the full model with stronger stop token emphasis to reduce hallucination and ensure clean JSON endings.

**Changes**: Append a repeated `</output>` marker to every training response, and use that marker as the cut-off point for generated text. Training continues from the previously saved full model.

**Run Instructions**: Run on a T4 GPU. If the test success rate is below 80%, switch the base model to Phi-3.

In [None]:
# Setup (minimal, reuse if already done)
!pip uninstall -y tensorflow  # Remove TF to prevent interference
!pip install --upgrade transformers torch accelerate bitsandbytes peft datasets trl -q

from google.colab import drive
drive.mount('/content/drive')

import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"

print("✅ Setup complete!")

Found existing installation: tensorflow 2.19.0
Uninstalling tensorflow-2.19.0:
  Successfully uninstalled tensorflow-2.19.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.8/544.8 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
✅ Setup complete!


In [None]:
# Copy the project folder and the training dataset from Google Drive into /content.
# The shell commands below use relative targets, so the working directory is set first.
os.chdir('/content')
# Any previous local copy is deleted, so the folder is always re-copied from Drive.
!rm -rf fine_tuning
!cp -r "/content/drive/MyDrive/fine_tuning" .
# The dataset is also placed directly in /content; a later cell loads it by bare filename.
!cp "/content/drive/MyDrive/fine_tuning/rpg_training_dataset_gpt4_1_filtered.jsonl" .

# NOTE(review): this message is printed unconditionally — a failing `cp` above does not stop the cell.
print("✅ Files copied!")

✅ Files copied!


In [None]:
# Imports
import sys
# Make the project copy under /content/fine_tuning importable (it provides the `utils` package).
sys.path.append('/content/fine_tuning')
# Project-local loader — not `datasets.load_dataset`; only `Dataset` is imported from `datasets` below.
from utils.data_utils import load_dataset
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from datasets import Dataset
import torch

print("✅ Imports complete!")

⚙️  Running in WANDB offline mode
✅ Imports complete!


In [None]:
# GPU Check
print("GPU Available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")
!nvidia-smi

GPU Available: True
GPU Name: Tesla T4
Mon Sep  1 07:21:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   50C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
         

In [None]:
# Build the training texts: chat-formatted prompt, JSON answer, then the emphasized stop marker.
STOP_SUFFIX = "</output>" * 3  # the stop marker, repeated three times for emphasis


def _to_chat_text(example):
    """Render one dataset record as a single chat-formatted training string."""
    question = example['prompt']
    answer = json.dumps(example['response']) + STOP_SUFFIX
    return f"<|user|>\n{question}<|end|>\n<|assistant|>\n{answer}<|end|>"


dataset = load_dataset("rpg_training_dataset_gpt4_1_filtered.jsonl")
print(f"✅ Loaded {len(dataset)} examples")

# One {"text": ...} record per example, in dataset order.
formatted_data = [{"text": _to_chat_text(example)} for example in dataset]

print("✅ Formatted with emphasized stop tokens")

✅ Loaded 421 examples
✅ Formatted with emphasized stop tokens


In [None]:
# Load the previously saved full model (4-bit quantized) and attach a LoRA adapter
# so training can continue from it.
quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

full_path = "/content/drive/MyDrive/full_tinyllama_rpg"

model = AutoModelForCausalLM.from_pretrained(
    full_path,
    quantization_config=quant_config,
    device_map="auto",
    dtype=torch.float16
)

# A fresh LoRA adapter is always created here; only its weights are trained,
# the quantized base weights stay frozen.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05
)

model = get_peft_model(model, lora_config)

tokenizer = AutoTokenizer.from_pretrained(full_path)
# Only supply a pad token when the saved tokenizer has none. The previous code
# assigned "[PAD]" unconditionally, which overwrote any pad token stored with the
# checkpoint and made the `is None` fallback unreachable. Reusing EOS keeps the
# vocabulary size unchanged, so the quantized model's embeddings need no resize.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("✅ Loaded full model for re-training!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



✅ Loaded full model for re-training!


In [None]:
# Tokenize (use full data for re-training)
MAX_SEQ_LEN = 1024    # every example is padded / truncated to this many tokens
IGNORE_INDEX = -100   # label value that the cross-entropy loss skips

train_dataset = Dataset.from_list(formatted_data)


def tokenize_function(examples):
    """Tokenize a batch of chat texts and build causal-LM labels.

    Args:
        examples: batch dict from `Dataset.map(batched=True)`; `examples["text"]`
            is a list of formatted training strings.

    Returns:
        The tokenizer output (`input_ids`, `attention_mask`, ...) plus `labels`:
        a copy of `input_ids` in which every padded position is replaced by
        `IGNORE_INDEX`, so the loss is computed on real tokens only.
    """
    # Finish each example with the tokenizer's EOS token (when it defines one) so
    # the model sees an explicit termination signal after the stop markers.
    eos = tokenizer.eos_token or ""
    texts = [text if text.endswith(eos) else text + eos for text in examples["text"]]

    tokenized = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LEN
    )
    # Copying input_ids verbatim would also train the model to predict padding,
    # which dominates short examples and drives the loss artificially low.
    tokenized["labels"] = [
        [token if keep else IGNORE_INDEX for token, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized


tokenized_train = train_dataset.map(tokenize_function, batched=True)

# A row without any padding filled the whole window and was most likely truncated,
# in which case its trailing stop markers never reach the model.
full_rows = sum(1 for mask in tokenized_train["attention_mask"] if all(mask))
if full_rows:
    print(f"⚠️ {full_rows} example(s) fill all {MAX_SEQ_LEN} tokens and were probably truncated")

print("✅ Data tokenized!")

Map:   0%|          | 0/421 [00:00<?, ? examples/s]

✅ Data tokenized!


In [None]:
# Re-train with focus on stopping (more epochs, lower rate)
training_args = TrainingArguments(
    output_dir="/content/retrained_model",  # checkpoints go to local runtime storage
    num_train_epochs=5,  # More epochs for emphasis
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # 1 x 4 = 4 examples per optimizer step on one device
    learning_rate=5e-5,  # Lower for fine control
    fp16=True,
    save_steps=100,
    logging_steps=10,
    # The string "none" is what disables logging integrations; `None` does not,
    # which is why the run depended on the deprecated WANDB_DISABLED variable.
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
)

trainer.train()
print("✅ Re-training complete!")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
10,0.3582
20,0.3008
30,0.2421
40,0.1816
50,0.1113
60,0.0466
70,0.016
80,0.0093
90,0.0092
100,0.0073


✅ Re-training complete!


In [None]:
# Persist the training result and its tokenizer side by side on Google Drive.
# NOTE(review): `model` was wrapped with get_peft_model earlier, so this presumably
# writes the LoRA adapter rather than merged full weights — confirm before loading
# this folder as a standalone model.
save_path = "/content/drive/MyDrive/retrained_tinyllama_rpg"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)
print("💾 Re-trained model saved!")

💾 Re-trained model saved!


In [None]:
# Test (with full printing for debug)
NUM_TESTS = 5
STOP_MARKER = "</output>"
REQUIRED_KEYS = ('width', 'height', 'walls', 'enemies', 'player_pos')
TEST_PROMPT = (
    "<|user|>\n"
    "Generate a tilemap for a game level, where all the edges should be walls\n"
    "there should only be *ONE* player and multiple enemies, all enemies should be placed\n"
    "randomly and the player should be able to reach all enemies. Make sure to place some\n"
    "walls inside the level and place the player near the center of the level. "
    "(w*h should be 20*15)<|end|>\n"
    "<|assistant|>\n"
)

model.eval()

# The prompt is identical for every trial, so it is tokenized once.
inputs = tokenizer(TEST_PROMPT, return_tensors="pt").to(model.device)
prompt_len = inputs["input_ids"].shape[1]

# Keep prompt + completion inside the model's context window; asking for more
# tokens than the window allows only produces a warning and degraded output.
context_len = getattr(model.config, "max_position_embeddings", 2048)
max_new_tokens = max(1, context_len - prompt_len)

test_results = []

for i in range(NUM_TESTS):
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        # Sampling makes the trials differ; greedy decoding of one fixed prompt
        # would return the same text NUM_TESTS times.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        # Halt as soon as the stop marker is produced instead of running on to
        # the token limit (stop_strings requires the tokenizer to be passed).
        stop_strings=[STOP_MARKER],
        tokenizer=tokenizer,
    )

    # Decode only the newly generated tokens so the prompt never leaks into the
    # response, regardless of how the chat tags are tokenized.
    completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    response = completion.split(STOP_MARKER)[0].split("<|end|>")[0].strip()

    # Print FULL response
    print(f"Full Response {i+1}:\n{response}\n")

    try:
        parsed = json.loads(response)
    except json.JSONDecodeError:
        # Invalid JSON counts as a failed test; other errors (and Ctrl-C) propagate.
        passed = False
    else:
        # Valid JSON must also be an object carrying every required level field.
        passed = isinstance(parsed, dict) and all(k in parsed for k in REQUIRED_KEYS)
    test_results.append(passed)

    print(f"Test {i+1}: {'✅' if test_results[-1] else '❌'}")
    print("---")

success_rate = sum(test_results) / len(test_results)
print(f"Success Rate: {success_rate:.1%}")

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Full Response 1:
{"width": 20, "height": 15, "walls": [{"x": 0, "y": 0}, {"x": 1, "y": 0}, {"x": 2, "y": 0}, {"x": 3, "y": 0}, {"x": 4, "y": 0}, {"x": 5, "y": 0}, {"x": 6, "y": 0}, {"x": 7, "y": 0}, {"x": 8, "y": 0}, {"x": 9, "y": 0}, {"x": 10, "y": 0}, {"x": 11, "y": 0}, {"x": 12, "y": 0}, {"x": 13, "y": 0}, {"x": 14, "y": 0}, {"x": 15, "y": 0}, {"x": 16, "y": 0}, {"x": 17, "y": 0}, {"x": 18, "y": 0}, {"x": 19, "y": 0}, {"x": 0, "y": 1}, {"x": 19, "y": 1}, {"x": 0, "y": 2}, {"x": 19, "y": 2}, {"x": 0, "y": 3}, {"x": 19, "y": 3}, {"x": 0, "y": 4}, {"x": 19, "y": 4}, {"x": 0, "y": 5}, {"x": 19, "y": 5}, {"x": 0, "y": 6}, {"x": 19, "y": 6}, {"x": 0, "y": 7}, {"x": 19, "y": 7}, {"x": 0, "y": 8}, {"x": 19, "y": 8}, {"x": 0, "y": 9}, {"x": 19, "y": 9}, {"x": 0, "y": 10}, {"x": 19, "y": 10}, {"x": 0, "y": 11}, {"x": 19, "y": 11}, {"x": 0, "y": 12}, {"x": 19, "y": 12}, {"x": 0, "y": 13}, {"x": 19, "y": 13}, {"x": 0, "y": 14}, {"x": 1, "y": 14}, {"x": 2, "y": 14}, {"x": 3, "y": 14}, {"x": 4, "

KeyboardInterrupt: 