# Phase 1: Pre-Training Zero-Shot Test

**Goal**: Test if TinyLlama can understand RPG generation task without any training

**Expected**: Poor performance, but some JSON-like structure

**Success Criteria**: Model generates some valid JSON (even if content is wrong)

In [None]:
# Setup: Install required packages
!pip install transformers torch accelerate bitsandbytes -q

# Mount Google Drive to access your dataset
from google.colab import drive
drive.mount('/content/drive')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


In [None]:
# Upload your fine_tuning folder to Colab
# Or copy from Google Drive
import os
os.chdir('/content')

# Copy your dataset and fine_tuning folder here - ADJUST THESE PATHS
!cp -r "/content/drive/MyDrive/fine_tuning" .
!cp "/content/drive/MyDrive/fine_tuning/rpg_training_dataset_gpt4_1_filtered.jsonl" .

# Verify files copied
print("Files in current directory:")
!ls -la

Files in current directory:
total 920
drwxr-xr-x 1 root root   4096 Sep  1 01:54 .
drwxr-xr-x 1 root root   4096 Sep  1 01:50 ..
drwxr-xr-x 4 root root   4096 Aug 28 13:42 .config
drwx------ 5 root root   4096 Sep  1 01:54 drive
drwx------ 6 root root   4096 Sep  1 01:54 fine_tuning
-rw------- 1 root root 915819 Sep  1 01:54 rpg_training_dataset_gpt4_1_filtered.jsonl
drwxr-xr-x 1 root root   4096 Aug 28 13:43 sample_data


In [None]:
# Fix the missing import in model_utils.py
import os

# Read the current file
with open('/content/fine_tuning/utils/model_utils.py', 'r') as f:
    content = f.read()

# Add the missing import at the top
if 'from typing import Dict' not in content:
    # Find the import section and add the missing import
    lines = content.split('\n')

    # Find where to insert the import (after existing imports)
    insert_line = 0
    for i, line in enumerate(lines):
        if line.startswith('import ') or line.startswith('from '):
            insert_line = i + 1
        elif line.strip() == '' and insert_line > 0:
            break

    # Insert the missing import
    lines.insert(insert_line, 'from typing import Dict, Any')

    # Write back the fixed file
    with open('/content/fine_tuning/utils/model_utils.py', 'w') as f:
        f.write('\n'.join(lines))

    print("✅ Fixed missing import in model_utils.py")
else:
    print("✅ Import already exists")

# Verify the fix by checking the imports
with open('/content/fine_tuning/utils/model_utils.py', 'r') as f:
    first_20_lines = f.readlines()[:20]
    print("\nFirst 20 lines of model_utils.py:")
    for i, line in enumerate(first_20_lines, 1):
        print(f"{i:2d}: {line.rstrip()}")

✅ Fixed missing import in model_utils.py

First 20 lines of model_utils.py:
 1: import torch
 2: from transformers import AutoTokenizer, AutoModelForCausalLM
 3: import json
 4: import re
 5: from typing import Dict, Any
 6: 
 7: class ModelTester:
 8:     def __init__(self, model_name: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
 9:         self.model_name = model_name
10:         self.tokenizer = None
11:         self.model = None
12:         self.device = "cuda" if torch.cuda.is_available() else "cpu"
13: 
14:     def load_model(self):
15:         """Load model and tokenizer"""
16:         print(f"Loading {self.model_name}...")
17:         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
18:         self.model = AutoModelForCausalLM.from_pretrained(
19:             self.model_name,
20:             torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,


In [None]:
# Import our utilities directly (instead of running separate script)
import sys
sys.path.append('/content/fine_tuning')

from utils.data_utils import load_dataset, get_sample_prompts
from utils.model_utils import ModelTester
import json
from datetime import datetime

print("✅ Imports successful!")

✅ Imports successful!


In [None]:
# Load dataset and get test samples
print("📊 Loading dataset...")
dataset = load_dataset("rpg_training_dataset_gpt4_1_filtered.jsonl")
sample_prompts = get_sample_prompts(dataset, 5)


# Step 1: Define JSON instructions (tailored to your exact dataset structure)
json_instruction = (
    "\n\nOutput your response STRICTLY as a valid JSON object. "
    "Do NOT add any extra text, explanations, code, or formatting outside the JSON (no markdown, no arrays outside objects). "
    "The JSON MUST include ONLY these exact fields: "
    "'width' (integer, always 20), "
    "'height' (integer, always 15), "
    "'walls' (array of objects, each {'x': integer (0-19), 'y': integer (0-14)} for wall positions; include ALL edge walls and some internal walls), "
    "'enemies' (array of objects, each {'x': integer, 'y': integer} for 5-7 randomly placed enemies inside the level; ensure they are reachable from player), "
    "'player_pos' (object {'x': integer, 'y': integer} for exactly ONE player, placed near the center, e.g., around x=10, y=7). "
    "Ensure: All edges are walls (full top/bottom rows, sides on x=0 and x=19). Player can reach all enemies (no isolated areas). Positions must be unique and within bounds."
)

# Step 2: Add a few-shot example (using one from your dataset to demonstrate format)
few_shot_example = (
    "\n\nExample: For a similar prompt, output exactly like this (but generate new random placements):\n"
    "{\n"
    "  \"width\": 20,\n"
    "  \"height\": 15,\n"
    "  \"walls\": [{\"x\": 0, \"y\": 0}, {\"x\": 1, \"y\": 0}, ... , {\"x\": 17, \"y\": 12}], "  # Truncated for brevity; in real prompt, you can use a full one if needed
    "  \"enemies\": [{\"x\": 4, \"y\": 2}, {\"x\": 7, \"y\": 5}, {\"x\": 11, \"y\": 4}, {\"x\": 15, \"y\": 8}, {\"x\": 9, \"y\": 12}],\n"
    "  \"player_pos\": {\"x\": 10, \"y\": 7}\n"
    "}\n"
    "Now, generate for the given prompt."
)

# Step 3: Modify each prompt by appending the instructions + example
modified_prompts = []
for prompt in sample_prompts:
    modified_prompt = prompt + json_instruction + few_shot_example  # Append both
    modified_prompts.append(modified_prompt)

# Step 4: Replace sample_prompts with modified version for testing
sample_prompts = modified_prompts

# Step 5: Preview the modified prompts (optional, for verification)
print("\nModified Prompts (with JSON instructions and example):")
for i, prompt in enumerate(sample_prompts, 1):
    print(f"{i}. {prompt[:100]}... (full length: {len(prompt)} chars)")

📊 Loading dataset...

Modified Prompts (with JSON instructions and example):
1. Generate a tilemap for a game level, where all the edges should be walls
there should only be *ONE* ... (full length: 1593 chars)
2. Generate a tilemap for a game level, where all the edges should be walls
there should only be *ONE* ... (full length: 1593 chars)
3. Generate a tilemap for a game level, where all the edges should be walls
there should only be *ONE* ... (full length: 1593 chars)
4. Generate a tilemap for a game level, where all the edges should be walls
there should only be *ONE* ... (full length: 1593 chars)
5. Generate a tilemap for a game level, where all the edges should be walls
there should only be *ONE* ... (full length: 1593 chars)


In [None]:
# Load TinyLlama model
print("🤖 Loading TinyLlama model...")
tester = ModelTester()
tester.load_model()
print("✅ Model loaded successfully!")

🤖 Loading TinyLlama model...
Loading TinyLlama/TinyLlama-1.1B-Chat-v1.0...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded on cuda
✅ Model loaded successfully!


In [None]:
# Run zero-shot tests
results = []

for i, prompt in enumerate(sample_prompts, 1):
    print(f"\n--- Test {i}/{len(sample_prompts)} ---")
    print(f"Prompt: {prompt[:100]}...")

    result = tester.test_json_generation(prompt)
    results.append(result)

    # Print immediate feedback
    print(f"✅ Valid JSON: {result['is_valid_json']}")
    print(f"✅ Required Fields: {result['has_required_fields']}")
    if result['is_valid_json']:
        print(f"📋 Fields Found: {result['analysis']['fields_present']}")
        print(f"❌ Fields Missing: {result['analysis']['fields_missing']}")

    # Show response preview
    print(f"Response preview: {result['raw_response'][:200]}...")


--- Test 1/5 ---
Prompt: Generate a tilemap for a game level, where all the edges should be walls
there should only be *ONE* ...
✅ Valid JSON: False
✅ Required Fields: False
Response preview: Here's a valid JSON response for the "Level Generator" task:

```
{
  "width": 20,
  "height": 15,
  "walls": [
    {"x": 0, "y": 0},
    {"x": 1, "y": 0},
    {"x": 2, "y": 0},
    {"x": 3, "y": 0},
...

--- Test 2/5 ---
Prompt: Generate a tilemap for a game level, where all the edges should be walls
there should only be *ONE* ...
✅ Valid JSON: False
✅ Required Fields: False
Response preview: {
  "width": 20,
  "height": 15,
  "walls": [
    {"x": 0, "y": 0},
    {"x": 1, "y": 0},
    ...,
    {"x": 17, "y": 12}
  ],
  "enemies": [
    {"x": 4, "y": 2},
    {"x": 7, "y": 5},
    {"x": 11, ...

--- Test 3/5 ---
Prompt: Generate a tilemap for a game level, where all the edges should be walls
there should only be *ONE* ...
✅ Valid JSON: False
✅ Required Fields: False
Response preview: Here's an exa

In [None]:
# Analyze results and make summary
print("\n" + "=" * 60)
print("PHASE 1 RESULTS SUMMARY")
print("=" * 60)

valid_json_count = sum(1 for r in results if r['is_valid_json'])
required_fields_count = sum(1 for r in results if r['has_required_fields'])

print(f"📊 Valid JSON Responses: {valid_json_count}/{len(results)} ({valid_json_count/len(results)*100:.1f}%)")
print(f"📊 Complete Field Sets: {required_fields_count}/{len(results)} ({required_fields_count/len(results)*100:.1f}%)")

# Expectation analysis
print(f"\n🎯 EXPECTATION ANALYSIS:")
if valid_json_count == 0:
    print("❌ CONCERN: No valid JSON generated. Model may need format training.")
elif valid_json_count < len(results) * 0.3:
    print("⚠️  LOW: Some JSON capability, but inconsistent.")
else:
    print("✅ GOOD: Model shows JSON generation capability!")

if required_fields_count == 0:
    print("❌ EXPECTED: No complete structures (normal for zero-shot)")
elif required_fields_count > 0:
    print("🎉 SURPRISING: Some complete structures! Better than expected!")


PHASE 1 RESULTS SUMMARY
📊 Valid JSON Responses: 0/5 (0.0%)
📊 Complete Field Sets: 0/5 (0.0%)

🎯 EXPECTATION ANALYSIS:
❌ CONCERN: No valid JSON generated. Model may need format training.
❌ EXPECTED: No complete structures (normal for zero-shot)


In [None]:
# Save results and make decision
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

summary = {
    "timestamp": timestamp,
    "model_name": tester.model_name,
    "num_samples": len(sample_prompts),
    "valid_json_rate": valid_json_count / len(results),
    "complete_fields_rate": required_fields_count / len(results),
    "detailed_results": results
}

# Save to Google Drive
results_file = f"/content/drive/MyDrive/phase_1_results_{timestamp}.json"
with open(results_file, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"💾 Results saved to: {results_file}")

# Decision for next phase
print(f"\n" + "=" * 60)
print("NEXT PHASE DECISION")
print("=" * 60)

valid_rate = summary["valid_json_rate"]

if valid_rate > 0.5:
    print("✅ PROCEED: Good JSON capability. Ready for overfitting phase.")
elif valid_rate > 0.2:
    print("⚠️  PROCEED: Basic capability shown. Overfitting should work.")
else:
    print("❌ INVESTIGATE: Very low capability. May need prompt engineering first.")

print(f"\n🚀 Next: Run Phase 2 (Overfitting Experiment)")

💾 Results saved to: /content/drive/MyDrive/phase_1_results_20250831_153350.json

NEXT PHASE DECISION
❌ INVESTIGATE: Very low capability. May need prompt engineering first.

🚀 Next: Run Phase 2 (Overfitting Experiment)


In [None]:
!ls /root/.cache/huggingface/hub

models--TinyLlama--TinyLlama-1.1B-Chat-v1.0
