# Perplexity Measurement

**Version:** 1.0 | **Build:** 2026-01-10

Measure the perplexity of QAT checkpoints and compare it with the baseline Qwen model.

**Perplexity** = exp(cross-entropy loss) on next-token prediction.
- Lower is better
- WikiText-2 baselines: GPT-2 ~22, good LLMs ~5-10

## Setup (Colab)

Run the setup cells below to clone the repository and mount Google Drive.

In [None]:
#@title Clone repository (run once)
import os

REPO_URL = "https://github.com/anemll/qwen3_apple_style_2bit_qat_lora.git"  #@param {type:"string"}
REPO_DIR = "qwen3_apple_style_2bit_qat_lora"

if not os.path.exists(REPO_DIR):
    !git clone {REPO_URL}
    print(f"✓ Cloned to {REPO_DIR}")
else:
    print(f"✓ Repository already exists at {REPO_DIR}")

# Change to repo directory
os.chdir(REPO_DIR)
print(f"Working directory: {os.getcwd()}")
!git pull

In [None]:
#@title Mount Google Drive (for checkpoints)
# Form toggle: set to False to skip mounting Google Drive.
MOUNT_DRIVE = True  #@param {type:"boolean"}

if MOUNT_DRIVE:
    # Imported only when mounting is requested, so the cell has no dependency
    # on google.colab when the toggle is off.
    from google.colab import drive
    # Mount point used by the Drive paths in the other cells of this notebook.
    drive.mount('/content/drive')
    print("✓ Google Drive mounted at /content/drive")
    print("  Use paths like: /content/drive/MyDrive/qat_checkpoints/model.pt")

In [None]:
#@title Download perplexity results from Google Drive (optional)
# Downloads existing results/perplexity.json from Google Drive to merge with local results
import os
import json
import subprocess
from pathlib import Path

# Work from the repository root when the Colab layout is present
REPO_ROOT = "/content/qwen3_apple_style_2bit_qat_lora"
if os.path.exists(REPO_ROOT):
    os.chdir(REPO_ROOT)
print(f"Working directory: {os.getcwd()}")

GDRIVE_RESULTS = "/content/drive/MyDrive/qat_runs/perplexity.json"
LOCAL_RESULTS = "results/perplexity.json"

# The local results folder has to exist before anything is written into it
os.makedirs("results", exist_ok=True)

# Probe the Drive copy through the shell (more reliable with GDrive FUSE)
result = subprocess.run(["test", "-f", GDRIVE_RESULTS], capture_output=True)
gdrive_exists = result.returncode == 0

if not gdrive_exists:
    # Nothing stored on Drive yet: leave local state untouched
    print(f"No results on Google Drive yet: {GDRIVE_RESULTS}")
    print("  Results will be created after first measurement.")
elif not os.path.exists(LOCAL_RESULTS):
    # Only the Drive copy exists: fetch it with shell cp
    # (more reliable with GDrive FUSE)
    subprocess.run(["cp", GDRIVE_RESULTS, LOCAL_RESULTS], check=True)
    with open(LOCAL_RESULTS, 'r') as fh:
        data = json.load(fh)
    print(f"✓ Downloaded {len(data)} results from Google Drive")
else:
    # Both copies exist: combine them into the local file. On a key clash the
    # Drive entry wins because Drive is treated as the "master" copy.
    with open(GDRIVE_RESULTS, 'r') as fh:
        gdrive_data = json.load(fh)
    with open(LOCAL_RESULTS, 'r') as fh:
        local_data = json.load(fh)
    merged = {**local_data, **gdrive_data}
    with open(LOCAL_RESULTS, 'w') as fh:
        json.dump(merged, fh, indent=2)
    print(f"✓ Merged {len(gdrive_data)} GDrive + {len(local_data)} local = {len(merged)} results")

print(f"Runs folder exists: {os.path.exists('runs')}")

In [None]:
#@title Config
# Checkpoint path - supports multiple formats:
#   "runs/SR-011_foo/v2_fp16_20260110.pt"  (full local path)
#   "SR-011_foo/v2_fp16_20260110.pt"       (run/file - auto-downloads from GDrive)
#   "SR-011_foo"                           (run only - downloads all FP16 files)
#
# The script will auto-download from Google Drive if file not found locally.

# Checkpoint to measure, in any of the formats described in the comment above.
CHECKPOINT = "SR-011_L1024_attn_touchup_sparse2/v2_q4a4_r32_fp16_20260110_180453.pt"  #@param {type:"string"}
# Model identifier forwarded to scripts/measure_perplexity.py as --model.
MODEL_NAME = "Qwen/Qwen3-0.6B"  #@param {type:"string"}

# LoRA rank - AUTO-DETECTED from config.json by default (set to 0)
# The script reads lora_r from config.json, so you usually don't need to change this.
# Only set manually if you want to override the config or config.json is missing.
# A value > 0 is forwarded as --lora-r; 0 adds no flag at all.
LORA_R = 0  #@param {type:"integer"}

# Evaluation settings
# MAX_LENGTH and STRIDE are forwarded as --max-length and --stride.
MAX_LENGTH = 1024  #@param {type:"integer"}
STRIDE = 512  #@param {type:"integer"}
# VERBOSE adds --verbose to the measurement commands.
VERBOSE = True  #@param {type:"boolean"}
# USE_FP16 picks the "*fp16*.pt" file pattern (instead of "*.pt") when
# CHECKPOINT names only a run.
USE_FP16 = True  #@param {type:"boolean"}

## Download Checkpoint from Google Drive

Use `gdrive_sync.py` to download specific checkpoint files from Google Drive.

**Note:** FP16 checkpoints are recommended for perplexity measurement:
- Smaller file size (~1.2GB vs ~2.4GB for FP32)
- Requires the `--dtype fp16` flag for correct loading
- Same numerical results as FP32 for inference

In [None]:
#@title Download checkpoint from Google Drive (if needed)
# Handles flexible path formats:
#   "runs/SR-011_foo/checkpoint.pt" -> extracts run name + file
#   "SR-011_foo/checkpoint.pt" -> run name + file
#   "SR-011_foo" -> run name only (downloads all matching files)
#
# Also downloads config.json (required for correct quantization params)

import os
from pathlib import Path

# Ensure we're in the correct working directory
REPO_ROOT = "/content/qwen3_apple_style_2bit_qat_lora"
if os.path.exists(REPO_ROOT):
    os.chdir(REPO_ROOT)
print(f"Working directory: {os.getcwd()}")
print(f"Runs folder exists: {os.path.exists('runs')}")

# Parse checkpoint path into:
#   run_name         - run folder name, used for the Google Drive download
#   file_pattern     - exact file name, or a glob when only a run was given
#   local_run_dir    - folder expected to hold the checkpoint locally
#   local_checkpoint - exact local file path, or None for a glob pattern
ckpt_path = CHECKPOINT.strip()
if not ckpt_path:
    raise ValueError("CHECKPOINT is empty; set it to a run name or a checkpoint path.")

if Path(ckpt_path).is_file():
    # CHECKPOINT already names a real file (for example an absolute Google
    # Drive path). Use it where it is instead of re-deriving a "runs/<run>"
    # location from its first path segment, which would be wrong or empty.
    existing_file = Path(ckpt_path)
    local_run_dir = existing_file.parent
    run_name = local_run_dir.name
    file_pattern = existing_file.name
    parts = [run_name, file_pattern]
else:
    # Remove "runs/" prefix if present
    if ckpt_path.startswith("runs/"):
        ckpt_path = ckpt_path[5:]

    # Split into run_name and file_name. Empty segments are dropped so that a
    # trailing slash ("SR-011_foo/") is read as "run only" rather than as a
    # run with an empty file name.
    parts = [segment for segment in ckpt_path.split("/") if segment]
    if not parts:
        raise ValueError(f"CHECKPOINT {CHECKPOINT!r} does not name a run or a file.")

    if len(parts) >= 2:
        # Format: "run_name/checkpoint.pt"
        run_name = parts[0]
        file_pattern = parts[-1]  # Last part is the file
    else:
        # Format: "run_name" only
        run_name = parts[0]
        file_pattern = "*fp16*.pt" if USE_FP16 else "*.pt"

    # Build local path
    local_run_dir = Path(f"runs/{run_name}")

print(f"Run name: {run_name}")
print(f"File pattern: {file_pattern}")

# An exact local path only makes sense when the pattern is a plain file name
local_checkpoint = local_run_dir / file_pattern if "*" not in file_pattern else None

def download_config_if_needed(run_name, local_run_dir):
    """Download config.json if not present locally."""
    config_files = ["config.json", "v2_config.json"]
    for config_name in config_files:
        local_config = local_run_dir / config_name
        if local_config.exists():
            print(f"✓ Config found: {local_config}")
            return True
    
    # Try to download config.json
    print(f"Downloading config.json...")
    !python scripts/gdrive_sync.py down {run_name} --only "config.json"
    
    # Check if downloaded
    for config_name in config_files:
        local_config = local_run_dir / config_name
        if local_config.exists():
            print(f"✓ Config downloaded: {local_config}")
            return True
    
    # Try v2_config.json
    !python scripts/gdrive_sync.py down {run_name} --only "v2_config.json"
    for config_name in config_files:
        local_config = local_run_dir / config_name
        if local_config.exists():
            print(f"✓ Config downloaded: {local_config}")
            return True
    
    print(f"⚠ No config.json found (will use defaults)")
    return False

# Check if already exists locally
if local_checkpoint and local_checkpoint.exists():
    CHECKPOINT = str(local_checkpoint)
    print(f"✓ Found locally: {CHECKPOINT}")
    print(f"  Size: {local_checkpoint.stat().st_size / 1024 / 1024:.1f} MB")
    # Still check for config
    download_config_if_needed(run_name, local_run_dir)
elif local_run_dir.exists() and "*" not in file_pattern:
    # Directory exists but file doesn't - try to find it
    matches = list(local_run_dir.glob(file_pattern))
    if matches:
        CHECKPOINT = str(matches[0])
        print(f"✓ Found locally: {CHECKPOINT}")
        download_config_if_needed(run_name, local_run_dir)
    else:
        print(f"File not found locally, downloading from Google Drive...")
        !python scripts/gdrive_sync.py down {run_name} --only "{file_pattern}"
        # Also download config
        download_config_if_needed(run_name, local_run_dir)
        matches = list(local_run_dir.glob(file_pattern))
        if matches:
            CHECKPOINT = str(matches[0])
            print(f"✓ Downloaded: {CHECKPOINT}")
        else:
            print(f"⚠ Download failed. Check Google Drive.")
else:
    # Download from Google Drive
    print(f"Downloading from Google Drive...")
    print(f"  Run: {run_name}")
    print(f"  Pattern: {file_pattern}")
    !python scripts/gdrive_sync.py down {run_name} --only "{file_pattern}"
    
    # Also download config.json
    download_config_if_needed(run_name, local_run_dir)
    
    # Find downloaded file
    if local_run_dir.exists():
        if "*" in file_pattern:
            matches = list(local_run_dir.glob(file_pattern))
        else:
            matches = [local_run_dir / file_pattern] if (local_run_dir / file_pattern).exists() else []
        if matches:
            CHECKPOINT = str(matches[0])
            print(f"✓ Downloaded: {CHECKPOINT}")
            print(f"  Size: {matches[0].stat().st_size / 1024 / 1024:.1f} MB")
        else:
            print(f"⚠ Download may have failed.")
            print(f"  Check: python scripts/gdrive_sync.py list")
    else:
        print(f"⚠ Run directory not created. Check if Google Drive is mounted.")

print(f"\nCheckpoint: {CHECKPOINT}")

In [None]:
#@title Install dependencies (run once)
# Quiet install of the notebook's Python packages. torch is imported by the
# device-setup cell; datasets and transformers are presumably needed by
# scripts/measure_perplexity.py (not visible here - confirm).
!pip install -q datasets transformers torch

In [None]:
#@title Device setup
import os
import torch

# Choose the accelerator and matching dtype, checked in this order:
# CUDA, then Apple MPS, then TPU (when torch_xla can be imported), then CPU.
if torch.cuda.is_available():
    DEVICE, DTYPE = 'cuda', torch.bfloat16
elif torch.backends.mps.is_available():
    DEVICE, DTYPE = 'mps', torch.float32
else:
    try:
        # A successful import of torch_xla is what selects the TPU branch
        import torch_xla.core.xla_model as xm
    except ImportError:
        DEVICE, DTYPE = 'cpu', torch.float32
    else:
        DEVICE, DTYPE = 'tpu', torch.bfloat16

print(f"Device: {DEVICE}")
print(f"Dtype: {DTYPE}")

## 1. Measure Baseline Model Perplexity

Measure the original Qwen model (no QAT) to establish a baseline.

In [None]:
#@title Measure baseline perplexity
!python scripts/measure_perplexity.py --baseline \
    --model {MODEL_NAME} \
    --max-length {MAX_LENGTH} \
    --stride {STRIDE} \
    --device {DEVICE} \
    {'--verbose' if VERBOSE else ''}

## 2. Measure QAT Checkpoint Perplexity

Measure the quantized model checkpoint.

In [None]:
#@title Measure checkpoint perplexity
new_path = '/content/qwen3_apple_style_2bit_qat_lora/'
os.chdir(new_path)
!ls -ltr {CHECKPOINT}
!git pull
lora_flag = f"--lora-r {LORA_R}" if LORA_R > 0 else ""
verbose_flag = "--verbose" if VERBOSE else ""
#!python scripts/scripts/gdrive_sync.py  {CHECKPOINT}"
!python scripts/measure_perplexity.py "{CHECKPOINT}" \
    --model {MODEL_NAME} \
    --max-length {MAX_LENGTH} \
    --stride {STRIDE} \
    --device {DEVICE} \
    {lora_flag} \
    {verbose_flag} --dtype fp16

## 3. Compare Multiple Checkpoints (Optional)

Compare perplexity across training steps.

In [None]:
#@title List available checkpoints
import os
from pathlib import Path

# Folder that holds the currently selected checkpoint
checkpoint_dir = Path(CHECKPOINT).parent

if not checkpoint_dir.exists():
    print(f"Directory not found: {checkpoint_dir}")
else:
    # Name-sorted, so the tail of the list is what gets displayed
    checkpoints = sorted(checkpoint_dir.glob("*.pt"))
    print(f"Found {len(checkpoints)} checkpoints in {checkpoint_dir}:")
    shown = checkpoints[-10:]  # Show last 10
    for ckpt in shown:
        size_mb = ckpt.stat().st_size / (1024 * 1024)
        print(f"  {ckpt.name:<50} {size_mb:.1f} MB")

In [None]:
#@title Batch measure multiple checkpoints
CHECKPOINTS_TO_MEASURE = [
    # Add checkpoint paths here
    # "runs/SR-011/checkpoint_step1000.pt",
    # "runs/SR-011/checkpoint_step2000.pt",
]

results = []
for ckpt in CHECKPOINTS_TO_MEASURE:
    print(f"\n{'='*60}")
    print(f"Measuring: {ckpt}")
    print(f"{'='*60}")
    !python scripts/measure_perplexity.py "{ckpt}" \
        --model {MODEL_NAME} \
        --max-length {MAX_LENGTH} \
        --stride {STRIDE} \
        --device {DEVICE}

In [None]:
#@title Upload perplexity results to Google Drive
# Saves results/perplexity.json to Google Drive for persistence across sessions
import os
import json
import subprocess
from pathlib import Path

# Work from the repository root when the Colab layout is present
REPO_ROOT = "/content/qwen3_apple_style_2bit_qat_lora"
if os.path.exists(REPO_ROOT):
    os.chdir(REPO_ROOT)
print(f"Working directory: {os.getcwd()}")

GDRIVE_RESULTS = "/content/drive/MyDrive/qat_runs/perplexity.json"
LOCAL_RESULTS = "results/perplexity.json"

if not os.path.exists(LOCAL_RESULTS):
    # Nothing has been measured in this session yet
    print(f"No local results to upload: {LOCAL_RESULTS}")
    print("  Run measurements first.")
else:
    # The destination folder on Drive may not exist yet
    os.makedirs(os.path.dirname(GDRIVE_RESULTS), exist_ok=True)

    # Probe the Drive copy through the shell (more reliable with FUSE)
    result = subprocess.run(["test", "-f", GDRIVE_RESULTS], capture_output=True)
    gdrive_exists = result.returncode == 0

    if not gdrive_exists:
        # First upload: plain shell cp (more reliable with GDrive FUSE)
        subprocess.run(["cp", LOCAL_RESULTS, GDRIVE_RESULTS], check=True)
        print(f"✓ Uploaded: {LOCAL_RESULTS}")
    else:
        # Combine with what is already on Drive. On a key clash the local
        # entry wins because local results are the newer ones.
        with open(GDRIVE_RESULTS, 'r') as fh:
            gdrive_data = json.load(fh)
        with open(LOCAL_RESULTS, 'r') as fh:
            local_data = json.load(fh)
        merged = {**gdrive_data, **local_data}
        with open(GDRIVE_RESULTS, 'w') as fh:
            json.dump(merged, fh, indent=2)
        print(f"✓ Merged and uploaded: {len(merged)} results")
        print(f"  (GDrive: {len(gdrive_data)}, Local: {len(local_data)})")

    print(f"  -> {GDRIVE_RESULTS}")

    # Read the Drive file back to report how many results it now holds
    with open(GDRIVE_RESULTS, 'r') as fh:
        data = json.load(fh)
    print(f"  Total results on GDrive: {len(data)}")

In [None]:
#@title Show all saved perplexity results
# Results are automatically saved to results/perplexity.json after each measurement;
# the --list option of the measurement script shows everything saved so far.
!python scripts/measure_perplexity.py --list

## 4. Using Cache Data (Alternative)

If the WikiText-2 download fails, use an existing KD cache instead.

In [None]:
#@title Measure with KD cache
CACHE_DIR = "caches/alpaca_chat_think_both_L128_K128"  #@param {type:"string"}
NUM_SAMPLES = 100  #@param {type:"integer"}

!python scripts/measure_perplexity.py "{CHECKPOINT}" \
    --cache-dir "{CACHE_DIR}" \
    --num-samples {NUM_SAMPLES} \
    --model {MODEL_NAME} \
    --max-length {MAX_LENGTH} \
    --stride {STRIDE} \
    --device {DEVICE}