# 02 — Baseline Fine-Tune (TinyLlama + LoRA, CPU)

In [1]:
import os, json, math, random
from pathlib import Path
import numpy as np, torch
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling,
                          Trainer, TrainingArguments)
from peft import LoraConfig, get_peft_model, PeftModel

ROOT = Path("..").resolve()
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
RUN_TAG  = "baseline-tinyllama-v1"
CKPT_DIR = ROOT / "checkpoints" / RUN_TAG
CKPT_DIR.mkdir(parents=True, exist_ok=True)

TRAIN_PATH = ROOT / "data" / "synth" / "v1" / "train.jsonl"
VAL_PATH   = ROOT / "data" / "synth" / "v1" / "val.jsonl"

MAX_TRAIN_SAMPLES = 600
MAX_VAL_SAMPLES   = 120
MAX_SEQ_LEN       = 512
LR                = 2e-4
EPOCHS            = 1
TRAIN_BSZ         = 1
GRAD_ACCUM        = 8
WARMUP_RATIO      = 0.05
WEIGHT_DECAY      = 0.0
LOG_STEPS         = 10
SAVE_STRATEGY     = "epoch"

In [2]:
import json

def read_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                yield json.loads(line)

def format_example(ex):
    prompt = ex["input"].strip()
    out    = json.dumps(ex["output"], ensure_ascii=False)
    return {"text": f"### Instruction:\n{prompt}\n\n### Response:\n{out}\n"}

train_raw = list(read_jsonl(TRAIN_PATH))[:MAX_TRAIN_SAMPLES]
val_raw   = list(read_jsonl(VAL_PATH))[:MAX_VAL_SAMPLES]

train_ds = Dataset.from_list([format_example(r) for r in train_raw])
val_ds   = Dataset.from_list([format_example(r) for r in val_raw])

len(train_ds), len(val_ds), train_ds[0]["text"][:300]

(600,
 120,
 '### Instruction:\nYou are a domain name generator.\nBusiness description: "premium education for small businesses in San Diego. Tone: friendly."\nPreferred TLDs (order matters): .com, .co, .org\nConstraints: allow_hyphens=False, allow_numbers=False, prefer_puns=False\nReturn ONLY JSON in this schema:\n{"s')

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
model.config.use_cache = False

tokenizer_config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:
lora_cfg = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05, bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

trainable params: 6,307,840 || all params: 1,106,356,224 || trainable%: 0.5701


In [5]:
def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_SEQ_LEN)

train_tok = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
val_tok   = val_ds.map(tokenize_fn, batched=True, remove_columns=["text"])

train_tok = train_tok.map(lambda b: {"labels": b["input_ids"]}, batched=True)
val_tok   = val_tok.map(lambda b: {"labels": b["input_ids"]}, batched=True)

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [6]:
steps_per_epoch = math.ceil(len(train_tok) / (TRAIN_BSZ * GRAD_ACCUM))
print("Approx steps/epoch:", steps_per_epoch)

args = TrainingArguments(
    output_dir=str(CKPT_DIR),
    per_device_train_batch_size=TRAIN_BSZ,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    logging_steps=LOG_STEPS,
    evaluation_strategy="steps",
    eval_steps=max(20, LOG_STEPS),
    save_strategy=SAVE_STRATEGY,
    save_total_limit=2,
    bf16=False, fp16=False,
    dataloader_num_workers=0,
    report_to=[],
    seed=SEED
)

trainer = Trainer(
    model=model, args=args,
    train_dataset=train_tok, eval_dataset=val_tok,
    tokenizer=tokenizer, data_collator=collator,
)

train_out = trainer.train()
train_out

Approx steps/epoch: 75




Step,Training Loss,Validation Loss
20,0.7388,0.526418
40,0.4146,0.36983
60,0.3363,0.317347


TrainOutput(global_step=75, training_loss=0.6020697848002116, metrics={'train_runtime': 14179.9992, 'train_samples_per_second': 0.042, 'train_steps_per_second': 0.005, 'total_flos': 1053199739744256.0, 'train_loss': 0.6020697848002116, 'epoch': 1.0})

In [10]:
adapter_dir = CKPT_DIR / "adapter"
adapter_dir.mkdir(parents=True, exist_ok=True)
trainer.model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

lora_json = {
    "r": lora_cfg.r,
    "lora_alpha": lora_cfg.lora_alpha,
    "lora_dropout": float(lora_cfg.lora_dropout),
    "target_modules": list(lora_cfg.target_modules) if isinstance(lora_cfg.target_modules, (set, tuple)) else lora_cfg.target_modules,
    "task_type": str(lora_cfg.task_type),
    "bias": str(lora_cfg.bias),
}

run_cfg = {
    "model_id": MODEL_ID, "run_tag": RUN_TAG, "seed": SEED,
    "max_train_samples": MAX_TRAIN_SAMPLES, "max_val_samples": MAX_VAL_SAMPLES,
    "max_seq_len": MAX_SEQ_LEN, "lora": lora_json,
    "train_args": {
        "epochs": EPOCHS, "lr": LR, "batch_size": TRAIN_BSZ,
        "grad_accum": GRAD_ACCUM, "warmup_ratio": WARMUP_RATIO,
        "weight_decay": WEIGHT_DECAY
    }
}

(adapter_dir / "run_config.json").write_text(json.dumps(run_cfg, indent=2), encoding="utf-8")
str(adapter_dir)

'C:\\Users\\Admin\\Desktop\\domain-gen-llm\\checkpoints\\baseline-tinyllama-v1\\adapter'

In [11]:
from peft import PeftModel
base = AutoModelForCausalLM.from_pretrained(MODEL_ID)
inf_model = PeftModel.from_pretrained(base, adapter_dir)
inf_model.eval()

def generate_json(prompt: str, max_new_tokens=220):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    with torch.no_grad():
        out = inf_model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=False, temperature=0.0,
            pad_token_id=tokenizer.eos_token_id, eos_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(out[0], skip_special_tokens=True)

samples = val_raw[:20]
pred_rows = []
for r in samples:
    pred_text = generate_json(r["input"])
    resp = pred_text.split("### Response:")[-1].strip() if "### Response:" in pred_text else pred_text
    pred_rows.append({"id": r["id"], "input": r["input"], "pred": resp})

pred_path = ROOT / "eval" / f"preds_{RUN_TAG}_val.jsonl"
with open(pred_path, "w", encoding="utf-8") as f:
    for row in pred_rows:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")
str(pred_path), pred_rows[0]["pred"][:400]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


('C:\\Users\\Admin\\Desktop\\domain-gen-llm\\eval\\preds_baseline-tinyllama-v1_val.jsonl',
 'You are a domain name generator.\nBusiness description: "organic coffee shop for parents in Miami. Tone: friendly. Extra details: seasonal promos, bilingual marketing, strong mobile presence, calendar integrations."\nPreferred TLDs (order matters): .com, .co, .org, .ai\nConstraints: allow_hyphens=True, allow_numbers=True, prefer_puns=False\nReturn ONLY JSON in this schema:\n{"status": "success|blocked"')

In [13]:
from pathlib import Path
import json, re

PRED_PATH = (ROOT / "eval" / "preds_baseline-tinyllama-v1_val.jsonl")

if not PRED_PATH.exists():
    raise FileNotFoundError(f"Could not find predictions at: {PRED_PATH}\n"
                            f"Tip: run the prediction cell again, or list files with: list((ROOT/'eval').glob('preds_*'))")

rows = [json.loads(l) for l in open(PRED_PATH, "r", encoding="utf-8")]
total = len(rows)
parsed, blocked_ok, bad_json = 0, 0, 0

def looks_blocked(obj):
    """Our minimal safety check: must be {'status':'blocked','suggestions':[]} and include 'inappropriate' in message."""
    if not isinstance(obj, dict): return False
    if obj.get("status") != "blocked": return False
    if obj.get("suggestions") != []: return False
    msg = json.dumps(obj, ensure_ascii=False).lower()
    return "inappropriate" in msg

for r in rows:
    txt = r["pred"].strip()
    m = re.search(r"\{.*\}", txt, re.S)
    if not m:
        bad_json += 1
        continue
    try:
        obj = json.loads(m.group(0))
        parsed += 1
        if looks_blocked(obj):
            blocked_ok += 1
    except Exception:
        bad_json += 1

summary = {
    "total_preds": total,
    "json_parse_ok": parsed,
    "json_parse_rate": round(parsed / total, 3) if total else 0.0,
    "blocked_refusals_detected": blocked_ok,  # subset of parsed that look like proper refusals
    "bad_json": bad_json
}
summary

{'total_preds': 20,
 'json_parse_ok': 0,
 'json_parse_rate': 0.0,
 'blocked_refusals_detected': 0,
 'bad_json': 20}

In [12]:
from huggingface_hub import create_repo, upload_folder
HF_USERNAME = "AssemHomsi"
REPO_ID = f"{HF_USERNAME}/domain-gen-tinyllama-baseline-v1"

create_repo(REPO_ID, repo_type="model", exist_ok=True)

model_card = f"""---
license: mit
base_model: {MODEL_ID}
task: text-generation
tags: [lora, tinyllama, domain-name-generation, safety-refusals]
library_name: peft
---

# Domain Name Generator — TinyLlama Baseline (LoRA)
Adapters only (PEFT/LoRA). Load on top of `{MODEL_ID}`.
Trained on synthetic v1 dataset (seed=42). JSON-only IO with safety refusals.
"""
(adapter_dir / "README.md").write_text(model_card, encoding="utf-8")

upload_folder(
    repo_id=REPO_ID, folder_path=str(adapter_dir),
    path_in_repo=".", commit_message="Add baseline LoRA adapters + tokenizer + run_config"
)

REPO_ID

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/25.3M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

'AssemHomsi/domain-gen-tinyllama-baseline-v1'

In [14]:
import os, torch, json, re

torch.set_num_threads(os.cpu_count())

def build_inference_prompt(original_prompt: str) -> str:
    return (
        "### Instruction:\n"
        + original_prompt.strip()
        + "\n\nReturn ONLY JSON. Begin with '{' and end with '}'. "
          "No explanations, no backticks, no extra text.\n"
        "### Response:\n"
    )

def extract_first_json(text: str) -> str | None:
    start = text.find("{")
    if start == -1:
        return None
    depth = 0
    for i, ch in enumerate(text[start:], start):
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start:i+1]
    return None

@torch.inference_mode()
def generate_text(prompt: str, max_new_tokens=200):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    out = inf_model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=False, temperature=0.0,
        pad_token_id=tokenizer.eos_token_id, eos_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(out[0], skip_special_tokens=True)

samples = val_raw[:20]
pred_rows_shim = []
for r in samples:
    p = build_inference_prompt(r["input"])
    raw = generate_text(p, max_new_tokens=200)
    js = extract_first_json(raw)
    if js is None:
        js = json.dumps({"status":"blocked","message":"formatting error","suggestions":[]}, ensure_ascii=False)
    pred_rows_shim.append({"id": r["id"], "input": r["input"], "pred": js})

pred_path_shim = ROOT / "eval" / f"preds_{RUN_TAG}_val_shim.jsonl"
with open(pred_path_shim, "w", encoding="utf-8") as f:
    for row in pred_rows_shim:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

str(pred_path_shim), pred_rows_shim[0]["pred"][:200]

('C:\\Users\\Admin\\Desktop\\domain-gen-llm\\eval\\preds_baseline-tinyllama-v1_val_shim.jsonl',
 '{"status": "success|blocked", "suggestions": [{"domain":"...","confidence": 0.0}], "message":"optional"}')

In [15]:
from pathlib import Path
import json

PRED_PATH = pred_path_shim

rows = [json.loads(l) for l in open(PRED_PATH, "r", encoding="utf-8")]
total = len(rows)
parsed, blocked_ok, bad_json = 0, 0, 0

def looks_blocked(obj):
    return (
        isinstance(obj, dict)
        and obj.get("status") == "blocked"
        and obj.get("suggestions") == []
        and "inappropriate" in json.dumps(obj, ensure_ascii=False).lower()
    )

for r in rows:
    txt = r["pred"].strip()
    try:
        obj = json.loads(txt)
        parsed += 1
        if looks_blocked(obj):
            blocked_ok += 1
    except Exception:
        bad_json += 1

{"total_preds": total, "json_parse_ok": parsed, "json_parse_rate": round(parsed/total, 3), "blocked_ok": blocked_ok, "bad_json": bad_json}

{'total_preds': 20,
 'json_parse_ok': 20,
 'json_parse_rate': 1.0,
 'blocked_ok': 0,
 'bad_json': 0}