In [1]:
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")


2.9.1+cu128
12.8
True
NVIDIA L4


In [2]:
import os, pathlib, json, re, random, math, time
from typing import List, Dict, Any

# === Paths (adjust if needed) ===
PROJECT_ROOT = "/home/cj.bowers/blue_cis6930/llama3_project"
DATA_JSONL   = f"{PROJECT_ROOT}/data/ai_tutor_merged.jsonl"
CACHE_DIR    = f"{PROJECT_ROOT}/.cache/huggingface"
OUTPUTS_DIR  = f"{PROJECT_ROOT}/outputs"
MODELS_DIR   = f"{PROJECT_ROOT}/models"

# Create dirs
for p in [CACHE_DIR, OUTPUTS_DIR, MODELS_DIR]:
    pathlib.Path(p).mkdir(parents=True, exist_ok=True)

# Keep HF cache on Blue
os.environ["HF_HOME"] = CACHE_DIR

# OPTIONAL (for gated models like Llama 3). If needed, uncomment and paste your token:
# os.environ["HF_TOKEN"] = "hf_..."

# Base model (pick one you have access to)
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"   # ungated, no HF token needed
EVAL_SAMPLES = 200                            # quick eval size (increase later if you want)

print("Paths set.")



Paths set.


In [3]:
def iter_jsonl(path, limit=None):
    with open(path, "r") as f:
        for i, line in enumerate(f):
            if limit is not None and i >= limit:
                break
            yield json.loads(line)

ASSISTANT_MARK = "###Assistant:"
USER_MARK = "###User:"

def split_prompt_answer(example_text: str):
    """
    Return (prompt_text_to_feed_model, target_answer).
    We feed everything up to '###Assistant:' as the prompt context.
    """
    idx = example_text.find(ASSISTANT_MARK)
    if idx == -1:
        return None, None
    prompt_part = example_text[:idx + len(ASSISTANT_MARK)]
    # The actual assistant target follows the marker
    answer = example_text[idx + len(ASSISTANT_MARK):].strip()
    return prompt_part.strip(), answer

# Build a small eval set
eval_rows = []
for obj in iter_jsonl(DATA_JSONL, limit=20000):  # scan first chunk to sample diverse items
    p, a = split_prompt_answer(obj["text"])
    if p and a:
        eval_rows.append({"prompt": p, "answer": a})
    if len(eval_rows) >= EVAL_SAMPLES:
        break

len(eval_rows), eval_rows[0]["prompt"][:160]


(200,
 '###System: You are an expert AI tutor. Explain technical concepts clearly. ###User: You will be given a definition of a task first, then some input of the task.')

In [4]:
# Cell 3 — Tokenizer & metrics/helpers (works with Mistral)
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Uses MODEL_ID from your config cell (e.g., "mistralai/Mistral-7B-Instruct-v0.2")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

ASSISTANT_MARK = "###Assistant:"

def normalize_text(s: str):
    return re.sub(r"\s+", " ", s.strip().lower())

def token_f1(pred: str, gold: str):
    p = normalize_text(pred).split()
    g = normalize_text(gold).split()
    if not p or not g: return 0.0
    inter = sum(min(p.count(w), g.count(w)) for w in set(p))
    if inter == 0: return 0.0
    precision = inter / len(p)
    recall    = inter / len(g)
    return 2 * precision * recall / (precision + recall)

def _lcs_len(a, b):
    n, m = len(a), len(b)
    dp = [[0]*(m+1) for _ in range(n+1)]
    for i in range(n):
        ai = a[i]
        dpi1 = dp[i+1]
        dpi  = dp[i]
        for j in range(m):
            dpi1[j+1] = dpi[j] + 1 if ai == b[j] else (dpi1[j] if dpi1[j] > dpi[j+1] else dpi[j+1])
    return dp[n][m]

def rouge_l(pred: str, gold: str):
    P, G = normalize_text(pred).split(), normalize_text(gold).split()
    if not P or not G: return 0.0
    L = _lcs_len(P, G)
    prec, rec = L/len(P), L/len(G)
    return 0.0 if (prec+rec)==0 else 2*prec*rec/(prec+rec)

def reduce_scores(preds, refs):
    f1s = [token_f1(p, r) for p, r in zip(preds, refs)]
    rls = [rouge_l(p, r) for p, r in zip(preds, refs)]
    return {
        "count": len(f1s),
        "token_f1_mean": sum(f1s)/len(f1s) if f1s else 0.0,
        "rougeL_mean":   sum(rls)/len(rls) if rls else 0.0,
    }

def generate_batch(prompts, model, max_new_tokens=256, temperature=0.0):
    """Generate completions and return ONLY the text after the last '###Assistant:' marker."""
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(
        next(model.parameters()).device
    )
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=(temperature > 0),
            temperature=temperature,
            top_p=0.9,
            eos_token_id=tokenizer.eos_token_id,
        )
    texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    completions = []
    for full in texts:
        idx = full.rfind(ASSISTANT_MARK)
        completions.append(full[idx+len(ASSISTANT_MARK):].strip() if idx != -1 else full.strip())
    return completions


In [5]:
# Cell 4 — FAST baseline (Mistral) so the cell completes quickly
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm.auto import tqdm
import transformers

transformers.logging.set_verbosity_error()  # quiet the padding/temp warnings

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"   # decoder-only: left padding

# model
base_eval_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto"
).eval()

# --- make a tiny eval slice so this finishes fast ---
EVAL_SLICE = 20                   # <-- change to 100/200 later if you want
rows = eval_rows[:EVAL_SLICE]

preds_base, golds = [], []
BATCH = 1                         # safer on VRAM; bump to 2 later
MAX_NEW = 64                      # quicker gen; bump to 128–256 later

for i in tqdm(range(0, len(rows), BATCH), desc="Baseline eval"):
    batch = rows[i:i+BATCH]
    batch_prompts = [r["prompt"] for r in batch]
    batch_refs    = [r["answer"] for r in batch]

    # generate_batch uses no_grad; just ensure model is eval()
    preds = generate_batch(batch_prompts, base_eval_model,
                           max_new_tokens=MAX_NEW, temperature=0.0)
    preds_base.extend(preds)
    golds.extend(batch_refs)

baseline_metrics = reduce_scores(preds_base, golds)
print("✅ Fast baseline done:", baseline_metrics)

# Keep these globals for later cells:
# - baseline_metrics
# - golds



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Baseline eval:   0%|          | 0/20 [00:00<?, ?it/s]

✅ Fast baseline done: {'count': 20, 'token_f1_mean': 0.2859335632238266, 'rougeL_mean': 0.21637248040869558}


In [6]:
# Cell 5 — Mistral training model init (QLoRA 4-bit, fp16 compute, HF Trainer)

from dataclasses import dataclass
from typing import Any, List, Dict
import os, torch
from datasets import load_dataset
from transformers import (
    TrainingArguments, AutoModelForCausalLM, Trainer, AutoTokenizer, BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model

# --- env tweaks: avoid deepspeed, reduce fragmentation ---
os.environ["ACCELERATE_USE_DEEPSPEED"] = "0"
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

# assumes MODEL_ID, DATA_JSONL, OUTPUTS_DIR, and tokenizer already exist
# if tokenizer isn't defined yet, uncomment the next line:
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, padding_side="left")

# --- 4-bit quantization config (QLoRA) ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,  # fp16 compute on L4
)

# --- Load base model quantized + automatically placed on GPU ---
train_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,                      # e.g., "mistralai/Mistral-7B-Instruct-v0.2"
    quantization_config=bnb_config,
    device_map="auto",             # safe now that we’re not using meta tensors
    low_cpu_mem_usage=False        # keep False to avoid lingering 'meta'
)
train_model.config.use_cache = False
train_model.gradient_checkpointing_enable()
# needed for QLoRA so gradients flow to inputs with 4-bit weights
if hasattr(train_model, "enable_input_require_grads"):
    train_model.enable_input_require_grads()

# --- LoRA config (good defaults for Mistral 7B) ---
lora_cfg = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)
train_model = get_peft_model(train_model, lora_cfg)

# --- Dataset (your merged JSONL) ---
train_ds = load_dataset("json", data_files=DATA_JSONL, split="train")

# --- Loss mask: ignore everything before "###Assistant:" ---
ASSISTANT_MARK = "###Assistant:"
assistant_ids = tokenizer(ASSISTANT_MARK, add_special_tokens=False).input_ids

@dataclass
class AssistantOnlyCollator:
    tokenizer: Any
    max_length: int = 1024  # you can raise to 2048 later if VRAM allows

    def __call__(self, features: List[Dict[str, Any]]):
        texts = [f["text"] for f in features]
        batch = self.tokenizer(
            texts,
            max_length=self.max_length,
            truncation=True,
            padding=True,
            return_tensors="pt",
        )
        labels = batch["input_ids"].clone()

        # mask tokens before the start of Assistant span
        for i, ids in enumerate(batch["input_ids"]):
            start = -1
            for j in range(len(ids) - len(assistant_ids) + 1):
                if torch.equal(ids[j:j+len(assistant_ids)],
                               torch.tensor(assistant_ids, device=ids.device)):
                    start = j + len(assistant_ids)
                    break
            if start > 0:
                labels[i, :start] = -100
        batch["labels"] = labels
        return batch

collator = AssistantOnlyCollator(tokenizer=tokenizer, max_length=1024)

# --- Training arguments (tuned for 4-bit + LoRA on L4) ---
out_dir = f"{OUTPUTS_DIR}/mistral_lora_ai_tutor"
args = TrainingArguments(
    output_dir=out_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=1,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=20,
    save_steps=1000,
    fp16=True,                  # fp16 compute is fine with 4-bit
    bf16=False,                 # keep False unless you specifically want bf16
    optim="adamw_torch",
    report_to="none",
    remove_unused_columns=False,
    dataloader_pin_memory=True,
)

# --- Standard HF Trainer ---
trainer = Trainer(
    model=train_model,
    args=args,
    train_dataset=train_ds,
    data_collator=collator,
    tokenizer=tokenizer,
)

print("✅ Trainer ready (Mistral + QLoRA 4-bit, fp16 compute, HF Trainer)")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Trainer ready (Mistral + QLoRA 4-bit, fp16 compute, HF Trainer)


  trainer = Trainer(


In [7]:
# smoke test first if you want:
# trainer.train(max_steps=200)

trainer.train()

# Save LoRA adapter
adapter_dir = f"{MODELS_DIR}/mistral_ai_tutor_lora"
trainer.model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)
adapter_dir


{'loss': 1.1154, 'grad_norm': 0.7724497318267822, 'learning_rate': 2.2975996130358548e-07, 'epoch': 3.627967365526555e-05}
{'loss': 1.2142, 'grad_norm': 0.7217940092086792, 'learning_rate': 4.7161255214946496e-07, 'epoch': 7.25593473105311e-05}
{'loss': 1.2568, 'grad_norm': 1.7581788301467896, 'learning_rate': 7.134651429953444e-07, 'epoch': 0.00010883902096579666}
{'loss': 1.1551, 'grad_norm': 0.9755426049232483, 'learning_rate': 1.1971703246871033e-06, 'epoch': 0.00018139836827632777}
{'loss': 1.0188, 'grad_norm': 0.9482971429824829, 'learning_rate': 1.4390229155329827e-06, 'epoch': 0.00021767804193159333}
{'loss': 1.2063, 'grad_norm': 0.8276782631874084, 'learning_rate': 1.6808755063788623e-06, 'epoch': 0.0002539577155868589}
{'loss': 1.0144, 'grad_norm': 0.71774822473526, 'learning_rate': 1.9227280972247415e-06, 'epoch': 0.0002902373892421244}
{'loss': 1.1212, 'grad_norm': 1.069435954093933, 'learning_rate': 2.152488058528327e-06, 'epoch': 0.00032651706289738995}
{'loss': 1.0854, '

KeyboardInterrupt: 

In [11]:
from peft import PeftModel
from transformers import AutoModelForCausalLM
import torch, pathlib

# Merge on CPU to be VRAM-safe
base_for_merge = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map=None
).to("cpu")

peft_loaded = PeftModel.from_pretrained(base_for_merge, adapter_dir)
merged_model = peft_loaded.merge_and_unload()  # merged fp16 weights on CPU

merged_dir = f"{MODELS_DIR}/mistral_ai_tutor_merged"
pathlib.Path(merged_dir).mkdir(parents=True, exist_ok=True)
merged_model.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)

print("Merged model saved to:", merged_dir)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Merged model saved to: /home/cj.bowers/blue_cis6930/llama3_project/models/mistral_ai_tutor_merged


In [12]:
# Load merged Mistral for eval
trained_eval_model = AutoModelForCausalLM.from_pretrained(
    merged_dir,
    torch_dtype=torch.bfloat16,
    device_map="auto"
).eval()

# Compare baseline (already computed) vs trained
preds_trained = []
B = 4
for i in range(0, len(eval_rows), B):
    batch_prompts = [r["prompt"] for r in eval_rows[i:i+B]]
    preds = generate_batch(batch_prompts, trained_eval_model, max_new_tokens=256, temperature=0.0)
    preds_trained.extend(preds)

trained_metrics = reduce_scores(preds_trained, golds)
baseline_metrics, trained_metrics



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


OutOfMemoryError: CUDA out of memory. Tried to allocate 114.00 MiB. GPU 0 has a total capacity of 22.05 GiB of which 58.12 MiB is free. Including non-PyTorch memory, this process has 21.98 GiB memory in use. Of the allocated memory 21.35 GiB is allocated by PyTorch, and 404.15 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import csv, datetime

ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
results_dir = f"{OUTPUTS_DIR}/eval_{ts}"
pathlib.Path(results_dir).mkdir(parents=True, exist_ok=True)

with open(f"{results_dir}/metrics.json", "w") as f:
    json.dump({
        "baseline": baseline_metrics,
        "trained": trained_metrics,
        "n_eval": len(golds),
        "model_id": MODEL_ID,
        "adapter_dir": adapter_dir,
        "merged_dir": merged_dir
    }, f, indent=2)

with open(f"{results_dir}/predictions.tsv", "w", newline="") as f:
    w = csv.writer(f, delimiter="\t")
    w.writerow(["idx","prompt","gold","baseline_pred","trained_pred"])
    for i,(row,b,t) in enumerate(zip(eval_rows, preds_base, preds_trained)):
        w.writerow([i,
                    row["prompt"].replace("\n"," ")[:300],
                    row["answer"].replace("\n"," ")[:300],
                    b.replace("\n"," ")[:300],
                    t.replace("\n"," ")[:300]])

print("results saved to:", results_dir)
print("baseline:", baseline_metrics)
print("trained :", trained_metrics)



In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# ----- CONFIG -----
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"  # or your base model
ADAPTER_DIR = f"{OUTPUTS_DIR}/mistral_ai_tutor_lora"

TEST_PROMPTS = [
    "Explain what a function is in Python to a first-year CS student.",
    "Help me understand the difference between a list and a dictionary in Python.",
    "I keep forgetting the chain rule in calculus. Can you explain it with an example?",
    "I got 75% on a quiz. It had 40 questions. How many did I get right?",
]

MAX_NEW_TOKENS = 200
TEMPERATURE = 0.7
TOP_P = 0.9

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Use tokenizer from your adapter folder (includes special tokens + template)
tokenizer = AutoTokenizer.from_pretrained(
    ADAPTER_DIR,
    use_fast=True,
    padding_side="left"
)

def run_model(model, prompts, label):
    print(f"\n=== {label} ===")
    outputs = []
    for i, prompt in enumerate(prompts, start=1):
        print(f"\n[{label}] Prompt {i}/{len(prompts)}")
        print("Q:", prompt)

        full_prompt = f"###Human: {prompt}\n###Assistant:"
        inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            gen = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=TEMPERATURE,
                top_p=TOP_P,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        text = tokenizer.decode(gen[0], skip_special_tokens=True)
        outputs.append(text)
        print("---- Response ----")
        print(text)
        print("------------------")
    return outputs


# ================== 1) BASE MODEL ONLY ==================
print("Loading BASE model (no LoRA)…")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
).to(device)
base_model.eval()

base_outputs = run_model(base_model, TEST_PROMPTS, "BASE")

del base_model
torch.cuda.empty_cache()


# ================== 2) TUNED MODEL (LoRA) ==================
print("\nLoading TUNED model (Mistral + LoRA)…")
tuned_base = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
).to(device)

tuned_model = PeftModel.from_pretrained(tuned_base, ADAPTER_DIR)
tuned_model.eval()

tuned_outputs = run_model(tuned_model, TEST_PROMPTS, "TUNED")

# ================== 3) SIDE-BY-SIDE SUMMARY ==================
print("\n\n================ SIDE-BY-SIDE COMPARISON ================\n")
for i, prompt in enumerate(TEST_PROMPTS):
    print(f"### Prompt {i+1}")
    print("Q:", prompt)
    print("\n[BASE]")
    print(base_outputs[i])
    print("\n[TUNED]")
    print(tuned_outputs[i])
    print("\n" + "=" * 70 + "\n")

Using device: cuda
Loading BASE model (no LoRA)…


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 22.05 GiB of which 58.12 MiB is free. Including non-PyTorch memory, this process has 21.98 GiB memory in use. Of the allocated memory 21.50 GiB is allocated by PyTorch, and 247.63 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)