In [9]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.optim import AdamW
from datasets import load_dataset
from accelerate import Accelerator
import math
import json
from tqdm import tqdm
import os

In [14]:
# Configurations
# Everything a reader might want to tune lives in this cell.
TEACHER_MODEL = "meta-llama/Llama-2-7b-chat-hf"  # checkpoint id handed to from_pretrained for the teacher
STUDENT_MODEL = "LLaMA-2-0.7b"  # NOTE(review): no org prefix - presumably a local checkpoint directory; confirm
DATASET_NAME = "tatsu-lab/alpaca"
MAX_LENGTH = 128       # prompts are truncated / padded to this many tokens
BATCH_SIZE = 4
LEARNING_RATE = 5e-5
EPOCHS = 1   # for demo; increase for real training
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TEMPERATURE = 1.0      # softmax temperature used by the distillation loss
SEED = 42              # fixed seed so shuffling and any sampling are repeatable

# Seed PyTorch's RNGs up front; the training DataLoader shuffles, so without
# this every "Restart & Run All" would see batches in a different order.
torch.manual_seed(SEED)

print(DEVICE)

cpu


In [15]:
tokenizer = AutoTokenizer.from_pretrained(TEACHER_MODEL)
# tokenize_fn pads every example to max_length, which raises if the tokenizer
# has no pad token (LLaMA-family tokenizers commonly ship without one).
# Reusing EOS is safe because padded positions are identified through the
# attention mask, not through the token id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset(DATASET_NAME)
# Later cells read dataset["test"], but a dataset may publish only a "train"
# split. Carve out a fixed, seeded hold-out so those cells have data and the
# split is identical on every run.
if "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Generating train split: 100%|██████████| 52002/52002 [00:00<00:00, 208645.33 examples/s]


In [None]:
# Persist the held-out split as JSON Lines: one serialized example per line.
with open("alpaca_test.jsonl", "w") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in dataset["test"])

def tokenize_fn(ex):
    """Tokenize one example's instruction into fixed-length model inputs.

    Args:
        ex: a single dataset row; only ``ex["instruction"]`` is read.

    Returns:
        The tokenizer's encoding (truncated / padded to ``MAX_LENGTH``) with an
        added ``"labels"`` entry: a copy of ``input_ids`` in which every padded
        position is replaced by -100.
    """
    enc = tokenizer(
        ex["instruction"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
    )
    # -100 is the default ignore_index of PyTorch's cross-entropy, so padded
    # positions stop contributing to the loss and to perplexity. The attention
    # mask (not the token id) decides what is padding, which stays correct even
    # when the pad token is the same id as EOS.
    enc["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(enc["input_ids"], enc["attention_mask"])
    ]
    return enc

# with_format("torch") makes the dataset return tensors. Without it each row is
# a dict of Python lists, and the DataLoader's default collate turns a batch
# into a *list* of per-position tensors, so batch["input_ids"].to(...) fails.
tokenized_dataset = (
    dataset["train"]
    .map(tokenize_fn, remove_columns=dataset["train"].column_names)
    .with_format("torch")
)
train_loader = DataLoader(tokenized_dataset, batch_size=BATCH_SIZE, shuffle=True)

# ----------------------------
# Load teacher and student
# ----------------------------
# Teacher: inference only. eval() disables dropout; requires_grad_(False)
# freezes the weights so no gradient buffers are ever allocated for them.
teacher_model = AutoModelForCausalLM.from_pretrained(TEACHER_MODEL).eval().to(DEVICE)
teacher_model.requires_grad_(False)

# Student: trained in full precision. The previous `.quantize(bits=4)` call is
# not a method of transformers models (it raised AttributeError), and 4-bit
# weights could not be updated by AdamW anyway. Quantize the student *after*
# distillation, e.g. by reloading the saved checkpoint with a quantization
# config, if a 4-bit deployment artifact is needed.
student_model = AutoModelForCausalLM.from_pretrained(STUDENT_MODEL).to(DEVICE)
student_model.train()

# ----------------------------
# Optimizer & scheduler
# ----------------------------
# Build the accelerator and the optimizer, then let accelerate wrap the student,
# the optimizer and the training loader in one call (results come back in the
# same order they were passed in).
accelerator = Accelerator()
optimizer = AdamW(student_model.parameters(), lr=LEARNING_RATE)
(
    student_model,
    optimizer,
    train_loader,
) = accelerator.prepare(student_model, optimizer, train_loader)

# ----------------------------
# KL + MLE distillation loss
# ----------------------------
def distillation_loss(student_logits, teacher_logits, labels, temperature=TEMPERATURE):
    """Combined hard-label (cross-entropy) and soft-label (KL) distillation loss.

    Logits at position ``t`` are scored against the token at ``t + 1`` (the
    usual next-token alignment), and both terms are averaged over the scored
    tokens only, so the value does not scale with sequence length or padding.

    Args:
        student_logits: student outputs; the code indexes them as
            ``(..., seq, vocab)``.
        teacher_logits: teacher outputs for the same inputs, same layout.
            The KL term requires the same vocabulary size as the student.
        labels: target token ids, ``(..., seq)``; positions equal to -100 are
            excluded from both terms.
        temperature: softening temperature for the KL term. The KL term is
            multiplied by ``temperature ** 2``.

    Returns:
        Scalar tensor ``cross_entropy + kl``. If no position is scorable, a
        zero that is still attached to ``student_logits``' graph.
    """
    # Next-token alignment: drop the last logit and the first label.
    student_flat = student_logits[..., :-1, :].reshape(-1, student_logits.size(-1))
    teacher_flat = teacher_logits[..., :-1, :].reshape(-1, teacher_logits.size(-1))
    targets = labels[..., 1:].reshape(-1)

    scored = targets != -100
    if not scored.any():
        # Nothing to learn from (e.g. an all-padding batch): return a zero that
        # keeps backward() valid instead of the NaN a mean over nothing gives.
        return student_flat.sum() * 0.0

    student_flat = student_flat[scored]
    teacher_flat = teacher_flat[scored]
    targets = targets[scored]

    # KL divergence with temperature scaling. After flattening, "batchmean"
    # divides by the number of scored tokens (on 3-D input it divided by the
    # batch size only, i.e. it summed over the sequence).
    kl_loss = nn.functional.kl_div(
        nn.functional.log_softmax(student_flat / temperature, dim=-1),
        nn.functional.softmax(teacher_flat / temperature, dim=-1),
        reduction="batchmean",
    ) * (temperature ** 2)

    # Standard MLE loss on the hard labels.
    ce_loss = nn.functional.cross_entropy(student_flat, targets)
    return ce_loss + kl_loss

# ----------------------------
# Training loop
# ----------------------------
for epoch in range(EPOCHS):
    # Stays NaN only if the loader yields no batches, so the summary line
    # below can never hit an undefined `loss`.
    last_loss = float("nan")

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        # Examples are padded to a fixed length, so both models need the
        # attention mask to ignore the padded positions.
        with torch.no_grad():
            teacher_logits = teacher_model(input_ids, attention_mask=attention_mask).logits

        student_logits = student_model(input_ids, attention_mask=attention_mask).logits
        loss = distillation_loss(student_logits, teacher_logits, labels)

        optimizer.zero_grad()
        accelerator.backward(loss)  # accelerate's backward instead of loss.backward()
        optimizer.step()
        last_loss = loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS} done. Last batch loss: {last_loss:.4f}")

# ----------------------------
# Evaluation: Perplexity
# ----------------------------
def perplexity(model, dataloader):
    """Token-level perplexity of ``model`` over ``dataloader``.

    Each batch's mean loss is weighted by the number of label positions that
    are actually scored (after the one-position shift and after excluding
    padding), so padding and uneven batches do not skew the result.

    Args:
        model: callable as ``model(input_ids, attention_mask=..., labels=...)``
            returning an object with a scalar ``.loss``. The loss is assumed to
            be the mean over shifted labels that are not -100 (the Hugging Face
            causal-LM convention) - confirm for custom models.
        dataloader: iterable of dict batches with ``"input_ids"`` and
            ``"labels"``, and optionally ``"attention_mask"``.

    Returns:
        ``exp(total negative log-likelihood / number of scored tokens)``.

    Raises:
        ValueError: if the dataloader contains no scorable token.

    Side effects:
        Puts ``model`` in eval mode and leaves it there.
    """
    model.eval()
    # Run on whatever device the model lives on rather than a notebook global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    total_nll = 0.0
    total_tokens = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            attention_mask = batch.get("attention_mask")
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)
                # Defensive: never score padding, even if the dataset left the
                # padded label positions unmasked.
                labels = labels.masked_fill(attention_mask == 0, -100)

            # Position t predicts token t+1, so only labels[1:] are scored.
            n_scored = int((labels[..., 1:] != -100).sum())
            if n_scored == 0:
                continue  # the mean loss of an all-padding batch is undefined

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_nll += outputs.loss.item() * n_scored
            total_tokens += n_scored

    if total_tokens == 0:
        raise ValueError("perplexity: dataloader produced no scorable tokens")
    return math.exp(total_nll / total_tokens)

# with_format("torch") is required for the same reason as on the training set:
# without it the default collate produces lists of per-position tensors and
# batch["input_ids"].to(...) fails inside perplexity().
test_dataset = (
    dataset["test"]
    .map(tokenize_fn, remove_columns=dataset["test"].column_names)
    .with_format("torch")
)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# perplexity() switches each model to eval mode as a side effect.
student_ppl = perplexity(student_model, test_loader)
teacher_ppl = perplexity(teacher_model, test_loader)
print("\n=== Evaluation ===")
print(f"Teacher Perplexity: {teacher_ppl:.2f}")
print(f"Student Perplexity: {student_ppl:.2f}")

# ----------------------------
# Benchmarking generation speed
# ----------------------------
import time
prompt = "Explain quantum computing in simple terms."
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)

# perf_counter is monotonic and high-resolution; time.time() can jump with
# system clock adjustments.
start_time = time.perf_counter()
with torch.no_grad():
    generated = student_model.generate(input_ids, max_new_tokens=50)
end_time = time.perf_counter()

# generate() may stop early at EOS, so count what was actually produced instead
# of assuming all 50 tokens. For a decoder-only model the output contains the
# prompt followed by the new tokens.
new_tokens = generated.shape[-1] - input_ids.shape[-1]
print(f"Student generation speed: {new_tokens/(end_time - start_time):.2f} tokens/sec")

# ----------------------------
# Extended Benchmarking Metrics
# ----------------------------
from datasets import load_metric

# 1. Token KL Divergence
def token_kl_divergence(student_model, teacher_model, dataloader):
    """Average per-token KL(teacher || student) over ``dataloader``.

    The divergence is computed for every position, summed over the vocabulary,
    and averaged over non-padded positions only. (Applying ``batchmean`` to
    3-D logits would instead divide by the batch size alone and therefore grow
    with the sequence length.)

    Args:
        student_model: callable as ``model(input_ids, attention_mask=...)``
            returning an object with ``.logits``.
        teacher_model: same calling convention and vocabulary size.
        dataloader: iterable of dict batches with ``"input_ids"`` and
            optionally ``"attention_mask"`` (all positions count when absent).

    Returns:
        Mean KL divergence per real token, in nats, as a Python float.

    Raises:
        ValueError: if no non-padded token is seen.
    """
    def _device_of(model):
        # Use the model's own device so the two models may live on different ones.
        first_param = next(model.parameters(), None)
        return first_param.device if first_param is not None else torch.device("cpu")

    student_device = _device_of(student_model)
    teacher_device = _device_of(teacher_model)

    total_kl = 0.0
    total_tokens = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"]
            attention_mask = batch.get("attention_mask")
            if attention_mask is None:
                attention_mask = torch.ones_like(input_ids)

            teacher_logits = teacher_model(
                input_ids.to(teacher_device),
                attention_mask=attention_mask.to(teacher_device),
            ).logits.to(student_device)
            student_logits = student_model(
                input_ids.to(student_device),
                attention_mask=attention_mask.to(student_device),
            ).logits

            # reduction="none" keeps one value per (position, vocab entry);
            # summing over the vocab axis yields one KL value per position.
            per_token_kl = nn.functional.kl_div(
                nn.functional.log_softmax(student_logits.float(), dim=-1),
                nn.functional.softmax(teacher_logits.float(), dim=-1),
                reduction="none",
            ).sum(dim=-1)

            real_tokens = attention_mask.to(student_device).bool()
            total_kl += per_token_kl[real_tokens].sum().item()
            total_tokens += int(real_tokens.sum())

    if total_tokens == 0:
        raise ValueError("token_kl_divergence: dataloader produced no tokens")
    return total_kl / total_tokens

token_kl = token_kl_divergence(student_model, teacher_model, test_loader)

# 2. BLEU Score (lexical similarity)
# Candidates are the student's decoded generate() outputs; each reference is the
# decoded input prompt of the same example. Both are whitespace-tokenized.
# NOTE(review): the references are the prompts themselves, not gold answers -
# confirm this is the intended comparison.
bleu = load_metric("bleu")
all_references = []
all_predictions = []

for batch in test_loader:
    input_ids = batch["input_ids"].to(DEVICE)
    with torch.no_grad():
        outputs = student_model.generate(input_ids, max_new_tokens=50)

    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    refs = tokenizer.batch_decode(input_ids, skip_special_tokens=True)

    # One token list per candidate; one single-element list of token lists
    # per example on the reference side.
    all_predictions += [text.split() for text in preds]
    all_references += [[text.split()] for text in refs]

bleu_result = bleu.compute(predictions=all_predictions, references=all_references)
bleu_score = bleu_result["bleu"]

# 3. Semantic similarity (BERTScore)
import bert_score

# bert_score.score expects plain strings. all_predictions / all_references hold
# whitespace-split token lists (the BLEU layout), and a non-string reference
# entry would be interpreted as a *group of several references* - i.e. every
# single word would become its own reference. Join the tokens back into
# sentences before scoring.
candidate_texts = [" ".join(tokens) for tokens in all_predictions]
reference_texts = [" ".join(ref_group[0]) for ref_group in all_references]

P, R, F1 = bert_score.score(candidate_texts, reference_texts, lang="en", verbose=True)
semantic_similarity = F1.mean().item()

# 4. Generation speed (CPU and GPU if available)
def measure_speed(model, prompt="Hello world", max_new_tokens=50):
    """Measure generation throughput of ``model`` in tokens per second.

    Args:
        model: object exposing ``generate(input_ids, max_new_tokens=...)``.
        prompt: text that is tokenized with the notebook's ``tokenizer``.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        Number of tokens actually generated divided by the elapsed seconds.
        A single un-warmed run, so treat it as a rough figure.
    """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)

    # perf_counter is monotonic; time.time() can jump with clock adjustments.
    start = time.perf_counter()
    with torch.no_grad():
        generated = model.generate(input_ids, max_new_tokens=max_new_tokens)
    elapsed = time.perf_counter() - start

    # Generation can stop early at EOS, so count the tokens really produced
    # rather than assuming max_new_tokens. For a decoder-only model the output
    # holds the prompt followed by the new tokens.
    new_tokens = generated.shape[-1] - input_ids.shape[-1]
    return new_tokens / elapsed

# NOTE(review): despite the name, this is measured on DEVICE, which is "cuda"
# whenever a GPU is available.
cpu_speed = measure_speed(student_model)

# 5. Model size & RAM
import os
def model_size(model):
    """Return the on-disk size, in GB (1e9 bytes), of ``model``'s state dict.

    The state dict is serialized with ``torch.save`` into a private temporary
    directory that is removed afterwards - also when saving fails - so nothing
    in the working directory is created, overwritten, or left behind.

    Args:
        model: any object exposing ``state_dict()``.

    Returns:
        Size of the serialized checkpoint in gigabytes, as a float.
    """
    import tempfile  # local import keeps this helper self-contained

    # A directory (rather than NamedTemporaryFile) so the file can be opened by
    # name on Windows as well.
    with tempfile.TemporaryDirectory() as scratch_dir:
        checkpoint_path = os.path.join(scratch_dir, "temp.pt")
        torch.save(model.state_dict(), checkpoint_path)
        return os.path.getsize(checkpoint_path) / 1e9  # GB

teacher_size = model_size(teacher_model)
student_size = model_size(student_model)

# ----------------------------
# Portfolio-ready metrics table
# ----------------------------
import pandas as pd

# One tuple per table row: (metric name, teacher value, student value, note).
# The teacher's KL / BLEU / BERTScore entries are fixed reference values, not
# measurements; its throughput is measured here with measure_speed().
metric_rows = [
    ("Perplexity", teacher_ppl, student_ppl, "Baseline"),
    ("Token KL Divergence", 0, token_kl, "Perfect alignment"),
    ("BLEU", 1.0, bleu_score, "Lexical similarity"),
    ("Semantic Similarity (BERTScore)", 1.0, semantic_similarity, "Meaning retention"),
    ("Tokens/sec (CPU)", measure_speed(teacher_model), cpu_speed, "Huge efficiency gain"),
    ("Model Size (GB)", teacher_size, student_size, "Deployable on IoT"),
]

# Transpose the rows into the column-oriented mapping pandas expects.
metric_names, teacher_values, student_values, notes = zip(*metric_rows)
metrics = {
    "Metric": list(metric_names),
    "Teacher (7B)": list(teacher_values),
    "Student (<1B)": list(student_values),
    "Notes": list(notes),
}

df = pd.DataFrame(metrics)
print("\n=== Recommended Metrics Table ===")
print(df.to_string(index=False))



  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'transformers'