In [None]:
# Install

In [None]:
# Import

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from evaluate import load as load_metric
import torch


In [None]:

# Settings: model / dataset identifiers and generation limits
teacher_model_id = "your-username/teacher-model-name"
student_model_id = "your-username/student-model-name"
dataset_id = "your-username/your-dataset-id"
test_size = 0.2  # passed to train_test_split below
max_input_length = 512  # not referenced in the visible part of this script
max_gen_length = 128  # used as max_new_tokens in generate_answers

# Only the "train" split is loaded; the evaluation set is a seeded hold-out of it
full_dataset = load_dataset(dataset_id, split="train")
split_dataset = full_dataset.train_test_split(seed=42, test_size=test_size)
test_dataset = split_dataset["test"]

# Tokenizers (teacher first, then student)
teacher_tokenizer, student_tokenizer = (
    AutoTokenizer.from_pretrained(model_id)
    for model_id in (teacher_model_id, student_model_id)
)

# Models in float16, moved to the GPU (teacher first, then student)
teacher_model, student_model = (
    AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).cuda()
    for model_id in (teacher_model_id, student_model_id)
)

# One text-generation pipeline per model, both on device 0
teacher_pipeline, student_pipeline = (
    pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
    for model, tokenizer in (
        (teacher_model, teacher_tokenizer),
        (student_model, student_tokenizer),
    )
)

# Evaluation metrics, loaded by name
f1_metric, bleu_metric, exact_match_metric = (
    load_metric(metric_name) for metric_name in ("f1", "bleu", "exact_match")
)

# Format prompt for your task
def format_prompt(example):
    """Build the generation prompt for a single dataset record.

    Args:
        example: Mapping that provides a ``"question"`` entry.

    Returns:
        The question prefixed with ``"Question: "`` and followed, on a new
        line, by ``"Answer:"``.
    """
    question = example["question"]
    return f"Question: {question}\nAnswer:"

# Generate answers
def generate_answers(model_pipeline, inputs):
    """Generate one greedy completion per prompt and return only the new text.

    Args:
        model_pipeline: Callable invoked as
            ``model_pipeline(inputs, max_new_tokens=..., do_sample=False)``.
            It must return one result per prompt, each either a dict with a
            ``"generated_text"`` key or a sequence whose first item is such
            a dict.
        inputs: List of prompt strings.

    Returns:
        A list of stripped completions, in the same order as ``inputs``.
    """
    outputs = model_pipeline(inputs, max_new_tokens=max_gen_length, do_sample=False)

    answers = []
    for prompt, output in zip(inputs, outputs):
        # A text-generation pipeline called with a list of prompts yields a
        # list of candidates per prompt; take the first candidate, but still
        # accept a bare dict.
        candidate = output[0] if isinstance(output, (list, tuple)) else output
        text = candidate["generated_text"]
        # Drop only the echoed prompt at the start. str.replace would also
        # delete any later repetition of the prompt inside the answer itself.
        if text.startswith(prompt):
            text = text[len(prompt):]
        answers.append(text.strip())
    return answers

# One prompt per test record, plus the gold answers in the same order
prompts = list(map(format_prompt, test_dataset))
references = test_dataset["answer"]
# Each gold answer wrapped in its own single-item list
bleu_refs = [[gold_answer] for gold_answer in references]

# Run both models over the same prompts, teacher first
print("Generating with teacher...")
teacher_preds = generate_answers(model_pipeline=teacher_pipeline, inputs=prompts)

print("Generating with student...")
student_preds = generate_answers(model_pipeline=student_pipeline, inputs=prompts)

# Evaluation function
def _token_f1(prediction, reference):
    """Return the token-overlap F1 between one prediction and one reference.

    Both texts are lower-cased and split on whitespace. Two empty texts score
    1.0; exactly one empty text scores 0.0.
    """
    from collections import Counter

    pred_tokens = str(prediction).lower().split()
    ref_tokens = str(reference).lower().split()
    if not pred_tokens or not ref_tokens:
        return float(pred_tokens == ref_tokens)

    # Multiset intersection counts each shared token at most as often as it
    # appears on both sides.
    overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)


def evaluate(preds, refs):
    """Score generated answers against gold answers.

    Args:
        preds: List of predicted answer strings.
        refs: List of gold answer strings, aligned with ``preds``.

    Returns:
        Dict with keys ``"f1"`` (mean token-overlap F1, 0.0 when ``preds`` is
        empty), ``"bleu"`` and ``"exact_match"``.

    Raises:
        ValueError: If ``preds`` and ``refs`` differ in length.
    """
    if len(preds) != len(refs):
        raise ValueError(
            f"Got {len(preds)} predictions but {len(refs)} references"
        )

    # The "f1" metric loaded via `evaluate` is a classification metric over
    # integer labels, so free-text answers are scored with token overlap.
    pair_scores = [_token_f1(pred, ref) for pred, ref in zip(preds, refs)]
    f1 = sum(pair_scores) / len(pair_scores) if pair_scores else 0.0

    # Build the BLEU references from `refs` itself rather than a module-level
    # list, so the score always matches the references passed in.
    bleu = bleu_metric.compute(
        predictions=preds, references=[[ref] for ref in refs]
    )["bleu"]
    em = exact_match_metric.compute(predictions=preds, references=refs)["exact_match"]
    return {"f1": f1, "bleu": bleu, "exact_match": em}

# Score each model's predictions against the same gold answers
teacher_scores = evaluate(preds=teacher_preds, refs=references)
student_scores = evaluate(preds=student_preds, refs=references)

# Report: header, then a label line followed by the score dict for each model
print("\n--- Evaluation Results ---")
print("Teacher Model:", teacher_scores, sep="\n")

print("Student Model:", student_scores, sep="\n")

