# Distillation Notebook
This notebook demonstrates knowledge distillation in PyTorch.

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertModel, DistilBertTokenizer

# Create dummy dataset
X = torch.randn(100, 20)
y = torch.randint(0, 2, (100,))
dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=16)

model_name = "distilbert-base-uncased"
model = DistilBertModel.from_pretrained(model_name)
model.eval()

# Define teacher and student models
teacher = nn.Sequential(nn.Linear(20, 50), nn.ReLU(), nn.Linear(50, 2))
student = nn.Sequential(nn.Linear(20, 10), nn.ReLU(), nn.Linear(10, 2))

# Assume teacher is pre-trained
with torch.no_grad():
    pass  # placeholder for teacher training

# Distillation training loop
optimizer = torch.optim.Adam(student.parameters(), lr=1e-3)
temperature = 5.0
alpha = 0.5

for epoch in range(50):
    student.train()
    total_loss = 0
    for data, labels in loader:
        with torch.no_grad():
            teacher_logits = model(data)
        student_logits = student(data)

        # Soft targets
        T = temperature
        teacher_probs = F.softmax(teacher_logits / T, dim=1)
        student_log_probs = F.log_softmax(student_logits / T, dim=1)
        distill_loss = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean') * (T*T)
        task_loss = F.cross_entropy(student_logits, labels)

        loss = alpha * distill_loss + (1 - alpha) * task_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)