In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F

from torch.utils.data import DataLoader

In [2]:
# 1) Preprocessing: convert each image to a tensor, then normalise it
#    with mean 0.5 and std 0.5 (single channel)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# 2) Training split (downloaded to ./data if missing) and its shuffled loader
trainset = torchvision.datasets.MNIST(
    root='./data', train=True, download=True, transform=transform
)
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)

# 3) Test split and its loader (no shuffling)
testset = torchvision.datasets.MNIST(
    root='./data', train=False, download=True, transform=transform
)
testloader = DataLoader(testset, batch_size=64, shuffle=False)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 58886607.15it/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 30482056.83it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz



100%|██████████| 1648877/1648877 [00:00<00:00, 24165468.96it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 7464940.74it/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



In [3]:
class TeacherNet(nn.Module):
    """Convolutional teacher classifier: conv -> ReLU -> max-pool -> two linear layers.

    The first linear layer expects 32 * 4 * 4 flattened features, which is
    what a 1x28x28 input produces after the 5x5 convolution (24x24) and the
    5x5 / stride-5 max-pool (4x4). The network returns 10 raw logits per
    sample; no softmax is applied.
    """

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=5, stride=5)
        self.fc1 = nn.Linear(in_features=32 * 4 * 4, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Map a batch of images to a (batch, 10) tensor of logits."""
        pooled = self.pool(F.relu(self.conv(x)))
        # Collapse channel and spatial dimensions into one feature vector per sample
        flat = pooled.view(pooled.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

In [4]:
# Teacher network plus its training components:
# cross-entropy as the loss, Adam (learning rate 1e-3) as the optimizer.
teacher_model = TeacherNet()
teacher_criterion = nn.CrossEntropyLoss()
teacher_optimizer = optim.Adam(params=teacher_model.parameters(), lr=1e-3)

In [6]:
def evaluate(model, loader=None):
    """Return the classification accuracy of ``model`` as a fraction in [0, 1].

    Args:
        model: An ``nn.Module`` that maps a batch of inputs to per-class
            scores (one row per sample, one column per class).
        loader: Iterable yielding ``(inputs, labels)`` batches. When omitted,
            the notebook-level ``testloader`` is used, so existing
            ``evaluate(model)`` calls behave as before.

    Returns:
        float: ``correct / total`` over all samples in ``loader``, or ``0.0``
        when the loader yields no samples (instead of dividing by zero).

    Note:
        The model is switched to evaluation mode and left in that mode;
        call ``model.train()`` again before resuming training.
    """
    if loader is None:
        loader = testloader  # fall back to the global test DataLoader

    model.eval()  # Set model to evaluation mode
    total, correct = 0, 0

    with torch.no_grad():  # No need to track gradients for evaluation
        for inputs, labels in loader:
            # Predicted class = index of the highest score in each row
            predicted = model(inputs).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        return 0.0
    return correct / total


In [11]:
import torch

# Supervised training of the teacher: five passes over the training data,
# printing the mean batch loss and the test accuracy after each pass.
for epoch in range(5):
    teacher_model.train()  # training mode (the evaluation step below switches it off)
    epoch_loss = 0.0

    for inputs, labels in trainloader:
        # Start every step from zeroed gradients
        teacher_optimizer.zero_grad()

        # Forward pass and loss for this batch
        loss = teacher_criterion(teacher_model(inputs), labels)

        # Backpropagate, then update the weights
        loss.backward()
        teacher_optimizer.step()

        epoch_loss += loss.item()

    # Measure test accuracy with the model in evaluation mode
    teacher_model.eval()
    teacher_accuracy = evaluate(teacher_model)

    # Report the per-batch average loss and the accuracy as a percentage
    mean_loss = epoch_loss / len(trainloader)
    accuracy_pct = teacher_accuracy * 100
    print(f"""Epoch {epoch + 1},
              Loss: {mean_loss},
              Acc: {accuracy_pct:.2f}%""")



Epoch 1,
              Loss: 0.21638135280289345,
              Acc: 96.50%
Epoch 2,
              Loss: 0.0744556292652218,
              Acc: 97.58%
Epoch 3,
              Loss: 0.0572398169008868,
              Acc: 98.37%
Epoch 4,
              Loss: 0.047253273061485586,
              Acc: 97.73%
Epoch 5,
              Loss: 0.03912784883434495,
              Acc: 98.59%


**Defining Student Model**

In [12]:
class StudentNet(nn.Module):
    """Small fully connected student: flatten -> 128 hidden units (ReLU) -> 10 logits.

    Each sample must flatten to 28 * 28 = 784 values. The output is raw
    logits; no softmax is applied.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=28 * 28, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Map a batch of images to a (batch, 10) tensor of logits."""
        # One flat feature vector per sample
        flat = x.view(x.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

In [13]:
# Knowledge distillation loss (KL divergence)
def KL_loss(student_logits, teacher_logits, temperature=1.0):
    """KL divergence KL(teacher || student) between the two class distributions.

    Args:
        student_logits: Raw student scores, classes along ``dim=1``.
        teacher_logits: Raw teacher scores with the same shape.
        temperature: Softening factor applied to both sets of logits before
            the softmax. The default ``1.0`` reproduces the plain
            (untempered) KL divergence. For other values the loss is
            multiplied by ``temperature ** 2``, the usual distillation
            scaling that keeps gradient magnitudes comparable across
            temperatures.

    Returns:
        Scalar tensor: the divergence summed over classes and averaged over
        the batch (``reduction='batchmean'``).

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    # teacher distribution as probabilities (the target of the divergence)
    p_teacher = F.softmax(teacher_logits / temperature, dim=1)

    # student distribution as LOG-probabilities: F.kl_div expects its first
    # argument in log space and its target as plain probabilities
    log_p_student = F.log_softmax(student_logits / temperature, dim=1)

    # compute KL divergence loss (PyTorch's method)
    loss = F.kl_div(log_p_student, p_teacher, reduction='batchmean')

    return loss * (temperature ** 2)

In [14]:
# Student network and its Adam optimizer (learning rate 1e-3)
student_model = StudentNet()
student_optimizer = optim.Adam(params=student_model.parameters(), lr=1e-3)

In [16]:
# Distil the teacher into the student: for five epochs the student is trained
# to match the teacher's predicted class distribution (KL divergence).
# The ground-truth labels are not used in the loss.

# The teacher is only used for inference here, so make its mode explicit
# rather than relying on an earlier cell having left it in eval mode.
teacher_model.eval()

for epoch in range(5):
    # set to train mode
    student_model.train()

    epoch_loss = 0.0

    # train for all batches of data
    for inputs, labels in trainloader:
        student_optimizer.zero_grad()

        # get student outputs
        student_logits = student_model(inputs)

        # get teacher outputs without building an autograd graph:
        # no gradient ever flows into the teacher, so tracking its
        # forward pass would only cost time and memory
        with torch.no_grad():
            teacher_logits = teacher_model(inputs)

        # compute KL Divergence loss
        loss = KL_loss(student_logits, teacher_logits)

        # run backpropagation step
        loss.backward()
        student_optimizer.step()

        epoch_loss += loss.item()

    # Evaluate on test data with the shared helper (it switches the student
    # to evaluation mode; train() above re-enables training next epoch)
    accuracy = 100 * evaluate(student_model)
    print(f'Epoch {epoch+1}, Loss: {epoch_loss / len(trainloader)}, Accuracy on test data: {accuracy:.2f}%')


Epoch 1, Loss: 0.06747170624289431, Accuracy on test data: 97.12%
Epoch 2, Loss: 0.059983497829055356, Accuracy on test data: 97.34%
Epoch 3, Loss: 0.05388834924975247, Accuracy on test data: 96.38%
Epoch 4, Loss: 0.050274628327726556, Accuracy on test data: 97.24%
Epoch 5, Loss: 0.046857432319796576, Accuracy on test data: 97.31%
