# SPML HW3: Breaking Defenses & Black-Box Attacks

In [1]:
name = 'َAlireza Farajtabrizi'
std_id = '403206554'

In [1]:
import torch
from torch import nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader

from torchvision import transforms
from torchvision.models import resnet18, mobilenet_v2
from torchvision.models import ResNet18_Weights, MobileNet_V2_Weights
from torchvision.datasets.cifar import CIFAR10

from tqdm import trange, tqdm

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# CIFAR10 Dataset (5 points)

In [2]:
norm_mean = (0.4914, 0.4822, 0.4465)
norm_std = (0.2023, 0.1994, 0.2010)
batch_size = 128

mu = torch.tensor(norm_mean).view(3,1,1).to(device)
std = torch.tensor(norm_std).view(3,1,1).to(device)

# TODO: Set the upper limit and lower limit possible for images
upper_limit = (1 - mu) / std
lower_limit = (0 - mu) / std

# Define transforms for training and testing
transform_train = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

trainset = CIFAR10(root='./data', train=True, download=True, transform=transform_train)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)

testset = CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)


classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')


Files already downloaded and verified
Files already downloaded and verified


# Defensive Distillation (25 points)

[Defensive distillation](https://arxiv.org/abs/1511.04508) proceeds in four steps:

1.   **Train the teacher network**, by setting the temperature of the softmax to T during the
training phase.
2.   **Compute soft labels** by apply the teacher network to each instance in the training set, again evaluating the softmax at temperature T.
3.  **Train the distilled network** (a network with the same shape as the teacher network) on the soft labels, using softmax at temperature T.
4.  Finally, when running the distilled network at test time to classify new inputs, use temperature 1.



## Train the teacher

In [4]:
def train_step(model, dataloader, loss_fn, optimizer, temperature):
    model.train()

    running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()

        outputs = model(inputs) / temperature
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        predictions = torch.argmax(outputs, dim=1)
        correct_preds += (predictions == labels).sum().item()
        total_preds += labels.size(0)

    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = correct_preds / total_preds * 100

    return epoch_loss, epoch_accuracy


def train_teacher(model, n_epochs, loader=trainloader, temp=100):
    loss_fn = CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=0.01)

    for epoch in range(n_epochs):
        epoch_loss, epoch_accuracy = train_step(model, loader, loss_fn, optimizer, temp)
        print(f"Epoch [{epoch + 1}/{n_epochs}] - Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

You can use a pre-trained resnet to speed up the training process.

In [5]:
teacher = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
teacher.fc = nn.Linear(teacher.fc.in_features, 10)
teacher = teacher.to(device)

train_teacher(teacher, 15)
torch.save(teacher.state_dict(), 'Teacher.pth')

Epoch [1/15] - Loss: 1.4126, Accuracy: 47.42%
Epoch [2/15] - Loss: 0.9707, Accuracy: 65.88%
Epoch [3/15] - Loss: 0.7864, Accuracy: 72.52%
Epoch [4/15] - Loss: 0.6633, Accuracy: 77.08%
Epoch [5/15] - Loss: 0.5610, Accuracy: 80.64%
Epoch [6/15] - Loss: 0.4732, Accuracy: 83.71%
Epoch [7/15] - Loss: 0.3854, Accuracy: 86.77%
Epoch [8/15] - Loss: 0.3177, Accuracy: 89.14%
Epoch [9/15] - Loss: 0.2593, Accuracy: 90.99%
Epoch [10/15] - Loss: 0.2082, Accuracy: 92.86%
Epoch [11/15] - Loss: 0.1709, Accuracy: 94.10%
Epoch [12/15] - Loss: 0.1502, Accuracy: 94.73%
Epoch [13/15] - Loss: 0.1265, Accuracy: 95.66%
Epoch [14/15] - Loss: 0.1102, Accuracy: 96.20%
Epoch [15/15] - Loss: 0.0987, Accuracy: 96.56%


## Test the teacher

In [3]:
def test_clean(model, dataloader=testloader):
    model.eval()

    correct_preds = 0
    total_preds = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            predictions = torch.argmax(outputs, dim=1)

            correct_preds += (predictions == labels).sum().item()
            total_preds += labels.size(0)

    accuracy = correct_preds / total_preds * 100
    return accuracy


Print the clean accuracy of the teacher.

In [7]:
teacher = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
teacher.fc = nn.Linear(teacher.fc.in_features, 10)

teacher.load_state_dict(torch.load('Teacher.pth', weights_only=True))
teacher = teacher.to(device)

In [8]:
print(f'Teacher Accuracy {test_clean(teacher):.2f}%')

Teacher Accuracy 75.99%


## Train the student

In [9]:
def distill(model, teacher, dataloader, optimizer, T):
    model.train()
    teacher.eval()

    running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()

        with torch.no_grad():
            teacher_outputs = teacher(inputs) / T
            teacher_probs = F.softmax(teacher_outputs, dim=1)


        student_outputs = model(inputs) / T
        student_log_probs = F.log_softmax(student_outputs, dim=1)

        distillation_loss = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean')
        classification_loss = F.cross_entropy(student_outputs, labels)
        total_loss = 0.9 * distillation_loss + 0.1 * classification_loss

        total_loss.backward()
        optimizer.step()

        running_loss += total_loss.item()
        predictions = torch.argmax(student_outputs, dim=1)
        correct_preds += (predictions == labels).sum().item()
        total_preds += labels.size(0)

    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = correct_preds / total_preds * 100

    return epoch_accuracy, epoch_loss

def train_student(model, teacher, n_epochs, loader=trainloader, temp=100):
    model.to(device)
    optimizer = Adam(model.parameters(), lr=0.01)

    for epoch in range(n_epochs):

        epoch_accuracy, epoch_loss = distill(model, teacher, loader, optimizer, temp)
        print(f"Epoch [{epoch + 1}/{n_epochs}] - Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

This time use a `resnet18` without the pretrained weights.

In [10]:
student = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
student.fc = nn.Linear(student.fc.in_features, 10)
student = student.to(device)

train_student(student, teacher, 15)
torch.save(student.state_dict(), 'Student.pth')

Epoch [1/15] - Loss: 1.4034, Accuracy: 43.46%
Epoch [2/15] - Loss: 0.9252, Accuracy: 64.25%
Epoch [3/15] - Loss: 0.7368, Accuracy: 71.35%
Epoch [4/15] - Loss: 0.6119, Accuracy: 75.92%
Epoch [5/15] - Loss: 0.5219, Accuracy: 79.37%
Epoch [6/15] - Loss: 0.4345, Accuracy: 82.57%
Epoch [7/15] - Loss: 0.3652, Accuracy: 85.13%
Epoch [8/15] - Loss: 0.2998, Accuracy: 87.57%
Epoch [9/15] - Loss: 0.2496, Accuracy: 89.41%
Epoch [10/15] - Loss: 0.2028, Accuracy: 91.32%
Epoch [11/15] - Loss: 0.1734, Accuracy: 92.34%
Epoch [12/15] - Loss: 0.1441, Accuracy: 93.61%
Epoch [13/15] - Loss: 0.1309, Accuracy: 94.19%
Epoch [14/15] - Loss: 0.1206, Accuracy: 94.56%
Epoch [15/15] - Loss: 0.1085, Accuracy: 94.91%


## Test the student

In [None]:
student = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
student.fc = nn.Linear(student.fc.in_features, 10)

student.load_state_dict(torch.load('Student.pth', weights_only=True))
student = student.to(device)

In [12]:
print(f'Student Accuracy {test_clean(student):.2f}%')

Student Accuracy 75.36%


# Attack (15 points)

Implement the FGSM attack and the `test_attack` funcion to report the robust accuracy for different values of epsilon.

In [4]:
def attack_fgsm(model, x, y, epsilon, T=100):
    x_adv = x.clone().detach().requires_grad_(True).to(device)
    model.zero_grad()
    outputs = model(x_adv) / T

    loss = F.cross_entropy(outputs, y)
    loss.backward()

    perturbation = epsilon * x_adv.grad.sign()
    x_adv = x_adv + perturbation
    x_adv = torch.clamp(x_adv, lower_limit, upper_limit)

    return x_adv


def attack_pgd(model, x, y, epsilon, alpha=0.2, num_iters=10, T=100):
    x_adv = x.clone().detach().requires_grad_(True).to(device)
    x_orig = x.clone().detach().to(device)

    for _ in range(num_iters):
        model.zero_grad()
        outputs = model(x_adv) / T

        loss = F.cross_entropy(outputs, y)
        loss.backward()

        x_adv = x_adv + alpha * x_adv.grad.sign()
        perturbation = torch.clamp(x_adv - x_orig, -epsilon, epsilon)
        x_adv = torch.clamp(x_orig + perturbation, lower_limit, upper_limit).detach().requires_grad_(True).to(device)

    return x_adv


def test_attack(model, epsilon, attack=attack_fgsm, loader=testloader, T=100):
    model.eval()
    correct_preds = 0
    total_preds = 0

    for x, y in loader:
        x, y = x.to(device), y.to(device)
        x_adv = attack(model, x, y, epsilon, T=T)

        with torch.no_grad():
            outputs = model(x_adv)
            predictions = torch.argmax(outputs, dim=1)
            correct_preds += (predictions == y).sum().item()
            total_preds += y.size(0)

    robust_accuracy = correct_preds / total_preds * 100

    return robust_accuracy

Report the robust accuracy of the teacher for `ϵ = [1, 2, 4, 8, 16]`.

In [14]:
epsilons = [1, 2, 4, 8, 16]
scale = 1/std.mean().item()

for eps in epsilons:
    acc = test_attack(model=teacher, epsilon=eps*scale/255, attack=attack_fgsm, T=100)
    print(f'FGSM with ϵ={eps}/255 has Accuracy: {acc:.2f}%')
    acc = test_attack(model=teacher, epsilon=eps*scale/255, attack=attack_pgd, T=100)
    print(f'PGD  with ϵ={eps}/255 has Accuracy: {acc:.2f}%')

FGSM with ϵ=1/255 has Accuracy: 51.15%
PGD  with ϵ=1/255 has Accuracy: 49.70%
FGSM with ϵ=2/255 has Accuracy: 32.96%
PGD  with ϵ=2/255 has Accuracy: 30.80%
FGSM with ϵ=4/255 has Accuracy: 14.21%
PGD  with ϵ=4/255 has Accuracy: 12.54%
FGSM with ϵ=8/255 has Accuracy: 3.65%
PGD  with ϵ=8/255 has Accuracy: 0.31%
FGSM with ϵ=16/255 has Accuracy: 1.27%
PGD  with ϵ=16/255 has Accuracy: 0.00%


Do the same for the student:

In [15]:
for eps in epsilons:
    acc = test_attack(model=student, epsilon=eps*scale/255, attack=attack_fgsm, T=1)
    print(f'FGSM with ϵ={eps}/255 has Accuracy: {acc:.2f}%')
    acc = test_attack(model=student, epsilon=eps*scale/255, attack=attack_pgd, T=1)
    print(f'PGD  with ϵ={eps}/255 has Accuracy: {acc:.2f}%')

FGSM with ϵ=1/255 has Accuracy: 69.25%
PGD  with ϵ=1/255 has Accuracy: 69.17%
FGSM with ϵ=2/255 has Accuracy: 69.22%
PGD  with ϵ=2/255 has Accuracy: 69.18%
FGSM with ϵ=4/255 has Accuracy: 69.21%
PGD  with ϵ=4/255 has Accuracy: 69.20%
FGSM with ϵ=8/255 has Accuracy: 69.19%
PGD  with ϵ=8/255 has Accuracy: 69.19%
FGSM with ϵ=16/255 has Accuracy: 69.22%
PGD  with ϵ=16/255 has Accuracy: 69.19%


What do you see?

`your response:` Both FGSM and PGD attacks fails, seems like we have a defense! Increasing e won't change that. It is a sign of obfuscated gradients.

# Transferring Adversarial Examples (15 points)

Train yet another model to be used as the surrogate. (set temperature to 1)

In [16]:
surrogate = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
surrogate.fc = nn.Linear(surrogate.fc.in_features, 10)
surrogate = surrogate.to(device)

train_teacher(surrogate, 15, temp=1)
torch.save(surrogate.state_dict(), 'Surrogate.pth')

Epoch [1/15] - Loss: 2.1782, Accuracy: 25.50%
Epoch [2/15] - Loss: 1.5049, Accuracy: 45.60%
Epoch [3/15] - Loss: 1.2974, Accuracy: 53.38%
Epoch [4/15] - Loss: 1.0910, Accuracy: 61.16%
Epoch [5/15] - Loss: 0.9514, Accuracy: 66.40%
Epoch [6/15] - Loss: 0.8288, Accuracy: 70.95%
Epoch [7/15] - Loss: 0.7388, Accuracy: 74.02%
Epoch [8/15] - Loss: 0.6808, Accuracy: 76.06%
Epoch [9/15] - Loss: 0.5963, Accuracy: 79.04%
Epoch [10/15] - Loss: 0.5322, Accuracy: 81.17%
Epoch [11/15] - Loss: 0.4748, Accuracy: 83.38%
Epoch [12/15] - Loss: 0.4290, Accuracy: 84.86%
Epoch [13/15] - Loss: 0.3754, Accuracy: 86.73%
Epoch [14/15] - Loss: 0.3300, Accuracy: 88.50%
Epoch [15/15] - Loss: 0.2920, Accuracy: 89.65%


Print the surrogate accuracy.

In [17]:
surrogate = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
surrogate.fc = nn.Linear(surrogate.fc.in_features, 10)

surrogate.load_state_dict(torch.load('Surrogate.pth', weights_only=True))
surrogate = surrogate.to(device)

In [18]:
print(f'Surrogate Accuracy {test_clean(surrogate):.2f}%')

Surrogate Accuracy 72.76%


Report the accuracy of the surrogate for `ϵ = [1, 2, 4, 8, 16]`.

In [19]:
for eps in epsilons:
    acc = test_attack(model=surrogate, epsilon=eps*scale/255, attack=attack_fgsm, T=1)
    print(f'FGSM with ϵ={eps}/255 has Accuracy: {acc:.2f}%')

FGSM with ϵ=1/255 has Accuracy: 45.56%
FGSM with ϵ=2/255 has Accuracy: 27.29%
FGSM with ϵ=4/255 has Accuracy: 10.49%
FGSM with ϵ=8/255 has Accuracy: 2.65%
FGSM with ϵ=16/255 has Accuracy: 1.57%


Implement the following functions to transfer attacks from a surrogate model to an oracle.

In [20]:
def transfer_attack(oracle, model, eps, loader=testloader):
    oracle.eval()
    correct_preds = 0
    total_preds = 0

    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)
        x_adv = attack_fgsm(model, inputs, labels, eps)

        with torch.no_grad():
            outputs = oracle(x_adv)
            predictions = torch.argmax(outputs, dim=1)
            correct_preds += (predictions == labels).sum().item()
            total_preds += labels.size(0)

        robust_accuracy = 100.0 * correct_preds / total_preds

    return robust_accuracy


Transfer attacks for `ϵ = [1, 2, 4, 8, 16]` from your model to the student.

In [21]:
for eps in epsilons:
    acc = transfer_attack(student, surrogate, eps*scale/255)
    print(f'FGSM with ϵ={eps}/255 has Accuracy: {acc:.2f}%')

FGSM with ϵ=1/255 has Accuracy: 71.71%
FGSM with ϵ=2/255 has Accuracy: 68.10%
FGSM with ϵ=4/255 has Accuracy: 60.58%
FGSM with ϵ=8/255 has Accuracy: 44.87%
FGSM with ϵ=16/255 has Accuracy: 24.42%


- What can be inferred from these results?
- How are the accuracies of the student and the surrogate under attack related?
- Does Defensive Distillation obfuscate the gradients? Why?

`your response:`
1. That we dont have a real defense since transfer attacks are successful and break the defense.
Both of the accuracies decrease as epsilon increases but the transfer one falls slower.
2. Yes it is as of the observations that we know from lectures are happening: 1-transfer attacks are successful while gradiant ones are not.
2-increasing epsilon won't help.
3. Using temperature = 100 forces the logits to get higher values during training phase and when we set temperature = 1 in infrence time it makes probability vectors almost one-hot which makes the gradiant 0.

# ZOO Based Black-Box Attacks (25 points)

Based on [Black-box Adversarial Attacks with Limited Queries and Information](https://arxiv.org/abs/1804.08598) you must first calculate the estimate of the graidents, and next attack the model based on your estimates.

In [22]:
# Using CELoss difference for estimation
def nes_gradient_estimate(model, x, y, epsilon, num_samples, sigma):
    grad_estimate = torch.zeros_like(x)

    for _ in range(num_samples):
        with torch.no_grad():
            delta_i = torch.randn_like(x).to(device)
            x_plus = x + sigma * delta_i
            x_minus = x - sigma * delta_i

            logits_plus = model(x_plus)
            logits_minus = model(x_minus)
            loss_plus = F.cross_entropy(logits_plus, y, reduction='none')
            loss_minus = F.cross_entropy(logits_minus, y, reduction='none')
            grad_estimate += (loss_plus - loss_minus).view(-1, 1, 1, 1) * delta_i

    grad_estimate /= (2 * sigma * num_samples)
    return -grad_estimate

# Using Logits difference for estimation
def nes_gradient_estimate(model, x, y, epsilon, num_samples, sigma):
    grad_estimate = torch.zeros_like(x)

    for _ in range(num_samples):
        with torch.no_grad():
            delta_i = torch.randn_like(x).to(device)
            x_plus = x + sigma * delta_i
            x_minus = x - sigma * delta_i

            logits_plus = model(x_plus).gather(1, y.view(-1, 1))
            logits_minus = model(x_minus).gather(1, y.view(-1, 1))
            grad_estimate += (logits_plus - logits_minus).view(-1, 1, 1, 1) * delta_i

    grad_estimate /= (2 * sigma * num_samples)
    return grad_estimate

# Using Probabilities difference for estimation
def nes_gradient_estimate(model, x, y, epsilon, num_samples, sigma):
    grad_estimate = torch.zeros_like(x)

    for _ in range(num_samples):
        with torch.no_grad():
            delta_i = torch.randn_like(x).to(device)
            x_plus = x + sigma * delta_i
            x_minus = x - sigma * delta_i

            prob_plus = F.softmax(model(x_plus), dim=1).gather(1, y.view(-1, 1))
            prob_minus = F.softmax(model(x_minus), dim=1).gather(1, y.view(-1, 1))
            grad_estimate += (prob_plus - prob_minus).view(-1, 1, 1, 1) * delta_i

    grad_estimate /= (2 * sigma * num_samples)
    return grad_estimate


I used 3 different things to estimate gradiant and all of them end up almost the same result. The bottom result is made with probabilities.

In [23]:
def partial_information_attack(model, x, y, epsilon, num_samples, sigma, num_steps, alpha):
    x_adv = x.clone().detach()

    for step in range(num_steps):
        grad_estimate = nes_gradient_estimate(model, x_adv, y, epsilon, num_samples, sigma)

        x_adv = x_adv - alpha * grad_estimate.sign()
        x_adv = torch.clamp(x_adv, x - epsilon, x + epsilon)

        x_adv = torch.clamp(x_adv, lower_limit, upper_limit)
        x_adv = x_adv

    return x_adv

Now run this attack on your models and report the results. (You **DON'T** need to run the attack for the entire test dataset as this will take a lot of time!)

In [24]:
def test_zoo_attack(model, epsilon, num_samples, sigma, num_steps, alpha, loader=testloader):
    model = model.to(device)
    model.eval()

    correct_preds = 0
    total_preds = 0

    for x, y in loader:
        with torch.no_grad():
            x, y = x.to(device), y.to(device)
            x_adv = partial_information_attack(model, x, y, epsilon, num_samples, sigma, num_steps, alpha)
            output = model(x_adv)

            predictions = output.argmax(dim=1)
            correct_preds += (predictions == y).sum().item()
            total_preds += y.size(0)

            if total_preds > 1000:
                break

    zoo_accuracy = 100 * correct_preds / total_preds
    return zoo_accuracy

In [25]:
epsilons = [1, 2, 4, 8, 16]

for eps in epsilons:
    acc = test_zoo_attack(model=surrogate, epsilon=eps*scale/255, num_samples=100, sigma=0.001, num_steps=10, alpha=0.1, loader=testloader)
    print(f'ZOO with ϵ={eps}/255 has Accuracy: {acc:.2f}%')

ZOO with ϵ=1/255 has Accuracy: 67.87%
ZOO with ϵ=2/255 has Accuracy: 62.30%
ZOO with ϵ=4/255 has Accuracy: 47.56%
ZOO with ϵ=8/255 has Accuracy: 15.53%
ZOO with ϵ=16/255 has Accuracy: 2.34%


# Adversarially Robust Distillation (15 points)

In this section we are going to test another type of distillation to see if this method is robust. This technique is [Adversarially Robust Distillation](https://arxiv.org/abs/1905.09747).



1.   We will try to distill a robsut teacher from [Robust Bench](https://robustbench.github.io/) onto a smaller architecture.
2.   We minimize the KL-Divergence between the logits of the student and teacher to ensure fidelity. (You can also incorporate the classification loss as mentioned in the paper but you can choose to ignore it as well)
3.   At each step of the distillation you will attack the student (you can use either FGSM or PGD) and find an adversarial example $X + \delta$ for data point $X$. Next you will minimize $t^2 \times \text{KL}(S(X+\delta), T(X))$ where $S$ and $T$ are the student and teacher networks respectively.



In [None]:
! pip install git+https://github.com/RobustBench/robustbench.git

In [None]:
from robustbench.utils import load_model

teacher = load_model(model_name='Gowal2021Improving_R18_ddpm_100m', dataset='cifar10', threat_model='Linf')
teacher = teacher.to(device)

In [28]:
def ard(student, teacher, dataloader, optimizer, eps, attack, T, alpha):
    student.train()
    teacher.eval()

    running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()

        with torch.no_grad():
            teacher_outputs = teacher(x) / T
            teacher_probs = F.softmax(teacher_outputs, dim=1)

        student_outputs = student(x) / T
        classification_loss = F.cross_entropy(student_outputs, y)

        x_adv = attack(student, x, y, eps, T=T)
        student_perturbed_outputs = student(x_adv) / T
        student_perturbed_log_probs = F.log_softmax(student_perturbed_outputs, dim=1)

        distillation_loss = F.kl_div(student_perturbed_log_probs, teacher_probs, reduction='batchmean') * T * T
        total_loss = alpha * distillation_loss + (1-alpha) * classification_loss

        total_loss.backward()
        optimizer.step()

        running_loss += total_loss.item()
        predictions = torch.argmax(student_outputs, dim=1)
        correct_preds += (predictions == y).sum().item()
        total_preds += y.size(0)

    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = correct_preds / total_preds * 100

    return epoch_accuracy, epoch_loss

def adv_train_student(model, teacher, n_epochs, eps=8/255, loader=trainloader, lr=0.01, T=100, alpha=1.):
    optimizer = Adam(model.parameters(), lr=lr)

    for epoch in range(n_epochs):
        epoch_accuracy, epoch_loss = ard(student, teacher, loader, optimizer, eps, attack_pgd, T, alpha)
        print(f"Epoch [{epoch + 1}/{n_epochs}] - Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

In [32]:
student = mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1)
student.classifier = nn.Linear(student.classifier[1].in_features, 10)
student = student.to(device)

adv_train_student(model=student, teacher=teacher, n_epochs=15, eps=scale*8/255, loader=trainloader, lr=0.001, T=10, alpha=0.8)
torch.save(student.state_dict(), 'Student_Mobilenet_v2.pth')

Epoch [1/15] - Loss: 2.8294, Accuracy: 17.19%
Epoch [2/15] - Loss: 2.3497, Accuracy: 21.36%
Epoch [3/15] - Loss: 2.1978, Accuracy: 23.50%
Epoch [4/15] - Loss: 2.1297, Accuracy: 25.51%
Epoch [5/15] - Loss: 2.0597, Accuracy: 26.75%
Epoch [6/15] - Loss: 2.0010, Accuracy: 27.47%
Epoch [7/15] - Loss: 1.9563, Accuracy: 29.34%
Epoch [8/15] - Loss: 1.9193, Accuracy: 30.78%
Epoch [9/15] - Loss: 1.9044, Accuracy: 31.92%
Epoch [10/15] - Loss: 1.8660, Accuracy: 31.20%
Epoch [11/15] - Loss: 1.8262, Accuracy: 32.59%
Epoch [12/15] - Loss: 1.8063, Accuracy: 34.40%
Epoch [13/15] - Loss: 1.7858, Accuracy: 34.44%
Epoch [14/15] - Loss: 1.7387, Accuracy: 35.12%
Epoch [15/15] - Loss: 1.7224, Accuracy: 34.12%


Now report the accuracy of the student on the test dataset.

In [5]:
student = mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1)
student.classifier = nn.Linear(student.classifier[1].in_features, 10)
student = student.to(device)

student.load_state_dict(torch.load('Student_Mobilenet_v2.pth', weights_only=True))
student = student.to(device)

In [10]:
# TODO: Clean accurcy
print(f'Student Accuracy {test_clean(student):.2f}%')

# TODO: FGSM with eps=8/255
acc = test_attack(model=student, epsilon=scale*8/255, attack=attack_fgsm, T=10)
print(f'FGSM with ϵ=8/255 has Accuracy: {acc:.2f}%')

# TODO: PGD with eps=8/255
acc = test_attack(model=student, epsilon=scale*8/255, attack=attack_pgd, T=10)
print(f'PGD  with ϵ=8/255 has Accuracy: {acc:.2f}%')

Student Accuracy 30.87%
FGSM with ϵ=8/255 has Accuracy: 17.19%
PGD  with ϵ=8/255 has Accuracy: 16.07%
