In [1]:
# Mount Google Drive at /content/drive; the model checkpoints loaded later in
# this notebook are read from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.transforms as transforms
from torchvision import datasets
from torch.utils.data import DataLoader
import torch.nn.functional as F
import numpy as np

# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
# `device` is read as a notebook-level global by the cells below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Function to calculate test accuracy
def evaluate_model(model, test_loader, device=None):
    """Return the top-1 accuracy (in percent) of ``model`` on ``test_loader``.

    Args:
        model: ``nn.Module`` mapping a batch of images to per-class scores laid
            out as ``(batch, num_classes)``.
        test_loader: Iterable yielding ``(images, labels)`` tensor batches.
        device: Device the batches are moved to before the forward pass. When
            omitted, the device of the model's first parameter is used (CPU for
            a parameter-free model), so the function does not depend on a
            notebook-level global.

    Returns:
        float: Percentage of correctly classified samples, or ``0.0`` when the
        loader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # The model is switched to eval mode and left there.
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    # An empty loader would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return 100. * correct / total

Using device: cuda


In [3]:
import os
import time
import json
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms, models
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from tqdm import tqdm


# Create test dataset
# NOTE(review): `normalize` (ImageNet mean/std) is defined here but never used
# in the visible cells; `test_transform` below normalises with 0.5/0.5 instead.
# Confirm which statistics the checkpoints were trained with.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

test_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),  # Replicate the single MNIST channel to 3 channels
    transforms.ToTensor(),  # Convert the image to a tensor
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # Per-channel (x - 0.5) / 0.5
])

# MNIST test split, downloaded into ./data on first use.
test_set = datasets.MNIST(root='./data', train=False, download=True, transform=test_transform)

# Fixed order (shuffle=False) so per-batch metrics line up across models.
test_loader = DataLoader(
    test_set,
    batch_size=32,
    shuffle=False,
    num_workers=2
)

# Sanity check: print the shapes of the first batch only.
for inputs, labels in test_loader:
    # Images keep their native 28x28 resolution (no resize in the transform).
    print(inputs.shape, labels.shape)
    break

100%|██████████| 9.91M/9.91M [00:00<00:00, 16.2MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 482kB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 4.47MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 9.56MB/s]


torch.Size([32, 3, 28, 28]) torch.Size([32])


In [4]:
# Teacher Model (ResNet50)
class TeacherModel(nn.Module):
    """ResNet-50 classifier with a two-layer MLP head.

    The backbone starts from the IMAGENET1K_V2 weights. Only ``layer3``,
    ``layer4`` and the replacement head are left trainable; every other
    backbone parameter is frozen.

    Args:
        num_classes: Width of the final linear layer (default 40).
    """

    def __init__(self, num_classes=40):
        super().__init__()
        self.backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)

        # Freeze the whole backbone, then re-enable gradients for the two
        # deepest residual stages.
        self.backbone.requires_grad_(False)
        for stage in (self.backbone.layer3, self.backbone.layer4):
            stage.requires_grad_(True)

        # Replace the stock classifier; the new layers are trainable by default.
        feature_dim = self.backbone.fc.in_features
        head_layers = [
            nn.Dropout(0.3),
            nn.Linear(feature_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, num_classes),
        ]
        self.backbone.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run ``x`` through the backbone and return the class logits."""
        logits = self.backbone(x)
        return logits

In [5]:
# Student Model (ResNet18)
class StudentModel(nn.Module):
    """ResNet-18 classifier with a two-layer MLP head.

    The backbone starts from the IMAGENET1K_V1 weights and is left fully
    trainable; only the final ``fc`` layer is swapped for a small MLP.

    Args:
        num_classes: Width of the final linear layer (default 40).
    """

    def __init__(self, num_classes=40):
        super().__init__()
        self.backbone = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

        feature_dim = self.backbone.fc.in_features
        head_layers = [
            nn.Dropout(0.2),
            nn.Linear(feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, num_classes),
        ]
        self.backbone.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run ``x`` through the backbone and return the class logits."""
        logits = self.backbone(x)
        return logits

In [6]:
# Number of output classes used when building the AttentionWrapper models
# below (the test data loaded above is MNIST).
num_classes = 10

In [7]:
class AttentionWrapper(nn.Module):
    """Torchvision ResNet that records the outputs of ``layer2`` and ``layer4``.

    Forward hooks store a detached copy of each hooked stage's output on every
    forward pass, so the features of the most recent batch can be retrieved
    with :meth:`get_attention_maps`.

    Args:
        base_model: ``'resnet50'`` selects ResNet-50; any other value selects
            ResNet-18.
        num_classes: Output width of the replacement ``fc`` layer.
        pretrained: Load torchvision's DEFAULT weights when true, otherwise
            build the network without pretrained weights.
    """

    # Stages whose outputs are captured, in the order they are returned.
    _HOOKED_STAGES = ('layer2', 'layer4')

    def __init__(self, base_model, num_classes, pretrained=True):
        super().__init__()
        use_resnet50 = base_model == 'resnet50'
        if pretrained:
            weights = (models.ResNet50_Weights.DEFAULT if use_resnet50
                       else models.ResNet18_Weights.DEFAULT)
        else:
            weights = None
        build_backbone = models.resnet50 if use_resnet50 else models.resnet18

        # The attribute is named "model" so parameter names match the
        # checkpoint keys.
        self.model = build_backbone(weights=weights)
        self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)

        self.attention_maps = {}
        self._register_hooks()

    def _register_hooks(self):
        """Attach a forward hook to every stage in ``_HOOKED_STAGES``."""
        def make_recorder(stage_name):
            def record(module, inputs, output):
                self.attention_maps[stage_name] = output.detach()
            return record

        for stage_name in self._HOOKED_STAGES:
            stage = getattr(self.model, stage_name)
            stage.register_forward_hook(make_recorder(stage_name))

    def forward(self, x):
        """Clear previously captured maps, then return the wrapped model's output."""
        self.attention_maps = {}
        return self.model(x)

    def get_attention_maps(self):
        """Return ``[layer2_output, layer4_output]``; an entry is ``None`` if not captured."""
        return [self.attention_maps.get(stage_name) for stage_name in self._HOOKED_STAGES]

In [8]:
# Directory on the mounted Drive that holds the trained MNIST checkpoints.
MODEL_DIR = '/content/drive/MyDrive/KnowledgeDistillation/mnist/models'


def load_attention_wrapper(base_model, checkpoint_name):
    """Build an ``AttentionWrapper`` and load its weights from ``MODEL_DIR``.

    The classifier is sized by ``AttentionWrapper`` itself (``model.fc``), so
    no extra ``fc`` layer is attached to the wrapper: such a layer would never
    be used in ``forward`` and would only add stray state-dict keys.

    Args:
        base_model: Architecture name passed to ``AttentionWrapper``
            (``'resnet50'`` or ``'resnet18'``).
        checkpoint_name: File name of the state dict inside ``MODEL_DIR``.

    Returns:
        The wrapper, moved to ``device`` and switched to eval mode.

    Raises:
        FileNotFoundError: Propagated from ``torch.load`` when the checkpoint
            is missing (e.g. Drive not mounted or wrong folder).
        RuntimeError: If the checkpoint lacks any parameter the model expects.
            Loading with ``strict=False`` alone would silently leave those
            parameters randomly initialised and make every metric below
            meaningless.
    """
    wrapper = AttentionWrapper(base_model, num_classes, pretrained=False)
    checkpoint_path = os.path.join(MODEL_DIR, checkpoint_name)
    state_dict = torch.load(checkpoint_path, map_location=device)

    # strict=False tolerates extra keys in the checkpoint, but the result is
    # inspected so key mismatches cannot go unnoticed.
    load_result = wrapper.load_state_dict(state_dict, strict=False)
    if load_result.missing_keys:
        raise RuntimeError(
            f"{checkpoint_name}: {len(load_result.missing_keys)} model parameters were not found "
            f"in the checkpoint (first few: {load_result.missing_keys[:5]}). "
            "Check that the checkpoint keys use the 'model.' prefix."
        )
    if load_result.unexpected_keys:
        print(f"Warning: {checkpoint_name}: ignored unexpected checkpoint keys "
              f"{load_result.unexpected_keys[:5]}")

    wrapper = wrapper.to(device)
    wrapper.eval()
    return wrapper


teacher = load_attention_wrapper('resnet50', 'resnet50_mnist.pth')
student_soft = load_attention_wrapper('resnet18', 'soft_target_final.pth')
student_soft_dml = load_attention_wrapper('resnet18', 'dml_resnet18_student1_final.pth')
student_at = load_attention_wrapper('resnet18', 'attention_transfer_final.pth')
student_at_dml = load_attention_wrapper('resnet18', 'dml_resnet18_student2_final.pth')



FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/KnowledgeDistillation/mnist/models/resnet50_mnist.pth'

In [None]:
# Knowledge Distillation (KD) Loss
def kl_div_loss(teacher_logits, student_logits, temperature):
    """Temperature-scaled KL divergence between teacher and student predictions.

    Both logit tensors are divided by ``temperature`` before the softmax over
    dim 1. The divergence uses ``reduction='batchmean'`` and is multiplied by
    ``temperature ** 2``.

    Args:
        teacher_logits: Teacher scores, classes along dim 1.
        student_logits: Student scores with the same shape.
        temperature: Softening factor applied to both sets of logits.

    Returns:
        Scalar tensor holding the scaled divergence.
    """
    soft_targets = F.softmax(teacher_logits / temperature, dim=1)
    log_predictions = F.log_softmax(student_logits / temperature, dim=1)
    divergence = F.kl_div(log_predictions, soft_targets, reduction='batchmean')
    return divergence * (temperature ** 2)

In [None]:
# Cross-Entropy (CE) Loss
def cross_entropy_loss(logits, labels):
    """Mean multi-class cross-entropy between ``logits`` and integer ``labels``.

    Args:
        logits: Unnormalised class scores, classes along dim 1.
        labels: Target class indices.

    Returns:
        Scalar tensor with the loss averaged over the batch.
    """
    # Functional form of nn.CrossEntropyLoss() with its default settings.
    return F.cross_entropy(logits, labels)

In [None]:
# Temperature used to soften the logits when comparing a student to the teacher.
KD_TEMPERATURE = 4.0

# Per-batch KL divergence between the teacher and each student.
kl_div_loss_soft_list = []
kl_div_loss_soft_dml_list = []
kl_div_loss_at_list = []
kl_div_loss_at_dml_list = []

# Per-batch cross-entropy against the ground-truth labels. The names keep the
# historical "binary_" prefix because later cells read them, but the values
# come from the multi-class cross_entropy_loss.
binary_cross_entropy_loss_teacher_list = []
binary_cross_entropy_loss_student_soft_list = []
binary_cross_entropy_loss_student_soft_dml_list = []
binary_cross_entropy_loss_student_at_list = []
binary_cross_entropy_loss_student_at_dml_list = []

# (display name, model, KL list, CE list) for every student being evaluated.
student_records = [
    ("Student Soft", student_soft,
     kl_div_loss_soft_list, binary_cross_entropy_loss_student_soft_list),
    ("Student Soft DML", student_soft_dml,
     kl_div_loss_soft_dml_list, binary_cross_entropy_loss_student_soft_dml_list),
    ("Student AT", student_at,
     kl_div_loss_at_list, binary_cross_entropy_loss_student_at_list),
    ("Student AT DML", student_at_dml,
     kl_div_loss_at_dml_list, binary_cross_entropy_loss_student_at_dml_list),
]

with torch.no_grad():
    # A progress bar replaces the per-batch prints, which produced a wall of
    # output for a full pass over the test set.
    for inputs, labels in tqdm(test_loader, desc="Collecting losses"):
        inputs = inputs.to(device)
        labels = labels.to(device)

        teacher_logits = teacher(inputs)
        binary_cross_entropy_loss_teacher_list.append(
            cross_entropy_loss(teacher_logits, labels).item())

        for _, student, kl_values, ce_values in student_records:
            student_logits = student(inputs)
            kl_values.append(
                kl_div_loss(teacher_logits, student_logits, temperature=KD_TEMPERATURE).item())
            ce_values.append(cross_entropy_loss(student_logits, labels).item())

# Summary: mean of the per-batch values for each model.
print(f"Teacher CE Loss (mean over batches): {np.mean(binary_cross_entropy_loss_teacher_list):.4f}")
for name, _, kl_values, ce_values in student_records:
    print(f"{name} - KL Divergence to teacher: {np.mean(kl_values):.4f} | "
          f"CE Loss: {np.mean(ce_values):.4f}")

In [None]:
def calculate_attention_similarity(teacher, student, loader, device, total_similarity_list,
                                   adaptation_layers=None, use_cosine=True, verbose=True):
    """Compare the captured feature maps of two models over a data loader.

    For every batch, both models are run and their ``get_attention_maps()``
    outputs are compared layer by layer. The per-layer scores are averaged into
    one value per batch, which is appended to ``total_similarity_list``.

    Args:
        teacher: Model exposing ``get_attention_maps()``; the reference side.
        student: Model exposing ``get_attention_maps()``; its maps are passed
            through the matching adaptation layer, if any.
        loader: Iterable of ``(inputs, labels)`` batches; labels are ignored.
        device: Device the inputs are moved to.
        total_similarity_list: List that receives one score per batch (mutated
            in place).
        adaptation_layers: Optional iterable with one entry per captured layer;
            each entry is a module applied to the student map, or ``None`` to
            leave that map untouched. ``None`` (the default) applies no
            adaptation at all. Layers beyond the length of this iterable are
            not compared.
        use_cosine: When true, score each layer with the batch mean of the
            per-sample cosine similarity of the flattened maps. Otherwise use
            the L2 norm of the whole-batch difference divided by the batch
            size.
        verbose: Print the score of every batch (default keeps the original
            behaviour).

    Returns:
        Mean of the per-batch scores, or ``0`` if no batch produced a score.
    """
    teacher.eval()
    student.eval()
    total_similarity = 0
    total_batches = 0

    with torch.no_grad():
        for inputs, _ in loader:
            inputs = inputs.to(device)

            # Each forward pass refreshes the maps captured by that model.
            _ = teacher(inputs)
            t_atts = teacher.get_attention_maps()

            _ = student(inputs)
            s_atts = student.get_attention_maps()

            # Without adapters, pair every layer with None. Zipping against
            # the None default itself would raise TypeError.
            if adaptation_layers is None:
                adapters = [None] * len(t_atts)
            else:
                adapters = adaptation_layers

            batch_sim = 0
            count = 0

            for t_att, s_att, adapt_layer in zip(t_atts, s_atts, adapters):
                if t_att is None or s_att is None:
                    continue

                # Apply adaptation layers if provided
                if adapt_layer is not None:
                    s_att = adapt_layer(s_att)

                # Handle spatial mismatch by pooling the teacher map down to
                # the student's height and width.
                if t_att.shape[-2:] != s_att.shape[-2:]:
                    t_att = F.adaptive_avg_pool2d(t_att, s_att.shape[-2:])

                # reshape (not view) so non-contiguous maps are accepted too.
                t_flat = t_att.reshape(t_att.size(0), -1)
                s_flat = s_att.reshape(s_att.size(0), -1)

                if use_cosine:
                    cos_sim = F.cosine_similarity(t_flat, s_flat, dim=1)
                    batch_sim += cos_sim.mean().item()
                else:
                    batch_sim += (torch.norm(t_flat - s_flat, p=2).item() / t_att.size(0))
                count += 1

            if count > 0:
                batch_score = batch_sim / count
                total_similarity += batch_score
                total_batches += 1
                if verbose:
                    print(f"Average similarity for batch: {batch_score}")
                total_similarity_list.append(batch_score)

    return total_similarity / total_batches if total_batches > 0 else 0

In [None]:
# Build one 1x1 convolution per captured layer that maps student_at's channel
# count to the teacher's, so the two feature maps can be compared directly.
# NOTE(review): these convolutions are created with their default random
# initialisation and are neither trained nor loaded from a checkpoint in this
# notebook, and no seed is set — confirm that comparing against a random
# projection is intended before interpreting the resulting similarities.
adaptation_layers = nn.ModuleList()
with torch.no_grad():
    # A random dummy batch is used only to read the captured feature-map shapes.
    dummy_input = torch.randn(2, 3, 224, 224).to(device)
    _ = teacher(dummy_input)
    teacher_atts = teacher.get_attention_maps()
    _ = student_at(dummy_input)
    student_atts = student_at.get_attention_maps()

    for t_att, s_att in zip(teacher_atts, student_atts):
        # Skip layers whose hook captured nothing.
        if t_att is None or s_att is None:
            continue
        # Dim 1 of the captured maps is taken as the channel count.
        in_channels = s_att.shape[1]
        out_channels = t_att.shape[1]
        adaptation_layers.append(
            nn.Conv2d(in_channels, out_channels, kernel_size=1).to(device))

In [None]:
# Teacher vs. student_at: one score per test batch is appended to this list.
# Despite the "_loss" suffix the values are cosine similarities, because
# use_cosine keeps its default of True.
cosine_similarity_loss_teacher_student_at_list = []
calculate_attention_similarity(teacher, student_at, test_loader, device, cosine_similarity_loss_teacher_student_at_list, adaptation_layers=adaptation_layers)

In [None]:
# Build one 1x1 convolution per captured layer that maps student_at_dml's
# channel count to the teacher's. This rebinds `adaptation_layers`, replacing
# any previously built set.
# NOTE(review): these convolutions are created with their default random
# initialisation and are neither trained nor loaded from a checkpoint in this
# notebook, and no seed is set — confirm that comparing against a random
# projection is intended before interpreting the resulting similarities.
adaptation_layers = nn.ModuleList()
with torch.no_grad():
    # A random dummy batch is used only to read the captured feature-map shapes.
    dummy_input = torch.randn(2, 3, 224, 224).to(device)
    _ = teacher(dummy_input)
    teacher_atts = teacher.get_attention_maps()
    _ = student_at_dml(dummy_input)
    student_atts = student_at_dml.get_attention_maps()

    for t_att, s_att in zip(teacher_atts, student_atts):
        # Skip layers whose hook captured nothing.
        if t_att is None or s_att is None:
            continue
        # Dim 1 of the captured maps is taken as the channel count.
        in_channels = s_att.shape[1]
        out_channels = t_att.shape[1]
        adaptation_layers.append(
            nn.Conv2d(in_channels, out_channels, kernel_size=1).to(device))

In [None]:
# Teacher vs. student_at_dml: one score per test batch is appended to this
# list. Despite the "_loss" suffix the values are cosine similarities, because
# use_cosine keeps its default of True.
cosine_similarity_loss_teacher_student_at_dml_list = []
calculate_attention_similarity(teacher, student_at_dml, test_loader, device, cosine_similarity_loss_teacher_student_at_dml_list, adaptation_layers=adaptation_layers)

In [None]:
# Build one 1x1 convolution per captured layer for the student_at vs.
# student_at_dml comparison. Here student_at plays the reference role, so the
# `teacher_atts` / `t_att` names below actually hold student_at's maps.
# NOTE(review): both models are built as 'resnet18' wrappers, so the channel
# counts presumably already match and these randomly initialised, untrained
# convolutions only perturb student_at_dml's features — confirm whether
# adaptation is wanted for this comparison at all.
adaptation_layers = nn.ModuleList()
with torch.no_grad():
    # A random dummy batch is used only to read the captured feature-map shapes.
    dummy_input = torch.randn(2, 3, 224, 224).to(device)
    _ = student_at(dummy_input)
    teacher_atts = student_at.get_attention_maps()
    _ = student_at_dml(dummy_input)
    student_atts = student_at_dml.get_attention_maps()

    for t_att, s_att in zip(teacher_atts, student_atts):
        # Skip layers whose hook captured nothing.
        if t_att is None or s_att is None:
            continue
        # Dim 1 of the captured maps is taken as the channel count.
        in_channels = s_att.shape[1]
        out_channels = t_att.shape[1]
        adaptation_layers.append(
            nn.Conv2d(in_channels, out_channels, kernel_size=1).to(device))

In [None]:
# student_at (reference) vs. student_at_dml: one score per test batch is
# appended to this list. Despite the "_loss" suffix the values are cosine
# similarities, because use_cosine keeps its default of True.
cosine_similarity_loss_student_at_student_at_dml_list = []
calculate_attention_similarity(student_at, student_at_dml, test_loader, device, cosine_similarity_loss_student_at_student_at_dml_list, adaptation_layers=adaptation_layers)

In [None]:
# 1-based index of every test batch.
# NOTE(review): not referenced by any of the visible cells — possibly left
# over from a per-batch plot; confirm before removing.
batch_numbers = list(range(1, len(test_loader) + 1))

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Mean of the per-batch metrics for the two attention-transfer students.
# Order matches `loss_labels`: KL divergence to the teacher, cross-entropy
# against the labels, and absolute cosine similarity to the teacher's features.
mean_losses_model1 = [
    np.mean(kl_div_loss_at_list),
    np.mean(binary_cross_entropy_loss_student_at_list),
    np.mean(np.abs(cosine_similarity_loss_teacher_student_at_list)),
]
mean_losses_model2 = [
    np.mean(kl_div_loss_at_dml_list),
    np.mean(binary_cross_entropy_loss_student_at_dml_list),
    np.mean(np.abs(cosine_similarity_loss_teacher_student_at_dml_list)),
]

# The classification loss is the multi-class cross-entropy, not a binary one.
loss_labels = ['KL Divergence', 'Cross Entropy', 'Cosine Similarity']

# Grouped bar plot: the two bars of each group sit either side of its tick.
width = 0.25  # Width of the bars
x = np.arange(len(loss_labels))  # Label positions

fig, ax = plt.subplots()
ax.bar(x - width / 2, mean_losses_model1, width, label='AT', color='blue')
ax.bar(x + width / 2, mean_losses_model2, width, label='AT + DML', color='green')

# Add labels and title
ax.set_ylabel('Mean over test batches')
ax.set_title('Comparison of Losses Across Models')
ax.set_xticks(x)
ax.set_xticklabels(loss_labels)
ax.legend()

# Display the plot
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Mean per-batch cross-entropy of the two attention-transfer students.
mean_losses_model1 = [np.mean(binary_cross_entropy_loss_student_at_list)]
mean_losses_model2 = [np.mean(binary_cross_entropy_loss_student_at_dml_list)]

# The classification loss is the multi-class cross-entropy, not a binary one.
loss_labels = ['Cross Entropy']

# A single group of two bars centred on its tick; no zero-height dummy bars
# or empty tick labels are needed.
width = 0.25  # Width of the bars
x = np.arange(len(loss_labels))  # Label positions

fig, ax = plt.subplots()
ax.bar(x - width / 2, mean_losses_model1, width, label='AT', color='blue')
ax.bar(x + width / 2, mean_losses_model2, width, label='AT + DML', color='green')

# Add labels and title
ax.set_ylabel('Mean Loss')
ax.set_title('Comparison of Cross Entropy Loss Across Models')
ax.set_xticks(x)
ax.set_xticklabels(loss_labels)
ax.set_xlim(-1, 1)  # keep the two bars narrow instead of filling the axes
ax.legend()

# Display the plot
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Mean of the per-batch teacher-relative metrics for the two
# attention-transfer students. Order matches `loss_labels`.
mean_losses_model1 = [
    np.mean(kl_div_loss_at_list),
    np.mean(np.abs(cosine_similarity_loss_teacher_student_at_list)),
]
mean_losses_model2 = [
    np.mean(kl_div_loss_at_dml_list),
    np.mean(np.abs(cosine_similarity_loss_teacher_student_at_dml_list)),
]

# Labels for the plotted metrics (only KL and Cosine)
loss_labels = ['KL Divergence', 'Cosine Similarity']

# Grouped bar plot: the two bars of each group sit either side of its tick.
width = 0.25  # Width of the bars
x = np.arange(len(loss_labels))  # Label positions

fig, ax = plt.subplots()
ax.bar(x - width / 2, mean_losses_model1, width, label='AT', color='blue')
ax.bar(x + width / 2, mean_losses_model2, width, label='AT + DML', color='green')

# Add labels and title
ax.set_ylabel('Mean over test batches')
ax.set_title('Comparison of KL Divergence and Cosine Similarity Losses Across Models')
ax.set_xticks(x)
ax.set_xticklabels(loss_labels)
ax.legend()

# Display the plot
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Mean absolute per-batch cosine similarity between the teacher's features and
# each attention-transfer student's (adapted) features.
mean_losses_model1 = [np.mean(np.abs(cosine_similarity_loss_teacher_student_at_list))]
mean_losses_model2 = [np.mean(np.abs(cosine_similarity_loss_teacher_student_at_dml_list))]

loss_labels = ['Cosine Similarity']

# A single group of two bars centred on its tick; no zero-height dummy bars
# or empty tick labels are needed.
width = 0.25  # Width of the bars
x = np.arange(len(loss_labels))  # Label positions

fig, ax = plt.subplots()
ax.bar(x - width / 2, mean_losses_model1, width, label='AT', color='blue')
ax.bar(x + width / 2, mean_losses_model2, width, label='AT + DML', color='green')

# Add labels and title. The plotted values are similarities (higher means
# closer to the teacher), so the axis is not labelled as a loss.
ax.set_ylabel('Mean |cosine similarity|')
ax.set_title('Comparison of Cosine Similarity Across Models')
ax.set_xticks(x)
ax.set_xticklabels(loss_labels)
ax.set_xlim(-1, 1)  # keep the two bars narrow instead of filling the axes
ax.legend()

# Display the plot
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Mean per-batch KL divergence between the teacher and each
# attention-transfer student.
mean_losses_model1 = [np.mean(kl_div_loss_at_list)]
mean_losses_model2 = [np.mean(kl_div_loss_at_dml_list)]

loss_labels = ['KL Divergence']

# A single group of two bars centred on its tick; no zero-height dummy bars
# or empty tick labels are needed.
width = 0.25  # Width of the bars
x = np.arange(len(loss_labels))  # Label positions

fig, ax = plt.subplots()
ax.bar(x - width / 2, mean_losses_model1, width, label='AT', color='blue')
ax.bar(x + width / 2, mean_losses_model2, width, label='AT + DML', color='green')

# Add labels and title
ax.set_ylabel('Mean Loss')
ax.set_title('Comparison of KL Divergence Loss Across Models')
ax.set_xticks(x)
ax.set_xticklabels(loss_labels)
ax.set_xlim(-1, 1)  # keep the two bars narrow instead of filling the axes
ax.legend()

# Display the plot
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): this cell plots the same AT vs. AT + DML cross-entropy
# comparison as an earlier cell in this notebook; consider deleting one.
# Mean per-batch cross-entropy of the two attention-transfer students.
mean_losses_model1 = [np.mean(binary_cross_entropy_loss_student_at_list)]
mean_losses_model2 = [np.mean(binary_cross_entropy_loss_student_at_dml_list)]

# The classification loss is the multi-class cross-entropy, not a binary one.
loss_labels = ['Cross Entropy']

# A single group of two bars centred on its tick; no zero-height dummy bars
# or empty tick labels are needed.
width = 0.25  # Width of the bars
x = np.arange(len(loss_labels))  # Label positions

fig, ax = plt.subplots()
ax.bar(x - width / 2, mean_losses_model1, width, label='AT', color='blue')
ax.bar(x + width / 2, mean_losses_model2, width, label='AT + DML', color='green')

# Add labels and title
ax.set_ylabel('Mean Loss')
ax.set_title('Comparison of Cross Entropy Loss Across Models')
ax.set_xticks(x)
ax.set_xticklabels(loss_labels)
ax.set_xlim(-1, 1)  # keep the two bars narrow instead of filling the axes
ax.legend()

# Display the plot
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Mean per-batch cross-entropy of the two soft-target students. The
# attention-similarity lists are not used here: they were collected for the
# attention-transfer students, not for these models.
mean_losses_model1 = [np.mean(binary_cross_entropy_loss_student_soft_list)]
mean_losses_model2 = [np.mean(binary_cross_entropy_loss_student_soft_dml_list)]

# The classification loss is the multi-class cross-entropy, not a binary one.
loss_labels = ['Cross Entropy']

# A single group of two bars centred on its tick; no zero-height dummy bars
# or empty tick labels are needed.
width = 0.25  # Width of the bars
x = np.arange(len(loss_labels))  # Label positions

fig, ax = plt.subplots()
ax.bar(x - width / 2, mean_losses_model1, width, label='Soft', color='blue')
ax.bar(x + width / 2, mean_losses_model2, width, label='Soft + DML', color='green')

# Add labels and title
ax.set_ylabel('Mean Loss')
ax.set_title('Comparison of Cross Entropy Loss Across Models')
ax.set_xticks(x)
ax.set_xticklabels(loss_labels)
ax.set_xlim(-1, 1)  # keep the two bars narrow instead of filling the axes
ax.legend()

# Display the plot
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Mean per-batch KL divergence between the teacher and each soft-target
# student. The attention-similarity lists are not used here: they were
# collected for the attention-transfer students, not for these models.
mean_losses_model1 = [np.mean(kl_div_loss_soft_list)]
mean_losses_model2 = [np.mean(kl_div_loss_soft_dml_list)]

loss_labels = ['KL Divergence']

# A single group of two bars centred on its tick; no zero-height dummy bars
# or empty tick labels are needed.
width = 0.25  # Width of the bars
x = np.arange(len(loss_labels))  # Label positions

fig, ax = plt.subplots()
ax.bar(x - width / 2, mean_losses_model1, width, label='Soft', color='blue')
ax.bar(x + width / 2, mean_losses_model2, width, label='Soft + DML', color='green')

# Add labels and title
ax.set_ylabel('Mean Loss')
ax.set_title('Comparison of KL Divergence Loss Across Models')
ax.set_xticks(x)
ax.set_xticklabels(loss_labels)
ax.set_xlim(-1, 1)  # keep the two bars narrow instead of filling the axes
ax.legend()

# Display the plot
plt.show()