# Preface

This code is meant to be used for hyperparameter optimization. It is not the code used to generate the final results and visualizations. If you want to re-create the final results, please run the 'final_version_Thesis_KDmodels.py' file located in the GitHub repo; it already contains the best hyperparameters that I found.

# loading pickle files or data

In [None]:
# Mount Google Drive into the Colab runtime so files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os

# Define the path to your folder.
# Relative paths used later in this notebook (e.g. './results/...') resolve against it.
folder_path = '/content/drive/MyDrive/Thesis_Bsc'
os.chdir(folder_path)

# Check the current working directory to ensure you are in the correct folder
print("Current Working Directory: ", os.getcwd())


Current Working Directory:  /content/drive/MyDrive/Thesis_Bsc


# Validation for Teacher and/or Student

First we must set up the parameter grid with the hyperparameters we want to test. Run the code block containing the functions (train/validate/test) before running the param_grid code block. This block tunes the Teacher and the Student; because these two models have the most impact on the methods that follow, make sure this step is done with the most care.

In [None]:
import itertools

from torch.utils.data import ConcatDataset

# Hyperparameter grid for the plain (non-distilled) teacher/student training.
# Every combination of the three lists below is trained and validated once.
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'batch_size': [64, 128, 256],
    'num_epochs': [10, 20, 30]
}

# Create a list of all combinations of hyperparameters
param_combinations = list(itertools.product(param_grid['learning_rate'], param_grid['batch_size'], param_grid['num_epochs']))

best_val_accuracy = 0
best_params = None
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Iterate over all combinations of hyperparameters
for learning_rate, batch_size, num_epochs in param_combinations:
    # Create data loaders with the current batch size.
    # NOTE: this rebinds the module-level train_loader/val_loader used by later cells.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    # Initialize the model on the target device; the batches are moved to `device`
    # inside train()/validate(), so the weights must live there as well.
    model = LightNN().to(device)  # or DeepNN() depending on which model you want to optimize

    # Train the model
    train(model, train_loader, val_loader, epochs=num_epochs, learning_rate=learning_rate, device=device)

    # Validate the model
    val_loss, val_accuracy = validate(model, val_loader, device)

    # Track the best performing hyperparameters
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_params = {'learning_rate': learning_rate, 'batch_size': batch_size, 'num_epochs': num_epochs}

print(f"Best Hyperparameters: {best_params}")
print(f"Best Validation Accuracy: {best_val_accuracy:.2f}%")

if best_params is None:
    # Happens only when no run scored above 0% validation accuracy.
    raise RuntimeError("No hyperparameter combination achieved a validation accuracy above 0%.")

# Train the final model with the best hyperparameters on the combined training and validation set.
# train_dataset/val_dataset are the two halves produced by random_split, so concatenating them
# restores the full training set.
full_train_dataset = ConcatDataset([train_dataset, val_dataset])
full_train_loader = DataLoader(full_train_dataset, batch_size=best_params['batch_size'], shuffle=True, num_workers=2)
final_model = LightNN().to(device)  # or DeepNN()
train(final_model, full_train_loader, val_loader, epochs=best_params['num_epochs'], learning_rate=best_params['learning_rate'], device=device)

# Test the final model. test() returns a dict of metrics expressed as fractions, so pick the
# accuracy entry and scale it to a percentage before printing.
test_metrics = test(final_model, test_loader, device)
test_accuracy = test_metrics["accuracy"] * 100
print(f"Final Test Accuracy: {test_accuracy:.2f}%")


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
# from torchmetrics import Accuracy, Precision, Recall, F1Score
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import datetime

# Check if GPU is available, and if not, use the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import pickle

# Function to load a pickle file
def load_pickle(file_path):
    """Return the object deserialized from the pickle file at ``file_path``.

    The file is opened in binary mode and closed again before returning.
    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

# Locations of the pickled image arrays and their label arrays on Google Drive.
# Replace these with the actual paths to your own pickled files.
image_data_path = '/content/drive/MyDrive/Thesis_Bsc/image_data.pkl'
one_hot_encoded_data_path = '/content/drive/MyDrive/Thesis_Bsc/one_hot_encoded_data.pkl'
test_image_data_path = '/content/drive/MyDrive/Thesis_Bsc/test_image_data.pkl'
test_one_hot_encoded_data_path = '/content/drive/MyDrive/Thesis_Bsc/test_one_hot_encoded_data.pkl'

# Unpickle the four files, in the same order as the paths are declared above.
image_data, one_hot_encoded_data, test_image_data, test_one_hot_encoded_data = (
    load_pickle(pickle_path)
    for pickle_path in (
        image_data_path,
        one_hot_encoded_data_path,
        test_image_data_path,
        test_one_hot_encoded_data_path,
    )
)

from torch.utils.data import Dataset, DataLoader

class SimpleDataset(Dataset):
    """Map-style dataset that pairs ``data[i]`` with ``labels[i]``.

    Args:
        data: Indexable collection of samples; its length defines the dataset length.
        labels: Indexable collection of labels, indexed with the same ``idx`` as ``data``.
        transform: Optional callable applied to the sample (never to the label).
    """

    def __init__(self, data, labels, transform=None):
        self.data, self.labels, self.transform = data, labels, transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample, target = self.data[idx], self.labels[idx]

        # Without a transform the raw sample is returned untouched.
        if not self.transform:
            return sample, target

        return self.transform(sample), target


# Average the three hard-coded statistics (presumably per-channel mean/std of the dataset —
# TODO confirm) into one scalar each, because Normalize below is given a single mean/std entry.
calculated_mean = torch.tensor([0.5075, 0.5064, 0.5082]).mean().item()
calculated_std = torch.tensor([0.2556, 0.2558, 0.2541]).mean().item()


# Preprocessing applied to every image: convert to a tensor, then normalize with the scalars above.
transforms_data = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[calculated_mean], std=[calculated_std])
])

# random_split is used below but is not part of the `from torch.utils.data import ...` line above.
from torch.utils.data import random_split

# Wrap the unpickled training/test images and their labels in datasets that apply `transforms_data`.
train_dataset = SimpleDataset(image_data, one_hot_encoded_data, transform=transforms_data)
test_dataset = SimpleDataset(test_image_data, test_one_hot_encoded_data, transform=transforms_data)

# Define the split ratio for train and validation sets (85% / 15%)
train_size = int(0.85 * len(train_dataset))
val_size = len(train_dataset) - train_size

# Use a dedicated, seeded generator so every run (and every hyperparameter combination)
# is validated on the same held-out samples.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size], generator=split_generator)

# Create the data loaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=2)

def validate(model, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` and return ``(avg_loss, accuracy_percent)``.

    Works for models whose forward pass returns either the logits tensor or a
    tuple/list whose first element is the logits (the modified cosine/regressor
    networks in this notebook), so re-running notebook cells in a different order
    cannot leave an incompatible ``validate`` behind.

    Args:
        model: Network to evaluate; it is switched to eval mode and left there.
        val_loader: Iterable of ``(inputs, labels)`` batches. Labels are expected
            one-hot encoded with a singleton middle dimension, i.e. ``(batch, 1, classes)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple of the cross-entropy loss averaged over batches and the accuracy in percent.

    Raises:
        ZeroDivisionError: If ``val_loader`` yields no batches.
    """
    model.eval()

    val_loss = 0.0
    correct = 0
    total = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for inputs, labels in val_loader:
            # Drop the middle dimension and turn one-hot rows into class indices.
            labels = labels.squeeze(1).max(dim=1)[1]
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            if isinstance(outputs, (tuple, list)):
                # Modified networks return (logits, hidden representation); keep the logits.
                outputs = outputs[0]

            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * correct / total
    return avg_val_loss, val_accuracy

def train(model, train_loader, val_loader, epochs, learning_rate, device):
    """Train ``model`` with Adam and cross-entropy, validating after every epoch.

    Args:
        model: Network to optimize in place; it is moved to ``device`` first.
        train_loader: Iterable of ``(inputs, labels)`` training batches; labels are
            one-hot encoded with a singleton middle dimension.
        val_loader: Batches passed to ``validate`` at the end of each epoch.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate handed to ``optim.Adam``.
        device: Device used for the model and for every batch.

    Returns:
        None. Per-epoch train loss, validation loss and validation accuracy are printed.
    """
    criterion = nn.CrossEntropyLoss()

    # Batches are moved to `device` below, so the weights must live there too.
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        # validate() leaves the model in eval mode, so training mode (dropout active)
        # has to be restored at the start of every epoch, not just once.
        model.train()

        running_loss = 0.0
        for inputs, labels in train_loader:
            # Drop the middle dimension and turn one-hot rows into class indices.
            labels = labels.squeeze(1).max(dim=1)[1]
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)
        avg_val_loss, val_accuracy = validate(model, val_loader, device)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")

def test(model, test_loader, device):
    """Evaluate ``model`` on ``test_loader``, save a confusion matrix and return the metrics.

    Args:
        model: Network to evaluate; it is moved to ``device`` and put in eval mode.
        test_loader: Iterable of ``(inputs, labels)`` batches; labels are one-hot
            encoded with a singleton middle dimension.
        device: Device used for the model, the batches and the metric objects.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1_score"``, each a Python float taken from the metric's ``compute()``.

    Side effects:
        Prints the predictions and labels of every batch plus the overall metrics,
        and writes the confusion matrix to
        ``./results/confusion_matrix_<ModelClass>_<timestamp>.csv``.
    """
    import os

    model.to(device)
    model.eval()

    # Initialize metrics.
    # NOTE(review): Accuracy/Precision/Recall/F1Score come from torchmetrics, but the
    # `from torchmetrics import ...` line at the top of this cell is commented out;
    # re-enable it before calling this function.
    accuracy_metric = Accuracy(num_classes=6, task='multiclass').to(device)
    precision_metric = Precision(num_classes=6, task='multiclass').to(device)
    recall_metric = Recall(num_classes=6, task='multiclass').to(device)
    f1_metric = F1Score(num_classes=6, task='multiclass').to(device)

    all_preds = []
    all_labels = []

    # Loop through batches in the test set
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Convert labels from one-hot encoded to class indices
            labels = labels.squeeze(1).max(dim=1)[1]
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)

            # Update metrics
            accuracy_metric.update(predicted, labels)
            precision_metric.update(predicted, labels)
            recall_metric.update(predicted, labels)
            f1_metric.update(predicted, labels)
            print("Predictions:", predicted)
            print("True labels:", labels)

            # Keep CPU copies for the confusion matrix computed after the loop.
            all_preds.append(predicted.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    # Compute overall metrics after going through all batches
    overall_accuracy = accuracy_metric.compute()
    overall_precision = precision_metric.compute()
    overall_recall = recall_metric.compute()
    overall_f1_score = f1_metric.compute()

    # Print overall metrics
    print(f'Overall Test - '
          f'Accuracy: {overall_accuracy:.2f}, '
          f'Precision: {overall_precision:.3f}, '
          f'Recall: {overall_recall:.3f}, '
          f'F1: {overall_f1_score:.3f}')

    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    cm = confusion_matrix(all_labels, all_preds)

    cm_df = pd.DataFrame(cm)

    model_class_name = model.__class__.__name__
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    # to_csv does not create missing directories, so make sure ./results exists first.
    os.makedirs('./results', exist_ok=True)
    csv_file_path = f'./results/confusion_matrix_{model_class_name}_{timestamp}.csv'
    cm_df.to_csv(csv_file_path, index=False)

    # Return a dictionary of the overall metrics
    return {
        "accuracy": overall_accuracy.item(),
        "precision": overall_precision.item(),
        "recall": overall_recall.item(),
        "f1_score": overall_f1_score.item()
    }

# Deeper neural network class to be used as teacher:
class DeepNN(nn.Module):
    """Teacher CNN: four 3x3 convolutions with two 2x2 max-pools, then a two-layer classifier.

    The first convolution takes 1 input channel and the first linear layer takes
    4608 features, which corresponds to 32 channels of 12x12 (a 48x48 input).

    Args:
        num_classes: Size of the output logit vector.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        head = [
            nn.Linear(4608, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, num_classes),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return the class logits for a batch of images."""
        feature_maps = self.features(x)
        return self.classifier(torch.flatten(feature_maps, 1))

# Lightweight neural network class to be used as student:
class LightNN(nn.Module):
    """Student CNN: two 3x3 convolutions, each followed by a 2x2 max-pool, then a two-layer classifier.

    The first convolution takes 1 input channel and the first linear layer takes
    2304 features, which corresponds to 16 channels of 12x12 (a 48x48 input).

    Args:
        num_classes: Size of the output logit vector.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        head = [
            nn.Linear(2304, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_classes),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return the class logits for a batch of images."""
        feature_maps = self.features(x)
        return self.classifier(torch.flatten(feature_maps, 1))

# Instantiate the teacher. Seeding right before construction fixes its initial weights.
torch.manual_seed(42)
nn_deep = DeepNN(num_classes=6).to(device)
# Stays None until the two lines below are re-enabled, so the report at the end of
# this cell cannot fail with a NameError.
results_deep = None
#train(nn_deep, train_loader, val_loader, epochs=25, learning_rate=0.001, device=device)
#results_deep = test(nn_deep, test_loader, device)

# Instantiate the lightweight network:
torch.manual_seed(42)
nn_light = LightNN(num_classes=6).to(device)

# Second student with the same seed, i.e. the same initial weights as nn_light.
torch.manual_seed(42)
new_nn_light = LightNN(num_classes=6).to(device)

# Print the norm of the first layer of the initial lightweight model
print("Norm of 1st layer of nn_light:", torch.norm(nn_light.features[0].weight).item())
# Print the norm of the first layer of the new lightweight model
print("Norm of 1st layer of new_nn_light:", torch.norm(new_nn_light.features[0].weight).item())

total_params_deep = "{:,}".format(sum(p.numel() for p in nn_deep.parameters()))
print(f"DeepNN parameters: {total_params_deep}")
total_params_light = "{:,}".format(sum(p.numel() for p in nn_light.parameters()))
print(f"LightNN parameters: {total_params_light}")

# Stays None until the two lines below are re-enabled (see results_deep above).
results_light_ce = None
#train(nn_light, train_loader, val_loader, epochs=10, learning_rate=0.001, device=device)
#results_light_ce = test(nn_light, test_loader, device)

# Print all metrics for both models. test() returns fractions, so scale them by 100
# before appending the percent sign.
for report_title, report_results in (
    ("Deep Model Metrics:", results_deep),
    ("Lightweight Model Metrics:", results_light_ce),
):
    print(report_title)
    if report_results is None:
        print("(not available - enable the train/test lines above first)")
        continue
    for metric, value in report_results.items():
        print(f"{metric.capitalize()}: {value * 100:.2f}%")


# Validation for KD approach using output labels
Same as the previous section, but modified for the first distillation function (distillation on the teacher's output logits).

In [23]:

def train_knowledge_distillation(teacher, student, train_loader, val_loader, epochs, learning_rate, T, soft_target_loss_weight, ce_loss_weight, device):
    """Train ``student`` on a mix of soft-target (teacher) loss and hard-label cross-entropy.

    After training, the student is restored to the weights of the epoch with the
    highest validation accuracy.

    Args:
        teacher: Frozen network providing logits; only used under ``torch.no_grad()``.
        student: Network optimized in place with Adam.
        train_loader: Iterable of ``(inputs, labels)`` batches; labels are one-hot
            encoded with a singleton middle dimension.
        val_loader: Batches passed to ``validate`` after every epoch.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate handed to ``optim.Adam``.
        T: Temperature dividing both teacher and student logits before the softmax.
        soft_target_loss_weight: Weight of the soft-target (KL) loss term.
        ce_loss_weight: Weight of the cross-entropy loss on the true labels.
        device: Device used for both models and every batch.

    Returns:
        The best validation accuracy (percent) seen over all epochs.
    """
    import copy

    ce_loss = nn.CrossEntropyLoss()

    # Batches are moved to `device` below, so both networks must live there too.
    teacher.to(device)
    student.to(device)
    optimizer = optim.Adam(student.parameters(), lr=learning_rate)

    teacher.eval()  # Teacher set to evaluation mode

    best_val_accuracy = 0
    # None means "no epoch improved on 0%"; the final weights are kept in that case.
    best_model_state = None

    for epoch in range(epochs):
        running_loss = 0.0
        student.train()  # validate() switches to eval mode, so re-enable training mode each epoch

        for inputs, labels in train_loader:
            # Remove the middle dimension and convert one-hot encoded labels to class indices
            labels = labels.squeeze(1).max(dim=1)[1]
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            # Forward pass with the teacher model - do not save gradients here as we do not change the teacher's weights
            with torch.no_grad():
                teacher_logits = teacher(inputs)

            # Forward pass with the student model
            student_logits = student(inputs)

            # Temperature-softened distributions. The teacher's log-probabilities are taken
            # from log_softmax directly: softmax(...).log() yields -inf for probabilities that
            # underflow to 0, and 0 * -inf would turn the loss into NaN.
            teacher_log_prob = nn.functional.log_softmax(teacher_logits / T, dim=-1)
            soft_targets = teacher_log_prob.exp()
            soft_prob = nn.functional.log_softmax(student_logits / T, dim=-1)

            # Calculate the soft targets loss. Scaled by T**2 as suggested by the authors of the paper "Distilling the knowledge in a neural network"
            soft_targets_loss = torch.sum(soft_targets * (teacher_log_prob - soft_prob)) / soft_prob.size()[0] * (T**2)

            # Calculate the true label loss
            label_loss = ce_loss(student_logits, labels)

            # Weighted sum of the two losses
            loss = soft_target_loss_weight * soft_targets_loss + ce_loss_weight * label_loss

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)
        avg_val_loss, val_accuracy = validate(student, val_loader, device)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # state_dict() returns references to the live parameters and dict.copy() is
            # shallow, so a deep copy is required to really freeze this epoch's weights.
            best_model_state = copy.deepcopy(student.state_dict())

    if best_model_state is not None:
        student.load_state_dict(best_model_state)
    return best_val_accuracy


In [28]:
def validate(model, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` and return ``(avg_loss, accuracy_percent)``.

    Accepts models whose forward pass returns either the logits tensor or a
    tuple/list whose first element is the logits, so the same function serves the
    plain networks and the modified (cosine / regressor) networks of this notebook.

    Args:
        model: Network to evaluate; it is switched to eval mode and left there.
        val_loader: Iterable of ``(inputs, labels)`` batches. Labels are expected
            one-hot encoded with a singleton middle dimension, i.e. ``(batch, 1, classes)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple of the cross-entropy loss averaged over batches and the accuracy in percent.

    Raises:
        ZeroDivisionError: If ``val_loader`` yields no batches.
    """
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for inputs, labels in val_loader:
            # Drop the middle dimension and turn one-hot rows into class indices.
            labels = labels.squeeze(1).max(dim=1)[1]
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            if isinstance(outputs, (tuple, list)):
                # Modified networks return (logits, hidden representation); keep the logits.
                outputs = outputs[0]
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * correct / total
    return avg_val_loss, val_accuracy


In [None]:
import itertools
import math

# Define the parameter grid
param_grid = {
    'T': [1.5, 2, 2.5],
    'soft_target_loss_weight': [0.25, 0.35, 0.45],
    'ce_loss_weight': [0.75, 0.65, 0.55]
}

# Create a list of all combinations of hyperparameters
param_combinations = list(itertools.product(param_grid['T'], param_grid['soft_target_loss_weight'], param_grid['ce_loss_weight']))

# NOTE(review): this creates a freshly initialized (untrained) teacher and replaces any
# trained `nn_deep` from an earlier cell. Load the trained teacher weights here before
# running the search, otherwise the student is distilled from random logits.
nn_deep = DeepNN(num_classes=6).to(device)
nn_deep.eval()  # Set the teacher model to evaluation mode

# Best hyperparameters initialization
best_val_accuracy = 0
best_params = None

# Loop through all combinations of hyperparameters
for T, soft_target_loss_weight, ce_loss_weight in param_combinations:
    # Ensure complementary loss weights. Compare with a tolerance: sums of decimal
    # fractions such as 0.35 + 0.65 are not guaranteed to equal 1.0 exactly in floating point.
    if not math.isclose(soft_target_loss_weight + ce_loss_weight, 1.0):
        print(f"Skipping invalid combination: T={T}, soft_target_loss_weight={soft_target_loss_weight}, ce_loss_weight={ce_loss_weight}")
        continue

    print(f"Training with T={T}, soft_target_loss_weight={soft_target_loss_weight}, ce_loss_weight={ce_loss_weight}")

    # Initialize student model (same seed each time, so every run starts from identical weights)
    torch.manual_seed(42)
    student_model = LightNN(num_classes=6).to(device)

    # Train and validate the student model
    val_accuracy = train_knowledge_distillation(
        teacher=nn_deep,
        student=student_model,
        train_loader=train_loader,
        val_loader=val_loader,
        epochs=25,
        learning_rate=0.001,
        T=T,
        soft_target_loss_weight=soft_target_loss_weight,
        ce_loss_weight=ce_loss_weight,
        device=device
    )

    # Track the best performing hyperparameters based on validation accuracy
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_params = {
            'T': T,
            'soft_target_loss_weight': soft_target_loss_weight,
            'ce_loss_weight': ce_loss_weight
        }

print(f"Best Hyperparameters: {best_params}")
print(f"Best Validation Accuracy: {best_val_accuracy:.2f}%")


# Now we need to test for the other 2 methods

In [None]:
class ModifiedDeepNNCosine(nn.Module):
    """Teacher variant that also returns a pooled copy of its flattened conv features.

    The layers mirror ``DeepNN`` (same module names and order), so a ``DeepNN``
    state dict can be loaded directly. The flattened 4608-wide feature vector is
    average-pooled with window 2 so its width (2304) matches the student's.

    Args:
        num_classes: Size of the output logit vector.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 128, kernel_size=3, padding=1),  # 1 input channel
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        # 4608 = 32 channels x 12 x 12, i.e. a 48x48 input after two 2x2 pools.
        head = [
            nn.Linear(4608, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, num_classes),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return ``(logits, pooled_features)`` for a batch of images."""
        flat = torch.flatten(self.features(x), 1)
        logits = self.classifier(flat)
        # Halve the feature width by averaging neighbouring pairs.
        pooled = torch.nn.functional.avg_pool1d(flat, 2)
        return logits, pooled

class ModifiedLightNNCosine(nn.Module):
    """Student variant that also returns its flattened conv features.

    The layers mirror ``LightNN`` (same module names and order). The flattened
    feature vector is 2304 wide, the same width as the teacher's pooled features.

    Args:
        num_classes: Size of the output logit vector.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 16, kernel_size=3, padding=1),  # 1 input channel
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        # 2304 = 16 channels x 12 x 12, i.e. a 48x48 input after two 2x2 pools.
        head = [
            nn.Linear(2304, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_classes),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return ``(logits, flattened_features)`` for a batch of images."""
        flat = torch.flatten(self.features(x), 1)
        return self.classifier(flat), flat

# We do not have to train the modified deep network from scratch of course, we just load its weights from the trained instance.
# This works because ModifiedDeepNNCosine uses the same module names and shapes as DeepNN.
modified_nn_deep = ModifiedDeepNNCosine(num_classes=6).to(device)
modified_nn_deep.load_state_dict(nn_deep.state_dict())

# Once again ensure the norm of the first layer is the same for both networks
print("Norm of 1st layer for deep_nn:", torch.norm(nn_deep.features[0].weight).item())
print("Norm of 1st layer for modified_deep_nn:", torch.norm(modified_nn_deep.features[0].weight).item())

# Initialize a modified lightweight network with the same seed as our other lightweight instances. This will be trained from scratch to examine the effectiveness of cosine loss minimization.
torch.manual_seed(42)
modified_nn_light = ModifiedLightNNCosine(num_classes=6).to(device)
print("Norm of 1st layer:", torch.norm(modified_nn_light.features[0].weight).item())

# Adjust sample input for grayscale images
sample_input = torch.randn(128, 1, 48,48).to(device) # Batch size: 128, Channels: 1 (grayscale), Image size: 48x48

# Pass the input through the student
logits, hidden_representation = modified_nn_light(sample_input)

# Print the shapes of the tensors
print("Student logits shape:", logits.shape) # batch_size x total_classes
print("Student hidden representation shape:", hidden_representation.shape) # batch_size x hidden_representation_size

# Pass the input through the teacher (rebinds logits/hidden_representation)
logits, hidden_representation = modified_nn_deep(sample_input)

# Print the shapes of the tensors
print("Teacher logits shape:", logits.shape) # batch_size x total_classes
print("Teacher hidden representation shape:", hidden_representation.shape) # batch_size x hidden_representation_size



In [None]:
def train_cosine_loss(teacher, student, train_loader, val_loader, epochs, learning_rate, hidden_rep_loss_weight, ce_loss_weight, device):
    """Train ``student`` with cross-entropy plus a cosine loss on hidden representations.

    Both networks must return ``(logits, hidden_representation)`` from their forward
    pass. After training, the student is restored to the weights of the epoch with
    the highest validation accuracy.

    Args:
        teacher: Frozen network; only its hidden representation is used.
        student: Network optimized in place with Adam.
        train_loader: Iterable of ``(inputs, labels)`` batches; labels are one-hot
            encoded with a singleton middle dimension.
        val_loader: Batches passed to ``validate`` after every epoch.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate handed to ``optim.Adam``.
        hidden_rep_loss_weight: Weight of the cosine-embedding loss term.
        ce_loss_weight: Weight of the cross-entropy loss on the true labels.
        device: Device used for both models and every batch.

    Returns:
        The best validation accuracy (percent) seen over all epochs.
    """
    import copy

    ce_loss = nn.CrossEntropyLoss()
    cosine_loss = nn.CosineEmbeddingLoss()
    optimizer = optim.Adam(student.parameters(), lr=learning_rate)

    teacher.to(device)
    student.to(device)
    teacher.eval()  # Teacher set to evaluation mode

    best_val_accuracy = 0
    # None means "no epoch improved on 0%"; the final weights are kept in that case.
    best_model_state = None

    for epoch in range(epochs):
        running_loss = 0.0
        student.train()  # validate() switches to eval mode, so re-enable training mode each epoch

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            # Drop the middle dimension and turn one-hot rows into class indices.
            labels = labels.squeeze(1).max(dim=1)[1]

            optimizer.zero_grad()

            # Forward pass with the teacher model and keep only the hidden representation
            with torch.no_grad():
                _, teacher_hidden_representation = teacher(inputs)

            # Forward pass with the student model
            student_logits, student_hidden_representation = student(inputs)

            # Calculate the cosine loss. Target is a vector of ones (pull the pairs together).
            similarity_target = torch.ones(inputs.size(0), device=device)
            hidden_rep_loss = cosine_loss(student_hidden_representation, teacher_hidden_representation, target=similarity_target)

            # Calculate the true label loss
            label_loss = ce_loss(student_logits, labels)

            # Weighted sum of the two losses
            loss = hidden_rep_loss_weight * hidden_rep_loss + ce_loss_weight * label_loss

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)
        avg_val_loss, val_accuracy = validate(student, val_loader, device)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # state_dict() returns references to the live parameters and dict.copy() is
            # shallow, so a deep copy is required to really freeze this epoch's weights.
            best_model_state = copy.deepcopy(student.state_dict())

    if best_model_state is not None:
        student.load_state_dict(best_model_state)
    return best_val_accuracy


In [None]:
def validate(model, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` and return ``(avg_loss, accuracy_percent)``.

    Accepts models whose forward pass returns either a tuple/list whose first
    element is the logits (the modified cosine / regressor networks) or the plain
    logits tensor. Unconditionally unpacking ``outputs, _ = model(inputs)`` would
    iterate over the batch dimension of a plain tensor and mis-handle it.

    Args:
        model: Network to evaluate; it is switched to eval mode and left there.
        val_loader: Iterable of ``(inputs, labels)`` batches. Labels are expected
            one-hot encoded with a singleton middle dimension, i.e. ``(batch, 1, classes)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple of the cross-entropy loss averaged over batches and the accuracy in percent.

    Raises:
        ZeroDivisionError: If ``val_loader`` yields no batches.
    """
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for inputs, labels in val_loader:
            # Drop the middle dimension and turn one-hot rows into class indices.
            labels = labels.squeeze(1).max(dim=1)[1]
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            if isinstance(outputs, (tuple, list)):
                # (logits, hidden representation): only the logits are scored.
                outputs = outputs[0]
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * correct / total
    return avg_val_loss, val_accuracy


In [None]:
import math

# Define the parameter grid
param_grid = {
    'hidden_rep_loss_weight': [0.25, 0.35, 0.45],
    'ce_loss_weight': [0.75, 0.65, 0.55]
}

# Create a list of all combinations of hyperparameters
param_combinations = list(itertools.product(param_grid['hidden_rep_loss_weight'], param_grid['ce_loss_weight']))

# Ensure complementary loss weights. Compare with a tolerance: sums of decimal fractions
# are not guaranteed to equal 1.0 exactly in floating point.
param_combinations = [(hrw, clw) for hrw, clw in param_combinations if math.isclose(hrw + clw, 1.0)]

# Load the pretrained teacher model (replace the placeholder with the real checkpoint path).
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime as well.
teacher_model_path = 'path_to_saved_teacher_model.pth'
nn_deep = DeepNN(num_classes=6).to(device)
nn_deep.load_state_dict(torch.load(teacher_model_path, map_location=device))
nn_deep.eval()  # Set the teacher model to evaluation mode

# Load the modified teacher model weights
modified_nn_deep = ModifiedDeepNNCosine(num_classes=6).to(device)
modified_nn_deep.load_state_dict(nn_deep.state_dict())

# Best hyperparameters initialization
best_val_accuracy = 0
best_params = None

# Loop through all combinations of hyperparameters
for hidden_rep_loss_weight, ce_loss_weight in param_combinations:
    print(f"Training with hidden_rep_loss_weight={hidden_rep_loss_weight}, ce_loss_weight={ce_loss_weight}")

    # Initialize student model (same seed each time, so every run starts from identical weights)
    torch.manual_seed(42)
    student_model = ModifiedLightNNCosine(num_classes=6).to(device)

    # Train and validate the student model
    val_accuracy = train_cosine_loss(
        teacher=modified_nn_deep,
        student=student_model,
        train_loader=train_loader,
        val_loader=val_loader,
        epochs=25,
        learning_rate=0.001,
        hidden_rep_loss_weight=hidden_rep_loss_weight,
        ce_loss_weight=ce_loss_weight,
        device=device
    )

    # Track the best performing hyperparameters based on validation accuracy
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_params = {
            'hidden_rep_loss_weight': hidden_rep_loss_weight,
            'ce_loss_weight': ce_loss_weight
        }

print(f"Best Hyperparameters: {best_params}")
print(f"Best Validation Accuracy: {best_val_accuracy:.2f}%")


# Validation for KD approach using MSE regression on feature maps


In [None]:
# Pass the sample input only through the convolutional feature extractors of the plain
# student (nn_light) and teacher (nn_deep) to compare the shapes of their feature maps.
convolutional_fe_output_student = nn_light.features(sample_input)
convolutional_fe_output_teacher = nn_deep.features(sample_input)

# Print their shapes
print("Student's feature extractor output shape: ", convolutional_fe_output_student.shape)
print("Teacher's feature extractor output shape: ", convolutional_fe_output_teacher.shape)

class ModifiedDeepNNRegressor(nn.Module):
    """Teacher variant that also returns its final convolutional feature map.

    The layers mirror ``DeepNN`` (same module names and order), so a ``DeepNN``
    state dict can be loaded directly.

    Args:
        num_classes: Size of the output logit vector.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 128, kernel_size=3, padding=1),  # 1 input channel
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        # 4608 = 32 channels x 12 x 12, i.e. a 48x48 input after two 2x2 pools.
        head = [
            nn.Linear(4608, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, num_classes),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return ``(logits, conv_feature_map)`` for a batch of images."""
        conv_feature_map = self.features(x)
        logits = self.classifier(torch.flatten(conv_feature_map, 1))
        return logits, conv_feature_map

class ModifiedLightNNRegressor(nn.Module):
    """Student variant with an extra conv "regressor" applied to its feature map.

    The regressor widens the student's 16-channel feature map to 32 channels, the
    channel count of the teacher's final feature map. The classifier still consumes
    the original (un-regressed) 16-channel features.

    Args:
        num_classes: Size of the output logit vector.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 16, kernel_size=3, padding=1),  # 1 input channel
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.features = nn.Sequential(*conv_stack)
        # Registered between features and classifier, as in the original layout.
        self.regressor = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, padding=1)
        )
        # 2304 = 16 channels x 12 x 12, i.e. a 48x48 input after two 2x2 pools.
        self.classifier = nn.Sequential(
            nn.Linear(2304, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Return ``(logits, regressor_output)`` for a batch of images."""
        feature_map = self.features(x)
        regressor_output = self.regressor(feature_map)
        logits = self.classifier(torch.flatten(feature_map, 1))
        return logits, regressor_output

# Example of initializing and preparing the models.
# Both instances start from fresh random weights here; no trained weights are loaded in this cell.
modified_deep_regressor = ModifiedDeepNNRegressor(num_classes=6).to(device)
modified_light_regressor = ModifiedLightNNRegressor(num_classes=6).to(device)

In [None]:
def train_mse_loss(teacher, student, train_loader, val_loader, epochs, learning_rate, feature_map_weight, ce_loss_weight, device):
    """Train ``student`` with cross-entropy plus an MSE loss between feature maps.

    Both networks must return ``(logits, feature_map)`` from their forward pass,
    with feature maps of identical shape. After training, the student is restored
    to the weights of the epoch with the highest validation accuracy.

    Args:
        teacher: Frozen network; only its feature map is used.
        student: Network optimized in place with Adam.
        train_loader: Iterable of ``(inputs, labels)`` batches; labels are one-hot
            encoded with a singleton middle dimension.
        val_loader: Batches passed to ``validate`` after every epoch.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate handed to ``optim.Adam``.
        feature_map_weight: Weight of the MSE feature-map loss term.
        ce_loss_weight: Weight of the cross-entropy loss on the true labels.
        device: Device used for both models and every batch.

    Returns:
        The best validation accuracy (percent) seen over all epochs.
    """
    import copy

    ce_loss = nn.CrossEntropyLoss()
    mse_loss = nn.MSELoss()
    optimizer = optim.Adam(student.parameters(), lr=learning_rate)

    teacher.to(device)
    student.to(device)
    teacher.eval()  # Teacher set to evaluation mode

    best_val_accuracy = 0
    # None means "no epoch improved on 0%"; the final weights are kept in that case.
    best_model_state = None

    for epoch in range(epochs):
        running_loss = 0.0
        student.train()  # validate() switches to eval mode, so re-enable training mode each epoch

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            # Drop the middle dimension and turn one-hot rows into class indices.
            labels = labels.squeeze(1).max(dim=1)[1]

            optimizer.zero_grad()

            # Ignore teacher logits, get only the feature map
            with torch.no_grad():
                _, teacher_feature_map = teacher(inputs)

            # Forward pass with the student model
            student_logits, student_feature_map = student(inputs)

            # Calculate the MSE loss for the feature maps
            hidden_rep_loss = mse_loss(student_feature_map, teacher_feature_map)

            # Calculate the Cross-Entropy loss for the actual labels
            label_loss = ce_loss(student_logits, labels)

            # Weighted sum of the two losses
            loss = feature_map_weight * hidden_rep_loss + ce_loss_weight * label_loss

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)
        avg_val_loss, val_accuracy = validate(student, val_loader, device)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # state_dict() returns references to the live parameters and dict.copy() is
            # shallow, so a deep copy is required to really freeze this epoch's weights.
            best_model_state = copy.deepcopy(student.state_dict())

    if best_model_state is not None:
        student.load_state_dict(best_model_state)
    return best_val_accuracy


In [None]:
def validate(model, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` and return ``(avg_loss, accuracy_percent)``.

    Accepts models whose forward pass returns either a tuple/list whose first
    element is the logits (the modified cosine / regressor networks) or the plain
    logits tensor. Unconditionally unpacking ``outputs, _ = model(inputs)`` would
    iterate over the batch dimension of a plain tensor and mis-handle it.

    Args:
        model: Network to evaluate; it is switched to eval mode and left there.
        val_loader: Iterable of ``(inputs, labels)`` batches. Labels are expected
            one-hot encoded with a singleton middle dimension, i.e. ``(batch, 1, classes)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple of the cross-entropy loss averaged over batches and the accuracy in percent.

    Raises:
        ZeroDivisionError: If ``val_loader`` yields no batches.
    """
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for inputs, labels in val_loader:
            # Drop the middle dimension and turn one-hot rows into class indices.
            labels = labels.squeeze(1).max(dim=1)[1]
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            if isinstance(outputs, (tuple, list)):
                # (logits, feature map): only the logits are scored.
                outputs = outputs[0]
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * correct / total
    return avg_val_loss, val_accuracy


In [None]:
import math

# Define the parameter grid
param_grid = {
    'feature_map_weight': [0.25, 0.35, 0.45],
    'ce_loss_weight': [0.75, 0.65, 0.55]
}

# Create a list of all combinations of hyperparameters
param_combinations = list(itertools.product(param_grid['feature_map_weight'], param_grid['ce_loss_weight']))

# Ensure complementary loss weights. Compare with a tolerance: sums of decimal fractions
# are not guaranteed to equal 1.0 exactly in floating point.
param_combinations = [(fmw, clw) for fmw, clw in param_combinations if math.isclose(fmw + clw, 1.0)]

# Load the pretrained teacher model (replace the placeholder with the real checkpoint path).
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime as well.
teacher_model_path = 'path_to_saved_teacher_model.pth'
nn_deep = DeepNN(num_classes=6).to(device)
nn_deep.load_state_dict(torch.load(teacher_model_path, map_location=device))
nn_deep.eval()  # Set the teacher model to evaluation mode

# Load the modified teacher model weights
modified_nn_deep_reg = ModifiedDeepNNRegressor(num_classes=6).to(device)
modified_nn_deep_reg.load_state_dict(nn_deep.state_dict())

# Best hyperparameters initialization
best_val_accuracy = 0
best_params = None

# Loop through all combinations of hyperparameters
for feature_map_weight, ce_loss_weight in param_combinations:
    print(f"Training with feature_map_weight={feature_map_weight}, ce_loss_weight={ce_loss_weight}")

    # Initialize student model (same seed each time, so every run starts from identical weights)
    torch.manual_seed(42)
    student_model = ModifiedLightNNRegressor(num_classes=6).to(device)

    # Train and validate the student model
    val_accuracy = train_mse_loss(
        teacher=modified_nn_deep_reg,
        student=student_model,
        train_loader=train_loader,
        val_loader=val_loader,
        epochs=25,
        learning_rate=0.001,
        feature_map_weight=feature_map_weight,
        ce_loss_weight=ce_loss_weight,
        device=device
    )

    # Track the best performing hyperparameters based on validation accuracy
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_params = {
            'feature_map_weight': feature_map_weight,
            'ce_loss_weight': ce_loss_weight
        }

print(f"Best Hyperparameters: {best_params}")
print(f"Best Validation Accuracy: {best_val_accuracy:.2f}%")
