# Assignment 3
Enhance the efficiency and performance of the image classification CNN model from assignment 2 using Active Learning Stratergies

# First: Using Neural Networks
This is a neural network with two convulational layers with relu activation and SGD optimizer.<br>
We also calculate least confidence, prediction entropy, margin sampling, cosine similarity, l2 norm and kl divergence

In [33]:
# Importing necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from sklearn.metrics import pairwise_distances
import numpy as np

# Define the transform for MNIST dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))  # For grayscale images
])

# Download and load the MNIST dataset
trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=2)

testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False, num_workers=2)

# Define the classes for MNIST dataset (digits 0-9)
classes = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')

# Define the neural network architecture for MNIST
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 6, 5)  # Input channels: 1 (grayscale), Output channels: 6, Kernel size: 5x5
        self.conv2 = nn.Conv2d(6, 16, 5)  # Input channels: 6, Output channels: 16, Kernel size: 5x5
        # Fully connected layers
        # Adjust input size after convolution (28x28 -> 12x12 -> 4x4)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)  # Adjusted for MNIST image size
        self.fc2 = nn.Linear(120, 84)  # Input features: 120, Output features: 84
        self.fc3 = nn.Linear(84, 10)  # Output features: 10 (digits 0-9)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), 2, 2)  # First convolution and max pooling
        x = F.max_pool2d(F.relu(self.conv2(x)), 2, 2)  # Second convolution and max pooling
        x = x.view(-1, 16 * 4 * 4)  # Flatten the output before passing to fully connected layers
        x = F.relu(self.fc1(x))  # First fully connected layer
        x = F.relu(self.fc2(x))  # Second fully connected layer
        x = self.fc3(x)  # Final output layer
        return x

# Instantiate the network
net = Net()

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Loss function for multi-class classification
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)  # SGD optimizer with momentum
epochs = 10  # Number of epochs


# Creating a function to calculate uncertainty for each epoch metrics
def calculate_uncertainty_metrics(outputs):
    # Convert NumPy array to PyTorch tensor
    outputs_tensor = torch.from_numpy(outputs)
    # Applying softmax along dimension 1
    probabilities = F.softmax(outputs_tensor, dim=1)
    # Least Confidence: 1 - Maximum probability for each sample
    least_confidence = 1 - probabilities.max(dim=1).values.cpu().detach().numpy()
    # Handling NaN in prediction entropy
    current_probs = probabilities.clone().detach()
    current_probs[current_probs == 0] = 1e-10  # Adding a small epsilon to avoid log(0)
    # Prediction Entropy: Negative sum of (probability * log(probability)) for each class
    prediction_entropy = -torch.sum(current_probs * torch.log(current_probs), dim=1).cpu().detach().numpy()
    # Margin Sampling: 1 - (Maximum probability - Minimum probability) for each sample
    margin_sampling = 1 - torch.max(probabilities, dim=1).values.cpu().detach().numpy() - \
                      torch.min(probabilities, dim=1).values.cpu().detach().numpy()
    # Returning least confidence,prediction entropy and margin sampling values obtained
    return least_confidence, prediction_entropy, margin_sampling

# Creating a function to caalculate diversiy metrics
def calculate_diversity_metrics(features, m=5):
    # Calculate pairwise distances using cosine similarity
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    # Cosine Similarity: 1 - Mean cosine similarity with the top m neighbors for each sample
    cosine_similarity = 1 - feature_distances[:, 1:m+1].mean(axis=1)
    # Calculate pairwise distances using L2 (Euclidean) norm
    l2_distances = pairwise_distances(features.cpu().detach().numpy(), metric='euclidean')
    # L2 Norm: Mean L2 norm with the top m neighbors for each sample
    l2_norm = l2_distances[:, 1:m+1].mean(axis=1)
    # Returning cosine similarity and l2 norm values obtained
    return cosine_similarity, l2_norm

# Creating a functiom to calculate kl divergence
def calculate_kl_divergence(outputs, feature_distances, m=5):
    # List to store KL divergence scores for each sample
    kl_divergence = []
    # Iterate over each sample in the outputs
    for i in range(len(outputs)):
        # Calculate the probability distribution of the current sample
        current_sample_prob = F.softmax(outputs[i], dim=0)
        # Get the indices of the top m neighbors for the current sample
        neighbor_indices = feature_distances[i, 1:m+1].astype(int)
        # Calculate the average probability distribution of the neighbors
        neighbors_prob = torch.mean(F.softmax(outputs[neighbor_indices], dim=1), dim=0)
        # Calculate KL divergence between the current sample and its neighbors
        # Include 'reduction' argument inside F.kl_div
        kl_divergence.append(F.kl_div(torch.log(current_sample_prob), neighbors_prob, reduction='batchmean'))

    # Returning kl divergence values obtained
    return kl_divergence

# Creating a Function to calculate uncertainty and diversity metrics
def calculate_metrics(outputs, features, m=5):
    # Calculate uncertainty metrics
    least_confidence, prediction_entropy, margin_sampling = calculate_uncertainty_metrics(outputs.detach().numpy())
    # Extend lists with uncertainty metrics
    least_confidence_list.extend(torch.from_numpy(least_confidence))
    prediction_entropy_list.extend(torch.from_numpy(prediction_entropy))
    margin_sampling_list.extend(torch.from_numpy(margin_sampling))
    # Calculate diversity metrics
    features_normalized = F.normalize(features, p=2, dim=1)
    cosine_similarity, l2_norm = calculate_diversity_metrics(features_normalized)
    # cosine_similarity, l2_norm = calculate_diversity_metrics(features)
    # Extend lists with diversity metrics
    cosine_similarity_list.extend(torch.from_numpy(cosine_similarity))
    l2_norm_list.extend(torch.from_numpy(l2_norm))
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    # Calculate KL divergence scores
    kl_divergence_scores = calculate_kl_divergence(outputs, feature_distances, m=5)
    # Extend the list with KL divergence scores
    kl_divergence_list.extend(kl_divergence_scores)

# Define lists for uncertainty and diversity metrics
least_confidence_list = []
prediction_entropy_list = []
margin_sampling_list = []
cosine_similarity_list = []
l2_norm_list = []
kl_divergence_list = []

# Training loop
for epoch in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(trainloader)}")

# # Testing the model
# correct = 0
# total = 0

# # Use torch.no_grad() to disable gradient computation during testing
# with torch.no_grad():
#     for data in testloader:
#         images, labels = data
#         outputs = net(images)
#         _, predicted = torch.max(outputs.data, 1)
#         total += labels.size(0)
#         correct += (predicted == labels).sum().item()

#         # Extract features using the first convolutional layer
#         features = net.conv1(images)
#         features = F.max_pool2d(F.relu(features), 2, 2)
#         features = net.conv2(features)
#         features = F.max_pool2d(F.relu(features), 2, 2)
#         features = features.view(features.size(0), -1)

#         # Here, you should calculate uncertainty and diversity metrics as needed
#         # calculate_metrics(outputs, features)

# accuracy = 100 * correct / total
# print(f"Accuracy on the test set: {accuracy:.2f}%")


# Testing the model
correct = 0  # Initialize the number of correctly predicted samples
total = 0  # Initialize the total number of samples

# Initialize lists to store uncertainty and diversity measures
least_confidence_list = [] # Initialize lists to store least confidence measures
prediction_entropy_list = [] # Initialize lists to store prediction entropy measures
margin_sampling_list = [] # Initialize lists to store margin sampling measures
cosine_similarity_list = [] # Initialize lists to store cosine similarity measures
l2_norm_list = [] # Initialize lists to store L2 norm estimates
kl_divergence_list = [] # Initialize lists to store KL divergence estimates

# Use torch.no_grad() to disable gradient computation during testing
with torch.no_grad():
    for data in testloader:  # Iterate over batches in the test loader
        images, labels = data  # Get inputs and true labels for the current batch
        outputs = net(images)  # Forward pass to compute predicted outputs
        _, predicted = torch.max(outputs.data, 1)  # Get the index of the maximum predicted value
        total += labels.size(0)  # Increment the total number of samples by the batch size
        correct += (predicted == labels).sum().item()  # Count the number of correctly predicted samples

        # Extract features using the first convolutional layer
        features = net.conv1(images)
        # Apply max pooling and ReLU activation
        features = F.max_pool2d(F.relu(features), 2, 2)
        # Extract features using the second convolutional layer
        features = net.conv2(features)
        # Apply max pooling and ReLU activation
        features = F.max_pool2d(F.relu(features), 2, 2)
        # Flatten the features to be used in fully connected layers
        features = features.view(features.size(0), -1)
        # Calculate metrics using the extracted features and model outputs
        calculate_metrics(outputs, features)

accuracy = 100 * correct / total # Calculate accuracy
print(f"Accuracy on the test set: {accuracy:.2f}%") # Print the accuracy on test set
# Print the average values of uncertainty and diversity measures
print(f"Average Least Confidence: {torch.mean(torch.stack(least_confidence_list))}") # Print the average value of least confidence
print(f"Average Prediction Entropy: {torch.mean(torch.stack(prediction_entropy_list))}") # Print the average value of prediction entropy
print(f"Average Margin Sampling: {torch.mean(torch.stack(margin_sampling_list))}") # Print the average value of margin sampling
print(f"Average Cosine Similarity: {torch.mean(torch.stack(cosine_similarity_list))}") # Print the average value of cosine similarity
print(f"Average L2 Norm: {torch.mean(torch.stack(l2_norm_list))}") # Print the average value of L2 norm
print(f"Average KL Divergence: {torch.mean(torch.stack(kl_divergence_list))}") # Print the average value of KL divergence


Epoch 1/10, Loss: 0.2380423673719819
Epoch 2/10, Loss: 0.05572305065324529
Epoch 3/10, Loss: 0.040640152702296796
Epoch 4/10, Loss: 0.03136932350921372
Epoch 5/10, Loss: 0.025474647406290227
Epoch 6/10, Loss: 0.02150734419139197
Epoch 7/10, Loss: 0.018364312424345882
Epoch 8/10, Loss: 0.015374035355039754
Epoch 9/10, Loss: 0.013457818132385703
Epoch 10/10, Loss: 0.012166778185696154
Accuracy on the test set: 99.23%
Average Least Confidence: 0.004795989952981472
Average Prediction Entropy: 0.01333171222358942
Average Margin Sampling: 0.0047930460423231125
Average Cosine Similarity: 0.5263910889625549
Average L2 Norm: 0.8339491486549377
Average KL Divergence: 1.5235717296600342


# Second: Using CNN Architecture
(same as used in assignment 2)
This is a CNN model with two convulational layers with Adam optimizer and maxpool and using cuda for encorporating GPU for the computation.<br>
We also calculate least confidence, prediction entropy, margin sampling, cosine similarity, l2 norm and kl divergence

In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torch.nn.functional as F
from sklearn.metrics import pairwise_distances
from scipy.stats import entropy

# Define the CNN architecture by creating a class that inherits from nn.Module
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()

        # Adjusted the convolutional layers for MNIST (grayscale 1 channel, 28x28 image size)
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)  # Input: 1 channel (grayscale), Output: 32 filters
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(2)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)  # Input: 32 channels, Output: 64 filters

        self.flatten = nn.Flatten()

        # Fully connected layers (adjusted based on output of conv layers)
        self.fc1 = nn.Linear(64 * 7 * 7, 512)  # Adjust input size based on feature map size
        self.fc2 = nn.Linear(512, 10)  # Output: 10 neurons (for 10 classes in MNIST)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.conv2(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.flatten(x)

        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)

        return x

# Set the device for computations (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define hyperparameters
batch_size = 32
learning_rate = 0.001
epochs = 10
m = 5

# Define data transformations for preprocessing
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))  # Adjusted for MNIST (1 channel)
])

# Download and load the MNIST dataset
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize the neural network model, loss function, and optimizer
model = SimpleCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Define the uncertainty and diversity metrics functions (unchanged)

# Train the model
for epoch in range(epochs):
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")


# Evaluate the model on the test set
model.eval()
correct = 0
total = 0

# Creating a Function to calculate uncertainty and diversity metrics
def calculate_metrics(outputs, features, m=5):
    # Calculate uncertainty metrics
    least_confidence, prediction_entropy, margin_sampling = calculate_uncertainty_metrics(outputs.detach().numpy())
    # Extend lists with uncertainty metrics
    least_confidence_list.extend(torch.from_numpy(least_confidence))
    prediction_entropy_list.extend(torch.from_numpy(prediction_entropy))
    margin_sampling_list.extend(torch.from_numpy(margin_sampling))
    # Calculate diversity metrics
    features_normalized = F.normalize(features, p=2, dim=1)
    cosine_similarity, l2_norm = calculate_diversity_metrics(features_normalized)
    # cosine_similarity, l2_norm = calculate_diversity_metrics(features)
    # Extend lists with diversity metrics
    cosine_similarity_list.extend(torch.from_numpy(cosine_similarity))
    l2_norm_list.extend(torch.from_numpy(l2_norm))
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    # Calculate KL divergence scores
    kl_divergence_scores = calculate_kl_divergence(outputs, feature_distances, m=5)
    # Extend the list with KL divergence scores
    kl_divergence_list.extend(kl_divergence_scores)


def calculate_metrics(outputs, features, m=5):
    # Ensure the tensor is on the CPU before converting to NumPy
    outputs_cpu = outputs.detach().cpu()  # Move tensor to CPU and detach it from computation graph

    # Now call the uncertainty metrics function with the CPU tensor
    least_confidence, prediction_entropy, margin_sampling = calculate_uncertainty_metrics(outputs_cpu)

    # Extend lists with uncertainty metrics
    least_confidence_list.extend(torch.from_numpy(least_confidence))
    prediction_entropy_list.extend(torch.from_numpy(prediction_entropy))
    margin_sampling_list.extend(torch.from_numpy(margin_sampling))

    # Calculate diversity metrics
    features_normalized = F.normalize(features, p=2, dim=1)
    cosine_similarity, l2_norm = calculate_diversity_metrics(features_normalized)

    # Extend the list with diversity metrics
    cosine_similarity_list.extend(torch.from_numpy(cosine_similarity))
    l2_norm_list.extend(torch.from_numpy(l2_norm))

    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')

    # Calculate KL divergence scores
    kl_divergence_scores = calculate_kl_divergence(outputs_cpu, feature_distances, m=5)

    # Extend the list with KL divergence scores
    kl_divergence_list.extend(kl_divergence_scores)


def calculate_uncertainty_metrics(outputs):
    # outputs is already a tensor, no need to convert
    # Applying softmax along dimension 1
    probabilities = F.softmax(outputs, dim=1)
    # Least Confidence: 1 - Maximum probability for each sample
    least_confidence = 1 - probabilities.max(dim=1).values.cpu().detach().numpy()
    # Handling NaN in prediction entropy
    current_probs = probabilities.clone().detach()
    current_probs[current_probs == 0] = 1e-10  # Adding a small epsilon to avoid log(0)
    ## Uses the formula for entropy: H(p)=−∑p(x)log(p(x).
    ## Multiplies each probability by its logarithm, sums these values for each sample, and negates the result.
    ## Entropy measures the "spread" or uncertainty of the predicted probabilities. Higher entropy indicates greater uncertainty.
    prediction_entropy = -torch.sum(current_probs * torch.log(current_probs), dim=1).cpu().detach().numpy()
    ## Calculates the difference between the maximum and minimum predicted probabilities for each sample.
    ## Subtracts this difference from 1 to obtain the margin sampling score.
    ## A smaller margin (closer to 1) indicates higher uncertainty, as the model is less confident in discriminating between classes.
    # Margin Sampling: 1 - (Maximum probability - Minimum probability) for each sample
    margin_sampling = 1 - (torch.max(probabilities, dim=1).values.cpu().detach().numpy() - \
                           torch.min(probabilities, dim=1).values.cpu().detach().numpy())
    ## Returns three metrics:
    ## least_confidence: A measure of how uncertain the model is about its most likely class.
    ## prediction_entropy: The overall uncertainty in the prediction distribution.
    ## margin_sampling: A measure of the closeness of the most likely predictions.
    return least_confidence, prediction_entropy, margin_sampling

# Summary of Logic:
# Converts raw model outputs (logits) into probabilities.Calculates three distinct uncertainty metrics to
# analyze the model's predictions from different perspectives:
# Least Confidence: Focuses on the top prediction.
# Entropy: Considers the overall distribution.
# Margin Sampling: Examines the spread between top and bottom probabilities.
# These metrics can be used to select samples with high uncertainty for further inspection or active learning.

# Use Case: This function is particularly useful in:
# Active Learning: Selecting uncertain samples to label for model improvement.
# Model Evaluation: Analyzing prediction confidence and uncertainty for robust decision-making.
# Anomaly Detection: Identifying outliers based on uncertainty metrics.


# Define a function to calculate diversity metrics
# The function calculates two diversity metrics for a set of feature vectors
# Cosine Similarity: Measures the angular similarity between a feature vector and its neighbors.
# L2 Norm (Euclidean Distance): Measures the average Euclidean distance between a feature vector and its neighbors.

# Input Arguments:
# features: A tensor containing feature vectors for all samples, typically obtained from a model's intermediate layer.
# m: The number of nearest neighbors to consider for diversity metrics (default value is 5).
def calculate_diversity_metrics(features, m=5):

    ## Converts the features tensor into a NumPy array for computation.
    ## Uses the pairwise_distances function to calculate cosine distances between each pair of feature vectors.
    ## Cosine Distance = 1 - Cosine Similarity.
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')  # Cosine distance

    ## feature_distances[:, 1:m+1] selects the distances to the nearest m neighbors (ignoring the distance to the sample itself, which is at index 0).
    ## Takes the mean of these distances for each feature vector.
    ## Since cosine similarity is the complement of cosine distance, it computes the similarity as 1 - distance.
    cosine_similarity = 1 - feature_distances[:, 1:m+1].mean(axis=1)  # Cosine similarity

    l2_distances = pairwise_distances(features.cpu().detach().numpy(), metric='euclidean')  # Computes the Euclidean (L2) distances between each pair of feature vectors

    ## Selects the distances to the nearest m neighbors (excluding the sample itself).
    ## Computes the mean of these distances for each feature vector.
    l2_norm = l2_distances[:, 1:m+1].mean(axis=1)  # L2 norm

    ## Returns two metrics for each feature vector:
    ## cosine_similarity: The average cosine similarity to its m nearest neighbors.
    ## l2_norm: The average L2 distance to its m nearest neighbors.
    return cosine_similarity, l2_norm

## Summary of Logic:
## For each feature vector:
## Find the distances to its m nearest neighbors based on cosine and Euclidean distance metrics.
## Compute the average cosine similarity and L2 distance (norm).
## The results indicate how diverse the feature vectors are:
## High cosine similarity suggests that the feature vectors are closely aligned (less diverse).
## Low L2 norm indicates that the feature vectors are tightly clustered in feature space.

## Use Case:
## This function can be used to analyze the diversity of feature vectors, understand cluster compactness,
## and evaluate how well-separated or grouped the features are in the latent space of a model.

# Define a function to calculate KL divergence
## Input Arguments:
## outputs: A tensor containing the model's raw output logits for all samples.
## feature_distances: A matrix of pairwise distances between feature vectors for all samples, where feature_distances[i, j] represents
## the distance between sample i and j.
## m: The number of nearest neighbors to consider for each sample (default is 5).
def calculate_kl_divergence(outputs, feature_distances, m=5):
    kl_divergence = []  # List to store KL divergence values for each sample.
    for i in range(len(outputs)):    ## The loop iterates through each sample i in the dataset.
        ## The outputs[i] contains the raw logits for sample i. The softmax function converts logits into a probability
        ## distribution  P(i) over the classes.
        current_sample_prob = F.softmax(outputs[i], dim=0)  # Softmax for the current sample

        ## The row feature_distances[i] contains distances of sample i from all other samples.
        ## feature_distances[i, 1:m+1] retrieves the indices of the nearest m neighbors (excluding itself, which is at index 0).
        neighbor_indices = feature_distances[i, 1:m+1].astype(int)  # Indices of nearest neighbors

        ## outputs[neighbor_indices] retrieves the logits of the m nearest neighbors of sample i.
        ## Softmax is applied along the second dimension (dim=1) to convert logits into probabilities for all neighbors.
        ## The mean is computed across neighbors to obtain the average probability distribution Q(i).
        neighbors_prob = torch.mean(F.softmax(outputs[neighbor_indices], dim=1), dim=0)  # Average neighbor probabilities

        ## Small values (1e−10) are added to both probability distributions to avoid issues with zero probabilities during the logarithm computation.
        epsilon = 1e-10
        current_sample_prob += epsilon
        neighbors_prob += epsilon

        ## The logarithm of the current sample's probability distribution P(i) is computed using torch.log(current_sample_prob).
        ## F.kl_div calculates the KL divergence
        ## reduction='batchmean' ensures the result is averaged across all classes.

        kl_divergence.append(F.kl_div(torch.log(current_sample_prob), neighbors_prob, reduction='batchmean'))  # KL Divergence
    return kl_divergence

## Summary of Logic:
## For each sample: Compute its probability distribution using softmax. Find the indices of its m nearest neighbors based
## on feature distances. Compute the average probability distribution of the neighbors. Calculate the KL divergence between the sample's distribution
## and the average distribution of its neighbors. The output is a list of KL divergence values, where each value corresponds to a sample in the dataset.
## Use Case:
## Outlier Detection: Samples with high KL divergence have significantly different predictions compared to their neighbors, suggesting potential outliers or inconsistent predictions.
## Model Uncertainty: High KL divergence indicates regions in the feature space where the model may be less reliable.

least_confidence_list = []
prediction_entropy_list = []
margin_sampling_list = []
cosine_similarity_list = []
l2_norm_list = []
kl_divergence_list = []

with torch.no_grad():
    for images, labels in test_loader:

        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Extract features from the convolutional layers
        conv1_output = model.conv1(images)
        relu_output = model.relu(conv1_output)
        maxpool_output = model.maxpool(relu_output)
        conv2_output = model.conv2(maxpool_output)
        features = model.flatten(conv2_output)
        features = features.view(features.size(0), -1)

        # Calculate metrics (metrics functions not modified)
        calculate_metrics(outputs, features)

# Calculate accuracy
accuracy = correct / total
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Print the average values of uncertainty and diversity measures
print(f"Average Least Confidence: {torch.mean(torch.stack(least_confidence_list))}")
print(f"Average Prediction Entropy: {torch.mean(torch.stack(prediction_entropy_list))}")
print(f"Average Margin Sampling: {torch.mean(torch.stack(margin_sampling_list))}")
print(f"Average Cosine Similarity: {torch.mean(torch.stack(cosine_similarity_list))}")
print(f"Average L2 Norm: {torch.mean(torch.stack(l2_norm_list))}")
print(f"Average KL Divergence: {torch.mean(torch.stack(kl_divergence_list))}")


Epoch 1/10, Loss: 0.013133395463228226
Epoch 2/10, Loss: 0.11068765074014664
Epoch 3/10, Loss: 0.011432253755629063
Epoch 4/10, Loss: 0.010805139318108559
Epoch 5/10, Loss: 0.0041971271857619286
Epoch 6/10, Loss: 0.026356974616646767
Epoch 7/10, Loss: 0.005212348885834217
Epoch 8/10, Loss: 2.0116024188610027e-06
Epoch 9/10, Loss: 0.00017639149155002087
Epoch 10/10, Loss: 6.183942673487763e-07
Test Accuracy: 98.80%
Average Least Confidence: 0.0040636989288032055
Average Prediction Entropy: 0.010589069686830044
Average Margin Sampling: 0.0040637110359966755
Average Cosine Similarity: 0.7735907435417175
Average L2 Norm: 0.6562566161155701
Average KL Divergence: 1.9077818393707275


# Third: Using a Pretrained Model - RESNET18
(using this for better accuracy)
This is a CNN model incorporating the pretrained model RESNET18, it has one convolutional layer and one fully connected layer<br>
RESNET18 takes input in three channels so we have normalized the input for one channel in grayscale.<br>
We also calculate least confidence, prediction entropy, margin sampling, cosine similarity, l2 norm and kl divergence

In [13]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torch.nn.functional as F
from sklearn.metrics import pairwise_distances
from scipy.stats import entropy
import torchvision.models as models  # Import pretrained models

# Define the pretrained ResNet18 architecture
class ResNet18ForMNIST(nn.Module):
    def __init__(self, num_classes=10):
        super(ResNet18ForMNIST, self).__init__()
        self.resnet18 = models.resnet18(pretrained=True)
        self.resnet18.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        self.resnet18.fc = nn.Linear(self.resnet18.fc.in_features, num_classes)

    def forward(self, x):
        # Extract features from layer4
        x = self.resnet18.conv1(x)
        x = self.resnet18.bn1(x)
        x = self.resnet18.relu(x)
        x = self.resnet18.maxpool(x)

        x = self.resnet18.layer1(x)
        x = self.resnet18.layer2(x)
        x = self.resnet18.layer3(x)
        features = self.resnet18.layer4(x)  # Extract features here

        x = self.resnet18.avgpool(features)
        x = torch.flatten(x, 1)
        output = self.resnet18.fc(x)  # Final output
        return output, features


# Set the device for computations (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define hyperparameters
batch_size = 32
learning_rate = 0.001
epochs = 10
m = 5

# Define data transformations for preprocessing
transform = transforms.Compose([
    transforms.ToTensor(), # Remove the transforms.Grayscale to keep the images as 1-channel grayscale
    transforms.Normalize((0.5,), (0.5,)) # Normalization for 1-channel images
])

# Download and load the MNIST dataset
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize the neural network model, loss function, and optimizer
model = ResNet18ForMNIST().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Define the uncertainty and diversity metrics functions (unchanged)

# Train the model
for epoch in range(epochs):
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        
        # Unpack output and features
        outputs, features = model(images)  # `features` is unused during training
        
        # Compute loss only using `outputs`
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        torch.cuda.empty_cache()
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")


# Evaluate the model on the test set
model.eval()
correct = 0
total = 0

# Creating a function to calculate uncertainty for each epoch metrics
def calculate_uncertainty_metrics(outputs):
    # Convert NumPy array to PyTorch tensor
    outputs_tensor = torch.from_numpy(outputs)
    # Applying softmax along dimension 1
    probabilities = F.softmax(outputs_tensor, dim=1)
    # Least Confidence: 1 - Maximum probability for each sample
    least_confidence = 1 - probabilities.max(dim=1).values.cpu().detach().numpy()
    # Handling NaN in prediction entropy
    current_probs = probabilities.clone().detach()
    current_probs[current_probs == 0] = 1e-10  # Adding a small epsilon to avoid log(0)
    # Prediction Entropy: Negative sum of (probability * log(probability)) for each class
    prediction_entropy = -torch.sum(current_probs * torch.log(current_probs), dim=1).cpu().detach().numpy()
    # Margin Sampling: 1 - (Maximum probability - Minimum probability) for each sample
    margin_sampling = 1 - torch.max(probabilities, dim=1).values.cpu().detach().numpy() - \
                      torch.min(probabilities, dim=1).values.cpu().detach().numpy()
    # Returning least confidence,prediction entropy and margin sampling values obtained
    return least_confidence, prediction_entropy, margin_sampling

# Creating a function to caalculate diversiy metrics
def calculate_diversity_metrics(features, m=5):
    # Calculate pairwise distances using cosine similarity
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    # Cosine Similarity: 1 - Mean cosine similarity with the top m neighbors for each sample
    cosine_similarity = 1 - feature_distances[:, 1:m+1].mean(axis=1)
    # Calculate pairwise distances using L2 (Euclidean) norm
    l2_distances = pairwise_distances(features.cpu().detach().numpy(), metric='euclidean')
    # L2 Norm: Mean L2 norm with the top m neighbors for each sample
    l2_norm = l2_distances[:, 1:m+1].mean(axis=1)
    # Returning cosine similarity and l2 norm values obtained
    return cosine_similarity, l2_norm

# Creating a functiom to calculate kl divergence
def calculate_kl_divergence(outputs, feature_distances, m=5):
    # List to store KL divergence scores for each sample
    kl_divergence = []

    # Add a small epsilon to avoid log(0) and ensure numerical stability
    epsilon = 1e-10

    # Iterate over each sample in the outputs
    for i in range(len(outputs)):
        # Calculate the probability distribution of the current sample (softmax)
        current_sample_prob = F.softmax(outputs[i], dim=0)
        
        # Add epsilon to avoid log(0) when computing log-probabilities
        current_sample_log_prob = torch.log(current_sample_prob + epsilon)

        # Get the indices of the top m neighbors for the current sample from the distance matrix
        neighbor_indices = feature_distances[i, 1:m+1].astype(int)  # Exclude the first index, which is the sample itself
        
        # Calculate the average probability distribution of the neighbors
        neighbors_prob = torch.mean(F.softmax(outputs[neighbor_indices], dim=1), dim=0)

        # Add epsilon to the neighbor probabilities to avoid log(0) in KL divergence
        neighbors_prob = neighbors_prob + epsilon

        # Calculate KL divergence between the current sample and its neighbors
        kl_score = F.kl_div(current_sample_log_prob, neighbors_prob, reduction='batchmean')

        # Store the KL divergence for this sample as a Tensor
        kl_divergence.append(torch.tensor(kl_score.item()))  # Convert to tensor before appending

    # Return the KL divergence values obtained as Tensors
    return kl_divergence




# Creating a Function to calculate uncertainty and diversity metrics
def calculate_metrics(outputs, features, m=5):
    # Calculate uncertainty metrics
    least_confidence, prediction_entropy, margin_sampling = calculate_uncertainty_metrics(outputs.cpu().detach().numpy())
    # Extend lists with uncertainty metrics
    least_confidence_list.extend(torch.from_numpy(least_confidence))
    prediction_entropy_list.extend(torch.from_numpy(prediction_entropy))
    margin_sampling_list.extend(torch.from_numpy(margin_sampling))
    # Calculate diversity metrics
    features_normalized = F.normalize(features, p=2, dim=1)
    cosine_similarity, l2_norm = calculate_diversity_metrics(features_normalized)
    # cosine_similarity, l2_norm = calculate_diversity_metrics(features)
    # Extend lists with diversity metrics
    cosine_similarity_list.extend(torch.from_numpy(cosine_similarity))
    l2_norm_list.extend(torch.from_numpy(l2_norm))
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    # Calculate KL divergence scores
    kl_divergence_scores = calculate_kl_divergence(outputs, feature_distances, m=5)
    # Extend the list with KL divergence scores
    kl_divergence_list.extend(kl_divergence_scores)


least_confidence_list = []
prediction_entropy_list = []
margin_sampling_list = []
cosine_similarity_list = []
l2_norm_list = []
kl_divergence_list = []

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs, features = model(images)  # Get both output and features
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Flatten features for further processing
        features = features.view(features.size(0), -1)

        # Calculate metrics
        calculate_metrics(outputs, features)


# Calculate accuracy
accuracy = correct / total
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Print the average values of uncertainty and diversity measures
print(f"Average Least Confidence: {torch.mean(torch.stack(least_confidence_list))}")
print(f"Average Prediction Entropy: {torch.mean(torch.stack(prediction_entropy_list))}")
print(f"Average Margin Sampling: {torch.mean(torch.stack(margin_sampling_list))}")
print(f"Average Cosine Similarity: {torch.mean(torch.stack(cosine_similarity_list))}")
print(f"Average L2 Norm: {torch.mean(torch.stack(l2_norm_list))}")
print(f"Average KL Divergence: {torch.mean(torch.stack(kl_divergence_list))}")




Epoch 1/10, Loss: 0.12671785056591034
Epoch 2/10, Loss: 0.005561455152928829
Epoch 3/10, Loss: 0.06107950210571289
Epoch 4/10, Loss: 0.12433136999607086
Epoch 5/10, Loss: 0.0018317534122616053
Epoch 6/10, Loss: 0.03611539304256439
Epoch 7/10, Loss: 0.003682696493342519
Epoch 8/10, Loss: 0.0002998908457811922
Epoch 9/10, Loss: 0.009061828255653381
Epoch 10/10, Loss: 0.0004075284523423761
Test Accuracy: 99.31%
Average Least Confidence: 0.005538662895560265
Average Prediction Entropy: 0.01951856166124344
Average Margin Sampling: 0.005533408373594284
Average Cosine Similarity: 0.3630516231060028
Average L2 Norm: 1.075036883354187
Average KL Divergence: 1.4472111463546753


# Fourth: Using a Pretrained model LeNet5
This model was made for MNIST dataset and we're using this for getting better accuracy<br>
This has two convolutional layers and two pooling layers and two fully connected layers<br>
We also calculate least confidence, prediction entropy, margin sampling, cosine similarity, l2 norm and kl divergence

In [32]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torch.nn.functional as F
from sklearn.metrics import pairwise_distances
import numpy as np

# Define the LeNet-5 architecture
# Define the LeNet-5 architecture with feature extraction
class LeNet5WithFeatures(nn.Module):
    def __init__(self, num_classes=10):
        super(LeNet5WithFeatures, self).__init__()
        # Define the layers
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0)  # First convolutional layer
        self.pool1 = nn.AvgPool2d(kernel_size=2, stride=2)  # First pooling layer
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0)  # Second convolutional layer
        self.pool2 = nn.AvgPool2d(kernel_size=2, stride=2)  # Second pooling layer
        self.fc1 = nn.Linear(16 * 4 * 4, 120)  # First fully connected layer
        self.fc2 = nn.Linear(120, 84)  # Second fully connected layer
        self.fc3 = nn.Linear(84, num_classes)  # Output layer

    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))  # Apply conv1 -> relu -> pool1
        x = self.pool2(F.relu(self.conv2(x)))  # Apply conv2 -> relu -> pool2
        features = x.view(-1, 16 * 4 * 4)  # Flatten the features
        x = F.relu(self.fc1(features))  # Apply fc1 -> relu
        x = F.relu(self.fc2(x))  # Apply fc2 -> relu
        output = self.fc3(x)  # Output layer
        return output, features  # Return both output and features


# Set the device for computations (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define hyperparameters
batch_size = 32
learning_rate = 0.001
epochs = 10

# Define data transformations for preprocessing
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert images to PyTorch tensors
    transforms.Normalize((0.5,), (0.5,))  # Normalize for 1-channel grayscale images
])

# Download and load the MNIST dataset
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize the neural network model, loss function, and optimizer
model = LeNet5WithFeatures().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training the model
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        
        # Forward pass
        outputs, _ = model(images)  # Get both outputs and features
        loss = criterion(outputs, labels)  # Use only the logits for loss computation
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Print average loss after each epoch
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader)}")

# Evalu# Creating a function to calculate uncertainty for each epoch metrics
def calculate_uncertainty_metrics(outputs):
    # Convert NumPy array to PyTorch tensor
    outputs_tensor = torch.from_numpy(outputs)
    # Applying softmax along dimension 1
    probabilities = F.softmax(outputs_tensor, dim=1)
    # Least Confidence: 1 - Maximum probability for each sample
    least_confidence = 1 - probabilities.max(dim=1).values.cpu().detach().numpy()
    # Handling NaN in prediction entropy
    current_probs = probabilities.clone().detach()
    current_probs[current_probs == 0] = 1e-10  # Adding a small epsilon to avoid log(0)
    # Prediction Entropy: Negative sum of (probability * log(probability)) for each class
    prediction_entropy = -torch.sum(current_probs * torch.log(current_probs), dim=1).cpu().detach().numpy()
    # Margin Sampling: 1 - (Maximum probability - Minimum probability) for each sample
    margin_sampling = 1 - torch.max(probabilities, dim=1).values.cpu().detach().numpy() - \
                      torch.min(probabilities, dim=1).values.cpu().detach().numpy()
    # Returning least confidence,prediction entropy and margin sampling values obtained
    return least_confidence, prediction_entropy, margin_sampling

# Creating a function to caalculate diversiy metrics
def calculate_diversity_metrics(features, m=5):
    # Calculate pairwise distances using cosine similarity
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    # Cosine Similarity: 1 - Mean cosine similarity with the top m neighbors for each sample
    cosine_similarity = 1 - feature_distances[:, 1:m+1].mean(axis=1)
    # Calculate pairwise distances using L2 (Euclidean) norm
    l2_distances = pairwise_distances(features.cpu().detach().numpy(), metric='euclidean')
    # L2 Norm: Mean L2 norm with the top m neighbors for each sample
    l2_norm = l2_distances[:, 1:m+1].mean(axis=1)
    # Returning cosine similarity and l2 norm values obtained
    return cosine_similarity, l2_norm

# # Creating a functiom to calculate kl divergence
def calculate_kl_divergence(outputs, feature_distances, m=5):
    # List to store KL divergence scores for each sample
    kl_divergence = []

    # Add a small epsilon to avoid log(0) and ensure numerical stability
    epsilon = 1e-10

    # Iterate over each sample in the outputs
    for i in range(len(outputs)):
        # Calculate the probability distribution of the current sample (softmax)
        current_sample_prob = F.softmax(outputs[i], dim=0)
        
        # Add epsilon to avoid log(0) when computing log-probabilities
        current_sample_log_prob = torch.log(current_sample_prob + epsilon)

        # Get the indices of the top m neighbors for the current sample from the distance matrix
        neighbor_indices = feature_distances[i, 1:m+1].astype(int)  # Exclude the first index, which is the sample itself
        
        # Calculate the average probability distribution of the neighbors
        neighbors_prob = torch.mean(F.softmax(outputs[neighbor_indices], dim=1), dim=0)

        # Add epsilon to the neighbor probabilities to avoid log(0) in KL divergence
        neighbors_prob = neighbors_prob + epsilon

        # Calculate KL divergence between the current sample and its neighbors
        kl_score = F.kl_div(current_sample_log_prob, neighbors_prob, reduction='batchmean')

        # Store the KL divergence for this sample as a Tensor
        kl_divergence.append(torch.tensor(kl_score.item()))  # Convert to tensor before appending

    # Return the KL divergence values obtained as Tensors
    return kl_divergence




# Creating a Function to calculate uncertainty and diversity metrics
def calculate_metrics(outputs, features, m=5):
    # Calculate uncertainty metrics
    least_confidence, prediction_entropy, margin_sampling = calculate_uncertainty_metrics(outputs.cpu().detach().numpy())
    # Extend lists with uncertainty metrics
    least_confidence_list.extend(torch.from_numpy(least_confidence))
    prediction_entropy_list.extend(torch.from_numpy(prediction_entropy))
    margin_sampling_list.extend(torch.from_numpy(margin_sampling))
    # Calculate diversity metrics
    features_normalized = F.normalize(features, p=2, dim=1)
    cosine_similarity, l2_norm = calculate_diversity_metrics(features_normalized)
    # cosine_similarity, l2_norm = calculate_diversity_metrics(features)
    # Extend lists with diversity metrics
    cosine_similarity_list.extend(torch.from_numpy(cosine_similarity))
    l2_norm_list.extend(torch.from_numpy(l2_norm))
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    # Calculate KL divergence scores
    kl_divergence_scores = calculate_kl_divergence(outputs, feature_distances, m=5)
    # Extend the list with KL divergence scores
    kl_divergence_list.extend(kl_divergence_scores)


# Evaluating the model and calculating metrics
model.eval()
correct = 0
total = 0

# Reset metric lists at the start of each evaluation
least_confidence_list = []
prediction_entropy_list = []
margin_sampling_list = []
cosine_similarity_list = []
l2_norm_list = []
kl_divergence_list = []

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        
        # Forward pass
        outputs, features = model(images)  # Extract both outputs and features
        _, predicted = torch.max(outputs, 1)  # Use only the logits for predictions
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        features = features.view(features.size(0),-1)
        calculate_metrics(outputs, features)

# Calculate accuracy
accuracy = correct / total
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Print the average values of uncertainty and diversity measures
print(f"Average Least Confidence: {torch.mean(torch.stack(least_confidence_list))}")
print(f"Average Prediction Entropy: {torch.mean(torch.stack(prediction_entropy_list))}")
print(f"Average Margin Sampling: {torch.mean(torch.stack(margin_sampling_list))}")
print(f"Average Cosine Similarity: {torch.mean(torch.stack(cosine_similarity_list))}")
print(f"Average L2 Norm: {torch.mean(torch.stack(l2_norm_list))}")
print(f"Average KL Divergence: {torch.mean(torch.stack(kl_divergence_list))}")





Epoch 1/10, Loss: 0.25552816745961704
Epoch 2/10, Loss: 0.07841236211778596
Epoch 3/10, Loss: 0.056385052964215476
Epoch 4/10, Loss: 0.04438513330162969
Epoch 5/10, Loss: 0.03729607515376217
Epoch 6/10, Loss: 0.03215757854296438
Epoch 7/10, Loss: 0.02685596070534666
Epoch 8/10, Loss: 0.023651628206616444
Epoch 9/10, Loss: 0.020647949802401127
Epoch 10/10, Loss: 0.017744616032129124
Test Accuracy: 98.99%
Average Least Confidence: 0.007267912849783897
Average Prediction Entropy: 0.02061202935874462
Average Margin Sampling: 0.007266698405146599
Average Cosine Similarity: 0.35836660861968994
Average L2 Norm: 1.1013730764389038
Average KL Divergence: 1.622452974319458
