## FINAL ASSIGNMENT OF ACTIVE LEARNING PROJECT

### TRY 1: USING NEURAL NETWORK 

In [1]:
# Importing necessary libraries
# This includes the essential PyTorch and torchvision modules for working with neural networks and datasets.
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from sklearn.metrics import pairwise_distances
import numpy as np

# Define the transform
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Download and load the CIFAR-10 dataset
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False, num_workers=2)

# Define the classes for classification
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# Define the neural network architecture
class Net(nn.Module):
    def __init__(self):
        # Call the constructor of the parent class (nn.Module) to initialize the base class
        super(Net, self).__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(3, 6, 5)  # Input channels: 3, Output channels: 6, Kernel size: 5x5
        self.conv2 = nn.Conv2d(6, 16, 5)  # Input channels: 6, Output channels: 16, Kernel size: 5x5
        # Fully connected layers
        # Flatten layer to convert 3D tensor to 1D tensor before fully connected layers
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # Input features: 16*5*5, Output features: 120
        self.fc2 = nn.Linear(120, 84)  # Input features: 120, Output features: 84
        self.fc3 = nn.Linear(84, 10)  # Input features: 84, Output features: 10 (number of classes)

    def forward(self, x):
        # Forward pass through the network

        x = F.max_pool2d(F.relu(self.conv1(x)), 2, 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2, 2)
        # Flatten the output to a 1D tensor before passing it to fully connected layers
        x = x.view(-1, 16 * 5 * 5)
        # First fully connected layer (fc1) followed by ReLU activation
        x = F.relu(self.fc1(x))
        # Second fully connected layer (fc2) followed by ReLU activation
        x = F.relu(self.fc2(x))
        # Final fully connected layer (fc3) for classification output
        x = self.fc3(x)
        # Return the final output 
        return x

# Instantiate the network
net = Net()

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Loss function for multi-class classification
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)  # SGD optimizer with momentum
epochs = 10  # Number of times to iterate through the entire dataset during training

# Creating a function to calculate uncertainty for each epoch metrics
def calculate_uncertainty_metrics(outputs):
    # Convert NumPy array to PyTorch tensor
    outputs_tensor = torch.from_numpy(outputs)
    # Applying softmax along dimension 1
    probabilities = F.softmax(outputs_tensor, dim=1)
    # Least Confidence: 1 - Maximum probability for each sample
    least_confidence = 1 - probabilities.max(dim=1).values.cpu().detach().numpy()
    # Handling NaN in prediction entropy
    current_probs = probabilities.clone().detach()
    current_probs[current_probs == 0] = 1e-10  # Adding a small epsilon to avoid log(0)
    # Prediction Entropy: Negative sum of (probability * log(probability)) for each class
    prediction_entropy = -torch.sum(current_probs * torch.log(current_probs), dim=1).cpu().detach().numpy()
    # Margin Sampling: 1 - (Maximum probability - Minimum probability) for each sample
    margin_sampling = 1 - torch.max(probabilities, dim=1).values.cpu().detach().numpy() - \
                      torch.min(probabilities, dim=1).values.cpu().detach().numpy()
    # Returning least confidence,prediction entropy and margin sampling values obtained
    return least_confidence, prediction_entropy, margin_sampling

# Creating a function to caalculate diversiy metrics
def calculate_diversity_metrics(features, m=5):
    # Calculate pairwise distances using cosine similarity
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    # Cosine Similarity: 1 - Mean cosine similarity with the top m neighbors for each sample
    cosine_similarity = 1 - feature_distances[:, 1:m+1].mean(axis=1)
    # Calculate pairwise distances using L2 (Euclidean) norm
    l2_distances = pairwise_distances(features.cpu().detach().numpy(), metric='euclidean')
    # L2 Norm: Mean L2 norm with the top m neighbors for each sample
    l2_norm = l2_distances[:, 1:m+1].mean(axis=1)
    # Returning cosine similarity and l2 norm values obtained 
    return cosine_similarity, l2_norm

# Creating a functiom to calculate kl divergence 
def calculate_kl_divergence(outputs, feature_distances, m=5):
    # List to store KL divergence scores for each sample
    kl_divergence = []
    # Iterate over each sample in the outputs
    for i in range(len(outputs)):
        # Calculate the probability distribution of the current sample
        current_sample_prob = F.softmax(outputs[i], dim=0)
        # Get the indices of the top m neighbors for the current sample
        neighbor_indices = feature_distances[i, 1:m+1].astype(int)
        # Calculate the average probability distribution of the neighbors
        neighbors_prob = torch.mean(F.softmax(outputs[neighbor_indices], dim=1), dim=0)
        # Calculate KL divergence between the current sample and its neighbors
        # Include 'reduction' argument inside F.kl_div
        kl_divergence.append(F.kl_div(torch.log(current_sample_prob), neighbors_prob, reduction='batchmean'))
    
    # Returning kl divergence values obtained
    return kl_divergence

# Creating a Function to calculate uncertainty and diversity metrics
def calculate_metrics(outputs, features, m=5):
    # Calculate uncertainty metrics
    least_confidence, prediction_entropy, margin_sampling = calculate_uncertainty_metrics(outputs.detach().numpy())
    # Extend lists with uncertainty metrics
    least_confidence_list.extend(torch.from_numpy(least_confidence))
    prediction_entropy_list.extend(torch.from_numpy(prediction_entropy))
    margin_sampling_list.extend(torch.from_numpy(margin_sampling))
    # Calculate diversity metrics
    features_normalized = F.normalize(features, p=2, dim=1)
    cosine_similarity, l2_norm = calculate_diversity_metrics(features_normalized)
    # cosine_similarity, l2_norm = calculate_diversity_metrics(features)
    # Extend lists with diversity metrics
    cosine_similarity_list.extend(torch.from_numpy(cosine_similarity))
    l2_norm_list.extend(torch.from_numpy(l2_norm))
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    # Calculate KL divergence scores
    kl_divergence_scores = calculate_kl_divergence(outputs, feature_distances, m=5)
    # Extend the list with KL divergence scores
    kl_divergence_list.extend(kl_divergence_scores)

# Training loop
for epoch in range(epochs):  # Loop over the dataset for a specified number of epochs
    running_loss = 0.0  # Initialize the running loss for the current epoch
    for i, data in enumerate(trainloader, 0):  # Iterate over batches in the training loader
        inputs, labels = data  # Get inputs and labels for the current batch
        optimizer.zero_grad()  # Zero the gradients to clear previous gradients
        outputs = net(inputs)  # Forward pass to compute the predicted outputs
        loss = criterion(outputs, labels)  # Compute the loss between predicted and true labels
        loss.backward()  # Backward pass to compute gradients of the loss with respect to model parameters
        optimizer.step()  # Update model parameters using the optimizer
        running_loss += loss.item()  # Accumulate the running loss for statistics

    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(trainloader)}")

# Testing the model
correct = 0  # Initialize the number of correctly predicted samples
total = 0  # Initialize the total number of samples

# Initialize lists to store uncertainty and diversity measures
least_confidence_list = [] # Initialize lists to store least confidence measures
prediction_entropy_list = [] # Initialize lists to store prediction entropy measures
margin_sampling_list = [] # Initialize lists to store margin sampling measures
cosine_similarity_list = [] # Initialize lists to store cosine similarity measures
l2_norm_list = [] # Initialize lists to store L2 norm estimates
kl_divergence_list = [] # Initialize lists to store KL divergence estimates

# Use torch.no_grad() to disable gradient computation during testing
with torch.no_grad():
    for data in testloader:  # Iterate over batches in the test loader
        images, labels = data  # Get inputs and true labels for the current batch
        outputs = net(images)  # Forward pass to compute predicted outputs
        _, predicted = torch.max(outputs.data, 1)  # Get the index of the maximum predicted value
        total += labels.size(0)  # Increment the total number of samples by the batch size
        correct += (predicted == labels).sum().item()  # Count the number of correctly predicted samples

        # Extract features using the first convolutional layer
        features = net.conv1(images)
        # Apply max pooling and ReLU activation
        features = F.max_pool2d(F.relu(features), 2, 2)
        # Extract features using the second convolutional layer
        features = net.conv2(features)
        # Apply max pooling and ReLU activation
        features = F.max_pool2d(F.relu(features), 2, 2)
        # Flatten the features to be used in fully connected layers
        features = features.view(features.size(0), -1)
        # Calculate metrics using the extracted features and model outputs
        calculate_metrics(outputs, features)

accuracy = 100 * correct / total # Calculate accuracy
print(f"Accuracy on the test set: {accuracy:.2f}%") # Print the accuracy on test set 
# Print the average values of uncertainty and diversity measures
print(f"Average Least Confidence: {torch.mean(torch.stack(least_confidence_list))}") # Print the average value of least confidence
print(f"Average Prediction Entropy: {torch.mean(torch.stack(prediction_entropy_list))}") # Print the average value of prediction entropy
print(f"Average Margin Sampling: {torch.mean(torch.stack(margin_sampling_list))}") # Print the average value of margin sampling
print(f"Average Cosine Similarity: {torch.mean(torch.stack(cosine_similarity_list))}") # Print the average value of cosine similarity
print(f"Average L2 Norm: {torch.mean(torch.stack(l2_norm_list))}") # Print the average value of L2 norm
print(f"Average KL Divergence: {torch.mean(torch.stack(kl_divergence_list))}") # Print the average value of KL divergence

Files already downloaded and verified
Files already downloaded and verified
Epoch 1/10, Loss: 1.7455806453585625
Epoch 2/10, Loss: 1.3378883168798685
Epoch 3/10, Loss: 1.192586838748604
Epoch 4/10, Loss: 1.1020615001526475
Epoch 5/10, Loss: 1.0350671640495956
Epoch 6/10, Loss: 0.979087567339465
Epoch 7/10, Loss: 0.9408895053713768
Epoch 8/10, Loss: 0.8955124899016507
Epoch 9/10, Loss: 0.8617812046063319
Epoch 10/10, Loss: 0.8321819068693672
Accuracy on the test set: 62.14%
Average Least Confidence: 0.2873304784297943
Average Prediction Entropy: 0.8017570376396179
Average Margin Sampling: 0.28619784116744995
Average Cosine Similarity: 0.45348525047302246
Average L2 Norm: 0.9036170840263367
Average KL Divergence: 0.4173566401004791


### TRY 2: USING CNN ARCHITECTURE AND DOING MODIFACTIONS FOR BETTER ACCURACY 

In [89]:
# Import necessary libraries
# This includes the essential PyTorch and torchvision modules for working with neural networks and datasets.
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torch.nn.functional as F
from sklearn.metrics import pairwise_distances
from scipy.stats import entropy
# Define the CNN architecture
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
    
        # Convolutional layer 1: Input channels=3, output channels=32, kernel size=3, padding=1
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        # Rectified Linear Unit (ReLU) activation function
        self.relu = nn.ReLU()
        # Max pooling layer 1: Kernel size=2, stride=2
        self.maxpool = nn.MaxPool2d(2)
        # Convolutional layer 2: Input channels=32, output channels=64, kernel size=3, padding=1
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        # Flatten layer to convert 3D tensor to 1D tensor
        self.flatten = nn.Flatten()
        # Fully connected layer 1: Input features=64*8*8, output features=512
        self.fc1 = nn.Linear(64 * 8 * 8, 512)
        # Fully connected layer 2: Input features=512, output features=10 (output classes)
        self.fc2 = nn.Linear(512, 10)

    def forward(self, x):

        # Convolutional layer 1: Input tensor x undergoes convolution
        x = self.conv1(x)
        # Apply ReLU activation function to introduce non-linearity
        x = self.relu(x)
        # Perform max pooling to down-sample the spatial dimensions
        x = self.maxpool(x)
    
        # Convolutional layer 2: Apply another convolution operation
        x = self.conv2(x)
        # Apply ReLU activation
        x = self.relu(x)
        # Another max pooling operation
        x = self.maxpool(x)
    
        # Flatten the tensor to prepare for fully connected layers
        x = self.flatten(x)
    
        # Fully connected layer 1: Apply linear transformation
        x = self.fc1(x)
        # Apply ReLU activation
        x = self.relu(x)
    
        # Fully connected layer 2: Produce the final output
        x = self.fc2(x)
    
        # Return the final output tensor after passing through the network
        return x

# Set device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
batch_size = 64 # Set the batch size for training
learning_rate = 0.001 # Set the learning rate for the optimizer
epochs = 10 # Set the number of training epochs
m = 5  # Number of nearest neighbors for diversity measures

# Define data transformation for CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert images to PyTorch tensors
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize pixel values
])

# Download and Load CIFAR-10 training and test datasets
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)  # Create training dataset
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)  # Create test dataset

# Create DataLoader instances for training and test datasets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize the neural network model, loss function, and optimizer
model = SimpleCNN().to(device)  # Move the model to GPU if available
criterion = nn.CrossEntropyLoss()  # Cross-entropy loss for classification
optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # Adam optimizer

# Creating a function to calculate uncertainty for each epoch metrics
def calculate_uncertainty_metrics(outputs):
    # Convert NumPy array to PyTorch tensor
    outputs_tensor = torch.from_numpy(outputs)
    # Applying softmax along dimension 1
    probabilities = F.softmax(outputs_tensor, dim=1)
    # Least Confidence: 1 - Maximum probability for each sample
    least_confidence = 1 - probabilities.max(dim=1).values.cpu().detach().numpy()
    # Handling NaN in prediction entropy
    current_probs = probabilities.clone().detach()
    current_probs[current_probs == 0] = 1e-10  # Adding a small epsilon to avoid log(0)
    # Prediction Entropy: Negative sum of (probability * log(probability)) for each class
    prediction_entropy = -torch.sum(current_probs * torch.log(current_probs), dim=1).cpu().detach().numpy()
    # Margin Sampling: 1 - (Maximum probability - Minimum probability) for each sample
    margin_sampling = 1 - torch.max(probabilities, dim=1).values.cpu().detach().numpy() - \
                      torch.min(probabilities, dim=1).values.cpu().detach().numpy()
    # Returning least confidence,prediction entropy and margin sampling values obtained
    return least_confidence, prediction_entropy, margin_sampling

# Creating a function to caalculate diversiy metrics
def calculate_diversity_metrics(features, m=5):
    # Calculate pairwise distances using cosine similarity
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    # Cosine Similarity: 1 - Mean cosine similarity with the top m neighbors for each sample
    cosine_similarity = 1 - feature_distances[:, 1:m+1].mean(axis=1)
    # Calculate pairwise distances using L2 (Euclidean) norm
    l2_distances = pairwise_distances(features.cpu().detach().numpy(), metric='euclidean')
    # L2 Norm: Mean L2 norm with the top m neighbors for each sample
    l2_norm = l2_distances[:, 1:m+1].mean(axis=1)
    # Returning cosine similarity and l2 norm values obtained 
    return cosine_similarity, l2_norm


# Creating a function to calculate kl divergence 
def calculate_kl_divergence(outputs, feature_distances, m=5):
    # List to store KL divergence scores for each sample
    kl_divergence = []
    # Iterate over each sample in the outputs
    for i in range(len(outputs)):
        # Calculate the probability distribution of the current sample
        current_sample_prob = F.softmax(outputs[i], dim=0)
        # Get the indices of the top m neighbors for the current sample
        neighbor_indices = feature_distances[i, 1:m+1].astype(int)
        # Calculate the average probability distribution of the neighbors
        neighbors_prob = torch.mean(F.softmax(outputs[neighbor_indices], dim=1), dim=0)

        # Add a small epsilon to avoid zero probabilities
        epsilon = 1e-10
        current_sample_prob = current_sample_prob + epsilon
        neighbors_prob = neighbors_prob + epsilon

        # Calculate KL divergence between the current sample and its neighbors
        kl_divergence.append(F.kl_div(torch.log(current_sample_prob), neighbors_prob, reduction='batchmean'))
    
    # Returning kl divergence values obtained
    return kl_divergence


# Creating a Function to calculate uncertainty and diversity metrics
def calculate_metrics(outputs, features, m=5):
    # Calculate uncertainty metrics
    least_confidence, prediction_entropy, margin_sampling = calculate_uncertainty_metrics(outputs.detach().numpy())
    # Extend lists with uncertainty metrics
    least_confidence_list.extend(torch.from_numpy(least_confidence))
    prediction_entropy_list.extend(torch.from_numpy(prediction_entropy))
    margin_sampling_list.extend(torch.from_numpy(margin_sampling))
    # Calculate diversity metrics
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    features_normalized = F.normalize(features, p=2, dim=1)
    cosine_similarity, l2_norm = calculate_diversity_metrics(features_normalized)
    # cosine_similarity, l2_norm = calculate_diversity_metrics(features)
    # Extend lists with diversity metrics
    cosine_similarity_list.extend(torch.from_numpy(cosine_similarity))
    # Inside the calculate_metrics function
    l2_norm_list.extend(torch.from_numpy(l2_norm))
    # Calculate KL divergence scores
    kl_divergence_scores = calculate_kl_divergence(outputs, feature_distances, m=5)
    # Extend the list with KL divergence scores
    kl_divergence_list.extend(kl_divergence_scores)

# Train the model
for epoch in range(epochs): # Iterate over each epoch
    model.train()  # Set the model to training mode
    for images, labels in train_loader: # Iterate through batches of training data
        images, labels = images.to(device), labels.to(device) # Move data to GPU if available
        optimizer.zero_grad()  # Zero gradients to clear previous gradients
        outputs = model(images)  # Forward pass
        loss = criterion(outputs, labels)  # Compute the loss
        loss.backward()  # Backward pass to compute gradients
        optimizer.step()  # Update model parameters using the optimizer

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")  # Print the loss for the current epoch

# Evaluate the model on the test set
model.eval() # Set the model to evaluation mode
correct = 0  # Initialize the number of correctly predicted samples
total = 0  # Initialize the total number of samples

least_confidence_list= [] # Initialize an empty list to store least confidence values
prediction_entropy_list = [] # Initialize an empty list to store prediction entropy values
margin_sampling_list = [] # Initialize an empty list to store margin sampling values
cosine_similarity_list = [] # Initialize an empty list to store cosine similarity values
l2_norm_list = [] # Initialize an empty list to store L2 norm values
kl_divergence_list = [] # Initialize an empty list to store KL Divergence values

with torch.no_grad(): # Use torch.no_grad() to disable gradient computation during testing
    for images, labels in test_loader: # Iterate through batches of test data
        images, labels = images.to(device), labels.to(device) # Move data to GPU if available
        outputs = model(images) # Forward pass through the model
        _, predicted = torch.max(outputs.data, 1) # Get the predicted class labels
        total += labels.size(0) # Increment total number of images
        correct += (predicted == labels).sum().item() # Count correctly predicted images

        # Apply convolutional layer 1 to the input images
        conv1_output = model.conv1(images)
        # Apply ReLU activation function
        relu_output = model.relu(conv1_output)
        # Apply max pooling
        maxpool_output = model.maxpool(relu_output)
        # Apply convolutional layer 2 to the max-pooled output
        conv2_output = model.conv2(maxpool_output)
        # Flatten the output
        features = model.flatten(conv2_output)
        # Reshape the features to have a consistent size
        features = features.view(features.size(0), -1)

        # Calculate metrics using the extracted features and model outputs
        calculate_metrics(outputs, features)
        
# Calculate accuracy
accuracy = correct / total # Calculate accuracy
print(f"Test Accuracy: {accuracy * 100:.2f}%") # Print the accuracy on the test seT

# Print the average values of uncertainty and diversity measures
print(f"Average Least Confidence: {torch.mean(torch.stack(least_confidence_list))}") # Print the average value of least confidence
print(f"Average Prediction Entropy: {torch.mean(torch.stack(prediction_entropy_list))}") # Print the average value of prediction entropy
print(f"Average Margin Sampling: {torch.mean(torch.stack(margin_sampling_list))}") # Print the average value of margin sampling
print(f"Average Cosine Similarity: {torch.mean(torch.stack(cosine_similarity_list))}") # Print the average value of cosine similarity
print(f"Average L2 Norm: {torch.mean(torch.stack(l2_norm_list))}") # Print the average value of L2 norm
print(f"Average KL Divergence: {torch.mean(torch.stack(kl_divergence_list))}") # Print the average value of KL divergence

Files already downloaded and verified
Files already downloaded and verified
Epoch 1/10, Loss: 0.8350726962089539
Epoch 2/10, Loss: 0.4385553002357483
Epoch 3/10, Loss: 0.3632165491580963
Epoch 4/10, Loss: 0.7260843515396118
Epoch 5/10, Loss: 0.5566812753677368
Epoch 6/10, Loss: 0.19536720216274261
Epoch 7/10, Loss: 0.16471228003501892
Epoch 8/10, Loss: 0.23137861490249634
Epoch 9/10, Loss: 0.06536935269832611
Epoch 10/10, Loss: 0.030354849994182587
Test Accuracy: 72.18%
Average Least Confidence: 0.08586908876895905
Average Prediction Entropy: 0.22230054438114166
Average Margin Sampling: 0.08586746454238892
Average Cosine Similarity: 0.5769681930541992
Average L2 Norm: 0.9092544913291931
Average KL Divergence: 1.3938003778457642


### TRY 3: USING PRETRAINED MODEL: RESNET 18 FOR ACHIEVING BETTER ACCURACY

In [1]:
# Importing necessary libraries
# This includes the essential PyTorch and torchvision modules for working with neural networks and datasets.
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
import torch.nn.functional as F
from sklearn.metrics.pairwise import pairwise_distances
from scipy.stats import entropy

# Define the transform for data augmentation and normalization
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),  # Randomly flip the image horizontally
    transforms.RandomResizedCrop(32),    # Randomly crop the image and resize to 32x32
    transforms.ToTensor(),               # Convert image to PyTorch tensor
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize image pixel values
])

# Download and load the CIFAR-10 dataset
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) # Create training dataset
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform) # Create test dataset
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False, num_workers=2)

# Use a pre-trained ResNet18 model
# model = models.resnet18(pretrained=True)
model = models.resnet18(weights='IMAGENET1K_V1') # Initialize the ResNet18 model with pre-trained weights from the ImageNet dataset
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 10)  # Change the output layer to have 10 classes

# Move the model to GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Check if GPU is available, else use CPU
model = model.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Loss function for multi-class classification
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)  # SGD optimizer with momentum

# Training loop
num_epochs = 10 # Set the number of training epochs
m = 5  # Number of nearest neighbors for diversity measures

# Creating a function to calculate uncertainty for each epoch metrics
def calculate_uncertainty_metrics(outputs):
    # Convert NumPy array to PyTorch tensor
    outputs_tensor = torch.from_numpy(outputs)
    # Applying softmax along dimension 1
    probabilities = F.softmax(outputs_tensor, dim=1)
    # Least Confidence: 1 - Maximum probability for each sample
    least_confidence = 1 - probabilities.max(dim=1).values.cpu().detach().numpy()
    # Handling NaN in prediction entropy
    current_probs = probabilities.clone().detach()
    current_probs[current_probs == 0] = 1e-10  # Adding a small epsilon to avoid log(0)
    # Prediction Entropy: Negative sum of (probability * log(probability)) for each class
    prediction_entropy = -torch.sum(current_probs * torch.log(current_probs), dim=1).cpu().detach().numpy()
    # Margin Sampling: 1 - (Maximum probability - Minimum probability) for each sample
    margin_sampling = 1 - torch.max(probabilities, dim=1).values.cpu().detach().numpy() - \
                      torch.min(probabilities, dim=1).values.cpu().detach().numpy()
    # Returning least confidence,prediction entropy and margin sampling values obtained
    return least_confidence, prediction_entropy, margin_sampling

# Creating a function to caalculate diversiy metrics
def calculate_diversity_metrics(features, m=5):
    # Calculate pairwise distances using cosine similarity
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    # Cosine Similarity: 1 - Mean cosine similarity with the top m neighbors for each sample
    cosine_similarity = 1 - feature_distances[:, 1:m+1].mean(axis=1)
    # Calculate pairwise distances using L2 (Euclidean) norm
    l2_distances = pairwise_distances(features.cpu().detach().numpy(), metric='euclidean')
    # L2 Norm: Mean L2 norm with the top m neighbors for each sample
    l2_norm = l2_distances[:, 1:m+1].mean(axis=1)
    # Returning cosine similarity and l2 norm values obtained 
    return cosine_similarity, l2_norm

# Creating a function to calculate kl divergence 
def calculate_kl_divergence(outputs, feature_distances, m=5):
    # List to store KL divergence scores for each sample
    kl_divergence = []
    # Iterate over each sample in the outputs
    for i in range(len(outputs)):
        # Calculate the probability distribution of the current sample
        current_sample_prob = F.softmax(outputs[i], dim=0)
        # Get the indices of the top m neighbors for the current sample
        neighbor_indices = feature_distances[i, 1:m+1].astype(int)
        # Calculate the average probability distribution of the neighbors
        neighbors_prob = torch.mean(F.softmax(outputs[neighbor_indices], dim=1), dim=0)

        # Add a small epsilon to avoid zero probabilities
        epsilon = 1e-10
        current_sample_prob = current_sample_prob + epsilon
        neighbors_prob = neighbors_prob + epsilon

        # Calculate KL divergence between the current sample and its neighbors
        kl_divergence.append(F.kl_div(torch.log(current_sample_prob), neighbors_prob, reduction='batchmean'))
    
    # Returning kl divergence values obtained
    return kl_divergence

# Creating a Function to calculate uncertainty and diversity metrics
def calculate_metrics(outputs, features, m=5):
    # Calculate uncertainty metrics
    least_confidence, prediction_entropy, margin_sampling = calculate_uncertainty_metrics(outputs.detach().numpy())
    # Extend lists with uncertainty metrics
    least_confidence_list.extend(torch.from_numpy(least_confidence))
    prediction_entropy_list.extend(torch.from_numpy(prediction_entropy))
    margin_sampling_list.extend(torch.from_numpy(margin_sampling))
    # Calculate diversity metrics
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    features_normalized = F.normalize(features, p=2, dim=1)
    cosine_similarity, l2_norm = calculate_diversity_metrics(features_normalized)
    # cosine_similarity, l2_norm = calculate_diversity_metrics(features)
    # Extend lists with diversity metrics
    cosine_similarity_list.extend(torch.from_numpy(cosine_similarity))
    # Inside the calculate_metrics function
    l2_norm_list.extend(torch.from_numpy(l2_norm))
    # Calculate KL divergence scores
    kl_divergence_scores = calculate_kl_divergence(outputs, feature_distances, m=5)
    # Extend the list with KL divergence scores
    kl_divergence_list.extend(kl_divergence_scores)

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0  # Variable to track the running loss during each epoch
    # Iterate over the training dataset
    for i, data in enumerate(trainloader, 0): 
        inputs, labels = data # Get inputs and labels for the current batch
        inputs, labels = inputs.to(device), labels.to(device) # Move data to GPU if available
        optimizer.zero_grad()  # Clear previous gradients
        outputs = model(inputs)  # Forward pass
        loss = criterion(outputs, labels)  # Calculate the loss
        loss.backward()  # Backward pass to compute gradients
        optimizer.step()  # Update model parameters using the optimizer
        running_loss += loss.item()  # Accumulate the running loss

    # Print the average loss for the current epoch
    print(f"Epoch {epoch + 1}, Loss: {running_loss / len(trainloader)}")


# Testing the model
model.eval()  # Set the model to evaluation mode
correct = 0  # Variable to track the number of correctly predicted images
total = 0  # Variable to track the total number of images in the test set

least_confidence_list = [] # Initialize an empty list to store least confidence values
prediction_entropy_list = [] # Initialize an empty list to store prediction entropy values
margin_sampling_list = [] # Initialize an empty list to store margin sampling values
cosine_similarity_list = [] # Initialize an empty list to store cosine similarity values
l2_norm_list = [] # Initialize an empty list to store L2 norm values
kl_divergence_list = [] # Initialize an empty list to store KL Divergence values

with torch.no_grad(): # Use torch.no_grad() to disable gradient computation during testing
    for data in testloader:  # Iterate over batches in the test loader
        images, labels = data # Get inputs and labels for the current batch
        images, labels = images.to(device), labels.to(device)  # Move data to GPU if available
        outputs = model(images)  # Forward pass
        _, predicted = torch.max(outputs.data, 1)  # Get predicted class labels
        total += labels.size(0)  # Increment total number of images
        correct += (predicted == labels).sum().item()  # Count correctly predicted images

        # Diversity measures

        # Extract features from the model layers
        features = model.conv1(images)  # Apply convolutional layer 1 to the input images
        features = model.relu(features) # Apply Rectified Linear Unit (ReLU) activation function to introduce non-linearity
        features = model.maxpool(features) # Apply max pooling to reduce spatial dimensions and retain important information
        features = model.layer1(features)  # Use layer1 instead of conv2 for ResNet
        features = model.layer2(features)  # Use layer2 instead of conv2 for ResNet
        features = model.avgpool(features)  # Use avgpool instead of maxpool for ResNet

        # Flatten the features for distance calculations
        features = features.view(features.size(0), -1)

        calculate_metrics(outputs, features)

accuracy = 100 * correct / total # Calculate accuracy
print(f"Accuracy on the test set: {accuracy:.2f}%") # Print the test accuracy

# Print the average values of uncertainty and diversity measures
print(f"Average Least Confidence: {torch.mean(torch.stack(least_confidence_list))}") # Print the average value of least confidence
print(f"Average Prediction Entropy: {torch.mean(torch.stack(prediction_entropy_list))}") # Print the average value of prediction entropy
print(f"Average Margin Sampling: {torch.mean(torch.stack(margin_sampling_list))}") # Print the average value of margin sampling
print(f"Average Cosine Similarity: {torch.mean(torch.stack(cosine_similarity_list))}") # Print the average value of cosine similarity
print(f"Average L2 Norm: {torch.mean(torch.stack(l2_norm_list))}") # Print the average value of L2 norm
print(f"Average KL Divergence: {torch.mean(torch.stack(kl_divergence_list))}") # Print the average value of KL divergence

Files already downloaded and verified
Files already downloaded and verified
Epoch 1, Loss: 1.3471097491128976
Epoch 2, Loss: 1.062493772610374
Epoch 3, Loss: 0.9705260539298777
Epoch 4, Loss: 0.919052358935861
Epoch 5, Loss: 0.8840376783133773
Epoch 6, Loss: 0.8539965738497122
Epoch 7, Loss: 0.8247907693733645
Epoch 8, Loss: 0.806672897675763
Epoch 9, Loss: 0.7819206634979419
Epoch 10, Loss: 0.76822626811769
Accuracy on the test set: 73.00%
Average Least Confidence: 0.2607212960720062
Average Prediction Entropy: 0.7505708336830139
Average Margin Sampling: 0.25922003388404846
Average Cosine Similarity: 0.8664329051971436
Average L2 Norm: 0.5039328336715698
Average KL Divergence: 0.48649540543556213


### TRY 4: USING ANOTHER PRETRAINED MODEL: DENSENET 121 MODEL FOR BETTER ACCURACY

In [2]:
# Importing necessary libraries 
# This includes the essential PyTorch and torchvision modules for working with neural networks and datasets.
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from torchvision import models
import torch.nn.functional as F
from sklearn.metrics.pairwise import pairwise_distances
from scipy.stats import entropy

# Step 1: Set device and hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check if GPU is available, else use CPU
batch_size = 64  # Number of images in each mini-batch
learning_rate = 0.001  # Learning rate for the optimizer
epochs = 10 # Number of times to iterate through the entire dataset during training
m = 5  # Number of nearest neighbors for diversity measures 

# Step 2: Load and preprocess the CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert images to PyTorch tensors
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize image pixel values
])

# Download and create training and test datasets
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) # Create training dataset
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform) # Create test dataset

# Create DataLoader instances to efficiently load and iterate over batches of data
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Step 3: Initialize the pre-trained DenseNet model
# Initialize the DenseNet121 model with pre-trained weights from the ImageNet dataset
model = models.densenet121(weights='IMAGENET1K_V1')
# Modify the classifier for CIFAR-10 (10 classes)
model.classifier = nn.Linear(1024, 10)  # Change the output layer to have 10 classes for CIFAR-10
model = model.to(device)  # Move the model to the GPU (if available)

# Step 4: Set up loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Loss function for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # Adam optimizer with specified learning rate

# Creating a function to calculate uncertainty for each epoch metrics
def calculate_uncertainty_metrics(outputs):
    # Convert NumPy array to PyTorch tensor
    outputs_tensor = torch.from_numpy(outputs)
    # Applying softmax along dimension 1
    probabilities = F.softmax(outputs_tensor, dim=1)
    # Least Confidence: 1 - Maximum probability for each sample
    least_confidence = 1 - probabilities.max(dim=1).values.cpu().detach().numpy()
    # Handling NaN in prediction entropy
    current_probs = probabilities.clone().detach()
    current_probs[current_probs == 0] = 1e-10  # Adding a small epsilon to avoid log(0)
    # Prediction Entropy: Negative sum of (probability * log(probability)) for each class
    prediction_entropy = -torch.sum(current_probs * torch.log(current_probs), dim=1).cpu().detach().numpy()
    # Margin Sampling: 1 - (Maximum probability - Minimum probability) for each sample
    margin_sampling = 1 - torch.max(probabilities, dim=1).values.cpu().detach().numpy() - \
                      torch.min(probabilities, dim=1).values.cpu().detach().numpy()
    # Returning least confidence,prediction entropy and margin sampling values obtained
    return least_confidence, prediction_entropy, margin_sampling

# Creating a function to caalculate diversiy metrics
def calculate_diversity_metrics(features, m=5):
    # Calculate pairwise distances using cosine similarity
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    # Cosine Similarity: 1 - Mean cosine similarity with the top m neighbors for each sample
    cosine_similarity = 1 - feature_distances[:, 1:m+1].mean(axis=1)
    # Calculate pairwise distances using L2 (Euclidean) norm
    l2_distances = pairwise_distances(features.cpu().detach().numpy(), metric='euclidean')
    # L2 Norm: Mean L2 norm with the top m neighbors for each sample
    l2_norm = l2_distances[:, 1:m+1].mean(axis=1)
    # Returning cosine similarity and l2 norm values obtained 
    return cosine_similarity, l2_norm

# Creating a function to calculate kl divergence 
def calculate_kl_divergence(outputs, feature_distances, m=5):
    # List to store KL divergence scores for each sample
    kl_divergence = []
    # Iterate over each sample in the outputs
    for i in range(len(outputs)):
        # Calculate the probability distribution of the current sample
        current_sample_prob = F.softmax(outputs[i], dim=0)
        # Get the indices of the top m neighbors for the current sample
        neighbor_indices = feature_distances[i, 1:m+1].astype(int)
        # Calculate the average probability distribution of the top m neighbors for the current sample 
        neighbors_prob = torch.mean(F.softmax(outputs[neighbor_indices], dim=1), dim=0)

        # Add a small epsilon to avoid zero probabilities
        epsilon = 1e-10
        current_sample_prob = current_sample_prob + epsilon # epsilon is the  probability that the current sample corresponds to the current sample probability in the distribution function .
        neighbors_prob = neighbors_prob + epsilon 

        # Calculate KL divergence between the current sample and its neighbors
        kl_divergence.append(F.kl_div(torch.log(current_sample_prob), neighbors_prob, reduction='batchmean'))
    
    # Returning kl divergence values obtained
    return kl_divergence

# Creating a Function to calculate uncertainty and diversity metrics
def calculate_metrics(outputs, features, m=5):
    # Calculate uncertainty metrics
    least_confidence, prediction_entropy, margin_sampling = calculate_uncertainty_metrics(outputs.detach().numpy())
    # Extend lists with uncertainty metrics
    least_confidence_list.extend(torch.from_numpy(least_confidence))
    prediction_entropy_list.extend(torch.from_numpy(prediction_entropy))
    margin_sampling_list.extend(torch.from_numpy(margin_sampling))
    # Calculate diversity metrics
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    features_normalized = F.normalize(features, p=2, dim=1)
    cosine_similarity, l2_norm = calculate_diversity_metrics(features_normalized)
    # cosine_similarity, l2_norm = calculate_diversity_metrics(features)
    # Extend lists with diversity metrics
    cosine_similarity_list.extend(torch.from_numpy(cosine_similarity))
    # Inside the calculate_metrics function
    l2_norm_list.extend(torch.from_numpy(l2_norm))
    # Calculate KL divergence scores
    kl_divergence_scores = calculate_kl_divergence(outputs, feature_distances, m=5)
    # Extend the list with KL divergence scores
    kl_divergence_list.extend(kl_divergence_scores)

# Train the model
for epoch in range(epochs): # Iterate over each epoch
    model.train()  # Set the model to training mode
    for images, labels in train_loader: # Iterate through batches of training data
        images, labels = images.to(device), labels.to(device) # Move data to GPU if available
        optimizer.zero_grad()  # Zero gradients to clear previous gradients
        outputs = model(images)  # Forward pass
        loss = criterion(outputs, labels)  # Compute the loss
        loss.backward()  # Backward pass to compute gradients
        optimizer.step()  # Update model parameters using the optimizer

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")  # Print the loss for the current epoch

# Evaluate the model on the test set
model.eval() # Set the model to evaluation mode
correct = 0  # Initialize the number of correctly predicted samples
total = 0  # Initialize the total number of samples

least_confidence_list = [] # Initialize an empty list to store least confidence values
prediction_entropy_list = [] # Initialize an empty list to store prediction entropy values
margin_sampling_list = [] # Initialize an empty list to store margin sampling values
cosine_similarity_list = [] # Initialize an empty list to store cosine similarity values
l2_norm_list = [] # Initialize an empty list to store L2 norm values
kl_divergence_list = [] # Initialize an empty list to store KL Divergence values

# Use torch.no_grad() to disable gradient computation during testing
with torch.no_grad():
    for images, labels in test_loader: # Iterate over batches in the test loader
        images, labels = images.to(device), labels.to(device)  # Move data to GPU if available
        outputs = model(images)  # Forward pass
        _, predicted = torch.max(outputs.data, 1)  # Get predicted class labels
        total += labels.size(0)  # Increment total number of images
        correct += (predicted == labels).sum().item()  # Count correctly predicted images

        # Diversity measures
        features = model.features(images)  # Use the feature extraction part of the model
        features = F.adaptive_avg_pool2d(features, (1, 1))  # Global average pooling
        features = features.view(features.size(0), -1)  # Flatten the features for distance calculations

        calculate_metrics(outputs, features)

accuracy = correct / total  # Calculate accuracy
print(f"Test Accuracy: {accuracy * 100:.2f}%")  # Print test accuracy

# Print the average values of uncertainty and diversity measures
print(f"Average Least Confidence: {torch.mean(torch.stack(least_confidence_list))}") # Print the average value of least confidence
print(f"Average Prediction Entropy: {torch.mean(torch.stack(prediction_entropy_list))}") # Print the average value of prediction entropy
print(f"Average Margin Sampling: {torch.mean(torch.stack(margin_sampling_list))}") # Print the average value of margin sampling
print(f"Average Cosine Similarity: {torch.mean(torch.stack(cosine_similarity_list))}") # Print the average value of cosine similarity
print(f"Average L2 Norm: {torch.mean(torch.stack(l2_norm_list))}") # Print the average value of L2 norm
print(f"Average KL Divergence: {torch.mean(torch.stack(kl_divergence_list))}") # Print the average value of KL divergence

Files already downloaded and verified
Files already downloaded and verified
Epoch 1/10, Loss: 0.6909970045089722
Epoch 2/10, Loss: 1.0706369876861572
Epoch 3/10, Loss: 0.46002820134162903
Epoch 4/10, Loss: 0.2053932100534439
Epoch 5/10, Loss: 0.19087783992290497
Epoch 6/10, Loss: 0.1159883439540863
Epoch 7/10, Loss: 0.38801509141921997
Epoch 8/10, Loss: 0.23865385353565216
Epoch 9/10, Loss: 0.1268896460533142
Epoch 10/10, Loss: 0.48889589309692383
Test Accuracy: 84.22%
Average Least Confidence: 0.08035318553447723
Average Prediction Entropy: 0.22045551240444183
Average Margin Sampling: 0.08034031093120575
Average Cosine Similarity: 0.13549207150936127
Average L2 Norm: 1.2808501720428467
Average KL Divergence: 1.0776267051696777


### TRY 5: USING ANOTHER PRETRAINED MODEL: RESNET 50 FOR BETTER ACCURACY

In [14]:
# Importing necessary libraries 
# This includes the essential PyTorch and torchvision modules for working with neural networks and datasets.
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from torchvision import models
import torch.nn.functional as F
from sklearn.metrics.pairwise import pairwise_distances
from scipy.stats import entropy

# Step 1: Set device and hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check if GPU is available, else use CPU
batch_size = 64  # Number of images in each mini-batch
learning_rate = 0.001  # Learning rate for the optimizer
epochs = 10 # Number of times to iterate through the entire dataset during training
m = 5  # Number of nearest neighbors for diversity measures 

# Step 2: Load and preprocess the CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert images to PyTorch tensors
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize image pixel values
])

# Download and create training and test datasets
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) # Create training dataset
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform) # Create test dataset

# Create DataLoader instances to efficiently load and iterate over batches of data
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Step 3: Initialize the pre-trained DenseNet model
# Initialize the DenseNet121 model with pre-trained weights from the ImageNet dataset
# model = models.efficientnet_v2_m(weights='IMAGENET1K_V1')
model = models.resnet50(weights='IMAGENET1K_V2')
# Modify the classifier for CIFAR-10 (10 classes)
model.classifier = nn.Linear(1024, 10)  # Change the output layer to have 10 classes for CIFAR-10
model = model.to(device)  # Move the model to the GPU (if available)

# Step 4: Set up loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Loss function for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # Adam optimizer with specified learning rate

# Creating a function to calculate uncertainty for each epoch metrics
def calculate_uncertainty_metrics(outputs):
    # Convert NumPy array to PyTorch tensor
    outputs_tensor = torch.from_numpy(outputs)
    # Applying softmax along dimension 1
    probabilities = F.softmax(outputs_tensor, dim=1)
    # Least Confidence: 1 - Maximum probability for each sample
    least_confidence = 1 - probabilities.max(dim=1).values.cpu().detach().numpy()
    # Handling NaN in prediction entropy
    current_probs = probabilities.clone().detach()
    current_probs[current_probs == 0] = 1e-10  # Adding a small epsilon to avoid log(0)
    # Prediction Entropy: Negative sum of (probability * log(probability)) for each class
    prediction_entropy = -torch.sum(current_probs * torch.log(current_probs), dim=1).cpu().detach().numpy()
    # Margin Sampling: 1 - (Maximum probability - Minimum probability) for each sample
    margin_sampling = 1 - torch.max(probabilities, dim=1).values.cpu().detach().numpy() - \
                      torch.min(probabilities, dim=1).values.cpu().detach().numpy()
    # Returning least confidence,prediction entropy and margin sampling values obtained
    return least_confidence, prediction_entropy, margin_sampling

# Creating a function to caalculate diversiy metrics
def calculate_diversity_metrics(features, m=5):
    # Calculate pairwise distances using cosine similarity
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    # Cosine Similarity: 1 - Mean cosine similarity with the top m neighbors for each sample
    cosine_similarity = 1 - feature_distances[:, 1:m+1].mean(axis=1)
    # Calculate pairwise distances using L2 (Euclidean) norm
    l2_distances = pairwise_distances(features.cpu().detach().numpy(), metric='euclidean')
    # L2 Norm: Mean L2 norm with the top m neighbors for each sample
    l2_norm = l2_distances[:, 1:m+1].mean(axis=1)
    # Returning cosine similarity and l2 norm values obtained 
    return cosine_similarity, l2_norm

# Creating a function to calculate kl divergence 
def calculate_kl_divergence(outputs, feature_distances, m=5):
    # List to store KL divergence scores for each sample
    kl_divergence = []
    # Iterate over each sample in the outputs
    for i in range(len(outputs)):
        # Calculate the probability distribution of the current sample
        current_sample_prob = F.softmax(outputs[i], dim=0)
        # Get the indices of the top m neighbors for the current sample
        neighbor_indices = feature_distances[i, 1:m+1].astype(int)
        # Calculate the average probability distribution of the neighbors
        neighbors_prob = torch.mean(F.softmax(outputs[neighbor_indices], dim=1), dim=0)

        # Add a small epsilon to avoid zero probabilities
        epsilon = 1e-10
        current_sample_prob = current_sample_prob + epsilon
        neighbors_prob = neighbors_prob + epsilon

        # Calculate KL divergence between the current sample and its neighbors
        kl_divergence.append(F.kl_div(torch.log(current_sample_prob), neighbors_prob, reduction='batchmean'))
    
    # Returning kl divergence values obtained
    return kl_divergence

# Creating a Function to calculate uncertainty and diversity metrics
def calculate_metrics(outputs, features, m=5):
    # Calculate uncertainty metrics
    least_confidence, prediction_entropy, margin_sampling = calculate_uncertainty_metrics(outputs.detach().numpy())
    # Extend lists with uncertainty metrics
    least_confidence_list.extend(torch.from_numpy(least_confidence))
    prediction_entropy_list.extend(torch.from_numpy(prediction_entropy))
    margin_sampling_list.extend(torch.from_numpy(margin_sampling))
    # Calculate diversity metrics
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    features_normalized = F.normalize(features, p=2, dim=1)
    cosine_similarity, l2_norm = calculate_diversity_metrics(features_normalized)
    # cosine_similarity, l2_norm = calculate_diversity_metrics(features)
    # Extend lists with diversity metrics
    cosine_similarity_list.extend(torch.from_numpy(cosine_similarity))
    # Inside the calculate_metrics function
    l2_norm_list.extend(torch.from_numpy(l2_norm))
    # Calculate KL divergence scores
    kl_divergence_scores = calculate_kl_divergence(outputs, feature_distances, m=5)
    # Extend the list with KL divergence scores
    kl_divergence_list.extend(kl_divergence_scores)

# Train the model
for epoch in range(epochs): # Iterate over each epoch
    model.train()  # Set the model to training mode
    for images, labels in train_loader: # Iterate through batches of training data
        images, labels = images.to(device), labels.to(device) # Move data to GPU if available
        optimizer.zero_grad()  # Zero gradients to clear previous gradients
        outputs = model(images)  # Forward pass
        loss = criterion(outputs, labels)  # Compute the loss
        loss.backward()  # Backward pass to compute gradients
        optimizer.step()  # Update model parameters using the optimizer

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")  # Print the loss for the current epoch"""

# Evaluate the model on the test set
model.eval() # Set the model to evaluation mode
correct = 0  # Initialize the number of correctly predicted samples
total = 0  # Initialize the total number of samples

least_confidence_list = [] # Initialize an empty list to store least confidence values
prediction_entropy_list = [] # Initialize an empty list to store prediction entropy values
margin_sampling_list = [] # Initialize an empty list to store margin sampling values
cosine_similarity_list = [] # Initialize an empty list to store cosine similarity values
l2_norm_list = [] # Initialize an empty list to store L2 norm values
kl_divergence_list = [] # Initialize an empty list to store KL Divergence values

# Use torch.no_grad() to disable gradient computation during testing
with torch.no_grad():
    for images, labels in test_loader: # Iterate over batches in the test loader
        images, labels = images.to(device), labels.to(device)  # Move data to GPU if available
        outputs = model(images)  # Forward pass
        _, predicted = torch.max(outputs.data, 1)  # Get predicted class labels
        total += labels.size(0)  # Increment total number of images
        correct += (predicted == labels).sum().item()  # Count correctly predicted images
        # Diversity measures

        # Extract features from the model layers
        features = model.conv1(images)  # Apply convolutional layer 1 to the input images
        features = model.relu(features) # Apply Rectified Linear Unit (ReLU) activation function to introduce non-linearity
        features = model.maxpool(features) # Apply max pooling to reduce spatial dimensions and retain important information
        features = model.layer1(features)  # Use layer1 instead of conv2 for ResNet
        features = model.layer2(features)  # Use layer2 instead of conv2 for ResNet
        features = model.avgpool(features)  # Use avgpool instead of maxpool for ResNet

        # Flatten the features for distance calculations
        features = features.view(features.size(0), -1)

        calculate_metrics(outputs, features) # Calculate metrics function call 

accuracy = correct / total  # Calculate accuracy
print(f"Test Accuracy: {accuracy * 100:.2f}%")  # Print test accuracy

# Print the average values of uncertainty and diversity measures
print(f"Average Least Confidence: {torch.mean(torch.stack(least_confidence_list))}") # Print the average value of least confidence
print(f"Average Prediction Entropy: {torch.mean(torch.stack(prediction_entropy_list))}") # Print the average value of prediction entropy
print(f"Average Margin Sampling: {torch.mean(torch.stack(margin_sampling_list))}") # Print the average value of margin sampling
print(f"Average Cosine Similarity: {torch.mean(torch.stack(cosine_similarity_list))}") # Print the average value of cosine similarity
print(f"Average L2 Norm: {torch.mean(torch.stack(l2_norm_list))}") # Print the average value of L2 norm
print(f"Average KL Divergence: {torch.mean(torch.stack(kl_divergence_list))}") # Print the average value of KL divergence

Files already downloaded and verified
Files already downloaded and verified
Epoch 1/10, Loss: 0.3159460723400116
Epoch 2/10, Loss: 0.6475083827972412
Epoch 3/10, Loss: 0.2746032178401947
Epoch 4/10, Loss: 0.7377022504806519
Epoch 5/10, Loss: 0.736390233039856
Epoch 6/10, Loss: 0.8800309300422668
Epoch 7/10, Loss: 0.4674454927444458
Epoch 8/10, Loss: 0.9689621925354004
Epoch 9/10, Loss: 0.44302937388420105
Epoch 10/10, Loss: 0.3406006991863251
Test Accuracy: 84.41%
Average Least Confidence: 0.0828772708773613
Average Prediction Entropy: 0.32879531383514404
Average Margin Sampling: 0.11347316950559616
Average Cosine Similarity: 0.6799415349960327
Average L2 Norm: 0.6122885346412659
Average KL Divergence: 0.3962635397911072


### TRY 6 : USINNG ANOTHER PRETRAINED MODEL : RESNET152 FOR ACHIEVING BETTER ACCURACY

In [1]:
# Importing necessary libraries 
# This includes the essential PyTorch and torchvision modules for working with neural networks and datasets.
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from torchvision import models
import torch.nn.functional as F
from sklearn.metrics.pairwise import pairwise_distances
from scipy.stats import entropy

# Step 1: Set device and hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check if GPU is available, else use CPU
batch_size = 64  # Number of images in each mini-batch
learning_rate = 0.001  # Learning rate for the optimizer
epochs = 5 # Number of times to iterate through the entire dataset during training
m = 5  # Number of nearest neighbors for diversity measures 

# Step 2: Load and preprocess the CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert images to PyTorch tensors
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize image pixel values
])

# Download and create training and test datasets
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) # Create training dataset
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform) # Create test dataset

# Create DataLoader instances to efficiently load and iterate over batches of data
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Step 3: Initialize the pre-trained DenseNet model
# Initialize the DenseNet121 model with pre-trained weights from the ImageNet dataset
# model = models.efficientnet_v2_m(weights='IMAGENET1K_V1')
model = models.resnet152(weights='IMAGENET1K_V2')
# Modify the classifier for CIFAR-10 (10 classes)
model.classifier = nn.Linear(1024, 10)  # Change the output layer to have 10 classes for CIFAR-10
model = model.to(device)  # Move the model to the GPU (if available)

# Step 4: Set up loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Loss function for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # Adam optimizer with specified learning rate

# Creating a function to calculate uncertainty for each epoch metrics
def calculate_uncertainty_metrics(outputs):
    # Convert NumPy array to PyTorch tensor
    outputs_tensor = torch.from_numpy(outputs)
    # Applying softmax along dimension 1
    probabilities = F.softmax(outputs_tensor, dim=1)
    # Least Confidence: 1 - Maximum probability for each sample
    least_confidence = 1 - probabilities.max(dim=1).values.cpu().detach().numpy()
    # Handling NaN in prediction entropy
    current_probs = probabilities.clone().detach()
    current_probs[current_probs == 0] = 1e-10  # Adding a small epsilon to avoid log(0)
    # Prediction Entropy: Negative sum of (probability * log(probability)) for each class
    prediction_entropy = -torch.sum(current_probs * torch.log(current_probs), dim=1).cpu().detach().numpy()
    # Margin Sampling: 1 - (Maximum probability - Minimum probability) for each sample
    margin_sampling = 1 - torch.max(probabilities, dim=1).values.cpu().detach().numpy() - \
                      torch.min(probabilities, dim=1).values.cpu().detach().numpy()
    # Returning least confidence,prediction entropy and margin sampling values obtained
    return least_confidence, prediction_entropy, margin_sampling

# Creating a function to caalculate diversiy metrics
def calculate_diversity_metrics(features, m=5):
    # Calculate pairwise distances using cosine similarity
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    # Cosine Similarity: 1 - Mean cosine similarity with the top m neighbors for each sample
    cosine_similarity = 1 - feature_distances[:, 1:m+1].mean(axis=1)
    # Calculate pairwise distances using L2 (Euclidean) norm
    l2_distances = pairwise_distances(features.cpu().detach().numpy(), metric='euclidean')
    # L2 Norm: Mean L2 norm with the top m neighbors for each sample
    l2_norm = l2_distances[:, 1:m+1].mean(axis=1)
    # Returning cosine similarity and l2 norm values obtained 
    return cosine_similarity, l2_norm

# Creating a function to calculate kl divergence 
def calculate_kl_divergence(outputs, feature_distances, m=5):
    # List to store KL divergence scores for each sample
    kl_divergence = []
    # Iterate over each sample in the outputs
    for i in range(len(outputs)):
        # Calculate the probability distribution of the current sample
        current_sample_prob = F.softmax(outputs[i], dim=0)
        # Get the indices of the top m neighbors for the current sample
        neighbor_indices = feature_distances[i, 1:m+1].astype(int)
        # Calculate the average probability distribution of the neighbors
        neighbors_prob = torch.mean(F.softmax(outputs[neighbor_indices], dim=1), dim=0)

        # Add a small epsilon to avoid zero probabilities
        epsilon = 1e-10
        current_sample_prob = current_sample_prob + epsilon
        neighbors_prob = neighbors_prob + epsilon

        # Calculate KL divergence between the current sample and its neighbors
        kl_divergence.append(F.kl_div(torch.log(current_sample_prob), neighbors_prob, reduction='batchmean'))
    
    # Returning kl divergence values obtained
    return kl_divergence

# Creating a Function to calculate uncertainty and diversity metrics
def calculate_metrics(outputs, features, m=5):
    # Calculate uncertainty metrics
    least_confidence, prediction_entropy, margin_sampling = calculate_uncertainty_metrics(outputs.detach().numpy())
    # Extend lists with uncertainty metrics
    least_confidence_list.extend(torch.from_numpy(least_confidence))
    prediction_entropy_list.extend(torch.from_numpy(prediction_entropy))
    margin_sampling_list.extend(torch.from_numpy(margin_sampling))
    # Calculate diversity metrics
    feature_distances = pairwise_distances(features.cpu().detach().numpy(), metric='cosine')
    features_normalized = F.normalize(features, p=2, dim=1)
    cosine_similarity, l2_norm = calculate_diversity_metrics(features_normalized)
    # cosine_similarity, l2_norm = calculate_diversity_metrics(features)
    # Extend lists with diversity metrics
    cosine_similarity_list.extend(torch.from_numpy(cosine_similarity))
    # Inside the calculate_metrics function
    l2_norm_list.extend(torch.from_numpy(l2_norm))
    # Calculate KL divergence scores
    kl_divergence_scores = calculate_kl_divergence(outputs, feature_distances, m=5)
    # Extend the list with KL divergence scores
    kl_divergence_list.extend(kl_divergence_scores)

# Train the model
for epoch in range(epochs): # Iterate over each epoch
    model.train()  # Set the model to training mode
    for images, labels in train_loader: # Iterate through batches of training data
        images, labels = images.to(device), labels.to(device) # Move data to GPU if available
        optimizer.zero_grad()  # Zero gradients to clear previous gradients
        outputs = model(images)  # Forward pass
        loss = criterion(outputs, labels)  # Compute the loss
        loss.backward()  # Backward pass to compute gradients
        optimizer.step()  # Update model parameters using the optimizer

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")  # Print the loss for the current epoch"""

# Evaluate the model on the test set
model.eval() # Set the model to evaluation mode
correct = 0  # Initialize the number of correctly predicted samples
total = 0  # Initialize the total number of samples

least_confidence_list = [] # Initialize an empty list to store least confidence values
prediction_entropy_list = [] # Initialize an empty list to store prediction entropy values
margin_sampling_list = [] # Initialize an empty list to store margin sampling values
cosine_similarity_list = [] # Initialize an empty list to store cosine similarity values
l2_norm_list = [] # Initialize an empty list to store L2 norm values
kl_divergence_list = [] # Initialize an empty list to store KL Divergence values

# Use torch.no_grad() to disable gradient computation during testing
with torch.no_grad():
    for images, labels in test_loader: # Iterate over batches in the test loader
        images, labels = images.to(device), labels.to(device)  # Move data to GPU if available
        outputs = model(images)  # Forward pass
        _, predicted = torch.max(outputs.data, 1)  # Get predicted class labels
        total += labels.size(0)  # Increment total number of images
        correct += (predicted == labels).sum().item()  # Count correctly predicted images
        
        # Diversity measures
        # Extract features from the model layers
        features = model.conv1(images)  # Apply convolutional layer 1 to the input images
        features = model.relu(features) # Apply Rectified Linear Unit (ReLU) activation function to introduce non-linearity
        features = model.maxpool(features) # Apply max pooling to reduce spatial dimensions and retain important information
        features = model.layer1(features)  # Use layer1 instead of conv2 for ResNet
        features = model.layer2(features)  # Use layer2 instead of conv2 for ResNet
        features = model.avgpool(features)  # Use avgpool instead of maxpool for ResNet

        # Flatten the features for distance calculations
        features = features.view(features.size(0), -1)

        calculate_metrics(outputs, features) # Calculate metrics function call 

accuracy = correct / total  # Calculate accuracy
print(f"Test Accuracy: {accuracy * 100:.2f}%")  # Print test accuracy

# Print the average values of uncertainty and diversity measures
print(f"Average Least Confidence: {torch.mean(torch.stack(least_confidence_list))}") # Print the average value of least confidence
print(f"Average Prediction Entropy: {torch.mean(torch.stack(prediction_entropy_list))}") # Print the average value of prediction entropy
print(f"Average Margin Sampling: {torch.mean(torch.stack(margin_sampling_list))}") # Print the average value of margin sampling
print(f"Average Cosine Similarity: {torch.mean(torch.stack(cosine_similarity_list))}") # Print the average value of cosine similarity
print(f"Average L2 Norm: {torch.mean(torch.stack(l2_norm_list))}") # Print the average value of L2 norm
print(f"Average KL Divergence: {torch.mean(torch.stack(kl_divergence_list))}") # Print the average value of KL divergence

Files already downloaded and verified
Files already downloaded and verified
Epoch 1/5, Loss: 1.2921116352081299
Epoch 2/5, Loss: 0.9806951284408569
Epoch 3/5, Loss: 0.3718707263469696
Epoch 4/5, Loss: 0.11784355342388153
Epoch 5/5, Loss: 0.47144776582717896
Test Accuracy: 82.18%
Average Least Confidence: 0.12839721143245697
Average Prediction Entropy: 0.3821943402290344
Average Margin Sampling: 0.1283971667289734
Average Cosine Similarity: 0.9807947278022766
Average L2 Norm: 0.18331636488437653
Average KL Divergence: 0.007689288351684809
