## Task 1

The general approach taken for task 1 is as follows.

1) We process the images using the pre-trained Vision Transformer (vit-base-patch16-224-in21k) and save the extracted features.
2) After this, we classify with a simple LwP (Learning with Prototypes) classifier, using cosine similarity to compare each sample with the class prototypes.
3) We then update the prototypes with a confidence-based approach: each prototype becomes a running weighted average of the current prototype and the newly computed means, where the new samples are weighted by their cosine-similarity-based confidence.

In [1]:
import torch
from torchvision import transforms
from transformers import ViTModel, ViTFeatureExtractor
from PIL import Image
import os
import numpy as np
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Load the data from the .pth file
def load_data(file_path):
    """Load one dataset file and return its images and labels.

    Args:
        file_path: Path to a file written with ``torch.save`` that holds a
            mapping with ``'data'`` and ``'targets'`` entries.

    Returns:
        A ``(data, targets)`` tuple. Entries stored as NumPy arrays are
        converted to ``torch.Tensor``; any other type is returned as stored.
    """
    # The files are expected to hold NumPy arrays rather than plain tensors,
    # which the restricted "weights only" unpickler rejects. Being explicit
    # keeps the loader working on PyTorch releases where ``weights_only``
    # defaults to True and silences the FutureWarning on earlier ones.
    # SECURITY: full unpickling can execute arbitrary code -- only point this
    # at dataset files from a trusted source.
    t = torch.load(file_path, weights_only=False)

    # Extract data and targets (assumes they are numpy arrays)
    data, targets = t['data'], t['targets']

    # Convert to PyTorch tensors if they are numpy arrays
    if isinstance(data, np.ndarray):
        data = torch.tensor(data)

    if isinstance(targets, np.ndarray):
        targets = torch.tensor(targets)

    return data, targets

In [3]:
# 4. Extract features from images using ViT
def extract_features(model, images, batch_size=32):
    """Run ``model`` over ``images`` in mini-batches and collect features.

    Args:
        model: Callable accepting a ``pixel_values`` keyword and returning an
            object with a ``last_hidden_state`` attribute indexed as
            (batch, token, hidden).
        images: Sliceable batch of preprocessed images (first axis = sample).
        batch_size: Number of images sent through the model per forward pass.

    Returns:
        A tensor with one row per image: the hidden state at token position 0
        (the [CLS] position for a ViT encoder), concatenated across batches.
    """
    batch_starts = range(0, len(images), batch_size)
    # Gradients are never needed for feature extraction, so disable autograd
    # for every forward pass.
    with torch.no_grad():
        first_token_states = [
            model(pixel_values=images[start:start + batch_size]).last_hidden_state[:, 0, :]
            for start in batch_starts
        ]
    return torch.cat(first_token_states)


In [4]:
# 3. Load the Vision Transformer (ViT) model
def load_vit_model(model_name="google/vit-base-patch16-224-in21k"):
    """Load a pre-trained ViT encoder and put it in evaluation mode.

    Args:
        model_name: Checkpoint identifier handed to
            ``ViTModel.from_pretrained``. Defaults to the ViT-B/16 checkpoint
            this notebook was written against.

    Returns:
        The loaded ``ViTModel`` with ``eval()`` already applied.
    """
    model = ViTModel.from_pretrained(model_name)
    model.eval()  # Set to evaluation mode
    return model

In [5]:
# 2. Preprocess the images
def preprocess_images(data, image_size=224,
                      mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """Resize and normalise a batch of images for the ViT encoder.

    Args:
        data: Image tensor; assumed to be laid out as (N, H, W, C) with values
            in [0, 255] -- TODO confirm against the dataset files.
        image_size: Side length (pixels) every image is resized to.
        mean: Per-channel mean subtracted after scaling to [0, 1].
            Defaults to the ImageNet statistics used originally.
        std: Per-channel standard deviation divided out after scaling.
            Defaults to the ImageNet statistics used originally.

    Returns:
        A float tensor of shape (N, C, image_size, image_size).
    """
    # NOTE(review): the checkpoint paired with this pipeline may expect
    # different normalisation statistics (its Hugging Face image-processor
    # config is the reference) -- confirm before relying on the defaults.
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size)),  # Resize to ViT input size
        transforms.Normalize(mean=list(mean), std=list(std)),
    ])

    # Channels-first layout and [0, 1] range are what the transforms expect.
    data = data.permute(0, 3, 1, 2) / 255.0
    # Transform image by image, then stack the results back into one batch.
    data = torch.stack([transform(img) for img in data])

    return data

In [6]:
# 5. Main function
def main(file_path, output_file="vit_features.pth"):
    """Extract ViT features for one dataset file and save them to disk.

    Runs the full pipeline: load the raw data, preprocess the images, load the
    ViT model, extract features, and save ``{'features', 'targets'}`` with
    ``torch.save``. The model is loaded afresh on every call.

    Args:
        file_path: Path of the dataset file passed to ``load_data``.
        output_file: Destination path for the saved features and targets.
    """
    # Step 1: Load the data
    data, targets = load_data(file_path)

    # Step 2: Preprocess the data
    preprocessed_data = preprocess_images(data)

    # Step 3: Load the ViT model
    vit_model = load_vit_model()

    # Step 4: Extract features
    features = extract_features(vit_model, preprocessed_data)

    # Step 5: Save the features and targets. torch.save does not create
    # missing parent directories, so make sure the target folder exists.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    torch.save({'features': features, 'targets': targets}, output_file)


In [None]:
# Extract and cache features for the train and eval split of datasets 1..20.
for i in range(1, 21):
    for file_path, output_file in (
        (f'dataset/part_one_dataset/train_data/{i}_train_data.tar.pth',
         f"ExtractedFeatures/vit_features_train_{i}.pth"),
        (f'dataset/part_one_dataset/eval_data/{i}_eval_data.tar.pth',
         f"ExtractedFeatures/vit_features_eval_{i}.pth"),
    ):
        main(file_path, output_file=output_file)

We can now load the pre-made features for task 1 and task 2.

In [None]:
# Number of classes in CIFAR-10
num_classes = 10

# Training split of dataset 1: row-normalise the cached features so that
# cosine similarity can be used downstream.
data = torch.load("ExtractedFeatures/vit_features_train_1.pth")
features = data['features']
targets = data['targets']
X_train, Y_train = normalize(features, axis=1), targets

# Evaluation split of dataset 1, prepared the same way.
data = torch.load("ExtractedFeatures/vit_features_eval_1.pth")
features = data['features']
targets = data['targets']
X_test, Y_test = normalize(features, axis=1), targets

  data = torch.load("ExtractedFeatures/ExtractedFeatures/vit_features_train_1.pth")
  data = torch.load("ExtractedFeatures/ExtractedFeatures/vit_features_eval_1.pth")


In [9]:
# Build one prototype per class (the mean training feature of that class)
# and record how many samples contributed to each mean.
sizes = np.zeros(num_classes)
prototypes = np.zeros((num_classes, X_train.shape[1]))
for cls in range(num_classes):
    class_features = X_train[Y_train == cls]
    sizes[cls] = len(class_features)
    prototypes[cls] = class_features.mean(axis=0)
# Row-normalise the prototypes so they are ready for cosine similarity
prototypes = normalize(prototypes, axis=1)


At each step we also record the cosine similarity to the closest prototype (class mean), which tells us how confident we are that the point belongs to the predicted label.

In [10]:
# Classification function
def classify(sample, prototypes):
    """Assign ``sample`` to its most similar prototype.

    Args:
        sample: Feature vector of a single sample (reshaped to one row).
        prototypes: Array with one prototype per row.

    Returns:
        ``(label, confidence)``: the row index of the prototype with the
        highest cosine similarity, and that highest similarity value.
    """
    query = sample.reshape(1, -1)
    scores = cosine_similarity(query, prototypes)
    best_label = np.argmax(scores)
    best_score = np.max(scores)  # The winning similarity doubles as confidence
    return best_label, best_score

In [11]:
# Score the initial prototypes on the first evaluation split
accuracies = []
correct_predictions = 0
for i, sample in enumerate(X_test):
    prediction, _ = classify(sample, prototypes)
    # A match counts as one hit, a mismatch as zero
    correct_predictions += bool(prediction == Y_test[i])

accuracy = correct_predictions / len(X_test)
accuracies.append(accuracy)

In [12]:
# History of accuracy lists; the first entry is the evaluation after dataset 1
all_accuracies = [accuracies]

Now we take a weighted average to update the means. This ensures that wrong predictions are devalued while updating the means.

In [13]:
def update_prototype(data_new, prototypes, sizes):
    """Fold a new batch of features into the class prototypes.

    Each row of ``data_new['features']`` is row-normalised and pseudo-labelled
    with ``classify`` against the incoming prototypes; any labels stored in
    ``data_new`` are not read. For every class that received at least one
    sample, the prototype becomes

        (prototype * size + sum_i(conf_i * x_i)) / (size + sum_i(conf_i))

    where ``conf_i`` is the cosine-similarity confidence of sample ``x_i``.
    The result is row-normalised again before being returned.

    Args:
        data_new: Mapping with a ``'features'`` entry, one row per sample.
        prototypes: Array with one prototype row per class.
        sizes: Per-class weight accumulated so far (sample counts initially,
            then grown by the summed confidences of each update).

    Returns:
        ``(prototypes, sizes)`` as new arrays; the arguments passed in are
        left untouched.
    """
    X_train = normalize(data_new['features'], axis=1)

    # Work on copies so the caller's arrays are never left half-updated
    # (the in-place version exposed un-normalised prototype rows).
    prototypes = np.array(prototypes, dtype=float)
    sizes = np.array(sizes, dtype=float)
    # Derive the class count from the prototypes themselves instead of
    # relying on a module-level constant.
    n_classes = prototypes.shape[0]

    preds = np.zeros(len(X_train), dtype=int)
    confidences = np.zeros(len(X_train))

    # Classify each sample and compute confidence. Every sample is labelled
    # against the incoming prototypes because updates only start afterwards.
    for idx, sample in enumerate(X_train):
        preds[idx], confidences[idx] = classify(sample, prototypes)

    # Update prototypes with weighted contributions
    for cls in range(n_classes):
        class_indices = np.where(preds == cls)[0]
        if len(class_indices) == 0:
            continue  # No sample was assigned to this class; keep it as is

        class_features = X_train[class_indices]
        class_confidences = confidences[class_indices].reshape(-1, 1)  # Column for broadcasting

        # Confidence-weighted sum of features and total confidence mass
        weighted_sum = np.sum(class_features * class_confidences, axis=0)
        weighted_count = np.sum(class_confidences)

        # Running weighted average of the old prototype and the new samples
        prototypes[cls] = (
            prototypes[cls] * sizes[cls] + weighted_sum
        ) / (sizes[cls] + weighted_count)
        sizes[cls] += weighted_count  # Grow the size by the sum of confidences

    # Normalize prototypes for cosine similarity
    prototypes = normalize(prototypes, axis=1)
    return prototypes, sizes

In [None]:
def holdout_accuracy(i, prototypes):
    """Score ``prototypes`` on the cached evaluation sets 1 through ``i``.

    Args:
        i: Index of the last evaluation set to include (1-based, inclusive).
        prototypes: Array with one prototype row per class, passed to
            ``classify``.

    Returns:
        A list of ``i`` accuracies, one per evaluation set, in order.
    """
    accuracies = []
    for j in range(1, i + 1):
        data = torch.load(f"ExtractedFeatures/vit_features_eval_{j}.pth")
        X_test = normalize(data['features'], axis=1)
        Y_test = data['targets']

        correct_predictions = 0
        # A distinct index name keeps the ``i`` parameter from being
        # overwritten by the per-sample loop.
        for idx, sample in enumerate(X_test):
            prediction, _ = classify(sample, prototypes)
            if prediction == Y_test[idx]:
                correct_predictions += 1

        accuracies.append(correct_predictions / len(X_test))
    return accuracies

In [None]:
# Continual-learning loop over datasets 2..10: update the prototypes with each
# new training set, then re-score them on every evaluation set seen so far.
for i in range(2,11):
    # Cached ViT features for training set i
    data = torch.load(f"ExtractedFeatures/vit_features_train_{i}.pth")
    # Rebind the globals to the updated prototypes and accumulated weights
    prototypes, sizes = update_prototype(data, prototypes, sizes)

    # Accuracies on evaluation sets 1..i, appended as one row of the history
    accuracies = holdout_accuracy(i, prototypes)
    all_accuracies.append(accuracies)    


  data = torch.load(f"ExtractedFeatures/ExtractedFeatures/vit_features_train_{i}.pth")
  data = torch.load(f"ExtractedFeatures/ExtractedFeatures/vit_features_eval_{j}.pth")
  data = torch.load(f"ExtractedFeatures/ExtractedFeatures/vit_features_eval_{j}.pth")
  data = torch.load(f"ExtractedFeatures/ExtractedFeatures/vit_features_train_{i}.pth")
  data = torch.load(f"ExtractedFeatures/ExtractedFeatures/vit_features_eval_{j}.pth")
  data = torch.load(f"ExtractedFeatures/ExtractedFeatures/vit_features_eval_{j}.pth")
  data = torch.load(f"ExtractedFeatures/ExtractedFeatures/vit_features_eval_{j}.pth")
  data = torch.load(f"ExtractedFeatures/ExtractedFeatures/vit_features_train_{i}.pth")
  data = torch.load(f"ExtractedFeatures/ExtractedFeatures/vit_features_eval_{j}.pth")
  data = torch.load(f"ExtractedFeatures/ExtractedFeatures/vit_features_eval_{j}.pth")
  data = torch.load(f"ExtractedFeatures/ExtractedFeatures/vit_features_eval_{j}.pth")
  data = torch.load(f"ExtractedFeatures/ExtractedFe

In [16]:
# Persist the final prototypes and their accumulated per-class weights
for filename, array in (("prototypes.npy", prototypes), ("sizes.npy", sizes)):
    np.save(filename, array)

In [17]:
# Show the accuracy history: one row per dataset processed so far, where the
# row for dataset k lists the accuracies on evaluation sets 1..k.
all_accuracies

[[0.8876],
 [0.8868, 0.8944],
 [0.8852, 0.8956, 0.9052],
 [0.8868, 0.8948, 0.9052, 0.9072],
 [0.8856, 0.8948, 0.906, 0.9064, 0.8996],
 [0.8856, 0.892, 0.906, 0.9072, 0.8996, 0.8972],
 [0.8856, 0.8932, 0.9064, 0.906, 0.8992, 0.8976, 0.8944],
 [0.8856, 0.8924, 0.9064, 0.9072, 0.8996, 0.8964, 0.8948, 0.8936],
 [0.8852, 0.892, 0.906, 0.9068, 0.8996, 0.8964, 0.8952, 0.8936, 0.898],
 [0.8856,
  0.8924,
  0.9052,
  0.9072,
  0.8992,
  0.8968,
  0.8952,
  0.8928,
  0.8976,
  0.9008]]

Thus we see that the loss of accuracy on older datasets is marginal, and for some of them accuracy even improves, suggesting that confidence-based updating gives us better estimates of the label means and thereby improves our prototypes.