# Part A - (1) Feature Extraction
The code has been modified to split the data into 3 sets: Training (70%), Calibration (15%) and Testing (15%) using sklearn's test_train_split function.

In [None]:
import torch
import timm
from torchvision import models, transforms
from sklearn.model_selection import train_test_split
from PIL import Image
import numpy as np
from tqdm import tqdm
import pandas as pd
import os

# 1. Load pretrained Vision Transformer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vit = timm.create_model("vit_base_patch16_224", pretrained=True).to(device)
vit.eval() # disable dropout, etc.

# 2. Define preprocessing consistent with ImageNet training
preprocess = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
),
])

# 3. Directory containing CelebA images (modify path)
image_dir = "/home/arty/projects/cs342_dataset/img_align_celeba"
image_list = sorted(os.listdir(image_dir))[:20000] # sample subset

# Get labeled attributes from dataset
attr_file = "/home/arty/projects/cs342_dataset/list_attr_celeba.csv"
attr_df = pd.read_csv(attr_file)

# Split images into train (70%). calib (15%), test (15%)
train_set, remaining_set = train_test_split(
    image_list, 
    test_size=0.30, 
    random_state=42, # Use a fixed random_state for reproducibility
    shuffle=True
)
calib_set, test_set = train_test_split(
    remaining_set, 
    test_size=0.50, # 50% of the remaining 30% = 15% of the total
    random_state=42, # Using the same random_state is good practice
    shuffle=True
)

sets = [train_set, calib_set, test_set]

# 4. Extract embeddings
for split_name, image_list in zip(["train", "calib", "test"], sets):
    print(f"Processing {split_name} set with {len(image_list)} images...")

    all_features = []
    with torch.no_grad():
        for fname in tqdm(image_list):
            img = Image.open(os.path.join(image_dir, fname)).convert("RGB")
            x = preprocess(img).unsqueeze(0).to(device)
            features = vit.forward_features(x)[:, 0, :] # shape: (1, 768)
            all_features.append(features.cpu().numpy())
    all_features = np.concatenate(all_features, axis=0)

    # Add 'Smiling' class label as the last column
    smiling_labels = []
    for fname in image_list:
        index = int(fname.split('.')[0]) 
        smiling_label = 1 if attr_df.at[index, 'Smiling'] == 1 else 0
        smiling_labels.append(smiling_label)
    smiling_labels = np.array(smiling_labels).reshape(-1, 1)
    all_features = np.hstack((all_features, smiling_labels))

    print("Feature matrix shape:", all_features.shape) # e.g. (20000, 768 + 1)

    # 5. Save for later use
    np.save(f"celeba_vit_embeddings_{split_name}.npy", all_features)

# Part A - (2) Train Classifiers
Implement a Naive Bayes classifier to predict the Smiling attribute
(1 = smiling, 0 = not smiling) and report test accuracy. The Naive Bayes classifier should
be implemented by you and not be a call to a library function. The features here
are numerical (not categorical). So conditioned on the label, model each feature as an
independent Gaussian. Your algorithm should estimate the mean and variance of each of
these Gaussians, using the training data set

In [55]:
import pandas as pd
import numpy as np
from math import exp
from math import sqrt
from math import pi

train_data = pd.DataFrame(np.load("./celeba_vit_embeddings_train.npy"))
train_data.head()

num_features = train_data.shape[1] - 1

# First separate the data by class
pos_df = train_data[train_data[num_features]==1]
neg_df = train_data[train_data[num_features]==0]

# Now we calculate the mean and std deviation for each feature in both classes
pos_std_devs = pos_df.std()
pos_means = pos_df.mean()
neg_std_devs = neg_df.std()
neg_means = neg_df.mean()

# Calculate PDF for a given input feature vector
def calc_probs(vector):
    total_rows = train_data.shape[0]
    pos_probability = pos_df.shape[0] / total_rows
    print(vector, pos_probability)
    
    for i in range(num_features):
        mean, stddev = pos_means[i], pos_std_devs[i]
        exponent = exp(- (( vector[i] - mean ) ** 2) / (2 * (stddev ** 2)))
        print(exponent)
        pos_probability *= (1 / (sqrt(2 * pi) * stddev)) * exponent
        print(pos_probability)
        
    neg_probability = neg_df.shape[0] / total_rows
    for i in range(num_features):
        mean, stddev = neg_means[i], neg_std_devs[i]
        exponent = exp(- (( vector[i] - mean ) ** 2 / 2 * (stddev ** 2)))
        neg_probability *= (1 / (sqrt(2 * pi) * stddev)) * exponent

    return pos_probability, neg_probability
    
# Run predictions on the training set
print(calc_probs(list(pos_df.iloc[0])))

[1.30446195602417, 2.2370095252990723, -1.0959649085998535, 3.5435221195220947, -0.32467082142829895, 0.5419005751609802, -1.6719592809677124, 3.1564881801605225, 1.7463539838790894, -1.1957743167877197, 1.3328677415847778, 1.2399859428405762, 0.7305935621261597, -0.8814429044723511, 1.5555787086486816, 0.9649535417556763, -0.5953223705291748, -0.47772765159606934, 2.0204246044158936, -0.06251582503318787, 3.1298110485076904, 0.7472555637359619, -1.0814709663391113, 0.10577128827571869, 0.7043524384498596, -3.2460856437683105, 0.6094692945480347, -0.38852065801620483, -4.020289421081543, -1.4311262369155884, -0.33749449253082275, -0.7883509397506714, -2.0449607372283936, -2.167358636856079, -0.5493782162666321, 1.2516337633132935, -0.5461427569389343, 2.626941204071045, -3.3853137493133545, -1.7989264726638794, 2.4696829319000244, -1.0445243120193481, 2.2145121097564697, 1.3821587562561035, -1.2239160537719727, 0.018366938456892967, -1.746224045753479, -1.0477486848831177, -0.197500109