In [1]:
import os
import torch
import random
import torchvision
from PIL import Image
from torchvision import datasets, transforms
from torch import nn
from utils.utils import LoadDataset, RandomErasing, set_seed
from simclr.simclr_model import SimCLR
from byol.byol_model import BYOL
from moco.moco_model import MoCo
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, random_split, Subset
from sklearn.manifold import TSNE
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Replace the paths below with the locations of the datasets on your machine.
# NOTE(review): the second LoadDataset argument (50) is presumably the batch size - confirm in utils.utils.
cat_dog_source = LoadDataset("/home/jovyan/data/cat_dog/", 50)
cat_dog_dataset = cat_dog_source.load_data()

vehicles_source = LoadDataset("/home/jovyan/data/vehicles/", 50)
vehicles_dataset = vehicles_source.load_data()

clothing_source = LoadDataset("/home/jovyan/data/clothing/", 50)
clothing_dataset = clothing_source.load_data()

Number of images loaded: 2000
Number of images loaded: 6500
Number of images loaded: 3850


In [3]:
def support_vector_machine(feature_vectors, labels, cv_folds=5):

    """
    Support Vector Machine classifier (linear kernel) evaluated on a stratified
    75/25 train/test split and with k-fold cross-validation.
    Feature standardisation is part of the fitted pipeline, so the scaler only
    ever sees training samples (no leakage from the test split or from the
    validation part of a cross-validation fold).
    Parameters:
    feature_vectors: numpy array of shape (n_samples, n_features)
    labels: numpy array of shape (n_samples,)
    cv_folds: int, number of cross-validation folds
    Returns:
    accuracy_train: float, accuracy on the training set
    accuracy_test: float, accuracy on the test set
    cross_val_scores: numpy array of shape (cv_folds,), accuracy scores for each fold
    """

    # Imported locally so this cell does not depend on an edit of the import cell.
    from sklearn.pipeline import make_pipeline

    # Split the raw features; scaling happens inside the pipeline, after the split.
    X_train, X_test, y_train, y_test = train_test_split(
        feature_vectors, labels, test_size=0.25, random_state=42, stratify=labels
    )

    unique_labels = np.unique(labels)
    if len(unique_labels) == 2:
        svm = SVC(kernel='linear', random_state=42)
    else:
        svm = SVC(kernel='linear', decision_function_shape='ovr', random_state=42, break_ties=True)

    # The scaler is fitted on X_train only and re-applied unchanged to X_test.
    classifier = make_pipeline(StandardScaler(), svm)
    classifier.fit(X_train, y_train)

    accuracy_train = accuracy_score(y_train, classifier.predict(X_train))
    accuracy_test = accuracy_score(y_test, classifier.predict(X_test))

    # cross_val_score re-fits the whole pipeline (scaler included) on each fold's training part.
    cross_val_scores = cross_val_score(classifier, feature_vectors, labels, cv=cv_folds)

    return accuracy_train, accuracy_test, cross_val_scores

In [4]:
def random_forest(feature_vectors, labels, cv_folds=5):

    """
    Random Forest classifier evaluated on a stratified 75/25 train/test split
    and with k-fold cross-validation.
    Feature standardisation is part of the fitted pipeline, so the scaler only
    ever sees training samples (no leakage from the test split or from the
    validation part of a cross-validation fold).
    Parameters:
    feature_vectors: numpy array of shape (n_samples, n_features)
    labels: numpy array of shape (n_samples,)
    cv_folds: int, number of cross-validation folds
    Returns:
    accuracy_train: float, accuracy on the training set
    accuracy_test: float, accuracy on the test set
    cross_val_scores: numpy array of shape (cv_folds,), accuracy scores for each fold
    """

    # Imported locally so this cell does not depend on an edit of the import cell.
    from sklearn.pipeline import make_pipeline

    # Split the raw features; scaling happens inside the pipeline, after the split.
    X_train, X_test, y_train, y_test = train_test_split(
        feature_vectors, labels, test_size=0.25, random_state=42, stratify=labels
    )

    # The scaler is fitted on X_train only and re-applied unchanged to X_test.
    classifier = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))
    classifier.fit(X_train, y_train)

    accuracy_train = accuracy_score(y_train, classifier.predict(X_train))
    accuracy_test = accuracy_score(y_test, classifier.predict(X_test))

    # cross_val_score re-fits the whole pipeline (scaler included) on each fold's training part.
    cross_val_scores = cross_val_score(classifier, feature_vectors, labels, cv=cv_folds)

    return accuracy_train, accuracy_test, cross_val_scores

In [5]:
def k_nearest_neighbors(feature_vectors, labels, cv_folds=5):

    """
    K-Nearest Neighbors classifier (scikit-learn defaults) evaluated on a
    stratified 75/25 train/test split and with k-fold cross-validation.
    Feature standardisation is part of the fitted pipeline, so the scaler only
    ever sees training samples (no leakage from the test split or from the
    validation part of a cross-validation fold).
    Parameters:
    feature_vectors: numpy array of shape (n_samples, n_features)
    labels: numpy array of shape (n_samples,)
    cv_folds: int, number of cross-validation folds
    Returns:
    accuracy_train: float, accuracy on the training set
    accuracy_test: float, accuracy on the test set
    cross_val_scores: numpy array of shape (cv_folds,), accuracy scores for each fold
    """

    # Imported locally so this cell does not depend on an edit of the import cell.
    from sklearn.pipeline import make_pipeline

    # Split the raw features; scaling happens inside the pipeline, after the split.
    X_train, X_test, y_train, y_test = train_test_split(
        feature_vectors, labels, test_size=0.25, random_state=42, stratify=labels
    )

    # The scaler is fitted on X_train only and re-applied unchanged to X_test.
    classifier = make_pipeline(StandardScaler(), KNeighborsClassifier())
    classifier.fit(X_train, y_train)

    accuracy_train = accuracy_score(y_train, classifier.predict(X_train))
    accuracy_test = accuracy_score(y_test, classifier.predict(X_test))

    # cross_val_score re-fits the whole pipeline (scaler included) on each fold's training part.
    cross_val_scores = cross_val_score(classifier, feature_vectors, labels, cv=cv_folds)

    return accuracy_train, accuracy_test, cross_val_scores


In [6]:
def neural_network_classifier(feature_vectors, labels, cv_folds=5):

    """
    Neural Network classifier (MLP with one hidden layer of 100 ReLU units,
    Adam solver) evaluated on a stratified 75/25 train/test split and with
    k-fold cross-validation.
    Feature standardisation is part of the fitted pipeline, so the scaler only
    ever sees training samples (no leakage from the test split or from the
    validation part of a cross-validation fold).
    Parameters:
    feature_vectors: numpy array of shape (n_samples, n_features)
    labels: numpy array of shape (n_samples,)
    cv_folds: int, number of cross-validation folds
    Returns:
    accuracy_train: float, accuracy on the training set
    accuracy_test: float, accuracy on the test set
    cross_val_scores: numpy array of shape (cv_folds,), accuracy scores for each fold
    """

    # Imported locally so this cell does not depend on an edit of the import cell.
    from sklearn.pipeline import make_pipeline

    # Split the raw features; scaling happens inside the pipeline, after the split.
    X_train, X_test, y_train, y_test = train_test_split(
        feature_vectors, labels, test_size=0.25, random_state=42, stratify=labels
    )

    mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', random_state=42, max_iter=1000)
    # The scaler is fitted on X_train only and re-applied unchanged to X_test.
    classifier = make_pipeline(StandardScaler(), mlp)
    classifier.fit(X_train, y_train)

    accuracy_train = accuracy_score(y_train, classifier.predict(X_train))
    accuracy_test = accuracy_score(y_test, classifier.predict(X_test))

    # cross_val_score re-fits the whole pipeline (scaler included) on each fold's training part.
    cross_val_scores = cross_val_score(classifier, feature_vectors, labels, cv=cv_folds)

    return accuracy_train, accuracy_test, cross_val_scores

In [7]:
def extracting_feature_vectors_from_simclr(model_path, data_loader):

    """Function to extract feature vectors from a pre-trained SimCLR model
    The checkpoint is loaded into a SimCLR model built around a ResNet-18 whose
    last child module is removed; the flattened backbone output of every batch
    is collected as the feature vectors.
    Parameters:
    model_path: str, path to the pre-trained model
    data_loader: torch DataLoader, data loader for the dataset, yielding (images, label) batches
    Returns:
    feature_vectors: numpy array of shape (n_samples, n_features), extracted feature vectors
    labels: numpy array of shape (n_samples,), labels for the feature vectors
    Raises:
    ValueError: if data_loader yields no batches
    """

    set_seed(42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    resnet = torchvision.models.resnet18()
    backbone = nn.Sequential(*list(resnet.children())[:-1])
    model = SimCLR(backbone)
    # map_location remaps the stored tensors onto the device that is actually
    # available, so a checkpoint saved on a GPU also loads on a CPU-only machine.
    # NOTE(review): depending on the installed PyTorch version, torch.load may
    # unpickle arbitrary objects - only load checkpoints from a trusted source.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    feature_vectors = []
    labels = []

    with torch.no_grad():
        for images, label in data_loader:
            images = images.to(device)
            outputs = model.backbone(images).flatten(start_dim=1)
            feature_vectors.append(outputs.cpu().numpy())
            labels.append(label.numpy())

    # np.concatenate fails with an unrelated-looking message on an empty list;
    # raise the same exception type with an explanation instead.
    if not feature_vectors:
        raise ValueError("data_loader yielded no batches; cannot extract feature vectors")

    feature_vectors = np.concatenate(feature_vectors)
    labels = np.concatenate(labels)

    return feature_vectors, labels

In [8]:
# Paths to all pre-trained SimCLR models, grouped by training seed.
# Replace the directory in the template below with your own path.
simclr_models = {
    seed_label: [
        f"/home/jovyan/models/trained_models/{seed_dir}/simclr/simclr_model_{augmentation}.pth"
        for augmentation in (
            "center_cropping",
            "random_cropping",
            "color_jitter",
            "random_flipping",
            "random_perspective",
            "random_rotation",
            "random_grayscale",
            "gaussian_blur",
            "random_invert",
            "random_erasing",
        )
    ]
    # (dictionary key, directory name of that seed on disk)
    for seed_label, seed_dir in (
        ("seed 0", "seed_zero"),
        ("seed 42", "seed_42"),
        ("seed 123", "seed_123"),
    )
}

In [9]:
def get_simclr_svm_accuracies(models, dataset):

    """Evaluate every pre-trained SimCLR model with the SVM classifier
    Parameters:
    models: dict, maps a seed label to the list of paths of the pre-trained models
    dataset: torch DataLoader, data loader for the dataset (forwarded to the feature extraction)
    Returns:
    pandas DataFrame, one row per model with the columns seed, augmentation,
    accuracy_train, accuracy_test and cross_val_score
    """

    columns = ("seed", "augmentation", "accuracy_train", "accuracy_test", "cross_val_score")
    results = {column: [] for column in columns}

    # Flatten the {seed: [paths]} mapping into (seed, path) pairs, keeping the original order.
    checkpoints = ((seed, path) for seed, paths in models.items() for path in paths)
    for seed, checkpoint_path in checkpoints:
        # The "augmentation" value is the model file name without the '.pth' text.
        file_name = checkpoint_path.split('/')[-1]
        augmentation = file_name.replace('.pth', '')

        features, labels = extracting_feature_vectors_from_simclr(checkpoint_path, dataset)
        train_acc, test_acc, fold_scores = support_vector_machine(features, labels)

        row = (seed, augmentation, train_acc, test_acc, fold_scores)
        for column, value in zip(columns, row):
            results[column].append(value)

    return pd.DataFrame(results)

In [10]:
# Replace the path with the path where you want to save the accuracies
simclr_svm_cat_dog_accuracies_csv_path = "/home/jovyan/scripts/accuracies/simclr/simclr_svm_cat_dog_accuracies.csv"
# DataFrame.to_csv does not create missing directories; create the target
# directory before the long evaluation so the cell cannot fail at the save step.
os.makedirs(os.path.dirname(simclr_svm_cat_dog_accuracies_csv_path) or ".", exist_ok=True)
simclr_svm_cat_dog_accuracies_df = get_simclr_svm_accuracies(simclr_models, cat_dog_dataset)
simclr_svm_cat_dog_accuracies_df.to_csv(simclr_svm_cat_dog_accuracies_csv_path)

In [11]:
# Replace the path with the path where you want to save the accuracies
simclr_svm_vehicles_accuracies_csv_path = "/home/jovyan/scripts/accuracies/simclr/simclr_svm_vehicles_accuracies.csv"
# DataFrame.to_csv does not create missing directories; create the target
# directory before the long evaluation so the cell cannot fail at the save step.
os.makedirs(os.path.dirname(simclr_svm_vehicles_accuracies_csv_path) or ".", exist_ok=True)
simclr_svm_vehicles_accuracies_df = get_simclr_svm_accuracies(simclr_models, vehicles_dataset)
simclr_svm_vehicles_accuracies_df.to_csv(simclr_svm_vehicles_accuracies_csv_path)

In [12]:
# Replace the path with the path where you want to save the accuracies
simclr_svm_clothes_accuracies_csv_path = "/home/jovyan/scripts/accuracies/simclr/simclr_svm_clothes_accuracies.csv"
# DataFrame.to_csv does not create missing directories; create the target
# directory before the long evaluation so the cell cannot fail at the save step.
os.makedirs(os.path.dirname(simclr_svm_clothes_accuracies_csv_path) or ".", exist_ok=True)
simclr_svm_clothes_accuracies_df = get_simclr_svm_accuracies(simclr_models, clothing_dataset)
simclr_svm_clothes_accuracies_df.to_csv(simclr_svm_clothes_accuracies_csv_path)

In [13]:
def get_simclr_knn_accuracies(models, dataset):

    """Evaluate every pre-trained SimCLR model with the K-Nearest Neighbors classifier
    Parameters:
    models: dict, maps a seed label to the list of paths of the pre-trained models
    dataset: torch DataLoader, data loader for the dataset (forwarded to the feature extraction)
    Returns:
    pandas DataFrame, one row per model with the columns seed, augmentation,
    accuracy_train, accuracy_test and cross_val_score
    """

    columns = ("seed", "augmentation", "accuracy_train", "accuracy_test", "cross_val_score")
    results = {column: [] for column in columns}

    # Flatten the {seed: [paths]} mapping into (seed, path) pairs, keeping the original order.
    checkpoints = ((seed, path) for seed, paths in models.items() for path in paths)
    for seed, checkpoint_path in checkpoints:
        # The "augmentation" value is the model file name without the '.pth' text.
        file_name = checkpoint_path.split('/')[-1]
        augmentation = file_name.replace('.pth', '')

        features, labels = extracting_feature_vectors_from_simclr(checkpoint_path, dataset)
        train_acc, test_acc, fold_scores = k_nearest_neighbors(features, labels)

        row = (seed, augmentation, train_acc, test_acc, fold_scores)
        for column, value in zip(columns, row):
            results[column].append(value)

    return pd.DataFrame(results)

In [14]:
# Replace the path with the path where you want to save the accuracies
simclr_knn_cat_dog_accuracies_csv_path = "/home/jovyan/scripts/accuracies/simclr/simclr_knn_cat_dog_accuracies.csv"
# DataFrame.to_csv does not create missing directories; create the target
# directory before the long evaluation so the cell cannot fail at the save step.
os.makedirs(os.path.dirname(simclr_knn_cat_dog_accuracies_csv_path) or ".", exist_ok=True)
simclr_knn_cat_dog_accuracies_df = get_simclr_knn_accuracies(simclr_models, cat_dog_dataset)
simclr_knn_cat_dog_accuracies_df.to_csv(simclr_knn_cat_dog_accuracies_csv_path)

In [15]:
# Replace the path with the path where you want to save the accuracies
simclr_knn_vehicles_accuracies_csv_path = "/home/jovyan/scripts/accuracies/simclr/simclr_knn_vehicles_accuracies.csv"
# DataFrame.to_csv does not create missing directories; create the target
# directory before the long evaluation so the cell cannot fail at the save step.
os.makedirs(os.path.dirname(simclr_knn_vehicles_accuracies_csv_path) or ".", exist_ok=True)
simclr_knn_vehicles_accuracies_df = get_simclr_knn_accuracies(simclr_models, vehicles_dataset)
simclr_knn_vehicles_accuracies_df.to_csv(simclr_knn_vehicles_accuracies_csv_path)

In [16]:
# Replace the path with the path where you want to save the accuracies
simclr_knn_clothes_accuracies_csv_path = "/home/jovyan/scripts/accuracies/simclr/simclr_knn_clothes_accuracies.csv"
# DataFrame.to_csv does not create missing directories; create the target
# directory before the long evaluation so the cell cannot fail at the save step.
os.makedirs(os.path.dirname(simclr_knn_clothes_accuracies_csv_path) or ".", exist_ok=True)
simclr_knn_clothes_accuracies_df = get_simclr_knn_accuracies(simclr_models, clothing_dataset)
simclr_knn_clothes_accuracies_df.to_csv(simclr_knn_clothes_accuracies_csv_path)

In [17]:
def get_simclr_rf_accuracies(models, dataset):

    """Evaluate every pre-trained SimCLR model with the Random Forest classifier
    Parameters:
    models: dict, maps a seed label to the list of paths of the pre-trained models
    dataset: torch DataLoader, data loader for the dataset (forwarded to the feature extraction)
    Returns:
    pandas DataFrame, one row per model with the columns seed, augmentation,
    accuracy_train, accuracy_test and cross_val_score
    """

    columns = ("seed", "augmentation", "accuracy_train", "accuracy_test", "cross_val_score")
    results = {column: [] for column in columns}

    # Flatten the {seed: [paths]} mapping into (seed, path) pairs, keeping the original order.
    checkpoints = ((seed, path) for seed, paths in models.items() for path in paths)
    for seed, checkpoint_path in checkpoints:
        # The "augmentation" value is the model file name without the '.pth' text.
        file_name = checkpoint_path.split('/')[-1]
        augmentation = file_name.replace('.pth', '')

        features, labels = extracting_feature_vectors_from_simclr(checkpoint_path, dataset)
        train_acc, test_acc, fold_scores = random_forest(features, labels)

        row = (seed, augmentation, train_acc, test_acc, fold_scores)
        for column, value in zip(columns, row):
            results[column].append(value)

    return pd.DataFrame(results)

In [18]:
# Replace the path with the path where you want to save the accuracies
simclr_rf_cat_dog_accuracies_csv_path = "/home/jovyan/scripts/accuracies/simclr/simclr_rf_cat_dog_accuracies.csv"
# DataFrame.to_csv does not create missing directories; create the target
# directory before the long evaluation so the cell cannot fail at the save step.
os.makedirs(os.path.dirname(simclr_rf_cat_dog_accuracies_csv_path) or ".", exist_ok=True)
simclr_rf_cat_dog_accuracies_df = get_simclr_rf_accuracies(simclr_models, cat_dog_dataset)
simclr_rf_cat_dog_accuracies_df.to_csv(simclr_rf_cat_dog_accuracies_csv_path)

In [19]:
# Replace the path with the path where you want to save the accuracies
simclr_rf_vehicles_accuracies_csv_path = "/home/jovyan/scripts/accuracies/simclr/simclr_rf_vehicles_accuracies.csv"
# DataFrame.to_csv does not create missing directories; create the target
# directory before the long evaluation so the cell cannot fail at the save step.
os.makedirs(os.path.dirname(simclr_rf_vehicles_accuracies_csv_path) or ".", exist_ok=True)
simclr_rf_vehicles_accuracies_df = get_simclr_rf_accuracies(simclr_models, vehicles_dataset)
simclr_rf_vehicles_accuracies_df.to_csv(simclr_rf_vehicles_accuracies_csv_path)

In [20]:
# Replace the path with the path where you want to save the accuracies
simclr_rf_clothes_accuracies_csv_path = "/home/jovyan/scripts/accuracies/simclr/simclr_rf_clothes_accuracies.csv"
# DataFrame.to_csv does not create missing directories; create the target
# directory before the long evaluation so the cell cannot fail at the save step.
os.makedirs(os.path.dirname(simclr_rf_clothes_accuracies_csv_path) or ".", exist_ok=True)
simclr_rf_clothes_accuracies_df = get_simclr_rf_accuracies(simclr_models, clothing_dataset)
simclr_rf_clothes_accuracies_df.to_csv(simclr_rf_clothes_accuracies_csv_path)

In [21]:
def get_simclr_nn_accuracies(models, dataset):

    """Evaluate every pre-trained SimCLR model with the Neural Network classifier
    Parameters:
    models: dict, maps a seed label to the list of paths of the pre-trained models
    dataset: torch DataLoader, data loader for the dataset (forwarded to the feature extraction)
    Returns:
    pandas DataFrame, one row per model with the columns seed, augmentation,
    accuracy_train, accuracy_test and cross_val_score
    """

    columns = ("seed", "augmentation", "accuracy_train", "accuracy_test", "cross_val_score")
    results = {column: [] for column in columns}

    # Flatten the {seed: [paths]} mapping into (seed, path) pairs, keeping the original order.
    checkpoints = ((seed, path) for seed, paths in models.items() for path in paths)
    for seed, checkpoint_path in checkpoints:
        # The "augmentation" value is the model file name without the '.pth' text.
        file_name = checkpoint_path.split('/')[-1]
        augmentation = file_name.replace('.pth', '')

        features, labels = extracting_feature_vectors_from_simclr(checkpoint_path, dataset)
        train_acc, test_acc, fold_scores = neural_network_classifier(features, labels)

        row = (seed, augmentation, train_acc, test_acc, fold_scores)
        for column, value in zip(columns, row):
            results[column].append(value)

    return pd.DataFrame(results)

In [22]:
# Replace the path with the path where you want to save the accuracies
simclr_nn_cat_dog_accuracies_csv_path = "/home/jovyan/scripts/accuracies/simclr/simclr_nn_cat_dog_accuracies.csv"
# DataFrame.to_csv does not create missing directories; create the target
# directory before the long evaluation so the cell cannot fail at the save step.
os.makedirs(os.path.dirname(simclr_nn_cat_dog_accuracies_csv_path) or ".", exist_ok=True)
simclr_nn_cat_dog_accuracies_df = get_simclr_nn_accuracies(simclr_models, cat_dog_dataset)
simclr_nn_cat_dog_accuracies_df.to_csv(simclr_nn_cat_dog_accuracies_csv_path)

In [23]:
# Replace the path with the path where you want to save the accuracies
simclr_nn_vehicles_accuracies_csv_path = "/home/jovyan/scripts/accuracies/simclr/simclr_nn_vehicles_accuracies.csv"
# DataFrame.to_csv does not create missing directories; create the target
# directory before the long evaluation so the cell cannot fail at the save step.
os.makedirs(os.path.dirname(simclr_nn_vehicles_accuracies_csv_path) or ".", exist_ok=True)
simclr_nn_vehicles_accuracies_df = get_simclr_nn_accuracies(simclr_models, vehicles_dataset)
simclr_nn_vehicles_accuracies_df.to_csv(simclr_nn_vehicles_accuracies_csv_path)

In [24]:
# Replace the path with the path where you want to save the accuracies
simclr_nn_clothes_accuracies_csv_path = "/home/jovyan/scripts/accuracies/simclr/simclr_nn_clothes_accuracies.csv"
# DataFrame.to_csv does not create missing directories; create the target
# directory before the long evaluation so the cell cannot fail at the save step.
os.makedirs(os.path.dirname(simclr_nn_clothes_accuracies_csv_path) or ".", exist_ok=True)
simclr_nn_clothes_accuracies_df = get_simclr_nn_accuracies(simclr_models, clothing_dataset)
simclr_nn_clothes_accuracies_df.to_csv(simclr_nn_clothes_accuracies_csv_path)