In [None]:
import torchvision
import torchvision.transforms as transforms
import torch 
import torch.nn.functional as F
from torch import nn
from torch import optim
from torchvision.models import resnet18
from torchvision.models import resnet50
from urllib.request import urlopen

import numpy as np
import sklearn
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import normalize
from sklearn.svm import OneClassSVM
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, auc
import pandas as pd
import time

# Run on the first CUDA GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"

In [None]:
def K9_dataloader(dataset="fMNIST",batch_size=16,shuffle_train_set=False):
  """Download a dataset and wrap its train/test splits in DataLoaders.

  COMP6248 Team K9 Density Estimators dataloader.
  Author: Ian
  Reviewed by: tbc

  Every image is resized to 256, centre-cropped to 224, converted to a tensor
  and normalised with the ImageNet channel mean/std.  For "fMNIST" a
  Grayscale(3) step is inserted before ToTensor so that the three-channel
  normalisation can be applied.

  Args:
    dataset: one of "fMNIST", "CIFAR10", "CIFAR100".
    batch_size: batch size used for both loaders.
    shuffle_train_set: passed as `shuffle` to the train loader only; the test
      loader is built with the DataLoader default.

  Returns:
    (train_loader, test_loader) as torch.utils.data.DataLoader objects.

  Raises:
    ValueError: if `dataset` is not one of the supported names.
  """
  # dataset name -> (class name inside torchvision.datasets, download root)
  dataset_specs = {
      "fMNIST":   ("FashionMNIST", "./data/FashionMNIST"),
      "CIFAR10":  ("CIFAR10",      "./data/CIFAR10"),
      "CIFAR100": ("CIFAR100",     "./data/CIFAR100"),
  }
  # Fail early with a clear message instead of hitting an unbound `train_set` later on.
  if dataset not in dataset_specs:
    raise ValueError(f"Unknown dataset {dataset!r}; expected one of {sorted(dataset_specs)}.")

  #Setup the preprocess function
  steps = [transforms.Resize(256),      #resize 256, then centercrop 224; as per augment.py line 93 and line 97
           transforms.CenterCrop(224)]
  if dataset == "fMNIST":
    steps.append(transforms.Grayscale(3))   #no Grayscale for CIFAR10/CIFAR100
  steps += [transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])]   #this is in one of their python scripts, and is standard for ImageNet
  preprocess_input = transforms.Compose(steps)

  #Download the data
  cls_name, root = dataset_specs[dataset]
  dataset_cls = getattr(torchvision.datasets, cls_name)
  train_set = dataset_cls(root=root,train=True,download=True,transform=preprocess_input)
  test_set = dataset_cls(root=root,train=False,download=True,transform=preprocess_input)

  #Setup the loaders
  train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size,shuffle=shuffle_train_set)
  test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

  print()
  print(f"train_loader batches: {len(train_loader)}, test_loader batches: {len(test_loader)}, batch_size: {batch_size}")
  print(f"train_shuffle set to {shuffle_train_set}")
  print(f"train_set length: {len(train_set)}, test_set length: {len(test_set)}")
  print(f"train_loader and test_loader ready for {dataset}.")
  print()
  return train_loader, test_loader

In [None]:
def _K9_loader_to_arrays(feature_extractor_model, loader, loader_name):
  """Push every batch of `loader` through the extractor and return (features, labels) arrays."""
  batch_fts, batch_lbs = [], []
  # Pure inference: without no_grad() an autograd graph would be built for every batch.
  with torch.no_grad():
    for i, (dt, lb) in enumerate(loader):
      if i % 500 == 0: print(f"Extracting batch {i} from {loader_name}.")
      tempfts = feature_extractor_model(dt.to(device))
      # Flatten each sample's pooled feature map into one row.
      batch_fts.append(tempfts.reshape(tempfts.shape[0], -1).detach().cpu().numpy())
      batch_lbs.append(lb.detach().cpu().numpy().reshape(-1))

  if not batch_fts:   # empty loader: np.concatenate would raise on an empty list
    return np.array([]), np.array([])
  return np.concatenate(batch_fts), np.concatenate(batch_lbs)


def K9_resnet1850_ftex(train_loader, test_loader, mdltype="resnet18", pretrained=False):
  """Extract ResNet features for every sample of the train and test loaders.

  COMP6248 Team K9 Density Estimators feature extractor for resnet 18/50.
  Based upon https://github.com/ecs-vlc/COMP6248/blob/master/docs/labs/lab6/genfeats.py
  Author: Ian
  Reviewed by: tbc

  The network's last two child modules are dropped and replaced by an
  AdaptiveAvgPool2d((1,1)); the pooled output is flattened to one vector per
  sample.  The model is put in eval mode and run under torch.no_grad() on the
  notebook-level `device`.

  Args:
    train_loader: iterable of (image_batch, label_batch) tensors.
    test_loader: iterable of (image_batch, label_batch) tensors.
    mdltype: "resnet18" or "resnet50".
    pretrained: forwarded to the torchvision model constructor.

  Returns:
    (train_ft, train_lb, test_ft, test_lb) numpy arrays: one feature row and
    one label per sample, in loader order.

  Raises:
    ValueError: if `mdltype` is not "resnet18" or "resnet50".
  """
  # Validate first so a typo fails immediately instead of as an unbound `model`.
  if mdltype not in ("resnet18", "resnet50"):
    raise ValueError(f"Unknown mdltype {mdltype!r}; expected 'resnet18' or 'resnet50'.")

  # Release memory left over from a previous run before building a new model.
  import gc
  torch.cuda.empty_cache(); gc.collect()

  build_model = resnet18 if mdltype == "resnet18" else resnet50
  model = build_model(pretrained=pretrained)

  feature_extractor_model = nn.Sequential(*list(model.children())[:-2], nn.AdaptiveAvgPool2d((1,1)))
  feature_extractor_model.eval()
  feature_extractor_model = feature_extractor_model.to(device)

  train_ft, train_lb = _K9_loader_to_arrays(feature_extractor_model, train_loader, "train_loader")
  test_ft, test_lb = _K9_loader_to_arrays(feature_extractor_model, test_loader, "test_loader")

  print()
  print(f"train_ft shape: {train_ft.shape}")
  print(f"train_lb shape: {train_lb.shape}")
  print(f"test_ft shape:  {test_ft.shape}")
  print(f"test_lb shape:  {test_lb.shape}")
  print(f"Feature extraction from {mdltype} (pretrained={pretrained}), is complete.")
  print()

  return train_ft, train_lb, test_ft, test_lb

In [None]:
def K9_stratified_class_sample(data_ft, data_lb, samp_per_cls=500, random_seed = False):
  """Draw the same number of samples from every class.

  Args:
    data_ft: 2D array of features, one row per sample.
    data_lb: 1D array of class labels aligned with the rows of `data_ft`.
    samp_per_cls: number of rows to draw for each distinct label.
    random_seed: False or None draws from numpy's global random state;
      any other value is converted with int() and seeds a private
      RandomState so that the draw is reproducible.

  Returns:
    (data_ft_sampled, data_lb_sampled), grouped by class in the order given by
    np.unique(data_lb), with `samp_per_cls` rows per class.

  Rows are drawn without replacement whenever a class has at least
  `samp_per_cls` members, so the sample holds no duplicate rows.  Smaller
  classes are drawn with replacement so the requested count is still returned.
  """
  classes = np.unique(data_lb)
  print(f"Choosing {samp_per_cls} samples per class.  data_lb has {len(classes)} classes.")

  # `is False` (identity) keeps 0 usable as a real seed.
  if random_seed is False or random_seed is None:
    rng = np.random
  else:
    rng = np.random.RandomState(int(random_seed))

  per_class_idx = []
  for cls in classes:
    members = np.where(data_lb == cls)[0]
    needs_replacement = samp_per_cls > len(members)
    per_class_idx.append(rng.choice(members, size=samp_per_cls, replace=needs_replacement))
  idxx = np.concatenate(per_class_idx) if per_class_idx else np.array([],dtype="int")

  data_ft_sampled, data_lb_sampled = data_ft[idxx,:], data_lb[idxx]

  #check
  print("Summary of data_lb_sampled class sample size:")
  for i, cls in enumerate(classes, start=1):
    # ten entries per printed line
    line_end = "\n" if i % 10 == 0 else " | "
    print(f"Cls {cls}: {np.sum(data_lb_sampled == cls)}",end=line_end)

  return data_ft_sampled, data_lb_sampled

In [None]:
def K9_OCSVM_v2(X_train, y_train, X_test, y_test, kernel_type='rbf'):
  """Fit one One-Class SVM per training class and score it on the test set.

  COMP6248 Team K9 Density Estimators reproduction of OC-SVM;
  logic based on paper's train.py specification.
  Author: Ian
  Reviewed by: Marios

  For each label in y_train an OC-SVM is fitted on the L2-normalised training
  rows of that class only.  The whole (L2-normalised) test set is then scored,
  with test rows of that class treated as inliers and all others as outliers.

  Args:
    X_train, y_train: training features (2D) and labels (1D).
    X_test, y_test: test features (2D) and labels (1D).
    kernel_type: 'rbf' selects the RBF kernel; any other value selects the
      linear kernel.

  Returns:
    1D array with one test AUC per training class, in np.unique(y_train) order.
  """
  print("Starting K9 OC-SVM, kernel_type = "+kernel_type)
  print(f"X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}")
  print(f"X_test.shape: {X_test.shape}, y_test.shape: {y_test.shape}")
  print()

  # The test set is the same for every class, so normalise it once up front
  # instead of once per loop iteration.
  X_test_normalised = normalize(X_test,norm='l2',axis=1)

  auc_per_class = []
  for one_class in np.unique(y_train):
    #Normalise the rows of this class
    OC_X_train = X_train[y_train==one_class,:]
    OC_X_train_normalised = normalize(OC_X_train,norm='l2',axis=1)

    #Fit the OC-SVM
    # NOTE: the paper's gamma heuristic is currently disabled, so the rbf kernel
    # runs with the library's default gamma.  Kept here for reference:
    #gamma = 10/(np.var(OC_X_train_normalised) * OC_X_train_normalised.shape[1])
    #clf = OneClassSVM(kernel='rbf', gamma=gamma)
    clf = OneClassSVM(kernel='rbf' if kernel_type == 'rbf' else 'linear')
    clf = clf.fit(OC_X_train_normalised)

    #Use fitted model to make predictions on test
    is_inlier = (y_test==one_class)
    y_test_pred = clf.predict(X_test_normalised)               # +1 inlier / -1 outlier
    y_test_pred_scores = clf.score_samples(X_test_normalised)  # continuous score used for the AUC
    y_test_pred_AUC = roc_auc_score(1.*is_inlier,y_test_pred_scores)

    #save AUC
    auc_per_class.append(y_test_pred_AUC)

    #Print out results
    y_test_tp, y_test_tn = np.sum(y_test_pred[is_inlier] == 1), np.sum(y_test_pred[~is_inlier] == -1)
    y_test_fp, y_test_fn = np.sum(y_test_pred[~is_inlier] == 1), np.sum(y_test_pred[is_inlier] == -1)
    y_test_n_inlier, y_test_n_outlier = np.sum(is_inlier), np.sum(~is_inlier)
    y_test_fpr, y_test_tpr = y_test_fp/(y_test_fp+y_test_tn), y_test_tp/(y_test_tp+y_test_fn)
    print(f"Class: {one_class}")
    print(f"AUC score: {y_test_pred_AUC: .4f}, Accuracy: {(y_test_tp+y_test_tn)/len(y_test): .4f}")
    print(f"OC_X_train shape: {OC_X_train.shape}, X_test shape: {X_test.shape}.")
    print(f"Results after model application to *test* set: n_inlier: {y_test_n_inlier} ; n_outlier: {y_test_n_outlier}")
    print(f"TP: {y_test_tp}, TN: {y_test_tn}, FP: {y_test_fp}, FN: {y_test_fn}")
    print(f"TPR: {y_test_tpr}, FPR: {y_test_fpr}")
    print()

  per_class_auc = np.array(auc_per_class, dtype=float)
  print(f"Unweighted mean AUC: {np.mean(per_class_auc): .4f}")
  return per_class_auc

In [None]:
def K9_pd_runsummary(**overrides):
  """Collect the parameters and per-class AUCs of a run into a one-column DataFrame.

  Called with no arguments, every value is read from the notebook globals of
  the same name (the original behaviour).  Any of them may instead be passed
  explicitly as a keyword argument, which takes precedence over the global.

  Values used: dataset, batch_size, mdltype, pretrained, samp_per_cls,
  random_seed, kernel_type, X_train, y_train, X_test, y_test, res.

  Returns:
    DataFrame with a single column: 14 rows of run parameters (the seven
    settings, the six array sizes, then the mean of `res`) followed by one
    row per entry of `res`.

  Raises:
    TypeError: if an unexpected keyword is supplied.
    NameError: if a required value is neither passed nor defined globally.
  """
  needed = ("dataset", "batch_size", "mdltype", "pretrained", "samp_per_cls", "random_seed",
            "kernel_type", "X_train", "y_train", "X_test", "y_test", "res")

  unexpected = sorted(set(overrides) - set(needed))
  if unexpected:
    raise TypeError(f"K9_pd_runsummary() got unexpected keyword argument(s): {unexpected}")

  scope = globals()
  # Report every missing value at once rather than failing on the first one.
  missing = [name for name in needed if name not in overrides and name not in scope]
  if missing:
    raise NameError(f"K9_pd_runsummary() needs these values, as keywords or notebook globals: {missing}")

  v = {name: (overrides[name] if name in overrides else scope[name]) for name in needed}

  run_params  = pd.DataFrame([v["dataset"], v["batch_size"], v["mdltype"], v["pretrained"],
                        v["samp_per_cls"], v["random_seed"], v["kernel_type"],
                        v["X_train"].shape[0], v["X_train"].shape[1], v["y_train"].shape[0],
                        v["X_test"].shape[0], v["X_test"].shape[1], v["y_test"].shape[0],
                        np.mean(v["res"])])
  run_results = pd.DataFrame(v["res"])
  run_summary = pd.concat([run_params,run_results],ignore_index=True)
  return run_summary

### Single Run on fMNIST ResNet18

In [None]:
# Minimal example of loading and running one run
train_loader, test_loader = K9_dataloader(dataset="fMNIST", batch_size=256)   #LOAD the data
train_ft, train_lb, test_ft, test_lb = K9_resnet1850_ftex(train_loader, test_loader, mdltype="resnet18",pretrained=False)   #EXTRACT the features
X_train, y_train = K9_stratified_class_sample(train_ft,train_lb)   #REDUCE the train set by stratified sampling
X_test, y_test   = test_ft,  test_lb                               #KEEP test set the same size
# K9_OCSVM_v2 is the OC-SVM routine defined in this notebook; the name K9_OCSVM
# is not defined anywhere, so calling it raised NameError on a fresh kernel.
res = K9_OCSVM_v2(X_train, y_train, X_test, y_test, kernel_type='linear')

### Single Run on CIFAR10 ResNet50

In [None]:
# CIFAR Run example
# NOTE: the configuration names below (dataset, batch_size, samp_per_cls, random_seed,
# mdltype, pretrained, kernel_type) and the X_train / y_train / X_test / y_test / res
# variables can all be picked up by K9_pd_runsummary() as notebook globals, so they
# must keep exactly these names and be assigned before that call.
import time   # already imported in the first cell; not used within this cell

dataset = "CIFAR10"
batch_size = 16
samp_per_cls = 5000
random_seed = False
mdltype = "resnet50"
pretrained = True
kernel_type = "rbf"

train_loader, test_loader = K9_dataloader(dataset=dataset,batch_size=batch_size)   #LOAD the data
train_ft, train_lb, test_ft, test_lb = K9_resnet1850_ftex(train_loader,test_loader,mdltype=mdltype,pretrained=pretrained) #EXTRACT the features
X_train, y_train = K9_stratified_class_sample(train_ft,train_lb,samp_per_cls=samp_per_cls,random_seed=random_seed) #REDUCE the sample size by stratified sampling
X_test, y_test = test_ft, test_lb                 #KEEP test set the same size
res = K9_OCSVM_v2(X_train, y_train, X_test, y_test, kernel_type)  #Apply OC-SVM; returns one AUC per class
run_summary = K9_pd_runsummary()  #Summarise the run (parameters followed by the per-class AUCs)

### Reproduction on all 3 datasets

In [None]:
# NOTE: this whole cell is commented out (disabled); nothing below executes.
# It sketches the full sweep: `reproductions` repeats x datasets x ResNet variants x
# kernels, writing one CSV per run plus a growing collection CSV.
# The CSV paths are hardcoded to /content/gdrive/MyDrive/COMP6248CW/ - adjust before enabling.

# reproductions = 5
# datasets = ["fMNIST", "CIFAR10", "CIFAR100"]
# batch_size = 16
# samp_per_cls = 500
# random_seed = False
# mdltypes = ["resnet18","resnet50"]
# save_runsummary = True
# save_runsummary_collection = True

# runsummary_collection = pd.DataFrame([])
# for reproduction in range(reproductions):
#   print(f"Beginning reproduction {reproduction} of {reproductions}.")
#   for dataset in datasets:
#     train_loader, test_loader = K9_dataloader(dataset=dataset,
#                                               batch_size=batch_size)   #LOAD the data
#     for mdltype in mdltypes:
#       pretrained = True if mdltype == "resnet50" else False   #resnet50 pretrained, resnet18 random weights/untrained
#       train_ft, train_lb, test_ft, test_lb = K9_resnet1850_ftex(train_loader,test_loader,mdltype=mdltype,pretrained=pretrained) #EXTRACT the features
#       X_train, y_train = K9_stratified_class_sample(train_ft,train_lb,samp_per_cls=samp_per_cls,random_seed=random_seed) #REDUCE the sample size by stratified sampling
#       X_test, y_test = test_ft, test_lb #KEEP test set the same size

#       for kernel_type in ["linear","rbf"]:
#         res = K9_OCSVM_v2(X_train, y_train, X_test, y_test, kernel_type)  #Apply OC-SVM (K9_OCSVM_v2 is the routine defined in this notebook)
        
#         run_summary = K9_pd_runsummary()  #Summarise the run
#         print(run_summary)
#         if save_runsummary: run_summary.to_csv("/content/gdrive/MyDrive/COMP6248CW/run_summary_"+time.strftime("%Y%m%d-%H%M%S")+".csv")

#         runsummary_collection = pd.concat([runsummary_collection,run_summary], axis = 1) #Append as a new column in runsummary
#         if save_runsummary_collection: runsummary_collection.to_csv("/content/gdrive/MyDrive/COMP6248CW/run_summary_collection"+time.strftime("%Y%m%d-%H%M%S")+".csv")