<a href="https://colab.research.google.com/github/BRomans/IdMind/blob/main/svm_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [176]:
# Run this cell to load the required libraries (Google Drive is mounted in a later cell).
import numpy as np
from numpy.random import choice
from matplotlib import pyplot as plt
from google.colab import drive
import os
from sklearn.svm import SVC
from sklearn.base import clone
from sklearn.metrics import accuracy_score
import pandas as pd
import itertools
import random
from time import time

In [81]:
# Single seed shared by every random number source configured in this cell.
seed_value = 10

# Export the hash seed through the environment. NOTE(review): Python reads
# PYTHONHASHSEED at interpreter start-up, so this presumably only influences
# processes launched after this point -- confirm if hash ordering matters here.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# Seed Python's built-in `random` module and NumPy's global generator
# (the one behind `numpy.random.choice`, which this notebook imports).
for seed_fn in (random.seed, np.random.seed):
  seed_fn(seed_value)

In [82]:
# Mount Google Drive inside the Colab runtime so the data files below can be read.
drive.mount('/content/drive')
# Folder holding the .npy train/validation arrays loaded later in this notebook;
# the trailing slash matters because file names are appended by string concatenation.
dirpath = "/content/drive/MyDrive/ml2-eeg-biometrics/train-test-data/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [215]:
class SVM_one_v_all:
  """One-vs-all multi-class classifier assembled from binary probabilistic SVCs.

  One `SVC(probability=True)` is trained per class value, using all examples
  of that class as positives and a random subset of the remaining examples as
  negatives. At prediction time each sample is assigned the class whose binary
  model reports the highest positive-class probability.
  """

  def __init__(self, C=1.0, kernel='rbf', degree=3, random_state=None):
    """Store the hyper-parameters and build the template SVC cloned by `fit`."""
    self.C = C
    self.kernel = kernel
    self.degree = degree
    self.random_state = random_state
    # Template estimator; fit() clones it once per class so models stay independent.
    self.svm = SVC(C=C, kernel=kernel, degree=degree, random_state=random_state, probability=True)

  def _build_binary_subset(self, x_train, y_train, c, size_factor):
    """Return (x_subset, y_subset) for the binary problem "class c vs the rest".

    All rows labelled `c` come first (label 1), followed by a random sample,
    without replacement, of rows from other classes (label 0). The number of
    negatives is `len(positives) * size_factor`, capped at the number of
    negatives actually available.
    """
    c_inds = np.where(y_train == c)[0]
    non_c_inds = np.where(y_train != c)[0]

    # Cap on a local value so the caller's size_factor is never modified.
    n_negative = min(int(len(c_inds) * size_factor), len(non_c_inds))
    non_c_inds = choice(non_c_inds, size=n_negative, replace=False)

    n_positive = len(c_inds)
    x_subset = np.zeros((n_positive + len(non_c_inds), x_train.shape[1]))
    y_subset = np.zeros(n_positive + len(non_c_inds))

    x_subset[:n_positive, :] = x_train[c_inds, :]
    x_subset[n_positive:, :] = x_train[non_c_inds, :]
    y_subset[:n_positive] = 1

    return x_subset, y_subset

  def fit(self, x_train, y_train, size_factor = 3, verbose=True):
    """Train one binary SVC per distinct value in `y_train`.

    Parameters:
      x_train: 2-D array of features, one row per example.
      y_train: 1-D array of class labels aligned with `x_train`.
      size_factor: negatives sampled per positive example for each binary
        model (capped per class by the number of available negatives).
      verbose: when True, print each class value as its model is trained.

    The fitted models are stored in `self.svm_list`, in the same order as
    `self.class_values`.
    """
    self.y_train = y_train
    class_values = np.unique(y_train)
    self.class_values = class_values
    svm_list = []

    for c in class_values:
      if verbose: print(c)

      # size_factor is passed unchanged for every class; previously a cap
      # applied for one class leaked into all subsequent classes.
      x_subset, y_subset = self._build_binary_subset(x_train, y_train, c, size_factor)

      clf = clone(self.svm)
      clf.fit(x_subset, y_subset)

      svm_list.append(clf)

    self.svm_list = svm_list

  def predict(self, x_test, verbose=True):
    """Predict a class for each row of `x_test`.

    Each binary model scores every sample; a sample takes the class of the
    model with the highest positive-class probability. Returns a float array
    of class values (samples for which every model reports probability 0
    keep the initial value 0).

    After scoring, the fraction of samples whose winning probability exceeds
    0.5 is printed; this diagnostic is printed regardless of `verbose`, which
    only controls the per-class progress output.
    """
    preds = np.zeros(len(x_test))
    max_probs = np.zeros(len(x_test))
    class_values = self.class_values

    for i, model in enumerate(self.svm_list):
      if verbose: print(class_values[i])
      probs = model.predict_proba(x_test)[:,1]

      # Samples for which this class beats the best probability seen so far.
      better = probs > max_probs
      preds[better] = class_values[i]
      max_probs[better] = probs[better]

    # Guard against an empty test set, which would otherwise divide by zero.
    if len(self.svm_list) > 0 and len(max_probs) > 0:
      print(sum(max_probs>0.5)/len(max_probs))

    return preds

  def grid_search(self, x_train, y_train, x_valid, y_valid, params, data_name=''):
    """Fit and score the model for every combination of hyper-parameters.

    Parameters:
      x_train, y_train: training features and labels.
      x_valid, y_valid: validation features and labels.
      params: dict with the keys 'kernel', 'C', 'gamma' and 'degree', each
        mapping to a non-empty sequence of candidate values.
      data_name: free-text tag written as the first field of each result row.

    For each combination, train and validation accuracy are printed and one
    comma-separated row is appended to the results file on Drive. After the
    call, `self.svm` and `self.svm_list` reflect the last combination tried.

    Raises:
      ValueError: if any of the four candidate lists is empty.
    """
    # `or`-style check per key. The previous `a==0 | b==0 | ...` expression was
    # parsed as a chained comparison and only fired when all lists were empty.
    for name in ('kernel', 'C', 'gamma', 'degree'):
      if len(params[name]) == 0:
        raise ValueError("At least one value must be supplied for kernel, C, gamma, and degree.")

    count=0
    filepath = '/content/drive/MyDrive/ml2-eeg-biometrics/classification-results.csv'

    for kernel in params['kernel']:
      for C in params['C']:
        for gamma in params['gamma']:
          for degree in params['degree']:
            print("-"*40)
            print("{}: kernel: {}, C: {}, gamma: {}, degree: {}".format(count, kernel, C, gamma, degree))
            self.svm = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, random_state=self.random_state, probability=True)

            print('Fitting....')
            self.fit(x_train, y_train, verbose=False)

            print('Calculating results....')
            train_pred = self.predict(x_train, verbose=False)
            train_acc = accuracy_score(y_train, train_pred)
            print("train_acc: {0:.3f}".format(train_acc))

            valid_pred = self.predict(x_valid, verbose=False)
            valid_acc = accuracy_score(y_valid, valid_pred)
            print("validation_acc: {0:.3f}".format(valid_acc))

            timestamp = pd.Timestamp.now()
            line = ','.join(map(str, [data_name,count,timestamp,kernel,C,gamma,degree,train_acc,valid_acc]))
            print(line)
            # Append mode creates the file when it does not exist yet.
            with open(filepath, 'a') as file:
              file.write(line + '\n')

            count+=1

In [165]:
# Structure to evaluate classification performance
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def print_results(Y_test, predictions, label_names):
    """Print classification metrics and draw the confusion matrix.

    Parameters:
        Y_test: true labels.
        predictions: predicted labels, aligned with `Y_test`.
        label_names: sequence of display names for the classes, or None.
            They are used as axis tick labels only when their count matches
            the size of the confusion matrix; otherwise the default numeric
            ticks are kept.

    Prints the scikit-learn classification report and the overall accuracy,
    then plots the confusion matrix with each non-zero count written in its
    cell. Returns nothing.
    """
    print(classification_report(Y_test, predictions))
    print("Classification Accuracy: {0:.3f}".format(accuracy_score(Y_test, predictions)))

    conf_mat = confusion_matrix(Y_test, predictions)

    # A single figure only: the earlier extra plt.figure() call left an empty
    # 6x6 canvas in the notebook output.
    plt.figure(figsize=(12,12))
    plt.imshow(np.array(conf_mat), cmap=plt.cm.summer, interpolation='nearest')
    for i, row in enumerate(conf_mat):
        for j, c in enumerate(row):
            if c>0:
                # Small offsets roughly centre the count inside its cell.
                plt.text(j-.2, i+.1, c, fontsize=16)

    plt.title('Confusion Matrix')

    # Label the axes with class names when they line up with the matrix.
    if label_names is not None and len(label_names) == conf_mat.shape[0]:
        tick_positions = range(len(label_names))
        plt.xticks(tick_positions, label_names, rotation=90)
        plt.yticks(tick_positions, label_names)

In [4]:
# Load one set of data: encoded features, labels and per-sample identifiers.
x_train = np.load(dirpath + 'train_encoding_model_2021-01-19 19:36:47.206950.npy')
x_valid = np.load(dirpath + 'valid_encoding_model_2021-01-19 19:36:47.206950.npy')

# Labels are flattened to 1-D vectors.
y_train = np.load(dirpath + 'y_train.npy').reshape((-1,))

# y_valid is stored as a pickled object array, so it is cast to int64 after loading.
# NOTE: allow_pickle=True can execute arbitrary code; only load trusted files this way.
y_valid = np.array(
    np.load(dirpath + 'y_valid.npy', allow_pickle=True).reshape((-1,)),
    dtype='int64',
)

# Identifier tables, reshaped to five columns per sample.
id_train = np.load(dirpath + 'id_train.npy', allow_pickle=True).reshape((-1, 5))
id_valid = np.load(dirpath + 'id_valid.npy', allow_pickle=True).reshape((-1, 5))

In [227]:
# Single-point "grid": one RBF configuration evaluated on the full data set.
params = dict(kernel=['rbf'], C=[10.0], gamma=[0.00277], degree=[3])

svm = SVM_one_v_all(random_state=0)

# Time only the search itself (model construction above is excluded).
prev = time()
svm.grid_search(x_train, y_train, x_valid, y_valid, params=params)
elapsed = round(time() - prev, 5)

print(elapsed, " seconds")

----------------------------------------
0: kernel: rbf, C: 10.0, gamma: 0.00277, degree: 3
Fitting....
Calculating results....
0.8859714928732183
train_acc: 0.567
0.8338461538461538
validation_acc: 0.392
,0,2021-01-22 00:01:36.250324,rbf,10.0,0.00277,3,0.5674418604651162,0.3923076923076923
83.50564  seconds


In [76]:
id_train_df = pd.DataFrame(id_train, columns=['Subject', 'Date','Run','Task','Trial'])

# Keep a single session (date) per subject.
# NOTE(review): the original comment and the name `min_dates` say "minimum",
# but the aggregation is a max, i.e. each subject's LATEST date. The max is
# kept here to preserve results -- confirm which one is intended.
# Selecting the column explicitly replaces `.max('Date')`, which passed 'Date'
# as the `numeric_only` argument rather than choosing the column.
df = id_train_df.groupby(by='Subject', as_index=False)['Date'].max()
min_dates = df.astype(str).agg('-'.join, axis=1).values                                 # "<Subject>-<Date>" key per subject


def _subject_date_keys(ids):
  """Build the "<Subject>-<Date>" key for every row of an identifier table."""
  return ids[:,0] + '-' + ids[:,1].astype(str)


# Boolean mask of training rows that belong to the selected session.
rel_ind = np.isin(_subject_date_keys(id_train), min_dates)

x_train_1session = x_train[rel_ind]
y_train_1session = y_train[rel_ind]

# Same selection for validation rows, using the sessions chosen from the training ids.
rel_ind = np.isin(_subject_date_keys(id_valid), min_dates)

x_valid_1session = x_valid[rel_ind]
y_valid_1session = y_valid[rel_ind]