# Support Vector Machine - Find best one
This notebook compares the cross-validation results of the tested SVM configurations to find the one that works best for our data.

author = Caroline Magg <br>
date = 30 August 2020 <br> 

______________________________________
history (dates are written as YYYY-DD-MM): <br>
2020-03-09 SVM KFold test <br>
2020-11-09 SVM KFold test with fixed bug <br>
2020-13-09 Get best SVM for KFold <br>
2020-13-09 Test best SVM for KFold with Testset <br>
2020-13-09 Extract best SVM <br>

In [None]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd#
import cv2
import scipy
import logging as log
import skimage.segmentation as segmentation
from scipy.spatial.distance import cdist

In [None]:
# Configure the root logger: show INFO and above, formatted as "LEVEL:message".
log.basicConfig(
    level=log.INFO,
    format='%(levelname)s:%(message)s',
)

### Add dependencies

In [None]:
# Make the KidsBrainProject main folder and the sibling utils folder importable.
for rel_dir in ('../../', '../utils/'):
    sys.path.append(os.path.abspath(rel_dir))

In [None]:
# add path to data here
# (the commented-out lines are alternative data folders; only the last,
#  uncommented assignment is active)
#path_data = "../../Data/features/"
#path_data = "../support_vector_machine/"
# NOTE(review): the folder name suggests standardized features -- confirm
path_data = "../../Data/features_standardized/"

In [None]:
from sklearn import svm
from sklearn.model_selection import KFold
import random
from sklearn.multioutput import MultiOutputRegressor

In [None]:
from SVMSearch import SVMSearch

In [None]:
# Collect the CSV files in `path_data` and split them by filename into
# error-metric tables and feature tables.
# os.listdir() returns entries in arbitrary order, so the listing is sorted:
# later cells pair features and errors positionally with zip(), and that
# pairing must be deterministic across machines and runs.
all_files = sorted(filename for filename in os.listdir(path_data) if filename.endswith(".csv"))
all_files_error = [os.path.join(path_data, fn) for fn in all_files if 'error_metrics' in fn]
all_files_features = [os.path.join(path_data, fn) for fn in all_files if 'features' in fn]
if len(all_files_error) != len(all_files_features):
    # zip() silently truncates to the shorter list, which would hide a
    # missing file and could shift the features/errors pairing.
    log.warning('found %d error-metric files but %d feature files in %s',
                len(all_files_error), len(all_files_features), path_data)
# display how many files of each kind were found
len(all_files_error), len(all_files_features)

In [None]:
# position of the file pair to inspect
idx = 0
# display the error-metrics path and the features path at that position
all_files_error[idx], all_files_features[idx]

# Get data range

In [None]:
# Print the value range (min, max, max - min) of every error-metric column.
# Only the first features/errors file pair is inspected (note the `break`).
for idx, (fn_features, fn_errors) in enumerate(zip(all_files_features, all_files_error)):
    # get data
    features = pd.read_csv(fn_features, sep=';')
    errors = pd.read_csv(fn_errors, sep=';')
    print(fn_features)
    print(' feature numbers', len(features), len(errors))
    X = np.array(features)
    y = np.array(errors)
    print(' splits', np.shape(X), np.shape(y))
    # go through all error metrics: iterate over the columns that are actually
    # present instead of a hard-coded count of 5
    for i in range(y.shape[1]):
        # np.min/np.max are used for both ends so NaN is handled the same way
        col_min = np.min(y[:, i])
        col_max = np.max(y[:, i])
        print('  error metrics:', i, errors.keys()[i])
        print('   ', col_min, col_max, col_max - col_min)
    break

In [None]:
# Print the value range (min, max, max - min) of every feature column.
# Only the first features/errors file pair is inspected (note the `break`).
for idx, (fn_features, fn_errors) in enumerate(zip(all_files_features, all_files_error)):
    # get data
    features = pd.read_csv(fn_features, sep=';')
    errors = pd.read_csv(fn_errors, sep=';')
    print(fn_features)
    print(' feature numbers', len(features), len(errors))
    X = np.array(features)
    y = np.array(errors)
    print(' splits', np.shape(X), np.shape(y))
    # go through all features (one column of X per feature)
    for i in range(X.shape[1]):
        # np.min/np.max are used for both ends so NaN is handled the same way
        col_min = np.min(X[:, i])
        col_max = np.max(X[:, i])
        print('  feature:', i, features.keys()[i])
        print('   ', col_min, col_max, col_max - col_min)
    break

In [None]:
# Print the path of the first features file that has a positional partner in
# the error-file list; nothing is printed when either list is empty.
for idx, (fn_features, fn_errors) in enumerate(zip(all_files_features, all_files_error)):
    print(fn_features)
    break

In [None]:
# display the full list of feature-file paths
all_files_features

# Single-output regression

# Multi-output regression

# Find best 10 in each category

### Single-output best 

results:<br>
* the RBF kernel performs better than the sigmoid kernel <br>
* the test scores for average surface error and Hausdorff distance are negative (Cingulum, Thalamusant - both sides) <br>
* Jaccard distance is most often the best metric, followed by the Dice coefficient <br>
* the best C value is either low or high

In [None]:
# Folders with the k-fold result tables, one folder per kernel.
path_rbf="../support_vector_machine/5-fold_rbf"
path_sigmoid="../support_vector_machine/5-fold_sigmoid"

# Single-output result files per kernel. os.listdir() order is arbitrary, so
# the listings are sorted: later cells pair the RBF and sigmoid files
# positionally with zip() and index into these lists, which only makes sense
# with a deterministic order.
all_files_rbf_single = [os.path.join(path_rbf,x) for x in sorted(os.listdir(path_rbf)) if '_kfold' in x and 'singleoutput' in x]
all_files_sigmoid_single = [os.path.join(path_sigmoid,x) for x in sorted(os.listdir(path_sigmoid)) if '_kfold' in x and 'singleoutput' in x]
# display how many files were found per kernel
len(all_files_rbf_single), len(all_files_sigmoid_single)

In [None]:
# display the full list of RBF single-output result files
all_files_rbf_single

In [None]:
# For each pair of single-output result files (RBF / sigmoid), keep the row
# with the highest `score_test`: its error metric, its score and its C value.
# For RBF, the best score reached for 'jaccard_dist' is recorded as well.
best_single_metrics_rbf = []
best_single_score_rbf = []
best_single_c_rbf = []
single_rbf_jaccard = []
best_single_metrics_sigmoid = []
best_single_score_sigmoid = []
best_single_c_sigmoid = []
for fn_rbf, fn_sigmoid in zip(all_files_rbf_single, all_files_sigmoid_single):
    print(os.path.basename(fn_rbf))
    df = pd.read_csv(fn_rbf, sep=';')
    # Ascending sort, so the best row is the last one. pandas puts NaN last by
    # default, which would make tail(1) return a NaN score as "best";
    # na_position='first' keeps NaN rows out of the way.
    df2 = df.sort_values('score_test', axis=0, na_position='first')
    best_rbf = df2.tail(1)
    best_single_metrics_rbf.append(best_rbf['error_metrics'].values[0])
    best_single_score_rbf.append(best_rbf['score_test'].values[0])
    best_single_c_rbf.append(best_rbf['C'].values[0])
    # best score among the rows whose error metric is the jaccard distance
    jaccard_rows = df2[df2['error_metrics']=='jaccard_dist']
    single_rbf_jaccard.append(jaccard_rows.tail(1)['score_test'].values[0])
    df3 = pd.read_csv(fn_sigmoid, sep=';')
    df4 = df3.sort_values('score_test', axis=0, na_position='first')
    best_sigmoid = df4.tail(1)
    best_single_metrics_sigmoid.append(best_sigmoid['error_metrics'].values[0])
    best_single_score_sigmoid.append(best_sigmoid['score_test'].values[0])
    best_single_c_sigmoid.append(best_sigmoid['C'].values[0])

In [None]:
# Print one LaTeX table row per RBF single-output result file:
#   <file-name token> & <best metric> & <best score> & <C> & <best jaccard score> \\
row_template = "{0} & {1} & {2} & {3} & {4} \\\\"
single_rows = zip(best_single_metrics_rbf, best_single_score_rbf, best_single_c_rbf, single_rbf_jaccard)
for idx, (m, s, c, b) in enumerate(single_rows):
    file_name = all_files_rbf_single[idx].split('/')[-1]
    # second-to-last '_'-separated token of the file name labels the row
    label = file_name.split('_')[-2]
    # underscores are escaped so the metric name is valid LaTeX
    print(row_template.format(label, m.replace('_','\\_'), s.round(3), c, b.round(3)))

In [None]:
# How often each error metric, and then each C value, was the best one.
for winners in (best_single_metrics_rbf, best_single_c_rbf):
    for value in np.unique(winners):
        print(winners.count(value), value)

In [None]:
# Histogram of the best C value over all RBF single-output result files.
# The axes are labelled so the figure can be read on its own; the trailing
# semicolon suppresses the text repr of the return value.
fig, ax = plt.subplots()
ax.hist(best_single_c_rbf)
ax.set(xlabel='C', ylabel='number of result files',
       title='Best C per file (single-output, RBF kernel)');

In [None]:
# Load the RBF single-output result table at list position 2 and order its
# rows by ascending test score (best row last).
df = pd.read_csv(all_files_rbf_single[2], sep=';')
df2 = df.sort_values(by='score_test', axis=0)

In [None]:
# show only the rows whose error metric is the jaccard distance
df2.loc[df2['error_metrics'] == 'jaccard_dist']

### Multioutput best

results <br>
* in most cases the RBF kernel is better than, or not noticeably worse than, the sigmoid kernel <br>
* ['dice_coeff', 'jaccard_dist'] is the best combination <br>
* the best C values are well distributed <br>

In [None]:
# Folders with the k-fold result tables, one folder per kernel.
path_rbf="../support_vector_machine/5-fold_rbf"
path_sigmoid="../support_vector_machine/5-fold_sigmoid"

# Multi-output result files per kernel. os.listdir() order is arbitrary, so
# the listings are sorted: the RBF and sigmoid files are paired positionally
# with zip() further down, which requires a deterministic order.
all_files_rbf_multi = [os.path.join(path_rbf,x) for x in sorted(os.listdir(path_rbf)) if '_kfold' in x and 'multi' in x]
all_files_sigmoid_multi = [os.path.join(path_sigmoid,x) for x in sorted(os.listdir(path_sigmoid)) if '_kfold' in x and 'multi' in x]
# display how many files were found per kernel
len(all_files_rbf_multi), len(all_files_sigmoid_multi)

In [None]:
# For each pair of multi-output result files (RBF / sigmoid), keep the row
# with the highest `score_test`: its error-metric combination, its score and
# its C value. For RBF, the best score reached by the
# ['dice_coeff', 'jaccard_dist'] combination is recorded as well.
best_multi_metrics_rbf = []
best_multi_score_rbf = []
dice_jaccard_multi_score_rbf = []
best_multi_c_rbf = []
best_multi_metrics_sigmoid = []
best_multi_score_sigmoid = []
best_multi_c_sigmoid = []
for fn_rbf, fn_sigmoid in zip(all_files_rbf_multi, all_files_sigmoid_multi):
    df = pd.read_csv(fn_rbf, sep=';')
    # Ascending sort, so the best row is the last one. pandas puts NaN last by
    # default, which would make tail(1) return a NaN score as "best";
    # na_position='first' keeps NaN rows out of the way.
    df2 = df.sort_values('score_test', axis=0, na_position='first')
    best_rbf = df2.tail(1)
    best_multi_metrics_rbf.append(best_rbf['error_metrics'].values[0])
    best_multi_score_rbf.append(best_rbf['score_test'].values[0])
    best_multi_c_rbf.append(best_rbf['C'].values[0])
    # the metric combination is stored in the CSV as the string form of a list
    dice_jaccard_rows = df2[df2['error_metrics']=="['dice_coeff', 'jaccard_dist']"]
    dice_jaccard_multi_score_rbf.append(dice_jaccard_rows.tail(1)['score_test'].values[0])
    df3 = pd.read_csv(fn_sigmoid, sep=';')
    df4 = df3.sort_values('score_test', axis=0, na_position='first')
    best_sigmoid = df4.tail(1)
    best_multi_metrics_sigmoid.append(best_sigmoid['error_metrics'].values[0])
    best_multi_score_sigmoid.append(best_sigmoid['score_test'].values[0])
    best_multi_c_sigmoid.append(best_sigmoid['C'].values[0])

In [None]:
# Look at the raw 'error_metrics' entry of the row labelled 0.
# `df2` is whatever the most recently executed cell left behind.
df2.loc[0, 'error_metrics']

In [None]:
# Print one LaTeX table row per RBF multi-output result file:
#   <file-name token> & <best metrics> & <best score> & <C> & <best dice/jaccard score> \\
# The row label is taken from the multi-output file the row was computed from.
# It was previously read from `all_files_rbf_single`, which is only correct
# while both lists happen to have the same length and order.
# NOTE(review): assumes the multi-output file names carry the label at the
# same '_'-separated position as the single-output ones -- confirm.
for idx, (m, s, c, b) in enumerate(zip(best_multi_metrics_rbf, best_multi_score_rbf, best_multi_c_rbf, dice_jaccard_multi_score_rbf)):
    label = os.path.basename(all_files_rbf_multi[idx]).split('_')[-2]
    # escape underscores and drop the quotes of the stringified list for LaTeX
    metrics_tex = m.replace('_','\\_').replace("'","")
    print("{0} & {1} & {2} & {3} & {4} \\\\".format(label, metrics_tex, s.round(3), c, b.round(3)))

In [None]:
# How often each metric combination, and then each C value, was the best one.
for winners in (best_multi_metrics_rbf, best_multi_c_rbf):
    for value in np.unique(winners):
        print(winners.count(value), value)

In [None]:
# Histogram of the best C value over all RBF multi-output result files.
# The axes are labelled so the figure can be read on its own; the trailing
# semicolon suppresses the text repr of the return value.
fig, ax = plt.subplots()
ax.hist(best_multi_c_rbf)
ax.set(xlabel='C', ylabel='number of result files',
       title='Best C per file (multi-output, RBF kernel)');

### Compare single and multi

In [None]:
# Side-by-side listing: position, best single-output score, best multi-output
# score (both RBF). Stops at the shorter of the two lists.
for idx, (s, m) in enumerate(zip(best_single_score_rbf, best_multi_score_rbf)):
    print(idx, s, m)

# Extract SVM

In [None]:
# Build the project's SVMSearch helper from the feature and error file lists.
# NOTE(review): presumably the constructor loads the CSV files into
# `search.X` / `search.y` -- confirm against SVMSearch.
search = SVMSearch(all_files_features, all_files_error)

In [None]:
# display the shapes of the first entries of search.X and search.y
np.shape(search.X[0]), np.shape(search.y[0])

In [None]:
# display the first entry of search.y in full
search.y[0]