Classification of the bacteria data using three different classifiers:
1. SVM
2. RandomForest with decision stumps
3. AdaBoost with decision stumps.

The data is each replicate of each species along the rows, and the fluorescense spectra readings along the columns. 

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
os.getcwd()

'/mnt/d/dev/summer-bursary-2018/bacteria'

In [3]:
os.listdir('data')

['16ms_32ms_growth_phase_spectra.csv',
 '16_ms_lag_codes.csv',
 'bacteria.csv',
 'Classific.py',
 'graphs.pdf',
 'wavelengths.csv']

In [4]:
df = pd.read_csv('data/bacteria.csv', header=[0, 1, 2], index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1043 entries, 0 to 1042
Columns: 135 entries, (lag, bc, 01) to (stat, se, 07)
dtypes: float64(135)
memory usage: 1.1 MB


# Preprocessing

In [5]:
lag = df['lag'].T
lag = lag.add_prefix('lag_')

log = df['log'].T
log = log.add_prefix('log_')

stat = df['stat'].T
stat = stat.add_prefix('stat_')

df_concat = pd.concat([lag, log, stat], axis=1)
df_concat

Unnamed: 0_level_0,Unnamed: 1_level_0,lag_0,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,...,stat_1033,stat_1034,stat_1035,stat_1036,stat_1037,stat_1038,stat_1039,stat_1040,stat_1041,stat_1042
species,replicate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
bc,1,0.0,1.56006,0.715025,-0.52006,0.130014,-2.08018,0.0,1.040112,1.235136,-0.19502,...,6.552984,8.601113,4.23237,5.802275,6.962677,4.710049,0.341248,-0.20476,1.16026,1.433294
bc,2,0.622353,-2.11599,0.062234,2.178335,1.867163,1.120305,0.871343,-1.36926,-1.92942,0.684622,...,4.980498,4.533528,6.002042,4.980405,5.554981,4.980512,-0.57457,0.31921,1.149138,0.638419
bc,3,0.064528,0.645279,-1.03245,-2.064991,-1.419711,1.032515,-2.000491,-0.903453,2.968462,1.032499,...,14.481643,10.493803,10.144153,8.535311,13.572317,12.73263,1.538781,-0.139889,0.419673,-0.139893
bc,4,-1.09465,-2.55417,-0.91221,0.851433,3.709871,1.520432,0.851444,-2.06781,-1.58127,0.304085,...,4.340296,3.837179,5.912723,5.283972,6.667484,1.635564,0.817585,-0.754692,-2.201172,1.069157
bc,5,-1.49281,-0.5598,2.923387,3.047967,2.985793,2.985763,-3.59e-07,-0.99527,-0.93306,-1.99048,...,6.449811,2.821592,2.015373,1.209171,5.105925,3.224761,0.604527,0.604542,1.67926,0.537375
bc,6,-0.97976,-0.06123,0.489875,-1.53094,3e-06,-1.34723,0.551145,-1.95965,-0.55115,-0.4899,...,4.036305,6.92798,5.482071,6.626633,5.662667,7.409666,-0.54207,0.180692,-1.02391,0.662543
bc,7,-0.24007,0.600163,0.540146,-1.2004,1.260425,-1.20039,1.740574,-1.86063,-0.42014,1.860604,...,5.727584,4.619099,4.003135,1.786058,3.387305,1.909361,2.278321,-0.800497,-0.923639,-0.184733
bc,8,0.060945,-0.79228,0.182837,0.853271,-3.04742,2.011285,2.011299,-0.67043,2.4989,-0.60948,...,4.24052,4.683892,1.7725,4.620664,3.481432,4.620498,-0.3797,0.25314,-0.94927,0.949276
bc,9,0.247423,0.123709,-0.74227,1.360885,2.226921,0.123719,0.866028,1.360911,1.1e-05,-0.12371,...,3.227793,4.240516,4.177085,7.21481,4.999896,3.670677,-0.56949,-1.32882,0.885873,0.885887
bc,10,2.628512,0.292028,-0.40884,-5.7415,4.453517,0.878834,-0.87908,-6.38674,3.80936,-3.74994,...,12.76915,2.253729,-3.69186,0.187435,-6.75968,-2.69121,2.681973,-3.74332,3.244011,1.248669


In [6]:
df_concat = df_concat.dropna(axis=0)
df_concat

Unnamed: 0_level_0,Unnamed: 1_level_0,lag_0,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,...,stat_1033,stat_1034,stat_1035,stat_1036,stat_1037,stat_1038,stat_1039,stat_1040,stat_1041,stat_1042
species,replicate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
bc,1,0.0,1.56006,0.715025,-0.52006,0.130014,-2.08018,0.0,1.040112,1.235136,-0.19502,...,6.552984,8.601113,4.23237,5.802275,6.962677,4.710049,0.341248,-0.20476,1.16026,1.433294
bc,2,0.622353,-2.11599,0.062234,2.178335,1.867163,1.120305,0.871343,-1.36926,-1.92942,0.684622,...,4.980498,4.533528,6.002042,4.980405,5.554981,4.980512,-0.57457,0.31921,1.149138,0.638419
bc,3,0.064528,0.645279,-1.03245,-2.064991,-1.419711,1.032515,-2.000491,-0.903453,2.968462,1.032499,...,14.481643,10.493803,10.144153,8.535311,13.572317,12.73263,1.538781,-0.139889,0.419673,-0.139893
bc,4,-1.09465,-2.55417,-0.91221,0.851433,3.709871,1.520432,0.851444,-2.06781,-1.58127,0.304085,...,4.340296,3.837179,5.912723,5.283972,6.667484,1.635564,0.817585,-0.754692,-2.201172,1.069157
bc,5,-1.49281,-0.5598,2.923387,3.047967,2.985793,2.985763,-3.59e-07,-0.99527,-0.93306,-1.99048,...,6.449811,2.821592,2.015373,1.209171,5.105925,3.224761,0.604527,0.604542,1.67926,0.537375
bc,6,-0.97976,-0.06123,0.489875,-1.53094,3e-06,-1.34723,0.551145,-1.95965,-0.55115,-0.4899,...,4.036305,6.92798,5.482071,6.626633,5.662667,7.409666,-0.54207,0.180692,-1.02391,0.662543
bc,7,-0.24007,0.600163,0.540146,-1.2004,1.260425,-1.20039,1.740574,-1.86063,-0.42014,1.860604,...,5.727584,4.619099,4.003135,1.786058,3.387305,1.909361,2.278321,-0.800497,-0.923639,-0.184733
bc,8,0.060945,-0.79228,0.182837,0.853271,-3.04742,2.011285,2.011299,-0.67043,2.4989,-0.60948,...,4.24052,4.683892,1.7725,4.620664,3.481432,4.620498,-0.3797,0.25314,-0.94927,0.949276
bc,9,0.247423,0.123709,-0.74227,1.360885,2.226921,0.123719,0.866028,1.360911,1.1e-05,-0.12371,...,3.227793,4.240516,4.177085,7.21481,4.999896,3.670677,-0.56949,-1.32882,0.885873,0.885887
bc,10,2.628512,0.292028,-0.40884,-5.7415,4.453517,0.878834,-0.87908,-6.38674,3.80936,-3.74994,...,12.76915,2.253729,-3.69186,0.187435,-6.75968,-2.69121,2.681973,-3.74332,3.244011,1.248669


In [7]:
X = df_concat.values
X = (X - X.min()) / (X.max() - X.min())

print(X.min(), X.mean(), X.max())

0.0 0.04540645445549405 1.0


In [8]:
y = df_concat.reset_index()['species']
y.head()

0    bc
1    bc
2    bc
3    bc
4    bc
Name: species, dtype: object

In [9]:
y.describe()

count     39
unique     6
top       bc
freq      12
Name: species, dtype: object

In [10]:
print(y)

0     bc
1     bc
2     bc
3     bc
4     bc
5     bc
6     bc
7     bc
8     bc
9     bc
10    bc
11    bc
12    ec
13    ec
14    ec
15    ec
16    ec
17    lm
18    lm
19    lm
20    pa
21    pa
22    pa
23    pa
24    pa
25    pa
26    sa
27    sa
28    sa
29    sa
30    sa
31    sa
32    sa
33    sa
34    se
35    se
36    se
37    se
38    se
Name: species, dtype: object


In [11]:
from sklearn.utils import shuffle

X, y = shuffle(X, y)

print(X[:5])
print(y[:5])

[[0.0019723  0.00199147 0.00203222 ... 0.00197709 0.00197957 0.002034  ]
 [0.00199709 0.00196709 0.00196709 ... 0.00201126 0.00201126 0.00198441]
 [0.00198394 0.00195655 0.00195883 ... 0.0019473  0.00199447 0.00200689]
 [0.00197944 0.00194658 0.00198413 ... 0.00198684 0.00194054 0.00201365]
 [0.00201303 0.00194115 0.00192318 ... 0.0021681  0.00191907 0.00174485]]
22    pa
19    lm
18    lm
7     bc
31    sa
Name: species, dtype: object


# PCA + SVM

In [12]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.99, svd_solver='full')
pca.fit(X)

X_pca = pca.transform(X)
print(X_pca.shape)

(39, 3)


In [13]:
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, cross_val_score, GridSearchCV

param_grid = {
    'kernel': ['linear', 'rbf'],
    'gamma': [10 ** -n for n in range(10)],
    'C': [10 ** n for n in range(-9, 2)]
}

cv = StratifiedKFold(n_splits=3)
clf = SVC()

grid_search = GridSearchCV(clf, param_grid, cv=cv, verbose=10, n_jobs=4)
grid_search.fit(X_pca, y)

print(grid_search.best_score_)
print(grid_search.best_params_)

Fitting 3 folds for each of 220 candidates, totalling 660 fits
[CV] C=1e-09, gamma=1, kernel=linear .................................
[CV] C=1e-09, gamma=1, kernel=linear .................................
[CV] C=1e-09, gamma=1, kernel=linear .................................
[CV] C=1e-09, gamma=1, kernel=rbf ....................................
[CV]  C=1e-09, gamma=1, kernel=linear, score=0.36363636363636365, total=   0.0s
[CV]  C=1e-09, gamma=1, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-09, gamma=1, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV] C=1e-09, gamma=0.1, kernel=rbf ..................................
[CV] C=1e-09, gamma=1, kernel=rbf ....................................
[CV]  C=1e-09, gamma=1, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] C=1e-09, gamma=1, kernel=rbf ....................................
[CV] C=1e-09, gamma=0.1, kernel=linear ...............................
[CV] C=1e-09, gamma=0.1, kernel=linear ................

[CV] C=1e-08, gamma=0.001, kernel=linear .............................
[CV] C=1e-08, gamma=1, kernel=linear .................................
[CV] C=1e-08, gamma=0.1, kernel=rbf ..................................
[CV]  C=1e-09, gamma=1e-08, kernel=linear, score=0.36363636363636365, total=   0.0s
[CV]  C=1e-08, gamma=0.001, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-08, gamma=0.1, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-08, gamma=1, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV] C=1e-09, gamma=1e-08, kernel=rbf ................................
[CV] C=1e-08, gamma=0.001, kernel=linear .............................
[CV] C=1e-08, gamma=0.1, kernel=rbf ..................................
[CV] C=1e-08, gamma=1, kernel=linear .................................
[CV]  C=1e-08, gamma=0.001, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-09, gamma=1e-08, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-08, g

[Parallel(n_jobs=4)]: Batch computation too fast (0.0354s.) Setting batch_size=10.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Batch computation too fast (0.1001s.) Setting batch_size=38.
[Parallel(n_jobs=4)]: Done  28 tasks      | elapsed:    0.2s


[CV] C=1e-08, gamma=1e-05, kernel=linear .............................
[CV] C=1e-07, gamma=0.1, kernel=rbf ..................................
[CV] C=1e-07, gamma=1e-07, kernel=rbf ................................
[CV] C=1e-06, gamma=0.0001, kernel=linear ............................
[CV]  C=1e-07, gamma=0.1, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-08, gamma=1e-05, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-07, gamma=1e-07, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-06, gamma=0.0001, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV] C=1e-08, gamma=1e-05, kernel=linear .............................
[CV] C=1e-07, gamma=0.1, kernel=rbf ..................................
[CV] C=1e-06, gamma=0.0001, kernel=linear ............................
[CV] C=1e-07, gamma=1e-07, kernel=rbf ................................
[CV]  C=1e-08, gamma=1e-05, kernel=linear, score=0.36363636363636365, total=   0.0s
[CV]  C=1e-07, gamma=1e

[CV]  C=1e-06, gamma=1e-06, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-07, gamma=0.001, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] C=1e-07, gamma=0.001, kernel=rbf ................................
[CV] C=1e-06, gamma=1e-06, kernel=linear .............................
[CV]  C=1e-07, gamma=1e-09, kernel=rbf, score=0.36363636363636365, total=   0.0s
[CV]  C=1e-08, gamma=1e-07, kernel=linear, score=0.36363636363636365, total=   0.0s
[CV] C=1e-06, gamma=1, kernel=linear .................................
[CV] C=1e-08, gamma=1e-07, kernel=rbf ................................
[CV]  C=1e-07, gamma=0.001, kernel=rbf, score=0.36363636363636365, total=   0.0s
[CV]  C=1e-06, gamma=1e-06, kernel=linear, score=0.36363636363636365, total=   0.0s
[CV]  C=1e-06, gamma=1, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV] C=1e-07, gamma=0.0001, kernel=linear ............................
[CV] C=1e-06, gamma=1e-06, kernel=rbf ................................
[C

[CV] C=1e-07, gamma=1e-06, kernel=linear .............................
[CV] C=1e-06, gamma=1e-08, kernel=rbf ................................
[CV] C=1e-06, gamma=0.01, kernel=linear ..............................
[CV] C=1e-08, gamma=1e-09, kernel=rbf ................................
[CV]  C=1e-06, gamma=1e-08, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-07, gamma=1e-06, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-08, gamma=1e-09, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-06, gamma=0.01, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV] C=1e-06, gamma=1e-08, kernel=rbf ................................
[CV] C=1e-07, gamma=1e-06, kernel=linear .............................
[CV] C=1e-08, gamma=1e-09, kernel=rbf ................................
[CV] C=1e-06, gamma=0.01, kernel=linear ..............................
[CV]  C=1e-06, gamma=1e-08, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-07, gamma=1e-06,

[CV] C=1e-05, gamma=0.1, kernel=linear ...............................
[CV]  C=0.0001, gamma=0.001, kernel=linear, score=0.36363636363636365, total=   0.0s
[CV]  C=1e-05, gamma=1e-07, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV] C=0.0001, gamma=0.001, kernel=rbf ...............................
[CV]  C=1e-05, gamma=0.1, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV] C=1e-05, gamma=1e-07, kernel=linear .............................
[CV]  C=0.0001, gamma=1e-09, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] C=1e-05, gamma=0.1, kernel=linear ...............................
[CV] C=0.0001, gamma=1e-09, kernel=rbf ...............................
[CV]  C=0.0001, gamma=0.001, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-05, gamma=1e-07, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV] C=0.0001, gamma=0.001, kernel=rbf ...............................
[CV]  C=0.0001, gamma=1e-09, kernel=rbf, score=0.36363636363636365, total=   0.0

[CV]  C=0.0001, gamma=1e-05, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] C=1e-05, gamma=0.001, kernel=rbf ................................
[CV]  C=1e-05, gamma=1e-09, kernel=linear, score=0.36363636363636365, total=   0.0s
[CV]  C=0.001, gamma=0.1, kernel=rbf, score=0.36363636363636365, total=   0.0s
[CV] C=0.0001, gamma=1e-05, kernel=rbf ...............................
[CV]  C=1e-05, gamma=0.001, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] C=1e-05, gamma=1e-09, kernel=rbf ................................
[CV]  C=0.0001, gamma=1e-05, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] C=0.001, gamma=0.01, kernel=linear ..............................
[CV] C=0.0001, gamma=1e-05, kernel=rbf ...............................
[CV] C=1e-05, gamma=0.001, kernel=rbf ................................
[CV]  C=1e-05, gamma=1e-09, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=0.001, gamma=0.01, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV]  C

[Parallel(n_jobs=4)]: Done 126 tasks      | elapsed:    0.6s


[CV]  C=0.0001, gamma=1e-06, kernel=linear, score=0.36363636363636365, total=   0.0s
[CV] C=0.0001, gamma=1, kernel=linear ................................
[CV]  C=1e-05, gamma=0.0001, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] C=0.0001, gamma=1e-06, kernel=rbf ...............................
[CV] C=1e-05, gamma=0.0001, kernel=linear ............................
[CV]  C=0.0001, gamma=1, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV]  C=0.001, gamma=0.01, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] C=0.0001, gamma=1, kernel=linear ................................
[CV]  C=0.0001, gamma=1e-06, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=1e-05, gamma=0.0001, kernel=linear, score=0.36363636363636365, total=   0.0s
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] C=1e-05, gamma=0.0001, kernel=rbf ...............................
[CV] C=0.

[CV] C=0.001, gamma=1e-05, kernel=rbf ................................
[CV] C=0.001, gamma=0.0001, kernel=rbf ...............................
[CV]  C=0.0001, gamma=1e-08, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=0.001, gamma=1e-05, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=0.0001, gamma=0.01, kernel=linear, score=0.36363636363636365, total=   0.0s
[CV] C=0.001, gamma=1e-05, kernel=rbf ................................
[CV] C=0.0001, gamma=1e-08, kernel=rbf ...............................
[CV]  C=0.001, gamma=0.0001, kernel=rbf, score=0.36363636363636365, total=   0.0s
[CV] C=0.0001, gamma=0.01, kernel=rbf ................................
[CV] C=0.001, gamma=1e-05, kernel=linear .............................
[CV]  C=0.001, gamma=1e-05, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=0.0001, gamma=1e-08, kernel=rbf, score=0.36363636363636365, total=   0.0s
[CV]  C=0.0001, gamma=0.01, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] 

[CV]  C=0.01, gamma=0.001, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] ..... C=0.1, gamma=1e-06, kernel=linear, score=0.5, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] C=0.01, gamma=0.001, kernel=rbf .................................
[CV]  C=0.001, gamma=1e-07, kernel=rbf, score=0.36363636363636365, total=   0.0s
[CV] C=0.1, gamma=1e-06, kernel=linear ...............................
[CV] C=0.001, gamma=1e-08, kernel=linear .............................
[CV] ......... C=0.1, gamma=1, kernel=linear, score=0.5, total=   0.0s
[CV]  C=0.01, gamma=0.001, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV]  C=0.1, gamma=1e-06, kernel=linear, score=0.42857142857142855, total=   0.0s
[CV] C=0.01, gamma=0.001, kernel=rbf .................................
[CV]  C=0.001, gamma=1e-08, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV] C=0.1, gamma=1e-06, ker

[CV] C=0.1, gamma=1e-08, kernel=linear ...............................
[CV]  C=0.1, gamma=0.01, kernel=linear, score=0.42857142857142855, total=   0.0s
[CV]  C=0.01, gamma=1, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV]  C=0.01, gamma=1e-05, kernel=rbf, score=0.36363636363636365, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV]  C=0.1, gamma=1e-08, kernel=linear, score=0.5454545454545454, total=   0.0s
[CV] C=0.01, gamma=1e-06, kernel=linear ..............................
[CV] C=0.01, gamma=1, kernel=linear ..................................
[CV] C=0.1, gamma=1e-08, kernel=rbf ..................................
[CV]  C=0.1, gamma=0.01, kernel=linear, score=0.5454545454545454, total=   0.0s
[CV]  C=0.01, gamma=1e-06, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV]  C=0.01, gamma=1, kernel=linear, score=0.45454545454545453, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] C=0.01, 

[Parallel(n_jobs=4)]: Done 392 tasks      | elapsed:    0.9s


[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] C=0.01, gamma=1e-07, kernel=linear ..............................
[CV]  C=0.01, gamma=0.1, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=0.1, gamma=1e-09, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV]  C=0.1, gamma=0.001, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] C=0.1, gamma=1e-09, kernel=rbf ..................................
[CV]  C=0.01, gamma=1e-07, kernel=linear, score=0.45454545454545453, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] C=0.01, gamma=1e-07, kernel=rbf .................................
[CV]  C=0.1, gamma=1e-09, kernel=rbf, score=0.36363636363636365, total=   0.0s
[CV] .............. C=1, gamma=1, kernel=rbf, score=0.5, total=   0.0s
[CV]  C=0.1, gamma=0.001, kernel=rbf, score=0.36363636363636365, total=   0.0s
[CV] C=1, gamma=1, kernel=lin

[CV] C=10, gamma=1e-05, kernel=linear ................................
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV]  C=1, gamma=1e-09, kernel=linear, score=0.7142857142857143, total=   0.0s
[CV] C=1, gamma=1e-09, kernel=linear .................................
[CV]  C=10, gamma=1e-05, kernel=linear, score=0.5714285714285714, total=   0.0s
[CV]  C=1, gamma=0.001, kernel=linear, score=0.7272727272727273, total=   0.0s
[CV] C=10, gamma=1e-05, kernel=linear ................................
[CV] ....... C=1, gamma=1e-09, kernel=linear, score=0.5, total=   0.0s
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] C=1, gamma=1e-09, kernel=linear .................................
[CV]  C=10, gamma=1e-05, kernel=linear, score=0.5454545454545454, total=   0.0s
[CV]  C=1, gamma=0.001, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] C=10, gamma=1e-05, kernel=rbf ...................................
[CV]  C=1, gamma=1e-09, kernel=linear,

[CV]  C=1, gamma=1e-06, kernel=linear, score=0.7272727272727273, total=   0.0s
[CV]  C=10, gamma=1e-08, kernel=linear, score=0.5714285714285714, total=   0.0s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV] C=1, gamma=1e-06, kernel=rbf ....................................
[CV] C=10, gamma=1e-08, kernel=linear ................................
[CV]  C=1, gamma=1e-06, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=10, gamma=0.01, kernel=linear, score=0.5714285714285714, total=   0.0s
[CV]  C=10, gamma=1e-08, kernel=linear, score=0.5454545454545454, total=   0.0s
[CV] C=1, gamma=1e-06, kernel=rbf ....................................
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV] C=10, gamma=1e-08, kernel=rbf ...................................
[CV]  C=1, gamma=1e-06, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=10, gamma=1e-08, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=10, gamma=0.01, ker

[Parallel(n_jobs=4)]: Done 660 out of 660 | elapsed:    1.4s finished


In [14]:
%%time

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(grid_search.best_estimator_, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.54 (+/- 0.16)
CPU times: user 62.5 ms, sys: 15.6 ms, total: 78.1 ms
Wall time: 79.1 ms


In [15]:
%%time

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(grid_search.best_estimator_, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.55 (+/- 0.20)
CPU times: user 312 ms, sys: 46.9 ms, total: 359 ms
Wall time: 343 ms


# Random Forest with Decision Stumps

In [16]:
%%time

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=1024, max_depth=1)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

  from numpy.core.umath_tests import inner1d


Accuracy: 0.47 (+/- 0.16)
CPU times: user 39.2 s, sys: 234 ms, total: 39.4 s
Wall time: 39.7 s


In [17]:
%%time

clf = RandomForestClassifier(n_estimators=1024, max_depth=1)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.55 (+/- 0.19)
CPU times: user 39.7 s, sys: 266 ms, total: 40 s
Wall time: 40 s


# AdaBoost

In [18]:
%%time

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                        n_estimators=1024)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.41 (+/- 0.16)
CPU times: user 42 s, sys: 0 ns, total: 42 s
Wall time: 42 s


In [19]:
%%time

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                        n_estimators=1024)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.49 (+/- 0.27)
CPU times: user 4min 34s, sys: 0 ns, total: 4min 34s
Wall time: 4min 34s
