Classification of the bacteria data using three different classifiers:
1. SVM
2. RandomForest with decision stumps
3. AdaBoost with decision stumps.

The data is each replicate of each species along the rows, and the fluorescense spectra readings at the stat growth phase along the columns. 

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
os.getcwd()

'/mnt/d/dev/summer-bursary-2018/bacteria'

In [3]:
os.listdir('data')

['16ms_32ms_growth_phase_spectra.csv',
 '16_ms_lag_codes.csv',
 'bacteria.csv',
 'Classific.py',
 'graphs.pdf',
 'wavelengths.csv']

In [4]:
df = pd.read_csv('data/bacteria.csv', header=[0, 1, 2], index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1043 entries, 0 to 1042
Columns: 135 entries, (lag, bc, 01) to (stat, se, 07)
dtypes: float64(135)
memory usage: 1.1 MB


# Preprocessing

In [5]:
stat = df['stat'].T
stat

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,1033,1034,1035,1036,1037,1038,1039,1040,1041,1042
species,replicate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
bc,1,-0.20475,-2.93477,-0.20475,0.614292,2.047664,1.433361,3.071483,0.614301,-0.7508,1.979407,...,6.552984,8.601113,4.23237,5.802275,6.962677,4.710049,0.341248,-0.20476,1.16026,1.433294
bc,2,0.383045,-2.1706,-0.38305,-2.10686,0.319222,-0.51076,-0.25538,-1.2769,0.127691,-0.06384,...,4.980498,4.533528,6.002042,4.980405,5.554981,4.980512,-0.57457,0.31921,1.149138,0.638419
bc,3,-0.279776,0.419667,-0.419668,0.279793,2.378291,1.049258,0.489651,6.155592,3.567415,0.839389,...,14.481643,10.493803,10.144153,8.535311,13.572317,12.73263,1.538781,-0.139889,0.419673,-0.139893
bc,4,-0.691798,0.314453,0.754683,0.125783,1.257884,1.132096,1.635259,1.509468,2.515801,2.389968,...,4.340296,3.837179,5.912723,5.283972,6.667484,1.635564,0.817585,-0.754692,-2.201172,1.069157
bc,5,-0.53737,-3e-06,-0.13434,1.07479,1.343486,0.403044,0.671747,0.604587,1.67937,3.8289,...,6.449811,2.821592,2.015373,1.209171,5.105925,3.224761,0.604527,0.604542,1.67926,0.537375
bc,6,0.361383,-0.54207,-0.78299,0.3614,1.325136,1.264897,-0.1807,1.867256,1.385384,-1.26491,...,4.036305,6.92798,5.482071,6.626633,5.662667,7.409666,-0.54207,0.180692,-1.02391,0.662543
bc,7,0.307881,-0.800487,-0.184728,3.263684,0.9236899,1.539485,1.293172,1.785821,-0.307901,1.785808,...,5.727584,4.619099,4.003135,1.786058,3.387305,1.909361,2.278321,-0.800497,-0.923639,-0.184733
bc,8,1.835246,0.696122,-2.34151,0.696138,-0.12657,1.772062,2.278369,2.974551,-2.91126,1.835349,...,4.24052,4.683892,1.7725,4.620664,3.481432,4.620498,-0.3797,0.25314,-0.94927,0.949276
bc,9,1.202269,-1.0757,-2.08814,-1.07576,1.645302,3.480439,1.455456,2.214844,0.696087,0.949211,...,3.227793,4.240516,4.177085,7.21481,4.999896,3.670677,-0.56949,-1.32882,0.885873,0.885887
bc,10,-4.74055,1.30973,0.311845,-11.8247,-10.1999,-7.69596,-13.7663,-3.62898,-16.1449,-12.1374,...,12.76915,2.253729,-3.69186,0.187435,-6.75968,-2.69121,2.681973,-3.74332,3.244011,1.248669


In [6]:
stat.isnull().values.any()

False

In [7]:
X = stat.values
X = (X - X.min()) / (X.max() - X.min())

print(X.min(), X.mean(), X.max())

0.0 0.04503995307813344 1.0


In [8]:
y = stat.reset_index()['species']
y.head()

0    bc
1    bc
2    bc
3    bc
4    bc
Name: species, dtype: object

In [9]:
y.describe()

count     47
unique     6
top       bc
freq      12
Name: species, dtype: object

In [10]:
print(y)

0     bc
1     bc
2     bc
3     bc
4     bc
5     bc
6     bc
7     bc
8     bc
9     bc
10    bc
11    bc
12    ec
13    ec
14    ec
15    ec
16    ec
17    ec
18    ec
19    ec
20    ec
21    lm
22    lm
23    lm
24    pa
25    pa
26    pa
27    pa
28    pa
29    pa
30    pa
31    pa
32    sa
33    sa
34    sa
35    sa
36    sa
37    sa
38    sa
39    sa
40    se
41    se
42    se
43    se
44    se
45    se
46    se
Name: species, dtype: object


In [11]:
from sklearn.utils import shuffle

X, y = shuffle(X, y)

print(X[:5])
print(y[:5])

[[0.00082996 0.00078722 0.00081096 ... 0.00078722 0.00078248 0.00081096]
 [0.00083285 0.0007344  0.00080332 ... 0.00083039 0.00086239 0.0008427 ]
 [0.00079737 0.00081809 0.00081291 ... 0.00084139 0.00088283 0.0008388 ]
 [0.00086549 0.00083203 0.00084318 ... 0.00076511 0.0007428  0.00089895]
 [0.0008073  0.00083427 0.00080191 ... 0.00081269 0.00083427 0.00081269]]
6     bc
1     bc
4     bc
33    sa
2     bc
Name: species, dtype: object


# PCA + SVM

In [12]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.99, svd_solver='full')
pca.fit(X)

X_pca = pca.transform(X)
print(X_pca.shape)

(47, 1)


In [13]:
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, cross_val_score, GridSearchCV

param_grid = {
    'kernel': ['linear', 'rbf'],
    'gamma': [10 ** -n for n in range(6)],
    'C': [10 ** n for n in range(-5, 2)]
}

cv = StratifiedKFold(n_splits=3)
clf = SVC()

grid_search = GridSearchCV(clf, param_grid, cv=cv, verbose=10, n_jobs=4)
grid_search.fit(X_pca, y)

print(grid_search.best_score_)
print(grid_search.best_params_)

Fitting 3 folds for each of 84 candidates, totalling 252 fits
[CV] C=1e-05, gamma=1, kernel=linear .................................
[CV] C=1e-05, gamma=1, kernel=linear .................................
[CV] C=1e-05, gamma=1, kernel=linear .................................
[CV]  C=1e-05, gamma=1, kernel=linear, score=0.23529411764705882, total=   0.0s
[CV] C=1e-05, gamma=1, kernel=rbf ....................................
[CV] ...... C=1e-05, gamma=1, kernel=linear, score=0.25, total=   0.0s
[CV] C=1e-05, gamma=1, kernel=rbf ....................................
[CV]  C=1e-05, gamma=1, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV] ......... C=1e-05, gamma=1, kernel=rbf, score=0.25, total=   0.0s
[CV]  C=1e-05, gamma=1, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=1e-05, gamma=1, kernel=rbf ....................................
[CV] C=1e-05, gamma=0.1, kernel=linear ...............................
[CV] C=1e-05, gamma=0.1, kernel=linear ........................

[CV]  C=0.0001, gamma=0.1, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=0.0001, gamma=0.0001, kernel=linear, score=0.23529411764705882, total=   0.0s
[CV] C=1e-05, gamma=0.001, kernel=rbf ................................
[CV] C=1e-05, gamma=1e-05, kernel=rbf ................................
[CV] C=0.0001, gamma=0.01, kernel=linear .............................
[CV] C=0.0001, gamma=0.0001, kernel=linear ...........................
[CV]  C=1e-05, gamma=0.001, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] ..... C=1e-05, gamma=1e-05, kernel=rbf, score=0.25, total=   0.0s
[CV]  C=0.0001, gamma=0.01, kernel=linear, score=0.23529411764705882, total=   0.0s
[CV]  C=0.0001, gamma=0.0001, kernel=linear, score=0.25, total=   0.0s
[CV] C=0.0001, gamma=0.0001, kernel=rbf ..............................
[CV] C=1e-05, gamma=1e-05, kernel=rbf ................................
[CV] C=0.0001, gamma=0.01, kernel=linear .............................
[CV] C=0.0001, gamma=0.0001, ke

[CV]  C=0.01, gamma=0.1, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV] C=0.001, gamma=1, kernel=rbf ....................................
[CV] C=0.01, gamma=0.1, kernel=rbf ...................................
[CV]  C=0.001, gamma=0.01, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] .. C=0.001, gamma=1e-05, kernel=linear, score=0.25, total=   0.0s
[CV] C=0.001, gamma=0.001, kernel=linear .............................
[CV] C=0.001, gamma=1e-05, kernel=linear .............................
[CV]  C=0.001, gamma=1, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=0.01, gamma=0.1, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=0.01, gamma=0.1, kernel=rbf ...................................
[CV] C=0.01, gamma=0.01, kernel=linear ...............................
[CV]  C=0.001, gamma=0.001, kernel=linear, score=0.23529411764705882, total=   0.0s
[CV]  C=0.001, gamma=1e-05, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV] C=0.001, gamma=0.0

[Parallel(n_jobs=4)]: Batch computation too fast (0.0276s.) Setting batch_size=14.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0951s.) Setting batch_size=58.
[Parallel(n_jobs=4)]: Done  36 tasks      | elapsed:    0.2s


[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV]  C=0.01, gamma=0.01, kernel=linear, score=0.2857142857142857, total=   0.0s
[CV] C=0.01, gamma=0.01, kernel=rbf ..................................
[CV]  C=0.1, gamma=1e-05, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=10, gamma=0.001, kernel=rbf, score=0.4117647058823529, total=   0.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV]  C=0.01, gamma=0.01, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=1, gamma=1, kernel=linear .....................................
[CV] C=0.01, gamma=0.01, kernel=rbf ..................................
[CV]  C=1, gamma=1, kernel=linear, score=0.47058823529411764, total=   0.0s
[CV] ...... C=10, gamma=0.001, kernel=rbf, score=0.4375, total=   0.0s
[CV] ....... C=0.01, gamma=0.01, kernel=rbf, score=0.25, total=   0.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] C=1, gamma=1, kernel=linear ........

[CV] C=0.01, gamma=1e-05, kernel=rbf .................................
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV]  C=0.01, gamma=1e-05, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV]  C=1, gamma=0.001, kernel=linear, score=0.5714285714285714, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV]  C=0.1, gamma=1, kernel=linear, score=0.4117647058823529, total=   0.0s
[CV]  C=1, gamma=0.001, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] ...... C=0.1, gamma=1, kernel=linear, score=0.4375, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ......... C=1, gamma=0.001, kernel=rbf, score=0.25, total=   0.0s
[CV] C=1, gamma=0.001, kernel=rbf ...............

[CV]  C=10, gamma=0.1, kernel=rbf, score=0.5714285714285714, total=   0.0s
[CV]  C=0.1, gamma=0.0001, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV]  C=10, gamma=0.01, kernel=linear, score=0.47058823529411764, total=   0.0s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV] ...... C=0.1, gamma=0.0001, kernel=rbf, score=0.25, total=   0.0s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV] .... C=10, gamma=0.01, kernel=linear, score=0.5625, total=   0.0s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV]  C=0.1, gamma=0.0001, kernel=rbf, score=0.2857142857142857, total=   0.0s
[CV] C=0.1, gamma=1e-05, kernel=linear ...............................
[CV]  C=10, gamma=0.01, kernel=linear, score=0.5714285714285714, total=   0.0s
[CV]  C=0.1, gamma=1e-05, kernel=linear

[Parallel(n_jobs=4)]: Done 252 out of 252 | elapsed:    0.5s finished


In [14]:
%%time

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(grid_search.best_estimator_, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.56 (+/- 0.16)
CPU times: user 62.5 ms, sys: 31.2 ms, total: 93.8 ms
Wall time: 75.4 ms


In [15]:
%%time

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(grid_search.best_estimator_, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.56 (+/- 0.18)
CPU times: user 203 ms, sys: 15.6 ms, total: 219 ms
Wall time: 190 ms


# Random Forest with Decision Stumps

In [16]:
%%time

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=1024, max_depth=1)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

  from numpy.core.umath_tests import inner1d


Accuracy: 0.55 (+/- 0.17)
CPU times: user 39.4 s, sys: 344 ms, total: 39.7 s
Wall time: 39.9 s


In [17]:
%%time

clf = RandomForestClassifier(n_estimators=1024, max_depth=1)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.49 (+/- 0.17)
CPU times: user 39.2 s, sys: 156 ms, total: 39.3 s
Wall time: 39.4 s


# AdaBoost

In [18]:
%%time

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                        n_estimators=1024)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.36 (+/- 0.17)
CPU times: user 41.7 s, sys: 31.2 ms, total: 41.8 s
Wall time: 41.8 s


In [19]:
%%time

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                        n_estimators=1024)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.39 (+/- 0.15)
CPU times: user 2min 13s, sys: 0 ns, total: 2min 13s
Wall time: 2min 13s
