Classification of the bacteria data using three different classifiers:
1. SVM
2. RandomForest with decision stumps
3. AdaBoost with decision stumps.

The data is: each permutation of species, replicate and growth phase along the rows; and the fluorescense spectra readings and growth phase along the columns. 

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
os.getcwd()

'/mnt/d/dev/summer-bursary-2018/bacteria'

In [3]:
os.listdir('data')

['16ms_32ms_growth_phase_spectra.csv',
 '16_ms_lag_codes.csv',
 'bacteria.csv',
 'Classific.py',
 'graphs.pdf',
 'wavelengths.csv']

In [4]:
df = pd.read_csv('data/bacteria.csv', header=[0, 1, 2], index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1043 entries, 0 to 1042
Columns: 135 entries, (lag, bc, 01) to (stat, se, 07)
dtypes: float64(135)
memory usage: 1.1 MB


# Preprocessing

In [5]:
X = df.reorder_levels([1, 0, 2], axis=1).sort_index(axis=1).T
X.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,1033,1034,1035,1036,1037,1038,1039,1040,1041,1042
species,growth_phase,replicate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
bc,lag,1,0.0,1.56006,0.715025,-0.52006,0.130014,-2.08018,0.0,1.040112,1.235136,-0.19502,...,1.170374,2.730861,3.835922,1.755222,1.235332,4.355984,0.520024,0.715032,-0.32501,-0.52003
bc,lag,2,0.622353,-2.11599,0.062234,2.178335,1.867163,1.120305,0.871343,-1.36926,-1.92942,0.684622,...,-0.37345,-0.18704,-2.30297,0.124451,0.809098,1.058059,2.676111,0.622357,0.560118,-0.43565
bc,lag,3,0.064528,0.645279,-1.03245,-2.064991,-1.419711,1.032515,-2.000491,-0.903453,2.968462,1.032499,...,3.097847,4.775991,4.775919,4.259427,2.129823,0.064477,-2.516599,0.516229,-0.903391,1.096991
bc,lag,4,-1.09465,-2.55417,-0.91221,0.851433,3.709871,1.520432,0.851444,-2.06781,-1.58127,0.304085,...,4.257782,3.284733,0.851676,2.493739,3.467134,5.900169,1.520354,2.067671,0.121629,1.09467
bc,lag,5,-1.49281,-0.5598,2.923387,3.047967,2.985793,2.985763,-3.59e-07,-0.99527,-0.93306,-1.99048,...,4.479034,2.488391,3.546049,1.057399,3.981489,6.469838,-0.933,0.062199,-0.4354,1.430629


In [6]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

growth_phases = X.reset_index()['growth_phase']
growth_phases = growth_phases.values  # get the numpy array
growth_phases = growth_phases.reshape(-1, 1)

print(growth_phases[:5])

le = LabelEncoder()
growth_phases = le.fit_transform(growth_phases)

ohe = OneHotEncoder(sparse=False)
growth_phases = ohe.fit_transform(growth_phases.reshape(-1, 1))

print(growth_phases[:5])

[['lag']
 ['lag']
 ['lag']
 ['lag']
 ['lag']]
[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


  y = column_or_1d(y, warn=True)


In [7]:
X['lag'] = growth_phases[:, 0]
X['log'] = growth_phases[:, 1]
X['stat'] = growth_phases[:, 2]

X.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,1036,1037,1038,1039,1040,1041,1042,lag,log,stat
species,growth_phase,replicate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
bc,lag,1,0.0,1.56006,0.715025,-0.52006,0.130014,-2.08018,0.0,1.040112,1.235136,-0.19502,...,1.755222,1.235332,4.355984,0.520024,0.715032,-0.32501,-0.52003,1.0,0.0,0.0
bc,lag,2,0.622353,-2.11599,0.062234,2.178335,1.867163,1.120305,0.871343,-1.36926,-1.92942,0.684622,...,0.124451,0.809098,1.058059,2.676111,0.622357,0.560118,-0.43565,1.0,0.0,0.0
bc,lag,3,0.064528,0.645279,-1.03245,-2.064991,-1.419711,1.032515,-2.000491,-0.903453,2.968462,1.032499,...,4.259427,2.129823,0.064477,-2.516599,0.516229,-0.903391,1.096991,1.0,0.0,0.0
bc,lag,4,-1.09465,-2.55417,-0.91221,0.851433,3.709871,1.520432,0.851444,-2.06781,-1.58127,0.304085,...,2.493739,3.467134,5.900169,1.520354,2.067671,0.121629,1.09467,1.0,0.0,0.0
bc,lag,5,-1.49281,-0.5598,2.923387,3.047967,2.985793,2.985763,-3.59e-07,-0.99527,-0.93306,-1.99048,...,1.057399,3.981489,6.469838,-0.933,0.062199,-0.4354,1.430629,1.0,0.0,0.0


In [8]:
y = X.reset_index()['species']
y.head()

0    bc
1    bc
2    bc
3    bc
4    bc
Name: species, dtype: object

In [9]:
y.describe()

count     135
unique      6
top        bc
freq       36
Name: species, dtype: object

In [10]:
print(y)

0      bc
1      bc
2      bc
3      bc
4      bc
5      bc
6      bc
7      bc
8      bc
9      bc
10     bc
11     bc
12     bc
13     bc
14     bc
15     bc
16     bc
17     bc
18     bc
19     bc
20     bc
21     bc
22     bc
23     bc
24     bc
25     bc
26     bc
27     bc
28     bc
29     bc
       ..
105    sa
106    sa
107    sa
108    sa
109    sa
110    sa
111    sa
112    sa
113    sa
114    sa
115    sa
116    sa
117    sa
118    se
119    se
120    se
121    se
122    se
123    se
124    se
125    se
126    se
127    se
128    se
129    se
130    se
131    se
132    se
133    se
134    se
Name: species, Length: 135, dtype: object


In [11]:
from sklearn.utils import shuffle

X, y = shuffle(X, y)

print(X[:5])
print(y[:5])

                                       0         1         2         3  \
species growth_phase replicate                                           
pa      stat         07         0.192142 -0.384280  1.024756 -0.000003   
ec      stat         09         0.188591 -0.942950 -1.257260  2.074584   
se      stat         05         2.187136 -1.004900 -3.369360 -1.477880   
bc      stat         05        -0.537370 -0.000003 -0.134340  1.074790   
ec      log          01        -1.086490 -1.278220  0.575195  1.789606   

                                       4         5         6         7  \
species growth_phase replicate                                           
pa      stat         07         1.024821 -0.000002  0.960771 -0.896720   
ec      stat         09         1.257332  0.691529  2.829014  0.377195   
se      stat         05        -0.886730  1.182291 -0.532040  1.182304   
bc      stat         05         1.343486  0.403044  0.671747  0.604587   
ec      log          01         3.195

# PCA + SVM

In [12]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.99, svd_solver='full')
pca.fit(X)

X_pca = pca.transform(X)
print(X_pca.shape)

(135, 2)


In [13]:
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, cross_val_score, GridSearchCV

param_grid = {
    'kernel': ['linear', 'rbf'],
    'gamma': [10 ** -n for n in range(6)],
    'C': [10 ** n for n in range(-5, 2)]
}

cv = StratifiedKFold(n_splits=3)
clf = SVC()

grid_search = GridSearchCV(clf, param_grid, cv=cv, verbose=9, n_jobs=4)
grid_search.fit(X_pca, y)

print(grid_search.best_score_)
print(grid_search.best_params_)

Fitting 3 folds for each of 84 candidates, totalling 252 fits
[CV] C=1e-05, gamma=1, kernel=linear .................................
[CV] C=1e-05, gamma=1, kernel=linear .................................
[CV] C=1e-05, gamma=1, kernel=linear .................................
[CV] C=1e-05, gamma=1, kernel=rbf ....................................
[CV]  C=1e-05, gamma=1, kernel=rbf, score=0.2553191489361702, total=   0.0s
[CV] C=1e-05, gamma=1, kernel=rbf ....................................
[CV]  C=1e-05, gamma=1, kernel=rbf, score=0.26666666666666666, total=   0.0s
[CV] C=1e-05, gamma=1, kernel=rbf ....................................
[CV]  C=1e-05, gamma=1, kernel=rbf, score=0.27906976744186046, total=   0.0s
[CV] C=1e-05, gamma=0.1, kernel=linear ...............................
[CV]  C=1e-05, gamma=1, kernel=linear, score=0.4666666666666667, total=   0.2s
[CV] C=1e-05, gamma=0.1, kernel=linear ...............................
[CV]  C=1e-05, gamma=1, kernel=linear, score=0.37209302325581

[Parallel(n_jobs=4)]: Done  28 tasks      | elapsed:    5.3s


[CV]  C=0.0001, gamma=0.01, kernel=linear, score=0.44680851063829785, total=  10.0s
[CV] C=0.0001, gamma=0.01, kernel=linear .............................
[CV]  C=0.0001, gamma=0.1, kernel=linear, score=0.44680851063829785, total=   9.6s
[CV] C=0.0001, gamma=0.1, kernel=linear ..............................
[CV]  C=0.0001, gamma=1, kernel=linear, score=0.44680851063829785, total=   9.6s
[CV] C=0.0001, gamma=1, kernel=linear ................................
[CV]  C=0.0001, gamma=0.0001, kernel=linear, score=0.44680851063829785, total=   9.5s
[CV] C=0.0001, gamma=0.0001, kernel=linear ...........................
[CV]  C=0.0001, gamma=0.01, kernel=linear, score=0.4666666666666667, total=   3.0s
[CV] C=0.0001, gamma=0.01, kernel=linear .............................
[CV]  C=0.0001, gamma=0.1, kernel=linear, score=0.4666666666666667, total=   3.0s
[CV] C=0.0001, gamma=0.1, kernel=linear ..............................
[CV]  C=0.0001, gamma=1, kernel=linear, score=0.4666666666666667, total=   

[CV]  C=0.001, gamma=0.0001, kernel=linear, score=0.3488372093023256, total=  32.9s
[CV] C=0.001, gamma=0.0001, kernel=rbf ...............................
[CV]  C=0.001, gamma=0.0001, kernel=rbf, score=0.2553191489361702, total=   0.0s
[CV] C=0.001, gamma=0.0001, kernel=rbf ...............................
[CV]  C=0.001, gamma=0.0001, kernel=rbf, score=0.26666666666666666, total=   0.0s
[CV] C=0.001, gamma=0.0001, kernel=rbf ...............................
[CV]  C=0.001, gamma=0.0001, kernel=rbf, score=0.27906976744186046, total=   0.0s
[CV] C=0.001, gamma=1e-05, kernel=linear .............................
[CV]  C=0.001, gamma=0.01, kernel=linear, score=0.4666666666666667, total=  13.7s
[CV] C=0.001, gamma=0.01, kernel=linear ..............................
[CV]  C=0.001, gamma=1e-05, kernel=linear, score=0.44680851063829785, total=   8.0s
[CV] C=0.01, gamma=1, kernel=rbf .....................................
[CV]  C=0.01, gamma=1, kernel=rbf, score=0.27906976744186046, total=   0.0s
[CV

[Parallel(n_jobs=4)]: Done 116 tasks      | elapsed:  4.8min


[CV]  C=0.01, gamma=1, kernel=linear, score=0.3953488372093023, total=  45.6s
[CV] C=0.01, gamma=1, kernel=rbf .....................................
[CV]  C=0.01, gamma=1, kernel=rbf, score=0.2553191489361702, total=   0.0s
[CV] C=0.01, gamma=1, kernel=rbf .....................................
[CV]  C=0.01, gamma=1, kernel=rbf, score=0.26666666666666666, total=   0.0s
[CV] C=0.01, gamma=0.001, kernel=rbf .................................
[CV]  C=0.01, gamma=0.001, kernel=rbf, score=0.2553191489361702, total=   0.0s
[CV] C=0.01, gamma=0.001, kernel=rbf .................................
[CV]  C=0.01, gamma=0.001, kernel=rbf, score=0.26666666666666666, total=   0.0s
[CV] C=0.01, gamma=0.001, kernel=rbf .................................
[CV]  C=0.01, gamma=0.001, kernel=rbf, score=0.27906976744186046, total=   0.0s
[CV] C=0.01, gamma=0.0001, kernel=linear .............................
[CV]  C=0.01, gamma=0.1, kernel=linear, score=0.3953488372093023, total=  45.5s
[CV] C=0.01, gamma=0.1, ke

[Parallel(n_jobs=4)]: Done 143 tasks      | elapsed:  6.9min


[CV]  C=0.01, gamma=1e-05, kernel=linear, score=0.3953488372093023, total=  45.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.2553191489361702, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.26666666666666666, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.27906976744186046, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV]  C=0.01, gamma=1e-05, kernel=linear, score=0.4222222222222222, total= 1.9min
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV]  C=0.1, gamma=0.1, kernel=linear, score=0.46808510638297873, total=  37.5s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV]  C=0.1, gamma=1, kernel=linear, score=0.37209302325581395, total= 1.4min
[CV] C=0.1, gamma=0.1, kerne

[Parallel(n_jobs=4)]: Done 165 tasks      | elapsed: 10.7min


[CV]  C=0.1, gamma=0.0001, kernel=linear, score=0.46808510638297873, total=  37.6s
[CV] C=0.1, gamma=0.0001, kernel=linear ..............................
[CV]  C=0.1, gamma=0.001, kernel=linear, score=0.37209302325581395, total= 1.4min
[CV] C=0.1, gamma=0.0001, kernel=linear ..............................
[CV]  C=0.1, gamma=0.01, kernel=linear, score=0.35555555555555557, total= 3.2min
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV]  C=0.1, gamma=0.0001, kernel=rbf, score=0.2553191489361702, total=   0.0s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV]  C=0.1, gamma=0.0001, kernel=rbf, score=0.26666666666666666, total=   0.0s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV]  C=0.1, gamma=0.0001, kernel=rbf, score=0.27906976744186046, total=   0.0s
[CV] C=0.1, gamma=1e-05, kernel=linear ...............................
[CV]  C=0.1, gamma=1e-05, kernel=linear, score=0.46808510638297873, total=  37.5s
[CV] C=

[Parallel(n_jobs=4)]: Done 191 tasks      | elapsed: 17.1min


[CV]  C=1, gamma=0.01, kernel=linear, score=0.4186046511627907, total= 1.4min
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV]  C=1, gamma=0.01, kernel=rbf, score=0.2553191489361702, total=   0.0s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV]  C=1, gamma=0.01, kernel=rbf, score=0.26666666666666666, total=   0.0s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV]  C=1, gamma=0.01, kernel=rbf, score=0.27906976744186046, total=   0.0s
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV]  C=1, gamma=1, kernel=linear, score=0.35555555555555557, total= 4.4min
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV]  C=1, gamma=0.001, kernel=linear, score=0.48936170212765956, total=  51.6s
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV]  C=1, gamma=0.1, kernel=linear, score=0.35555555555555557, total= 4.5min
[CV] C=1, gamma=0.001, kernel=rbf .

[Parallel(n_jobs=4)]: Done 221 tasks      | elapsed: 26.3min


[CV]  C=1, gamma=1e-05, kernel=linear, score=0.35555555555555557, total= 4.4min
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV]  C=10, gamma=0.1, kernel=rbf, score=0.2553191489361702, total=   0.0s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV]  C=10, gamma=0.1, kernel=rbf, score=0.26666666666666666, total=   0.0s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV]  C=10, gamma=0.1, kernel=rbf, score=0.27906976744186046, total=   0.0s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV]  C=10, gamma=0.01, kernel=linear, score=0.48936170212765956, total= 1.0min
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV]  C=10, gamma=1, kernel=linear, score=0.4444444444444444, total= 4.2min
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV]  C=10, gamma=0.1, kernel=linear, score=0.4186046511627907, total= 1.8min
[CV] C=10, gamma=0.01, kernel=rbf

[Parallel(n_jobs=4)]: Done 252 out of 252 | elapsed: 36.9min finished


0.45185185185185184
{'C': 10, 'gamma': 1, 'kernel': 'linear'}


In [None]:
%%time

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(grid_search.best_estimator_, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [15]:
%%time

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(grid_search.best_estimator_, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.56 (+/- 0.12)
CPU times: user 55.4 s, sys: 15.6 ms, total: 55.4 s
Wall time: 13min 59s


# Random Forest with Decision Stumps

In [16]:
%%time

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=1024, max_depth=1)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

  from numpy.core.umath_tests import inner1d


Accuracy: 0.44 (+/- 0.16)
CPU times: user 35.5 s, sys: 266 ms, total: 35.7 s
Wall time: 35.7 s


In [17]:
%%time

clf = RandomForestClassifier(n_estimators=1024, max_depth=1)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.39 (+/- 0.10)
CPU times: user 42.5 s, sys: 281 ms, total: 42.8 s
Wall time: 14min 41s


# AdaBoost

In [18]:
%%time

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                        n_estimators=1024)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.32 (+/- 0.12)
CPU times: user 44.3 s, sys: 0 ns, total: 44.3 s
Wall time: 44.3 s


In [19]:
%%time

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                        n_estimators=1024)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.30 (+/- 0.13)
CPU times: user 5min 29s, sys: 15.6 ms, total: 5min 29s
Wall time: 5min 29s
