Classification of the bacteria data using three different classifiers:
1. SVM
2. RandomForest with decision stumps
3. AdaBoost with decision stumps.

The data is: each permutation of species, replicate and growth phase along the rows; and the fluorescense spectra readings and growth phase along the columns. 

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
os.getcwd()

'/home/anthony/summer-bursary-2018/bacteria'

In [3]:
os.listdir('data')

['Classific.py',
 'graphs.pdf',
 'growth_stages.csv',
 '16ms_32ms_growth_phase_spectra.csv',
 'PC6allwavelengths.png',
 'wavelengths.csv',
 '16_ms_lag_codes.csv',
 'bacteria.csv']

In [4]:
df = pd.read_csv('data/bacteria.csv', header=[0, 1, 2], index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1043 entries, 0 to 1042
Columns: 135 entries, (lag, bc, 01) to (stat, se, 07)
dtypes: float64(135)
memory usage: 1.1 MB


# Preprocessing

In [None]:
X = df.reorder_levels([1, 0, 2], axis=1).sort_index(axis=1).T
X.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,1033,1034,1035,1036,1037,1038,1039,1040,1041,1042
species,growth_phase,replicate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
bc,lag,1,0.0,1.56006,0.715025,-0.52006,0.130014,-2.08018,0.0,1.040112,1.235136,-0.19502,...,1.170374,2.730861,3.835922,1.755222,1.235332,4.355984,0.520024,0.715032,-0.32501,-0.52003
bc,lag,2,0.622353,-2.11599,0.062234,2.178335,1.867163,1.120305,0.871343,-1.36926,-1.92942,0.684622,...,-0.37345,-0.18704,-2.30297,0.124451,0.809098,1.058059,2.676111,0.622357,0.560118,-0.43565
bc,lag,3,0.064528,0.645279,-1.03245,-2.064991,-1.419711,1.032515,-2.000491,-0.903453,2.968462,1.032499,...,3.097847,4.775991,4.775919,4.259427,2.129823,0.064477,-2.516599,0.516229,-0.903391,1.096991
bc,lag,4,-1.09465,-2.55417,-0.91221,0.851433,3.709871,1.520432,0.851444,-2.06781,-1.58127,0.304085,...,4.257782,3.284733,0.851676,2.493739,3.467134,5.900169,1.520354,2.067671,0.121629,1.09467
bc,lag,5,-1.49281,-0.5598,2.923387,3.047967,2.985793,2.985763,-3.59e-07,-0.99527,-0.93306,-1.99048,...,4.479034,2.488391,3.546049,1.057399,3.981489,6.469838,-0.933,0.062199,-0.4354,1.430629


In [None]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False)
growth_phases = X.reset_index()['growth_phase']
growth_phases = growth_phases.values  # get the numpy array
growth_phases = growth_phases.reshape(-1, 1)

print(growth_phases[:5])

ohe.fit(growth_phases)
growth_phases = ohe.transform(growth_phases)

print(growth_phases[:5])

[['lag']
 ['lag']
 ['lag']
 ['lag']
 ['lag']]
[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [None]:
X['lag'] = growth_phases[:, 0]
X['log'] = growth_phases[:, 1]
X['stat'] = growth_phases[:, 2]

X.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,1036,1037,1038,1039,1040,1041,1042,lag,log,stat
species,growth_phase,replicate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
bc,lag,1,0.0,1.56006,0.715025,-0.52006,0.130014,-2.08018,0.0,1.040112,1.235136,-0.19502,...,1.755222,1.235332,4.355984,0.520024,0.715032,-0.32501,-0.52003,1.0,0.0,0.0
bc,lag,2,0.622353,-2.11599,0.062234,2.178335,1.867163,1.120305,0.871343,-1.36926,-1.92942,0.684622,...,0.124451,0.809098,1.058059,2.676111,0.622357,0.560118,-0.43565,1.0,0.0,0.0
bc,lag,3,0.064528,0.645279,-1.03245,-2.064991,-1.419711,1.032515,-2.000491,-0.903453,2.968462,1.032499,...,4.259427,2.129823,0.064477,-2.516599,0.516229,-0.903391,1.096991,1.0,0.0,0.0
bc,lag,4,-1.09465,-2.55417,-0.91221,0.851433,3.709871,1.520432,0.851444,-2.06781,-1.58127,0.304085,...,2.493739,3.467134,5.900169,1.520354,2.067671,0.121629,1.09467,1.0,0.0,0.0
bc,lag,5,-1.49281,-0.5598,2.923387,3.047967,2.985793,2.985763,-3.59e-07,-0.99527,-0.93306,-1.99048,...,1.057399,3.981489,6.469838,-0.933,0.062199,-0.4354,1.430629,1.0,0.0,0.0


In [None]:
y = X.reset_index()['species']
y.head()

0    bc
1    bc
2    bc
3    bc
4    bc
Name: species, dtype: object

In [None]:
y.describe()

count     135
unique      6
top        bc
freq       36
Name: species, dtype: object

In [None]:
print(y)

0      bc
1      bc
2      bc
3      bc
4      bc
5      bc
6      bc
7      bc
8      bc
9      bc
10     bc
11     bc
12     bc
13     bc
14     bc
15     bc
16     bc
17     bc
18     bc
19     bc
20     bc
21     bc
22     bc
23     bc
24     bc
25     bc
26     bc
27     bc
28     bc
29     bc
       ..
105    sa
106    sa
107    sa
108    sa
109    sa
110    sa
111    sa
112    sa
113    sa
114    sa
115    sa
116    sa
117    sa
118    se
119    se
120    se
121    se
122    se
123    se
124    se
125    se
126    se
127    se
128    se
129    se
130    se
131    se
132    se
133    se
134    se
Name: species, Length: 135, dtype: object


In [None]:
from sklearn.utils import shuffle

X, y = shuffle(X, y)

print(X[:5])
print(y[:5])

                                       0         1         2         3  \
species growth_phase replicate                                           
pa      stat         08         1.786078  0.184770 -0.923820 -3.264350   
bc      lag          01         0.000000  1.560060  0.715025 -0.520060   
se      stat         07         0.423758  1.089653 -0.363220 -1.574020   
pa      log          08        -0.062110 -0.559000  0.434780  0.496915   
bc      lag          03         0.064528  0.645279 -1.032450 -2.064991   

                                       4         5         6         7  \
species growth_phase replicate                                           
pa      stat         08        -1.539810  1.047061  3.202802  0.431145   
bc      lag          01         0.130014 -2.080180  0.000000  1.040112   
se      stat         07        -1.816200  1.331872  1.755656 -0.363240   
pa      log          08        -1.055960  0.993841  1.801346  1.428657   
bc      lag          03        -1.419

# PCA + SVM

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.99, svd_solver='full')
pca.fit(X)

X_pca = pca.transform(X)
print(X_pca.shape)

(135, 2)


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, cross_val_score, GridSearchCV

param_grid = {
    'kernel': ['linear', 'rbf'],
    'gamma': [10 ** -n for n in range(6)],
    'C': [10 ** n for n in range(-5, 2)]
}

cv = StratifiedKFold(n_splits=3)
clf = SVC()

grid_search = GridSearchCV(clf, param_grid, cv=cv, verbose=10, n_jobs=-1)
grid_search.fit(X_pca, y)

print(grid_search.best_score_)
print(grid_search.best_params_)

Fitting 3 folds for each of 84 candidates, totalling 252 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   51.6s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 13.6min


In [None]:
%%time

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(grid_search.best_estimator_, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
%%time

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(grid_search.best_estimator_, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Random Forest with Decision Stumps

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=1024, max_depth=1)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
%%time

clf = RandomForestClassifier(n_estimators=1024, max_depth=1)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# AdaBoost

In [None]:
%%time

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                        n_estimators=1024)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
%%time

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                        n_estimators=1024)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))