Classification of the bacteria data using three different classifiers:
1. SVM
2. RandomForest with decision stumps
3. AdaBoost with decision stumps.

The data is each replicate of each species along the rows, and the fluorescense spectra readings at the lag growth phase along the columns. 

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
os.getcwd()

'/mnt/d/dev/summer-bursary-2018/bacteria'

In [3]:
os.listdir('data')

['16ms_32ms_growth_phase_spectra.csv',
 '16_ms_lag_codes.csv',
 'bacteria.csv',
 'Classific.py',
 'graphs.pdf',
 'wavelengths.csv']

In [4]:
df = pd.read_csv('data/bacteria.csv', header=[0, 1, 2], index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1043 entries, 0 to 1042
Columns: 135 entries, (lag, bc, 01) to (stat, se, 07)
dtypes: float64(135)
memory usage: 1.1 MB


# Preprocessing

In [5]:
lag = df['lag'].T
lag

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,1033,1034,1035,1036,1037,1038,1039,1040,1041,1042
species,replicate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
bc,1,0.0,1.56006,0.715025,-0.52006,0.130014,-2.08018,0.0,1.040112,1.235136,-0.19502,...,1.170374,2.730861,3.835922,1.755222,1.235332,4.355984,0.520024,0.715032,-0.32501,-0.52003
bc,2,0.622353,-2.11599,0.062234,2.178335,1.867163,1.120305,0.871343,-1.36926,-1.92942,0.684622,...,-0.37345,-0.18704,-2.30297,0.124451,0.809098,1.058059,2.676111,0.622357,0.560118,-0.43565
bc,3,0.064528,0.645279,-1.03245,-2.064991,-1.419711,1.032515,-2.000491,-0.903453,2.968462,1.032499,...,3.097847,4.775991,4.775919,4.259427,2.129823,0.064477,-2.516599,0.5162295,-0.9033913,1.096991
bc,4,-1.09465,-2.55417,-0.91221,0.851433,3.709871,1.520432,0.851444,-2.06781,-1.58127,0.304085,...,4.257782,3.284733,0.851676,2.493739,3.467134,5.900169,1.520354,2.067671,0.121629,1.09467
bc,5,-1.49281,-0.5598,2.923387,3.047967,2.985793,2.985763,-3.59e-07,-0.99527,-0.93306,-1.99048,...,4.479034,2.488391,3.546049,1.057399,3.981489,6.469838,-0.933,0.062199,-0.4354,1.430629
bc,6,-0.97976,-0.06123,0.489875,-1.53094,3e-06,-1.34723,0.551145,-1.95965,-0.55115,-0.4899,...,4.40969,2.572408,1.960104,2.939695,4.960748,2.449745,-1.4084,0.551118,0.428644,1.959545
bc,7,-0.24007,0.600163,0.540146,-1.2004,1.260425,-1.20039,1.740574,-1.86063,-0.42014,1.860604,...,3.721497,2.281147,1.020509,4.081891,3.121564,3.601508,-0.48014,-0.3601,-1.44039,-0.24007
bc,8,0.060945,-0.79228,0.182837,0.853271,-3.04742,2.011285,2.011299,-0.67043,2.4989,-0.60948,...,1.523849,2.499054,0.975422,1.036091,2.682338,1.523849,0.670394,0.304728,0.731344,-0.30473
bc,9,0.247423,0.123709,-0.74227,1.360885,2.226921,0.123719,0.866028,1.360911,1.1e-05,-0.12371,...,2.474701,3.835622,5.815449,0.866334,2.474814,3.093127,0.123709,0.123711,-0.86597,-0.49485
bc,10,2.628512,0.292028,-0.40884,-5.7415,4.453517,0.878834,-0.87908,-6.38674,3.80936,-3.74994,...,-1.58348,-11.7231,-8.90937,-4.161,-10.0234,-1.11407,4.555887,-3.97274,-2.62882,-3.44933


In [6]:
lag.isnull().values.any()

False

In [7]:
X = lag

In [8]:
y = X.reset_index()['species']
y.head()

0    bc
1    bc
2    bc
3    bc
4    bc
Name: species, dtype: object

In [9]:
y.describe()

count     47
unique     6
top       bc
freq      12
Name: species, dtype: object

In [10]:
print(y)

0     bc
1     bc
2     bc
3     bc
4     bc
5     bc
6     bc
7     bc
8     bc
9     bc
10    bc
11    bc
12    ec
13    ec
14    ec
15    ec
16    ec
17    ec
18    ec
19    ec
20    lm
21    lm
22    lm
23    lm
24    lm
25    pa
26    pa
27    pa
28    pa
29    pa
30    pa
31    sa
32    sa
33    sa
34    sa
35    sa
36    sa
37    sa
38    sa
39    sa
40    sa
41    sa
42    se
43    se
44    se
45    se
46    se
Name: species, dtype: object


In [11]:
from sklearn.utils import shuffle

X, y = shuffle(X, y)

print(X[:5])
print(y[:5])

                       0         1         2         3         4     \
species replicate                                                     
bc      03         0.064528  0.645279 -1.032450 -2.064991 -1.419711   
pa      06        -1.641770  0.117268 -0.234540  0.645014 -0.293190   
lm      01        -3.504790  4.778309  6.435558  1.725630  0.191804   
        03         0.519307 -0.259650 -0.259650  2.077325  1.038670   
sa      10        -0.911120 -0.607410  1.397044  0.850412  0.060745   

                        5         6         7         8             9     \
species replicate                                                          
bc      03          1.032515 -2.000491 -0.903453  2.968462  1.032499e+00   
pa      06          0.996838 -0.820940  2.052344 -0.938200 -2.110960e+00   
lm      01         12.464440 -4.282870 -1.150720  2.748759 -7.350220e+00   
        03          0.259666 -2.207160  0.519336 -1.103600  2.140000e-07   
sa      10         -2.065330 -0.789690 -2.6120

# PCA + SVM

In [12]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.99, svd_solver='full')
pca.fit(X)

X_pca = pca.transform(X)
print(X_pca.shape)

(47, 2)


In [13]:
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, cross_val_score, GridSearchCV

param_grid = {
    'kernel': ['linear', 'rbf'],
    'gamma': [10 ** -n for n in range(6)],
    'C': [10 ** n for n in range(-5, 2)]
}

cv = StratifiedKFold(n_splits=3)
clf = SVC()

grid_search = GridSearchCV(clf, param_grid, cv=cv, verbose=10, n_jobs=4)
grid_search.fit(X_pca, y)

print(grid_search.best_score_)
print(grid_search.best_params_)

Fitting 3 folds for each of 84 candidates, totalling 252 fits
[CV] C=1e-05, gamma=1, kernel=linear .................................
[CV] C=1e-05, gamma=1, kernel=linear .................................
[CV] C=1e-05, gamma=1, kernel=linear .................................
[CV] C=1e-05, gamma=1, kernel=rbf ....................................
[CV]  C=1e-05, gamma=1, kernel=linear, score=0.5882352941176471, total=   0.0s
[CV]  C=1e-05, gamma=1, kernel=linear, score=0.5294117647058824, total=   0.0s
[CV] C=1e-05, gamma=1, kernel=rbf ....................................
[CV] C=1e-05, gamma=1, kernel=rbf ....................................
[CV]  C=1e-05, gamma=1, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV]  C=1e-05, gamma=1, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=1e-05, gamma=0.1, kernel=linear ...............................
[CV] C=1e-05, gamma=0.1, kernel=linear ...............................
[CV]  C=1e-05, gamma=1, kernel=rbf, score=0.23076923076923

[Parallel(n_jobs=4)]: Batch computation too fast (0.0470s.) Setting batch_size=8.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Batch computation too fast (0.0739s.) Setting batch_size=42.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.1s


[CV]  C=0.0001, gamma=1, kernel=linear, score=0.6153846153846154, total=   0.1s
[CV] C=0.0001, gamma=1, kernel=rbf ...................................
[CV]  C=0.0001, gamma=0.001, kernel=linear, score=0.6153846153846154, total=   0.1s
[CV] C=0.0001, gamma=0.001, kernel=rbf ...............................
[CV]  C=0.0001, gamma=1, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=0.0001, gamma=0.0001, kernel=rbf ..............................
[CV]  C=0.0001, gamma=0.001, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=0.0001, gamma=0.001, kernel=rbf ...............................
[CV]  C=0.0001, gamma=0.0001, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=0.0001, gamma=0.0001, kernel=rbf ..............................
[CV]  C=0.0001, gamma=0.001, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=0.0001, gamma=0.001, kernel=rbf ...............................
[CV]  C=0.0001, gamma=0.0001, kernel=rbf, score=0.3076923076923077, total=   0.0s
[CV

[CV]  C=0.001, gamma=0.0001, kernel=linear, score=0.5294117647058824, total=   0.1s
[CV] C=0.001, gamma=0.0001, kernel=linear ............................
[CV]  C=0.001, gamma=0.0001, kernel=linear, score=0.5882352941176471, total=   0.4s
[CV] C=0.001, gamma=0.0001, kernel=linear ............................
[CV]  C=0.001, gamma=0.0001, kernel=linear, score=0.6153846153846154, total=   1.0s
[CV] C=0.001, gamma=0.0001, kernel=rbf ...............................
[CV]  C=0.001, gamma=0.0001, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=0.001, gamma=0.0001, kernel=rbf ...............................
[CV]  C=0.001, gamma=0.0001, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=0.001, gamma=0.0001, kernel=rbf ...............................
[CV]  C=0.001, gamma=0.0001, kernel=rbf, score=0.3076923076923077, total=   0.0s
[CV] C=0.001, gamma=1e-05, kernel=linear .............................
[CV]  C=0.001, gamma=1e-05, kernel=linear, score=0.5294117647058824, total=  

[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV]  C=0.1, gamma=0.001, kernel=linear, score=0.5294117647058824, total=   7.4s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV]  C=0.1, gamma=0.001, kernel=linear, score=0.47058823529411764, total=  19.5s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV]  C=1, gamma=0.001, kernel=linear, score=0.5294117647058824, total= 1.8min
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV]  C=1, gamma=0.001, kernel=linear, score=0.6153846153846154, total=  53.3s
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV]  C=1, gamma=0.001, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV]  C=1, gamma=0.001, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV]  C=1, gamma=0.001, kern

[CV]  C=1, gamma=1, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV]  C=1, gamma=1, kernel=rbf, score=0.23076923076923078, total=   0.0s
[CV] C=1, gamma=0.1, kernel=linear ...................................
[CV]  C=1, gamma=0.1, kernel=linear, score=0.5294117647058824, total=  54.0s
[CV] C=1, gamma=0.1, kernel=linear ...................................
[CV]  C=1, gamma=0.1, kernel=linear, score=0.5294117647058824, total= 1.4min
[CV] C=1, gamma=0.1, kernel=linear ...................................
[CV]  C=10, gamma=1e-05, kernel=linear, score=0.5294117647058824, total= 5.0min
[CV] C=10, gamma=1e-05, kernel=linear ................................
[CV]  C=1, gamma=0.1, kernel=linear, score=0.6153846153846154, total=  45.7s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV]  C=1, gamma=0.1, kernel=rbf, score=0.23529411764705882, total=   0.0s
[CV] C=1, gamma=0.1, kernel=rbf ..........

[Parallel(n_jobs=4)]: Done 252 out of 252 | elapsed: 48.5min finished


In [14]:
%%time

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(grid_search.best_estimator_, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.54 (+/- 0.18)
CPU times: user 219 ms, sys: 0 ns, total: 219 ms
Wall time: 212 ms


In [15]:
%%time

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(grid_search.best_estimator_, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.57 (+/- 0.23)
CPU times: user 375 ms, sys: 15.6 ms, total: 391 ms
Wall time: 375 ms


# Random Forest with Decision Stumps

In [16]:
%%time

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=1024, max_depth=1)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

  from numpy.core.umath_tests import inner1d


Accuracy: 0.44 (+/- 0.12)
CPU times: user 44.3 s, sys: 359 ms, total: 44.7 s
Wall time: 44.7 s


In [17]:
%%time

clf = RandomForestClassifier(n_estimators=1024, max_depth=1)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.47 (+/- 0.10)
CPU times: user 46.3 s, sys: 328 ms, total: 46.6 s
Wall time: 46.6 s


# AdaBoost

In [18]:
%%time

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                        n_estimators=1024)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X_pca, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.35 (+/- 0.18)
CPU times: user 50.7 s, sys: 0 ns, total: 50.7 s
Wall time: 50.7 s


In [19]:
%%time

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                        n_estimators=1024)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=20)
scores = cross_val_score(clf, X, y, cv=cv)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.38 (+/- 0.17)
CPU times: user 2min 40s, sys: 0 ns, total: 2min 40s
Wall time: 2min 40s
