In [2]:
import pickle

from metabolitics.preprocessing import MetaboliticsPipeline

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler

import numpy as np, pandas as pd
from collections import defaultdict
from itertools import chain, starmap

In [3]:
results = pickle.load(open('results/breast_cancer2.results','rb'))
labels = pickle.load(open('datasets/breast_cancer2_y','rb'))

In [16]:
samples = defaultdict(lambda : [])
[
 samples[key].append(value) for key, value in 
 chain(*map(lambda sample: sample.items(), results))
]

dataset = pd.DataFrame(samples, index=labels)

In [17]:
dataset.T.head()

Unnamed: 0,unhealthy,unhealthy.1,unhealthy.2,healthy,unhealthy.3,unhealthy.4,unhealthy.5,unhealthy.6,healthy.1,healthy.2,...,unhealthy.7,healthy.3,healthy.4,unhealthy.8,unhealthy.9,unhealthy.10,unhealthy.11,unhealthy.12,healthy.5,unhealthy.13
10FTHF5GLUtl_max,8.135672e-06,250.0,-8.64361e-10,2e-06,3e-06,1.854232e-10,-6.618848e-10,-9.646897e-10,-2.165166e-10,2.045454e-09,...,3e-06,7.955769e-10,1.114131e-11,4.504841e-10,100.8991,-5.263701e-11,1.9e-05,-8.662937e-11,-8.585199e-10,4.531557e-10
10FTHF5GLUtl_min,1.135106e-09,0.0,6.664891e-10,2e-06,3e-06,-1.8153e-10,7.204335e-10,0.0,2.136199e-10,-1.568131e-09,...,3e-06,-9.416681e-10,-5.885461e-11,-3.834657e-10,0.0,2.228262e-11,1.9e-05,6.82121e-11,8.677779e-10,-3.458615e-10
10FTHF5GLUtm_max,8.135672e-06,250.0,-8.64361e-10,2e-06,3e-06,1.485887e-10,-6.618848e-10,-9.646897e-10,-1.638686e-10,2.045454e-09,...,3e-06,7.955769e-10,1.000444e-11,4.504841e-10,100.8991,-1.534772e-11,1.9e-05,-8.662937e-11,-3.811351e-10,4.531557e-10
10FTHF5GLUtm_min,1.135106e-09,0.0,6.664891e-10,2e-06,3e-06,-1.8153e-10,7.204335e-10,1.307399e-10,2.106533e-10,-1.568131e-09,...,3e-06,-9.416681e-10,-5.885461e-11,-3.834657e-10,1.212186e-11,0.0,1.9e-05,6.82121e-11,6.842384e-10,-3.458615e-10
10FTHF6GLUtl_max,8.135672e-06,6.647269e-10,-7.013412e-10,2e-06,3e-06,1.884928e-10,-6.858158e-10,-2.514753e-10,-1.141984e-10,1.608118e-09,...,3e-06,8.740528e-10,1.142763e-11,4.482672e-10,-8.736833e-11,-5.263701e-11,1.9e-05,-6.457412e-11,-8.587185e-10,4.483674e-10


In [18]:
balance = labels.count('unhealthy') / len(labels)

print(balance)

0.6477272727272727


### Standard scaling

In [19]:
std_scalar = StandardScaler().fit(dataset, dataset.index)
X, y = std_scalar.transform(dataset), dataset.index

### Train\Test split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)

In [21]:
dataset.shape

(88, 15570)

### Feature selection and random oversampling on training dataset

In [28]:
feature_selection = Pipeline([('select_k_best', SelectKBest(k=100))]).fit(X_train, y_train)

X_train_f, y_train_f = RandomOverSampler(random_state=42).\
                       fit_sample(feature_selection.transform(X_train), y_train)

X_test_f = feature_selection.transform(X_test)



In [23]:
models = [

  (RandomForestClassifier, {
    'max_depth': range(5, 20),
    'n_estimators': range(1, 8)
  }),

  (LogisticRegression, {
    'C': np.geomspace(1e-4, 1e3, num = 10),
    'max_iter': range(100, 1000 + 1, 1000)
  }),

  (SVC, {
    'C': np.geomspace(1e-4, 1e2, num = 10),
    'degree': range(1, 5),

    'max_iter': range(50000, 100000 + 1, 10000),
    'kernel': ['rbf', 'linear']
  }),

  (SGDClassifier, {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': np.geomspace(1e-4, 1.0, num = 10),
    'max_iter': range(200, 1000 + 1, 100)
  })
]

cv_estimators = []
for model, params in models:
    cv_model = GridSearchCV(model(random_state=42), params, n_jobs=-1).fit(X_train_f, y_train_f)
    cv_estimators.append(cv_model)

In [24]:
f1_scores = []
binarize = lambda ls: [1 if l == 'unhealthy' else 0 for l in ls]

for estimator in cv_estimators:
    score = f1_score(binarize(estimator.predict(X_test_f)), binarize(y_test))
    f1_scores.append(score)

## best estimator

In [25]:
best_estimator = cv_estimators[np.argmax(f1_scores)].best_estimator_
best_estimator

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [26]:
f1_scores

[0.74285714285714277,
 0.5714285714285714,
 0.66666666666666674,
 0.71794871794871795]

### Scores

In [27]:
metrics = {'recall': recall_score, 'precision': precision_score, 'f1':f1_score,
           'accuracy':accuracy_score}

x_predicted = binarize(best_estimator.predict(X_test_f))
y_test_b    = binarize(y_test)

for metric, f in metrics.items():
    print(metric, f(x_predicted, y_test_b), '\n')

precision 0.764705882353 

f1 0.742857142857 

recall 0.722222222222 

accuracy 0.666666666667 

