In [1]:
import pickle

from metabolitics.preprocessing import MetaboliticsPipeline

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.decomposition import PCA, NMF
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, StratifiedKFold

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, make_scorer

import numpy as np, pandas as pd
from collections import defaultdict, OrderedDict
from itertools import chain, starmap
from itertools import product

In [2]:
results = pickle.load(open('../results/lung_cancer.results','rb'))
labels = pickle.load(open('../datasets/lung_cancer_y','rb'))

In [3]:
samples = defaultdict(lambda : [])
[
 samples[key].append(value) for key, value in 
 chain(*map(lambda sample: sample.items(), results))
]

dataset = pd.DataFrame(samples, index=labels)

In [4]:
dataset.T.head()

Unnamed: 0,unhealthy,unhealthy.1,unhealthy.2,unhealthy.3,unhealthy.4,unhealthy.5,unhealthy.6,unhealthy.7,unhealthy.8,unhealthy.9,...,healthy,healthy.1,healthy.2,healthy.3,healthy.4,healthy.5,healthy.6,healthy.7,healthy.8,healthy.9
10FTHF5GLUtl_max,1.549267e-10,52.54325,-1.562057e-10,1e-06,1.988906e-09,-9.349321e-11,1.158528e-10,2.870771e-10,-2.408797e-09,9e-06,...,2.787366e-06,1e-06,3e-06,1.089916e-08,-3.309538e-09,2.765432e-11,-2.172413e-10,2e-06,1.994557e-09,1.087363e-08
10FTHF5GLUtl_min,-3.537384e-10,52.54325,2.248726e-10,1e-06,-2.096044e-09,1.354863e-10,-1.773515e-10,-4.308731e-10,0.0,9e-06,...,9.030146e-10,1e-06,3e-06,0.0,0.0,-1.966782e-11,8.535608e-10,2e-06,-2.658276e-09,0.0
10FTHF5GLUtm_max,1.549267e-10,52.54325,-1.393659e-10,1e-06,9.215455e-10,-7.588596e-11,9.475798e-11,2.870771e-10,-2.802523e-10,9e-06,...,2.787366e-06,1e-06,3e-06,1.089916e-08,-3.309538e-09,4.880008e-11,-2.172413e-10,2e-06,1.994557e-09,1.087363e-08
10FTHF5GLUtm_min,-3.537384e-10,52.54325,2.248726e-10,1e-06,-2.096044e-09,1.354863e-10,-1.773515e-10,-4.308731e-10,1.083208e-09,9e-06,...,9.030146e-10,1e-06,3e-06,-1.391783e-09,1.131582e-09,-5.475002e-11,8.535608e-10,2e-06,-2.658276e-09,0.0
10FTHF6GLUtl_max,2.272316e-10,4.623527e-10,-1.681855e-10,1e-06,2.05273e-09,-9.657697e-11,1.411991e-10,5.131824e-10,-1.204398e-09,9e-06,...,2.787366e-06,1e-06,3e-06,2.174738e-08,-6.595895e-09,3.012701e-11,-4.283106e-10,2e-06,3.11518e-09,1.204222e-08


In [5]:
balance = labels.count('unhealthy') / len(labels)

print(balance)

0.5420560747663551


In [6]:
binarize = lambda ls: np.array([1 if l == 'unhealthy' else 0 for l in ls])

sc = StandardScaler()
X = sc.fit_transform(dataset)
# X = dataset
y = binarize(dataset.index)

In [7]:
dataset.shape

(107, 15570)

In [8]:
classifiers = [
    
    (LogisticRegression,{
        'C': np.geomspace(1e-6, 1e6, num=10),
        'max_iter':range(0,50+1, 10)
    }),
     (RandomForestClassifier, {
         'n_estimators':range(3,10)
     })
]
feature_selection = [
    (PCA, {
        'n_components': range(3, 85, 9)
  })
]

In [9]:
def build_pipeline(p):
    pipeline, pipeline_params = [], OrderedDict()
    
    for model, model_params in p:
        name = model.__name__
        
        pipeline.append((name, model()))
        pipeline_params.update({'{}__{}'.format(name, param_name) : values 
                                for param_name, values in model_params.items()})
    
    return Pipeline(pipeline), pipeline_params

In [10]:
NUM_TRIALS = 1
metrics = ['f1', 'recall', 'precision', 'accuracy']
trials = []

for i in range(NUM_TRIALS):
    cv_pipelines = []
    inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
   
    for pipeline, params in map(build_pipeline, product(feature_selection, classifiers)):
#     for pipeline, params in classifiers:
        cv_pipeline = GridSearchCV(pipeline, params, cv=inner_cv, n_jobs=-1, verbose=1).fit(X, y)
        cv_pipelines.append(cv_pipeline)
        
    best_pipeline = cv_pipelines[np.argmax([i.best_score_ for i in cv_pipelines])]
    cv = cross_validate(best_pipeline.best_estimator_, 
                        X=X, y=y, cv=outer_cv, 
                        scoring=metrics, 
                        return_train_score=False)
    
    trials.append((best_pipeline, cv))
    print("{} trial done".format(i+1))
    print("-"*10)

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 17.7min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 23.3min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 29.8min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 38.2min
[Parallel(n_jobs=-1)]: Done 6000 out of 6000 | elapsed: 45.8min finished


Fitting 10 folds for each of 70 candidates, totalling 700 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:  5.4min finished


1 trial done
----------


In [36]:
for model, scores in trials:
    try:
        n_components = model.best_estimator_.estimator.named_steps['PCA']
        print('pca n_components: {}'.format(n_components.n_components))
    #     except KeyError:
    #         n_components = model.best_estimator_.named_steps['SelectKBest']
    #         print('kkk n_components: {}'.format(n_components.k))
        steps = list(model.best_estimator_.estimator.named_steps)
        m = model.best_estimator_.estimator.named_steps[steps[steps.index("PCA")-1]]
        print(m)
        print("Score:\t{}".format(model.best_score_))
        print("-"*30)
    except AttributeError:
        continue

pca n_components: None
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Score:	0.8878504672897196
------------------------------


In [37]:
trials_scores = [scores for model.best_estimator_, scores in trials]
trials_means = map(lambda trial_scores: {key: value.mean() 
                                         for key, value in trial_scores.items()}, trials_scores)

stats = pd.DataFrame(list(trials_means))
stats.mean()[2:]

test_accuracy     0.860707
test_f1           0.862207
test_precision    0.932381
test_recall       0.810000
dtype: float64

In [38]:
stats.describe()

Unnamed: 0,fit_time,score_time,test_accuracy,test_f1,test_precision,test_recall
count,1.0,1.0,1.0,1.0,1.0,1.0
mean,0.25251,0.015521,0.860707,0.862207,0.932381,0.81
std,,,,,,
min,0.25251,0.015521,0.860707,0.862207,0.932381,0.81
25%,0.25251,0.015521,0.860707,0.862207,0.932381,0.81
50%,0.25251,0.015521,0.860707,0.862207,0.932381,0.81
75%,0.25251,0.015521,0.860707,0.862207,0.932381,0.81
max,0.25251,0.015521,0.860707,0.862207,0.932381,0.81
