In [1]:
import pickle

from metabolitics.preprocessing import MetaboliticsPipeline

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.decomposition import PCA, NMF
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, StratifiedKFold

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, make_scorer

import numpy as np, pandas as pd
from collections import defaultdict, OrderedDict
from itertools import chain, starmap
from itertools import product

In [2]:
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load artifacts produced by a trusted run of this project.
# Context managers guarantee both file handles are closed after loading.
with open('../results/breast_cancer2.results', 'rb') as results_file:
    results = pickle.load(results_file)

with open('../datasets/breast_cancer2_y', 'rb') as labels_file:
    labels = pickle.load(labels_file)

In [3]:
# Collect, for every key, the values found across all per-sample mappings in
# `results`, preserving the order in which the samples appear.
samples = defaultdict(list)
for sample in results:
    for key, value in sample.items():
        samples[key].append(value)

# One column per collected key; rows are indexed by the loaded labels.
dataset = pd.DataFrame(samples, index=labels)

In [4]:
# Preview the first rows of the transposed frame (keys as rows, labels as columns).
dataset.transpose().head()

Unnamed: 0,unhealthy,unhealthy.1,unhealthy.2,healthy,unhealthy.3,unhealthy.4,unhealthy.5,unhealthy.6,healthy.1,healthy.2,...,unhealthy.7,healthy.3,healthy.4,unhealthy.8,unhealthy.9,unhealthy.10,unhealthy.11,unhealthy.12,healthy.5,unhealthy.13
10FTHF5GLUtl_max,8.135672e-06,250.0,-8.64361e-10,2e-06,3e-06,1.854232e-10,-6.618848e-10,-9.646897e-10,-2.165166e-10,2.045454e-09,...,3e-06,7.955769e-10,1.114131e-11,4.504841e-10,100.8991,-5.263701e-11,1.9e-05,-8.662937e-11,-8.585199e-10,4.531557e-10
10FTHF5GLUtl_min,1.135106e-09,0.0,6.664891e-10,2e-06,3e-06,-1.8153e-10,7.204335e-10,0.0,2.136199e-10,-1.568131e-09,...,3e-06,-9.416681e-10,-5.885461e-11,-3.834657e-10,0.0,2.228262e-11,1.9e-05,6.82121e-11,8.677779e-10,-3.458615e-10
10FTHF5GLUtm_max,8.135672e-06,250.0,-8.64361e-10,2e-06,3e-06,1.485887e-10,-6.618848e-10,-9.646897e-10,-1.638686e-10,2.045454e-09,...,3e-06,7.955769e-10,1.000444e-11,4.504841e-10,100.8991,-1.534772e-11,1.9e-05,-8.662937e-11,-3.811351e-10,4.531557e-10
10FTHF5GLUtm_min,1.135106e-09,0.0,6.664891e-10,2e-06,3e-06,-1.8153e-10,7.204335e-10,1.307399e-10,2.106533e-10,-1.568131e-09,...,3e-06,-9.416681e-10,-5.885461e-11,-3.834657e-10,1.212186e-11,0.0,1.9e-05,6.82121e-11,6.842384e-10,-3.458615e-10
10FTHF6GLUtl_max,8.135672e-06,6.647269e-10,-7.013412e-10,2e-06,3e-06,1.884928e-10,-6.858158e-10,-2.514753e-10,-1.141984e-10,1.608118e-09,...,3e-06,8.740528e-10,1.142763e-11,4.482672e-10,-8.736833e-11,-5.263701e-11,1.9e-05,-6.457412e-11,-8.587185e-10,4.483674e-10


In [5]:
# Fraction of the labels that are 'unhealthy'.
unhealthy_count = labels.count('unhealthy')
balance = unhealthy_count / len(labels)

print(balance)

0.6477272727272727


In [6]:
def binarize(ls):
    """Encode labels as a NumPy array of 0/1 values.

    Parameters
    ----------
    ls : iterable
        Labels to encode.

    Returns
    -------
    numpy.ndarray
        1 where the label equals 'unhealthy', 0 for every other label.
    """
    return np.array([1 if l == 'unhealthy' else 0 for l in ls])


# X is the same DataFrame object as `dataset` (an alias, not a copy).
X = dataset
# Binary target derived from the labels stored in the DataFrame index.
y = binarize(dataset.index)

In [7]:
# (rows, columns) of the assembled frame: rows follow `labels`, columns follow the collected keys.
dataset.shape

(88, 15570)

In [8]:
# Each entry is an (estimator class, {parameter name: candidate values}) pair,
# the same shape that build_pipeline iterates over.
classifiers = [
    (RandomForestClassifier, dict(
        max_depth=range(5, 15),
        n_estimators=range(1, 15),
    )),
    (LogisticRegression, dict(
        C=np.geomspace(1e-6, 1e4, num=10),
    )),
]

# NOTE(review): this grid yields 1, 21, 41, 61, 81, 101 components, while the
# dataset shown above has 88 rows; PCA cannot fit more components than
# min(n_samples, n_features) -- confirm the upper values are intended.
feature_selection = [
    (PCA, dict(
        n_components=range(1, 102, 20),
    )),
]

In [9]:
def build_pipeline(p):
    """Assemble a Pipeline and its parameter mapping from (class, params) pairs.

    Parameters
    ----------
    p : iterable of (class, dict)
        Each class is instantiated with no arguments and becomes one pipeline
        step named after the class (its ``__name__``). The dict maps parameter
        names to the values associated with them.

    Returns
    -------
    tuple
        ``(Pipeline(steps), params)`` where ``params`` is an OrderedDict keyed
        by ``'<ClassName>__<parameter name>'``, in the order encountered.
    """
    steps = []
    grid = OrderedDict()

    for estimator_cls, estimator_params in p:
        step_name = estimator_cls.__name__
        steps.append((step_name, estimator_cls()))

        # Prefix every parameter with its step name: '<step>__<param>'.
        for param_name, values in estimator_params.items():
            grid['{}__{}'.format(step_name, param_name)] = values

    return Pipeline(steps), grid