In [1]:
import pickle

from metabolitics.preprocessing import MetaboliticsPipeline

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler

import numpy as np, pandas as pd
from collections import defaultdict
from itertools import chain, starmap

In [2]:
# Load the pickled analysis results and their labels. The files are opened
# with `with` so the handles are closed as soon as each load finishes.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('results/breast_cancer2.results', 'rb') as results_file:
    results = pickle.load(results_file)

with open('datasets/breast_cancer2_y', 'rb') as labels_file:
    labels = pickle.load(labels_file)

In [3]:
# Build the metabolitics pipeline from its two named steps.
pipe = MetaboliticsPipeline([
    'reaction-diff',
    'pathway_transformer',
])

# Fit the pipeline on the loaded results/labels and transform them in one call.
pre_processed_results = pipe.fit_transform(results, labels)

In [4]:
# Gather every per-sample value under its key (one list per key), so that each
# key becomes a DataFrame column whose entries follow the order of the samples.
# A plain loop is used instead of a list comprehension run only for its side
# effects, which would build and discard a list of None values.
samples = defaultdict(list)
for sample in pre_processed_results:
    for key, value in sample.items():
        samples[key].append(value)

# One row per sample, indexed by its label; one column per collected key.
dataset = pd.DataFrame(samples, index=labels)

In [5]:
# Transposed preview: the first five columns of `dataset` shown as rows,
# with one column per labelled sample.
dataset.transpose().head()

Unnamed: 0,unhealthy,unhealthy.1,unhealthy.2,healthy,unhealthy.3,unhealthy.4,unhealthy.5,unhealthy.6,healthy.1,healthy.2,...,unhealthy.7,healthy.3,healthy.4,unhealthy.8,unhealthy.9,unhealthy.10,unhealthy.11,unhealthy.12,healthy.5,unhealthy.13
,-25.490606,-21.389085,-29.843938,-14.52688,-12.18496,4.338045,-7.707883,7.645137,99.308536,31.907804,...,-13.039948,-90.031445,-32.690873,-26.580457,-62.169126,-2.229793,-12.184928,-36.71715,-42.234431,42.581163
Alanine and aspartate metabolism,-100.77703,96.445184,38.29439,-100.777,6.018669,62.624693,24.222962,-100.777038,147.239393,-100.777038,...,-58.700946,24.222962,35.878663,24.222962,24.222962,24.222962,-100.777,274.222962,-205.961401,15.846673
Alkaloid synthesis,7e-06,-1e-06,-1e-06,5.131799e-07,6e-06,-1e-06,-1e-06,-1e-06,-1e-06,-1e-06,...,2e-06,-1e-06,-1e-06,-1e-06,-1e-06,-1e-06,3.8e-05,-1e-06,-1e-06,-1e-06
Aminosugar metabolism,-55.414072,73.618178,256.413877,-44.66139,-55.414075,-12.403328,-12.403328,-12.403328,-33.908704,116.62893,...,-55.414077,-55.41408,-12.403328,-55.41408,154.263339,84.370866,-55.414042,-55.41408,-33.908704,-33.908704
Androgen and estrogen synthesis and metabolism,7e-06,-1e-06,-1e-06,1.983698e-06,6e-06,-1e-06,-1e-06,-1e-06,-1e-06,-1e-06,...,4e-06,-1e-06,-1e-06,-1e-06,-1e-06,-1e-06,3.8e-05,-1e-06,-1e-06,-1e-06


In [6]:
# Fraction of the labels that are 'unhealthy' (the class balance).
n_unhealthy = labels.count('unhealthy')
balance = n_unhealthy / len(labels)

print(balance)

0.6477272727272727


### Standard scaling

In [7]:
# Standardise every column of `dataset`; the labels are taken from its index.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed in the following cell, so rows that end up in the test set also
# contribute to the scaling statistics. Consider fitting on the training rows
# only.
std_scalar = StandardScaler()
X = std_scalar.fit_transform(dataset)
y = dataset.index

### Train/test split

In [8]:
# Hold out 10% of the samples, stratified on the labels, with a fixed seed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.10,
    stratify=y,
    random_state=42,
)

In [9]:
# (rows, columns) of the assembled DataFrame: one row per label, one column per collected key.
dataset.shape

(88, 100)

### <del>Feature selection </del>and random oversampling on training dataset

In [10]:
# The selector is fitted on the training split only. With k=100 and the 100
# columns reported by `dataset.shape`, every feature is kept, so this step is
# effectively a pass-through.
feature_selection = Pipeline([('select_k_best', SelectKBest(k=100))]).fit(X_train, y_train)

# Randomly oversample the training split only; the test split is not resampled.
# imbalanced-learn renamed `fit_sample` to `fit_resample` and later removed the
# old name, so use whichever method this installation provides.
oversampler = RandomOverSampler(random_state=42)
resample = getattr(oversampler, 'fit_resample', None) or oversampler.fit_sample
X_train_f, y_train_f = resample(feature_selection.transform(X_train), y_train)

X_test_f = feature_selection.transform(X_test)

In [11]:
# (estimator class, hyper-parameter grid) pairs. Each class is instantiated
# with random_state=42 and tuned with GridSearchCV on the oversampled
# training data.
models = [

  (RandomForestClassifier, {
    'max_depth': range(5, 20),
    'n_estimators': range(1, 8)
  }),

  (LogisticRegression, {
    'C': np.geomspace(1e-4, 1e3, num = 10),
    # The step used to be 1000, which made this range yield the single value
    # 100; a step of 100 searches 100, 200, ..., 1000.
    'max_iter': range(100, 1000 + 1, 100)
  }),

  (SVC, {
    'C': np.geomspace(1e-4, 1e2, num = 10),
    # NOTE(review): per the scikit-learn SVC documentation `degree` only
    # affects the 'poly' kernel, which is not among the kernels searched
    # here; confirm whether 'poly' was meant to be included.
    'degree': range(1, 5),

    'max_iter': range(50000, 100000 + 1, 10000),
    'kernel': ['rbf', 'linear']
  }),

  (SGDClassifier, {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': np.geomspace(1e-4, 1.0, num = 10),
    'max_iter': range(200, 1000 + 1, 100)
  })
]

# One fitted grid search per model, kept in the same order as `models`.
# NOTE(review): no `scoring` is passed, so candidates are ranked by each
# estimator's default score rather than the F1 score used in later cells.
cv_estimators = []
for model, params in models:
    cv_model = GridSearchCV(model(random_state=42), params, n_jobs=-1).fit(X_train_f, y_train_f)
    cv_estimators.append(cv_model)

In [12]:
def binarize(ls):
    """Map labels to integers: 1 for 'unhealthy', 0 for anything else."""
    return [1 if l == 'unhealthy' else 0 for l in ls]


# Test-set F1 of every tuned estimator, in the same order as `cv_estimators`.
# scikit-learn metrics are called as metric(y_true, y_pred), so the true
# labels go first.
f1_scores = []
y_test_binary = binarize(y_test)

for estimator in cv_estimators:
    score = f1_score(y_test_binary, binarize(estimator.predict(X_test_f)))
    f1_scores.append(score)

## Best estimator

In [13]:
# Take the grid search whose test-set F1 is highest and keep its refitted
# best estimator; the bare name on the last line displays it.
best_index = np.argmax(f1_scores)
best_search = cv_estimators[best_index]
best_estimator = best_search.best_estimator_
best_estimator

LogisticRegression(C=0.12915496650148839, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [14]:
# Test-set F1 per model, in the order of the `models` list:
# RandomForestClassifier, LogisticRegression, SVC, SGDClassifier.
f1_scores

[0.61538461538461531,
 0.83333333333333337,
 0.61538461538461531,
 0.7142857142857143]

### Scores

In [15]:
# Metric name -> scikit-learn scoring function, evaluated on the test split.
metrics = {'recall': recall_score, 'precision': precision_score, 'f1':f1_score,
           'accuracy':accuracy_score}

# Binarised predictions of the selected estimator and binarised true labels.
x_predicted = binarize(best_estimator.predict(X_test_f))
y_test_b    = binarize(y_test)

# scikit-learn metrics are called as metric(y_true, y_pred). Passing the
# predictions first swaps what precision and recall measure, so the true
# labels must come first.
# NOTE(review): best_estimator was chosen using F1 on this same test split,
# so these figures are likely optimistic.
for metric, f in metrics.items():
    print(metric, f(y_test_b, x_predicted), '\n')

f1 0.833333333333 

accuracy 0.777777777778 

recall 0.833333333333 

precision 0.833333333333 

