In [1]:
import pickle

from metabolitics.preprocessing import MetaboliticsPipeline

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import precision_score, recall_score, accuracy_score

import numpy as np, pandas as pd

In [2]:
pipe = MetaboliticsPipeline(
                ['reaction-diff',
                'feature-selection',
                'pathway_transformer']
                )

In [3]:
X = pickle.load(open("results/breast_cancer2.results","rb"))
y = pickle.load(open("datasets/breast_cancer2_y","rb"))

In [4]:
results = pipe.fit_transform(X, y)
df_raw = dict()
for i in range(len(results)):
    for k in results[i]:
        df_raw.setdefault(k, [])
        df_raw[k].append(results[i][k])
df = pd.DataFrame(df_raw).T
df.columns = y
df

Unnamed: 0,unhealthy,unhealthy.1,unhealthy.2,healthy,unhealthy.3,unhealthy.4,unhealthy.5,unhealthy.6,healthy.1,healthy.2,...,unhealthy.7,healthy.3,healthy.4,unhealthy.8,unhealthy.9,unhealthy.10,unhealthy.11,unhealthy.12,healthy.5,unhealthy.13
,175.953375,-966.903783,-966.903783,175.953363,175.953367,-355.762558,-966.903783,-621.116237,1255.783452,-206.661256,...,175.953365,-681.189497,-333.953908,-1252.618069,-395.475207,-966.903783,175.953398,-966.903783,1111.066143,-308.091805
Exchange/demand reaction,-436.225169,-764.002956,-760.542638,-102.891842,-102.891838,-902.891845,563.774822,-508.46035,83.420101,-290.976511,...,-102.89184,474.071115,-349.60432,563.774822,-395.735654,-169.558511,-102.891806,216.853,39.740777,-765.89299
Fatty acid oxidation,-262.080422,-552.723705,537.91957,53.709046,127.39326,-567.343588,-114.712009,748.445885,-505.811589,-260.237696,...,64.585445,-388.754074,-516.772951,516.866938,-262.08043,158.972201,148.445924,629.18501,-471.488069,-672.606746
Fatty acid synthesis,-148.713742,-46.861898,184.619583,-482.047081,-482.047077,184.619583,-1148.71375,184.619583,184.619583,184.619583,...,-482.047078,-143.047721,184.619583,-1148.71375,-482.047084,-482.047084,-482.047045,-704.438364,184.619583,184.619583
Folate metabolism,134.953592,352.999033,218.395858,100.221257,352.99904,175.373136,565.578054,-313.667634,1019.6657,17.116439,...,1316.045983,224.352505,165.566285,351.235365,1057.602342,594.874847,352.999072,-69.268947,-102.24127,-1102.900165
Glutamate metabolism,-85.063576,-1085.063584,-3085.063584,-347.318484,-2085.063581,300.508711,-1141.074513,914.936416,914.936416,914.936416,...,-85.063581,-3085.063584,914.936416,914.936416,-3085.063584,-1085.063584,-1085.063546,799.527285,321.023181,914.936416
Glutathione metabolism,858.137183,-164.085047,-641.862825,1358.137177,858.13718,1362.63264,358.137175,111.387628,-641.862825,358.137175,...,685.801577,-347.301491,-641.862825,2218.292966,358.137175,-641.862825,358.137214,1711.973523,-641.862825,224.116557
Glycerophospholipid metabolism,866.240856,929.539065,-1737.127601,-737.1276,262.872405,-1737.127601,-1737.127601,-1737.127601,-1737.127601,581.926331,...,-737.127599,-1737.127601,2262.872399,-1737.127601,-1737.127601,-1737.127601,262.872437,-762.202542,2262.872399,-1737.127601
"Glycine, serine, alanine and threonine metabolism",-1223.779845,-1223.779853,-1223.779853,-223.779851,-1223.779846,-1223.779853,-1223.779853,776.220147,776.220147,776.220147,...,-891.202375,187.097479,776.220147,-1223.779853,-1223.779853,-1223.779853,-1223.779814,-1223.779853,182.306913,776.220147
Glycolysis/gluconeogenesis,-541.503052,-541.50306,291.830273,-374.836392,-541.503053,-541.50306,-541.50306,291.830273,291.830273,625.163607,...,-385.838322,291.830273,-373.777914,-541.50306,-541.50306,-541.50306,-541.503021,-541.50306,-541.50306,-541.50306


In [5]:
df.to_csv("results/breast_cancer2_diff.csv", index_label="Pathway")

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
X_train = pipe.fit_transform(X_train, y_train)

In [8]:
X_test = pipe.transform(X_test)

In [9]:
X_train = [list(dic.values()) for dic in X_train]
X_test = [list(dic.values()) for dic in X_test]

In [10]:
models = [ (RandomForestClassifier, {'max_depth': range(10, 20), 'n_estimators':range(1,8)}),
           (LogisticRegression, {'C': np.geomspace(1e-4, 1e3, num=10), 'max_iter':range(100, 1000+1, 1000)}),
           (SVC, {'C': np.geomspace(1e-4, 1e2, num=10), 'degree':range(1, 5),
                  'max_iter':range(50000, 100000+1, 10000), 'kernel':['rbf', 'poly', 'linear', 'sigmoid']}),
           (SGDClassifier, {'penalty':['l1','l2', 'elasticnet'], 'alpha':np.geomspace(1e-4, 1.0, num=10),
                            'epsilon':np.geomspace(1e-4, 1e3, num=10), 'max_iter':range(200, 1000+1, 100)}),
           (MLPClassifier, {'max_iter':range(1000, 10000+1, 1000),
                            'activation':['identity', 'logistic', 'tanh', 'relu']})
         ]

results = []
cv_estimators = []
for model, params in models:
    print(params)
    result = GridSearchCV(model(random_state=42), params).fit(X_train, y_train)
    results.append(result.best_score_)
    cv_estimators.append(result)

best = cv_estimators[np.argmax(results)]
best_model = best.best_estimator_
score = best.best_score_

{'max_depth': range(10, 20), 'n_estimators': range(1, 8)}
{'max_iter': range(100, 1001, 1000), 'C': array([  1.00000000e-04,   5.99484250e-04,   3.59381366e-03,
         2.15443469e-02,   1.29154967e-01,   7.74263683e-01,
         4.64158883e+00,   2.78255940e+01,   1.66810054e+02,
         1.00000000e+03])}
{'degree': range(1, 5), 'max_iter': range(50000, 100001, 10000), 'C': array([  1.00000000e-04,   4.64158883e-04,   2.15443469e-03,
         1.00000000e-02,   4.64158883e-02,   2.15443469e-01,
         1.00000000e+00,   4.64158883e+00,   2.15443469e+01,
         1.00000000e+02]), 'kernel': ['rbf', 'poly', 'linear', 'sigmoid']}








{'penalty': ['l1', 'l2', 'elasticnet'], 'max_iter': range(200, 1001, 100), 'alpha': array([  1.00000000e-04,   2.78255940e-04,   7.74263683e-04,
         2.15443469e-03,   5.99484250e-03,   1.66810054e-02,
         4.64158883e-02,   1.29154967e-01,   3.59381366e-01,
         1.00000000e+00]), 'epsilon': array([  1.00000000e-04,   5.99484250e-04,   3.59381366e-03,
         2.15443469e-02,   1.29154967e-01,   7.74263683e-01,
         4.64158883e+00,   2.78255940e+01,   1.66810054e+02,
         1.00000000e+03])}
{'activation': ['identity', 'logistic', 'tanh', 'relu'], 'max_iter': range(1000, 10001, 1000)}
MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
       solver='adam', tol=0.0001, validation_fractio

In [15]:
print("CV accuracy is: ", score)
print("Best model is: ",best_model)

CV accuracy is:  0.844827586207
Best model is:  MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)


In [11]:
def eval_model(model, X_train, y_train, X_test, y_test):
        y_train = [i=='healthy' for i in y_train]
        y_test = [i=='healthy' for i in y_test]
        model.fit(X_train, y_train)
        predicted_y = model.predict(X_test)
        precision, recall, accuracy = recall_score(y_test, predicted_y),\
                                           precision_score(y_test,predicted_y),\
                                            accuracy_score(y_test, predicted_y)
        f1 = 2*((1.0/precision)+(1.0/recall))
        return precision, recall, f1, accuracy

In [14]:
pre, rec, f1, accuracy = eval_model(best_model, X_train, y_train, X_test, y_test)
print("Precision: ",pre)
print("Recall: ",rec)
print("F1 score: ",f1)
print("Accuracy: ",accuracy)

Precision:  0.5
Recall:  0.833333333333
F1 score:  6.4
Accuracy:  0.8
