In [2]:
import sys
# Extend the module search path with ../classes so modules stored there can be imported
# (presumably where geno_classifier / geno_utils live — confirm).
sys.path.append("../classes")

In [55]:
import pickle
from functools import reduce

from sklearn.svm import SVC

from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA

from statsmodels.sandbox.stats.multicomp import multipletests
from imblearn.over_sampling import RandomOverSampler

import numpy as np, pandas as pd
from geno_classifier import *
from geno_utils import *
from itertools import starmap

from metabolitics.utils import load_network_model
import GEOparse

from cobra import Model

## GDS3952

In [3]:
# Fetch GEO dataset GDS3952 through GEOparse and keep its `.table` attribute
# (previewed further down as a frame with ID_REF / IDENTIFIER / GSM* columns).
breast_cancer_dataset = GEOparse.get_GEO('GDS3952').table

21-Jan-2018 21:00:03 INFO GEOparse - File already exist: using local version.
21-Jan-2018 21:00:03 INFO GEOparse - Parsing ./GDS3952.soft.gz: 
21-Jan-2018 21:00:03 DEBUG GEOparse - DATABASE: Geo
21-Jan-2018 21:00:03 DEBUG GEOparse - DATASET: GDS3952
21-Jan-2018 21:00:03 DEBUG GEOparse - SUBSET: GDS3952_1
21-Jan-2018 21:00:03 DEBUG GEOparse - SUBSET: GDS3952_2
21-Jan-2018 21:00:03 DEBUG GEOparse - SUBSET: GDS3952_3
21-Jan-2018 21:00:03 DEBUG GEOparse - SUBSET: GDS3952_4
21-Jan-2018 21:00:03 DEBUG GEOparse - SUBSET: GDS3952_5
21-Jan-2018 21:00:03 DEBUG GEOparse - SUBSET: GDS3952_6
21-Jan-2018 21:00:03 DEBUG GEOparse - SUBSET: GDS3952_7
21-Jan-2018 21:00:03 DEBUG GEOparse - SUBSET: GDS3952_8
21-Jan-2018 21:00:03 DEBUG GEOparse - DATASET: GDS3952


## sample types

#### unhealthy

In [4]:
# Sample descriptions of the malignant (invasive breast cancer) PBMC samples.
# The trailing backslashes join all lines into one string without any separator;
# only the whitespace-delimited "GSM..." accession tokens are extracted from it later.
unhealthy_raw = '\
#GSM681992 = Value for GSM681992: PBMC_malignant_training_1; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM681993 = Value for GSM681993: PBMC_malignant_training_2; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM681994 = Value for GSM681994: PBMC_malignant_training_3; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM681995 = Value for GSM681995: PBMC_malignant_training_4; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM681996 = Value for GSM681996: PBMC_malignant_training_5; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM681997 = Value for GSM681997: PBMC_malignant_training_6; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM681998 = Value for GSM681998: PBMC_malignant_training_7; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM681999 = Value for GSM681999: PBMC_malignant_training_8; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682000 = Value for GSM682000: PBMC_malignant_training_9; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682001 = Value for GSM682001: PBMC_malignant_training_10; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682055 = Value for GSM682055: PBMC_malignant_validation_1; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682056 = Value for GSM682056: PBMC_malignant_validation_2; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682057 = Value for GSM682057: PBMC_malignant_validation_3; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682058 = Value for GSM682058: PBMC_malignant_validation_4; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682059 = Value for GSM682059: PBMC_malignant_validation_5; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682060 = Value for GSM682060: PBMC_malignant_validation_6; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682061 = Value for GSM682061: PBMC_malignant_validation_7; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682062 = Value for GSM682062: PBMC_malignant_validation_8; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682063 = Value for GSM682063: PBMC_malignant_validation_9; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682064 = Value for GSM682064: PBMC_malignant_validation_10; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682065 = Value for GSM682065: PBMC_malignant_validation_11; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682066 = Value for GSM682066: PBMC_malignant_validation_12; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682067 = Value for GSM682067: PBMC_malignant_validation_13; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682068 = Value for GSM682068: PBMC_malignant_validation_14; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682069 = Value for GSM682069: PBMC_malignant_validation_15; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682070 = Value for GSM682070: PBMC_malignant_validation_16; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682071 = Value for GSM682071: PBMC_malignant_validation_17; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682072 = Value for GSM682072: PBMC_malignant_validation_18; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682073 = Value for GSM682073: PBMC_malignant_validation_19; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682074 = Value for GSM682074: PBMC_malignant_validation_20; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682075 = Value for GSM682075: PBMC_malignant_validation_21; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682076 = Value for GSM682076: PBMC_malignant_validation_22; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682077 = Value for GSM682077: PBMC_malignant_validation_23; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682078 = Value for GSM682078: PBMC_malignant_validation_24; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682079 = Value for GSM682079: PBMC_malignant_validation_25; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682080 = Value for GSM682080: PBMC_malignant_validation_26; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682081 = Value for GSM682081: PBMC_malignant_validation_27; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682082 = Value for GSM682082: PBMC_malignant_validation_28; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682083 = Value for GSM682083: PBMC_malignant_validation_29; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682084 = Value for GSM682084: PBMC_malignant_validation_30; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682085 = Value for GSM682085: PBMC_malignant_validation_31; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682086 = Value for GSM682086: PBMC_malignant_validation_32; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682087 = Value for GSM682087: PBMC_malignant_validation_33; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682088 = Value for GSM682088: PBMC_malignant_validation_34; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682089 = Value for GSM682089: PBMC_malignant_validation_35; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682090 = Value for GSM682090: PBMC_malignant_validation_36; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682091 = Value for GSM682091: PBMC_malignant_validation_37; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682092 = Value for GSM682092: PBMC_malignant_validation_38; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682093 = Value for GSM682093: PBMC_malignant_validation_39; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682094 = Value for GSM682094: PBMC_malignant_validation_40; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682095 = Value for GSM682095: PBMC_malignant_validation_41; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682123 = Value for GSM682123: PBMC_malignant_validation_42; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682124 = Value for GSM682124: PBMC_malignant_validation_43; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682125 = Value for GSM682125: PBMC_malignant_validation_44; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682126 = Value for GSM682126: PBMC_malignant_validation_45; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682127 = Value for GSM682127: PBMC_malignant_validation_46; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy\
#GSM682128 = Value for GSM682128: PBMC_malignant_validation_47; src: PBMCs from patients with diagnosis of invasive breast cancer, confirmed by diagnostic biopsy'

#### healthy

In [6]:
# Sample descriptions of the normal-mammogram PBMC samples.
# The trailing backslashes join all lines into one string without any separator;
# only the whitespace-delimited "GSM..." accession tokens are extracted from it later.
healthy_raw = '\
#GSM681982 = Value for GSM681982: PBMC_normal_training_1; src: PBMCs from patients with normal mammogram\
#GSM681983 = Value for GSM681983: PBMC_normal_training_2; src: PBMCs from patients with normal mammogram\
#GSM681984 = Value for GSM681984: PBMC_normal_training_3; src: PBMCs from patients with normal mammogram\
#GSM681985 = Value for GSM681985: PBMC_normal_training_4; src: PBMCs from patients with normal mammogram\
#GSM681986 = Value for GSM681986: PBMC_normal_training_5; src: PBMCs from patients with normal mammogram\
#GSM681987 = Value for GSM681987: PBMC_normal_training_6; src: PBMCs from patients with normal mammogram\
#GSM681988 = Value for GSM681988: PBMC_normal_training_7; src: PBMCs from patients with normal mammogram\
#GSM681989 = Value for GSM681989: PBMC_normal_training_8; src: PBMCs from patients with normal mammogram\
#GSM681990 = Value for GSM681990: PBMC_normal_training_9; src: PBMCs from patients with normal mammogram\
#GSM681991 = Value for GSM681991: PBMC_normal_training_10; src: PBMCs from patients with normal mammogram\
#GSM682034 = Value for GSM682034: PBMC_normal_validation_1; src: PBMCs from patients with normal mammogram\
#GSM682035 = Value for GSM682035: PBMC_normal_validation_2; src: PBMCs from patients with normal mammogram\
#GSM682036 = Value for GSM682036: PBMC_normal_validation_3; src: PBMCs from patients with normal mammogram\
#GSM682037 = Value for GSM682037: PBMC_normal_validation_4; src: PBMCs from patients with normal mammogram\
#GSM682038 = Value for GSM682038: PBMC_normal_validation_5; src: PBMCs from patients with normal mammogram\
#GSM682039 = Value for GSM682039: PBMC_normal_validation_6; src: PBMCs from patients with normal mammogram\
#GSM682040 = Value for GSM682040: PBMC_normal_validation_7; src: PBMCs from patients with normal mammogram\
#GSM682041 = Value for GSM682041: PBMC_normal_validation_8; src: PBMCs from patients with normal mammogram\
#GSM682042 = Value for GSM682042: PBMC_normal_validation_9; src: PBMCs from patients with normal mammogram\
#GSM682043 = Value for GSM682043: PBMC_normal_validation_10; src: PBMCs from patients with normal mammogram\
#GSM682044 = Value for GSM682044: PBMC_normal_validation_11; src: PBMCs from patients with normal mammogram\
#GSM682045 = Value for GSM682045: PBMC_normal_validation_12; src: PBMCs from patients with normal mammogram\
#GSM682046 = Value for GSM682046: PBMC_normal_validation_13; src: PBMCs from patients with normal mammogram\
#GSM682047 = Value for GSM682047: PBMC_normal_validation_14; src: PBMCs from patients with normal mammogram\
#GSM682048 = Value for GSM682048: PBMC_normal_validation_15; src: PBMCs from patients with normal mammogram\
#GSM682049 = Value for GSM682049: PBMC_normal_validation_16; src: PBMCs from patients with normal mammogram\
#GSM682050 = Value for GSM682050: PBMC_normal_validation_17; src: PBMCs from patients with normal mammogram\
#GSM682051 = Value for GSM682051: PBMC_normal_validation_18; src: PBMCs from patients with normal mammogram\
#GSM682052 = Value for GSM682052: PBMC_normal_validation_19; src: PBMCs from patients with normal mammogram\
#GSM682053 = Value for GSM682053: PBMC_normal_validation_20; src: PBMCs from patients with normal mammogram\
#GSM682054 = Value for GSM682054: PBMC_normal_validation_21; src: PBMCs from patients with normal mammogram'

In [7]:
def clean(string):
    """Remove every ':' from ``string`` and split what is left on whitespace."""
    return string.replace(':', '').split()


def predicate(word):
    """Tell whether ``word`` begins with the 'GSM' accession prefix."""
    return word.startswith('GSM')


# Unique GSM accession tokens found in each description blob.
healthy = {token for token in clean(healthy_raw) if predicate(token)}
unhealthy = {token for token in clean(unhealthy_raw) if predicate(token)}

### number of samples

In [8]:
# Number of distinct accessions per class: healthy first, then unhealthy.
print(len(healthy), len(unhealthy))

31 57


In [9]:
# Accessions present in both classes; the recorded output is the empty set.
healthy.intersection(unhealthy)

set()

In [10]:
# Preview: first five columns, first rows only.
breast_cancer_dataset.iloc[:, :5].head()

Unnamed: 0,ID_REF,IDENTIFIER,GSM682002,GSM682003,GSM682004
0,1007_s_at,MIR4640,5.66,5.43,5.88
1,1053_at,RFC2,7.01,6.37,6.2
2,117_at,HSPA6,6.13,6.51,6.41
3,121_at,PAX8,6.79,6.93,6.4
4,1255_g_at,GUCA1A,2.11,2.11,2.07


In [11]:
# (accession, class) pairs: all healthy samples first, then all unhealthy ones.
labels_ = [
    (accession, status)
    for status, accessions in (('healthy', healthy), ('unhealthy', unhealthy))
    for accession in accessions
]

In [None]:
# Hand the accession -> class mapping to the project helper for dataset GDS3952 and unpack
# its two return values. NOTE(review): n_jobs=-1 presumably means "use all cores" — confirm
# in parse_database.
X, y = parse_database('GDS3952', labels=dict(labels_), n_jobs=-1)

21-Jan-2018 20:54:12 INFO GEOparse - File already exist: using local version.
21-Jan-2018 20:54:12 INFO GEOparse - Parsing ./GDS3952.soft.gz: 
21-Jan-2018 20:54:12 DEBUG GEOparse - DATABASE: Geo
21-Jan-2018 20:54:12 DEBUG GEOparse - DATASET: GDS3952
21-Jan-2018 20:54:12 DEBUG GEOparse - SUBSET: GDS3952_1
21-Jan-2018 20:54:12 DEBUG GEOparse - SUBSET: GDS3952_2
21-Jan-2018 20:54:12 DEBUG GEOparse - SUBSET: GDS3952_3
21-Jan-2018 20:54:12 DEBUG GEOparse - SUBSET: GDS3952_4
21-Jan-2018 20:54:12 DEBUG GEOparse - SUBSET: GDS3952_5
21-Jan-2018 20:54:12 DEBUG GEOparse - SUBSET: GDS3952_6
21-Jan-2018 20:54:12 DEBUG GEOparse - SUBSET: GDS3952_7
21-Jan-2018 20:54:12 DEBUG GEOparse - SUBSET: GDS3952_8
21-Jan-2018 20:54:12 DEBUG GEOparse - DATASET: GDS3952


### Flux variability analysis

In [None]:
# Disabled cell: presumably the one-off computation that produced the pickle loaded in the
# next cell (confirm). NOTE(review): X_ / y_ are not defined by any earlier visible cell —
# the parse cell above yields X, y.
# results = flux_variance_analysis(X_, y_)
# pickle.dump(results, open("../results/breast_cancer_2.results", "wb"))

In [4]:
# Load the cached analysis results and the sample labels from disk.
# The files are opened in `with` blocks so their handles are closed deterministically.
# NOTE: pickle.load can execute arbitrary code — only load files produced by this project.
with open('../results/breast_cancer_2.results', 'rb') as results_file:
    results = pickle.load(results_file)
with open('../datasets/breast_cancer_2_y', 'rb') as labels_file:
    labels = pickle.load(labels_file)

# Feature matrix built with diff scores and pathway features both enabled, plus the target vector.
X_diff, y = preprocess_results(results, labels, use_diff_score=True, use_pathways=True)

### Nested cross-validation over the best pipeline (using diff values)

In [6]:
# Candidate estimators, each paired with the hyper-parameter grid to search.
classifiers = [
    (SVC, dict(
        C=np.geomspace(1e-6, 1e6, num=10),  # 10 log-spaced values between 1e-6 and 1e6
        class_weight=['balanced'],
    )),
]

# Candidate feature-reduction steps, each paired with its grid.
feature_selection = [
    (PCA, dict(n_components=range(3, 82, 6))),  # 3, 9, ..., 81
]

In [7]:
# Run the project's nested cross-validation over the pipelines built from the grids above.
# The captured log shows 10 trials, each fitting 10 folds for 140 candidates.
trials, stats = nested_cross_validation(X_diff, y, build_pipelines(feature_selection, classifiers))

Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed:   10.0s finished


1 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed:   10.5s finished


2 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed:   10.7s finished


3 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done 340 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed:   12.7s finished


4 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed:   11.1s finished


5 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed:   11.6s finished


6 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed:   11.8s finished


7 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed:   10.0s finished


8 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed:   11.1s finished


9 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    2.8s


10 trial done
----------


[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed:   10.5s finished


In [8]:
# Summary statistics of the timing and test-metric columns over the 10 trials.
stats.describe()

Unnamed: 0,fit_time,score_time,test_accuracy,test_f1,test_precision,test_recall
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.005147,0.002195,0.73675,0.820307,0.744472,0.923667
std,0.00234,0.000593,0.022716,0.015016,0.019627,0.014778
min,0.003878,0.001877,0.699444,0.799431,0.707738,0.896667
25%,0.003987,0.00194,0.725,0.811627,0.737411,0.914167
50%,0.004263,0.00206,0.732361,0.817685,0.740357,0.925
75%,0.005152,0.002092,0.754444,0.83145,0.751815,0.929167
max,0.011626,0.003862,0.775278,0.847429,0.775595,0.946667


### Most significant pathways for breast cancer

In [6]:
# Univariate scoring of the diff-score features with SelectKBest's default settings.
kbest = SelectKBest()
kbest.fit(X_diff, y)
# Second preprocessing pass with the four flags given positionally; its second return value is discarded.
df, _ = preprocess_results(results, labels, True, True, False, False)

In [10]:
# p-value cut-off: features whose p-value is below it are reported as significant.
SIGNIFICANCE_THRESHOLD = 0.05

In [11]:
# Boolean mask of the features whose univariate p-value is below the cut-off.
# Computed once and reused, instead of re-evaluating the comparison for every use.
significant_mask = kbest.pvalues_ < SIGNIFICANCE_THRESHOLD
# Positional indices, kept for any later cell that may still use this name.
significant_pathways_indices = np.argwhere(significant_mask)

# Significant columns of `df`, with rows grouped by their index label
# (the "healthy" / "unhealthy" groups seen in the table below).
significant_pathways = df[df.columns[significant_mask]].groupby(df.index)

# One row per pathway, one column per group. Built by chaining rather than with
# inplace=True so that re-running the cell always starts from the same inputs.
means = (
    significant_pathways.mean().T
    .reset_index()
    .rename(columns={"index": "Pathway", "healthy": "healthy_mean", "unhealthy": "unhealthy_mean"})
)
medians = (
    significant_pathways.median().T
    .reset_index()
    .rename(columns={"index": "Pathway", "healthy": "healthy_median", "unhealthy": "unhealthy_median"})
)
# The boolean mask yields a 1-D vector in the same column order as `medians`' rows;
# indexing with np.argwhere output would give an (n, 1) array instead.
medians["P-val"] = kbest.pvalues_[significant_mask]

# Join on the shared "Pathway" column. Merging on Series keys duplicates it as
# Pathway_x / Pathway_y (plus an extra key column on current pandas) and needs cleanup.
# The misspelled variable name is kept so that any later cell using it keeps working.
singificance_stats = (
    means.merge(medians, on="Pathway")
    .set_index("Pathway")
    .sort_values("P-val")
)
singificance_stats

Unnamed: 0_level_0,healthy_mean,unhealthy_mean,healthy_median,unhealthy_median,P-val
Pathway,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Pyrimidine synthesis,2.658805e-14,80.159983,-23.266364,83.667481,0.000213
Glycolysis/gluconeogenesis,1.696134e-14,-90.341946,-14.75079,-113.18902,0.002633
Fatty acid synthesis,7.334635e-15,-16.163777,-5.119789,-24.159283,0.003926
Vitamin C metabolism,-5.042561e-15,-42.873643,2.288581,-60.21138,0.004268
Glycosphingolipid metabolism,-4.354939e-15,-37.513624,-4.690913,-81.613951,0.005823
Butanoate metabolism,-3.11722e-14,-193.546083,-353.019337,-353.019337,0.013505
Thiamine metabolism,-1.466927e-14,158.743487,-161.737362,49.47379,0.014789
Histidine metabolism,-2.292073e-15,-68.526318,-113.125583,-113.125591,0.017857
"Transport, endoplasmic reticular",1.375244e-14,-37.192206,14.8156,-18.053268,0.019059
"Valine, leucine, and isoleucine metabolism",1.398165e-14,59.890382,-6.061045,21.945343,0.040942


### after p-value correction

In [21]:
# Bonferroni correction over all univariate p-values. alpha is tied to the notebook-wide
# cut-off instead of multipletests' implicit default, so the two cannot drift apart.
corrected_p_values = multipletests(kbest.pvalues_, alpha=SIGNIFICANCE_THRESHOLD, method='bonferroni')
# Element 0 of the returned tuple is the boolean "reject" mask (not the corrected p-values);
# it selects the columns that stay significant after correction.
corrected_pathways = df.columns[corrected_p_values[0]]

In [36]:
# Reference pathway names, each carrying the '_dif' suffix; compared further down against
# the pathways found in this notebook.
# NOTE(review): presumably the pathways reported by the Metabolitics analysis — confirm the source.
metabolitics_pathways = np.array(
    [
        'Alanine and aspartate metabolism_dif',
        'Arginine and Proline Metabolism_dif',
        'Methionine and cysteine metabolism_dif',
        'Taurine and hypotaurine metabolism_dif',
        'CoA catabolism_dif',
        'Fatty acid oxidation_dif',
        'Nucleotide interconversion_dif',
        'Eicosanoid metabolism_dif',
        'Butanoate metabolism_dif',
        'Glycolysis/gluconeogenesis_dif',
        'Pentose phosphate pathway_dif',
        'Urea cycle_dif',
        'Glycine, serine, alanine and threonine metabolism_dif',
        'Folate metabolism_dif',
        'Glutamate metabolism_dif',
        'Sphingolipid metabolism_dif',
        'Glycerophospholipid metabolism_dif',
        'Fatty acid synthesis_dif',
        'ROS detoxification_dif',
        'Aminosugar metabolism_dif',
        'Citric acid cycle_dif',
        'CoA synthesis_dif',
        'Pyrimidine catabolism_dif',
        'Pyruvate metabolism_dif',
    ]
)

In [43]:
# Columns of `df` selected by the Bonferroni reject mask.
corrected_pathways

Index(['Pyrimidine synthesis'], dtype='object')

## without diff values

In [69]:
# Same preprocessing with all four flags switched off, passed as plain positional
# arguments rather than through a splatted list.
df_, y_ = preprocess_results(results, labels, False, False, False, False)
kbest_ = SelectKBest().fit(df_, y_)

# Bonferroni correction at the notebook-wide cut-off instead of multipletests' implicit
# default alpha. Element 0 of the returned tuple is the boolean "reject" mask.
corrected_p_values_ = multipletests(kbest_.pvalues_, alpha=SIGNIFICANCE_THRESHOLD, method='bonferroni')
corrected_pathways_ = df_.columns[corrected_p_values_[0]]



In [70]:
# Feature columns selected by the Bonferroni reject mask.
significant_reactions = df_.columns[corrected_p_values_[0]]

# (old, new) substitutions applied in order to every selected column name,
# i.e. every '_max' / '_min' occurrence is removed.
template = ('_max', ''), ('_min', '')
reactions = {
    reduce(lambda name, pair: name.replace(*pair), template, column)
    for column in significant_reactions
}

model = load_network_model('recon2')

# Subsystem of the first model match for each reaction id, dropping empty names
# and any name that mentions 'Transport'.
pathways = {
    subsystem
    for subsystem in [
        model.reactions.get_by_any(reaction_id)[0].subsystem
        for reaction_id in reactions
    ]
    if 'Transport' not in subsystem and subsystem != ''
}

In [71]:
# For each pathway (in set-iteration order), flag whether its '_dif'-suffixed name
# appears in metabolitics_pathways.
common = [
    '{}_dif'.format(pathway) in metabolitics_pathways
    for pathway in pathways
]

# Boolean-mask the pathway names down to the shared ones.
np.array(list(pathways))[common]

array(['Glycine, serine, alanine and threonine metabolism'], dtype='<U49')