In [1]:
import sys
# Add ../classes to the module search path so modules stored there can be imported.
# The path is relative, so it is resolved against the kernel's working directory.
# NOTE(review): presumably geno_classifier and geno_utils live in ../classes — confirm.
sys.path.append("../classes")

In [2]:
import pickle
from sklearn.svm import SVC

from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA

from statsmodels.sandbox.stats.multicomp import multipletests
from imblearn.over_sampling import RandomOverSampler

import numpy as np, pandas as pd
from geno_classifier import *
from geno_utils import *
from itertools import starmap

import GEOparse

### sample types

01	Primary Solid Tumor	TP<br />
02	Recurrent Solid Tumor	TR<br />
03	Primary Blood Derived Cancer - Peripheral Blood	TB<br />
04	Recurrent Blood Derived Cancer - Bone Marrow	TRBM<br />
05	Additional - New Primary	TAP<br />
06	Metastatic	TM<br />
07	Additional Metastatic	TAM<br />
08	Human Tumor Original Cells	THOC<br />
09	Primary Blood Derived Cancer - Bone Marrow	TBM<br />
10	Blood Derived Normal	NB<br />
11	Solid Tissue Normal	NT<br />
12	Buccal Cell Normal	NBC<br />
13	EBV Immortalized Normal	NEBV<br />
14	Bone Marrow Normal	NBM<br />
15	sample type 15	15SH<br />
16	sample type 16	16SH<br />
20	Control Analyte	CELLC<br />
40	Recurrent Blood Derived Cancer - Peripheral Blood	TRB<br />
50	Cell Lines	CELL<br />
60	Primary Xenograft Tissue	XP<br />
61	Cell Line Derived Xenograft Tissue	XCL<br />
99	sample type 99	99SH<br />

#### Source: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes

In [4]:
# TCGA sample-type codes used to classify sample columns by the suffix of their name.
# Both values are tuples so they can be handed straight to str.endswith().
# Note: ('11') without a trailing comma is just the string '11', not a tuple.
healthy_suffix = ('11',)           # 11 = Solid Tissue Normal
unhealthy_suffix = ('01', '06')    # 01 = Primary Solid Tumor, 06 = Metastatic

In [5]:
# Read the tab-separated expression table into a DataFrame.
breast_cancer_dataset = pd.read_csv(
    "../datasets/AgilentG4502A_07_3",
    sep="\t",
)
# Preview: first five columns of the first rows.
breast_cancer_dataset.iloc[:, :5].head()

Unnamed: 0,sample,TCGA-A8-A0A7-01,TCGA-A8-A07G-01,TCGA-A8-A08R-01,TCGA-A8-A06Y-01
0,RNF14,0.496583,1.0045,-0.149417,1.04525
1,DUOXA1,0.578,1.6955,0.79,2.308
2,UBE2Q2,0.2795,0.795167,0.829667,0.479333
3,RNF10,-0.22225,0.632,-0.01775,1.45775
4,RNF11,1.13175,0.85775,-0.1555,-0.0165


In [6]:
# All column names of the expression table.
columns = breast_cancer_dataset.columns

# Partition the columns by sample-type suffix into two sets of column names.
healthy = {name for name in columns if name.endswith(healthy_suffix)}
unhealthy = {name for name in columns if name.endswith(unhealthy_suffix)}

### number of samples

In [6]:
# Number of healthy and unhealthy sample columns, in that order.
print(f"{len(healthy)} {len(unhealthy)}")

63 534


In [7]:
# Sanity check: columns that ended up in both groups (an empty set means no overlap).
healthy.intersection(unhealthy)

set()

In [20]:
# (column name, label) pairs: every healthy column first, then every unhealthy one.
labels_ = [
    (sample, status)
    for status, group in (('healthy', healthy), ('unhealthy', unhealthy))
    for sample in group
]

In [None]:
# Convert the expression table into model inputs, using the column -> label mapping.
# NOTE(review): presumably X holds the features and y the labels, and n_jobs=-1
# means "use all cores" — confirm against parse_database.
X, y = parse_database(
    breast_cancer_dataset,
    labels=dict(labels_),
    n_jobs=-1,
)

### Flux variability analysis

In [3]:
# Load the stored analysis results and their labels from disk.
# The `with` blocks close each file handle as soon as it has been read,
# instead of leaving it open until garbage collection.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open('../results/breast_cancer3.results', 'rb') as results_file:
    results = pickle.load(results_file)
with open('../datasets/breast_cancer3_y', 'rb') as labels_file:
    labels = pickle.load(labels_file)

# Build the feature table from the results, with the diff-score and pathway options on.
# Note: this rebinds `y`, which the parse_database cell also assigns.
X_diff, y = preprocess_results(results, labels, use_diff_score=True, use_pathways=True)

### Nested cross-validation over the best pipeline (using diff-values)

In [4]:
# Search space for the classification step: (estimator class, parameter grid) pairs.
# C takes 10 geometrically spaced values between 1e-6 and 1e6.
classifiers = [
    (SVC, dict(C=np.geomspace(1e-6, 1e6, num=10), class_weight=['balanced'])),
]

# Search space for the dimensionality-reduction step.
# n_components runs 3, 9, ..., 81 (14 values).
feature_selection = [
    (PCA, dict(n_components=range(3, 82, 6))),
]

In [5]:
# Evaluate the pipelines built from the feature-selection and classifier grids
# with nested cross-validation; this is the long-running cell of the notebook.
trials, stats = nested_cross_validation(
    X_diff,
    y,
    build_pipelines(feature_selection, classifiers),
)

Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 230 tasks      | elapsed:   52.2s
[Parallel(n_jobs=-1)]: Done 484 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 834 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 1284 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed: 15.2min finished


1 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 234 tasks      | elapsed:   48.9s
[Parallel(n_jobs=-1)]: Done 486 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 836 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1286 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed: 14.7min finished


2 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 234 tasks      | elapsed:   48.5s
[Parallel(n_jobs=-1)]: Done 486 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 836 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 1286 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed: 14.6min finished


3 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 234 tasks      | elapsed:   47.6s
[Parallel(n_jobs=-1)]: Done 486 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 836 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1286 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed: 14.5min finished


4 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 230 tasks      | elapsed:   55.6s
[Parallel(n_jobs=-1)]: Done 485 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 835 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 1285 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed: 14.7min finished


5 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 234 tasks      | elapsed:   50.2s
[Parallel(n_jobs=-1)]: Done 486 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 836 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 1286 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed: 14.7min finished


6 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 231 tasks      | elapsed:   51.1s
[Parallel(n_jobs=-1)]: Done 485 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 835 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 1285 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed: 14.9min finished


7 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:   48.5s
[Parallel(n_jobs=-1)]: Done 484 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 834 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 1284 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed: 14.6min finished


8 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 231 tasks      | elapsed:   49.3s
[Parallel(n_jobs=-1)]: Done 484 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 834 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 1284 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed: 14.7min finished


9 trial done
----------
Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:   47.5s
[Parallel(n_jobs=-1)]: Done 483 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 834 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 1284 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed: 14.8min finished


10 trial done
----------


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-trial
# metrics returned by nested_cross_validation.
stats.describe()

Unnamed: 0,fit_time,score_time,test_accuracy,test_f1,test_precision,test_recall
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.02736,0.005827,0.907263,0.94775,0.952113,0.944375
std,0.001517,0.001068,0.005368,0.003069,0.002683,0.004738
min,0.025272,0.004658,0.901254,0.944038,0.948457,0.936373
25%,0.02645,0.004967,0.902843,0.945329,0.950382,0.941929
50%,0.026987,0.005595,0.906241,0.947312,0.951827,0.944654
75%,0.028754,0.006378,0.912138,0.950291,0.953689,0.94579
max,0.029552,0.007678,0.914785,0.95223,0.957194,0.953075


In [9]:
# Fit a SelectKBest scorer with its default settings; only its p-values are used later.
kbest = SelectKBest()
kbest.fit(X_diff, y)
# The four flags are passed positionally, exactly as the original list unpacking did.
# NOTE(review): their meaning is defined by preprocess_results — consider keyword arguments.
df, _ = preprocess_results(results, labels, True, True, False, False)

In [10]:
# Apply a Bonferroni multiple-testing correction to the per-feature p-values.
# The result is indexed later; element 0 is used as the rejection mask.
corrected_p_values = multipletests(
    kbest.pvalues_,
    method='bonferroni',
)

### Most significant pathways for breast cancer

In [11]:
# Indices of the features whose null hypothesis is rejected after correction.
# The rejection mask is already boolean, so it is passed to argwhere directly
# rather than compared with `== True`.
np.argwhere(corrected_p_values[0])

array([], shape=(0, 1), dtype=int64)