In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are re-imported before each cell execution (useful while editing the local
# `cancerclass` package alongside this notebook).
%load_ext autoreload
%autoreload 2

In [2]:
import re 

import pandas as pd
import numpy as np

import cancerclass as cacl
from cancerclass.pipeline import benchmark_methods
import importlib 
from sklearn.experimental import enable_iterative_imputer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [3]:
# Explicitly re-import the `cancerclass` package and rebind `cacl` to the
# module object returned by importlib.reload.
# NOTE(review): autoreload is already enabled in the first cell, so this
# manual reload is presumably redundant -- confirm before removing.
cacl = importlib.reload(cacl)

In [4]:
def rename_colums(colname: str):
    """Normalise a proteome column label to a ``TCGA-``-prefixed sample id.

    Labels that do not contain the substring ``'TCGA'`` are returned
    unchanged. Otherwise the label is cut at its first separator and the
    leading piece is returned with ``'TCGA-'`` prepended.

    Args:
        colname: Column label to normalise.

    Returns:
        The normalised label (or ``colname`` itself when it has no ``'TCGA'``).
    """
    if 'TCGA' not in colname:
        return colname

    # The character class matches only '_', '|' and '.': inside the brackets
    # '|-|' is a one-character range, so '-' is NOT a separator. That is why
    # hyphenated ids such as 'AO-A12D' survive intact.
    sample_id = re.split('[_|-|.]', colname)[0]
    return f"TCGA-{sample_id}"

def non_cancer_columns(colnames: str):
    """Return the labels in ``colnames`` that do not contain ``'TCGA'``.

    Despite the ``str`` annotation, the argument is iterated and each element
    is tested individually, so it is meant to be a collection of labels.

    Args:
        colnames: Iterable of column labels.

    Returns:
        A list of the labels without the substring ``'TCGA'``, in input order.
    """
    kept = []
    for label in colnames:
        if 'TCGA' in label:
            continue
        kept.append(label)
    return kept

def clean_data(proteome, metadata):
    """Tidy the proteome table and keep only metadata rows for its samples.

    The proteome columns are renamed with ``rename_colums`` and the
    ``'gene_symbol'`` and ``'gene_name'`` columns are removed. The metadata is
    then restricted to rows whose index label appears among the cleaned
    proteome columns.

    Neither input is modified: new DataFrames are returned, so calling this
    function again on the same objects gives the same result instead of
    failing on the already-dropped columns.

    Args:
        proteome: DataFrame with one column per sample plus the
            ``'gene_symbol'`` and ``'gene_name'`` columns.
        metadata: DataFrame indexed by sample identifier.

    Returns:
        Tuple ``(proteome, metadata)`` holding the cleaned proteome and the
        filtered metadata.

    Raises:
        KeyError: If ``'gene_symbol'`` or ``'gene_name'`` is not a column of
            ``proteome``.
    """
    # Clean columnar data without touching the caller's frame.
    proteome_clean = (
        proteome
        .rename(columns=rename_colums)
        .drop(columns=['gene_symbol', 'gene_name'])
    )

    # Drop patient entries not in proteome dataset. A boolean mask keeps each
    # matching metadata row exactly once, even if the index has duplicates.
    in_proteome = metadata.index.isin(proteome_clean.columns)
    metadata_clean = metadata.loc[in_proteome, :]
    return proteome_clean, metadata_clean

In [5]:
# Read the three input tables; in each, the first row is the header and the
# first column becomes the index.
cancer_proteomes = pd.read_csv(
    'data/77_cancer_proteomes_CPTAC_itraq.csv',
    header=0,
    index_col=0,
)
cancer_metadata = pd.read_csv(
    'data/clinical_data_breast_cancer.csv',
    header=0,
    index_col=0,
)
pam50dset = pd.read_csv(
    'data/PAM50_proteins.csv',
    header=0,
    index_col=0,
)

# Normalise sample names and keep only metadata for samples in the proteome.
cancer_proteomes, cancer_metadata = clean_data(cancer_proteomes, cancer_metadata)
cancer_proteomes.head()

Unnamed: 0_level_0,TCGA-AO-A12D,TCGA-C8-A131,TCGA-AO-A12B,TCGA-BH-A18Q,TCGA-C8-A130,TCGA-C8-A138,TCGA-E2-A154,TCGA-C8-A12L,TCGA-A2-A0EX,TCGA-AO-A12D,...,TCGA-AO-A12B,TCGA-A2-A0SW,TCGA-AO-A0JL,TCGA-BH-A0BV,TCGA-A2-A0YM,TCGA-BH-A0C7,TCGA-A2-A0SX,263d3f-I.CPTAC,blcdb9-I.CPTAC,c4155b-C.CPTAC
RefSeq_accession_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NP_958782,1.096131,2.609943,-0.659828,0.195341,-0.49406,2.765081,0.862659,1.40757,1.185108,1.100688,...,-0.963904,-0.487772,-0.10668,-0.065838,0.65585,-0.552212,-0.39856,0.598585,-0.191285,0.566975
NP_958785,1.11137,2.650422,-0.648742,0.215413,-0.503899,2.779709,0.870186,1.40757,1.192612,1.100688,...,-0.93821,-0.487772,-0.10668,-0.055893,0.658143,-0.547749,-0.392601,0.606697,-0.183918,0.578702
NP_958786,1.11137,2.650422,-0.654285,0.215413,-0.500619,2.779709,0.870186,1.410312,1.18886,1.100688,...,-0.943919,-0.487772,-0.10668,-0.065838,0.65585,-0.552212,-0.392601,0.603993,-0.186022,0.576747
NP_000436,1.107561,2.646374,-0.632113,0.205377,-0.510459,2.797995,0.866423,1.40757,1.185108,1.100688,...,-0.935355,-0.487772,-0.10668,-0.055893,0.65585,-0.552212,-0.392601,0.603993,-0.186022,0.576747
NP_958781,1.11518,2.646374,-0.640428,0.215413,-0.503899,2.787023,0.870186,1.413053,1.200116,1.093358,...,-0.935355,-0.503853,-0.10668,-0.062523,0.651264,-0.556675,-0.395581,0.603993,-0.167079,0.576747


In [6]:
# Preview the first rows of the clinical metadata left after clean_data.
cancer_metadata.head()

Unnamed: 0_level_0,Gender,Age at Initial Pathologic Diagnosis,ER Status,PR Status,HER2 Final Status,Tumor,Tumor--T1 Coded,Node,Node-Coded,Metastasis,...,PAM50 mRNA,SigClust Unsupervised mRNA,SigClust Intrinsic mRNA,miRNA Clusters,methylation Clusters,RPPA Clusters,CN Clusters,Integrated Clusters (with PAM50),Integrated Clusters (no exp),Integrated Clusters (unsup exp)
Complete TCGA ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-A2-A0CM,FEMALE,40,Negative,Negative,Negative,T2,T_Other,N0,Negative,M0,...,Basal-like,-12,-13,4,4,Basal,4,2,1,1
TCGA-BH-A18Q,FEMALE,56,Negative,Negative,Negative,T2,T_Other,N1,Positive,M0,...,Basal-like,-12,-13,5,5,Basal,1,2,2,2
TCGA-A7-A0CE,FEMALE,57,Negative,Negative,Negative,T2,T_Other,N0,Negative,M0,...,Basal-like,0,-13,5,5,Basal,1,2,2,2
TCGA-D8-A142,FEMALE,74,Negative,Negative,Negative,T3,T_Other,N0,Negative,M0,...,Basal-like,0,-13,3,5,X,1,2,2,2
TCGA-AO-A0J6,FEMALE,61,Negative,Negative,Negative,T2,T_Other,N0,Negative,M0,...,Basal-like,-12,-13,2,5,Basal,1,2,2,2


In [7]:
# Transpose to one row per sample and attach each sample's PAM50 subtype as
# the last column. The guard makes the cell safe to re-run: once the label
# column exists, the frame has already been transposed and merged.
if 'PAM50 mRNA' not in cancer_proteomes.columns:
    cancer_proteomes = cancer_proteomes.T.merge(
        cancer_metadata[['PAM50 mRNA']],
        left_index=True,
        right_index=True,
        how='left',
    )

# Samples whose name lacks 'TCGA' have no clinical record; label them by name
# rather than assuming they are the last three rows.
cancer_proteomes.loc[non_cancer_columns(cancer_proteomes.index), 'PAM50 mRNA'] = "Non-cancer"

encoded_labels = cacl.ohe_data(cancer_proteomes.iloc[:,-1])
cancer_proteomes.T.tail()

Unnamed: 0,TCGA-AO-A12D,TCGA-C8-A131,TCGA-AO-A12B,TCGA-BH-A18Q,TCGA-C8-A130,TCGA-C8-A138,TCGA-E2-A154,TCGA-C8-A12L,TCGA-A2-A0EX,TCGA-AO-A12D.1,...,TCGA-AO-A12B.1,TCGA-A2-A0SW,TCGA-AO-A0JL,TCGA-BH-A0BV,TCGA-A2-A0YM,TCGA-BH-A0C7,TCGA-A2-A0SX,263d3f-I.CPTAC,blcdb9-I.CPTAC,c4155b-C.CPTAC
NP_775791,,,,-2.046065,-0.425182,-3.20337,-4.786183,,,,...,,,,,,,,0.049608,-0.646977,0.24059
NP_004065,,,,-1.778435,-0.149673,1.971481,-3.103949,-0.933726,-1.726336,1.294925,...,,,,,,,,,,
NP_068752,-0.633517,4.840325,-1.965192,,,,,,,-0.189341,...,,,,,,,,0.019861,-1.718327,-0.369183
NP_219494,12.666488,0.140736,-2.854835,-3.069752,-0.047997,,,,,13.066445,...,-6.00286,,,,,,,,,
PAM50 mRNA,HER2-enriched,Basal-like,Luminal B,Basal-like,HER2-enriched,HER2-enriched,Luminal A,HER2-enriched,Luminal A,HER2-enriched,...,Luminal B,Luminal B,Basal-like,Luminal A,Basal-like,Luminal B,Basal-like,Non-cancer,Non-cancer,Non-cancer


In [8]:
# Fill missing values in the feature matrix (every column except the trailing
# label column) with a SimpleImputer left at its default settings.
# NOTE(review): the imputer is fitted on all samples before the train/test
# split in the next cell, so test rows influence the fill values -- confirm
# this is acceptable.
imp = SimpleImputer()
data = imp.fit_transform(cancer_proteomes.iloc[:, :-1].to_numpy())

In [16]:
# Hold out 20% of the samples for evaluation. The seed is fixed so that the
# shuffled split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    encoded_labels,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

# NOTE(review): the label column displayed above has five distinct values
# (four PAM50 subtypes plus "Non-cancer") -- confirm n_categories=4 is intended.
model = cacl.train_pipeline(
    X_train,
    Y_train,
    preprocessing_step='standard',
    imputation_type='none',
    classifier='bayeslog',
    n_categories=4,
)

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

We recommend running at least 4 chains for robust computation of convergence diagnostics
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


In [17]:
# Show the predictions for the held-out samples next to the true labels.
# NOTE(review): the recorded run raised "NameError: name 'predict_proba' is
# not defined"; that name does not occur in this cell, so the failure comes
# from the code called here.
(
    model.predict(X_test),
    np.int32(Y_test.flatten()),
)

NameError: name 'predict_proba' is not defined

In [18]:
# Run the package's analysis on the trained model, passing the four split
# arrays, an empty string and the per-sample label list.
# NOTE(review): the recorded run raised "NameError: name 'predict_proba' is
# not defined"; that name does not occur in this cell, so the failure comes
# from the code called here.
cacl.analysis_pipeline(
    model,
    (X_train, X_test, Y_train, Y_test),
    '',
    cancer_proteomes.iloc[:, -1].to_list(),
)

NameError: name 'predict_proba' is not defined

In [None]:
# Compare imputation methods on the train/test split.
# The split cell defines Y_train / Y_test (capital Y); the lowercase names
# used previously were never defined and raised NameError.
# NOTE(review): X_train / X_test come from `data`, which SimpleImputer has
# already filled in -- confirm the benchmark should not get the raw matrix.
df_results = benchmark_methods(
    X_train, Y_train,
    X_test, Y_test,
    methods=['mean', 'mice', 'dream'],
)