In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell execution so edits to imported source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [40]:
import importlib
import re
from typing import Iterable, List

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import cancerclass as cacl


In [28]:
# Manually re-import the cancerclass module and rebind the `cacl` alias to the
# module object returned by importlib.reload.
# NOTE(review): autoreload mode 2 is enabled in the first cell, so this manual
# reload is presumably a leftover fallback -- confirm whether it is still needed.
cacl = importlib.reload(cacl)

In [3]:
def rename_colums(colname: str):
    """Normalise a sample column name to a ``TCGA-<sample id>`` label.

    Names that do not contain the substring ``'TCGA'`` are returned unchanged.
    For every other name, only the text before the first delimiter is kept and
    it is prefixed with ``TCGA-``.

    Args:
        colname: Original column name.

    Returns:
        The normalised column name.
    """
    if 'TCGA' not in colname:
        return colname

    # Gotcha: inside the character class, `|-|` is parsed as a one-character
    # range, so the effective delimiters are `_`, `|` and `.`. A hyphen does
    # NOT split the name, which is what keeps ids such as "AO-A12D" intact.
    sample_id = re.split('[_|-|.]', colname)[0]
    return f"TCGA-{sample_id}"

def non_cancer_columns(colnames: Iterable[str]) -> List[str]:
    """Return the labels in ``colnames`` that do not contain ``'TCGA'``.

    Args:
        colnames: Iterable of column labels (for example a list or a pandas
            Index of strings). It is iterated element by element, so it must
            be a collection of names rather than a single name.

    Returns:
        A list of the labels without the ``'TCGA'`` substring, in their
        original order.
    """
    return [c for c in colnames if 'TCGA' not in c]

def clean_data(proteome, metadata):
    """Tidy the proteome columns and keep only the metadata rows that match them.

    Unlike an in-place clean-up, the input frames are left untouched: new
    frames are returned, so calling this function again on the same inputs
    gives the same result instead of failing on already-dropped columns.

    Args:
        proteome: DataFrame whose columns are sample names plus the
            ``'gene_symbol'`` and ``'gene_name'`` annotation columns (both
            must be present, otherwise ``drop`` raises ``KeyError``).
        metadata: DataFrame indexed by the sample ids that are compared
            against the renamed proteome column labels.

    Returns:
        A ``(proteome, metadata)`` tuple: the proteome with renamed sample
        columns and without the two annotation columns, and the metadata
        restricted to rows whose index label is a proteome column.
    """
    # Clean columnar data: normalise the sample names, then discard the gene
    # annotation columns. Chaining builds a new frame rather than mutating the
    # caller's object.
    proteome = (
        proteome
        .rename(columns=rename_colums)
        .drop(columns=['gene_symbol', 'gene_name'])
    )

    # Drop patient entries not in the proteome dataset. A boolean mask keeps
    # every matching row exactly once and in its original order, even when the
    # metadata index contains repeated labels.
    metadata = metadata.loc[metadata.index.isin(proteome.columns), :]
    return proteome, metadata

In [37]:
# Read the three input tables; in each one the first row supplies the column
# names and the first column becomes the row index.
cancer_proteomes = pd.read_csv(
    'data/77_cancer_proteomes_CPTAC_itraq.csv',
    header=0,
    index_col=0,
)
cancer_metadata = pd.read_csv(
    'data/clinical_data_breast_cancer.csv',
    header=0,
    index_col=0,
)
pam50dset = pd.read_csv(
    'data/PAM50_proteins.csv',
    header=0,
    index_col=0,
)

# Normalise the proteome column names and restrict the metadata to the
# samples that are present in the proteome table.
cancer_proteomes, cancer_metadata = clean_data(cancer_proteomes, cancer_metadata)

# Show the first rows of the cleaned proteome table.
cancer_proteomes.head()

Unnamed: 0_level_0,TCGA-AO-A12D,TCGA-C8-A131,TCGA-AO-A12B,TCGA-BH-A18Q,TCGA-C8-A130,TCGA-C8-A138,TCGA-E2-A154,TCGA-C8-A12L,TCGA-A2-A0EX,TCGA-AO-A12D,...,TCGA-AO-A12B,TCGA-A2-A0SW,TCGA-AO-A0JL,TCGA-BH-A0BV,TCGA-A2-A0YM,TCGA-BH-A0C7,TCGA-A2-A0SX,263d3f-I.CPTAC,blcdb9-I.CPTAC,c4155b-C.CPTAC
RefSeq_accession_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NP_958782,1.096131,2.609943,-0.659828,0.195341,-0.49406,2.765081,0.862659,1.40757,1.185108,1.100688,...,-0.963904,-0.487772,-0.10668,-0.065838,0.65585,-0.552212,-0.39856,0.598585,-0.191285,0.566975
NP_958785,1.11137,2.650422,-0.648742,0.215413,-0.503899,2.779709,0.870186,1.40757,1.192612,1.100688,...,-0.93821,-0.487772,-0.10668,-0.055893,0.658143,-0.547749,-0.392601,0.606697,-0.183918,0.578702
NP_958786,1.11137,2.650422,-0.654285,0.215413,-0.500619,2.779709,0.870186,1.410312,1.18886,1.100688,...,-0.943919,-0.487772,-0.10668,-0.065838,0.65585,-0.552212,-0.392601,0.603993,-0.186022,0.576747
NP_000436,1.107561,2.646374,-0.632113,0.205377,-0.510459,2.797995,0.866423,1.40757,1.185108,1.100688,...,-0.935355,-0.487772,-0.10668,-0.055893,0.65585,-0.552212,-0.392601,0.603993,-0.186022,0.576747
NP_958781,1.11518,2.646374,-0.640428,0.215413,-0.503899,2.787023,0.870186,1.413053,1.200116,1.093358,...,-0.935355,-0.503853,-0.10668,-0.062523,0.651264,-0.556675,-0.395581,0.603993,-0.167079,0.576747


In [35]:
# Preview the first rows of the clinical metadata table.
cancer_metadata.head()

Unnamed: 0_level_0,Gender,Age at Initial Pathologic Diagnosis,ER Status,PR Status,HER2 Final Status,Tumor,Tumor--T1 Coded,Node,Node-Coded,Metastasis,...,PAM50 mRNA,SigClust Unsupervised mRNA,SigClust Intrinsic mRNA,miRNA Clusters,methylation Clusters,RPPA Clusters,CN Clusters,Integrated Clusters (with PAM50),Integrated Clusters (no exp),Integrated Clusters (unsup exp)
Complete TCGA ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-A2-A0CM,FEMALE,40,Negative,Negative,Negative,T2,T_Other,N0,Negative,M0,...,Basal-like,-12,-13,4,4,Basal,4,2,1,1
TCGA-BH-A18Q,FEMALE,56,Negative,Negative,Negative,T2,T_Other,N1,Positive,M0,...,Basal-like,-12,-13,5,5,Basal,1,2,2,2
TCGA-A7-A0CE,FEMALE,57,Negative,Negative,Negative,T2,T_Other,N0,Negative,M0,...,Basal-like,0,-13,5,5,Basal,1,2,2,2
TCGA-D8-A142,FEMALE,74,Negative,Negative,Negative,T3,T_Other,N0,Negative,M0,...,Basal-like,0,-13,3,5,X,1,2,2,2
TCGA-AO-A0J6,FEMALE,61,Negative,Negative,Negative,T2,T_Other,N0,Negative,M0,...,Basal-like,-12,-13,2,5,Basal,1,2,2,2


In [38]:
# Transpose so that each sample is a row, then left-join the PAM50 subtype from
# the clinical metadata on the sample id index; the label ends up as the last
# column.
# NOTE: this rebinds `cancer_proteomes` from its own transpose, so re-running
# this cell without re-running the load cell operates on the already-merged
# frame and produces a wrong result -- run it once per load.
cancer_proteomes = cancer_proteomes.T.merge(pd.DataFrame(cancer_metadata['PAM50 mRNA']), left_index=True, right_index=True, how='left')

# The last three rows are the samples without a TCGA id (see the tail shown
# below); they have no metadata entry, so label them explicitly.
cancer_proteomes.iloc[-3:,-1] = ["Non-cancer"] * 3

# Encode the label column (last column) of the merged frame. The previous code
# read from `diagnosed_proteome`, which is not defined in this notebook, and
# took the last row instead of the last column.
# NOTE(review): cacl.ohe_data is defined in cancerclass; presumably it accepts
# a Series of class labels -- confirm.
cancer_labels_enc = cacl.ohe_data(cancer_proteomes.iloc[:, -1])
encoded_labels = cancer_labels_enc  # original name kept for any cell that still reads it
cancer_proteomes.tail()

Unnamed: 0,NP_958782,NP_958785,NP_958786,NP_000436,NP_958781,NP_958780,NP_958783,NP_958784,NP_112598,NP_001611,...,NP_061134,NP_932347,NP_003593,NP_997203,NP_001191293,NP_775791,NP_004065,NP_068752,NP_219494,PAM50 mRNA
TCGA-BH-A0C7,-0.552212,-0.547749,-0.552212,-0.552212,-0.556675,-0.547749,-0.552212,-0.552212,0.679466,0.487574,...,,,,,,,,,,Luminal B
TCGA-A2-A0SX,-0.39856,-0.392601,-0.392601,-0.392601,-0.395581,-0.392601,-0.392601,-0.392601,-2.504862,0.69481,...,,,,,,,,,,Basal-like
263d3f-I.CPTAC,0.598585,0.606697,0.603993,0.603993,0.603993,0.606697,0.603993,0.603993,-0.602132,2.778263,...,1.520756,,,,-8.020071,0.049608,,0.019861,,Non-cancer
blcdb9-I.CPTAC,-0.191285,-0.183918,-0.186022,-0.186022,-0.167079,-0.183918,-0.186022,-0.186022,-0.340726,1.36733,...,-2.386605,,,,-3.093822,-0.646977,,-1.718327,,Non-cancer
c4155b-C.CPTAC,0.566975,0.578702,0.576747,0.576747,0.576747,0.578702,0.576747,0.576747,-0.205013,3.21519,...,0.518115,,,,-4.602418,0.24059,,-0.369183,,Non-cancer


In [61]:
# Hold out 10% of the samples for evaluation. `random_state` pins the shuffle
# so the split, and therefore the predictions displayed below, are reproducible
# across kernel restarts.
# `cancer_labels_enc` must already hold one encoded label per row of
# `cancer_proteomes`.
X_train,X_test,Y_train,Y_test = train_test_split(
    cancer_proteomes.iloc[:,:-1],  # every column except the trailing label column
    cancer_labels_enc,
    train_size=0.9,
    shuffle=True,
    random_state=42,
)

# NOTE(review): the recorded output contains a `column_or_1d` warning, which
# suggests the labels are passed as a column vector; consider passing
# `Y_train.ravel()` once cacl.train_pipeline is confirmed to accept 1-D labels.
model = cacl.train_pipeline(X_train, Y_train, imputation_type='mean', classifier='logistic', max_iter=1000)

# Display predicted classes, true classes and rounded class probabilities for
# the held-out samples.
model.predict(X_test),Y_test.flatten(),np.round(model.predict_proba(X_test),2)

  y = column_or_1d(y, warn=True)


(array([3., 3., 3., 1., 3., 1., 0., 0., 4.]),
 array([3., 3., 3., 1., 2., 1., 0., 0., 4.]),
 array([[0.07, 0.12, 0.31, 0.5 , 0.  ],
        [0.01, 0.14, 0.25, 0.6 , 0.  ],
        [0.  , 0.01, 0.  , 0.98, 0.  ],
        [0.  , 0.99, 0.  , 0.  , 0.  ],
        [0.  , 0.02, 0.05, 0.92, 0.  ],
        [0.  , 0.98, 0.01, 0.01, 0.01],
        [0.84, 0.16, 0.  , 0.  , 0.  ],
        [0.76, 0.23, 0.  , 0.  , 0.  ],
        [0.03, 0.  , 0.  , 0.  , 0.97]]))