In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

## Load the CNA data

In [4]:
# CNA matrix for the 0.8 threshold: one row per sample file, one column per
# chromosome segment (see the head() preview further down).
cna_08_path = "../data/cna/tcga_cna_raw_all_samples_all_chr_0.8_threshold.pkl"
x_all_08 = pd.read_pickle(cna_08_path)
#x_all_09 = pd.read_pickle("../data/cna/tcga_cna_raw_all_samples_all_chr_0.9_threshold.pkl")

### NaN values appear where the start-stop positions were empty; we simply fill them with 0

In [5]:
# Total number of missing cells across the whole matrix
# (per-column NaN counts, then summed over the columns).
nan_per_column = x_all_08.isna().sum()
nan_per_column.sum()

362112

In [6]:
# Replace every missing value with 0 (rebinding the name instead of mutating in place).
x_all_08 = x_all_08.fillna(0)

In [7]:
#x_all_09['chr1_5610:5611'][x_all_09['chr1_5610:5611'].isnull()]

In [8]:
# The first 12 characters of each row label (e.g. "TCGA-AA-3693") are stored
# as `tcga_id`; this column is the join key used for the subtype tables later on.
x_all_08['tcga_id'] = [sample_name[:12] for sample_name in x_all_08.index]
#x_all_09['tcga_id'] = list(map(lambda x: x[:12], x_all_09.index))

In [17]:
# Labels of every column whose name contains "chrX".
chr_x_frame = x_all_08.filter(like="chrX", axis="columns")
chr_col = chr_x_frame.columns

In [18]:
# Copy of the matrix without the chrX columns collected in `chr_col`.
# NOTE(review): no later cell visible in this notebook reads `x_all_08_no_x`.
x_all_08_no_x = x_all_08.drop(columns=chr_col)

In [9]:
# Preview the first five rows (5 is the default for head()).
x_all_08.head()

Unnamed: 0,chr1_3218610:12354307,chr1_12355487:12355503,chr1_12357088:23957783,chr1_23958921:23959613,chr1_23960868:31621427,chr1_31623667:31625346,chr1_31625389:43476957,chr1_43477796:43478606,chr1_43486624:44635476,chr1_44636481:44637390,...,chrX_150944418:151149982,chrX_151150032:151150133,chrX_151150476:152705438,chrX_152706248:152706348,chrX_152706803:153712179,chrX_153713097:153713787,chrX_153714314:153762695,chrX_153764217:153765166,chrX_153766111:154905589,tcga_id
TCGA-AA-3693-01A-01D-0903-01.bed,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,...,-0.0004,-0.0004,-0.0004,-0.0004,-0.0004,-0.0004,-0.0004,-0.0004,-0.0004,TCGA-AA-3693
TCGA-4P-AA8J-01A-11D-A390-01.bed,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,...,0.0068,0.0068,0.0068,0.0068,0.0068,0.0068,0.0068,0.0068,0.0068,TCGA-4P-AA8J
TCGA-24-0979-01A-01D-0428-01.bed,0.2214,0.2214,0.2214,0.2214,0.2214,0.2214,0.522517,0.5523,0.5523,-1.5957,...,0.6317,0.6317,0.6317,0.6317,0.6317,0.6317,0.6317,0.6317,0.6317,TCGA-24-0979
TCGA-4W-AA9T-01A-11D-A390-01.bed,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,...,0.0017,0.0017,0.0017,0.0017,0.0017,0.0017,0.0017,0.0017,0.0017,TCGA-4W-AA9T
TCGA-5P-A9KC-01A-11D-A42I-01.bed,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,...,0.0222,0.0222,0.0222,0.0222,0.0222,0.0222,0.0222,0.0222,0.0222,TCGA-5P-A9KC


In [7]:
# Only the first two columns of each BRCA table are kept; the head() previews
# show them to be `tcga_id` plus the subtype label
# (`Ciriello_subtype` for train, `subtype` for test).
translation_table_train = pd.read_pickle(
    '../data/tcga_brca_raw_19036_row_log_norm_train.pkl'
).iloc[:, :2]
translation_table_test = pd.read_pickle(
    '../data/tcga_brca_raw_19036_row_norm_test.pkl'
).iloc[:, :2]

In [8]:
# Preview the train id -> subtype table (head() defaults to five rows).
translation_table_train.head()

Unnamed: 0,tcga_id,Ciriello_subtype
0,TCGA-A1-A0SK,Basal
1,TCGA-A2-A04P,Basal
2,TCGA-A2-A0CM,Basal
3,TCGA-A2-A0D2,Basal
4,TCGA-A2-A0ST,Basal


In [9]:
# Number of sample rows per patient id, most frequent first; show the top five.
samples_per_patient = x_all_08['tcga_id'].value_counts()
samples_per_patient.head()

TCGA-A6-2684    7
TCGA-44-2662    7
TCGA-44-2665    7
TCGA-44-2656    7
TCGA-44-4112    7
Name: tcga_id, dtype: int64

In [10]:
# Preview the test id -> subtype table (head() defaults to five rows).
translation_table_test.head()

Unnamed: 0,tcga_id,subtype
0,TCGA-3C-AAAU,LumA
1,TCGA-3C-AALI,Her2
2,TCGA-3C-AALJ,LumB
3,TCGA-3C-AALK,LumA
4,TCGA-4H-AAAK,LumA


### Some patients have several samples; we need to handle these duplicates by selecting the tumor sample for each such patient

In [11]:
# Order the rows by sample file name so that all samples of one patient sit
# together, lowest barcode first (the de-duplication step relies on this order).
x_all_08 = x_all_08.sort_index()
#x_all_09.sort_index(inplace=True)

In [12]:
# Show every sample row belonging to one example patient.
is_example_patient = x_all_08.index.str.contains("TCGA-44-4112")
x_all_08.loc[is_example_patient]

Unnamed: 0,chr1_3218610:12354307,chr1_12355487:12355503,chr1_12357088:23957783,chr1_23958921:23959613,chr1_23960868:31621427,chr1_31623667:31625346,chr1_31625389:43476957,chr1_43477796:43478606,chr1_43486624:44635476,chr1_44636481:44637390,...,chrX_150944418:151149982,chrX_151150032:151150133,chrX_151150476:152705438,chrX_152706248:152706348,chrX_152706803:153712179,chrX_153713097:153713787,chrX_153714314:153762695,chrX_153764217:153765166,chrX_153766111:154905589,tcga_id
TCGA-44-4112-01A-01D-1877-01.bed,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,...,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,TCGA-44-4112
TCGA-44-4112-01A-01D-A273-01.bed,-0.0278,-0.0278,-0.0278,-0.0278,-0.0278,-0.0278,-0.0278,-0.0278,-0.0278,-0.0278,...,-0.0199,-0.0199,-0.0199,-0.0199,-0.0199,-0.0199,-0.0199,-0.0199,-0.0199,TCGA-44-4112
TCGA-44-4112-01B-06D-A273-01.bed,-0.0908,-0.0908,-0.061466,-0.0515,-0.053801,-0.0353,-0.0353,-0.0353,-0.0353,-0.0353,...,-0.0649,-0.0649,-0.0649,-0.0649,-0.0649,-0.0649,-0.0649,-0.0649,-0.0649,TCGA-44-4112
TCGA-44-4112-10A-01D-1450-01.bed,-0.0047,-0.0047,-0.0047,-0.0047,-0.0047,-0.0047,-0.0047,-0.0047,-0.0047,-0.0047,...,-0.0005,-0.0005,-0.0005,-0.0005,-0.0005,-0.0005,-0.0005,-0.0005,-0.0005,TCGA-44-4112
TCGA-44-4112-10A-01D-1877-01.bed,-0.0085,-0.0085,-0.0085,-0.0085,-0.0085,-0.0085,-0.0085,-0.0085,-0.0085,-0.0085,...,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,TCGA-44-4112
TCGA-44-4112-10A-01D-A273-01.bed,-0.0068,-0.0068,-0.0068,-0.0068,-0.0068,-0.0068,-0.0068,-0.0068,-0.0068,-0.0068,...,-0.0033,-0.0033,-0.0033,-0.0033,-0.0033,-0.0033,-0.0033,-0.0033,-0.0033,TCGA-44-4112
TCGA-44-4112-11A-01D-1877-01.bed,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,...,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,TCGA-44-4112


### We can now drop the duplicates: because the rows are sorted by barcode, `keep="first"` retains the tumor sample (`-01…`) whenever a patient has one

In [13]:
# Keep exactly one row per patient (`tcga_id`).
# keep="first" only selects the intended sample when the rows are ordered by
# barcode, so the sort is done here, in the same statement, instead of relying
# on an earlier cell having been executed (sorting an already sorted frame is
# a no-op, so the result of a normal top-to-bottom run is unchanged).
# NOTE(review): assuming the usual TCGA barcode convention (sample-type codes
# 01-09 = tumor, 10 and above = normal/control), a patient that has no tumor
# sample at all still keeps its first non-tumor row here -- confirm that this
# is acceptable for the downstream BRCA merge.
x_all_08 = x_all_08.sort_index().drop_duplicates(subset="tcga_id", keep="first")
#x_all_09.drop_duplicates(subset="tcga_id", keep="first", inplace=True)

In [14]:
# Sanity check: the example patient should now be represented by a single row.
example_rows = x_all_08.index.str.contains("TCGA-44-4112")
x_all_08.loc[example_rows]

Unnamed: 0,chr1_3218610:12354307,chr1_12355487:12355503,chr1_12357088:23957783,chr1_23958921:23959613,chr1_23960868:31621427,chr1_31623667:31625346,chr1_31625389:43476957,chr1_43477796:43478606,chr1_43486624:44635476,chr1_44636481:44637390,...,chrX_150944418:151149982,chrX_151150032:151150133,chrX_151150476:152705438,chrX_152706248:152706348,chrX_152706803:153712179,chrX_153713097:153713787,chrX_153714314:153762695,chrX_153764217:153765166,chrX_153766111:154905589,tcga_id
TCGA-44-4112-01A-01D-1877-01.bed,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,...,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,TCGA-44-4112


In [15]:
# Attach the subtype label to each patient's CNA profile. The default inner
# join on `tcga_id` keeps only patients present in both tables.
x_cna_train_08 = x_all_08.merge(translation_table_train, on='tcga_id')
x_cna_test_08 = x_all_08.merge(translation_table_test, on='tcga_id')

#x_cna_train_09 = pd.merge(x_all_09, translation_table_train, on='tcga_id')
#x_cna_test_09 = pd.merge(x_all_09, translation_table_test, on='tcga_id')

In [17]:
#x_cna_train_09.isnull().sum().sum()

In [20]:
# Persist the merged train/test frames (label and tcga_id columns still included).
# NOTE(review): the test file name is spelled "treshold" while the train file
# uses "threshold"; the spelling is kept because other code may read this path.
for merged_frame, out_path in (
    (x_cna_train_08, "../data/cna_brca_train_0.8_threshold.pkl"),
    (x_cna_test_08, "../data/cna_brca_test_0.8_treshold.pkl"),
):
    merged_frame.to_pickle(out_path)

#x_cna_train_09.to_pickle("../data/cna_brca_train_0.9_threshold.pkl")
#x_cna_test_09.to_pickle("../data/cna_brca_test_0.9_treshold.pkl")

## Scale the data and train the model

In [26]:
# Separate the subtype labels from the feature columns.
train_label_col = 'Ciriello_subtype'
test_label_col = 'subtype'

y_train = x_cna_train_08[train_label_col]
y_test = x_cna_test_08[test_label_col]

# Drop the id and label columns in place so that only CNA features remain.
# This cell cannot be re-run without re-creating the merged frames first.
x_cna_train_08.drop(columns=['tcga_id', train_label_col], inplace=True)
x_cna_test_08.drop(columns=['tcga_id', test_label_col], inplace=True)

In [28]:
# Min-max scale the features. The scaler is fitted on the training frame only
# and then applied unchanged to both the training and the test frame.
scaler = MinMaxScaler()
scaler.fit(x_cna_train_08)

X_train = pd.DataFrame(
    scaler.transform(x_cna_train_08),
    columns=x_cna_train_08.columns,
)
X_test = pd.DataFrame(
    scaler.transform(x_cna_test_08),
    columns=x_cna_test_08.columns,
)

In [29]:
# Grid search over the inverse regularisation strength C of an L1-penalised
# logistic regression, scored by 5-fold stratified cross-validation on the
# training set.
#
# Per-fold results are collected as plain dicts and turned into `results` once
# at the end. The previous version declared `results` with the columns
# Index/C/Accuracy but appended rows keyed Fold/C/Score (producing NaN-padded
# extra columns) and relied on DataFrame.append, which was removed in pandas 2.0.
values = [0.001, 0.01, 0.1, 1, 10, 100]
fold_records = []   # one {'Fold', 'C', 'Score'} dict per (C, fold) pair
mean_scores = []    # mean validation accuracy per C, in the order of `values`

skf = StratifiedKFold(n_splits=5)
for c in values:
    scores = []

    # enumerate(..., start=1) provides the 1-based fold number for the log line
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train), start=1):
        print("Fold {} of 5".format(i))
        X_cv_train, X_cv_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_cv_train, y_cv_val = y_train.iloc[train_index], y_train.iloc[test_index]

        clf = LogisticRegression(random_state=0, solver='liblinear', penalty="l1", C=c, multi_class="auto").fit(X_cv_train, y_cv_train)

        # Mean accuracy on the held-out fold
        score = clf.score(X_cv_val, y_cv_val)
        fold_records.append({'Fold': i, 'C': c, 'Score': score})
        scores.append(score)

    mean_score = np.mean(scores)
    mean_scores.append(mean_score)
    print('Results: {}'.format(scores))
    print('C: {}, Accuracy: {}'.format(c, mean_score))

# Tidy per-fold table: one row per (C, fold)
results = pd.DataFrame(fold_records, columns=["Fold", "C", "Score"])

mean_scores

Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.1696969696969697, 0.1656441717791411, 0.1656441717791411, 0.1656441717791411, 0.1656441717791411]
C: 0.001, Accuracy: 0.1664547313627068
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.503030303030303, 0.50920245398773, 0.50920245398773, 0.50920245398773, 0.50920245398773]
C: 0.01, Accuracy: 0.5079680237962446
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.5212121212121212, 0.5276073619631901, 0.5214723926380368, 0.5337423312883436, 0.5214723926380368]
C: 0.1, Accuracy: 0.5251013199479457
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.703030303030303, 0.7055214723926381, 0.7055214723926381, 0.7423312883435583, 0.6871165644171779]
C: 1, Accuracy: 0.7087042201152631
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.6787878787878788, 0.6073619631901841, 0.6687116564417178, 0.6871165644171779, 0.6809815950920245]
C: 10, Accu



Fold 2 of 5




Fold 3 of 5




Fold 4 of 5




Fold 5 of 5
Results: [0.6060606060606061, 0.5644171779141104, 0.6257668711656442, 0.6380368098159509, 0.6196319018404908]
C: 100, Accuracy: 0.6107826733593604




[0.1664547313627068,
 0.5079680237962446,
 0.5251013199479457,
 0.7087042201152631,
 0.6645919315857967,
 0.6107826733593604]

In [30]:
# Refit on the full training set with C=1, the value with the highest mean
# cross-validation accuracy in the grid search output.
clf = LogisticRegression(
    random_state=0,
    solver='liblinear',
    penalty="l1",
    C=1,
    multi_class="auto",
)
clf.fit(X_train, y_train)

In [31]:
# Evaluate the refitted model on the held-out test set.
final_score = clf.score(X_test, y_test)
y_test_pred = clf.predict(X_test)
test_confusion = confusion_matrix(y_test, y_test_pred)

print('Confusion matrix\n', test_confusion)
print('Accuracy', final_score)

Confusion matrix
 [[ 39   0   4   0   0]
 [  6   0   3   7   0]
 [  1   0 115  15   0]
 [  2   2  14  14   0]
 [  1   0  11   2   0]]
Accuracy 0.711864406779661


# Repeat the same analysis using the CNA data with the 0.9 threshold

In [32]:
# Build the 0.9-threshold train/test frames, then split labels from features.
#
# Every earlier statement that would create `x_all_09`, `x_cna_train_09` and
# `x_cna_test_09` is commented out in this notebook, so on a fresh kernel this
# cell failed with a NameError. The frames are rebuilt here with the same steps
# as the 0.8-threshold pipeline: load, fill NaN with 0, derive `tcga_id`, sort
# by barcode, keep the first sample per patient, merge with the subtype tables.
# Rebuilding from the pickle also makes this cell safe to re-run.
x_all_09 = pd.read_pickle("../data/cna/tcga_cna_raw_all_samples_all_chr_0.9_threshold.pkl").fillna(0)
x_all_09['tcga_id'] = [sample_name[:12] for sample_name in x_all_09.index]
x_all_09 = x_all_09.sort_index().drop_duplicates(subset="tcga_id", keep="first")

x_cna_train_09 = pd.merge(x_all_09, translation_table_train, on='tcga_id')
x_cna_test_09 = pd.merge(x_all_09, translation_table_test, on='tcga_id')

y_train = x_cna_train_09['Ciriello_subtype']
y_test = x_cna_test_09['subtype']

# Only the CNA feature columns remain after dropping the id and label columns.
x_cna_train_09.drop(['tcga_id', 'Ciriello_subtype'], axis=1, inplace=True)
x_cna_test_09.drop(['tcga_id', 'subtype'], axis=1, inplace=True)

In [40]:
# Min-max scale the 0.9-threshold features; the scaler learns its ranges from
# the training frame only and is reused as-is for the test frame.
scaler = MinMaxScaler()
scaler.fit(x_cna_train_09)

train_scaled = scaler.transform(x_cna_train_09)
test_scaled = scaler.transform(x_cna_test_09)

X_train = pd.DataFrame(train_scaled, columns=x_cna_train_09.columns)
X_test = pd.DataFrame(test_scaled, columns=x_cna_test_09.columns)

In [34]:
# Grid search over the inverse regularisation strength C of an L1-penalised
# logistic regression on the 0.9-threshold features, scored by 5-fold
# stratified cross-validation on the training set.
#
# Per-fold results are gathered in a list of dicts and converted to `results`
# once at the end. The previous version declared `results` with the columns
# Index/C/Accuracy but appended rows keyed Fold/C/Score (producing NaN-padded
# extra columns) and relied on DataFrame.append, which was removed in pandas 2.0.
values = [0.001, 0.01, 0.1, 1, 10, 100]
fold_records = []   # one {'Fold', 'C', 'Score'} dict per (C, fold) pair
mean_scores = []    # mean validation accuracy per C, in the order of `values`

skf = StratifiedKFold(n_splits=5)
for c in values:
    scores = []

    # enumerate(..., start=1) provides the 1-based fold number for the log line
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train), start=1):
        print("Fold {} of 5".format(i))
        X_cv_train, X_cv_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_cv_train, y_cv_val = y_train.iloc[train_index], y_train.iloc[test_index]

        clf = LogisticRegression(random_state=0, solver='liblinear', penalty="l1", C=c, multi_class="auto").fit(X_cv_train, y_cv_train)

        # Mean accuracy on the held-out fold
        score = clf.score(X_cv_val, y_cv_val)
        fold_records.append({'Fold': i, 'C': c, 'Score': score})
        scores.append(score)

    mean_score = np.mean(scores)
    mean_scores.append(mean_score)
    print('Results: {}'.format(scores))
    print('C: {}, Accuracy: {}'.format(c, mean_score))

# Tidy per-fold table: one row per (C, fold)
results = pd.DataFrame(fold_records, columns=["Fold", "C", "Score"])

mean_scores

Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.1696969696969697, 0.1656441717791411, 0.1656441717791411, 0.1656441717791411, 0.1656441717791411]
C: 0.001, Accuracy: 0.1664547313627068
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.503030303030303, 0.50920245398773, 0.50920245398773, 0.50920245398773, 0.50920245398773]
C: 0.01, Accuracy: 0.5079680237962446
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.5454545454545454, 0.5276073619631901, 0.5276073619631901, 0.5705521472392638, 0.5398773006134969]
C: 0.1, Accuracy: 0.5422197434467374
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.7151515151515152, 0.7239263803680982, 0.7300613496932515, 0.7300613496932515, 0.6871165644171779]
C: 1, Accuracy: 0.7172634318646588
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.6848484848484848, 0.6871165644171779, 0.6503067484662577, 0.6932515337423313, 0.6687116564417178]
C: 10, Acc

[0.1664547313627068,
 0.5079680237962446,
 0.5422197434467374,
 0.7172634318646588,
 0.6768469975831939,
 0.6510652537646402]

In [35]:
# Refit on the full 0.9-threshold training set with C=1, the value with the
# highest mean cross-validation accuracy in the grid search output.
final_model_params = dict(
    random_state=0,
    solver='liblinear',
    penalty="l1",
    C=1,
    multi_class="auto",
)
clf = LogisticRegression(**final_model_params).fit(X_train, y_train)

In [41]:
# Evaluate the refitted 0.9-threshold model on the held-out test set.
final_score = clf.score(X_test, y_test)
predicted_subtypes = clf.predict(X_test)

print('Confusion matrix\n', confusion_matrix(y_test, predicted_subtypes))
print('Accuracy', final_score)

Confusion matrix
 [[ 39   0   3   1   0]
 [  5   1   5   5   0]
 [  1   1 115  14   0]
 [  2   2  14  14   0]
 [  0   0  11   3   0]]
Accuracy 0.7161016949152542


In [61]:
# Peek at the first five rows of the CNA pickle whose file name carries no
# threshold suffix (head() defaults to five rows).
raw_cna_path = "../data/cna/tcga_cna_raw_all_samples_all_chr.pkl"
pd.read_pickle(raw_cna_path).head()

Unnamed: 0,chr1_0:4371,chr1_4372:4373,chr1_4374:9615,chr1_9616:9617,chr1_9618:12627,chr1_12628:12629,chr1_12630:18074,chr1_18075:18076,chr1_18077:18633,chr1_18634:18635,...,chr20_0:8345,chr20_8346:14270,chr20_14271:18077,chr20_18078:18080,chr21_0:164,chr21_165:5556,chr21_5557:5559,chr21_5560:15774,chr21_15775:15776,chr22_0:13709
TCGA-AA-3693-01A-01D-0903-01.bed,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,...,-0.7608,-0.157315,0.9405,0.9405,-0.0193,-0.0193,-0.0193,-0.0193,-0.0193,0.0457
TCGA-4P-AA8J-01A-11D-A390-01.bed,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,...,0.0381,0.0381,0.077167,0.0424,-0.0609,-0.0609,-0.0609,-0.059522,-0.0562,-0.182834
TCGA-24-0979-01A-01D-0428-01.bed,0.2214,0.2214,0.2214,0.2214,0.2214,0.2214,0.522517,0.5523,0.5523,-1.5957,...,-0.100367,0.367999,0.304042,0.1481,0.4255,0.346981,0.3877,0.465632,0.4889,-0.3109
TCGA-4W-AA9T-01A-11D-A390-01.bed,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,...,0.0279,0.020121,0.0171,0.0171,0.0276,0.0276,0.0276,0.0276,0.0276,0.0063
TCGA-5P-A9KC-01A-11D-A42I-01.bed,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,...,-0.006,-0.006,-0.006,-0.006,-0.516,-0.516,-0.516,-0.199739,-0.0135,-0.476522
