In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report

## Load the CNA data

In [2]:
x_all_08 = pd.read_pickle("../data/cna/tcga_cna_raw_all_samples_all_chr_0.8_threshold_0.6_X.pkl")

In [3]:
x_all_08.isnull().sum().sum()

0

In [4]:
x_all_08['tcga_id'] = list(map(lambda x: x[:12], x_all_08.index))

In [5]:
chr_col = x_all_08.filter(like="chrX", axis=1).columns

In [6]:
x_all_08_no_x = x_all_08.drop(chr_col, axis=1)

In [7]:
x_all_08.head(5)

Unnamed: 0,chr1_3218610:12354307,chr1_12354308:12355503,chr1_12355504:23957783,chr1_23957784:23959613,chr1_23959614:31621427,chr1_31621428:31625346,chr1_31625347:43476957,chr1_43476958:43478606,chr1_43478607:44635476,chr1_44635477:44637390,...,chrX_136126022:136579768,chrX_136579769:137573188,chrX_137573189:137573864,chrX_137573865:139678717,chrX_139678718:143124651,chrX_143124652:143127987,chrX_143127988:149512905,chrX_149512906:149516745,chrX_149516745:154905589,tcga_id
TCGA-AA-3693-01A-01D-0903-01.bed,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,...,-0.0004,-0.0004,-0.0004,-0.0004,-0.0004,-0.0004,-0.0004,-0.0004,-0.0004,TCGA-AA-3693
TCGA-4P-AA8J-01A-11D-A390-01.bed,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,...,0.0068,0.0068,0.0068,0.0068,0.0068,0.0068,0.0068,0.0068,0.0068,TCGA-4P-AA8J
TCGA-24-0979-01A-01D-0428-01.bed,0.2214,0.2214,0.2214,0.2214,0.2214,0.2214,0.522461,0.5523,0.5523,-0.5217,...,0.6146,0.6146,0.6146,0.6146,0.830798,0.9363,0.674009,0.5941,0.616682,TCGA-24-0979
TCGA-4W-AA9T-01A-11D-A390-01.bed,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,...,0.0017,0.0017,0.0017,0.0017,0.0017,0.0017,0.0017,0.0017,0.0017,TCGA-4W-AA9T
TCGA-5P-A9KC-01A-11D-A42I-01.bed,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,...,-0.0012,-0.0012,-0.0012,-0.0012,-0.0012,-0.0012,0.007224,0.0103,0.015502,TCGA-5P-A9KC


In [8]:
x_all_08_no_x.head()

Unnamed: 0,chr1_3218610:12354307,chr1_12354308:12355503,chr1_12355504:23957783,chr1_23957784:23959613,chr1_23959614:31621427,chr1_31621428:31625346,chr1_31625347:43476957,chr1_43476958:43478606,chr1_43478607:44635476,chr1_44635477:44637390,...,chr20_38042046:62219837,chr21_15347621:15688269,chr21_15688270:25585384,chr21_25585385:25588339,chr21_25588340:42868131,chr21_42868132:42868213,chr21_42868213:47678774,chr22_17423930:47741458,chr22_47741458:49331012,tcga_id
TCGA-AA-3693-01A-01D-0903-01.bed,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,-0.0066,...,0.9405,-0.0193,-0.0193,-0.0193,-0.0193,-0.0193,-0.0193,0.0457,0.0457,TCGA-AA-3693
TCGA-4P-AA8J-01A-11D-A390-01.bed,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,-0.0409,...,0.040559,-0.0609,-0.0609,-0.0609,-0.059522,-0.0562,-0.0562,-0.182834,-0.2245,TCGA-4P-AA8J
TCGA-24-0979-01A-01D-0428-01.bed,0.2214,0.2214,0.2214,0.2214,0.2214,0.2214,0.522461,0.5523,0.5523,-0.5217,...,0.147779,0.4255,0.346995,0.3877,0.465625,0.4889,0.4889,-0.3109,-0.3109,TCGA-24-0979
TCGA-4W-AA9T-01A-11D-A390-01.bed,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,0.0053,...,0.0171,0.0276,0.0276,0.0276,0.0276,0.0276,0.005338,0.0063,0.0063,TCGA-4W-AA9T
TCGA-5P-A9KC-01A-11D-A42I-01.bed,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,-0.0214,...,-0.002116,-0.516,-0.516,-0.516,-0.19977,-0.0135,-0.0135,-0.476522,-0.4776,TCGA-5P-A9KC


In [9]:
translation_table_train = pd.read_pickle('../data/tcga_brca_raw_19036_row_log_norm_train.pkl').iloc[:,:2]
translation_table_test = pd.read_pickle('../data/tcga_brca_raw_19036_row_norm_test.pkl').iloc[:,:2]

In [10]:
translation_table_train.head(5)

Unnamed: 0,tcga_id,Ciriello_subtype
0,TCGA-A1-A0SK,Basal
1,TCGA-A2-A04P,Basal
2,TCGA-A2-A0CM,Basal
3,TCGA-A2-A0D2,Basal
4,TCGA-A2-A0ST,Basal


In [11]:
x_all_08['tcga_id'].value_counts().head(5)

TCGA-A6-2684    7
TCGA-44-2662    7
TCGA-44-4112    7
TCGA-44-2656    7
TCGA-44-2665    7
Name: tcga_id, dtype: int64

In [12]:
translation_table_test.head(5)

Unnamed: 0,tcga_id,subtype
0,TCGA-3C-AAAU,LumA
1,TCGA-3C-AALI,Her2
2,TCGA-3C-AALJ,LumB
3,TCGA-3C-AALK,LumA
4,TCGA-4H-AAAK,LumA


### We need to handle the duplicates by selecting the tumor case in that situation

In [13]:
x_all_08.sort_index(inplace=True)

In [14]:
x_all_08[x_all_08.index.str.contains("TCGA-44-4112")]

Unnamed: 0,chr1_3218610:12354307,chr1_12354308:12355503,chr1_12355504:23957783,chr1_23957784:23959613,chr1_23959614:31621427,chr1_31621428:31625346,chr1_31625347:43476957,chr1_43476958:43478606,chr1_43478607:44635476,chr1_44635477:44637390,...,chrX_136126022:136579768,chrX_136579769:137573188,chrX_137573189:137573864,chrX_137573865:139678717,chrX_139678718:143124651,chrX_143124652:143127987,chrX_143127988:149512905,chrX_149512906:149516745,chrX_149516745:154905589,tcga_id
TCGA-44-4112-01A-01D-1877-01.bed,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,...,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,TCGA-44-4112
TCGA-44-4112-01A-01D-A273-01.bed,-0.0278,-0.0278,-0.0278,-0.0278,-0.0278,-0.0278,-0.0278,-0.0278,-0.0278,-0.0278,...,-0.0199,-0.0199,-0.0199,-0.0199,-0.0199,-0.0199,-0.0199,-0.0199,-0.0199,TCGA-44-4112
TCGA-44-4112-01B-06D-A273-01.bed,-0.0908,-0.0908,-0.061471,-0.0515,-0.053801,-0.0353,-0.0353,-0.0353,-0.0353,-0.0353,...,-0.0887,-0.0887,-0.0887,-0.091336,-0.0649,-0.0649,-0.0649,-0.0649,-0.0649,TCGA-44-4112
TCGA-44-4112-10A-01D-1450-01.bed,-0.0047,-0.0047,-0.0047,-0.0047,-0.0047,-0.0047,-0.0047,-0.0047,-0.0047,-0.0047,...,-0.0005,-0.0005,-0.0005,-0.0005,-0.0005,-0.0005,-0.0005,-0.0005,-0.0005,TCGA-44-4112
TCGA-44-4112-10A-01D-1877-01.bed,-0.0085,-0.0085,-0.0085,-0.0085,-0.0085,-0.0085,-0.0085,-0.0085,-0.0085,-0.0085,...,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,TCGA-44-4112
TCGA-44-4112-10A-01D-A273-01.bed,-0.0068,-0.0068,-0.0068,-0.0068,-0.0068,-0.0068,-0.0068,-0.0068,-0.0068,-0.0068,...,-0.0005,-0.0005,-0.0005,-0.0005,-0.0005,-0.0005,0.004379,-0.0033,-0.0033,TCGA-44-4112
TCGA-44-4112-11A-01D-1877-01.bed,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,0.0042,...,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,TCGA-44-4112


### We can now drop the duplicates as we are sure we will keep the cancer ones

In [15]:
x_all_08.drop_duplicates(subset="tcga_id", keep="first", inplace=True)

In [16]:
# sanity check
x_all_08[x_all_08.index.str.contains("TCGA-44-4112")]

Unnamed: 0,chr1_3218610:12354307,chr1_12354308:12355503,chr1_12355504:23957783,chr1_23957784:23959613,chr1_23959614:31621427,chr1_31621428:31625346,chr1_31625347:43476957,chr1_43476958:43478606,chr1_43478607:44635476,chr1_44635477:44637390,...,chrX_136126022:136579768,chrX_136579769:137573188,chrX_137573189:137573864,chrX_137573865:139678717,chrX_139678718:143124651,chrX_143124652:143127987,chrX_143127988:149512905,chrX_149512906:149516745,chrX_149516745:154905589,tcga_id
TCGA-44-4112-01A-01D-1877-01.bed,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,0.0069,...,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,-0.0017,TCGA-44-4112


In [17]:
x_all_08.shape

(11162, 385)

In [18]:
x_all_08.to_pickle("../data/cna/tcga_cna_raw_all_samples_all_chr_0.8_threshold_0.6_X_corr_brca_no_dup.pkl")

In [18]:
x_cna_train_08 = pd.merge(x_all_08, translation_table_train, on='tcga_id')
x_cna_test_08 = pd.merge(x_all_08, translation_table_test, on='tcga_id')

In [19]:
x_cna_train_08.head()

Unnamed: 0,chr1_3218610:12354307,chr1_12354308:12355503,chr1_12355504:23957783,chr1_23957784:23959613,chr1_23959614:31621427,chr1_31621428:31625346,chr1_31625347:43476957,chr1_43476958:43478606,chr1_43478607:44635476,chr1_44635477:44637390,...,chrX_136579769:137573188,chrX_137573189:137573864,chrX_137573865:139678717,chrX_139678718:143124651,chrX_143124652:143127987,chrX_143127988:149512905,chrX_149512906:149516745,chrX_149516745:154905589,tcga_id,Ciriello_subtype
0,0.0034,0.0034,0.0034,0.0034,0.0034,0.0034,0.0034,0.0034,0.0034,0.0034,...,0.0016,0.0016,0.0016,0.0016,0.0016,0.0016,0.0016,-0.003511,TCGA-A1-A0SB,Normal
1,-0.3934,-0.3934,-0.3934,-0.3934,-0.3934,-0.3934,-0.3934,-0.3934,-0.3934,-0.3934,...,-0.0019,-0.0019,-0.0019,-0.0019,-0.0019,-0.0019,-0.0019,-0.0019,TCGA-A1-A0SD,LumA
2,-0.0313,-0.0313,-0.0313,-0.0313,-0.0313,-0.0313,-0.0313,-0.0313,-0.0313,-0.0313,...,0.0068,0.0068,0.0068,0.0068,0.0068,0.0068,0.0068,0.0068,TCGA-A1-A0SE,LumA
3,-0.0562,-0.0562,-0.000124,0.1703,-0.17239,-0.3483,0.153454,0.1824,0.1824,0.1824,...,0.0012,0.0012,0.0012,0.0012,0.0012,0.0012,0.0012,0.0012,TCGA-A1-A0SF,LumA
4,-0.320372,-0.3398,-0.341225,-0.342,-0.213506,0.0055,0.0055,0.0055,0.0055,0.0055,...,0.2237,0.2237,0.2237,0.2237,0.2237,0.2237,0.2237,0.2237,TCGA-A1-A0SH,LumA


In [21]:
x_cna_train_08.to_pickle("../data/cna_brca_train_0.8_threshold_0.6_chrX_corr_brca.pkl")
x_cna_test_08.to_pickle("../data/cna_brca_test_0.8_treshold_0.6_chrX_corr_brca.pkl")

## Scale the data and train the model

In [2]:
# For CNA+RNA training
#x_cna_train_08 = pd.read_pickle("../data/hybrids/tcga_brca_cna_rna_meta_train.pkl")
#x_cna_test_08 = pd.read_pickle("../data/hybrids/tcga_brca_cna_rna_meta_test.pkl")

In [21]:
# For CNA+miRNA+RNA

x_cna_train_08 = pd.read_pickle("../data/hybrids/tcga_brca_cna_mirna_rna_meta_train.pkl")
x_cna_test_08 = pd.read_pickle("../data/hybrids/tcga_brca_cna_mirna_rna_meta_test.pkl")

In [20]:
x_cna_train_08.dtypes[x_cna_train_08.dtypes !='float64']

tcga_id             object
Ciriello_subtype    object
dtype: object

In [21]:
x_cna_test_08.dtypes[x_cna_test_08.dtypes !='float64']

tcga_id    object
subtype    object
dtype: object

In [22]:
y_train = x_cna_train_08['Ciriello_subtype']
y_test = x_cna_test_08['subtype']

x_cna_train_08.drop(['tcga_id', 'Ciriello_subtype'], axis=1, inplace=True)
x_cna_test_08.drop(['tcga_id', 'subtype'], axis=1, inplace=True)

In [23]:
y_train.value_counts()

LumA      415
LumB      176
Basal     136
Her2       65
Normal     25
Name: Ciriello_subtype, dtype: int64

In [24]:
y_test.value_counts()

LumA      131
Basal      43
LumB       32
Her2       16
Normal     14
Name: subtype, dtype: int64

In [25]:
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(x_cna_train_08), columns=x_cna_train_08.columns)
X_test = pd.DataFrame(scaler.transform(x_cna_test_08), columns=x_cna_test_08.columns)

In [26]:
values=[0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
i=1
results = pd.DataFrame(columns=["Index", "C", "Accuracy"])
mean_scores = []
all_reports = []
subtypes = ["Basal", "Her2", "LumA", "LumB", "Normal"]


skf = StratifiedKFold(n_splits=5)
for c in values:
    scores = []
    full_report = []

    for train_index, test_index in skf.split(X_train, y_train):
        print("Fold {} of 5".format(i))
        X_cv_train, X_cv_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_cv_train, y_cv_val = y_train.iloc[train_index], y_train.iloc[test_index]

        clf = LogisticRegression(random_state=0, solver='liblinear', penalty="l1", C=c, multi_class="auto").fit(X_cv_train, y_cv_train)
        
        score = clf.score(X_cv_val, y_cv_val)
        results = results.append({'Fold': i, 'C' : c , 'Score' : score}, ignore_index=True)
        scores.append(score)
        full_report.append(classification_report(y_cv_val, clf.predict(X_cv_val), target_names=subtypes, output_dict=True))
        i+=1
    
    i=1
    mean_scores.append(np.mean(scores))
    all_reports.append(full_report)
    print('Results: {}'.format(scores))
    print('C: {}, Accuracy: {}'.format(c, np.mean(scores)))

mean_scores

Fold 1 of 5
Fold 2 of 5


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.1696969696969697, 0.1656441717791411, 0.1656441717791411, 0.1656441717791411, 0.1656441717791411]
C: 0.001, Accuracy: 0.1664547313627068
Fold 1 of 5
Fold 2 of 5


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.503030303030303, 0.50920245398773, 0.50920245398773, 0.50920245398773, 0.50920245398773]
C: 0.01, Accuracy: 0.5079680237962446
Fold 1 of 5


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fold 2 of 5
Fold 3 of 5
Fold 4 of 5


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fold 5 of 5
Results: [0.5333333333333333, 0.5460122699386503, 0.5337423312883436, 0.5644171779141104, 0.5337423312883436]
C: 0.1, Accuracy: 0.5422494887525562
Fold 1 of 5


  'precision', 'predicted', average, warn_for)


Fold 2 of 5


  'precision', 'predicted', average, warn_for)


Fold 3 of 5


  'precision', 'predicted', average, warn_for)


Fold 4 of 5


  'precision', 'predicted', average, warn_for)


Fold 5 of 5


  'precision', 'predicted', average, warn_for)


Results: [0.7515151515151515, 0.7300613496932515, 0.7300613496932515, 0.7239263803680982, 0.6809815950920245]
C: 1, Accuracy: 0.7233091652723556
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.703030303030303, 0.6134969325153374, 0.6748466257668712, 0.6993865030674846, 0.7177914110429447]
C: 10, Accuracy: 0.6817103550845882
Fold 1 of 5


KeyboardInterrupt: 

In [29]:
from statistics import stdev
stdev([0.8597560975609756, 0.8650306748466258, 0.8895705521472392, 0.852760736196319, 0.8404907975460123])

0.018174306781810878

In [27]:
clf = LogisticRegression(random_state=0, solver='liblinear', penalty="l1", C=1, multi_class="auto").fit(X_train, y_train)

In [28]:
final_score = clf.score(X_test, y_test)
report = classification_report(y_test, clf.predict(X_test), target_names=subtypes, output_dict=True)
print('Confusion matrix\n', confusion_matrix(y_test, clf.predict(X_test)))
print('Accuracy', final_score)

Confusion matrix
 [[ 39   0   4   0   0]
 [  6   1   5   4   0]
 [  1   0 116  14   0]
 [  2   1  15  14   0]
 [  1   0  11   2   0]]
Accuracy 0.7203389830508474


  'precision', 'predicted', average, warn_for)


In [32]:
report

{'Basal': {'precision': 0.7959183673469388,
  'recall': 0.9069767441860465,
  'f1-score': 0.8478260869565216,
  'support': 43},
 'Her2': {'precision': 0.5,
  'recall': 0.0625,
  'f1-score': 0.1111111111111111,
  'support': 16},
 'LumA': {'precision': 0.7682119205298014,
  'recall': 0.8854961832061069,
  'f1-score': 0.822695035460993,
  'support': 131},
 'LumB': {'precision': 0.4117647058823529,
  'recall': 0.4375,
  'f1-score': 0.42424242424242425,
  'support': 32},
 'Normal': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 14},
 'micro avg': {'precision': 0.7203389830508474,
  'recall': 0.7203389830508474,
  'f1-score': 0.7203389830508474,
  'support': 236},
 'macro avg': {'precision': 0.4951789987518186,
  'recall': 0.4584945854784307,
  'f1-score': 0.44117493155420995,
  'support': 236},
 'weighted avg': {'precision': 0.6611725507354138,
  'recall': 0.7203389830508474,
  'f1-score': 0.6761996048222705,
  'support': 236}}

### Get the values to fill in the thesis tables

#### Train data

In [30]:
len(all_reports)

5

In [31]:
from statistics import stdev

subtypes = ["Basal", "Her2", "LumA", "LumB", "Normal"]
mean_precisions = []
mean_recalls =[]
weights_train=[136,65,415,176,25]
#weights_train=[135,65,415,176,25]

for i in range(0,5):
    dict_aux = all_reports[3][i]
    arr_pre = []
    arr_rec = []
    for sub in subtypes:
        arr_pre.append(dict_aux[sub]['precision'])
        arr_rec.append(dict_aux[sub]['recall'])
    mean_precisions.append(np.average(arr_pre, weights=weights_train))
    mean_recalls.append(np.average(arr_rec, weights=weights_train))

print("PRECISION")
print(mean_precisions)
print('{}+-{}'.format(np.mean(mean_precisions), stdev(mean_precisions)))
print("----------------")
print('RECALL')
print(mean_recalls)
print('{}+-{}'.format(np.mean(mean_recalls), stdev(mean_recalls)))

PRECISION
[0.749707018266184, 0.7017606628377743, 0.7056697986956932, 0.6897205020379639, 0.6770914193243018]
0.7047898802323834+-0.027485407944563712
----------------
RECALL
[0.7515882730081016, 0.7300615880787239, 0.7299812839592521, 0.7237667812943211, 0.6806965734750312]
0.723218899963086+-0.026008757990514445


### Test data

In [35]:
#weights_test=[43, 16, 131, 32, 14]
weights_test=[36, 15, 128, 26, 14]
mean_precisions = []
mean_recalls = []


dict_aux = report
arr_pre = []
arr_rec = []
for sub in subtypes:
    arr_pre.append(dict_aux[sub]['precision'])
    arr_rec.append(dict_aux[sub]['recall'])
mean_precisions.append(np.average(arr_pre, weights=weights_test))
mean_recalls.append(np.average(arr_rec, weights=weights_test))
    
print(mean_precisions)
print(mean_recalls)

[0.9236277808097569]
[0.9178082191780822]
