# Logistic Regression on rna+miRNA data
#### Using Logistic Regression for breast cancer subtype classification using rna+miRNA data

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

import seaborn as sn
import matplotlib.pyplot as plt

## Load and split the dataset

In [2]:
# Training set

X_train = pd.read_pickle("../data/hybrids/tcga_brca_mirna_rna_meta_train.pkl")
X_test = pd.read_pickle("../data/hybrids/tcga_brca_mirna_rna_meta_test.pkl")

In [3]:
X_train.head(5)

Unnamed: 0,tcga_id,Ciriello_subtype,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0,TCGA-A1-A0SB,Normal,14.03037,14.033059,14.044442,16.502933,13.396463,10.454968,9.264031,11.437857,...,4.591546,4.161964,3.409239,4.598872,5.531009,1.354325,6.060968,8.260453,6.58493,6.264032
1,TCGA-A1-A0SD,LumA,14.03039,14.018248,14.024003,15.632852,11.397605,9.059588,10.521528,12.229467,...,4.332482,5.186712,3.498551,4.837809,5.003388,1.670736,5.537954,7.526989,6.057077,6.103539
2,TCGA-A1-A0SE,LumA,13.034042,13.017645,13.036697,15.423974,12.028745,8.700684,9.50448,10.946024,...,4.588648,5.451175,3.531089,5.119407,4.906919,2.167065,5.632838,7.283633,5.605025,5.97631
3,TCGA-A1-A0SF,LumA,12.784891,12.786417,12.795264,14.333694,11.943461,8.215924,9.273612,11.028055,...,3.877071,5.236947,3.238446,4.720469,4.991145,2.732529,5.498687,8.112964,5.620719,5.024942
4,TCGA-A1-A0SH,LumA,12.962609,12.971607,12.983236,15.481879,12.813933,9.307462,9.775111,10.530101,...,4.129731,4.04037,3.19239,4.500302,4.173154,1.074448,5.819994,7.818159,5.777777,5.537696


In [4]:
X_test.head(5)

Unnamed: 0,tcga_id,expert_PAM50_subtypes,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0,TCGA-3C-AAAU,LumA,13.514627,13.504685,13.530334,14.562495,7.978139,8.680035,10.444366,10.798507,...,4.567296,5.69187,3.937222,5.626097,5.681502,4.0009,6.008093,7.55784,6.645893,6.206293
1,TCGA-3C-AALI,Her2,13.328474,13.330931,13.339006,14.510022,9.554742,9.018468,9.092649,11.673093,...,5.929355,5.671528,2.897347,4.46249,5.88714,3.630205,4.953424,8.115251,6.059523,4.608662
2,TCGA-3C-AALJ,LumB,13.432159,13.418758,13.441694,13.415155,9.243957,9.297123,11.387266,12.511639,...,4.121627,7.067952,2.447379,4.728413,4.531598,4.292929,4.889808,8.129297,5.678453,5.011173
3,TCGA-3C-AALK,LumA,13.550832,13.560693,13.567933,14.662512,11.497175,8.396544,10.350582,11.639559,...,4.338196,5.075789,2.678619,4.336446,4.66849,3.364291,4.906769,8.08143,5.284074,5.352912
4,TCGA-4H-AAAK,LumA,13.831441,13.830614,13.839637,14.433812,11.662663,8.475744,10.70667,12.007394,...,4.438037,5.200766,2.567688,4.385974,4.685163,1.019857,5.333096,7.668533,5.412756,5.709837


In [5]:
print("Training data shape is {}".format(X_train.shape))
print("Test data shape is {}".format(X_test.shape))

Training data shape is (816, 20809)
Test data shape is (219, 20809)


In [6]:
y_train = X_train["Ciriello_subtype"]
y_test = X_test["expert_PAM50_subtypes"]

In [7]:
X_train.drop(['tcga_id', 'Ciriello_subtype'], axis="columns", inplace=True)
X_test.drop(['tcga_id', 'expert_PAM50_subtypes'], axis="columns", inplace=True)

## Now we can train the model (check many parameters for c)

In [8]:
values=[0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
i=1
results = pd.DataFrame(columns=["Index", "C", "Accuracy"])
mean_scores = []


skf = StratifiedKFold(n_splits=5)
for c in values:
    scores = []
    conf_matrix = np.zeros([5,5])
    for train_index, test_index in skf.split(X_train, y_train):
        print("Fold {} of 5".format(i))
        X_cv_train, X_cv_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_cv_train, y_cv_val = y_train.iloc[train_index], y_train.iloc[test_index]

        clf = LogisticRegression(random_state=0, solver='liblinear', penalty="l1", C=c, multi_class="auto").fit(X_cv_train, y_cv_train)

        score = clf.score(X_cv_val, y_cv_val)
        results = results.append({'Fold': i, 'C' : c , 'Score' : score}, ignore_index=True)
        scores.append(score)
        conf = confusion_matrix(y_cv_val, clf.predict(X_cv_val))
        conf_matrix = np.add(conf_matrix, conf)
        i+=1

    i=1
    mean_scores.append(np.mean(scores))
    print('Results: {}'.format(scores))
    print(conf_matrix)
    print('C: {}, Accuracy: {}'.format(c, np.mean(scores)))

mean_scores

Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.5060975609756098, 0.50920245398773, 0.50920245398773, 0.50920245398773, 0.50920245398773]
[[  0.   0. 135.   0.   0.]
 [  0.   0.  65.   0.   0.]
 [  0.   0. 415.   0.   0.]
 [  0.   0. 176.   0.   0.]
 [  0.   0.  25.   0.   0.]]
C: 0.001, Accuracy: 0.508581475385306
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.7987804878048781, 0.7730061349693251, 0.7914110429447853, 0.8098159509202454, 0.7791411042944786]
[[131.   3.   1.   0.   0.]
 [  5.  33.  18.   9.   0.]
 [  0.   0. 408.   7.   0.]
 [  0.   0. 103.  73.   0.]
 [  5.   2.  18.   0.   0.]]
C: 0.01, Accuracy: 0.7904309441867425
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.8414634146341463, 0.852760736196319, 0.8711656441717791, 0.852760736196319, 0.852760736196319]
[[131.   3.   0.   1.   0.]
 [  2.  48.   5.  10.   0.]
 [  0.   5. 385.  25.   0.]
 [  0.   5.  41. 130.   0.]
 [  4.   3.  14.   1.   3.]]
C:

[0.508581475385306,
 0.7904309441867425,
 0.8541822534789765,
 0.8713302409097711,
 0.8689136615292533,
 0.8627712105341911,
 0.8406703576238217,
 0.8210534191231483]

### Do the same, but Min-Max scaling the data

In [9]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [10]:
values=[0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
i=1
results = pd.DataFrame(columns=["Index", "C", "Accuracy"])
mean_scores = []


skf = StratifiedKFold(n_splits=5)
for c in values:
    scores = []
    conf_matrix = np.zeros([5,5])
    
    for train_index, test_index in skf.split(X_train_scaled, y_train):
        print("Fold {} of 5".format(i))
        X_cv_train, X_cv_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_cv_train, y_cv_val = y_train.iloc[train_index], y_train.iloc[test_index]

        clf = LogisticRegression(random_state=0, solver='liblinear', penalty="l1", C=c, multi_class="auto").fit(X_cv_train, y_cv_train)

        score = clf.score(X_cv_val, y_cv_val)
        results = results.append({'Fold': i, 'C' : c , 'Score' : score}, ignore_index=True)
        scores.append(score)
        conf = confusion_matrix(y_cv_val, clf.predict(X_cv_val))
        conf_matrix = np.add(conf_matrix, conf)
        i+=1

    i=1
    mean_scores.append(np.mean(scores))
    print('Results: {}'.format(scores))
    print(conf_matrix)
    print('C: {}, Accuracy: {}'.format(c, np.mean(scores)))

mean_scores

Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.5060975609756098, 0.50920245398773, 0.50920245398773, 0.50920245398773, 0.50920245398773]
[[  0.   0. 135.   0.   0.]
 [  0.   0.  65.   0.   0.]
 [  0.   0. 415.   0.   0.]
 [  0.   0. 176.   0.   0.]
 [  0.   0.  25.   0.   0.]]
C: 0.001, Accuracy: 0.508581475385306
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.7987804878048781, 0.7730061349693251, 0.7914110429447853, 0.8098159509202454, 0.7791411042944786]
[[131.   3.   1.   0.   0.]
 [  5.  33.  18.   9.   0.]
 [  0.   0. 408.   7.   0.]
 [  0.   0. 103.  73.   0.]
 [  5.   2.  18.   0.   0.]]
C: 0.01, Accuracy: 0.7904309441867425
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.8414634146341463, 0.852760736196319, 0.8711656441717791, 0.852760736196319, 0.852760736196319]
[[131.   3.   0.   1.   0.]
 [  2.  48.   5.  10.   0.]
 [  0.   5. 385.  25.   0.]
 [  0.   5.  41. 130.   0.]
 [  4.   3.  14.   1.   3.]]
C:

[0.508581475385306,
 0.7904309441867425,
 0.8541822534789765,
 0.8713302409097711,
 0.8689136615292533,
 0.8627712105341911,
 0.8406703576238217,
 0.8210534191231483]

### Run the best one on the test set

In [12]:
clf = LogisticRegression(random_state=0, solver='liblinear', penalty="l1", C=1, multi_class="auto").fit(X_train_scaled, y_train)

In [13]:
final_score = clf.score(X_test_scaled, y_test)
print('Confusion matrix\n', confusion_matrix(y_test, clf.predict(X_test_scaled)))
print('Accuracy', final_score)

Confusion matrix
 [[ 36   0   0   0   0]
 [  0  15   0   0   0]
 [  0   2 121   5   0]
 [  0   0   2  24   0]
 [  0   3   5   1   5]]
Accuracy 0.9178082191780822
