# Logistic Regression on miRNA data
#### Breast cancer subtype classification from miRNA data using logistic regression

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

import seaborn as sn
import matplotlib.pyplot as plt

## Load and split the dataset

In [2]:
# Load the min-max scaled miRNA tables: one CSV for training, one for testing.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/miRNA_filtered_norm_minmax_scaled_train.csv",
        "../data/miRNA_filtered_norm_minmax_scaled_test.csv",
    )
)

In [3]:
# Report the (rows, columns) of both frames, one message per line.
print(
    "Training data shape is {}".format(X_train.shape),
    "Test data shape is {}".format(X_test.shape),
    sep="\n",
)

Training data shape is (816, 1453)
Test data shape is (219, 1453)


In [4]:
# Preview the first five rows of the training frame.
X_train.iloc[:5]

Unnamed: 0,Ciriello_subtype,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,...,hsa-mir-939,hsa-mir-940,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b
0,Normal,0.614483,0.615422,0.616654,0.76344,0.730785,0.765885,0.423307,0.44208,0.450989,...,0.254704,0.367894,0.531535,0.34825,0.649007,0.327235,0.698757,0.623754,0.802414,0.595058
1,LumA,0.614487,0.612364,0.612432,0.620357,0.51329,0.50695,0.662141,0.55751,0.559171,...,0.091089,0.178479,0.325113,0.215624,0.222885,0.33416,0.548087,0.5696,0.609707,0.549584
2,LumA,0.408115,0.405773,0.408521,0.586008,0.581964,0.44035,0.468976,0.370363,0.366458,...,0.140863,0.298284,0.292191,0.0,0.406782,0.462696,0.630315,0.522823,0.643014,0.437431
3,LumA,0.356508,0.358032,0.358657,0.406713,0.572684,0.350395,0.425127,0.382324,0.388288,...,0.322349,0.354435,0.433593,0.0,0.174568,0.299786,0.698418,0.527117,0.69673,0.478774
4,LumA,0.393319,0.396268,0.397479,0.59553,0.6674,0.552947,0.520376,0.309714,0.30936,...,0.235648,0.0,0.221282,0.0,0.312214,0.0,0.460259,0.524508,0.671783,0.512628


In [5]:
# Preview the first five rows of the test frame.
X_test.iloc[:5]

Unnamed: 0,expert_PAM50_subtypes,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,...,hsa-mir-939,hsa-mir-940,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b
0,LumA,0.507658,0.506331,0.510473,0.444339,0.141221,0.436518,0.647486,0.348852,0.352557,...,0.26168,0.43931,0.435063,0.176375,0.234688,0.461588,0.865044,0.577635,0.279963,0.526705
1,Her2,0.4691,0.470456,0.470958,0.43571,0.31277,0.49932,0.390757,0.476381,0.480865,...,0.276382,0.149409,0.516766,0.0,0.110189,0.215038,0.740475,0.692815,0.378244,0.166833
2,LumB,0.490576,0.48859,0.492166,0.255661,0.278953,0.551029,0.826569,0.598655,0.605945,...,0.082782,0.187904,0.357597,0.0,0.128089,0.255567,0.828066,0.765934,0.346043,0.433473
3,LumA,0.515157,0.517895,0.518239,0.460787,0.524124,0.383912,0.629674,0.471492,0.47714,...,0.203907,0.130859,0.34778,0.0,0.326379,0.273539,0.777517,0.52733,0.597141,0.345026
4,LumA,0.573279,0.573624,0.574355,0.423177,0.542131,0.398609,0.697305,0.525128,0.529506,...,0.16964,0.035027,0.249237,0.0,0.174222,0.176249,0.549784,0.481104,0.600268,0.361265


In [6]:
# Pull out the subtype labels; the label column has a different name in each file.
y_train, y_test = X_train["Ciriello_subtype"], X_test["expert_PAM50_subtypes"]

In [7]:
# Remove the label column from each frame before model fitting.
# Reassigning (rather than inplace=True) leaves previously displayed frames
# untouched and makes the data lineage explicit.
X_train = X_train.drop(columns=["Ciriello_subtype"])
X_test = X_test.drop(columns=["expert_PAM50_subtypes"])

## Train the model (try several values of the regularisation parameter C)

In [8]:
# Grid search over C: for every candidate value, fit an L1-penalised logistic
# regression on each of 5 stratified folds and score it on the held-out fold.
values = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
fold_records = []  # one dict per (C, fold); converted to `results` after the loop
mean_scores = []   # mean held-out score per C, in the same order as `values`

skf = StratifiedKFold(n_splits=5)
for c in values:
    scores = []

    # enumerate(..., start=1) numbers the folds 1..5 without a manual counter.
    for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train), start=1):
        print("Fold {} of 5".format(fold))
        X_cv_train, X_cv_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_cv_train, y_cv_val = y_train.iloc[train_index], y_train.iloc[test_index]

        clf = LogisticRegression(
            random_state=0, solver='liblinear', penalty="l1", C=c, multi_class="auto"
        ).fit(X_cv_train, y_cv_train)

        score = clf.score(X_cv_val, y_cv_val)
        fold_records.append({'Fold': fold, 'C': c, 'Score': score})
        scores.append(score)

    mean_scores.append(np.mean(scores))
    print('Results: {}'.format(scores))
    print('C: {}, Accuracy: {}'.format(c, np.mean(scores)))

# Build the per-fold table once, with column names that match the record keys.
# (DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
# row by row is quadratic.)
results = pd.DataFrame(fold_records, columns=["Fold", "C", "Score"])

mean_scores

Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.16463414634146342, 0.1656441717791411, 0.1656441717791411, 0.1656441717791411, 0.1656441717791411]
C: 0.001, Accuracy: 0.16544216669160555
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.5060975609756098, 0.50920245398773, 0.50920245398773, 0.50920245398773, 0.50920245398773]
C: 0.01, Accuracy: 0.508581475385306
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.7073170731707317, 0.7116564417177914, 0.7055214723926381, 0.6993865030674846, 0.7177914110429447]
C: 0.1, Accuracy: 0.7083345802783181
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.8170731707317073, 0.7730061349693251, 0.7914110429447853, 0.7914110429447853, 0.7914110429447853]
C: 1, Accuracy: 0.7928624869070775
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.774390243902439, 0.7791411042944786, 0.754601226993865, 0.7423312883435583, 0.7484662576687117]
C: 10, Acc

[0.16544216669160555,
 0.508581475385306,
 0.7083345802783181,
 0.7928624869070775,
 0.7597860242406106,
 0.7683749812958253,
 0.7475310489301212,
 0.745099506209786]

# Do the same with z-scaled data

In [9]:
# Load the z-scaled miRNA tables: one CSV for training, one for testing.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/miRNA_filtered_norm_z_scaled_train.csv",
        "../data/miRNA_filtered_norm_z_scaled_test.csv",
    )
)

In [10]:
# Report the (rows, columns) of both z-scaled frames, one message per line.
print(
    "Training data shape is {}".format(X_train.shape),
    "Test data shape is {}".format(X_test.shape),
    sep="\n",
)

Training data shape is (816, 1453)
Test data shape is (219, 1453)


In [11]:
# Pull out the subtype labels; the label column has a different name in each file.
y_train, y_test = X_train["Ciriello_subtype"], X_test["expert_PAM50_subtypes"]

In [12]:
# Remove the label column from each frame before model fitting.
# Reassigning (rather than inplace=True) leaves previously displayed frames
# untouched and makes the data lineage explicit.
X_train = X_train.drop(columns=["Ciriello_subtype"])
X_test = X_test.drop(columns=["expert_PAM50_subtypes"])

### Train model with 5-fold CV

In [13]:
# Grid search over C on the z-scaled data: for every candidate value, fit an
# L1-penalised logistic regression on each of 5 stratified folds and score it
# on the held-out fold.
values = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
fold_records = []  # one dict per (C, fold); converted to `results` after the loop
mean_scores = []   # mean held-out score per C, in the same order as `values`

skf = StratifiedKFold(n_splits=5)
for c in values:
    scores = []

    # enumerate(..., start=1) numbers the folds 1..5 without a manual counter.
    for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train), start=1):
        print("Fold {} of 5".format(fold))
        X_cv_train, X_cv_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_cv_train, y_cv_val = y_train.iloc[train_index], y_train.iloc[test_index]

        clf = LogisticRegression(
            random_state=0, solver='liblinear', penalty="l1", C=c, multi_class="auto"
        ).fit(X_cv_train, y_cv_train)

        score = clf.score(X_cv_val, y_cv_val)
        fold_records.append({'Fold': fold, 'C': c, 'Score': score})
        scores.append(score)

    mean_scores.append(np.mean(scores))
    print('Results: {}'.format(scores))
    print('C: {}, Accuracy: {}'.format(c, np.mean(scores)))

# Build the per-fold table once, with column names that match the record keys.
# (DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
# row by row is quadratic.)
results = pd.DataFrame(fold_records, columns=["Fold", "C", "Score"])

mean_scores

Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.16463414634146342, 0.1656441717791411, 0.1656441717791411, 0.1656441717791411, 0.1656441717791411]
C: 0.001, Accuracy: 0.16544216669160555
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.6829268292682927, 0.6748466257668712, 0.6932515337423313, 0.6993865030674846, 0.6809815950920245]
C: 0.01, Accuracy: 0.6862786173874009
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.8048780487804879, 0.7852760736196319, 0.7975460122699386, 0.7852760736196319, 0.7852760736196319]
C: 0.1, Accuracy: 0.7916504563818644
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.774390243902439, 0.7423312883435583, 0.7668711656441718, 0.7607361963190185, 0.754601226993865]
C: 1, Accuracy: 0.7597860242406106
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.7865853658536586, 0.7300613496932515, 0.7668711656441718, 0.7730061349693251, 0.754601226993865]
C:



Fold 5 of 5
Results: [0.75, 0.6932515337423313, 0.6871165644171779, 0.7484662576687117, 0.6932515337423313]
C: 10000, Accuracy: 0.7144171779141104


[0.16544216669160555,
 0.6862786173874009,
 0.7916504563818644,
 0.7597860242406106,
 0.7622250486308544,
 0.7352835552895407,
 0.7217866227742032,
 0.7144171779141104]