# Logistic Regression on miRNA data
#### Breast cancer subtype classification from miRNA expression data with logistic regression

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sn
import matplotlib.pyplot as plt

## Load and split the dataset

In [3]:
# Load the pre-split training and test sets (features plus the label column).
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/miRNA_filtered_norm_scaled_train.csv",
        "../data/miRNA_filtered_norm_scaled_test.csv",
    )
)

In [4]:
# Report (rows, columns) of each split; the label column is still included here.
print("Training data shape is", X_train.shape)
print("Test data shape is", X_test.shape)

Training data shape is (816, 1772)
Test data shape is (219, 1772)


In [5]:
# Preview the first rows of the training frame (head() defaults to 5 rows).
X_train.head()

Unnamed: 0,Ciriello_subtype,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,...,hsa-mir-941-1,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b
0,Normal,14.03037,14.033059,14.044442,16.502933,13.396463,10.454968,9.264031,11.437857,11.490396,...,0.0,3.291905,0.525771,4.282995,1.677401,0.0,5.599551,6.007131,12.010321,15.851634
1,LumA,14.03039,14.018248,14.024003,15.632852,11.397605,9.059588,10.521528,12.229467,12.236976,...,0.0,2.013493,0.325539,1.470888,1.712899,0.0,4.392142,5.675821,10.137212,15.615404
2,LumA,13.034042,13.017645,13.036697,15.423974,12.028745,8.700684,9.50448,10.946024,10.907025,...,0.0,1.809602,0.0,2.684479,2.371771,0.0,5.051083,5.389651,10.460951,15.032799
3,LumA,12.784891,12.786417,12.795264,14.333694,11.943461,8.215924,9.273612,11.028055,11.057679,...,0.0,2.685329,0.0,1.152029,1.536697,0.0,5.596833,5.415917,10.983075,15.247564
4,LumA,12.962609,12.971607,12.983236,15.481879,12.813933,9.307462,9.775111,10.530101,10.512984,...,0.0,1.370444,0.0,2.060394,0.0,0.0,3.688321,5.399958,10.740593,15.423429


In [6]:
# Class distribution of the training labels (column "Ciriello_subtype").
X_train['Ciriello_subtype'].value_counts()

LumA      415
LumB      176
Basal     135
Her2       65
Normal     25
Name: Ciriello_subtype, dtype: int64

In [5]:
# Preview the first rows of the test frame (head() defaults to 5 rows).
X_test.head()

Unnamed: 0,expert_PAM50_subtypes,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,...,hsa-mir-941-1,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b
0,LumA,13.514627,13.504685,13.530334,14.562495,7.978139,8.680035,10.444366,10.798507,10.811095,...,0.0,2.694435,0.266281,1.548775,2.366092,0.0,6.932103,5.724983,6.932103,15.496556
1,Her2,13.328474,13.330931,13.339006,14.510022,9.554742,9.018468,9.092649,11.673093,11.696572,...,0.0,3.200443,0.0,0.727173,1.10228,0.0,5.933862,6.429633,7.887388,13.627101
2,LumB,13.432159,13.418758,13.441694,13.415155,9.243957,9.297123,11.387266,12.511639,12.559778,...,0.0,2.214673,0.0,0.845298,1.310029,0.0,6.635774,6.876964,7.574399,15.012233
3,LumA,13.550832,13.560693,13.567933,14.662512,11.497175,8.396544,10.350582,11.639559,11.670866,...,0.0,2.153877,0.0,2.153877,1.402156,0.0,6.230695,5.41722,10.01507,14.552775
4,LumA,13.831441,13.830614,13.839637,14.433812,11.662663,8.475744,10.70667,12.007394,12.032254,...,0.0,1.543579,0.0,1.149743,0.903448,0.0,4.405742,5.134419,10.04546,14.63713


In [6]:
# Class distribution of the test labels (column "expert_PAM50_subtypes").
X_test['expert_PAM50_subtypes'].value_counts()

LumA      128
Basal      36
LumB       26
Her2       15
Normal     14
Name: expert_PAM50_subtypes, dtype: int64

In [7]:
# The label column is named differently in the two CSV files.
train_label_col = "Ciriello_subtype"
test_label_col = "expert_PAM50_subtypes"

y_train = X_train[train_label_col]
y_test = X_test[test_label_col]

In [8]:
# Remove the label columns so that only feature columns remain in X_train / X_test.
X_train = X_train.drop(columns=["Ciriello_subtype"])
X_test = X_test.drop(columns=["expert_PAM50_subtypes"])

In [9]:
# Fit the min-max scaler on the training features only, then apply the same
# transformation to both splits so no test-set statistics are used.
scaler = MinMaxScaler().fit(X_train)

train_scaled = scaler.transform(X_train)
test_scaled = scaler.transform(X_test)

X_train = pd.DataFrame(train_scaled, columns=X_train.columns)
X_test = pd.DataFrame(test_scaled, columns=X_test.columns)

## Train the model (L1 penalty), searching over several values of the regularization parameter C

In [25]:
# Grid-search the inverse regularisation strength C for L1-penalised logistic
# regression with stratified 5-fold cross-validation on the training set.
values = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
subtypes = ["Basal", "Her2", "LumA", "LumB", "Normal"]

skf = StratifiedKFold(n_splits=5)
fold_records = []  # one {"Fold", "C", "Score"} record per (C, fold) pair
mean_scores = []   # mean validation accuracy per C, in the order of `values`

for c in values:
    scores = []
    # Reset for every C: once the outer loop finishes, this holds only the
    # per-fold reports of the last C in `values`.
    full_report = []

    for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train), start=1):
        print("Fold {} of 5".format(fold))
        X_cv_train, X_cv_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_cv_train, y_cv_val = y_train.iloc[train_index], y_train.iloc[test_index]

        print(y_cv_val.value_counts())

        clf = LogisticRegression(random_state=0, solver='liblinear', penalty="l1", C=c, multi_class="auto").fit(X_cv_train, y_cv_train)

        # Predict once and reuse the predictions for the report.
        y_cv_pred = clf.predict(X_cv_val)
        score = clf.score(X_cv_val, y_cv_val)
        fold_records.append({'Fold': fold, 'C': c, 'Score': score})
        scores.append(score)
        # Passing `labels` pins each name in `subtypes` to its own class instead of
        # relying on the sorted set of labels that happen to occur in this fold.
        full_report.append(classification_report(y_cv_val, y_cv_pred, labels=subtypes, target_names=subtypes, output_dict=True))

    mean_scores.append(np.mean(scores))
    print('Results: {}'.format(scores))
    print('C: {}, Accuracy: {}'.format(c, np.mean(scores)))
    print(full_report)

# Build the per-fold results table once; its columns match the recorded keys.
results = pd.DataFrame(fold_records, columns=["Fold", "C", "Score"])

mean_scores

Fold 1 of 5
LumA      83
LumB      36
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fold 2 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64
Fold 3 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fold 4 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64
Fold 5 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64
Results: [0.16463414634146342, 0.1656441717791411, 0.1656441717791411, 0.1656441717791411, 0.1656441717791411]
C: 0.001, Accuracy: 0.16544216669160555
[{'Basal': {'precision': 0.16463414634146342, 'recall': 1.0, 'f1-score': 0.28272251308900526, 'support': 27}, 'Her2': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13}, 'LumA': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 83}, 'LumB': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 36}, 'Normal': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'micro avg': {'precision': 0.16463414634146342, 'recall': 0.16463414634146342, 'f1-score': 0.16463414634146342, 'support': 164}, 'macro avg': {'precision': 0.032926829268292684, 'recall': 0.2, 'f1-score': 0.056

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fold 3 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64
Fold 4 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fold 5 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64
Results: [0.5060975609756098, 0.50920245398773, 0.50920245398773, 0.50920245398773, 0.50920245398773]
C: 0.01, Accuracy: 0.508581475385306
[{'Basal': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 27}, 'Her2': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13}, 'LumA': {'precision': 0.5060975609756098, 'recall': 1.0, 'f1-score': 0.6720647773279352, 'support': 83}, 'LumB': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 36}, 'Normal': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'micro avg': {'precision': 0.5060975609756098, 'recall': 0.5060975609756098, 'f1-score': 0.5060975609756098, 'support': 164}, 'macro avg': {'precision': 0.10121951219512196, 'recall': 0.2, 'f1-score': 0.13441295546558704, 'support': 164}, 'weighted avg': {'precision': 0.25613474122546104, 'recall': 0.5060975609756098, 'f1-score': 0.3401

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fold 2 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64
Fold 3 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fold 4 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64
Fold 5 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Results: [0.7073170731707317, 0.7116564417177914, 0.7116564417177914, 0.6993865030674846, 0.7177914110429447]
C: 0.1, Accuracy: 0.7095615741433487
[{'Basal': {'precision': 0.9310344827586207, 'recall': 1.0, 'f1-score': 0.9642857142857143, 'support': 27}, 'Her2': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13}, 'LumA': {'precision': 0.6587301587301587, 'recall': 1.0, 'f1-score': 0.7942583732057417, 'support': 83}, 'LumB': {'precision': 0.6666666666666666, 'recall': 0.16666666666666666, 'f1-score': 0.26666666666666666, 'support': 36}, 'Normal': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'micro avg': {'precision': 0.7073170731707317, 'recall': 0.7073170731707317, 'f1-score': 0.7073170731707317, 'support': 164}, 'macro avg': {'precision': 0.4512862616310892, 'recall': 0.4333333333333333, 'f1-score': 0.40504215083162454, 'support': 164}, 'weighted avg': {'precision': 0.6330032573724752, 'recall': 0.7073170731707317, 'f1-score': 0.6192631662304319, 's

  'precision', 'predicted', average, warn_for)


Fold 2 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64


  'precision', 'predicted', average, warn_for)


Fold 3 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64
Fold 4 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64


  'precision', 'predicted', average, warn_for)


Fold 5 of 5
LumA      83
LumB      35
Basal     27
Her2      13
Normal     5
Name: Ciriello_subtype, dtype: int64


  'precision', 'predicted', average, warn_for)


Results: [0.8170731707317073, 0.7730061349693251, 0.7914110429447853, 0.7914110429447853, 0.7914110429447853]
C: 1, Accuracy: 0.7928624869070775
[{'Basal': {'precision': 0.9642857142857143, 'recall': 1.0, 'f1-score': 0.9818181818181818, 'support': 27}, 'Her2': {'precision': 0.7, 'recall': 0.5384615384615384, 'f1-score': 0.608695652173913, 'support': 13}, 'LumA': {'precision': 0.8080808080808081, 'recall': 0.963855421686747, 'f1-score': 0.8791208791208791, 'support': 83}, 'LumB': {'precision': 0.7407407407407407, 'recall': 0.5555555555555556, 'f1-score': 0.634920634920635, 'support': 36}, 'Normal': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'micro avg': {'precision': 0.8170731707317073, 'recall': 0.8170731707317073, 'f1-score': 0.8170731707317073, 'support': 164}, 'macro avg': {'precision': 0.6426214526214526, 'recall': 0.6115745031407682, 'f1-score': 0.6209110696067218, 'support': 164}, 'weighted avg': {'precision': 0.7858115123359026, 'recall': 0.817073170731707

  'precision', 'predicted', average, warn_for)


Results: [0.7439024390243902, 0.7730061349693251, 0.7484662576687117, 0.7423312883435583, 0.7668711656441718]
C: 1000, Accuracy: 0.7549154571300314
[{'Basal': {'precision': 0.9615384615384616, 'recall': 0.9259259259259259, 'f1-score': 0.9433962264150944, 'support': 27}, 'Her2': {'precision': 0.5, 'recall': 0.3076923076923077, 'f1-score': 0.380952380952381, 'support': 13}, 'LumA': {'precision': 0.7956989247311828, 'recall': 0.891566265060241, 'f1-score': 0.8409090909090909, 'support': 83}, 'LumB': {'precision': 0.5277777777777778, 'recall': 0.5277777777777778, 'f1-score': 0.5277777777777778, 'support': 36}, 'Normal': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5}, 'micro avg': {'precision': 0.7439024390243902, 'recall': 0.7439024390243902, 'f1-score': 0.7439024390243903, 'support': 164}, 'macro avg': {'precision': 0.5570030328094845, 'recall': 0.5305924552912504, 'f1-score': 0.5386070952108688, 'support': 164}, 'weighted avg': {'precision': 0.7164911537452844, 'recall'

[0.16544216669160555,
 0.508581475385306,
 0.7095615741433487,
 0.7928624869070775,
 0.7622325303007632,
 0.7610130181056413,
 0.7549154571300314,
 0.7389645368846327]

In [None]:
# Refit the L1-penalised model on the whole training set with C=1.
lasso_model = LogisticRegression(
    penalty="l1",
    C=1,
    solver='liblinear',
    multi_class="auto",
    random_state=0,
)
clf = lasso_model.fit(X_train, y_train)

In [12]:
# Evaluate the refitted L1 model on the held-out test set.
y_test_pred = clf.predict(X_test)  # predict once, reuse for both summaries
final_score = clf.score(X_test, y_test)

print('Confusion matrix\n', confusion_matrix(y_test, y_test_pred))
print('Accuracy', final_score)

# Per-class precision / recall / F1 as a nested dict keyed by subtype name.
report = classification_report(
    y_test,
    y_test_pred,
    target_names=subtypes,
    output_dict=True,
)

Confusion matrix
 [[ 35   0   0   1   0]
 [  1  11   2   1   0]
 [  0   2 117   9   0]
 [  0   1   7  18   0]
 [  2   2   7   1   2]]
Accuracy 0.8356164383561644


In [14]:
# Display the test-set classification report dict built in the evaluation cell.
report

{'Basal': {'precision': 0.9210526315789473,
  'recall': 0.9722222222222222,
  'f1-score': 0.9459459459459458,
  'support': 36},
 'Her2': {'precision': 0.6875,
  'recall': 0.7333333333333333,
  'f1-score': 0.7096774193548386,
  'support': 15},
 'LumA': {'precision': 0.8796992481203008,
  'recall': 0.9140625,
  'f1-score': 0.896551724137931,
  'support': 128},
 'LumB': {'precision': 0.6,
  'recall': 0.6923076923076923,
  'f1-score': 0.6428571428571429,
  'support': 26},
 'Normal': {'precision': 1.0,
  'recall': 0.14285714285714285,
  'f1-score': 0.25,
  'support': 14},
 'micro avg': {'precision': 0.8356164383561644,
  'recall': 0.8356164383561644,
  'f1-score': 0.8356164383561644,
  'support': 219},
 'macro avg': {'precision': 0.8176503759398497,
  'recall': 0.6909565781440782,
  'f1-score': 0.6890064464591716,
  'support': 219},
 'weighted avg': {'precision': 0.8478168881106877,
  'recall': 0.8356164383561644,
  'f1-score': 0.8204206472525915,
  'support': 219}}

In [64]:
# Display the second entry (index 1) of the per-fold classification reports.
# NOTE(review): `full_report` is re-created for every C in the CV loop, so this
# shows a fold of the last C that was evaluated — confirm that is intended.
full_report[1]

{'Basal': {'precision': 0.9583333333333334,
  'recall': 0.8518518518518519,
  'f1-score': 0.9019607843137256,
  'support': 27},
 'Her2': {'precision': 0.8571428571428571,
  'recall': 0.46153846153846156,
  'f1-score': 0.6,
  'support': 13},
 'LumA': {'precision': 0.7448979591836735,
  'recall': 0.8795180722891566,
  'f1-score': 0.8066298342541436,
  'support': 83},
 'LumB': {'precision': 0.5757575757575758,
  'recall': 0.5428571428571428,
  'f1-score': 0.5588235294117646,
  'support': 35},
 'Normal': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 5},
 'micro avg': {'precision': 0.7423312883435583,
  'recall': 0.7423312883435583,
  'f1-score': 0.7423312883435583,
  'support': 163},
 'macro avg': {'precision': 0.627226345083488,
  'recall': 0.5471531057073225,
  'f1-score': 0.5734828295959267,
  'support': 163},
 'weighted avg': {'precision': 0.7300362141510258,
  'recall': 0.7423312883435583,
  'f1-score': 0.7279879812820631,
  'support': 163}}

In [27]:
# Summarise the per-fold classification reports in `full_report` as a weighted
# mean precision and recall per fold, then print mean +- sample standard deviation.
#
# NOTE(review): `full_report` is re-created for every C in the CV loop, so it holds
# the folds of the last C that was evaluated — confirm this is the intended model.
# NOTE(review): these weights equal the test-set class counts (in `subtypes` order),
# not each fold's own support — confirm this weighting is intended.
weights_avg = [36, 15, 128, 26, 14]

mean_precisions = []
mean_recalls = []

for fold_report in full_report:
    arr_pre = [fold_report[sub]['precision'] for sub in subtypes]
    arr_rec = [fold_report[sub]['recall'] for sub in subtypes]
    mean_precisions.append(np.average(arr_pre, weights=weights_avg))
    mean_recalls.append(np.average(arr_rec, weights=weights_avg))

# np.std(..., ddof=1) is the sample standard deviation; it replaces the bare
# `stdev` name, which is never imported in this notebook.
print("PRECISION")
print(mean_precisions)
print('{}+-{}'.format(np.mean(mean_precisions), np.std(mean_precisions, ddof=1)))
print("----------------")
print('RECALL')
print(mean_recalls)
print('{}+-{}'.format(np.mean(mean_recalls), np.std(mean_recalls, ddof=1)))

PRECISION
[0.7339176579795996, 0.7199715917915527, 0.7306398384801184, 0.712036483469511, 0.7227601952448434]
0.7238651533931251+-0.008706999958446178
----------------
RECALL
[0.7498820948235043, 0.7501476829088645, 0.7398438351730975, 0.7388434162159199, 0.7733772160245175]
0.7504188490291807+-0.013903864697498275


## Do the same with Ridge (L2 penalty)

In [14]:
# Grid-search the inverse regularisation strength C for L2-penalised (ridge)
# logistic regression with stratified 5-fold cross-validation on the training set.
values = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

skf = StratifiedKFold(n_splits=5)
fold_records = []  # one {"Fold", "C", "Score"} record per (C, fold) pair
mean_scores = []   # mean validation accuracy per C, in the order of `values`

for c in values:
    scores = []

    for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train), start=1):
        print("Fold {} of 5".format(fold))
        X_cv_train, X_cv_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_cv_train, y_cv_val = y_train.iloc[train_index], y_train.iloc[test_index]

        clf_ridge = LogisticRegression(random_state=0, solver='liblinear', penalty="l2", C=c, multi_class="auto").fit(X_cv_train, y_cv_train)

        score = clf_ridge.score(X_cv_val, y_cv_val)
        fold_records.append({'Fold': fold, 'C': c, 'Score': score})
        scores.append(score)

    mean_scores.append(np.mean(scores))
    print('Results: {}'.format(scores))
    print('C: {}, Accuracy: {}'.format(c, np.mean(scores)))

# Build the per-fold results table once; its columns match the recorded keys.
results = pd.DataFrame(fold_records, columns=["Fold", "C", "Score"])

mean_scores

Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.5060975609756098, 0.50920245398773, 0.50920245398773, 0.5153374233128835, 0.50920245398773]
C: 0.001, Accuracy: 0.5098084692503366
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.7317073170731707, 0.7055214723926381, 0.6809815950920245, 0.7300613496932515, 0.7300613496932515]
C: 0.01, Accuracy: 0.7156666167888673
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.8170731707317073, 0.7791411042944786, 0.8159509202453987, 0.8282208588957055, 0.7668711656441718]
C: 0.1, Accuracy: 0.8014514439622925
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.8292682926829268, 0.7730061349693251, 0.7791411042944786, 0.7975460122699386, 0.7791411042944786]
C: 1, Accuracy: 0.7916205297022294
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.7987804878048781, 0.7791411042944786, 0.7730061349693251, 0.803680981595092, 0.7668711656441718]
C: 10, A

[0.5098084692503366,
 0.7156666167888673,
 0.8014514439622925,
 0.7916205297022294,
 0.7842959748615891,
 0.785515487056711,
 0.7867424809217417,
 0.781834505461619]

In [15]:
# Refit the L2-penalised (ridge) model on the whole training set with C=0.1.
ridge_model = LogisticRegression(
    penalty="l2",
    C=0.1,
    solver='liblinear',
    multi_class="auto",
    random_state=0,
)
clf_ridge = ridge_model.fit(X_train, y_train)

In [16]:
# Evaluate the refitted ridge model on the held-out test set.
y_test_pred_ridge = clf_ridge.predict(X_test)
final_score = clf_ridge.score(X_test, y_test)

print('Confusion matrix\n', confusion_matrix(y_test, y_test_pred_ridge))
print('Accuracy', final_score)

Confusion matrix
 [[ 36   0   0   0   0]
 [  0  12   2   1   0]
 [  0   0 124   4   0]
 [  0   1   9  16   0]
 [  1   0  12   1   0]]
Accuracy 0.8584474885844748


# Repeat the L1 cross-validation with z-scaled data

In [9]:
# Load the z-scaled training and test sets; this rebinds X_train / X_test.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/miRNA_filtered_norm_z_scaled_train.csv",
        "../data/miRNA_filtered_norm_z_scaled_test.csv",
    )
)

In [10]:
# Report (rows, columns) of each z-scaled split; the label column is still included.
print("Training data shape is", X_train.shape)
print("Test data shape is", X_test.shape)

Training data shape is (816, 1453)
Test data shape is (219, 1453)


In [11]:
# The label column is named differently in the two CSV files.
train_label_col = "Ciriello_subtype"
test_label_col = "expert_PAM50_subtypes"

y_train = X_train[train_label_col]
y_test = X_test[test_label_col]

In [12]:
# Remove the label columns so that only feature columns remain in X_train / X_test.
X_train = X_train.drop(columns=["Ciriello_subtype"])
X_test = X_test.drop(columns=["expert_PAM50_subtypes"])

### Train model with 5-fold CV

In [13]:
# Grid-search the inverse regularisation strength C for L1-penalised logistic
# regression with stratified 5-fold cross-validation on the current X_train / y_train.
values = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

skf = StratifiedKFold(n_splits=5)
fold_records = []  # one {"Fold", "C", "Score"} record per (C, fold) pair
mean_scores = []   # mean validation accuracy per C, in the order of `values`

for c in values:
    scores = []

    for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train), start=1):
        print("Fold {} of 5".format(fold))
        X_cv_train, X_cv_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_cv_train, y_cv_val = y_train.iloc[train_index], y_train.iloc[test_index]

        clf = LogisticRegression(random_state=0, solver='liblinear', penalty="l1", C=c, multi_class="auto").fit(X_cv_train, y_cv_train)

        score = clf.score(X_cv_val, y_cv_val)
        fold_records.append({'Fold': fold, 'C': c, 'Score': score})
        scores.append(score)

    mean_scores.append(np.mean(scores))
    print('Results: {}'.format(scores))
    print('C: {}, Accuracy: {}'.format(c, np.mean(scores)))

# Build the per-fold results table once; its columns match the recorded keys.
results = pd.DataFrame(fold_records, columns=["Fold", "C", "Score"])

mean_scores

Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.16463414634146342, 0.1656441717791411, 0.1656441717791411, 0.1656441717791411, 0.1656441717791411]
C: 0.001, Accuracy: 0.16544216669160555
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.6829268292682927, 0.6748466257668712, 0.6932515337423313, 0.6993865030674846, 0.6809815950920245]
C: 0.01, Accuracy: 0.6862786173874009
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.8048780487804879, 0.7852760736196319, 0.7975460122699386, 0.7852760736196319, 0.7852760736196319]
C: 0.1, Accuracy: 0.7916504563818644
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.774390243902439, 0.7423312883435583, 0.7668711656441718, 0.7607361963190185, 0.754601226993865]
C: 1, Accuracy: 0.7597860242406106
Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
Results: [0.7865853658536586, 0.7300613496932515, 0.7668711656441718, 0.7730061349693251, 0.754601226993865]
C:



Fold 5 of 5
Results: [0.75, 0.6932515337423313, 0.6871165644171779, 0.7484662576687117, 0.6932515337423313]
C: 10000, Accuracy: 0.7144171779141104


[0.16544216669160555,
 0.6862786173874009,
 0.7916504563818644,
 0.7597860242406106,
 0.7622250486308544,
 0.7352835552895407,
 0.7217866227742032,
 0.7144171779141104]