In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm

In [43]:
def status_map(status):
    """Encode a status label as a binary class.

    Returns 0 when the label contains 'no_relapse' or 'NoRelapse',
    and 1 for every other label.
    """
    no_relapse_markers = ('no_relapse', 'NoRelapse')
    return 0 if any(marker in status for marker in no_relapse_markers) else 1

class BcData:
    """Expression table plus relapse status labels for a set of samples.

    Attributes:
        data: DataFrame whose first column is ``GeneSymbol`` and whose other
            columns hold log2-transformed values, one column per retained
            sample id. Several rows may share the same gene symbol.
        total: DataFrame with columns ``gsm`` and ``status`` restricted to the
            retained samples.
        gsm_series: the ``gsm`` ids of the retained samples.
    """

    # Status labels that mark a usable sample; rows of ``total`` with any
    # other label are the "grey" samples that get dropped.
    _KNOWN_STATUSES = (
        'relapse', 'no_relapse', 'test1relapse',
        'test1no_relapse', 'test2relapse',
        'test2no_relapse', 'NewTest1_Relapse',
        'NewTest1_NoRelapse', 'NewTest2_Relapse', 'NewTest2_NoRelapse',
    )

    def __init__(self, data_path="data/data_good.csv",
                 total_path="data/Total_old.csv"):
        """Load both CSV files, drop grey samples and log2-transform the values.

        Args:
            data_path: CSV with a ``GeneSymbol`` column and one column per sample.
            total_path: header-less CSV with two columns (sample id, status).
        """
        self.data = pd.read_csv(data_path)
        self.total = pd.read_csv(total_path, names=["gsm", "status"])
        self._drop_grey()
        self._log_table()

    def _drop_grey(self):
        """Keep only samples whose status is one of the known labels."""
        known = self.total.status.isin(self._KNOWN_STATUSES)
        self.gsm_series = self.total[known].gsm
        self.total = self.total[self.total.gsm.isin(self.gsm_series)]
        # A plain list avoids Series.append, which pandas 2.0 removed.
        wanted_columns = ["GeneSymbol"] + list(self.gsm_series)
        self.data = self.data.filter(items=wanted_columns)

    def _expression(self):
        """Return only the numeric columns of ``data`` (no ``GeneSymbol``)."""
        return self.data.select_dtypes(include=[np.number])

    def _groupby_gene(self):
        """Collapse rows sharing a gene symbol to the row with the highest median.

        Returns:
            A new DataFrame with one row per gene symbol, in order of first
            appearance, and a fresh 0..n-1 index. ``self.data`` is not modified.
        """
        row_medians = self._expression().median(axis=1)
        best_rows = row_medians.groupby(self.data["GeneSymbol"], sort=False).idxmax()
        return self.data.loc[best_rows.to_numpy()].reset_index(drop=True)

    def _log_table(self):
        """Replace every column after the first with its log2, keeping the symbols."""
        # Capture the symbols from this instance before the table is rebuilt.
        gene_symbols = self.data.iloc[:, 0]
        logged = np.log2(self.data.iloc[:, 1:])
        logged.insert(0, "GeneSymbol", gene_symbols)
        self.data = logged

    def _get_status(self):
        """Map every retained sample's status label to 0/1 via ``status_map``."""
        return self.total.status.map(status_map)

    def get_status(self):
        """Public accessor for the 0/1 status labels of the retained samples."""
        return self._get_status()

    def filter_percentile(self, quantile=1, threshold=9):
        """Drop rows whose row-wise quantile is below ``threshold``.

        The original author lists 7, 8 and 9 as candidate thresholds.

        Args:
            quantile: quantile in [0, 1] computed across the sample columns.
            threshold: minimum value of that quantile for a row to be kept.

        Returns:
            The filtered ``self.data`` (which is also updated in place).
        """
        row_quantile = self._expression().quantile(q=quantile, axis=1)
        self.data = self.data.loc[row_quantile >= threshold]
        return self.data

    def filter_diff_percentile(self, qmax=1, qmin=0, threshold=2):
        """Drop rows whose spread between two row-wise quantiles is too small.

        The original author lists 1.5 and 2 as candidate thresholds.

        Args:
            qmax: upper quantile in [0, 1].
            qmin: lower quantile in [0, 1].
            threshold: minimum ratio; its log2 is compared against the
                difference of the two quantiles because ``data`` holds log2
                values.

        Returns:
            The filtered ``self.data`` (which is also updated in place).
        """
        log_threshold = np.log2(threshold)
        values = self._expression()
        upper = values.quantile(q=qmax, axis=1)
        lower = values.quantile(q=qmin, axis=1)
        self.data = self.data.loc[(upper - lower) >= log_threshold]
        return self.data

    def get_data(self):
        """Return ``(X, y)``: the per-gene table and the 0/1 status labels.

        ``X`` has one row per gene symbol and one column per sample (plus the
        ``GeneSymbol`` column); ``y`` has one entry per retained sample.
        """
        return self._groupby_gene(), self._get_status()

In [44]:
# Build the dataset: the constructor reads both CSV files, drops the grey
# samples and log2-transforms the value columns.
df = BcData()
# Display the resulting table (GeneSymbol followed by one column per sample).
df.data

Unnamed: 0,GeneSymbol,GSM441628,GSM441629,GSM441643,GSM441644,GSM441657,GSM441663,GSM441672,GSM441677,GSM441689,...,GSM79316,GSM79301,GSM79303,GSM79278,GSM79158,GSM79256,GSM79307,GSM79194,GSM79179,GSM79182
0,STAT1,8.085887,7.168682,6.587984,7.722179,7.942245,7.827552,8.529509,8.177509,7.104756,...,7.276683,7.124783,7.434695,7.516606,7.321612,6.940813,7.072663,7.299620,7.016519,6.955359
1,STAT1,7.215892,6.344216,5.860707,7.079538,7.408219,7.211402,7.573147,7.176143,6.471859,...,6.410635,6.430200,6.710751,5.913739,6.577861,5.878689,6.063203,6.219805,6.101205,6.118268
2,STAT1,9.134603,8.451038,7.579670,9.779734,9.591954,9.990217,9.944285,9.512452,8.765760,...,8.922837,8.390225,9.423664,8.167669,9.558530,8.047386,7.946754,7.608927,7.937962,8.781497
3,GAPDH,12.158026,11.094282,12.228632,12.620419,12.248156,11.656219,12.275234,12.285795,11.776943,...,11.362377,11.076295,11.730678,10.114888,10.768449,10.796315,10.738075,10.029563,11.426453,10.844870
4,GAPDH,12.191482,11.436701,12.447930,12.838816,12.413879,11.771444,12.419728,12.253520,11.886142,...,11.824044,11.579387,12.354924,10.977766,11.561135,11.234554,11.090972,11.443612,11.942170,11.546567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15549,NAA40,5.767247,5.652168,5.662317,5.650879,5.495909,5.193153,5.423397,6.083643,6.209853,...,5.888770,5.900077,6.360449,5.387842,6.116325,5.860739,6.294257,5.796691,5.413347,5.855691
15550,BTRC,5.714644,5.444177,6.008467,5.590210,5.377617,5.527033,5.845658,5.388899,5.497909,...,6.105841,5.623829,5.943659,6.097038,5.682910,5.956912,5.790339,6.133189,5.955422,5.845658
15551,TBX10,5.571185,5.457801,5.817575,5.847338,6.531254,5.546283,5.637451,5.352289,5.474624,...,5.425788,5.325538,5.164026,5.396297,5.335473,5.378463,5.393852,5.791715,5.533890,5.360719
15552,KCNE4,5.004802,8.477475,7.935460,5.653321,6.980882,6.842803,8.392695,5.064982,5.972883,...,5.643798,7.851693,5.519869,5.741812,6.723873,5.830428,6.494056,7.350126,7.483824,5.525765


In [45]:
# X: one row per GeneSymbol (for each gene the row with the highest median is
# kept); y: the 0/1 status label of every retained sample.
X, y = df.get_data()
X

Unnamed: 0,GeneSymbol,GSM441628,GSM441629,GSM441643,GSM441644,GSM441657,GSM441663,GSM441672,GSM441677,GSM441689,...,GSM79316,GSM79301,GSM79303,GSM79278,GSM79158,GSM79256,GSM79307,GSM79194,GSM79179,GSM79182
0,STAT1,10.118876,9.451877,8.354805,10.610259,10.741121,10.795358,10.542674,9.963624,9.622380,...,9.512118,9.142033,10.445532,8.601886,10.321116,8.950999,8.635794,8.230515,8.700245,9.350323
1,GAPDH,13.171502,12.639870,13.351215,13.747857,13.461415,12.820907,13.300210,13.056116,12.847682,...,13.142397,12.627935,13.665591,12.370290,12.719370,12.529614,12.599213,12.723268,13.369066,12.730598
2,ACTB,12.972974,12.792522,13.104928,13.195644,13.287268,12.845088,12.976763,12.989742,13.131984,...,13.815934,13.505725,13.895954,13.399865,13.878779,13.374089,13.549797,13.626952,13.740487,13.258825
3,PRPF8,8.128170,9.802132,9.263711,8.182230,8.693173,8.526859,9.587142,8.660406,8.918318,...,9.266135,9.311937,8.627198,8.643070,9.391888,9.204001,8.580696,9.419642,8.443959,9.562798
4,CAPNS1,10.173527,10.089729,11.119064,10.515148,9.709678,10.922785,10.338892,10.666046,10.487619,...,10.045186,10.489998,10.127981,10.036435,10.479315,10.630677,10.537510,9.951876,10.058966,10.253008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10502,OR7E156P,6.531940,6.427398,6.563658,6.477344,5.985362,6.575580,6.472582,6.796728,6.354320,...,6.824857,6.471791,7.638950,6.735766,6.755809,6.964768,6.865931,6.957973,6.527740,6.838687
10503,ALS2CL,7.085956,6.939873,7.856531,7.485306,6.988912,6.916453,7.167569,7.412231,6.800408,...,7.628963,7.307183,7.661764,7.863015,7.599251,7.334407,7.522221,8.125796,7.469381,7.742586
10504,C4orf34,4.224581,4.511664,4.333331,4.291869,4.323838,4.412483,4.256294,4.216835,4.409296,...,4.856343,4.384768,4.144715,4.405958,4.100053,4.686943,4.346276,4.522991,4.438193,4.336719
10505,TBX10,5.571185,5.457801,5.817575,5.847338,6.531254,5.546283,5.637451,5.352289,5.474624,...,5.425788,5.325538,5.164026,5.396297,5.335473,5.378463,5.393852,5.791715,5.533890,5.360719


In [46]:
# Transposed view: samples become rows and genes become columns
# (the GeneSymbol column turns into the first row).
X.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10497,10498,10499,10500,10501,10502,10503,10504,10505,10506
GeneSymbol,STAT1,GAPDH,ACTB,PRPF8,CAPNS1,RPL35,RPL28,EIF4G2,EIF3D,PARK7,...,LOC100507009,OR7E47P,EGOT,LOC100510224,ZNF324B,OR7E156P,ALS2CL,C4orf34,TBX10,KCNE4
GSM441628,10.1189,13.1715,12.973,8.12817,10.1735,11.5605,12.5509,10.913,9.63697,11.2945,...,6.17986,6.7671,5.86382,5.69652,5.99727,6.53194,7.08596,4.22458,5.57119,5.0048
GSM441629,9.45188,12.6399,12.7925,9.80213,10.0897,11.0815,12.3546,11.0464,9.35136,11.1511,...,6.76474,6.33582,5.65642,5.5358,6.21999,6.4274,6.93987,4.51166,5.4578,8.47747
GSM441643,8.3548,13.3512,13.1049,9.26371,11.1191,10.9123,12.8378,10.7683,9.21377,11.0424,...,6.22703,6.71261,6.41299,6.25083,6.22703,6.56366,7.85653,4.33333,5.81757,7.93546
GSM441644,10.6103,13.7479,13.1956,8.18223,10.5151,11.4458,12.9752,10.3845,9.00243,11.004,...,6.27291,6.71663,6.99154,5.13919,6.222,6.47734,7.48531,4.29187,5.84734,5.65332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM79256,8.951,12.5296,13.3741,9.204,10.6307,11.5134,12.4074,10.4458,9.94693,10.7103,...,7.00995,6.97119,6.52152,5.95256,6.25076,6.96477,7.33441,4.68694,5.37846,5.83043
GSM79307,8.63579,12.5992,13.5498,8.5807,10.5375,11.0951,12.2959,10.4591,9.25498,11.1421,...,7.03424,7.16659,8.3352,5.59474,6.13713,6.86593,7.52222,4.34628,5.39385,6.49406
GSM79194,8.23052,12.7233,13.627,9.41964,9.95188,10.5958,11.9644,9.53527,9.56916,10.5923,...,7.82551,7.28478,6.33529,5.88596,6.5041,6.95797,8.1258,4.52299,5.79172,7.35013
GSM79179,8.70025,13.3691,13.7405,8.44396,10.059,11.2763,12.0185,10.7946,9.82516,10.5452,...,7.35875,6.92516,6.19943,5.05401,6.18803,6.52774,7.46938,4.43819,5.53389,7.48382


In [42]:
# Preview of the per-gene reduction: within every GeneSymbol group keep the
# row whose median across the columns is the largest.
(
    df.data
    .groupby("GeneSymbol", as_index=False, sort=False)
    .apply(lambda group: group.loc[group.median(axis=1).idxmax()])
)

Unnamed: 0,GeneSymbol,GSM441628,GSM441629,GSM441643,GSM441644,GSM441657,GSM441663,GSM441672,GSM441677,GSM441689,...,GSM79316,GSM79301,GSM79303,GSM79278,GSM79158,GSM79256,GSM79307,GSM79194,GSM79179,GSM79182
0,STAT1,10.118876,9.451877,8.354805,10.610259,10.741121,10.795358,10.542674,9.963624,9.622380,...,9.512118,9.142033,10.445532,8.601886,10.321116,8.950999,8.635794,8.230515,8.700245,9.350323
1,GAPDH,13.171502,12.639870,13.351215,13.747857,13.461415,12.820907,13.300210,13.056116,12.847682,...,13.142397,12.627935,13.665591,12.370290,12.719370,12.529614,12.599213,12.723268,13.369066,12.730598
2,ACTB,12.972974,12.792522,13.104928,13.195644,13.287268,12.845088,12.976763,12.989742,13.131984,...,13.815934,13.505725,13.895954,13.399865,13.878779,13.374089,13.549797,13.626952,13.740487,13.258825
3,PRPF8,8.128170,9.802132,9.263711,8.182230,8.693173,8.526859,9.587142,8.660406,8.918318,...,9.266135,9.311937,8.627198,8.643070,9.391888,9.204001,8.580696,9.419642,8.443959,9.562798
4,CAPNS1,10.173527,10.089729,11.119064,10.515148,9.709678,10.922785,10.338892,10.666046,10.487619,...,10.045186,10.489998,10.127981,10.036435,10.479315,10.630677,10.537510,9.951876,10.058966,10.253008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10502,OR7E156P,6.531940,6.427398,6.563658,6.477344,5.985362,6.575580,6.472582,6.796728,6.354320,...,6.824857,6.471791,7.638950,6.735766,6.755809,6.964768,6.865931,6.957973,6.527740,6.838687
10503,ALS2CL,7.085956,6.939873,7.856531,7.485306,6.988912,6.916453,7.167569,7.412231,6.800408,...,7.628963,7.307183,7.661764,7.863015,7.599251,7.334407,7.522221,8.125796,7.469381,7.742586
10504,C4orf34,4.224581,4.511664,4.333331,4.291869,4.323838,4.412483,4.256294,4.216835,4.409296,...,4.856343,4.384768,4.144715,4.405958,4.100053,4.686943,4.346276,4.522991,4.438193,4.336719
10505,TBX10,5.571185,5.457801,5.817575,5.847338,6.531254,5.546283,5.637451,5.352289,5.474624,...,5.425788,5.325538,5.164026,5.396297,5.335473,5.378463,5.393852,5.791715,5.533890,5.360719


In [36]:
# NOTE(review): `res` is not defined in any cell shown here, so this cell
# fails on a fresh kernel. Presumably it was the log2-transformed table from a
# since-removed cell; confirm, or delete this cell.
# Put the gene symbols back as the first column of `res`, then display it.
res.insert(0, "GeneSymbol", df.data.iloc[:, 0])
res

Unnamed: 0,GeneSymbol,GSM441628,GSM441629,GSM441643,GSM441644,GSM441657,GSM441663,GSM441672,GSM441677,GSM441689,...,GSM79316,GSM79301,GSM79303,GSM79278,GSM79158,GSM79256,GSM79307,GSM79194,GSM79179,GSM79182
0,STAT1,8.085887,7.168682,6.587984,7.722179,7.942245,7.827552,8.529509,8.177509,7.104756,...,7.276683,7.124783,7.434695,7.516606,7.321612,6.940813,7.072663,7.299620,7.016519,6.955359
1,STAT1,7.215892,6.344216,5.860707,7.079538,7.408219,7.211402,7.573147,7.176143,6.471859,...,6.410635,6.430200,6.710751,5.913739,6.577861,5.878689,6.063203,6.219805,6.101205,6.118268
2,STAT1,9.134603,8.451038,7.579670,9.779734,9.591954,9.990217,9.944285,9.512452,8.765760,...,8.922837,8.390225,9.423664,8.167669,9.558530,8.047386,7.946754,7.608927,7.937962,8.781497
3,GAPDH,12.158026,11.094282,12.228632,12.620419,12.248156,11.656219,12.275234,12.285795,11.776943,...,11.362377,11.076295,11.730678,10.114888,10.768449,10.796315,10.738075,10.029563,11.426453,10.844870
4,GAPDH,12.191482,11.436701,12.447930,12.838816,12.413879,11.771444,12.419728,12.253520,11.886142,...,11.824044,11.579387,12.354924,10.977766,11.561135,11.234554,11.090972,11.443612,11.942170,11.546567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15549,NAA40,5.767247,5.652168,5.662317,5.650879,5.495909,5.193153,5.423397,6.083643,6.209853,...,5.888770,5.900077,6.360449,5.387842,6.116325,5.860739,6.294257,5.796691,5.413347,5.855691
15550,BTRC,5.714644,5.444177,6.008467,5.590210,5.377617,5.527033,5.845658,5.388899,5.497909,...,6.105841,5.623829,5.943659,6.097038,5.682910,5.956912,5.790339,6.133189,5.955422,5.845658
15551,TBX10,5.571185,5.457801,5.817575,5.847338,6.531254,5.546283,5.637451,5.352289,5.474624,...,5.425788,5.325538,5.164026,5.396297,5.335473,5.378463,5.393852,5.791715,5.533890,5.360719
15552,KCNE4,5.004802,8.477475,7.935460,5.653321,6.980882,6.842803,8.392695,5.064982,5.972883,...,5.643798,7.851693,5.519869,5.741812,6.723873,5.830428,6.494056,7.350126,7.483824,5.525765


In [None]:
# Alternative filter: df.filter_percentile(quantile=1, threshold=9)
# The filter updates df.data in place, so apply it first and then fetch the
# per-gene table and the labels from get_data().
df.filter_diff_percentile(qmax=0.75, qmin=0.25, threshold=1.8)
genes, y = df.get_data()
# scikit-learn expects one row per sample and one column per feature, so use
# the gene symbols as column labels and the samples as rows.
X = genes.set_index("GeneSymbol").T
assert len(X) == len(y), "samples in X and labels in y must line up"
print("Number of features: {}".format(len(X.columns)))

In [None]:
SEED = 42  # fixed seed so the hold-out split is reproducible across runs

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED)

# scikit-learn recommends dual=False when n_samples > n_features; with
# penalty='l1' the primal form (dual=False) is the only supported one.
clf = svm.LinearSVC(penalty='l1', dual=False, C=0.1, max_iter=10000)
# clf = svm.SVC(kernel='linear', C=1)

# 5-fold cross-validation on the full (X, y); the hold-out split above is not
# used by it.
scores = cross_val_score(clf, X, y, cv=5)
scores


In [100]:
# NOTE(review): this cell repeats the steps of the cell above.
# Hold out 20% of the samples; no random_state is passed, so the split
# differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Set dual = True if number of features > number of examples and vice versa
# NOTE(review): with penalty='l1' scikit-learn only supports dual=False.
clf = svm.LinearSVC(penalty='l1', dual=False, C=0.1, max_iter=10000)
# clf = svm.SVC(kernel='linear', C=1)

# 5-fold cross-validation on the full (X, y); the hold-out split above is not
# used here.
scores = cross_val_score(clf, X, y, cv=5)
scores


array([0.76666667, 0.73333333, 0.73333333, 0.72483221, 0.73154362])