In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm

In [68]:
def status_map(status):
    """Encode a sample status label as a binary relapse flag.

    Args:
        status: Status label, e.g. ``'relapse'`` or ``'NewTest1_NoRelapse'``.

    Returns:
        0 when the label contains ``'no_relapse'`` or ``'NoRelapse'``,
        1 for every other label.
    """
    # The two spellings of "no relapse" that occur in the status labels.
    no_relapse_markers = ('no_relapse', 'NoRelapse')
    return 0 if any(marker in status for marker in no_relapse_markers) else 1

class BcData:
    """Expression matrix and sample status table, loaded and preprocessed.

    On construction the two CSV files are read, samples whose status is not
    one of the known labels are dropped, probe rows are collapsed to one row
    per ``GeneSymbol`` (median) and the values are log-transformed.

    Attributes set by the constructor:
        data: DataFrame indexed by ``GeneSymbol`` with one column per kept
            GSM sample, holding ``log(median)`` values.
        total: DataFrame with columns ``gsm`` and ``status`` restricted to
            the kept samples.
        gsm_series: Series of the kept GSM identifiers.
    """

    # Status labels that mark a usable sample; rows of the status table with
    # any other label (the "grey" samples) are dropped.
    _KNOWN_STATUSES = (
        'relapse', 'no_relapse',
        'test1relapse', 'test1no_relapse',
        'test2relapse', 'test2no_relapse',
        'NewTest1_Relapse', 'NewTest1_NoRelapse',
        'NewTest2_Relapse', 'NewTest2_NoRelapse',
    )

    def __init__(self, data_path="data/data_good.csv",
                 total_path="data/Total_old.csv"):
        """Load both CSV files and run the preprocessing steps.

        Args:
            data_path: CSV with a ``GeneSymbol`` column and one column per
                GSM sample.
            total_path: Header-less CSV whose two columns are read as
                ``gsm`` and ``status``.
        """
        self.data = pd.read_csv(data_path)
        self.total = pd.read_csv(total_path, names=["gsm", "status"])
        self._drop_grey()
        self._median_and_log()

    # Drop grey columns
    def _drop_grey(self):
        """Keep only samples with a known status, in both tables."""
        has_known_status = self.total.status.isin(self._KNOWN_STATUSES)
        self.gsm_series = self.total[has_known_status].gsm

        # Build the column list as a plain list: Series.append was removed in
        # pandas 2.0, so the former Series(...).append(...) no longer works.
        new_cols = ["GeneSymbol"] + self.gsm_series.tolist()

        self.total = self.total[self.total.gsm.isin(self.gsm_series)]
        # filter() silently ignores labels that are not present in the frame.
        self.data = self.data.filter(items=new_cols)

    # Group rows by median + log
    def _median_and_log(self):
        """Collapse rows sharing a GeneSymbol to their median, then take ln."""
        grouped = self.data.groupby(['GeneSymbol']).median()
        self.data = np.log(grouped)

    # Drop rows with quantile less than threshold
    def filter_percentile(self, quantile=1, threshold=9):
        """Select genes whose per-gene quantile reaches ``threshold``.

        Args:
            quantile: Quantile (0..1) computed across samples for each gene.
            threshold: Minimum quantile value for a gene to be kept.

        Returns:
            Transposed DataFrame: one row per sample, one column per kept gene.
        """
        q = self.data.quantile(q=quantile, axis=1)
        index = q[q >= threshold].index.values
        return self.data.loc[index, :].T

    # Drop rows with max/min diff less than threshold
    def filter_diff_percentile(self, qmax=1, qmin=0, threshold=2):
        """Select genes whose spread between two quantiles reaches ``threshold``.

        Args:
            qmax: Upper quantile (0..1) computed across samples for each gene.
            qmin: Lower quantile (0..1) computed across samples for each gene.
            threshold: Minimum ``upper - lower`` difference for a gene to be
                kept.

        Returns:
            Transposed DataFrame: one row per sample, one column per kept gene.
        """
        # Named upper/lower rather than max/min to avoid shadowing builtins.
        upper = self.data.quantile(q=qmax, axis=1)
        lower = self.data.quantile(q=qmin, axis=1)
        index = upper[upper - lower >= threshold].index.values
        return self.data.loc[index, :].T

    def get_status(self):
        """Return the kept samples' status mapped through ``status_map``."""
        return self.total.status.map(status_map)

In [87]:
# Build the dataset: BcData() reads both CSV files and applies the
# grey-sample drop and the median + log steps in its constructor.
df = BcData()
# Bare last expression so the notebook renders the preprocessed matrix
# (rows indexed by GeneSymbol, one column per kept GSM sample).
df.data

Unnamed: 0_level_0,GSM441628,GSM441629,GSM441643,GSM441644,GSM441657,GSM441663,GSM441672,GSM441677,GSM441689,GSM441690,...,GSM79316,GSM79301,GSM79303,GSM79278,GSM79158,GSM79256,GSM79307,GSM79194,GSM79179,GSM79182
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1CF,4.059420,4.125846,4.261423,4.208891,4.173578,4.367125,4.170052,4.115277,4.263574,4.506097,...,4.654275,4.274173,4.664881,4.841696,4.524626,4.531064,4.442445,5.130762,4.534371,4.409195
A2M,7.250621,7.015533,7.343277,7.091867,7.409191,6.577331,7.409887,7.276370,7.521524,7.415741,...,7.819769,6.706868,7.266317,7.995327,8.013664,7.647380,7.595518,7.695076,8.014537,8.413800
A4GALT,4.149443,4.269429,4.165371,4.297533,4.096815,4.698260,4.393498,4.206948,4.185714,4.334078,...,3.833979,4.014249,3.802835,3.954304,4.017890,4.043528,3.960771,3.924244,4.134121,4.009921
AAAS,5.495491,5.420292,5.413764,5.489838,5.029686,5.456436,5.052468,5.326788,5.283397,5.323858,...,5.481734,5.451669,5.289615,5.503668,5.338955,5.359535,5.658918,5.608028,5.520725,5.353880
AACS,5.526568,5.456867,5.721675,5.377826,5.196545,5.377027,5.836672,5.673175,5.308639,5.164717,...,6.546118,5.423160,5.284345,4.954869,5.261773,5.456248,4.788766,5.279940,5.576775,5.494419
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZWINT,5.930530,5.121228,6.201211,5.597388,5.732969,5.244901,6.298365,5.976645,5.456760,6.757985,...,5.522149,5.576396,5.659350,4.235866,5.221890,5.140476,4.772674,4.780811,5.821290,4.422917
ZXDC,5.004436,5.405870,5.220383,4.735057,4.867888,5.445715,4.977134,4.546081,5.044657,4.981824,...,4.659791,5.108802,4.914917,5.018326,4.790271,4.924082,4.909304,4.955559,4.765766,4.803562
ZYX,5.367937,5.559140,5.909452,5.575134,5.405439,5.264163,5.918659,5.739645,5.542463,5.698880,...,6.233840,6.033469,6.461617,5.817522,6.624238,6.286778,6.142490,6.495577,6.037938,6.402736
ZZEF1,4.147237,4.154740,4.302374,4.052619,3.781124,4.056973,4.201757,4.510221,4.335452,4.236138,...,4.172880,4.234815,4.463964,4.501783,4.312624,4.310297,4.501754,4.562437,4.307719,4.310096


In [99]:
# Feature selection: keep genes whose spread between the 25% and 75%
# quantiles across samples is at least 1.8.
# Alternative that was tried: df.filter_percentile(quantile=1, threshold=9)
X = df.filter_diff_percentile(threshold=1.8, qmin=0.25, qmax=0.75)
# Binary labels as produced by status_map (0 = no relapse, 1 = otherwise).
y = df.get_status()
# X has one column per selected gene, so the column count is the feature count.
print("Number of features: {}".format(X.shape[1]))

Number of features: 17


In [100]:
# Fixed seed so the hold-out split is identical on every Restart & Run All.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# L1-penalised linear SVM.
# Rule of thumb: set dual=True if number of features > number of examples and
# vice versa. With penalty='l1', LinearSVC only supports dual=False.
clf = svm.LinearSVC(penalty='l1', dual=False, C=0.1, max_iter=10000)
# Alternative that was tried: svm.SVC(kernel='linear', C=1)

# NOTE(review): cross-validation runs on the full X / y, and nothing in this
# cell reads the split above - confirm whether the scores should be computed
# on X_train / y_train so that X_test stays untouched.
scores = cross_val_score(clf, X, y, cv=5)
scores


array([0.76666667, 0.73333333, 0.73333333, 0.72483221, 0.73154362])