In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm

In [2]:
def status_map(status):
    """Encode a status label as a binary class.

    Returns 0 when the label contains 'no_relapse' or 'NoRelapse'
    (substring match), and 1 for every other label.
    """
    no_relapse_markers = ('no_relapse', 'NoRelapse')
    return 0 if any(marker in status for marker in no_relapse_markers) else 1

class BcData:
    """Per-gene value table plus per-sample relapse labels.

    Attributes (all set by ``__init__`` / ``_drop_grey``):
        data: frame read from ``data/data_good.csv``, reduced to the
            ``GeneSymbol`` column and one column per retained sample.
        total: frame read from ``data/Total_old.csv`` with columns ``gsm``
            and ``status``, reduced to the retained samples.
        gsm_series: the ``gsm`` identifiers of the retained samples.
    """

    # Status labels that are kept; samples with any other status are dropped.
    _KNOWN_STATUSES = (
        'relapse', 'no_relapse',
        'test1relapse', 'test1no_relapse',
        'test2relapse', 'test2no_relapse',
        'NewTest1_Relapse', 'NewTest1_NoRelapse',
        'NewTest2_Relapse', 'NewTest2_NoRelapse',
    )

    def __init__(self):
        """Read both CSV files and drop the samples without a known status."""
        self.data = pd.read_csv("data/data_good.csv")
        self.total = pd.read_csv("data/Total_old.csv", names=["gsm", "status"])
        self._drop_grey()
        # NOTE(review): the median/log step is disabled, so `data` still has
        # GeneSymbol as an ordinary column and holds untransformed values.
        # The default thresholds of the filter_* methods look like log-scale
        # numbers -- confirm whether this call should be re-enabled.
        # self._median_and_log()

    def _drop_grey(self):
        """Keep only samples whose status is in ``_KNOWN_STATUSES``.

        Sets ``gsm_series``, restricts ``total`` to those samples, and
        restricts ``data`` to ``GeneSymbol`` plus the matching sample columns.
        """
        has_known_status = self.total.status.isin(self._KNOWN_STATUSES)
        self.gsm_series = self.total[has_known_status].gsm
        # Series.append was removed in pandas 2.0; pd.concat yields the same
        # sequence ("GeneSymbol" first, then the sample ids) on old and new
        # pandas alike.
        new_cols = pd.concat([pd.Series(["GeneSymbol"]), self.gsm_series])

        self.total = self.total[self.total.gsm.isin(self.gsm_series)]
        self.data = self.data.filter(items=new_cols)

    def _median_and_log(self):
        """Collapse rows sharing a GeneSymbol to their median, then take ln.

        After this call ``data`` is indexed by ``GeneSymbol``.
        """
        grouped = self.data.groupby(['GeneSymbol']).median()
        self.data = np.log(grouped)

    def filter_percentile(self, quantile=1, threshold=9):
        """Select rows whose row-wise quantile is at least ``threshold``.

        Args:
            quantile: quantile (0..1) computed across each row.
            threshold: minimum quantile value for a row to be kept.

        Returns:
            The selected rows of ``data``, transposed (one column per kept
            row). If ``GeneSymbol`` is still a column of ``data`` it appears
            as a row of the result.
        """
        # numeric_only=True keeps the pre-pandas-2.0 default, so a string
        # column such as GeneSymbol is skipped instead of raising.
        row_q = self.data.quantile(q=quantile, axis=1, numeric_only=True)
        keep = row_q[row_q >= threshold].index.values
        return self.data.loc[keep, :].T

    def filter_diff_percentile(self, qmax=1, qmin=0, threshold=2):
        """Select rows whose quantile spread is at least ``threshold``.

        Args:
            qmax: upper quantile (0..1) computed across each row.
            qmin: lower quantile (0..1) computed across each row.
            threshold: minimum ``q(qmax) - q(qmin)`` for a row to be kept.

        Returns:
            The selected rows of ``data``, transposed (one column per kept
            row). If ``GeneSymbol`` is still a column of ``data`` it appears
            as a row of the result.
        """
        # Local names avoid shadowing the builtins max/min.
        upper = self.data.quantile(q=qmax, axis=1, numeric_only=True)
        lower = self.data.quantile(q=qmin, axis=1, numeric_only=True)
        keep = upper[upper - lower >= threshold].index.values
        return self.data.loc[keep, :].T

    def get_status(self):
        """Return the binary label of every retained sample via ``status_map``.

        The result follows the row order and index of ``total``; it is not
        re-aligned against the rows returned by the filter_* methods.
        """
        return self.total.status.map(status_map)

In [9]:
# Build the dataset: BcData() reads both CSV files and keeps only the samples
# whose status is in its known-status list.
df = BcData()
# Display the resulting table (GeneSymbol plus one column per retained sample).
df.data

Unnamed: 0,GeneSymbol,GSM441628,GSM441629,GSM441643,GSM441644,GSM441657,GSM441663,GSM441672,GSM441677,GSM441689,...,GSM79316,GSM79301,GSM79303,GSM79278,GSM79158,GSM79256,GSM79307,GSM79194,GSM79179,GSM79182
0,STAT1,271.7030,143.8760,96.2013,211.1580,245.9540,227.1580,369.5200,289.5180,137.6400,...,155.0600,139.5640,173.0080,183.1150,159.9650,122.8550,134.6120,157.5450,129.4740,124.1000
1,STAT1,148.6620,81.2455,58.1097,135.2550,169.8620,148.2000,190.4340,144.6220,88.7613,...,85.0733,86.2349,104.7460,60.2855,95.5286,58.8385,66.8661,74.5329,68.6508,69.4676
2,STAT1,562.0690,349.9580,191.2970,879.0090,771.7310,1017.0800,985.2080,730.3540,435.2680,...,485.3350,335.5130,686.7610,287.5500,754.0570,264.5480,246.7240,195.2160,245.2250,440.0420
3,GAPDH,4570.1500,2186.3100,4799.3800,6296.8700,4864.7700,3227.5400,4956.9400,4993.3600,3509.2400,...,2632.7900,2159.2200,3398.4900,1108.8800,1744.3200,1778.3400,1707.9800,1045.2000,2752.3600,1839.2100
4,GAPDH,4677.3700,2771.9800,5587.2800,7326.0300,5456.9500,3495.8900,5479.1200,4882.8900,3785.1700,...,3625.7000,3060.1500,5238.4500,2016.6800,3021.6800,2409.5600,2181.3000,2785.2900,3935.0600,2991.3200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15549,NAA40,54.4646,50.2889,50.6439,50.2440,45.1267,36.5843,42.9146,67.8202,74.0205,...,59.2511,59.7173,82.1648,41.8699,69.3741,58.1110,78.4802,55.5876,42.6167,57.9080
15550,BTRC,52.5145,43.5372,64.3767,48.1749,41.5742,46.1108,57.5067,41.9006,45.1893,...,68.8718,49.3107,61.5488,68.4528,51.3720,62.1168,55.3434,70.1898,62.0527,57.5067
15551,TBX10,47.5438,43.9503,56.3981,57.5737,92.4918,46.7302,49.7785,40.8507,44.4658,...,42.9858,40.1002,35.8531,42.1160,40.3773,41.5986,42.0447,55.3962,46.3305,41.0901
15552,KCNE4,32.1067,356.4300,244.8000,50.3291,126.3150,114.7860,336.0880,33.4743,62.8083,...,49.9980,230.9910,45.8824,53.5128,105.7030,56.9028,90.1375,163.1580,179.0010,46.0703


In [25]:
# Collapse rows that share a GeneSymbol into one row holding the per-sample
# median. sort=False keeps the groups in first-appearance order and
# as_index=False leaves GeneSymbol as a regular column.
newd = (
    df.data
    .groupby('GeneSymbol', as_index=False, sort=False)
    .median()
)
newd
# NOTE(review): with as_index=False the row labels are integers, so looking a
# gene up by label (newd.loc["STAT1"]) presumably will not work here -- confirm.

Unnamed: 0,GeneSymbol,GSM441628,GSM441629,GSM441643,GSM441644,GSM441657,GSM441663,GSM441672,GSM441677,GSM441689,...,GSM79316,GSM79301,GSM79303,GSM79278,GSM79158,GSM79256,GSM79307,GSM79194,GSM79179,GSM79182
0,STAT1,287.4710,231.6150,126.2110,324.3710,380.5880,375.2780,766.1780,424.8210,194.2830,...,244.6190,163.3910,535.4710,287.5500,550.7670,191.2820,170.4080,161.3690,212.8650,239.2640
1,GAPDH,5770.8550,3999.5000,7214.9650,9534.6200,6744.7450,4756.1800,6815.5950,5792.2550,5108.3450,...,4997.8000,3877.9000,7405.9750,3039.0050,3969.8250,3712.7300,3926.8850,3531.0350,6213.5200,4015.3550
2,ACTB,5127.6200,4373.3500,5493.4700,5778.5800,5986.8200,4875.8400,6258.4700,6281.9800,5381.5000,...,8177.3500,6920.2600,8289.2400,6147.5100,8729.9600,5875.2300,6985.3500,7237.3000,7172.8800,6363.6000
3,PRPF8,279.7840,892.7620,614.6880,290.4670,413.9100,368.8420,769.1610,404.6150,483.8170,...,615.7220,635.5830,395.4080,399.7820,671.8000,589.7670,382.8660,684.8490,348.2450,756.2910
4,CAPNS1,1154.8800,1089.7100,2224.1900,1463.4400,837.3450,1941.2700,1295.1400,1624.8000,1435.7800,...,1056.5800,1438.1500,1118.9900,1050.1900,1427.5400,1585.4500,1486.3000,990.4060,1066.7200,1220.2900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10502,OR7E156P,92.5358,86.0676,94.5928,89.0994,63.3539,95.3777,88.8058,111.1780,81.8165,...,113.3670,88.7571,199.3210,106.5780,108.0690,124.9120,116.6410,124.3250,92.2668,114.4590
10503,ALS2CL,135.8580,122.7750,231.7670,179.1850,127.0200,120.7980,143.7650,170.3350,111.4620,...,197.9460,158.3730,202.4980,232.8110,193.9110,161.3900,183.8290,279.3240,177.2180,214.1660
10504,C4orf34,18.6950,22.8111,20.1587,19.5876,20.0265,21.2956,19.1105,18.5949,21.2486,...,28.9671,20.8904,17.6882,21.1995,17.1490,25.7579,20.3404,22.9909,21.6785,20.2061
10505,TBX10,47.5438,43.9503,56.3981,57.5737,92.4918,46.7302,49.7785,40.8507,44.4658,...,42.9858,40.1002,35.8531,42.1160,40.3773,41.5986,42.0447,55.3962,46.3305,41.0901


In [99]:
# Feature matrix: keep the rows of df.data whose interquartile spread
# (q0.75 - q0.25) is at least 1.8; the result is transposed.
# Alternative selection by row maximum:
#   X = df.filter_percentile(quantile=1, threshold=9)
X = df.filter_diff_percentile(qmax=0.75, qmin=0.25, threshold=1.8)
# Binary labels (0 = no relapse, 1 = otherwise) in the row order of df.total.
y = df.get_status()
print("Number of features: {}".format(X.shape[1]))

Number of features: 17


In [100]:
# Hold out 20% of the samples. random_state pins the shuffle so the split is
# the same on every run (Restart & Run All reproduces it).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Set dual = True if number of features > number of examples and vice versa
clf = svm.LinearSVC(penalty='l1', dual=False, C=0.1, max_iter=10000)
# clf = svm.SVC(kernel='linear', C=1)

# 5-fold cross-validation on the full X / y. The hold-out split above is not
# used by this cell.
scores = cross_val_score(clf, X, y, cv=5)
scores


array([0.76666667, 0.73333333, 0.73333333, 0.72483221, 0.73154362])