In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm

In [54]:
def status_map(status):
    """Convert a raw status label into a binary class.

    Returns 0 when the label contains either 'no_relapse' or 'NoRelapse',
    and 1 for every other label.
    """
    relapse_free_tags = ('no_relapse', 'NoRelapse')
    is_relapse_free = any(tag in status for tag in relapse_free_tags)
    return 0 if is_relapse_free else 1

class BcData:
    """Expression table plus per-sample status labels, preprocessed on construction.

    Construction reads two CSV files, drops every sample whose status is not in
    the known list, then collapses the expression table to one row per
    ``GeneSymbol`` (median) and applies a natural log.
    """

    # Status labels that are kept; samples with any other label are dropped.
    STATUS_LIST = ['relapse', 'no_relapse', 'test1relapse',
                   'test1no_relapse', 'test2relapse',
                   'test2no_relapse', 'NewTest1_Relapse',
                   'NewTest1_NoRelapse', 'NewTest2_Relapse', 'NewTest2_NoRelapse']

    def __init__(self, data_path="data/data_good.csv", total_path="data/Total_old.csv"):
        """Load both CSV files and run the preprocessing steps.

        Args:
            data_path: CSV with a ``GeneSymbol`` column and one column per sample id.
            total_path: Header-less CSV whose two columns are read as ``gsm`` and ``status``.
        """
        # self.data  -> expression table (after preprocessing: indexed by GeneSymbol)
        # self.total -> sample/status table restricted to the kept samples
        self.data = pd.read_csv(data_path)
        self.total = pd.read_csv(total_path, names=["gsm", "status"])
        self._drop_grey()
        self._median_and_log()

    def _drop_grey(self):
        """Keep only samples whose status is listed in ``STATUS_LIST``.

        Sets ``self.gsm_series`` to the kept sample ids, and filters both
        ``self.total`` (rows) and ``self.data`` (columns) down to those samples.
        """
        self.gsm_series = self.total[self.total.status.isin(self.STATUS_LIST)].gsm
        # Series.append was removed in pandas 2.0; a plain list of labels is
        # all DataFrame.filter(items=...) needs.
        new_cols = ["GeneSymbol", *self.gsm_series]

        # Drop grey status
        self.total = self.total[self.total.gsm.isin(self.gsm_series)]
        self.data = self.data.filter(items=new_cols)

    def _median_and_log(self):
        """Collapse rows sharing a ``GeneSymbol`` to their median, then take the natural log."""
        grouped = self.data.groupby(['GeneSymbol']).median()
        self.data = np.log(grouped)

    def filter_percentile(self, quantile=0.9, threshold=7):
        """Select genes by a per-gene quantile and return the transposed table.

        Args:
            quantile: Quantile computed across each row of ``self.data``.
            threshold: Rows whose quantile value is >= this are kept.

        Returns:
            DataFrame with one row per sample column of ``self.data`` and one
            column per selected gene.
        """
        q = self.data.quantile(q=quantile, axis=1)
        index = q[q >= threshold].index.values
        return self.data.loc[index, :].T

    def get_status(self):
        """Return the ``status`` column of ``self.total`` mapped through ``status_map``.

        The result keeps the row order and index of ``self.total``.
        NOTE(review): callers that pair this with ``filter_percentile`` rely on
        positional alignment between ``self.total`` rows and ``self.data``
        columns -- confirm every sample id in ``total`` exists in ``data``.
        """
        return self.total.status.map(status_map)

In [55]:
# Build the dataset wrapper; the constructor reads both CSV files and runs the
# status filtering and median/log preprocessing.
df = BcData()
# Show the sample/status table that remains after the status filtering.
df.total

Unnamed: 0,gsm,status
1,GSM441628,relapse
2,GSM441629,relapse
3,GSM441643,relapse
4,GSM441644,relapse
5,GSM441657,relapse
...,...,...
995,GSM79256,NewTest2_NoRelapse
996,GSM79307,NewTest2_NoRelapse
997,GSM79194,NewTest2_NoRelapse
998,GSM79179,NewTest2_NoRelapse


In [71]:
# Feature matrix: one row per sample column, restricted to genes whose
# 0.9-quantile (the method's default) of log values is >= 8.
X = df.filter_percentile(threshold=8)
# Binary labels produced by status_map: 0 for no-relapse labels, 1 otherwise.
y = df.get_status()

In [72]:
# Hold out 33% of the samples for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit a support vector classifier with scikit-learn's default hyperparameters.
# NOTE(review): features are passed in unscaled and the split is not stratified;
# consider whether either affects the reported score -- TODO confirm.
clf = svm.SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy on the held-out samples (displayed as the cell output).
accuracy_score(y_test, y_pred)



0.7449392712550608