## Ансамбль алгоритмов

In [5]:
import pandas as pd 
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score, precision_score
from scipy.stats import mode

In [6]:
class EMAggregator:
    
    def __init__(self, test_index, t_matrix, alg_answers, max_iter=5000, eps=0.000005):
        df_temp = pd.DataFrame()
        for index, alg in enumerate(alg_answers):
            df_temp["a_" + str(index)+"_0"] = pd.Series(alg).apply(lambda x: float(x == 0))
            df_temp["a_" + str(index)+"_1"] = pd.Series(alg).apply(lambda x: float(x == 1))
        n =  df_temp.T.as_matrix().reshape(int(len(df_temp.columns) / 2.0) , 2, len(df_temp)).swapaxes(1, 2)
        t_matrix = pd.get_dummies(t_matrix)[[0, 1]].as_matrix().astype(float)
        self.__test_index = test_index
        self.__em_algorithm(t_matrix, n, max_iter, eps)


    def __em_algorithm(self, t_matrix, n, max_iter=1000, eps=0.001):
        it = 0
        pi = np.repeat([[[1.0, 0.0], [0.0, 1.0]]], [n.shape[0]], axis=0)

        while it < max_iter:
            p = t_matrix * n
            s_0 = np.sum(p[:, :, 0], axis=1) + np.sum(t_matrix[:, 0] * n[:, :, 1], axis=1)
            s_1 = np.sum(p[:, :, 1], axis=1) + np.sum(t_matrix[:, 1] * n[:, :, 0], axis=1)

            pi_0 = np.sum(p[:, :, 0], axis=1)
            pi_0[s_0 != 0] = pi_0[s_0 != 0] * 1.0 / s_0[s_0 != 0]
            pi_0[s_0 == 0] = 0.0

            pi_1 = np.sum(p[:, :, 1], axis=1)
            pi_1[s_1 != 0] = pi_1[s_1 != 0] * 1.0 / s_1[s_1 != 0]
            pi_1[s_1 == 0] = 0.0

            pi_new = np.column_stack((pi_0, 1-pi_0, 1-pi_1, pi_1))
            pi_new = pi_new.reshape(pi_new.shape[0], 2, 2)

            if np.all(np.abs(pi_new - pi) < eps):
                self.__t_matrix = t_matrix
                return

            pi = pi_new.copy()
            pr = np.sum(t_matrix, axis=0) * 1.0 / len(t_matrix)

            n_t_0 = np.transpose(n, (1, 0, 2))
            pi_n_0 = np.power(pi[:, 0, :], n_t_0)
            t_0 = np.multiply.reduce(pi_n_0, axis=(1, 2)) * pr[0]

            n_t_1 = np.transpose(n, (1, 0, 2))
            pi_n_1 = np.power(pi[:, 1,:], n_t_1)
            t_1 = np.multiply.reduce(pi_n_1, axis=(1, 2)) * pr[1]

            t = t_0 + t_1
            t_0 = t_0  * 1.0 / t
            t_1 = t_1 * 1.0 / t

            t_matrix = np.column_stack((t_0, t_1))
            it += 1

        self.__t_matrix = t_matrix
        return

    def predict_proba(self):
        return self.__t_matrix[self.__test_index]

## Результаты кросс-валидации

In [72]:
words_cv = pd.read_csv("csv/words_results.csv").drop("Unnamed: 0",axis=1)
words_cv ["ibs"] = words_cv["ibs"].astype(int)
hrv_cv = pd.read_csv("csv/hrv_results.csv").drop("Unnamed: 0",axis=1)
hrv_cv["ibs"] = hrv_cv["ibs"].astype(int)
stat_cv = pd.read_csv("csv/stat_results.csv").drop("Unnamed: 0",axis=1)
stat_cv["ibs"] = stat_cv["ibs"].astype(int)
nn_cv = pd.read_csv("csv/nn_results.csv").drop("Unnamed: 0",axis=1)
nn_cv["ibs"] = nn_cv["ibs"].astype(int)
cnn_cv = pd.read_csv("csv/cnn_results.csv").drop("Unnamed: 0",axis=1)
cnn_cv["ibs"] = cnn_cv["ibs"].astype(int)

df_cv = pd.merge(nn_cv, cnn_cv, on=["ECG_ID", "ibs"])
df_cv = pd.merge(df_cv, stat_cv, on=["ECG_ID", "ibs"])
df_cv = pd.merge(df_cv, hrv_cv, on=["ECG_ID", "ibs"])
df_cv = pd.merge(df_cv, words_cv, on=["ECG_ID", "ibs"])

In [75]:
df_cv["Patient"] = df_cv["ECG_ID"].apply(lambda x: int(x.split("_")[0]))

pr_columns = [u'NN_PR', u'STAT_PR_LR', u'STAT_PR_XGB', u'HRV_PR_LR', u'HRV_PR_XGB',
              u'WORDS_PR_LR', u'WORDS_PR_LR_w2v', u'WORDS_PR_xgb', 'CNN_PR']
p_columns = ['WORDS_P_LR', 'WORDS_P_xgb', 'WORDS_P_xgb_new', 'STAT_P_NEW_LR', 'STAT_P_NEW_XGB', 'WORDS_P_LR_w2v', 
             'WORDS_P_LR_new', 'STAT_P_XGB', 'NN_P2', 'HRV_P_NEW_XGB', 'HRV_P_LR', 'CNN_PRP', 
             'HRV_P_NEW_LR', 'HRV_P_XGB', 'WORDS_P_LR_new_w2v', 'STAT_P_LR', 'CNN_P']

In [77]:
for i in pr_columns:
    print "alg = ", i, roc_auc_score(df_cv["ibs"], df_cv[i])

alg =  NN_PR 0.798869820545
alg =  STAT_PR_LR 0.822809588718
alg =  STAT_PR_XGB 0.804293964519
alg =  HRV_PR_LR 0.631921740645
alg =  HRV_PR_XGB 0.744771863118
alg =  WORDS_PR_LR 0.802354391071
alg =  WORDS_PR_LR_w2v 0.800922130097
alg =  WORDS_PR_xgb 0.755876369487
alg =  CNN_PR 0.741807415986


In [78]:
def patient_score(df, y_test):
    new_df = pd.DataFrame({"Q": df["ibs"]==y_test, "Label":df["Patient"]})
    return new_df.groupby("Label")["Q"].mean().mean()

In [79]:
for i in p_columns:
    print "alg = ", i
    print "f1", f1_score(df_cv["ibs"], df_cv[i])

alg =  WORDS_P_LR
f1 0.688350088705
alg =  WORDS_P_xgb
f1 0.660049627792
alg =  WORDS_P_xgb_new
f1 0.660074165637
alg =  STAT_P_NEW_LR
f1 0.705723905724
alg =  STAT_P_NEW_XGB
f1 0.702567313713
alg =  WORDS_P_LR_w2v
f1 0.692671394799
alg =  WORDS_P_LR_new
f1 0.696774193548
alg =  STAT_P_XGB
f1 0.684789644013
alg =  NN_P2
f1 0.703125
alg =  HRV_P_NEW_XGB
f1 0.675370919881
alg =  HRV_P_LR
f1 0.50773993808
alg =  CNN_PRP
f1 0.6463878327
alg =  HRV_P_NEW_LR
f1 0.592048929664
alg =  HRV_P_XGB
f1 0.662051604783
alg =  WORDS_P_LR_new_w2v
f1 0.699029126214
alg =  STAT_P_LR
f1 0.717197452229
alg =  CNN_P
f1 0.724450194049


In [80]:
for i in sorted(p_columns):
    print "alg = ", i
    print "patient_score", patient_score(df_cv, df_cv[i])

alg =  CNN_P
patient_score 0.671087886272
alg =  CNN_PRP
patient_score 0.720765362471
alg =  HRV_P_LR
patient_score 0.704972939316
alg =  HRV_P_NEW_LR
patient_score 0.763215466963
alg =  HRV_P_NEW_XGB
patient_score 0.769397948293
alg =  HRV_P_XGB
patient_score 0.750306649688
alg =  NN_P2
patient_score 0.760268541781
alg =  STAT_P_LR
patient_score 0.744164909625
alg =  STAT_P_NEW_LR
patient_score 0.74466433239
alg =  STAT_P_NEW_XGB
patient_score 0.740366452729
alg =  STAT_P_XGB
patient_score 0.730083302342
alg =  WORDS_P_LR
patient_score 0.730991274462
alg =  WORDS_P_LR_new
patient_score 0.741210016811
alg =  WORDS_P_LR_new_w2v
patient_score 0.750129459405
alg =  WORDS_P_LR_w2v
patient_score 0.733832088599
alg =  WORDS_P_xgb
patient_score 0.712603159442
alg =  WORDS_P_xgb_new
patient_score 0.71453881457


In [110]:
p_new_columns = ['CNN_P', 'HRV_P_NEW_XGB', 'WORDS_P_xgb_new', 'STAT_P_LR', 'NN_P2',
                 'HRV_P_NEW_LR']
results = []
for i in p_new_columns:
    results.append(df_cv[i])
test_majority = mode(results)[0][0]
t_matrix = np.array(list(test_majority))
em = EMAggregator(df_cv.index, t_matrix, results)
em_proba = em.predict_proba()
print "roc_auc", roc_auc_score(df_cv["ibs"], em_proba[:,1])
em_p = np.zeros(len(em_proba))
em_p[em_proba[:, 1] > 0.5] = 1

print "patient score", patient_score(df_cv, em_p)
print "f1", f1_score(df_cv.ibs, em_p)

print "patient score", patient_score(df_cv, test_majority)
print "f1", f1_score(df_cv.ibs, em_p, test_majority)

roc_auc 0.873837408914
patient score 0.81086136483
f1 0.778497409326
patient score 0.806345750996
f1 0.778497409326


## РЕЗУЛЬТАТЫ НА ТЕСТОВОЙ ВЫБОРКЕ

In [87]:
nn = pd.read_csv("nn_test_results.csv").drop("Unnamed: 0",axis=1)
nn = nn.rename(columns={"CNN_P":"NN_P", "CNN_PR":"NN_PR"})
nn["ibs"] = nn["ibs"].astype(int)

In [88]:
words = pd.read_csv("words_test_results.csv").drop("Unnamed: 0",axis=1)
words["ibs"] = words["ibs"].astype(int)
hrv = pd.read_csv("hrv_test_results.csv").drop("Unnamed: 0",axis=1)
hrv["ibs"] = hrv["ibs"].astype(int)
stat = pd.read_csv("test_stat_results.csv").drop("Unnamed: 0",axis=1)
stat["ibs"] = stat["ibs"].astype(int)
cnn = pd.read_csv("cnn_test_results.csv").drop("Unnamed: 0",axis=1)
cnn["ibs"] = cnn["ibs"].astype(int)

In [89]:
df = pd.merge(nn, cnn, on=["ECG_ID", "ibs"])
df = pd.merge(df, stat, on=["ECG_ID", "ibs"])
df = pd.merge(df, hrv, on=["ECG_ID", "ibs"])
df = pd.merge(df, words, on=["ECG_ID", "ibs"])

In [90]:
pr_columns = [u'NN_PR', u'STAT_PR_LR', u'STAT_PR_XGB', u'HRV_PR_LR', u'HRV_PR_XGB',
              u'WORDS_PR_LR', u'WORDS_PR_LR_w2v', u'WORDS_PR_xgb', 'CNN_PR']
p_columns = ['WORDS_P_LR', 'WORDS_P_xgb', 'WORDS_P_xgb_new', 'STAT_P_NEW_LR', 'STAT_P_NEW_XGB', 'WORDS_P_LR_w2v', 
             'WORDS_P_LR_new', 'STAT_P_XGB', 'NN_P', 'HRV_P_NEW_XGB', 'HRV_P_LR', 
             'HRV_P_NEW_LR', 'HRV_P_XGB', 'WORDS_P_LR_new_w2v', 'STAT_P_LR', 'CNN_P']

In [91]:
df["Patient"] = df["ECG_ID"].apply(lambda x: x.split("_")[0])

In [92]:
from itertools import combinations

In [93]:
for i in pr_columns:
    print "alg = ", i, roc_auc_score(df["ibs"], df[i])

alg =  NN_PR 0.944439237738
alg =  STAT_PR_LR 0.923499600819
alg =  STAT_PR_XGB 0.947958971155
alg =  HRV_PR_LR 0.750605713492
alg =  HRV_PR_XGB 0.877777777778
alg =  WORDS_PR_LR 0.907541046201
alg =  WORDS_PR_LR_w2v 0.908740324204
alg =  WORDS_PR_xgb 0.950421743205
alg =  CNN_PR 0.91176368496


In [94]:
def patient_score(df, y_test):
    new_df = pd.DataFrame({"Q": df["ibs"]==y_test, "Label":df["Patient"]})
    return new_df.groupby("Label")["Q"].mean().mean()

In [95]:
results = []
for i in p_columns:
    results.append(df[i])

In [97]:
for i in p_columns:
    print "alg = ", i
    print "f1", f1_score(df["ibs"], df[i])

alg =  WORDS_P_LR
f1 0.773599386032
alg =  WORDS_P_xgb
f1 0.847315436242
alg =  WORDS_P_xgb_new
f1 0.848281642917
alg =  STAT_P_NEW_LR
f1 0.781163434903
alg =  STAT_P_NEW_XGB
f1 0.839669421488
alg =  WORDS_P_LR_w2v
f1 0.777604976672
alg =  WORDS_P_LR_new
f1 0.770864946889
alg =  STAT_P_XGB
f1 0.840236686391
alg =  NN_P
f1 0.827914353688
alg =  HRV_P_NEW_XGB
f1 0.749010292953
alg =  HRV_P_LR
f1 0.539428571429
alg =  HRV_P_NEW_LR
f1 0.657534246575
alg =  HRV_P_XGB
f1 0.751454696592
alg =  WORDS_P_LR_new_w2v
f1 0.761974944731
alg =  STAT_P_LR
f1 0.809564474808
alg =  CNN_P
f1 0.797936371453


In [98]:
for i in p_columns:
    print "alg = ", i
    print "patient_score", patient_score(df, df[i])

alg =  WORDS_P_LR
patient_score 0.767492550134
alg =  WORDS_P_xgb
patient_score 0.773724225462
alg =  WORDS_P_xgb_new
patient_score 0.777427929166
alg =  STAT_P_NEW_LR
patient_score 0.630847885207
alg =  STAT_P_NEW_XGB
patient_score 0.787858766316
alg =  WORDS_P_LR_w2v
patient_score 0.759113174584
alg =  WORDS_P_LR_new
patient_score 0.769962195638
alg =  STAT_P_XGB
patient_score 0.7772527375
alg =  NN_P
patient_score 0.789676220994
alg =  HRV_P_NEW_XGB
patient_score 0.802959368669
alg =  HRV_P_LR
patient_score 0.458590999842
alg =  HRV_P_NEW_LR
patient_score 0.706940083748
alg =  HRV_P_XGB
patient_score 0.78361545224
alg =  WORDS_P_LR_new_w2v
patient_score 0.785610568291
alg =  STAT_P_LR
patient_score 0.733083102927
alg =  CNN_P
patient_score 0.743282043648


In [100]:
results = []
for i in p_columns:
    results.append(df[i])
test_majority = mode(results)[0][0]
print accuracy_score(df.ibs, test_majority)
print f1_score(df.ibs, test_majority)
print patient_score(df, test_majority)

0.909207161125
0.87925170068
0.810511761447


In [106]:
p_new_columns = ['CNN_P', 'HRV_P_NEW_XGB', 'WORDS_P_xgb_new', 'STAT_P_LR', 'NN_P',
                 'HRV_P_NEW_LR']
results = []
for i in p_new_columns:
    results.append(df[i])
test_majority = mode(results)[0][0]
t_matrix = np.array(list(test_majority))
em = EMAggregator(df.index, t_matrix, results)
em_proba = em.predict_proba()
print "roc_auc", roc_auc_score(df["ibs"], em_proba[:,1])
em_p = np.zeros(len(em_proba))
em_p[em_proba[:, 1] > 0.5] = 1

print "patient score", patient_score(df, em_p)
print "f1", f1_score(df.ibs, em_p)

print "patient score", patient_score(df, test_majority)
print "f1", f1_score(df.ibs, test_majority)

roc_auc 0.973729563678
patient score 0.841988687262
f1 0.90245971162
patient score 0.760596403038
f1 0.880717488789
