In [1]:
# Import libraries
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as spy
import prince
import pickle
import plotly.express as px

# 결과 확인을 용이하게 하기 위한 코드
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

# 데이터 불러오기

In [20]:
with open('FwithS_mat.pickle', 'rb') as f:
    FwithS_mat = pickle.load(f)

In [21]:
with open('FwoutS_mat.pickle', 'rb') as f:
    FwoutS_mat = pickle.load(f)

# 데이터 전처리

## icd9 : 4019, 4280/ item 제거

In [22]:
FwithS_mat = FwithS_mat.drop(['icd9 : 4019', 'icd9 : 4280'], axis=1)
FwithS_mat = FwithS_mat[FwithS_mat.columns.drop(
    list(FwithS_mat.filter(regex='item')))]

FwithS_mat

Unnamed: 0,HADM_ID,GENDER,age,icd9 : 0389,icd9 : 5849,icd9 : 486,icd9 : 51881,icd9 : 78552,icd9 : 2866,icd9 : 496,...,icd9 : 61179,icd9 : E9399,icd9 : 99939,icd9 : 56731,icd9 : 41404,icd9 : 30183,icd9 : 30928,icd9 : 5933,icd9 : 71697,icd9 : 7469
0,192123.0,F,66.0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,112906.0,M,71.0,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,196896.0,M,76.0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,193975.0,F,83.0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,151459.0,M,52.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,148314.0,F,81.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
635,103030.0,F,78.0,0,1,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
636,145414.0,F,61.0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
637,186076.0,M,69.0,1,1,0,0,1,0,1,...,0,0,0,0,0,0,0,1,1,0


In [23]:
FwoutS_mat = FwoutS_mat.drop('icd9 : 4019', axis=1)
FwoutS_mat = FwoutS_mat[FwoutS_mat.columns.drop(
    list(FwoutS_mat.filter(regex='item')))]

FwoutS_mat

Unnamed: 0,HADM_ID,GENDER,age,icd9 : 1570,icd9 : 57410,icd9 : 9971,icd9 : 4275,icd9 : 99811,icd9 : 5680,icd9 : 55321,...,icd9 : 33721,icd9 : 34691,icd9 : 60781,icd9 : 40591,icd9 : 1838,icd9 : 4374,icd9 : 86122,icd9 : 86113,icd9 : E9654,icd9 : 88013
0,112213.0,M,72.0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,143045.0,F,39.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,188822.0,M,50.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,109235.0,M,300.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,157681.0,F,75.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13075,101083.0,M,66.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
13076,167228.0,M,77.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13077,117390.0,M,78.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13078,197084.0,F,65.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 모든 feature를 binary로 변환

In [24]:
# Gender가 F면 0, M이면 1로 변환

FwithS_mat.replace({'F': 0, 'M': 1}, inplace=True)
FwithS_mat

FwoutS_mat.replace({'F': 0, 'M': 1}, inplace=True)
FwoutS_mat

Unnamed: 0,HADM_ID,GENDER,age,icd9 : 0389,icd9 : 5849,icd9 : 486,icd9 : 51881,icd9 : 78552,icd9 : 2866,icd9 : 496,...,icd9 : 61179,icd9 : E9399,icd9 : 99939,icd9 : 56731,icd9 : 41404,icd9 : 30183,icd9 : 30928,icd9 : 5933,icd9 : 71697,icd9 : 7469
0,192123.0,0,66.0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,112906.0,1,71.0,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,196896.0,1,76.0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,193975.0,0,83.0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,151459.0,1,52.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,148314.0,0,81.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
635,103030.0,0,78.0,0,1,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
636,145414.0,0,61.0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
637,186076.0,1,69.0,1,1,0,0,1,0,1,...,0,0,0,0,0,0,0,1,1,0


Unnamed: 0,HADM_ID,GENDER,age,icd9 : 1570,icd9 : 57410,icd9 : 9971,icd9 : 4275,icd9 : 99811,icd9 : 5680,icd9 : 55321,...,icd9 : 33721,icd9 : 34691,icd9 : 60781,icd9 : 40591,icd9 : 1838,icd9 : 4374,icd9 : 86122,icd9 : 86113,icd9 : E9654,icd9 : 88013
0,112213.0,1,72.0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,143045.0,0,39.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,188822.0,1,50.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,109235.0,1,300.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,157681.0,0,75.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13075,101083.0,1,66.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
13076,167228.0,1,77.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13077,117390.0,1,78.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13078,197084.0,0,65.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
FwithS_mat['age'] = FwithS_mat['age'].apply(lambda x: 0 if x <= 60 else 1)
FwithS_mat

FwoutS_mat['age'] = FwoutS_mat['age'].apply(lambda x: 0 if x <= 60 else 1)
FwoutS_mat

Unnamed: 0,HADM_ID,GENDER,age,icd9 : 0389,icd9 : 5849,icd9 : 486,icd9 : 51881,icd9 : 78552,icd9 : 2866,icd9 : 496,...,icd9 : 61179,icd9 : E9399,icd9 : 99939,icd9 : 56731,icd9 : 41404,icd9 : 30183,icd9 : 30928,icd9 : 5933,icd9 : 71697,icd9 : 7469
0,192123.0,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,112906.0,1,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,196896.0,1,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,193975.0,0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,151459.0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,148314.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
635,103030.0,0,1,0,1,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
636,145414.0,0,1,1,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
637,186076.0,1,1,1,1,0,0,1,0,1,...,0,0,0,0,0,0,0,1,1,0


Unnamed: 0,HADM_ID,GENDER,age,icd9 : 1570,icd9 : 57410,icd9 : 9971,icd9 : 4275,icd9 : 99811,icd9 : 5680,icd9 : 55321,...,icd9 : 33721,icd9 : 34691,icd9 : 60781,icd9 : 40591,icd9 : 1838,icd9 : 4374,icd9 : 86122,icd9 : 86113,icd9 : E9654,icd9 : 88013
0,112213.0,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,143045.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,188822.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,109235.0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,157681.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13075,101083.0,1,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
13076,167228.0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13077,117390.0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13078,197084.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 머신러닝 모델 적용

In [28]:
FwithS_mat['target'] = 1
FwithS_mat

FwoutS_mat['target'] = 0
FwoutS_mat

Unnamed: 0,HADM_ID,GENDER,age,icd9 : 0389,icd9 : 5849,icd9 : 486,icd9 : 51881,icd9 : 78552,icd9 : 2866,icd9 : 496,...,icd9 : E9399,icd9 : 99939,icd9 : 56731,icd9 : 41404,icd9 : 30183,icd9 : 30928,icd9 : 5933,icd9 : 71697,icd9 : 7469,target
0,192123.0,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
1,112906.0,1,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,196896.0,1,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,193975.0,0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,151459.0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,148314.0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
635,103030.0,0,1,0,1,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
636,145414.0,0,1,1,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,1
637,186076.0,1,1,1,1,0,0,1,0,1,...,0,0,0,0,0,0,1,1,0,1


Unnamed: 0,HADM_ID,GENDER,age,icd9 : 1570,icd9 : 57410,icd9 : 9971,icd9 : 4275,icd9 : 99811,icd9 : 5680,icd9 : 55321,...,icd9 : 34691,icd9 : 60781,icd9 : 40591,icd9 : 1838,icd9 : 4374,icd9 : 86122,icd9 : 86113,icd9 : E9654,icd9 : 88013,target
37,121804.0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3806,163534.0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6684,102404.0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12510,125643.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5392,125174.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
727,140331.0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5797,149063.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1930,129131.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12273,111421.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## fmerge

In [35]:
df_all = pd.DataFrame(
    columns=['precision', 'recall', 'F-score', 'AUC', 'model'])

for i in tqdm(range(30)):

    FwoutS_mat = FwoutS_mat.sample(n=len(FwithS_mat))
    fmerge = pd.merge(FwithS_mat, FwoutS_mat, how='outer')
    fmerge = fmerge.fillna(0)
    cols = fmerge.columns.tolist()
    cols = [col for col in fmerge if col != 'target'] + ['target']
    fmerge = fmerge[cols]

    X_train, X_test, y_train, y_test = train_test_split(
        fmerge.iloc[:, 1:-1], fmerge['target'])

    # randomforest
    rf = RandomForestClassifier(oob_score=True)

    rf_model = rf.fit(X_train, y_train)

    rf_y_pred = rf_model.predict(X_test)
    rf_y_pred_prob = rf_model.predict_proba(X_test)

    df_all.loc[i] = [
        metrics.precision_score(y_test, rf_y_pred),
        metrics.recall_score(y_test, rf_y_pred),
        metrics.f1_score(y_test, rf_y_pred),
        metrics.roc_auc_score(y_test, rf_y_pred_prob[:, 1]), 'randomFst'
    ]

    # adaboost
    ad = AdaBoostClassifier(n_estimators=50, learning_rate=1)

    ad_model = ad.fit(X_train, y_train)

    ad_y_pred = ad_model.predict(X_test)
    ad_y_pred_prob = ad_model.predict_proba(X_test)

    df_all.loc[i+30] = [
        metrics.precision_score(y_test, ad_y_pred),
        metrics.recall_score(y_test, ad_y_pred),
        metrics.f1_score(y_test, ad_y_pred),
        metrics.roc_auc_score(y_test, ad_y_pred_prob[:, 1]), 'adaBST'
    ]

    # xgboost
    xgb = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=4)

    xgb_model = xgb.fit(X_train, y_train)

    xgb_y_pred = xgb_model.predict(X_test)
    xgb_y_pred_prob = xgb_model.predict_proba(X_test)

    df_all.loc[i+60] = [
        metrics.precision_score(y_test, xgb_y_pred),
        metrics.recall_score(y_test, xgb_y_pred),
        metrics.f1_score(y_test, xgb_y_pred),
        metrics.roc_auc_score(y_test, xgb_y_pred_prob[:, 1]), 'XGBoost'
    ]

























































































































100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [21:20<00:00, 42.68s/it]


In [63]:
df_all = df_all.sort_index()
df_all

Unnamed: 0,precision,recall,F-score,AUC,model
0,0.910569,0.788732,0.845283,0.942510,randomFst
1,0.963235,0.775148,0.859016,0.941658,randomFst
2,0.950413,0.727848,0.824373,0.949677,randomFst
3,0.888060,0.739130,0.806780,0.914108,randomFst
4,0.944000,0.742138,0.830986,0.934782,randomFst
...,...,...,...,...,...
85,0.887218,0.813793,0.848921,0.930150,XGBoost
86,0.851351,0.851351,0.851351,0.929297,XGBoost
87,0.873333,0.850649,0.861842,0.928262,XGBoost
88,0.871622,0.860000,0.865772,0.936998,XGBoost


## mca

In [38]:
mca = prince.MCA(n_components=10,
                 n_iter=3,
                 copy=True,
                 check_input=True,
                 engine='auto',
                 random_state=42)

In [56]:
mca_df_all = pd.DataFrame(
    columns=['precision', 'recall', 'F-score', 'AUC', 'model'])
lst_target = []

for i in tqdm(range(30)):

    FwoutS_mat = FwoutS_mat.sample(n=len(FwithS_mat))
    fmerge = pd.merge(FwithS_mat, FwoutS_mat, how='outer')
    fmerge = fmerge.fillna(0)
    target = fmerge['target']
    fmerge.replace({0: 'F', 1: 'T'}, inplace=True)
    fmerge = fmerge.drop('target', axis=1)

    # mca 적용
    mca = mca.fit(fmerge)
    fmerge_mca = mca.transform(fmerge)
    fmerge_mca['target'] = target

    X_train, X_test, y_train, y_test = train_test_split(
        fmerge_mca.iloc[:, :-1], fmerge_mca['target'])

    # randomforest
    rf = RandomForestClassifier(oob_score=True)

    rf_model = rf.fit(X_train, y_train)

    rf_y_pred = rf_model.predict(X_test)
    rf_y_pred_prob = rf_model.predict_proba(X_test)

    mca_df_all.loc[i] = [
        metrics.precision_score(y_test, rf_y_pred),
        metrics.recall_score(y_test, rf_y_pred),
        metrics.f1_score(y_test, rf_y_pred),
        metrics.roc_auc_score(y_test, rf_y_pred_prob[:, 1]), 'randomFst'
    ]

    # adaboost
    ad = AdaBoostClassifier(n_estimators=50, learning_rate=1)

    ad_model = ad.fit(X_train, y_train)

    ad_y_pred = ad_model.predict(X_test)
    ad_y_pred_prob = ad_model.predict_proba(X_test)

    mca_df_all.loc[i + 30] = [
        metrics.precision_score(y_test, ad_y_pred),
        metrics.recall_score(y_test, ad_y_pred),
        metrics.f1_score(y_test, ad_y_pred),
        metrics.roc_auc_score(y_test, ad_y_pred_prob[:, 1]), 'adaBST'
    ]

    # xgboost
    xgb = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=4)

    xgb_model = xgb.fit(X_train, y_train)

    xgb_y_pred = xgb_model.predict(X_test)
    xgb_y_pred_prob = xgb_model.predict_proba(X_test)

    mca_df_all.loc[i + 60] = [
        metrics.precision_score(y_test, xgb_y_pred),
        metrics.recall_score(y_test, xgb_y_pred),
        metrics.f1_score(y_test, xgb_y_pred),
        metrics.roc_auc_score(y_test, xgb_y_pred_prob[:, 1]), 'XGBoost'
    ]

























































































































100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [08:01<00:00, 16.05s/it]


In [58]:
mca_df_all = mca_df_all.sort_index()
mca_df_all

Unnamed: 0,precision,recall,F-score,AUC,model
0,0.812081,0.756250,0.783172,0.861547,randomFst
1,0.763514,0.812950,0.787456,0.880362,randomFst
2,0.773973,0.837037,0.804270,0.885721,randomFst
3,0.731250,0.829787,0.777409,0.874158,randomFst
4,0.768750,0.842466,0.803922,0.886677,randomFst
...,...,...,...,...,...
85,0.786207,0.775510,0.780822,0.864387,XGBoost
86,0.739726,0.750000,0.744828,0.829264,XGBoost
87,0.748344,0.784722,0.766102,0.864738,XGBoost
88,0.838710,0.817610,0.828025,0.894189,XGBoost


# 결과 저장

In [66]:
# with open('df_all.pickle', 'wb') as f:
#     pickle.dump(df_all, f)

In [69]:
# with open('mca_df_all.pickle', 'wb') as f:
#     pickle.dump(mca_df_all, f)