<a href="https://colab.research.google.com/github/su-00/Graduation-portfolio/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline
from collections import Counter

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from scipy import interp

data = pd.read_csv('./noin.csv')

In [None]:
features = data.columns

In [None]:
def detect_outliers(df,n,features):
    """
    Return the index labels of rows that are Tukey outliers in more than n features.

    For every column in ``features`` a value counts as an outlier when it is
    below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of that column and ``IQR = Q3 - Q1``.
    Missing values are ignored when the quartiles are computed and are never
    counted as outliers themselves.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    n : int
        A row is reported when it is an outlier in strictly more than ``n``
        of the inspected columns.
    features : iterable
        Column labels of ``df`` to inspect; the columns must be numeric.

    Returns
    -------
    list
        Index labels of the reported rows, in order of first detection.
    """
    outlier_counts = Counter()

    for col in features:
        values = df[col]

        # nanpercentile skips missing entries. np.percentile would return NaN
        # for any column containing NaN, which makes both comparisons below
        # False and silently hides every outlier of that column.
        q1, q3 = np.nanpercentile(values, [25, 75])
        outlier_step = 1.5 * (q3 - q1)

        # Comparisons against NaN are False, so missing values are not flagged.
        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)

        # One hit per (row, column) pair.
        outlier_counts.update(values[is_outlier].index)

    return [label for label, hits in outlier_counts.items() if hits > n]

Outliers_to_drop = detect_outliers(data,2,features)

In [None]:
def missing_values_table(df):
    """Summarise the missing values of every column of ``df``.

    Prints how many columns ``df`` has and how many of them contain missing
    values, then returns a DataFrame indexed by column name with two columns:
    'Missing Values' (count of nulls) and '% of Total Values' (share of rows,
    rounded to one decimal). Only columns whose percentage is non-zero are
    kept, sorted by percentage in descending order.
    """
    # Null count per column and the same count as a percentage of all rows
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)

    # Put both series side by side and give the two columns readable names
    summary = pd.concat([null_counts, null_percent], axis=1)
    summary.columns = ['Missing Values', '% of Total Values']

    # Keep only columns that actually have missing data, largest share first
    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending = False)
    summary = summary.round(1)

    # Print some summary information
    message = 'Your selected dataframe has {} columns. \nThere are {} columns that have missing values.'
    print(message.format(df.shape[1], summary.shape[0]))

    return summary

In [None]:
#No duplicated values
print("{} duplicated values in data".format(data.duplicated().sum()))

In [None]:
# Drop DFAB_REG_YM (too many missing values) and PERSON_ID in a single call
columns_to_drop = ['DFAB_REG_YM', 'PERSON_ID']
data = data.drop(columns_to_drop, axis=1)

In [None]:
#Age distribution
g = sns.kdeplot(data['AGE'][(data['Stroke'] == 0) & (data['AGE'].notnull())], color = 'Red', shade = True)
g = sns.kdeplot(data['AGE'][(data['Stroke'] == 1) & (data['AGE'].notnull())], color = 'Blue', shade = True)
g.set_xlabel('Age')
g = g.legend(['have no stroke', 'have stroke'])

In [None]:
#환자기본정보와 Stroke
g = sns.heatmap(data[['Stroke', 'SEX', 'AGE', 'SIDO', 'IPSN_TYPE_CD', 'CTRB_PT_TYPE_CD', 'DFAB_GRD_CD', 'DFAB_PTN_CD']].corr(), annot=True, fmt=".2f", cmap="coolwarm")


In [None]:
# Explore sex vs stroke (author's observation: male > female).
# Left panel: mean of Stroke per SEX value. Right panel: row counts per SEX, split by Stroke.
fig, ax = plt.subplots(1, 2, figsize=(18, 8))
data[['Stroke', 'SEX']].groupby(['SEX'], as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title('Stroke vs Sex')
# Name the column with x= instead of passing it positionally: recent seaborn
# releases only accept the data variables of countplot as keyword arguments.
sns.countplot(x='SEX', hue='Stroke', data=data, ax=ax[1])
ax[1].set_title('Sex: Stroke vs No stroke')
plt.show()

In [None]:
#SIDO vs Stroke
g = sns.factorplot(x='SIDO', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#IPSN_TYPE_CD vs Stroke
g = sns.factorplot(x='IPSN_TYPE_CD', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#CTRB_PT_TYPE_CD
g = sns.factorplot(x='CTRB_PT_TYPE_CD', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#DFAB_GRD_CD -> 1 : (high stroke probability)
g = sns.factorplot(x='DFAB_GRD_CD', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#DFAB_PTN_CD (2: high)
g = sns.factorplot(x='DFAB_PTN_CD', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#미래 및 과거 병력 정보와 Stroke
g = sns.heatmap(data[['Stroke', 'AF', 'CAD', 'Cancer', 'DM', 'HF', 'HTN']].corr(), annot=True, fmt=".2f", cmap="coolwarm")

In [None]:
#AF(심방세동) vs Stroke  (1: high)
g = sns.factorplot(x='AF', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#CAD (myocardial infarction) vs Stroke
g = sns.factorplot(x='CAD', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#Cancer vs Stroke
g = sns.factorplot(x='Cancer', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#DM vs Stroke
g = sns.factorplot(x='DM', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#HF vs Stroke
g = sns.factorplot(x='HF', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#HTN vs stroke
g = sns.factorplot(x='HTN', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#과거 약제 정보와 Stroke
g = sns.heatmap(data[['Stroke', 'aad', 'anticoagulant', 'antiplatelet_agent', 'cardiotonics', 'statin']].corr(), annot=True, fmt=".2f", cmap="coolwarm")

In [None]:
#Stroke vs statin
g = sns.factorplot(x='statin', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#cardiotonics vs Stroke
g = sns.factorplot(x='cardiotonics', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#antiplatelet_agent vs Stroke
g = sns.factorplot(x='antiplatelet_agent', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#anticoagulant vs Stroke
g = sns.factorplot(x='anticoagulant', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#aad vs Stroke
g = sns.factorplot(x='aad', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#statin(콜레스테롤 합성저해제) vs antiplatelet_agent(항혈소판제)
g = sns.factorplot(x='statin', y='antiplatelet_agent', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('antiplatelet_agent')

In [None]:
#건강 검진 정보(검진연도 ~ 혈색소(22 ~ 34))와 Stroke
fig, ax = plt.subplots(figsize = (18, 18))
g = sns.heatmap(data[['Stroke', 'HCHK_YEAR', 'HME_DT', 'HEIGHT', 'WEIGHT', 'WAIST', 'BP_HIGH', 'BP_LWST', 'BLDS', 'TOT_CHOLE', 'TRIGLYCERIDE', 'HDL_CHOLE', 'LDL_CHOLE', 'HMG']].corr(), annot=True, fmt=".2f", cmap="coolwarm")

In [None]:
#HCHK_YEAR vs Stroke
g = sns.factorplot(x='HCHK_YEAR', y='Stroke', data=data, kind='bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#HME_DT vs Stroke
g = sns.kdeplot(data['HME_DT'][(data['Stroke'] == 0)], color='Red', shade=True)
g = sns.kdeplot(data['HME_DT'][(data['Stroke'] == 1)], color='Blue', shade=True, ax = g)
g.set_xlabel("HME_DT")
g = g.legend(['No Stroke', 'Stroke'])

In [None]:
#HEIGHT vs Stroke
g = sns.kdeplot(data['HEIGHT'][(data['Stroke'] == 0)], color='Red', shade=True)
g = sns.kdeplot(data['HEIGHT'][(data['Stroke'] == 1)], color='Blue', shade=True, ax = g)
g.set_xlabel("HEIGHT")
g = g.legend(['No Stroke', 'Stroke'])

In [None]:
#WEIGHT vs Stroke -> 영향 x 
g = sns.kdeplot(data['WEIGHT'][(data['Stroke'] == 0)], color='Red', shade=True)
g = sns.kdeplot(data['WEIGHT'][(data['Stroke'] == 1)], color='Blue', shade=True, ax = g)
g.set_xlabel("WEIGHT")
g = g.legend(['No Stroke', 'Stroke'])

In [None]:
# WAIST, BP_HIGH, BP_LWST, BLDS, TOT_CHOLE, TRIGLYCERIDE vs Stroke
# One panel per column on a 3x2 grid, filled row by row.
# Red curve: rows with Stroke == 0, blue curve: rows with Stroke == 1.
f , ax = plt.subplots(3, 2, figsize=(20,15))
kde_columns = ['WAIST', 'BP_HIGH', 'BP_LWST', 'BLDS', 'TOT_CHOLE', 'TRIGLYCERIDE']
for panel, column in zip(ax.flat, kde_columns):
    sns.kdeplot(data[column][(data['Stroke'] == 0)], color='Red', shade=True, ax=panel)
    sns.kdeplot(data[column][(data['Stroke'] == 1)], color='Blue', shade=True, ax=panel)
    panel.set_xlabel(column)
    panel.legend(['No Stroke', 'Stroke'])

In [None]:
# HDL_CHOLE, LDL_CHOLE, HMG vs Stroke (author's note: they do not seem to have much effect on Stroke)
# One panel per column. Red curve: rows with Stroke == 0, blue curve: rows with Stroke == 1.
f , ax = plt.subplots(1, 3, figsize=(20,15))
for panel, column in zip(ax, ['HDL_CHOLE', 'LDL_CHOLE', 'HMG']):
    # Pass ax= for both curves so neither depends on which axes happens to be current
    sns.kdeplot(data[column][(data['Stroke'] == 0)], color='Red', shade=True, ax=panel)
    sns.kdeplot(data[column][(data['Stroke'] == 1)], color='Blue', shade=True, ax=panel)
    panel.set_xlabel(column)
    panel.legend(['No Stroke', 'Stroke'])

In [None]:
#OLIG_PROTE_CD ~ GAMMA_GTP
fig, ax = plt.subplots(figsize = (6, 6))
g = sns.heatmap(data[['Stroke', 'OLIG_PROTE_CD', 'CREATININE', 'SGOT_AST', 'SGPT_ALT', 'GAMMA_GTP']].corr(), annot=True, fmt=".2f", cmap="coolwarm")

In [None]:
#OLIG_PROTE_CD vs Stroke
g = sns.factorplot(x='OLIG_PROTE_CD', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
# CREATININE, SGOT_AST, SGPT_ALT, GAMMA_GTP vs Stroke
# One panel per column on a 2x2 grid, filled row by row.
# Red curve: rows with Stroke == 0, blue curve: rows with Stroke == 1.
f , ax = plt.subplots(2, 2, figsize=(18,8))
kde_columns = ['CREATININE', 'SGOT_AST', 'SGPT_ALT', 'GAMMA_GTP']
for panel, column in zip(ax.flat, kde_columns):
    sns.kdeplot(data[column][(data['Stroke'] == 0)], color='Red', shade=True, ax=panel)
    sns.kdeplot(data[column][(data['Stroke'] == 1)], color='Blue', shade=True, ax=panel)
    panel.set_xlabel(column)
    panel.legend(['No Stroke', 'Stroke'])

In [None]:
#본인 병력 유무 vs Stroke
fig, ax = plt.subplots(figsize = (18, 8))
g = sns.heatmap(data[['Stroke', 'HCHK_APOP_PMH_YN', 'HCHK_HDISE_PMH_YN', 'HCHK_HPRTS_PMH_YN', 'HCHK_DIABML_PMH_YN', 'HCHK_HPLPDM_PMH_YN', 'HCHK_PHSS_PMH_YN', 'HCHK_ETCDSE_PMH_YN']].corr(), annot=True, fmt=".2f", cmap="coolwarm")

In [None]:
# HCHK_APOP_PMH_YN 뇌졸중과거병력유무
g = sns.factorplot(x='HCHK_APOP_PMH_YN', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
sns.factorplot(y='DFAB_GRD_CD', x='HCHK_APOP_PMH_YN', data=data, hue='HCHK_APOP_PMH_YN', kind='bar')


In [None]:
#심장병
g = sns.factorplot(x='HCHK_HDISE_PMH_YN', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#고혈압
g = sns.factorplot(x='HCHK_HPRTS_PMH_YN', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#당뇨병
g = sns.factorplot(x='HCHK_DIABML_PMH_YN', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#고지혈증
g = sns.factorplot(x='HCHK_HPLPDM_PMH_YN', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#폐결핵
g = sns.factorplot(x='HCHK_PHSS_PMH_YN', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#기타질환(암포함)
g = sns.factorplot(x='HCHK_ETCDSE_PMH_YN', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#가족력 vs Stroke
fig, ax = plt.subplots(figsize = (8, 8))
g = sns.heatmap(data[['Stroke', 'FMLY_APOP_PATIEN_YN', 'FMLY_HDISE_PATIEN_YN', 'FMLY_HPRTS_PATIEN_YN', 'FMLY_DIABML_PATIEN_YN', 'FMLY_CANCER_PATIEN_YN']].corr(), annot=True, fmt=".2f", cmap="coolwarm")

In [None]:
#가족력 뇌졸중
g = sns.factorplot(x='FMLY_APOP_PATIEN_YN', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#가족력 심장병
g = sns.factorplot(x='FMLY_HDISE_PATIEN_YN', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#가족력 고혈압
g = sns.factorplot(x='FMLY_HPRTS_PATIEN_YN', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#가족력 당뇨병
g = sns.factorplot(x='FMLY_DIABML_PATIEN_YN', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#가족력 기타(암포함)환자유무
g = sns.factorplot(x='FMLY_CANCER_PATIEN_YN', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#흡연, 음주, 생활습관 vs Stroke
fig, ax = plt.subplots(figsize = (18, 12))
g = sns.heatmap(data[['Stroke', 'SMK_STAT_TYPE_RSPS_CD', 'PAST_SMK_TERM_RSPS_CD', 'CUR_SMK_TERM_RSPS_CD', 'CUR_DSQTY_RSPS_CD', 'DRNK_HABIT_RSPS_CD', 'TM1_DRKQTY_RSPS_CD', 'MOV20_WEK_FREQ_ID', 'MOV30_WEK_FREQ_ID', 'WLK30_WEK_FREQ_ID']].corr(), annot=True, fmt=".2f", cmap="coolwarm")

In [None]:
# Correlation between SMK_STAT_TYPE_RSPS_CD, DRNK_HABIT_RSPS_CD and TM1_DRKQTY_RSPS_CD
# (smoking / drinking columns only; Stroke is not part of this heatmap)
fig, ax = plt.subplots(figsize = (12, 9))
g = sns.heatmap(data[['SMK_STAT_TYPE_RSPS_CD', 'DRNK_HABIT_RSPS_CD', 'TM1_DRKQTY_RSPS_CD']].corr(), annot=True, fmt=".2f", cmap="coolwarm")

In [None]:
#흡연상태
g = sns.factorplot(x='SMK_STAT_TYPE_RSPS_CD', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#흡연기간, 하루흡연량 과거, 현재
f , ax = plt.subplots(2, 2, figsize=(12,6))
sns.kdeplot(data['PAST_SMK_TERM_RSPS_CD'][(data['Stroke'] == 0)], color='Red', shade=True, ax=ax[0, 0])
sns.kdeplot(data['PAST_SMK_TERM_RSPS_CD'][(data['Stroke'] == 1)], color='Blue', shade=True, ax = ax[0,0])
ax[0, 0].set_xlabel("PAST_SMK_TERM_RSPS_CD")
ax[0, 0].legend(['No Stroke', 'Stroke'])

sns.kdeplot(data['PAST_DSQTY_RSPS_CD'][(data['Stroke'] == 0)], color='Red', shade=True, ax= ax[0, 1])
sns.kdeplot(data['PAST_DSQTY_RSPS_CD'][(data['Stroke'] == 1)], color='Blue', shade=True, ax = ax[0, 1])
ax[0, 1].set_xlabel("PAST_DSQTY_RSPS_CD")
ax[0, 1].legend(['No Stroke', 'Stroke'])

sns.kdeplot(data['CUR_SMK_TERM_RSPS_CD'][(data['Stroke'] == 0)], color='Red', shade=True, ax = ax[1, 0])
sns.kdeplot(data['CUR_SMK_TERM_RSPS_CD'][(data['Stroke'] == 1)], color='Blue', shade=True, ax = ax[1, 0])
ax[1, 0].set_xlabel("CUR_SMK_TERM_RSPS_CD")
ax[1, 0].legend(['No Stroke', 'Stroke'])

sns.kdeplot(data['CUR_DSQTY_RSPS_CD'][(data['Stroke'] == 0)], color='Red', shade=True, ax = ax[1, 1])
sns.kdeplot(data['CUR_DSQTY_RSPS_CD'][(data['Stroke'] == 1)], color='Blue', shade=True, ax = ax[1, 1])
ax[1, 1].set_xlabel("CUR_DSQTY_RSPS_CD")
ax[1, 1].legend(['No Stroke', 'Stroke'])

In [None]:
#음주습관
g = sns.kdeplot(data['DRNK_HABIT_RSPS_CD'][(data['Stroke'] == 0)], color='Red', shade=True)
sns.kdeplot(data['DRNK_HABIT_RSPS_CD'][(data['Stroke'] == 1)], color='Blue', shade=True, ax = g)

In [None]:
# TM1_DRKQTY_RSPS_CD, MOV20_WEK_FREQ_ID, MOV30_WEK_FREQ_ID, WLK30_WEK_FREQ_ID vs Stroke
# One panel per column on a 2x2 grid, filled row by row.
# Red curve: rows with Stroke == 0, blue curve: rows with Stroke == 1.
f , ax = plt.subplots(2, 2, figsize=(14,8))
kde_columns = ['TM1_DRKQTY_RSPS_CD', 'MOV20_WEK_FREQ_ID', 'MOV30_WEK_FREQ_ID', 'WLK30_WEK_FREQ_ID']
for panel, column in zip(ax.flat, kde_columns):
    sns.kdeplot(data[column][(data['Stroke'] == 0)], color='Red', shade=True, ax=panel)
    sns.kdeplot(data[column][(data['Stroke'] == 1)], color='Blue', shade=True, ax=panel)
    panel.set_xlabel(column)
    panel.legend(['No Stroke', 'Stroke'])

In [None]:
# KDSQ_C and KDSQ_C_1 .. KDSQ_C_5 vs Stroke (correlation heatmap)
fig, ax = plt.subplots(figsize = (8, 8))
g = sns.heatmap(data[['Stroke', 'KDSQ_C', 'KDSQ_C_1', 'KDSQ_C_2', 'KDSQ_C_3', 'KDSQ_C_4', 'KDSQ_C_5']].corr(), annot=True, fmt=".2f", cmap="coolwarm")

In [None]:
#KDSQ_C
g = sns.factorplot(x='KDSQ_C', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#KDSQ_C_1
g = sns.factorplot(x='KDSQ_C_1', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#KDSQ_C_2
g = sns.factorplot(x='KDSQ_C_2', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#KDSQ_C_3
g = sns.factorplot(x='KDSQ_C_3', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#KDSQ_C_4
g = sns.factorplot(x='KDSQ_C_4', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#KDSQ_C_5
g = sns.factorplot(x='KDSQ_C_5', y='Stroke', data=data, kind = 'bar', size=6, palette = 'muted')
g.despine(left=True)
g = g.set_ylabels('Stroke probability')

In [None]:
#fill Null with DRNK_HABIT's mean value
data['DRNK_HABIT_RSPS_CD'].mean()

In [None]:
data['DRNK_HABIT_RSPS_CD'] = data['DRNK_HABIT_RSPS_CD'].fillna(2)

In [None]:
data.groupby('DRNK_HABIT_RSPS_CD')['TM1_DRKQTY_RSPS_CD'].mean()

In [None]:
# Fill missing TM1_DRKQTY_RSPS_CD with a fixed value chosen per DRNK_HABIT_RSPS_CD code.
# Keys are DRNK_HABIT_RSPS_CD codes, values are the TM1_DRKQTY_RSPS_CD fill values.
drkqty_fill_by_habit = {1: 0, 2: 3, 3: 4, 4: 5, 5: 5, 6: 5, 7: 6, 8: 6}
for habit_code, fill_value in drkqty_fill_by_habit.items():
    rows_to_fill = (data['DRNK_HABIT_RSPS_CD'] == habit_code) & data['TM1_DRKQTY_RSPS_CD'].isnull()
    data.loc[rows_to_fill, 'TM1_DRKQTY_RSPS_CD'] = fill_value


In [None]:
#fill Null with MOV20_WEK_FREQ_ID's mean
data['MOV20_WEK_FREQ_ID'].mean()

In [None]:
data['MOV20_WEK_FREQ_ID'] = data['MOV20_WEK_FREQ_ID'].fillna(2)

In [None]:
#fill LDL_CHOLE with mean 

In [None]:
data['LDL_CHOLE'].mean()

In [None]:
data['LDL_CHOLE'].isnull().sum()

In [None]:
data['LDL_CHOLE'] = data['LDL_CHOLE'].fillna(116.2)

In [None]:
#fill WLK30_WEK_FREQ_ID
data.groupby('MOV20_WEK_FREQ_ID')['WLK30_WEK_FREQ_ID'].mean()

In [None]:
# Fill missing WLK30_WEK_FREQ_ID with a fixed value chosen per MOV20_WEK_FREQ_ID code.
# Keys are MOV20_WEK_FREQ_ID codes, values are the WLK30_WEK_FREQ_ID fill values.
wlk30_fill_by_mov20 = {1: 3, 2: 4, 3: 4, 4: 4, 5: 5, 6: 5, 7: 6, 8: 7}
for mov20_code, fill_value in wlk30_fill_by_mov20.items():
    rows_to_fill = (data['MOV20_WEK_FREQ_ID'] == mov20_code) & data['WLK30_WEK_FREQ_ID'].isnull()
    data.loc[rows_to_fill, 'WLK30_WEK_FREQ_ID'] = fill_value


In [None]:
data.groupby('MOV20_WEK_FREQ_ID')['MOV30_WEK_FREQ_ID'].mean()

In [None]:
# Fill missing MOV30_WEK_FREQ_ID with a fixed value chosen per MOV20_WEK_FREQ_ID code.
# Keys are MOV20_WEK_FREQ_ID codes, values are the MOV30_WEK_FREQ_ID fill values.
mov30_fill_by_mov20 = {1: 2, 2: 2, 3: 3, 4: 4, 5: 4, 6: 5, 7: 5, 8: 6}
for mov20_code, fill_value in mov30_fill_by_mov20.items():
    rows_to_fill = (data['MOV20_WEK_FREQ_ID'] == mov20_code) & data['MOV30_WEK_FREQ_ID'].isnull()
    data.loc[rows_to_fill, 'MOV30_WEK_FREQ_ID'] = fill_value


In [None]:
#fill SMK_STAT_TYPE_RSPS_CD

In [None]:
data.groupby('DRNK_HABIT_RSPS_CD')['SMK_STAT_TYPE_RSPS_CD'].mean()

In [None]:
# Fill missing SMK_STAT_TYPE_RSPS_CD from DRNK_HABIT_RSPS_CD:
# drinking-habit code 1 gets 1, codes 2 through 8 get 2.
for habit_code in range(1, 9):
    fill_value = 1 if habit_code == 1 else 2
    rows_to_fill = (data['DRNK_HABIT_RSPS_CD'] == habit_code) & data['SMK_STAT_TYPE_RSPS_CD'].isnull()
    data.loc[rows_to_fill, 'SMK_STAT_TYPE_RSPS_CD'] = fill_value


In [None]:
#fill OLIG_PROTE_CD

In [None]:
data['OLIG_PROTE_CD'].mean()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained in-place call that pandas may apply to a temporary copy only.
data['OLIG_PROTE_CD'] = data['OLIG_PROTE_CD'].fillna(1.0)

In [None]:
#fill GAMMA_GTP, BLDS

In [None]:
data['GAMMA_GTP'].mean()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained in-place call that pandas may apply to a temporary copy only.
data['GAMMA_GTP'] = data['GAMMA_GTP'].fillna(33)

In [None]:
data['BLDS'].mean()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained in-place call that pandas may apply to a temporary copy only.
data['BLDS'] = data['BLDS'].fillna(104)

In [None]:
data['BLDS'].isnull().sum()

In [None]:
#fill SGOT_AST, SGPT_ALT, CREATININE -> mean
data['SGOT_AST'].mean()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained in-place call that pandas may apply to a temporary copy only.
data['SGOT_AST'] = data['SGOT_AST'].fillna(26.8)

In [None]:
data['SGPT_ALT'].mean()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained in-place call that pandas may apply to a temporary copy only.
data['SGPT_ALT'] = data['SGPT_ALT'].fillna(22.0)

In [None]:
data['CREATININE'].mean()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained in-place call that pandas may apply to a temporary copy only.
data['CREATININE'] = data['CREATININE'].fillna(1.0)

In [None]:
#TOT_CHOLE, HDL_CHOLE, HMG, TRIGLYCERIDE -> mean
data['TOT_CHOLE'].mean()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained in-place call that pandas may apply to a temporary copy only.
data['TOT_CHOLE'] = data['TOT_CHOLE'].fillna(196.5)

In [None]:
data['HDL_CHOLE'].mean()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained in-place call that pandas may apply to a temporary copy only.
data['HDL_CHOLE'] = data['HDL_CHOLE'].fillna(53.9)

In [None]:
data['HMG'].mean()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained in-place call that pandas may apply to a temporary copy only.
data['HMG'] = data['HMG'].fillna(13.2)

In [None]:
data['TRIGLYCERIDE'].mean()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained in-place call that pandas may apply to a temporary copy only.
data['TRIGLYCERIDE'] = data['TRIGLYCERIDE'].fillna(139.1)

In [None]:
#BP_HIGH, BP_LWST fillna -> mean

In [None]:
print('BP_HIGH mean : {}, BP_LWST.mean : {}'.format(data['BP_HIGH'].mean(), data['BP_LWST'].mean()))

In [None]:
# Assign the filled columns back: fillna(inplace=True) on a column selection is a
# chained in-place call that pandas may apply to a temporary copy only.
data['BP_HIGH'] = data['BP_HIGH'].fillna(130.6)
data['BP_LWST'] = data['BP_LWST'].fillna(78.1)

In [None]:
print("BP_HIGH null : {}, BP_LWST null: {}".format(data['BP_HIGH'].isnull().sum(), data['BP_LWST'].isnull().sum()))

In [None]:
#WEIGHT, HEIGHT, WAIST

In [None]:
data[(data['HEIGHT'].isnull())]['WEIGHT'].isnull()

In [None]:
data['HEIGHT'].describe()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained in-place call that pandas may apply to a temporary copy only.
data['HEIGHT'] = data['HEIGHT'].fillna(156.3)

In [None]:
data['HEIGHT'].isnull().sum()

In [None]:
# Mean WEIGHT per HEIGHT bin [start, start + 10) for start = 109, 119, ..., 219.
height_bin_starts = list(range(109, 223, 10))
temp = []
for i in height_bin_starts:
    in_bin = (i <= data['HEIGHT']) & (data['HEIGHT'] < (i+10))
    # mean() ignores missing WEIGHT values, so rows without a weight do not
    # pull the bin average down; an empty bin yields NaN.
    temp.append(data.loc[in_bin, 'WEIGHT'].mean())

# Create a single figure, outside the loop, so no empty figure is opened per bin.
plt.figure(figsize=(7, 7))
plt.xlabel('HEIGHT')
plt.ylabel('WEIGHT')
# Plot against the lower edge of each bin so the x axis really is HEIGHT.
plt.plot(height_bin_starts, temp)

In [None]:
# Fill missing WEIGHT with a fixed value per HEIGHT bin [low, high).
# The three bins covering 189 <= HEIGHT < 219 all use 62.8; the last bin (219 to 223) uses 63.0.
weight_fill_by_height_bin = [
    (109, 119, 40.3),
    (119, 129, 38.8),
    (129, 139, 42.5),
    (139, 149, 50.3),
    (149, 159, 56.2),
    (159, 169, 62.6),
    (169, 179, 69.0),
    (179, 189, 76.4),
    (189, 199, 62.8),
    (199, 209, 62.8),
    (209, 219, 62.8),
    (219, 223, 63.0),
]
for low, high, fill_value in weight_fill_by_height_bin:
    in_bin = (low <= data['HEIGHT']) & (data['HEIGHT'] < high)
    data.loc[in_bin & data['WEIGHT'].isnull(), 'WEIGHT'] = fill_value


In [None]:
# Mean WAIST per WEIGHT bin [start, start + 29) for start = 25, 54, 83, 112, 141.
weight_bin_starts = list(range(25, 170, 29))
temp_waist = []
for i in weight_bin_starts:
    in_bin = (i <= data['WEIGHT']) & (data['WEIGHT'] < (i+29))
    # mean() ignores missing WAIST values, so rows without a waist measurement
    # do not pull the bin average down; an empty bin yields NaN.
    temp_waist.append(data.loc[in_bin, 'WAIST'].mean())

# Create a single figure, outside the loop, so no empty figure is opened per bin.
plt.figure(figsize=(7, 7))
plt.xlabel('WEIGHT')
plt.ylabel('WAIST')
# Plot against the lower edge of each bin so the x axis really is WEIGHT.
plt.plot(weight_bin_starts, temp_waist)

In [None]:
# Fill missing WAIST with a fixed value per WEIGHT bin [low, high).
waist_fill_by_weight_bin = [
    (25, 54, 76.1),
    (54, 83, 86.0),
    (83, 112, 100.9),
    (112, 141, 101.0),
    (141, 170, 78.0),
]
for low, high, fill_value in waist_fill_by_weight_bin:
    in_bin = (low <= data['WEIGHT']) & (data['WEIGHT'] < high)
    data.loc[in_bin & data['WAIST'].isnull(), 'WAIST'] = fill_value


In [None]:
#FMLY_~ (가족력 유무), 본인 과거병력유무
g = sns.factorplot(x='HCHK_APOP_PMH_YN', y='Stroke', hue='FMLY_APOP_PATIEN_YN',data=data, size=6, palette = 'muted')
g = sns.factorplot(x='HCHK_HDISE_PMH_YN', y='Stroke', hue='FMLY_HDISE_PATIEN_YN',data=data,  size=6, palette = 'muted')
g = sns.factorplot(x='HCHK_HPRTS_PMH_YN', y='Stroke', hue='FMLY_HPRTS_PATIEN_YN',data=data, size=6, palette = 'muted')
g = sns.factorplot(x='HCHK_DIABML_PMH_YN', y='Stroke', hue='FMLY_DIABML_PATIEN_YN',data=data, size=6, palette = 'muted')
g = sns.factorplot(x='HCHK_ETCDSE_PMH_YN', y='Stroke', hue='FMLY_CANCER_PATIEN_YN',data=data, size=6, palette = 'muted')


In [None]:
# Fill missing family-history flags (FMLY_*) from the matching personal-history flag (HCHK_*).
# Each entry: (personal-history column, family-history column,
#              fill value when personal == 1, fill value when personal == 0)
family_fill_rules = [
    ('HCHK_APOP_PMH_YN', 'FMLY_APOP_PATIEN_YN', 1, 0),
    ('HCHK_HDISE_PMH_YN', 'FMLY_HDISE_PATIEN_YN', 0, 1),
    ('HCHK_HPRTS_PMH_YN', 'FMLY_HPRTS_PATIEN_YN', 0, 0),
    ('HCHK_DIABML_PMH_YN', 'FMLY_DIABML_PATIEN_YN', 0, 0),
    ('HCHK_ETCDSE_PMH_YN', 'FMLY_CANCER_PATIEN_YN', 0, 0),
]
for personal_col, family_col, fill_if_yes, fill_if_no in family_fill_rules:
    family_missing = data[family_col].isnull()
    data.loc[(data[personal_col] == 1) & family_missing, family_col] = fill_if_yes
    data.loc[(data[personal_col] == 0) & family_missing, family_col] = fill_if_no


In [None]:
g = sns.factorplot(x='FMLY_APOP_PATIEN_YN', y='Stroke', hue='HCHK_APOP_PMH_YN',data=data, size=6, palette = 'muted')
g = sns.factorplot(x='FMLY_HDISE_PATIEN_YN', y='Stroke', hue='HCHK_HDISE_PMH_YN',data=data,  size=6, palette = 'muted')
g = sns.factorplot(x='FMLY_HPRTS_PATIEN_YN', y='Stroke', hue='HCHK_HPRTS_PMH_YN',data=data, size=6, palette = 'muted')
g = sns.factorplot(x='FMLY_DIABML_PATIEN_YN', y='Stroke', hue='HCHK_DIABML_PMH_YN',data=data, size=6, palette = 'muted')
g = sns.factorplot(x='FMLY_CANCER_PATIEN_YN', y='Stroke', hue='HCHK_ETCDSE_PMH_YN',data=data, size=6, palette = 'muted')


In [None]:
# Fill missing personal-history flags (HCHK_*) from the matching family-history flag (FMLY_*).
# Each entry: (family-history column, personal-history column,
#              fill value when family == 1, fill value when family == 0)
# Every rule writes to its personal-history column only; the family-history
# columns are read here and never modified.
personal_fill_rules = [
    ('FMLY_APOP_PATIEN_YN', 'HCHK_APOP_PMH_YN', 1, 1),
    ('FMLY_HDISE_PATIEN_YN', 'HCHK_HDISE_PMH_YN', 1, 1),
    ('FMLY_HPRTS_PATIEN_YN', 'HCHK_HPRTS_PMH_YN', 1, 1),
    ('FMLY_DIABML_PATIEN_YN', 'HCHK_DIABML_PMH_YN', 0, 0),
    ('FMLY_CANCER_PATIEN_YN', 'HCHK_ETCDSE_PMH_YN', 0, 1),
]
for family_col, personal_col, fill_if_yes, fill_if_no in personal_fill_rules:
    personal_missing = data[personal_col].isnull()
    data.loc[(data[family_col] == 1) & personal_missing, personal_col] = fill_if_yes
    data.loc[(data[family_col] == 0) & personal_missing, personal_col] = fill_if_no


In [None]:
# Family history and personal past history both NaN: whatever is still missing
# in these flag columns is set to 0.
# Assigning the filled frame back (instead of `data[col].fillna(..., inplace=True)`)
# avoids in-place mutation of a column selection, which pandas 2.x warns about
# and which has no effect on `data` under copy-on-write.
history_cols = [
    'HCHK_APOP_PMH_YN',
    'HCHK_HDISE_PMH_YN',
    'HCHK_HPRTS_PMH_YN',
    'HCHK_DIABML_PMH_YN',
    'HCHK_ETCDSE_PMH_YN',
    'FMLY_APOP_PATIEN_YN',
    'FMLY_HDISE_PATIEN_YN',
    'FMLY_HPRTS_PATIEN_YN',
    'FMLY_DIABML_PATIEN_YN',
    'FMLY_CANCER_PATIEN_YN',
]
data[history_cols] = data[history_cols].fillna(0)


In [None]:
# HCHK_HPLPDM_PMH_YN: density of LDL_CHOLE for rows with the flag == 1 vs == 0.
f, ax = plt.subplots(1, 1, figsize=(9, 5))
sns.kdeplot(data[data['HCHK_HPLPDM_PMH_YN'] == 1]['LDL_CHOLE'])
sns.kdeplot(data[data['HCHK_HPLPDM_PMH_YN'] == 0]['LDL_CHOLE'])
# One label per curve, in plotting order; a single combined string would
# label only the first curve.
plt.legend(['HPLPDM == 1', 'HPLPDM == 0'])

In [None]:
# Density of TOT_CHOLE for rows with HCHK_HPLPDM_PMH_YN == 1 vs == 0.
f, ax = plt.subplots(1, 1, figsize=(9, 5))
sns.kdeplot(data[data['HCHK_HPLPDM_PMH_YN'] == 1]['TOT_CHOLE'])
sns.kdeplot(data[data['HCHK_HPLPDM_PMH_YN'] == 0]['TOT_CHOLE'])
# One label per curve, in plotting order; a single combined string would
# label only the first curve.
plt.legend(['HPLPDM == 1', 'HPLPDM == 0'])

In [None]:
# HCHK_HPLPDM_PMH_YN vs. statin, as bars split by Stroke.
g = sns.factorplot(
    data=data,
    x='HCHK_HPLPDM_PMH_YN',
    y='statin',
    hue='Stroke',
    kind='bar',
    size=6,
    palette='muted',
)


In [None]:
# Where HCHK_HPLPDM_PMH_YN is missing, copy the statin flag into it
# (statin == 1 -> 1, statin == 0 -> 0). Rows with any other statin value,
# including NaN, are left as they are.
for statin_flag in (1, 0):
    data.loc[
        data['HCHK_HPLPDM_PMH_YN'].isnull() & (data['statin'] == statin_flag),
        'HCHK_HPLPDM_PMH_YN'
    ] = statin_flag

In [None]:
# Smoking: density of CUR_DSQTY_RSPS_CD for each SMK_STAT_TYPE_RSPS_CD code (1, 2, 3).
f, ax = plt.subplots(1, 1, figsize=(9, 5))
sns.kdeplot(data[data['SMK_STAT_TYPE_RSPS_CD'] == 1]['CUR_DSQTY_RSPS_CD'])
sns.kdeplot(data[data['SMK_STAT_TYPE_RSPS_CD'] == 2]['CUR_DSQTY_RSPS_CD'])
sns.kdeplot(data[data['SMK_STAT_TYPE_RSPS_CD'] == 3]['CUR_DSQTY_RSPS_CD'])
# One label per curve, in plotting order; a single combined string would
# label only the first curve.
plt.legend(['SMK_STAT_TYPE_RSPS_CD=1', 'SMK_STAT_TYPE_RSPS_CD=2', 'SMK_STAT_TYPE_RSPS_CD=3'])


In [None]:
# Smoking-related data: CUR_DSQTY_RSPS_CD per smoking-status code, as bars split by Stroke.
g = sns.factorplot(
    data=data,
    x='SMK_STAT_TYPE_RSPS_CD',
    y='CUR_DSQTY_RSPS_CD',
    hue='Stroke',
    kind='bar',
    size=6,
    palette='muted',
)

In [None]:
# Count the rows with SMK_STAT_TYPE_RSPS_CD == 3 whose CUR_DSQTY_RSPS_CD is missing
# (inspection only; nothing is modified here).
((data['SMK_STAT_TYPE_RSPS_CD'] == 3) & data['CUR_DSQTY_RSPS_CD'].isnull()).sum()

In [None]:
# Non-smokers: current smoking amount set to 0.
# Note that this fills every missing CUR_DSQTY_RSPS_CD, whatever the smoking status.
# The result is assigned back rather than using `inplace=True` on a column
# selection, which pandas 2.x warns about and which does not update `data`
# under copy-on-write.
data['CUR_DSQTY_RSPS_CD'] = data['CUR_DSQTY_RSPS_CD'].fillna(0)

In [None]:
# Fill every missing CUR_SMK_TERM_RSPS_CD with 0. The result is assigned back
# rather than using `inplace=True` on a column selection, which pandas 2.x
# warns about and which does not update `data` under copy-on-write.
data['CUR_SMK_TERM_RSPS_CD'] = data['CUR_SMK_TERM_RSPS_CD'].fillna(0)

In [None]:
# Past smoking duration

In [None]:
# PAST_SMK_TERM_RSPS_CD per smoking-status code, as bars split by Stroke.
g = sns.factorplot(
    data=data,
    x='SMK_STAT_TYPE_RSPS_CD',
    y='PAST_SMK_TERM_RSPS_CD',
    hue='Stroke',
    kind='bar',
    size=6,
    palette='muted',
)

In [None]:
# Summary statistics of PAST_SMK_TERM_RSPS_CD.
data.loc[:, 'PAST_SMK_TERM_RSPS_CD'].describe()

In [None]:
# Rows with smoking-status code 2 and no PAST_SMK_TERM_RSPS_CD get 28.0.
# NOTE(review): 28.0 is presumably taken from the describe() output — confirm.
data.loc[
    data['PAST_SMK_TERM_RSPS_CD'].isnull() & (data['SMK_STAT_TYPE_RSPS_CD'] == 2),
    'PAST_SMK_TERM_RSPS_CD'
] = 28.0

In [None]:
# Count the status-2 rows that still have no PAST_SMK_TERM_RSPS_CD.
((data['SMK_STAT_TYPE_RSPS_CD'] == 2) & data['PAST_SMK_TERM_RSPS_CD'].isnull()).sum()

In [None]:
# Count the status-3 rows that have no PAST_SMK_TERM_RSPS_CD.
((data['SMK_STAT_TYPE_RSPS_CD'] == 3) & data['PAST_SMK_TERM_RSPS_CD'].isnull()).sum()

In [None]:
# Rows with smoking-status code 3 and no PAST_SMK_TERM_RSPS_CD get 0.0.
data.loc[
    data['PAST_SMK_TERM_RSPS_CD'].isnull() & (data['SMK_STAT_TYPE_RSPS_CD'] == 3),
    'PAST_SMK_TERM_RSPS_CD'
] = 0.0

In [None]:
# Rows with smoking-status code 1 and no PAST_SMK_TERM_RSPS_CD get 0.
data.loc[
    data['PAST_SMK_TERM_RSPS_CD'].isnull() & (data['SMK_STAT_TYPE_RSPS_CD'] == 1),
    'PAST_SMK_TERM_RSPS_CD'
] = 0

In [None]:
# Past daily smoking amount: PAST_DSQTY_RSPS_CD per smoking-status code, as bars split by Stroke.
g = sns.factorplot(
    data=data,
    x='SMK_STAT_TYPE_RSPS_CD',
    y='PAST_DSQTY_RSPS_CD',
    hue='Stroke',
    kind='bar',
    size=6,
    palette='muted',
)

In [None]:
# Density of PAST_DSQTY_RSPS_CD among rows with smoking-status code 2.
sns.kdeplot(data.loc[data['SMK_STAT_TYPE_RSPS_CD'] == 2, 'PAST_DSQTY_RSPS_CD'])

In [None]:
# Summary statistics of PAST_DSQTY_RSPS_CD.
data.loc[:, 'PAST_DSQTY_RSPS_CD'].describe()

In [None]:
# Rows with smoking-status code 2 and no PAST_DSQTY_RSPS_CD get 17.0.
# NOTE(review): 17.0 is presumably taken from the describe() output — confirm.
data.loc[
    data['PAST_DSQTY_RSPS_CD'].isnull() & (data['SMK_STAT_TYPE_RSPS_CD'] == 2),
    'PAST_DSQTY_RSPS_CD'
] = 17.0

In [None]:
# Fill every remaining missing PAST_DSQTY_RSPS_CD with 0. The result is assigned
# back rather than using `inplace=True` on a column selection, which pandas 2.x
# warns about and which does not update `data` under copy-on-write.
data['PAST_DSQTY_RSPS_CD'] = data['PAST_DSQTY_RSPS_CD'].fillna(0)

In [None]:
# HCHK_PHSS_PMH_YN per smoking-status code, as bars.
g = sns.factorplot(
    data=data,
    x='SMK_STAT_TYPE_RSPS_CD',
    y='HCHK_PHSS_PMH_YN',
    kind='bar',
    size=6,
    palette='muted',
)

In [None]:
# Impute a missing HCHK_PHSS_PMH_YN from the smoking-status code:
# code 2 -> 1, codes 1 and 3 -> 0. Rows with any other status value are not touched.
for smk_code, phss_value in ((2, 1), (1, 0), (3, 0)):
    data.loc[
        (data['SMK_STAT_TYPE_RSPS_CD'] == smk_code) & data['HCHK_PHSS_PMH_YN'].isnull(),
        'HCHK_PHSS_PMH_YN'
    ] = phss_value

In [None]:
# KDSQ_C against FMLY_APOP_PATIEN_YN and HCHK_APOP_PMH_YN: family-history plot.
g = sns.factorplot(
    data=data,
    x='FMLY_APOP_PATIEN_YN',
    y='KDSQ_C',
    size=6,
    palette='muted',
)

In [None]:
# KDSQ_C against the personal-history flag HCHK_APOP_PMH_YN.
g = sns.factorplot(
    data=data,
    x='HCHK_APOP_PMH_YN',
    y='KDSQ_C',
    size=6,
    palette='muted',
)

In [None]:
# Missing KDSQ_C becomes 2 when either the personal or the family APOP flag is 1.
data.loc[
    data['KDSQ_C'].isnull()
    & ((data['HCHK_APOP_PMH_YN'] == 1) | (data['FMLY_APOP_PATIEN_YN'] == 1)),
    'KDSQ_C'
] = 2

In [None]:
# Missing KDSQ_C becomes 1 when both the personal and the family APOP flags are 0.
data.loc[
    data['KDSQ_C'].isnull()
    & (data['HCHK_APOP_PMH_YN'] == 0)
    & (data['FMLY_APOP_PATIEN_YN'] == 0),
    'KDSQ_C'
] = 1

In [None]:
# Compare KDSQ_C with KDSQ_C_1, 2, 3, 4, 5

In [None]:
# KDSQ_C_1: summary statistics within each KDSQ_C group.
data['KDSQ_C_1'].groupby(data['KDSQ_C']).describe()

In [None]:
# A missing KDSQ_C_1 takes the row's KDSQ_C value when that value is 1 or 2;
# the final expression reports how many values are still missing.
for kdsq_level in (1, 2):
    data.loc[(data['KDSQ_C'] == kdsq_level) & data['KDSQ_C_1'].isnull(), 'KDSQ_C_1'] = kdsq_level
data['KDSQ_C_1'].isnull().sum()

In [None]:
# KDSQ_C_2: summary statistics within each KDSQ_C group.
data['KDSQ_C_2'].groupby(data['KDSQ_C']).describe()

In [None]:
# A missing KDSQ_C_2 takes the row's KDSQ_C value when that value is 1 or 2;
# the final expression reports how many values are still missing.
for kdsq_level in (1, 2):
    data.loc[(data['KDSQ_C'] == kdsq_level) & data['KDSQ_C_2'].isnull(), 'KDSQ_C_2'] = kdsq_level
data['KDSQ_C_2'].isnull().sum()

In [None]:
#KDSQ_C_3

In [None]:
# KDSQ_C_3: summary statistics within each KDSQ_C group.
data['KDSQ_C_3'].groupby(data['KDSQ_C']).describe()

In [None]:
# A missing KDSQ_C_3 takes the row's KDSQ_C value when that value is 1 or 2;
# the final expression reports how many values are still missing.
for kdsq_level in (1, 2):
    data.loc[(data['KDSQ_C'] == kdsq_level) & data['KDSQ_C_3'].isnull(), 'KDSQ_C_3'] = kdsq_level
data['KDSQ_C_3'].isnull().sum()

In [None]:
# KDSQ_C_4: summary statistics within each KDSQ_C group.
data['KDSQ_C_4'].groupby(data['KDSQ_C']).describe()

In [None]:
# A missing KDSQ_C_4 takes the row's KDSQ_C value when that value is 1 or 2;
# the final expression reports how many values are still missing.
for kdsq_level in (1, 2):
    data.loc[(data['KDSQ_C'] == kdsq_level) & data['KDSQ_C_4'].isnull(), 'KDSQ_C_4'] = kdsq_level
data['KDSQ_C_4'].isnull().sum()

In [None]:
# KDSQ_C_5: summary statistics within each KDSQ_C group.
data['KDSQ_C_5'].groupby(data['KDSQ_C']).describe()

In [None]:
# A missing KDSQ_C_5 takes the row's KDSQ_C value when that value is 1 or 2;
# the final expression reports how many values are still missing.
for kdsq_level in (1, 2):
    data.loc[(data['KDSQ_C'] == kdsq_level) & data['KDSQ_C_5'].isnull(), 'KDSQ_C_5'] = kdsq_level
data['KDSQ_C_5'].isnull().sum()

In [None]:
# Re-run the missing-value summary after the imputation cells.
# NOTE(review): missing_values_table is defined outside this section; presumably
# it returns a per-column table of missing counts — confirm.
missing_values = missing_values_table(data)

In [None]:
#Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Hold out 20% of the rows as `test`. The split is seeded so the same rows land
# in each part on every run; 2018 matches the seed this notebook uses for its
# train/validation split.
train, test = train_test_split(data, test_size=0.2, random_state=2018)


In [None]:
# Feature matrices and labels as NumPy arrays; 'Stroke' is the target column.
X_train = train.drop('Stroke', axis=1).values
target_label = train['Stroke'].values
# The target is dropped from the hold-out features as well, so X_test has the
# same columns as X_train and does not carry the label.
X_test = test.drop('Stroke', axis=1).values
y_test = test['Stroke'].values

In [None]:
# Carve a 20% validation set out of the training rows (seeded split).
X_tr, X_vld, y_tr, y_vld = train_test_split(
    X_train,
    target_label,
    random_state=2018,
    test_size=0.2,
)

In [None]:
# Random forest with default hyper-parameters. The seed is fixed so the fitted
# model, and therefore its AUROC, is repeatable; 2018 matches the seed used for
# the train/validation split.
rf = RandomForestClassifier(random_state=2018)
rf.fit(X_tr, y_tr)

In [None]:
# Gaussian naive Bayes with default settings, fitted on the training split.
nb = GaussianNB()
nb.fit(X=X_tr, y=y_tr)

In [None]:
# k-nearest-neighbours classifier with default settings, fitted on the training split.
knn = KNeighborsClassifier()
knn.fit(X=X_tr, y=y_tr)

In [None]:
# Decision tree with default hyper-parameters. The seed is fixed so the fitted
# tree, and therefore its AUROC, is repeatable; 2018 matches the seed used for
# the train/validation split.
dtc = DecisionTreeClassifier(random_state=2018)
dtc.fit(X_tr, y_tr)

In [None]:
# Class-probability predictions of every fitted model on the validation split.
rf_probs, nb_probs, knn_probs, dtc_probs = (
    model.predict_proba(X_vld) for model in (rf, nb, knn, dtc)
)

In [None]:
# Keep only column 1 of each probability matrix (the second class).
# Running this cell twice fails, because the arrays are 1-D after the first run.
rf_probs, nb_probs, knn_probs, dtc_probs = (
    probs[:, 1] for probs in (rf_probs, nb_probs, knn_probs, dtc_probs)
)

In [None]:
# Area under the ROC curve for each model on the validation labels.
rf_auc, nb_auc, knn_auc, dtc_auc = (
    roc_auc_score(y_vld, probs)
    for probs in (rf_probs, nb_probs, knn_probs, dtc_probs)
)

In [None]:
# ROC curve points (false/true positive rates) for each model; thresholds are discarded.
(rf_fpr, rf_tpr, _), (nb_fpr, nb_tpr, _), (knn_fpr, knn_tpr, _), (dtc_fpr, dtc_tpr, _) = (
    roc_curve(y_vld, probs)
    for probs in (rf_probs, nb_probs, knn_probs, dtc_probs)
)

In [None]:
# Draw all four ROC curves on one set of axes, each labelled with its AUROC.
# Every entry is (fpr, tpr, line styling, legend label).
roc_series = (
    (rf_fpr, rf_tpr, {'linestyle': '--'}, 'Random Forest (AUROC = %.3f)' % rf_auc),
    (nb_fpr, nb_tpr, {'marker': '.'}, 'Naive Bayes (AUROC = %.3f)' % nb_auc),
    (knn_fpr, knn_tpr, {'marker': '.'}, 'KNN (AUROC = %.3f)' % knn_auc),
    (dtc_fpr, dtc_tpr, {'marker': '.'}, 'dtc (AUROC = %.3f)' % dtc_auc),
)
for curve_fpr, curve_tpr, line_style, curve_label in roc_series:
    plt.plot(curve_fpr, curve_tpr, label=curve_label, **line_style)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Print the validation AUROC of each model ("Naive Bayes" spelling corrected).
print("Random Forest: AUROC = %.3f" % (rf_auc))
print("knn: AUROC = %.3f" % (knn_auc))
print("Naive Bayes: AUROC = %.3f" % (nb_auc))
print("DTC: AUROC = %.3f" % (dtc_auc))