In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB

In [11]:
# Load the raw stroke dataset from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer="healthcare-dataset-stroke-data.csv")

In [12]:
# Print column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [14]:
# Split the frame into the target vector and the feature matrix.
y = df.loc[:, 'stroke'].copy()
X_1 = df.drop(columns='stroke')

# Feature matrix dimensions: rows are observations, columns are features.
num_obj, num_feat = X_1.shape

# Shared cross-validator: 5 shuffled folds with a fixed seed for reproducibility.
cv = KFold(n_splits=5, random_state=42, shuffle=True)

# Preview the first rows of the feature matrix.
X_1.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked


In [16]:
# One-hot encoding helper
def ohe(df, features, prefix=False):
    """Append one-hot (indicator) columns for the given categorical features.

    For every feature in ``features`` one float column per distinct non-missing
    category is appended to the right of ``df``. Categories are ordered by first
    appearance. The original feature columns are kept; the caller decides whether
    to drop them afterwards. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in ``features``.
    features : iterable of str
        Names of the categorical columns to encode.
    prefix : bool, default False
        If False, each new column is named after the category value itself.
        With this default, categories that share a label across features (or
        with an existing column) produce duplicate column names. If True, new
        columns are named ``"<feature>_<category>"``, which avoids collisions
        between features.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the indicator columns (values 0.0 / 1.0). Rows whose
        feature value is missing get 0.0 in every indicator of that feature.
        When ``features`` is empty, ``df`` itself is returned.

    Raises
    ------
    KeyError
        If a name in ``features`` is not a column of ``df``.
    """
    encoded_blocks = []
    for feat in features:
        # Always read from the caller's frame, so indicator columns created for
        # an earlier feature can never shadow a later feature's column.
        column = df[feat]

        # Missing values are not a category: `column == NaN` is never True, so
        # keeping them would only add an all-zero column labelled NaN.
        categories = [cat for cat in column.unique() if pd.notna(cat)]

        indicators = np.zeros((len(df), len(categories)))
        for pos, cat in enumerate(categories):
            indicators[:, pos] = (column == cat).to_numpy()

        if prefix:
            labels = ['%s_%s' % (feat, cat) for cat in categories]
        else:
            labels = categories
        encoded_blocks.append(
            pd.DataFrame(data=indicators, index=df.index, columns=labels)
        )

    if not encoded_blocks:
        return df
    # Concatenate once instead of re-copying the growing frame on every iteration.
    return pd.concat([df] + encoded_blocks, axis=1)


In [19]:

# Start from a fresh copy so this cell can be re-run without touching X_1.
X_2 = X_1.copy()

# Encode categorical features.
# Map the low-cardinality string columns to 0/1 codes.
# NOTE(review): 'Other' receives the same code as 'Male' -- confirm this is intended.
X_2['gender'] = X_2['gender'].map({'Male': 1, 'Female': 0, 'Other': 1})
X_2['Residence_type'] = X_2['Residence_type'].map({'Urban': 1, 'Rural': 0})
X_2['ever_married'] = X_2['ever_married'].map({'Yes': 1, 'No': 0})

# One-hot encode the multi-valued features, then drop their raw string columns.
X_2 = ohe(X_2, ['work_type', 'smoking_status'])
X_2 = X_2.drop(columns=['work_type', 'smoking_status'])

# Impute missing BMI values with the column median. Assign the result back:
# calling fillna(inplace=True) on the selection X_2['bmi'] is chained assignment,
# which warns on pandas 2.x and leaves X_2 unchanged under Copy-on-Write.
X_2['bmi'] = X_2['bmi'].fillna(X_2['bmi'].median())

# Fail here with a clear message rather than later inside the estimator:
# .map() yields NaN for any label that is missing from the dictionaries above.
assert not X_2.isna().any().any(), 'unexpected missing values after encoding/imputation'

# NOTE(review): the 'id' column is still part of X_2 and is scaled and used as a
# feature -- consider dropping it, since it is a row identifier.
# NOTE(review): the scaler is fit on all rows before cross-validation, so the
# validation folds contribute to the scaling statistics.
# Scaling
scaler = StandardScaler()
X_2 = pd.DataFrame(data=scaler.fit_transform(X_2), index=X_2.index, columns=X_2.columns)

In [21]:
# Validation results: Gaussian naive Bayes scored by ROC AUC over the shared CV folds.
clf_bayes = GaussianNB()
clf_bayes.fit(X_2, y)
bayes_fold_auc = cross_val_score(estimator=clf_bayes, X=X_2, y=y, scoring='roc_auc', cv=cv)
print('Cross-validation score: %f' % bayes_fold_auc.mean())

Cross-validation score: 0.800360
