In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
stroke = pd.read_csv("Brain_Stroke_Data/stroke_data_clean.csv")
stroke

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4904,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
4905,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
4906,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
4907,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
#dataset information
stroke.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4909 entries, 0 to 4908
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4909 non-null   int64  
 1   gender             4909 non-null   object 
 2   age                4909 non-null   float64
 3   hypertension       4909 non-null   int64  
 4   heart_disease      4909 non-null   int64  
 5   ever_married       4909 non-null   object 
 6   work_type          4909 non-null   object 
 7   Residence_type     4909 non-null   object 
 8   avg_glucose_level  4909 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     4909 non-null   object 
 11  stroke             4909 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 460.3+ KB


In [4]:
#data description
stroke.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0
mean,37064.313506,42.865374,0.091872,0.049501,105.30515,28.893237,0.042575
std,20995.098457,22.555115,0.288875,0.216934,44.424341,7.854067,0.201917
min,77.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,18605.0,25.0,0.0,0.0,77.07,23.5,0.0
50%,37608.0,44.0,0.0,0.0,91.68,28.1,0.0
75%,55220.0,60.0,0.0,0.0,113.57,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [5]:
#fill null values
stroke['bmi'].fillna((stroke['bmi'].mean()), inplace=True)
stroke.isnull().mean()

id                   0.0
gender               0.0
age                  0.0
hypertension         0.0
heart_disease        0.0
ever_married         0.0
work_type            0.0
Residence_type       0.0
avg_glucose_level    0.0
bmi                  0.0
smoking_status       0.0
stroke               0.0
dtype: float64

In [6]:
#data description summary
stroke.select_dtypes(include=['object']).describe()

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
count,4909,4909,4909,4909,4909
unique,3,2,5,2,4
top,Female,Yes,Private,Urban,never smoked
freq,2897,3204,2811,2490,1852


In [7]:
#data description summary
col=stroke.select_dtypes(include=['object']).columns.tolist()
#create iteration
for i in col:
    count=stroke.groupby(['stroke'])[i].value_counts()[1]
    percent=stroke.groupby(['stroke'])[i].value_counts(normalize=True)[1]
    display(pd.DataFrame({"Patients":count, "Percent":percent*100})\
            .sort_values("Percent", ascending=False)
            .style.set_caption('Variable: {}'.format(i))\
            .format({"Percent": "{:,.1f}%"})
            .highlight_max(props='font-weight:bold; color:Black', axis=0)) 


Unnamed: 0_level_0,Patients,Percent
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,120,57.4%
Male,89,42.6%


Unnamed: 0_level_0,Patients,Percent
ever_married,Unnamed: 1_level_1,Unnamed: 2_level_1
Yes,186,89.0%
No,23,11.0%


Unnamed: 0_level_0,Patients,Percent
work_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Private,127,60.8%
Self-employed,53,25.4%
Govt_job,28,13.4%
children,1,0.5%


Unnamed: 0_level_0,Patients,Percent
Residence_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Urban,109,52.2%
Rural,100,47.8%


Unnamed: 0_level_0,Patients,Percent
smoking_status,Unnamed: 1_level_1,Unnamed: 2_level_1
never smoked,84,40.2%
formerly smoked,57,27.3%
smokes,39,18.7%
Unknown,29,13.9%


In [8]:
# linear algebra
import numpy as np

In [9]:
#label encoding
from sklearn import preprocessing


In [10]:
columns_obj = ["gender", "ever_married" ,"Residence_type"]
encoding = preprocessing.LabelEncoder()
for col in columns_obj:
    stroke[col]=  encoding.fit_transform(stroke[col])

#convert to 0s and 1s  
stroke = pd.get_dummies(stroke)
stroke.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,1,67.0,0,1,1,1,228.69,36.6,1,0,0,1,0,0,0,1,0,0
1,31112,1,80.0,0,1,1,0,105.92,32.5,1,0,0,1,0,0,0,0,1,0
2,60182,0,49.0,0,0,1,1,171.23,34.4,1,0,0,1,0,0,0,0,0,1
3,1665,0,79.0,1,0,1,0,174.12,24.0,1,0,0,0,1,0,0,0,1,0
4,56669,1,81.0,0,0,1,1,186.21,29.0,1,0,0,1,0,0,0,1,0,0


In [11]:
from imblearn.over_sampling import SMOTE #oversample data

In [12]:
#sepate labels and target
X = stroke.drop(columns = ['stroke'])

#target
y = stroke['stroke']

#oversample data
smote = SMOTE(random_state=42)
X , y = smote.fit_resample(X,y)

before = stroke.stroke.value_counts(normalize=True)
after = y.value_counts(normalize=True)
print('Rows before smote:' + ' {}'.format(stroke.shape[0]))
print('Rows after smote:' + ' {}'.format(X.shape[0]))

Rows before smote: 4909
Rows after smote: 9400


In [13]:
# separate training and testing sets
X_train, X2, y_train, y2 = train_test_split(
    X,  # 
    y,  
    test_size=0.50, 
    shuffle=True, 
    stratify=y,
    random_state=42) 

X_val, X_test, y_val, y_test = train_test_split(
    X2, y2, test_size=0.5, shuffle=True, stratify=y2, random_state=42)

In [14]:
#cross validation
from sklearn.model_selection import cross_val_score

In [15]:
#cross validation by kfold
from sklearn.model_selection import KFold

In [16]:
#ML model
from sklearn.tree import DecisionTreeClassifier

In [17]:
#find best parameter 
from sklearn.model_selection import GridSearchCV

In [18]:
#cross validation by kfold
from sklearn.model_selection import KFold

In [19]:
#fold configuration
kf = KFold(n_splits=5, shuffle=True, random_state=4)

In [24]:
#Decision tree
tree_model= DecisionTreeClassifier(random_state=42)

#parameters
tree_param = {'max_features': ['auto', 'sqrt', 'log2'],'ccp_alpha': [0.1, .01, .001, 1.0],
              'max_depth' : [5, 6, 7, 8, 9], 'criterion' :['gini', 'entropy']}

# gridsearch
grid_tree = GridSearchCV(tree_model, tree_param, scoring = 'roc_auc' ,cv=5,n_jobs=-1)

#fit gridsearch
search_tree = grid_tree.fit(X_train, y_train)

#best parameters
best_tree = search_tree.best_estimator_

#get score
cross_tree =  cross_val_score(
    best_tree,
    X_val, 
    y_val,
    n_jobs=-1,
    scoring='accuracy',
    cv=kf,
)

print('Accuracy: ' + str(round(cross_tree.mean()*100, 2)) +"%")

Accuracy: 84.72%
