In [11]:
# import libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [12]:
# import data

df = pd.read_csv("/content/drive/MyDrive/DS Course Uploads/Datasets/healthcare-dataset-stroke-data.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


### EDA

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [14]:
df.duplicated().sum()

0

In [15]:
df.dropna(inplace=True)

In [16]:
df.shape

(4909, 12)

In [17]:
df.stroke.value_counts()

Unnamed: 0_level_0,count
stroke,Unnamed: 1_level_1
0,4700
1,209


In [18]:
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [19]:
# implement one hot encoding

df = pd.get_dummies(df, columns=['gender','work_type','smoking_status'], dtype=int)
df.head()

Unnamed: 0,id,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,gender_Female,...,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,Yes,Urban,228.69,36.6,1,0,...,0,0,0,1,0,0,0,1,0,0
2,31112,80.0,0,1,Yes,Rural,105.92,32.5,1,0,...,0,0,0,1,0,0,0,0,1,0
3,60182,49.0,0,0,Yes,Urban,171.23,34.4,1,1,...,0,0,0,1,0,0,0,0,0,1
4,1665,79.0,1,0,Yes,Rural,174.12,24.0,1,1,...,0,0,0,0,1,0,0,0,1,0
5,56669,81.0,0,0,Yes,Urban,186.21,29.0,1,0,...,0,0,0,1,0,0,0,1,0,0


In [21]:
df.ever_married = df.ever_married.apply(lambda x: 1 if x == 'Yes' else 0)
df.Residence_type = df.Residence_type.apply(lambda x: 1 if x == 'Urban' else 0)

In [22]:
df.head()

Unnamed: 0,id,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,gender_Female,...,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,1,1,228.69,36.6,1,0,...,0,0,0,1,0,0,0,1,0,0
2,31112,80.0,0,1,1,0,105.92,32.5,1,0,...,0,0,0,1,0,0,0,0,1,0
3,60182,49.0,0,0,1,1,171.23,34.4,1,1,...,0,0,0,1,0,0,0,0,0,1
4,1665,79.0,1,0,1,0,174.12,24.0,1,1,...,0,0,0,0,1,0,0,0,1,0
5,56669,81.0,0,0,1,1,186.21,29.0,1,0,...,0,0,0,1,0,0,0,1,0,0


In [23]:
df.drop('id', axis=1, inplace=True)

In [24]:
# Standardise the data

scaler = StandardScaler()
df_temp = df.drop('stroke', axis=1)
df_cols = df_temp.columns
df_temp = scaler.fit_transform(df_temp)
df_scaled = pd.DataFrame(df_temp, columns=df_cols)

### Modeling

In [25]:
# Separate input and output

X = df_scaled
y = df.stroke

In [26]:
# Perform train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# K fold

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [28]:
# Model

rf = RandomForestClassifier(random_state=42)
rf_fit = rf.fit(X_train, y_train)
y_pred = rf_fit.predict(X_test)

In [33]:
# cross validation

cv_scores = cross_val_score(rf, X, y, cv=kfold)
print(cv_scores.mean())

0.956610416060518


In [32]:
print("Accuracy - ", metrics.accuracy_score(y_pred, y_test))
print("Precision - ", metrics.precision_score(y_pred, y_test))
print("Recall - ", metrics.recall_score(y_pred, y_test))
print("F1 Score - ", metrics.f1_score(y_pred, y_test))

Accuracy -  0.945010183299389
Precision -  0.0
Recall -  0.0
F1 Score -  0.0


In [34]:
# Training score evaluation

y_train_pred = rf_fit.predict(X_train)

print("Accuracy - ", metrics.accuracy_score(y_train_pred, y_train))
print("Precision - ", metrics.precision_score(y_train_pred, y_train))
print("Recall - ", metrics.recall_score(y_train_pred, y_train))
print("F1 Score - ", metrics.f1_score(y_train_pred, y_train))

Accuracy -  1.0
Precision -  1.0
Recall -  1.0
F1 Score -  1.0


In [38]:
# list down all methods or variables that can be called on 'rf'

importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({'feature': X.columns, 'importance': importances})
feature_importance_df.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
5,avg_glucose_level,0.2804896
0,age,0.233139
6,bmi,0.2266303
4,Residence_type,0.03448587
1,hypertension,0.02878457
2,heart_disease,0.02380703
12,work_type_Private,0.02037995
13,work_type_Self-employed,0.0188646
17,smoking_status_never smoked,0.01808832
8,gender_Male,0.01795494
