## **The aim of this project is to create a model that predicts whether a person will have a stroke or not.**

In [1]:
#importing the required Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so the dataset
# file kept there can be read like a local file.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the stroke dataset from the mounted Drive folder into a DataFrame.
DATA_PATH = '/content/drive/MyDrive/Datasets/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
# Summarize the frame: number of rows, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [7]:
# Count the missing values in each column.
df.isna().sum(axis=0)

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [8]:
# Discard every row that contains a missing value (only `bmi` has any).
df = df.dropna(axis=0, how='any')

In [9]:
# One-hot encode smoking_status; drop_first leaves out the first category's
# column so the remaining indicator columns are not redundant.
df = pd.get_dummies(
    df,
    columns=['smoking_status'],
    drop_first=True,
)

In [10]:
# Preview the frame again to confirm the new smoking_status indicator columns.
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,stroke,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,1,1,0,0
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,1,0,1,0
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,1,0,0,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,1,0,1,0
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,1,1,0,0


In [11]:
# Pairwise correlations, used to see which features track the target (`stroke`).
# Only numeric/boolean columns are passed to corr(): the frame still holds text
# columns (gender, ever_married, work_type, Residence_type), and pandas >= 2.0
# raises on those instead of silently skipping them as older versions did.
df.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
id,1.0,0.008984,0.001147,0.004016,0.006409,0.003084,0.004839,0.009136,0.00165,0.013708
age,0.008984,1.0,0.274425,0.257123,0.235838,0.333398,0.232331,0.242404,0.124377,0.076801
hypertension,0.001147,0.274425,1.0,0.115991,0.180543,0.167811,0.142515,0.062078,0.066717,0.028214
heart_disease,0.004016,0.257123,0.115991,1.0,0.154525,0.041357,0.137938,0.071339,-0.020685,0.048686
avg_glucose_level,0.006409,0.235838,0.180543,0.154525,1.0,0.175502,0.138936,0.07425,0.032085,0.010981
bmi,0.003084,0.333398,0.167811,0.041357,0.175502,1.0,0.042374,0.107031,0.107964,0.088324
stroke,0.004839,0.232331,0.142515,0.137938,0.138936,0.042374,1.0,0.05732,0.010723,0.02153
smoking_status_formerly smoked,0.009136,0.242404,0.062078,0.071339,0.07425,0.107031,0.05732,1.0,-0.352884,-0.190555
smoking_status_never smoked,0.00165,0.124377,0.066717,-0.020685,0.032085,0.107964,0.010723,-0.352884,1.0,-0.327141
smoking_status_smokes,0.013708,0.076801,0.028214,0.048686,0.010981,0.088324,0.02153,-0.190555,-0.327141,1.0


In [12]:
# Target: the stroke label. Features: every remaining column except the row id
# and the text columns that were never encoded.
y = df['stroke']
X = df.drop(
    columns=['id', 'gender', 'ever_married', 'work_type', 'Residence_type', 'stroke'],
)

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [14]:
# Fit a logistic-regression classifier on the training split
# (solver iteration cap raised to 500).
model = LogisticRegression(max_iter=500)
model.fit(X=X_train, y=y_train)

In [21]:
# Predict stroke labels for the held-out rows with the logistic-regression model.
pred = model.predict(X=X_test)

In [16]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
cr = classification_report(y_true=y_test, y_pred=pred)
print(cr)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       939
           1       0.00      0.00      0.00        43

    accuracy                           0.96       982
   macro avg       0.48      0.50      0.49       982
weighted avg       0.91      0.96      0.93       982



In [17]:
# Train a 200-tree random forest on the same training split.
# random_state is pinned (same seed as the train/test split) so the randomized
# tree construction, and therefore the evaluation report that follows, is
# identical on every Restart & Run All instead of drifting between runs.
rfc_model=RandomForestClassifier(n_estimators=200,random_state=0)
rfc_model.fit(X_train,y_train)

In [22]:
# Predict stroke labels for the held-out rows with the random-forest model.
rfc_pred = rfc_model.predict(X=X_test)


In [19]:
# Per-class precision / recall / F1 for the random-forest predictions.
rfc_cr = classification_report(y_true=y_test, y_pred=rfc_pred)
print(rfc_cr)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       939
           1       1.00      0.02      0.05        43

    accuracy                           0.96       982
   macro avg       0.98      0.51      0.51       982
weighted avg       0.96      0.96      0.94       982

