In [1]:
# Importing modules
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import pickle
import os
import joblib
# for Data Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
# For Data Spliting
from sklearn.model_selection import train_test_split
# For Model Accuracy
from sklearn.metrics import classification_report, accuracy_score
# Machine Learning Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [2]:
data = pd.read_csv(r'C:\Python39\Projects\Stroke Prediction\healthcare-dataset-stroke-data.csv')

In [3]:
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [5]:
#fill missing value
avg = data['bmi'].mean()
data.bmi=(data.bmi.fillna(28.74))
data.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [6]:
def preprocess_inputs(df):
    
    df = df.copy()
    le = LabelEncoder()
    df['gender'] = le.fit_transform(df['gender'])
    df['ever_married'] = le.fit_transform(df['ever_married'])
    df['work_type'] = le.fit_transform(df['work_type'])
    df['Residence_type'] = le.fit_transform(df['Residence_type'])
    df['smoking_status'] = le.fit_transform(df['smoking_status'])
    return df

In [7]:
df = preprocess_inputs(data)

In [8]:
#drop id 
df=df.drop(columns='id')

In [9]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.60,1,1
1,0,61.0,0,0,1,3,0,202.21,28.74,2,1
2,1,80.0,0,1,1,2,0,105.92,32.50,2,1
3,0,49.0,0,0,1,2,1,171.23,34.40,3,1
4,0,79.0,1,0,1,3,0,174.12,24.00,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,0,80.0,1,0,1,2,1,83.75,28.74,2,0
5106,0,81.0,0,0,1,3,1,125.20,40.00,2,0
5107,0,35.0,0,0,1,3,0,82.99,30.60,2,0
5108,1,51.0,0,0,1,2,0,166.29,25.60,1,0


In [10]:
# feature selection
features = ['age',
 'hypertension',
 'heart_disease',
 'ever_married',
 'Residence_type',
 'avg_glucose_level',
 'bmi',
 'gender',
 'work_type',
 'smoking_status']

label = ['stroke']

X = df[features]
y = df[label]

In [11]:
#Splitting tha data for train and test
train_X, val_X, train_y, val_y = train_test_split(X, y,test_size=0.2,random_state=100)

In [12]:
#STANDARDIZATION OF THE DATA AS DATA ARE IN DIFFERENT SCALES
sc=StandardScaler()
X_std= sc.fit_transform(X)
train_X_std = sc.fit_transform(train_X)
val_X_std = sc.transform(val_X)

# Training diffrent ML models for getting best model based on Accuracy Score 

In [13]:
model_accuracy = pd.DataFrame(columns=['Model','Accuracy'])
models = {"LR": LogisticRegression(),
          "KNN" : KNeighborsClassifier(),
          "DT" : DecisionTreeClassifier(),
          'RFC' : RandomForestClassifier(),
          'BGC' : BaggingClassifier(),
          'ABC' : AdaBoostClassifier(),
          'DTC' : DecisionTreeClassifier(),
          }


for model_name, model in models.items():
    model.fit(train_X_std, train_y.values.ravel())
    pred = model.predict(val_X_std)
    ac = accuracy_score(val_y,pred)
    print( model_name + ' Accuracy scores')
    print(ac)
    model_accuracy = model_accuracy.append({'Model': model_name, 'Accuracy': ac}, ignore_index=True)

LR Accuracy scores
0.9549902152641878
KNN Accuracy scores
0.9559686888454012
DT Accuracy scores
0.9119373776908023
RFC Accuracy scores
0.9530332681017613
BGC Accuracy scores
0.9461839530332681
ABC Accuracy scores
0.9530332681017613
DTC Accuracy scores
0.9099804305283757


In [14]:
# Sorting Models by there Accuracy Scores
model_accuracy.sort_values('Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy
1,KNN,0.955969
0,LR,0.95499
3,RFC,0.953033
5,ABC,0.953033
4,BGC,0.946184
2,DT,0.911937
6,DTC,0.90998


# LogisticRegression to go
As we can see KNeighborsClassifier, LogisticRegression, RandomForestClassifier
has a accuracy of score of 95%.

In [15]:
#Training final Model with all data
final_model=LogisticRegression()
final_model.fit(X_std,y.values.ravel())

LogisticRegression()

In [24]:
p= final_model.predict(val_X_std)
acc = accuracy_score(val_y,p)
print(' Accuracy scores')
print(acc)

 Accuracy scores
0.9549902152641878


In [16]:
pickle.dump(final_model, open('model.pkl','wb'))

In [17]:
import json
columns = {
    'data_columns' : [col.lower() for col in features]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))

In [25]:
filename = 'finalized_model.sav'
joblib.dump(final_model,filename)

['finalized_model.sav']