<a href="https://colab.research.google.com/github/DPatt365/Project_2/blob/main/stroke_prediction_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Data Import/Load

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

filename = '/content/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(filename)

In [3]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


##Data Cleaning

In [5]:
#check for duplicates
df.duplicated().sum()

0

In [6]:
#Dropping 'id' column because it serves no purpose in our model or analysis
df.drop(columns='id', inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


BMI column is missing data. we will have to impute for our visualizations.

In [8]:
#Search for na values
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [9]:
#Checking categorical columns for inadequate data and dropping the category

df['gender'].value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

In [10]:
df.index[df['gender']=='Other']

Int64Index([3116], dtype='int64')

In [11]:
df.drop(3116, inplace=True)

In [12]:
df['work_type'].value_counts()

Private          2924
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: work_type, dtype: int64

In [13]:
df.drop(df.loc[df.work_type=='Never_worked'].index, inplace=True)

In [14]:
df['Residence_type'].value_counts()

Urban    2581
Rural    2506
Name: Residence_type, dtype: int64

In [15]:
df['smoking_status'].value_counts()

never smoked       1878
Unknown            1536
formerly smoked     884
smokes              789
Name: smoking_status, dtype: int64

In [16]:
#checking for odd entries
for v in df.columns:
  print(df[v].value_counts())

Female    2983
Male      2104
Name: gender, dtype: int64
78.00    102
57.00     95
52.00     90
54.00     87
51.00     86
        ... 
1.40       3
0.48       3
0.16       3
0.40       2
0.08       2
Name: age, Length: 104, dtype: int64
0    4589
1     498
Name: hypertension, dtype: int64
0    4811
1     276
Name: heart_disease, dtype: int64
Yes    3353
No     1734
Name: ever_married, dtype: int64
Private          2924
Self-employed     819
children          687
Govt_job          657
Name: work_type, dtype: int64
Urban    2581
Rural    2506
Name: Residence_type, dtype: int64
93.88     6
84.10     5
91.85     5
91.68     5
73.00     5
         ..
111.93    1
94.40     1
95.57     1
66.29     1
85.28     1
Name: avg_glucose_level, Length: 3966, dtype: int64
28.7    41
27.6    37
26.1    37
26.7    37
27.7    37
        ..
51.8     1
13.4     1
56.0     1
10.3     1
14.9     1
Name: bmi, Length: 418, dtype: int64
never smoked       1878
Unknown            1536
formerly smoked     884
smok

##Data Models

'bmi' = mean using mean imputer
StandardScaler

Models: KNN, Random Forrest, Logistic Rehression


In [39]:
#Model Instance
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=42)
logreg = LogisticRegression()

###Preprocessing

In [18]:
# Imputers
mean_imputer = SimpleImputer(strategy='mean')
# Scaler
scaler = StandardScaler()
# One-hot encoder
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)


In [19]:
#Split data before imputation
target = 'stroke'
X = df.drop(columns = [target])
y = df[target]

In [20]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [21]:
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

num_pipe = make_pipeline(scaler, mean_imputer)
cat_pipe = make_pipeline(ohe)

num_tuple = (num_pipe, num_selector)
cat_tuple = (cat_pipe, cat_selector)

preprocessor = make_column_transformer(num_tuple, cat_tuple, remainder='passthrough')
preprocessor

##K-nearest Neighbors

In [22]:
knn_pipe = make_pipeline(preprocessor, knn)
knn_pipe.fit(X_train, y_train)
print(knn_pipe.score(X_train,y_train))
print(knn_pipe.score(X_test, y_test))

0.9528178243774574
0.9441823899371069


In [23]:
knn_train_preds = knn_pipe.predict(X_train)
knn_test_preds = knn_pipe.predict(X_test)

In [24]:
print(f'\nClassification Report: Testing\n{classification_report(y_test,knn_test_preds,zero_division=1)}')
print(f'\nClassification Report: Training\n{classification_report(y_train,knn_train_preds,zero_division=1)}')


Classification Report: Testing
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1210
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1272
   macro avg       0.48      0.50      0.49      1272
weighted avg       0.90      0.94      0.92      1272


Classification Report: Training
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      3628
           1       0.82      0.05      0.09       187

    accuracy                           0.95      3815
   macro avg       0.89      0.52      0.53      3815
weighted avg       0.95      0.95      0.93      3815



###KNN Parameter Tuning

In [25]:
# Display the hyperparameters.
tune_knn_pipe = make_pipeline(preprocessor,knn)
knn_pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline-1',
                                    Pipeline(steps=[('standardscaler',
                                                     StandardScaler()),
                                                    ('simpleimputer',
                                                     SimpleImputer())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7febd3d6a950>),
                                   ('pipeline-2',
                                    Pipeline(steps=[('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse=False))]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7fec0aaa6590>)])),
  ('kneighborsclassifier

In [26]:
knn_params = {'kneighborsclassifier__n_neighbors': range(2, 10),
              'kneighborsclassifier__p': range(1,6),
              'kneighborsclassifier__weights': ['uniform', 'distance']}

In [27]:
knn_pipe_gs = GridSearchCV(tune_knn_pipe, knn_params)
knn_pipe_gs.fit(X_train, y_train)

print('Best KNN Parameters:')
print(knn_pipe_gs.best_params_)
best_knn_pipe = knn_pipe_gs.best_estimator_
print(f'Accuracy of best KNN Train model is: {best_knn_pipe.score(X_train, y_train)}')
print(f'Accuracy of best KNN Test model is: {best_knn_pipe.score(X_test, y_test)}')

Best KNN Parameters:
{'kneighborsclassifier__n_neighbors': 6, 'kneighborsclassifier__p': 1, 'kneighborsclassifier__weights': 'uniform'}
Accuracy of best KNN Train model is: 0.9517693315858453
Accuracy of best KNN Test model is: 0.949685534591195


In [28]:
best_knn_pipe = make_pipeline(preprocessor, KNeighborsClassifier(n_neighbors=6, p=1, weights='uniform'))
best_knn_pipe.fit(X_train, y_train)

best_knn_train_predict = best_knn_pipe.predict(X_train)
best_knn_test_predict = best_knn_pipe.predict(X_test)

In [30]:
best_knn_train_report = classification_report(y_train, best_knn_train_predict)
best_knn_test_report = classification_report(y_test, best_knn_test_predict)

print(f'Best Train KNN Report:\n{best_knn_train_report}')
print('')
print(f'Best Test KNN Report:\n{best_knn_test_report}')


Best Train KNN Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      3628
           1       0.80      0.02      0.04       187

    accuracy                           0.95      3815
   macro avg       0.88      0.51      0.51      3815
weighted avg       0.94      0.95      0.93      3815


Best Test KNN Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1210
           1       0.00      0.00      0.00        62

    accuracy                           0.95      1272
   macro avg       0.48      0.50      0.49      1272
weighted avg       0.90      0.95      0.93      1272



##Random Forest Classifier 

In [61]:
rf_pipe = make_pipeline(preprocessor, rf)
rf_pipe.fit(X_train, y_train)
print(rf_pipe.score(X_train,y_train))
print(rf_pipe.score(X_test, y_test))

1.0
0.9488993710691824


In [62]:
rf_train_preds = rf_pipe.predict(X_train)
rf_test_preds = rf_pipe.predict(X_test)

In [63]:
rf_pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline-1',
                                    Pipeline(steps=[('standardscaler',
                                                     StandardScaler()),
                                                    ('simpleimputer',
                                                     SimpleImputer())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7febd3d6a950>),
                                   ('pipeline-2',
                                    Pipeline(steps=[('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse=False))]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7fec0aaa6590>)])),
  ('randomforestclassifi

In [64]:
params = {'randomforestclassifier__max_depth':[10,20,30,40,50], 
          'randomforestclassifier__max_features':['sqrt', 'log2'], 
          'randomforestclassifier__n_estimators':[50,100,150,200]}

In [65]:
rf_gridsearch = GridSearchCV(rf_pipe, params, scoring='f1')

In [66]:
rf_gridsearch.fit(X_train, y_train)

In [67]:
print(rf_gridsearch.best_params_)
rf_bestparams = rf_gridsearch.best_estimator_

{'randomforestclassifier__max_depth': 30, 'randomforestclassifier__max_features': 'sqrt', 'randomforestclassifier__n_estimators': 200}


In [68]:
rf_bestparams.fit(X_train, y_train)
rf_bestparams.score(X_test, y_test)

0.9488993710691824

In [69]:
rf_confusionparams = rf_bestparams.predict(X_train)
rf_confusionparams_test = rf_bestparams.predict(X_test)

In [70]:
print(f'\nClassification Report: Testing\n{classification_report(y_test,rf_confusionparams_test,zero_division=1)}')
print(f'\nClassification Report: Training\n{classification_report(y_train,rf_confusionparams,zero_division=1)}')


Classification Report: Testing
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1210
           1       0.20      0.02      0.03        62

    accuracy                           0.95      1272
   macro avg       0.58      0.51      0.50      1272
weighted avg       0.92      0.95      0.93      1272


Classification Report: Training
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3628
           1       1.00      1.00      1.00       187

    accuracy                           1.00      3815
   macro avg       1.00      1.00      1.00      3815
weighted avg       1.00      1.00      1.00      3815



##Logistic Regression

In [40]:
logreg_pipe = make_pipeline(preprocessor, logreg)
logreg_pipe.fit(X_train, y_train)
print(logreg_pipe.score(X_train,y_train))
print(logreg_pipe.score(X_test, y_test))

0.9512450851900394
0.9512578616352201


In [41]:
logreg_train_preds = logreg_pipe.predict(X_train)
logreg_test_preds = logreg_pipe.predict(X_test)

In [44]:
print(f'\nLogistic Regression Report: Testing\n{classification_report(y_test,logreg_test_preds,zero_division=1)}')
print(f'\nLogistic Regression Report: Training\n{classification_report(y_train,logreg_train_preds,zero_division=1)}')


Logistic Regression Report: Testing
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1210
           1       1.00      0.00      0.00        62

    accuracy                           0.95      1272
   macro avg       0.98      0.50      0.49      1272
weighted avg       0.95      0.95      0.93      1272


Logistic Regression Report: Training
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      3628
           1       1.00      0.01      0.01       187

    accuracy                           0.95      3815
   macro avg       0.98      0.50      0.49      3815
weighted avg       0.95      0.95      0.93      3815



###Logistic Regression Parameter Tuning

In [45]:
# Display the hyperparameters.
tune_logreg_pipe = make_pipeline(preprocessor,logreg)
logreg_pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline-1',
                                    Pipeline(steps=[('standardscaler',
                                                     StandardScaler()),
                                                    ('simpleimputer',
                                                     SimpleImputer())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7febd3d6a950>),
                                   ('pipeline-2',
                                    Pipeline(steps=[('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse=False))]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7fec0aaa6590>)])),
  ('logisticregression',

In [50]:
param_grid = {'logisticregression__C': [.0001, .001, .01, .1, 1, 10, 100, 1000, 10000],
 'logisticregression__max_iter': [10000],
 'logisticregression__penalty': ['l1','l2',],
 'logisticregression__solver': ['liblinear'],}

In [51]:
logreg_pipe_gs = GridSearchCV(tune_logreg_pipe, param_grid)
logreg_pipe_gs.fit(X_train, y_train)

print('Best logreg Parameters:')
print(logreg_pipe_gs.best_params_)
best_logreg_pipe = logreg_pipe_gs.best_estimator_
print(f'Accuracy of best LogReg Train model is: {best_logreg_pipe.score(X_train, y_train)}')
print(f'Accuracy of best LogReg Test model is: {best_logreg_pipe.score(X_test, y_test)}')

Best logreg Parameters:
{'logisticregression__C': 0.1, 'logisticregression__max_iter': 10000, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'liblinear'}
Accuracy of best LogReg Train model is: 0.9512450851900394
Accuracy of best LogReg Test model is: 0.9512578616352201


In [57]:
best_logreg_pipe = make_pipeline(preprocessor, LogisticRegression(C=0.1, max_iter=10000, penalty='12'))
best_logreg_pipe.fit(X_train, y_train)

best_logreg_train_predict = best_logreg_pipe.predict(X_train)
best_logreg_test_predict = best_logreg_pipe.predict(X_test)

InvalidParameterError: ignored

In [59]:
best_logreg_train_report = classification_report(y_train, best_logreg_train_predict)
best_logreg_test_report = classification_report(y_test, best_logreg_test_predict)

print(f'Best Train LogReg Report:\n{best_logreg_train_report}')
print('')
print(f'Best Test LogReg Report:\n{best_logreg_test_report}')


ConfusionMatrixDisplay.from_predictions(y_test, best_logreg_test_predict, cmap='Blues', ax=axes[1])
axes[1].set_title('Best Test logistic Regression Matrix');

NameError: ignored