<a href="https://colab.research.google.com/github/DavidkingMazimpaka/Stroke_Analysis_and_Prediction/blob/main/strokeAnalysisPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Stroke Analysis and Prediction

In [55]:
# Importing all necessary libraries
# for visulization
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
pd.plotting.register_matplotlib_converters()
%matplotlib inline
import seaborn as sns
from scipy import stats
# for modeling
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold , StratifiedKFold , cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from joblib import dump, load

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [27]:
# loadin' the dataset
df = pd.read_csv("/content/healthcare-dataset-stroke-data.csv")
df.drop(["id"] ,axis=1, inplace=True)
df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


## Univariate Analysis

In [30]:
df.describe(include="O").T

Unnamed: 0,count,unique,top,freq
gender,5110,3,Female,2994
ever_married,5110,2,Yes,3353
work_type,5110,5,Private,2925
Residence_type,5110,2,Urban,2596
smoking_status,5110,4,never smoked,1892


In [31]:
cat = list(df.select_dtypes(include='O').columns) + ["stroke" , "hypertension" , "heart_disease"]
cat

['gender',
 'ever_married',
 'work_type',
 'Residence_type',
 'smoking_status',
 'stroke',
 'hypertension',
 'heart_disease']

In [32]:
df[cat] = df[cat].astype('object')
cat.remove('stroke')

In [33]:
for i in cat:
    display(df[i].value_counts(normalize=True))
    print("#"*30)

Unnamed: 0_level_0,proportion
gender,Unnamed: 1_level_1
Female,0.58591
Male,0.413894
Other,0.000196


##############################


Unnamed: 0_level_0,proportion
ever_married,Unnamed: 1_level_1
Yes,0.656164
No,0.343836


##############################


Unnamed: 0_level_0,proportion
work_type,Unnamed: 1_level_1
Private,0.572407
Self-employed,0.160274
children,0.134442
Govt_job,0.128571
Never_worked,0.004305


##############################


Unnamed: 0_level_0,proportion
Residence_type,Unnamed: 1_level_1
Urban,0.508023
Rural,0.491977


##############################


Unnamed: 0_level_0,proportion
smoking_status,Unnamed: 1_level_1
never smoked,0.370254
Unknown,0.302153
formerly smoked,0.17319
smokes,0.154403


##############################


Unnamed: 0_level_0,proportion
hypertension,Unnamed: 1_level_1
0,0.902544
1,0.097456


##############################


Unnamed: 0_level_0,proportion
heart_disease,Unnamed: 1_level_1
0,0.945988
1,0.054012


##############################


In [35]:
df['stroke'].value_counts(normalize=True)
stroke_mean = round(df['stroke'].mean() ,3)
stroke_mean # imbalance

0.049

## ERROR CORRECTIONS

In [36]:
# drop the incetences with other in gender
df.drop(df[df['gender'] == "Other"].index , axis = 0 , inplace = True)
# any formerly smoked as smokes
df['smoking_status'].iloc[df['smoking_status'] == 'formerly smoked'] = 'smokes'
# treating unknown inctences as NaN's
df.drop(df[df['smoking_status'] == 'Unknown'].index ,axis=0 , inplace=True)

In [38]:
# Missing Values
num = list(df.select_dtypes(exclude="O").columns)
df[num].isnull().sum()
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

Unnamed: 0,0
age,0
avg_glucose_level,0
bmi,140


In [39]:
df[num].corr()

Unnamed: 0,age,avg_glucose_level,bmi
age,1.0,0.233005,0.079317
avg_glucose_level,0.233005,1.0,0.156675
bmi,0.079317,0.156675,1.0


In [40]:
df.drop('bmi', axis=1, inplace=True)
num.remove('bmi')

## Modeling
## Data Resampling and Splitting

In [41]:
train , test_df  = train_test_split(df , test_size=0.3 , random_state=42 , shuffle=True)
train_df , valid_df = train_test_split(train , test_size=0.15 , random_state=42 , shuffle=True)

In [42]:
train_df.shape , valid_df.shape , test_df.shape

((2120, 10), (375, 10), (1070, 10))

## Data preprocessing

In [43]:
# Transformers
from sklearn.compose import make_column_transformer
def preprocess(df_train, df_valid , df_test, num, cat):
    # Define transformers
    ohe = OneHotEncoder(drop='first')
    mms = MinMaxScaler()
    scaler =StandardScaler()

    transformer = make_column_transformer((mms, num),
                                           (ohe, cat),
                                          remainder='passthrough',verbose_feature_names_out=False)
    # Fitting & Transformation
    X_train = transformer.fit_transform(df_train[cat+num])
    X_valid = transformer.transform(df_valid[cat+num])
    X_test = transformer.fit_transform(df_test[cat+num])
    columns=transformer.get_feature_names_out()
    X_train = pd.DataFrame(X_train , columns=columns)
    X_valid = pd.DataFrame(X_valid , columns=columns)
    X_test = pd.DataFrame(X_test , columns=columns)
    cat = [i for i in list(columns) if i not in num]
    X_train[cat] = scaler.fit_transform(X_train[cat])
    X_valid[cat] = scaler.transform(X_valid[cat])
    X_test[cat] = scaler.fit_transform(X_test[cat])


    return X_train , X_valid, X_test, list(columns)

In [44]:
X_train , X_val , X_test ,columns = preprocess(train_df, valid_df, test_df , num, cat)

In [47]:
y_train = train_df.stroke.astype("int")
y_valid = valid_df.stroke.astype("int")
y_test_df = test_df.stroke.astype("int")

## Trying Different Models

In [48]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier

xgb = XGBClassifier()

lr = LogisticRegression(random_state=42 , C=0.5 , penalty = 'l2'  , class_weight={1 :0.7} )

tr = DecisionTreeClassifier(criterion="entropy" )

svc = SVC(random_state=42)

rm = RandomForestClassifier(n_estimators=30 , criterion="entropy" )


meta_model = SVC(kernel='linear', probability=True)

# Create a stacking classifier
vot = VotingClassifier(estimators=[
    ('lr', lr),
     ('tr', tr),
    ('gb', xgb)
])

In [49]:
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import ClusterCentroids
from sklearn.metrics import accuracy_score , f1_score , roc_curve , confusion_matrix , recall_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score


models_pred = []
smote = SMOTE(random_state=42)
X_re , y_re  = smote.fit_resample(X_train , y_train)

print(f"Data shape : {X_re.shape}")
for model in [lr , svc  , tr  , rm , xgb , vot]:
    print(f"* {model} \n")
    model.fit(X_re, y_re)
    t_pred = model.predict(X_train)
    print('Train Confustion metric \n' , confusion_matrix(y_train ,t_pred))
    print("Train Cls Report \n" ,classification_report(y_train , t_pred))
    print("-"*50)
    val_pre = model.predict(X_val)
    print('validation Confustion metric \n' , confusion_matrix(y_valid ,val_pre))
    print("validation Cls Report \n" ,classification_report(y_valid , val_pre))
    print("-"*50)
    scores = cross_val_score(model , X_re , y_re , cv= KFold(n_splits=5 , shuffle=True) )
    print("Cross val score : " ,np.mean(scores))
    print("-"*50)

Data shape : (4008, 12)
* LogisticRegression(C=0.5, class_weight={1: 0.7}, random_state=42) 

Train Confustion metric 
 [[1571  433]
 [  35   81]]
Train Cls Report 
               precision    recall  f1-score   support

           0       0.98      0.78      0.87      2004
           1       0.16      0.70      0.26       116

    accuracy                           0.78      2120
   macro avg       0.57      0.74      0.56      2120
weighted avg       0.93      0.78      0.84      2120

--------------------------------------------------
validation Confustion metric 
 [[284  72]
 [  5  14]]
validation Cls Report 
               precision    recall  f1-score   support

           0       0.98      0.80      0.88       356
           1       0.16      0.74      0.27        19

    accuracy                           0.79       375
   macro avg       0.57      0.77      0.57       375
weighted avg       0.94      0.79      0.85       375

--------------------------------------------------


## Testing Prediction

In [50]:
test_pred = lr.predict(X_test)
print('* Confustion metric: \n' , confusion_matrix(y_test_df ,test_pred))
print("-"*50)
print("* Cls Report: \n" ,classification_report(y_test_df , test_pred))
print("-"*50)

* Confustion metric: 
 [[819 184]
 [ 30  37]]
--------------------------------------------------
* Cls Report: 
               precision    recall  f1-score   support

           0       0.96      0.82      0.88      1003
           1       0.17      0.55      0.26        67

    accuracy                           0.80      1070
   macro avg       0.57      0.68      0.57      1070
weighted avg       0.91      0.80      0.85      1070

--------------------------------------------------


**Saving the model**


In [62]:
import pickle

MODEL_OPT_FILE = 'strokePredict.pkl'
with open(MODEL_OPT_FILE, 'wb') as file:
    pickle.dump(model, file)

In [63]:
from google.colab import files

files.download(MODEL_OPT_FILE)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>