In [115]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

Stroke Prediction Dataset - Kaggle

1) id: unique identifier

2) gender: "Male", "Female" or "Other"

3) age: age of the patient

4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension

5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease

6) ever_married: "No" or "Yes"

7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"

8) Residence_type: "Rural" or "Urban"

9) avg_glucose_level: average glucose level in blood

10) bmi: body mass index

11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*

12) stroke: 1 if the patient had a stroke or 0 if not

*Note: "Unknown" in smoking_status means that the information is unavailable for this patient

In [116]:
# Location of the raw stroke-prediction CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs on another machine after this value is edited.
DATA_PATH = "C:/Users/DELL/OneDrive/Desktop/Afraa/Freelance/MLE/2.Prediction_model/stroke-data.csv"

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [117]:
# Show the (rows, columns) dimensions of the freshly loaded frame.
df.shape

(5110, 12)

Data Cleaning

Remove Duplicate Rows

In [118]:
# Remove rows that are exact duplicates across every column. This runs while
# the `id` column is still present, so two rows are only treated as duplicates
# when their ids match as well.
df = df.drop_duplicates()

Drop ID

In [119]:
# Drop the `id` column: it is a unique row identifier, not a model feature.
# errors="ignore" keeps this cell safe to re-run; without it a second run
# raises KeyError because `id` has already been removed.
df = df.drop(columns=["id"], errors="ignore")

Check for missing values & impute

In [120]:
# Count missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [121]:
# Fill missing BMI readings with the column median.
# NOTE(review): the median is computed over all rows, before the train/test
# split performed later in the notebook, so held-out rows influence the
# imputed value.
bmi_median = df['bmi'].median()
df['bmi'] = df['bmi'].fillna(bmi_median)

In [122]:
# Lower-case the 'Unknown' placeholder in smoking_status; all other labels are
# left untouched.
df['smoking_status'] = df['smoking_status'].replace('Unknown', 'unknown')
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.1,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


Feature Engineering

Encode categorical columns to integers

In [123]:
def _encode_with_map(column, mapping):
    """Translate category labels in `column` to integer codes.

    Labels that are not keys of `mapping` (and missing values) are encoded as 0.
    A column that already has an integer dtype is returned unchanged, so
    re-running this cell does not push the integer codes through the string
    maps and silently turn every value into 0.
    """
    if pd.api.types.is_integer_dtype(column):
        return column
    return column.map(mapping).fillna(0).astype(int)


# Encode ever_married
ever_married_map = {
    "Yes": 1,
    "No": 2,
}
df["ever_married"] = _encode_with_map(df["ever_married"], ever_married_map)

# Encode work_type
# The data spells the government category "Govt_job". The previous key
# "Govt_jov" never matched, so those rows fell through to 0 instead of 2.
# The misspelled key is kept as a harmless alias.
work_type_map = {
    "children": 1,
    "Govt_job": 2,
    "Govt_jov": 2,
    "Never_worked": 3,
    "Private": 4,
    "Self-employed": 5,
}
df["work_type"] = _encode_with_map(df["work_type"], work_type_map)

# Encode Residence_type
Residence_type_map = {
    "Rural": 1,
    "Urban": 2,
}
df["Residence_type"] = _encode_with_map(df["Residence_type"], Residence_type_map)

# Encode smoking_status
# An earlier cell rewrites "Unknown" to "unknown", so a map keyed only on
# "Unknown" never matched and those rows were encoded as 0 rather than 4.
# Both spellings are accepted here.
smoking_status_map = {
    "formerly smoked": 1,
    "never smoked": 2,
    "smokes": 3,
    "Unknown": 4,
    "unknown": 4,
}
df["smoking_status"] = _encode_with_map(df["smoking_status"], smoking_status_map)

In [124]:
# Encode gender as an integer code. Any label that is not a key of the map
# (for example the "Other" value listed in the column description) becomes 0.
gender_map = dict(Male=1, Female=2)

gender_codes = df["gender"].map(gender_map)
df["gender"] = gender_codes.fillna(0).astype(int)

In [125]:
# Preview the first rows to check that the categorical columns now hold integer codes.
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,4,2,228.69,36.6,1,1
1,2,61.0,0,0,1,5,1,202.21,28.1,2,1
2,1,80.0,0,1,1,4,1,105.92,32.5,2,1
3,2,49.0,0,0,1,4,2,171.23,34.4,3,1
4,2,79.0,1,0,1,5,1,174.12,24.0,2,1


Separate features and target

In [126]:
# Target vector: the `stroke` label; feature matrix: every remaining column.
y = df["stroke"]
X = df.drop(columns="stroke")

In [127]:
# Column groups consumed by the preprocessing step; each group is handled by
# a different transformer.
binary_features = ["hypertension", "heart_disease"]

numeric_features = ["age", "avg_glucose_level", "bmi"]

categorical_features = [
    "gender",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
]

Preprocessing pipeline

In [128]:
# One (name, transformer, columns) entry per feature group:
#   num - standardise the numeric columns
#   cat - one-hot encode the integer-coded categoricals; categories not seen
#         during fit are ignored rather than raising
#   bin - pass the 0/1 flag columns through unchanged
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
binary_step = ("bin", "passthrough", binary_features)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step, binary_step]
)


Model setup

In [129]:
# Hyper-parameters for the balanced random forest; random_state is fixed so
# repeated runs build the same forest.
forest_params = {
    "n_estimators": 400,
    "max_depth": None,
    "sampling_strategy": "auto",
    "random_state": 42,
}
model = BalancedRandomForestClassifier(**forest_params)

# Chain preprocessing and the classifier so both are fitted and applied together.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", model),
    ]
)

Split dataset

In [130]:
# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for a repeatable split.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

Model training

In [131]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,400
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


Evaluation metrics

In [132]:
# Predict on the held-out split, then compute both summaries before printing.
y_pred = pipeline.predict(X_test)

report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("\nClassification Report:\n")
print(report_text)

print("\nConfusion Matrix:\n")
print(confusion)


Classification Report:

              precision    recall  f1-score   support

           0       0.99      0.69      0.81       972
           1       0.12      0.84      0.21        50

    accuracy                           0.70      1022
   macro avg       0.56      0.77      0.51      1022
weighted avg       0.95      0.70      0.78      1022


Confusion Matrix:

[[672 300]
 [  8  42]]


Save Model

In [133]:
# Serialise the whole fitted pipeline (preprocessing + classifier) so that it
# can be reloaded and given the same integer-coded columns it was trained on.
with open("model.pkl", "wb") as model_file:
    pickle.dump(pipeline, model_file)

print("\nModel Saved Successfully as model.pkl")


Model Saved Successfully as model.pkl


Testing

In [135]:
import pandas as pd
import pickle

# Reload the pipeline that was written to disk above.
# NOTE(review): unpickling executes arbitrary code, so only load files you
# created yourself. This also rebinds `model`, which earlier referred to the
# bare classifier, to the full pipeline.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# A single patient, already expressed with the same integer codes that were
# used for the training columns.
sample_input = dict(
    gender=1,
    age=65,
    hypertension=1,
    heart_disease=1,
    ever_married=1,
    work_type=4,
    Residence_type=2,
    avg_glucose_level=180.5,
    bmi=32.0,
    smoking_status=3,
)

input_df = pd.DataFrame([sample_input])

print("Raw Input:")
display(input_df)

# Column 1 of predict_proba is the probability of the positive (stroke) class.
class_probabilities = model.predict_proba(input_df)
probability = class_probabilities[0][1]
prediction = model.predict(input_df)[0]

if prediction == 1:
    risk = "High Risk"
else:
    risk = "Low Risk"

print("\nPredicted Class:", prediction)
print("Predicted Risk:", risk)
print("Probability of Stroke Risk:", round(probability * 100, 2), "%")


Raw Input:


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1,65,1,1,1,4,2,180.5,32.0,3



Predicted Class: 1
Predicted Risk: High Risk
Probability of Stroke Risk: 91.0 %
