In [1]:
# 📦 Data manipulation
import pandas as pd
import numpy as np 
import joblib

# 📊 Visualization
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# ⚙️ Preprocessing
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# 🧪 Model selection & evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    classification_report,
    confusion_matrix,
    roc_curve, 
    auc
)

In [4]:
# 🧠 Machine learning models
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

In [5]:
# ⚖️ Handling imbalance
from imblearn.over_sampling import SMOTE
from collections import Counter

In [6]:
# Read the raw stroke records from the CSV in the working directory, then
# echo the frame so the notebook shows what was loaded.
data = pd.read_csv(filepath_or_buffer="healthcare-dataset-stroke-data.csv")
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


#### Fill null values

In [7]:
# Impute missing BMI entries with the mean of the observed BMI values;
# rows that already have a BMI keep it. The frame is echoed afterwards.
data['bmi'] = data['bmi'].mask(data['bmi'].isna(), data['bmi'].mean())
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.600000,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.500000,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.400000,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.000000,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,28.893237,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.000000,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.600000,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.600000,formerly smoked,0


#### Remove the rare `Other` gender category

In [8]:
# Keep every row whose gender is anything except 'Other', then tally the
# remaining gender categories as a sanity check.
data = data.loc[data['gender'].ne('Other')]
data['gender'].value_counts()

gender
Female    2994
Male      2115
Name: count, dtype: int64

#### Drop unimportant columns

In [9]:
# Discard the 'id' column, which this notebook treats as unimportant.
data = data.drop(columns=['id'])


#### Separating numerical and categorical features

In [10]:
# Partition the columns by dtype: non-numeric columns form the categorical
# set, numeric columns form the numeric set.
data_cat = data.select_dtypes(exclude=[np.number])
# Keep an independent copy of the 'stroke' column and leave that column out
# of the numeric frame.
data_stroke = data['stroke'].copy()
data_num = data.select_dtypes(include=np.number).drop(columns='stroke')



#### Fixing Text and Categorical Attributes

In [None]:

# One-hot encode the categorical columns. sparse_output=False makes the
# encoder return a dense NumPy array rather than a sparse matrix.
cat_encodder = OneHotEncoder(sparse_output=False)
cat_encodder.fit(data_cat)
dat_cat_1hot = cat_encodder.transform(data_cat)

# Name the encoded columns after their source column and category, then wrap
# the array in a DataFrame that keeps the row index of data_cat.
encoded_cols = cat_encodder.get_feature_names_out(data_cat.columns)
df_cat_encoded = pd.DataFrame(
    data=dat_cat_1hot,
    index=data_cat.index,
    columns=encoded_cols,
)





## We will apply a pipeline instead of manual encoding and scaling

In [15]:
# Assemble one frame column-wise: the (unscaled) numeric columns first, then
# the one-hot encoded categorical columns, and finally the 'stroke' column.
data_Conc = pd.concat(objs=[data_num, df_cat_encoded, data_stroke], axis="columns")
data_Conc

# Record the column names of the combined frame for later use
# (the list includes the 'stroke' column as its last entry).
features = data_Conc.columns.tolist()
features

['age',
 'hypertension',
 'heart_disease',
 'avg_glucose_level',
 'bmi',
 'gender_Female',
 'gender_Male',
 'ever_married_No',
 'ever_married_Yes',
 'work_type_Govt_job',
 'work_type_Never_worked',
 'work_type_Private',
 'work_type_Self-employed',
 'work_type_children',
 'Residence_type_Rural',
 'Residence_type_Urban',
 'smoking_status_Unknown',
 'smoking_status_formerly smoked',
 'smoking_status_never smoked',
 'smoking_status_smokes',
 'stroke']

#### Scaling data

In [None]:

# Standardise the numeric columns: the scaler is fitted on a copy of data_num
# and applied to it in one step (the result is a NumPy array by default).
scaler = StandardScaler()
Scaled_data = scaler.fit_transform(data_num.copy())
# NOTE: the plan is to apply a pipeline instead of this manual encoding and scaling.

#### Save preprocessed data

In [None]:
# StandardScaler.fit_transform returns a plain NumPy array by default, and an
# ndarray has no `.to_csv` method, so the scaled values are wrapped in a
# DataFrame first. Column names and row index come from data_num, the frame
# the scaler was fitted on; copy=True keeps the saved frame independent of
# Scaled_data, as the original `.copy()` intended.
# Only the scaled numeric columns are written to this file.
Stroke_preprocessed = pd.DataFrame(
    Scaled_data,
    columns=data_num.columns,
    index=data_num.index,
    copy=True,
)
Stroke_preprocessed.to_csv("Stroke_preprocessed.csv", index=False)

#### Train test split

In [None]:
# Feature matrix: numeric columns followed by the raw (un-encoded)
# categorical columns, placed side by side.
X = pd.concat(objs=[data_num, data_cat], axis="columns")
# Target vector: the 'stroke' column of the cleaned frame.
y = data.loc[:, 'stroke']

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# parts keep the same class proportions, and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

#### Pipeline

In [None]:
# Preprocessing pipeline
# NOTE(review): the ColumnTransformer below is wrapped in a triple-quoted
# string, so it is never executed and this cell does not define `preprocessor`.
# NOTE(review): as written, the column arguments are the DataFrames `data_num`
# and `data_cat` themselves; ColumnTransformer presumably expects column
# selectors (e.g. list(data_num.columns)) -- confirm before re-enabling.
'''preprocessor = ColumnTransformer([
    ("num", StandardScaler(), data_num),
    ("cat", OneHotEncoder(sparse_output=False, handle_unknown='ignore'), data_cat)
])'''

In [None]:
# Disabled: would persist `preprocessor` to "preprocessor.pkl" with joblib.
# NOTE(review): enabling this requires `preprocessor` to exist; in the visible
# part of the notebook it is only written inside a string literal.
#joblib.dump(preprocessor, "preprocessor.pkl")