In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('dataset.csv')

In [3]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# Preprocessing And Feature Engineering

In [4]:
df1 = df[['id', 'gender', 'age', 'hypertension', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']]

In [5]:
df1.head()

Unnamed: 0,id,gender,age,hypertension,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [6]:
df1.gender.value_counts()

gender
Female    2994
Male      2115
Other        1
Name: count, dtype: int64

In [7]:
df1.drop('id', inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop('id', inplace=True, axis=1)


In [8]:
df1 = df1[df1['gender'] != 'Other']

In [9]:
df1.ever_married.value_counts()

ever_married
Yes    3353
No     1756
Name: count, dtype: int64

In [10]:
df1.work_type.value_counts()

work_type
Private          2924
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: count, dtype: int64

In [11]:
df1.Residence_type.value_counts()

Residence_type
Urban    2596
Rural    2513
Name: count, dtype: int64

In [12]:
df1.smoking_status.value_counts()

smoking_status
never smoked       1892
Unknown            1544
formerly smoked     884
smokes              789
Name: count, dtype: int64

In [13]:
df1.head()

Unnamed: 0,gender,age,hypertension,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [15]:
X = df1.drop(columns=["stroke"])
y = df1["stroke"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [16]:
numeric_features = ["age", "avg_glucose_level", "bmi"]
categorical_features = [
    "gender",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status"
]

In [19]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical transformer (impute most frequent + one hot encoding)
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

In [20]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ],
    remainder="passthrough"  # This keeps hypertension as it is
)

In [46]:
model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("classifier", LogisticRegression(
        max_iter=2000,
        class_weight="balanced",
        penalty="l1",
        solver="liblinear"
    ))
])

In [47]:
model.fit(X_train, y_train)
y_proba = model.predict_proba(X_test)[:, 1]

In [48]:
threshold = 0.75
y_pred = (y_proba >= threshold).astype(int)

In [49]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8718199608610567
Confusion matrix:
 [[858 114]
 [ 17  33]]
              precision    recall  f1-score   support

           0       0.98      0.88      0.93       972
           1       0.22      0.66      0.34        50

    accuracy                           0.87      1022
   macro avg       0.60      0.77      0.63      1022
weighted avg       0.94      0.87      0.90      1022



In [50]:
import joblib
joblib.dump(model, "stroke_model.pkl")
print("Model saved as stroke_model.pkl")

Model saved as stroke_model.pkl
