In [2]:
# ==============================
# Stroke Risk Prediction Project
# ==============================

# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# -------------------------------
# Step 1: Load Dataset
# -------------------------------
df = pd.read_csv('stroke_risk_dataset_v2.csv')
print("Original Shape:", df.shape)

# -------------------------------
# Step 2: Clean Data
# -------------------------------
# Drop duplicates
df = df.drop_duplicates().reset_index(drop=True)
print("After Removing Duplicates:", df.shape)

# Encode gender (Male=1, Female=0)
df = pd.get_dummies(df, columns=['gender'], drop_first=True, dtype=int)

# Handle age outliers using IQR
q1, q3 = df['age'].quantile(0.25), df['age'].quantile(0.75)
iqr = q3 - q1
ll, ul = q1 - 1.5*iqr, q3 + 1.5*iqr
df['age'] = np.where(df['age'] < ll, ll, df['age'])
df['age'] = np.where(df['age'] > ul, ul, df['age'])

# -------------------------------
# Step 3: Define Features & Target
# -------------------------------
X = df.drop('at_risk', axis=1)
y = df['at_risk']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features (MinMaxScaler)
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# -------------------------------
# Step 4: Train Logistic Regression
# -------------------------------
lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train)

# -------------------------------
# Step 5: Evaluate Model
# -------------------------------
y_pred = lr.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("\n✅ Model Performance")
print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# -------------------------------
# Step 6: Save Model & Scaler
# -------------------------------
joblib.dump(lr, "stroke_lr_model.pkl")
joblib.dump(scaler, "stroke_scaler.pkl")
print("\n✅ Model and Scaler saved as 'stroke_lr_model.pkl' and 'stroke_scaler.pkl'")


Original Shape: (35000, 19)
After Removing Duplicates: (18721, 19)

✅ Model Performance
Accuracy: 0.9870037386505252

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      2182
           1       0.99      0.99      0.99      3435

    accuracy                           0.99      5617
   macro avg       0.99      0.99      0.99      5617
weighted avg       0.99      0.99      0.99      5617


Confusion Matrix:
 [[2151   31]
 [  42 3393]]

✅ Model and Scaler saved as 'stroke_lr_model.pkl' and 'stroke_scaler.pkl'


In [3]:
X_train.columns.tolist()

['age',
 'chest_pain',
 'high_blood_pressure',
 'irregular_heartbeat',
 'shortness_of_breath',
 'fatigue_weakness',
 'dizziness',
 'swelling_edema',
 'neck_jaw_pain',
 'excessive_sweating',
 'persistent_cough',
 'nausea_vomiting',
 'chest_discomfort',
 'cold_hands_feet',
 'snoring_sleep_apnea',
 'anxiety_doom',
 'stroke_risk_percentage',
 'gender_Male']