In [1]:
# 📦 Data manipulation
import pandas as pd
import numpy as np 
import joblib

# 📊 Visualization
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# ⚙️ Preprocessing
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# 🧪 Model selection & evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    classification_report,
    confusion_matrix,
    roc_curve, 
    auc
)

In [4]:
# 🧠 Machine learning models
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [5]:
# ⚖️ Handling imbalance
from imblearn.over_sampling import SMOTE
from collections import Counter

### Loading the preprocessed data

In [6]:
data = pd.read_csv('Stroke_preprocessed.csv')
data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.600000,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.500000,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.400000,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.000000,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,80.0,1,0,Yes,Private,Urban,83.75,28.893237,never smoked,0
5105,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.000000,never smoked,0
5106,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.600000,never smoked,0
5107,Male,51.0,0,0,Yes,Private,Rural,166.29,25.600000,formerly smoked,0


### Split data

In [7]:
X = data.drop('stroke', axis=1)
y = data['stroke']

In [8]:
# Split again
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


### Separating numarical and categorical features

In [9]:
categorical_cols = X.select_dtypes(exclude=['number']).columns.tolist() #selects categorical columns (non-numeric).
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()     #selects numeric columns (integers, floats).


### Pipleine 

In [10]:

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_cols),
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_cols)
])



#### logistic regression

In [11]:
pipe_log_reg= Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(random_state=42))
])


In [12]:
pipe_log_reg.fit(X_train, y_train) # Train the full pipeline

In [13]:
logreg_predect = pipe_log_reg.predict(X_test)

In [14]:
print(classification_report(logreg_predect,y_test))

              precision    recall  f1-score   support

           0       1.00      0.95      0.98      1021
           1       0.02      1.00      0.04         1

    accuracy                           0.95      1022
   macro avg       0.51      0.98      0.51      1022
weighted avg       1.00      0.95      0.97      1022



#### SVM

In [None]:
pipe_log_reg= Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(random_state=42))
])
