In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
%pip install --upgrade scikit-learn imbalanced-learn
from imblearn.over_sampling import SMOTE

### Import Data

In [None]:
# Load the raw stroke dataset; the "id" column becomes the row index
df = pd.read_csv(
    "stroke_data.csv",
    sep=",",
    index_col="id",
)

In [None]:
# Preview the first 10 rows of the loaded frame
df.head(10)

In [None]:
# Preview the last 10 rows of the loaded frame
df.tail(10)

### EDA

#### Overview

In [None]:
# (number of rows, number of columns) of the raw data
df.shape

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Print the frequency table (missing values included) of every column in turn
for column_name, column_values in df.items():
    print(f"\n=== {column_name} ===")
    print(column_values.value_counts(dropna=False))

#### Check for missing values

In [None]:
# Number of missing values per column
df.isnull().sum()

#### Plots by Feature

##### Gender

In [None]:
# Stroke / no-stroke counts for each gender
ax = sns.countplot(data=df, x='gender', hue='stroke')
ax.set_title('Gender vs Stroke')


##### Age

In [None]:
# Age distribution for each stroke outcome
ax = sns.boxplot(data=df, x='stroke', y='age')
ax.set_title('Age by Stroke')

##### Hypertension

In [None]:
# Stroke / no-stroke counts for each hypertension flag
ax = sns.countplot(data=df, x='hypertension', hue='stroke')
ax.set_title('Hypertension vs Stroke')

##### Heart disease

In [None]:
# Stroke / no-stroke counts for each heart-disease flag
ax = sns.countplot(data=df, x='heart_disease', hue='stroke')
ax.set_title('Heart Disease vs Stroke')

##### Marriage Status

In [None]:
# Stroke / no-stroke counts for each marital status
ax = sns.countplot(data=df, x='ever_married', hue='stroke')
ax.set_title('Marital Status vs Stroke')

##### Work Type

In [None]:
# Stroke / no-stroke counts for each work type
ax = sns.countplot(data=df, x='work_type', hue='stroke')
ax.set_title('Work Type vs Stroke')

##### Residence Type

In [None]:
# Stroke / no-stroke counts for each residence type
ax = sns.countplot(data=df, x='Residence_type', hue='stroke')
ax.set_title('Residence Type vs Stroke')

##### Average glucose level

In [None]:
# Average glucose level distribution for each stroke outcome
ax = sns.boxplot(data=df, x='stroke', y='avg_glucose_level')
ax.set_title('Average Glucose Level by Stroke')

##### BMI

In [None]:
# BMI distribution for each stroke outcome
ax = sns.boxplot(data=df, x='stroke', y='bmi')
ax.set_title('BMI by Stroke')

##### Smoking status

In [None]:
# Stroke / no-stroke counts for each smoking status
ax = sns.countplot(data=df, x='smoking_status', hue='stroke')
ax.set_title('Smoking Status vs Stroke')


### Feature Engineering

#### Deleting Residence_type column

In [None]:

# Percentage of rows in each residence type
df['Residence_type'].value_counts(normalize=True) * 100

In [None]:
# Stroke outcome percentages within each residence type (each row sums to 100)
pd.crosstab(df['Residence_type'], df['stroke'], normalize='index') * 100

In [None]:
# Work on a new frame without Residence_type; `drop` already returns a new DataFrame
# and leaves `df` untouched, so no separate copy is needed beforehand.
df_processed = df.drop(columns=['Residence_type'])

#### Removing gender outlier

In [None]:
# index labels of the rows whose gender is "Other"
is_other_gender = df_processed['gender'].eq('Other')
other_gender = df_processed.index[is_other_gender]
other_gender

In [None]:
# remove those rows by their index labels
df_processed = df_processed.drop(other_gender, axis='index')

In [None]:
# confirm it's gone: remaining gender categories and their counts
df_processed['gender'].value_counts()

#### Removing low age vs stroke outliers

In [None]:
# index labels of patients under 20 who had a stroke (the outliers seen in the boxplot)
is_under_20 = df_processed['age'].lt(20)
had_stroke = df_processed['stroke'].eq(1)
young_stroke_indexes = df_processed.index[is_under_20 & had_stroke]
young_stroke_indexes

In [None]:
# remove those rows by their index labels
df_processed = df_processed.drop(young_stroke_indexes, axis='index')

In [None]:
# should display an empty frame once the under-20 stroke cases are removed
df_processed.query('age < 20 and stroke == 1')

#### Filling the bmi column

In [None]:
# overall average bmi (missing values are skipped by mean())
df_processed['bmi'].mean()

In [None]:
# average bmi per gender
df_processed.groupby('gender')['bmi'].mean()

In [None]:
# 10-year age bins: edges 0, 10, ..., 90 and one "lo-hi" label per consecutive pair of edges
bins = [10 * step for step in range(10)]
labels = [f'{lower}-{upper}' for lower, upper in zip(bins[:-1], bins[1:])]

In [None]:
# create age groups: one label per row; right=False makes each bin left-closed, e.g. [0, 10)
age_groups = pd.cut(df_processed['age'], bins=bins, labels=labels, right=False)

In [None]:
# group bmi by gender and age groups, rounded to one decimal.
# `age_groups` is categorical, so `observed=False` is passed explicitly: it keeps one entry
# per (gender, age group) pair even for empty groups and does not depend on the
# version-specific default of `observed`.
mean_bmi = (
    df_processed
    .groupby(['gender', age_groups], observed=False)['bmi']
    .mean()
    .round(1)
)

In [None]:
# view table with mean bmi for each gender and age group
# (unstack moves the age-group level into columns: one row per gender)
mean_bmi_df = mean_bmi.unstack()
mean_bmi_df

In [None]:
def fill_bmi(row):
    """Return the row's bmi, imputing a missing value from the gender/age-group mean.

    `row` must provide 'bmi', 'age' and 'gender'. The notebook-level `bins`, `labels`
    and `mean_bmi` objects are read only when the bmi is missing.
    """
    recorded_bmi = row['bmi']
    if not pd.isna(recorded_bmi):
        # bmi already present: keep it unchanged
        return recorded_bmi
    # bucket the age with the same left-closed 10-year bins used to build mean_bmi
    age_bucket = pd.cut([row['age']], bins=bins, labels=labels, right=False)[0]
    return mean_bmi.loc[row['gender'], age_bucket]

In [None]:
# build a new frame whose bmi column has the missing values imputed row by row;
# df_processed itself is left unchanged
imputed_bmi = df_processed.apply(fill_bmi, axis=1)
df_filled = df_processed.assign(bmi=imputed_bmi)

In [None]:
# count the bmi values that are still missing after imputation
missing_bmi_filled = df_filled['bmi'].isna().sum()
missing_bmi_filled

#### Encoding the gender column

In [None]:
# transform the gender column to numerical values. Female = 0 and Male = 1
# The column is assigned directly rather than through `.loc[:, ['gender']] = ...`: an
# in-place .loc write into the existing string column can keep its object dtype, leaving
# Python ints inside a non-numeric column. ravel() flattens the (n, 1) dummy matrix; any
# other number of dummy columns still fails with a length mismatch.
df_filled['gender'] = (
    pd.get_dummies(df_filled['gender'], drop_first=True, dtype=int).to_numpy().ravel()
)

In [None]:
# distinct values left in the gender column after encoding
df_filled['gender'].unique()

#### Encoding the ever_married column

In [None]:
# transform the ever_married column to numerical values. No = 0 and Yes = 1
# The column is assigned directly rather than through `.loc[:, ['ever_married']] = ...`:
# an in-place .loc write into the existing string column can keep its object dtype.
# ravel() flattens the (n, 1) dummy matrix; any other number of dummy columns still
# fails with a length mismatch.
df_filled['ever_married'] = (
    pd.get_dummies(df_filled['ever_married'], drop_first=True, dtype=int).to_numpy().ravel()
)

In [None]:
# distinct values left in the ever_married column after encoding
df_filled['ever_married'].unique()

#### Encoding smoking_status column

In [None]:
# k-1 dummy encoding of smoking_status: one 0/1 column per level except the first
smoking_dummies = pd.get_dummies(
    df_filled['smoking_status'],
    drop_first=True,
    dtype=int,
)

In [None]:
# append the smoking dummy columns to the right of the existing columns
df_filled = pd.concat((df_filled, smoking_dummies), axis='columns')

In [None]:
# drop the original smoking_status column now that its dummy columns are present
df_filled = df_filled.drop(columns=['smoking_status'])

#### Encoding work_type column

In [None]:
# k-1 dummy encoding of work_type: one 0/1 column per level except the first
working_dummies = pd.get_dummies(
    df_filled['work_type'],
    drop_first=True,
    dtype=int,
)

In [None]:
# append the work-type dummy columns to the right of the existing columns
df_filled = pd.concat((df_filled, working_dummies), axis='columns')

In [None]:
# drop the original work_type column now that its dummy columns are present
# (reassigning instead of inplace=True keeps the step an explicit new binding)
df_filled = df_filled.drop(columns=['work_type'])

### Model Training

#### Train-Test Split

In [None]:
# 80/20 split of the encoded data with a fixed seed for reproducibility
# NOTE(review): the split is not stratified on 'stroke'; consider stratify=df_filled['stroke']
# if the positive class is rare -- confirm against the class counts.
df_train, df_test = train_test_split(df_filled,test_size = 0.2, random_state=32)

In [None]:
# the mean of the 0/1 gender column should be similar in the full, train and test frames
for frame in (df_filled, df_train, df_test):
    print(frame.gender.mean())

In [None]:
# the mean glucose level should be similar in the full, train and test frames
for frame in (df_filled, df_train, df_test):
    print(frame.avg_glucose_level.mean())

In [None]:
# columns available in the training frame (features plus the 'stroke' target)
df_train.columns

In [None]:
# training arrays: every column except the target as features, 'stroke' as labels
train_features = df_train.drop('stroke', axis=1)
X_train = train_features.to_numpy()
y_train = df_train['stroke'].to_numpy()

In [None]:
# testing arrays: every column except the target as features, 'stroke' as labels
test_features = df_test.drop('stroke', axis=1)
X_test = test_features.to_numpy()
y_test = df_test['stroke'].to_numpy()

#### Logistic Regression

In [None]:
# initialize the model: fixed seed, iteration cap raised to 1000
lr_model = LogisticRegression(random_state=32,max_iter=1000)

In [None]:
# train the model on the training data (fits lr_model in place)
lr_model.fit(X=X_train, y=y_train)

In [None]:
# predicted class labels for the held-out test features
y_test_predicted = lr_model.predict(X_test)

In [None]:
# inspect the predicted labels
y_test_predicted

In [None]:
# inspect the true labels for comparison
y_test

In [None]:
# accuracy: fraction of test rows whose predicted label equals the true label
np.mean(y_test_predicted == y_test)

In [None]:
# Empty 2x2 frame that will hold the confusion matrix.
# sklearn's confusion_matrix puts the TRUE class on the rows and the PREDICTED class on
# the columns, so the rows are labelled y_test_* and the columns y_pred_*.
cf = pd.DataFrame(
    index=["y_test_0", "y_test_1"], columns=["y_pred_0", "y_pred_1"]
)

In [None]:
# fill the frame with the counts; confusion_matrix returns true classes along the rows
# and predicted classes along the columns
cf.loc[:,:] = confusion_matrix(y_true= y_test,y_pred= y_test_predicted)

In [None]:
# display the labelled confusion-matrix counts
cf

In [None]:
# same table expressed as a fraction of all test rows
cf/len(y_test)

In [None]:
# recall of the logistic regression predictions on the test set
recall_score(y_true=y_test, y_pred=y_test_predicted)

In [None]:
# precision on the test set; zero_division=0 reports 0 when no positive is predicted
precision_score(y_true=y_test, y_pred=y_test_predicted, zero_division=0)

In [None]:
# per-class precision/recall/F1 text report for logistic regression
report_lr = classification_report(y_true=y_test, y_pred=y_test_predicted, zero_division=0)
print(report_lr)

#### Random Forest

In [None]:
# initialize the model: class_weight='balanced' reweights the classes, fixed seed
rf_model = RandomForestClassifier(class_weight='balanced', random_state=32)

In [None]:
# train the model on the training data (fits rf_model in place)
rf_model.fit(X_train, y_train)

In [None]:
# random forest class predictions for the held-out test features
y_pred = rf_model.predict(X_test)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class)
print(confusion_matrix(y_test, y_pred))

In [None]:
# Classification report
# zero_division=0 matches the logistic regression report: a class that is never
# predicted gets precision 0 instead of raising an undefined-metric warning.
report_rf = classification_report(y_test, y_pred, zero_division=0)
print(report_rf)

#### Cross Validation with Logistic Regression

In [None]:
# initialize a fresh, unfitted model for cross-validation (same settings as lr_model)
cv_lr_model = LogisticRegression(random_state=32, max_iter=1000)

In [None]:
# 5-fold cross-validated F1 scores computed on the training data only
scores_lr = cross_val_score(cv_lr_model, X_train, y_train, cv=5, scoring='f1')

In [None]:
# one F1 value per fold
print("Logistic Regression - F1 scores for each fold:", scores_lr)

In [None]:
# average F1 across the folds for logistic regression
print("Mean F1 score:", scores_lr.mean())

#### Cross Validation with Random Forest

In [None]:
# fresh, unfitted random forest for cross-validation (same settings as rf_model)
cv_rf_model = RandomForestClassifier(class_weight='balanced', random_state=32)


In [None]:
# 5-fold cross-validated F1 scores computed on the training data only
scores_rf = cross_val_score(cv_rf_model, X_train, y_train, cv=5, scoring='f1')

In [None]:
# one F1 value per fold for the random forest
scores_rf

In [None]:
# average F1 across the folds for the random forest
print("Mean F1 score:", scores_rf.mean())

#### SMOTE

##### Create new balanced dataset

In [None]:
# copy dataset so the encoded frame df_filled is not modified by the resampling steps
df_balanced = df_filled.copy()

In [None]:
# number of rows without a stroke
count_non_stroke = int(df_balanced['stroke'].eq(0).sum())
count_non_stroke

In [None]:
# number of rows with a stroke
count_stroke = int(df_balanced['stroke'].eq(1).sum())
count_stroke

In [None]:
# feature matrix and target vector of the whole dataset, plus the class counts
all_features = df_balanced.drop('stroke', axis=1)
X = all_features.to_numpy()
y = df_balanced['stroke'].to_numpy()
print(f"Before SMOTE: non-stroke = {sum(y==0)}, stroke = {sum(y==1)}")

In [None]:
# initialize SMOTE to balance minority class to majority count
# (sampling_strategy=1.0 requests a 1:1 minority/majority ratio; seed fixed for reproducibility)
smote = SMOTE(sampling_strategy=1.0, random_state=32)

In [None]:
# apply SMOTE on full dataset to balance classes
# NOTE(review): this resamples every row; any model evaluated on a split of this output
# would see synthetic neighbours of its test rows, so scores from such a split are optimistic.
X_resampled, y_resampled = smote.fit_resample(X, y)
print(f"After SMOTE: non-stroke = {sum(y_resampled==0)}, stroke = {sum(y_resampled==1)}")

In [None]:
# NOTE(review): presumably a plain copy -- X_resampled comes from oversampling df_balanced's
# rows, so the slice bound should not be smaller than the frame; the resampled values are
# not placed into df_balanced on this line.
df_balanced = df_balanced.iloc[:len(X_resampled), :].copy()

In [None]:
# get columns of features (every column name except the 'stroke' target)
feature_columns = df_balanced.drop(columns=['stroke']).columns

In [None]:
# rebuild df_balanced from the resampled arrays: feature columns first, target last
df_balanced = pd.DataFrame(X_resampled, columns=feature_columns).assign(stroke=y_resampled)

In [None]:
# class counts after resampling: (non-stroke, stroke)
num_stroke_0 = int(df_balanced['stroke'].eq(0).sum())
num_stroke_1 = int(df_balanced['stroke'].eq(1).sum())
num_stroke_0, num_stroke_1


In [None]:
# Split the original, un-resampled data first so that df_test holds only real patients.
# Splitting the already-oversampled df_balanced would leak synthetic points interpolated
# from test rows into training and would score the models on synthetic samples.
# Same arguments as the earlier split of df_filled, so the held-out rows are the same.
df_train, df_test = train_test_split(df_filled, test_size=0.2, random_state=32)

# Oversample the minority class in the training portion only, reusing the configured
# `smote` sampler, and rebuild df_train with the features first and the target last.
train_features = df_train.drop(columns=['stroke'])
X_train_resampled, y_train_resampled = smote.fit_resample(
    train_features.values, df_train['stroke'].values
)
df_train = pd.DataFrame(X_train_resampled, columns=train_features.columns)
df_train['stroke'] = y_train_resampled

In [None]:
# train data: pop the target off a copy so df_train itself keeps all its columns
train_frame = df_train.copy()
y_train = train_frame.pop('stroke').values
X_train = train_frame.values

In [None]:
# test data: pop the target off a copy so df_test itself keeps all its columns
test_frame = df_test.copy()
y_test = test_frame.pop('stroke').values
X_test = test_frame.values

##### Logistic Regression

In [None]:
# fit logistic regression on the current training arrays and predict the test set
# (fit returns the estimator itself, so construction and fitting can be chained)
lr_model = LogisticRegression(max_iter=1000, random_state=32).fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

In [None]:
# heading, confusion matrix and per-class report, one per line
print(
    "Logistic Regression Results:",
    confusion_matrix(y_test, y_pred_lr),
    classification_report(y_test, y_pred_lr),
    sep="\n",
)

##### Random Forest

In [None]:
# fit a random forest (no class weighting here) on the current training arrays and
# predict the test set; fit returns the estimator itself, so the calls can be chained
rf_model = RandomForestClassifier(random_state=32).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [None]:
# heading, confusion matrix and per-class report, one per line
print(
    "Random Forest Results:",
    confusion_matrix(y_test, y_pred_rf),
    classification_report(y_test, y_pred_rf),
    sep="\n",
)