# Model Training

## 1. Import Data and Required Packages

In [1]:
# Basic Import
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

#### Import the CSV Data as Pandas DataFrame

In [2]:
df = pd.read_csv('data/stroke.csv')

## 2. Pre-Processing

### 2.1. Remove and Combine Categories in a Feature

From the EDA, we found that the *"Other"* category in the **Age** feature is extremely rare with just 1 record. It’s generally a good idea to remove or combine with male or female categories. We have decided to remove it.

In [3]:
df = df[df['gender'] != 'Other']
print(df['gender'].value_counts())

gender
Female    2994
Male      2115
Name: count, dtype: int64


Combining rare categories in a categorical column is a good way to reduce the number of dummy variables when doing one-hot encoding. This helps simplify the model and avoid creating too many sparse features.

From the EDA, the *"Never_worked"* category in the **work_type** feature rare with jush 22 record. One approach is to combine *children* and *never_worked* categories together and label it as "other"

In [4]:
print(f"--- work_type variable before feature engineering: ----\n {df['work_type'].value_counts()}")
df['work_type'] = df['work_type'].replace(['Never_worked', 'children'], 'Other')
print(f"\n--- work_type variable after feature engineering: ----\n {df['work_type'].value_counts()}")

--- work_type variable before feature engineering: ----
 work_type
Private          2924
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: count, dtype: int64

--- work_type variable after feature engineering: ----
 work_type
Private          2924
Self-employed     819
Other             709
Govt_job          657
Name: count, dtype: int64


### 2.2. Missing Values

We have 201 missing value in the **bmi**. Since **bmi** feature has an skewed distribution *Median* is better choice for data imputation.

In [5]:
print(f"--- The number of Missing values in the bmi before data imputation: {df['bmi'].isna().sum()} ---- ")
df['bmi'] = df['bmi'].fillna(df['bmi'].median())
print(f"--- The number of Missing values in the bmi after data imputation: {df['bmi'].isna().sum()} ---- ")

--- The number of Missing values in the bmi before data imputation: 201 ---- 
--- The number of Missing values in the bmi after data imputation: 0 ---- 


### 2.3. Preparing X and y variables

In [6]:
df.drop(columns=['id'], inplace=True)
X = df.drop(columns=['stroke'],axis=1)
y = df['stroke']

### 2.4. Column Transformer

In [7]:
numerical_features = ['age', 'bmi', 'avg_glucose_level']
binary_features = ['gender', 'ever_married', 'Residence_type']
categorical_features = ['work_type', 'smoking_status']

# Identify passthrough columns
all_columns = X.columns.tolist()
used_columns = numerical_features + binary_features + categorical_features
passthrough_columns = [col for col in all_columns if col not in used_columns]


# Transformers
numeric_transformer = StandardScaler()
binary_transformer = OrdinalEncoder()
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)

# ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('bin', binary_transformer, binary_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

In [8]:
X_transformed = preprocessor.fit_transform(X)

cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_features = numerical_features + binary_features + list(cat_features) + passthrough_columns

# To DataFrame
X_transformed = pd.DataFrame(X_transformed, columns=all_features)

### 2.5. Train Test Splitting

In [9]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X_transformed,y,test_size=0.3,random_state=42, stratify=y)
X_train.shape, X_test.shape

((3576, 14), (1533, 14))

### 2.6. Oversampling

In [10]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

## 3. Create an Evaluate Function to give all metrics after model Training

In [11]:
def evaluate_model(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='binary')
    recall = recall_score(y_true, y_pred, average='binary')
    f1 = f1_score(y_true, y_pred, average='binary')
    cm = confusion_matrix(y_true, y_pred)
    
    return acc, precision, recall, f1, cm

In [12]:
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced'),
    "Random Forest Classifier": RandomForestClassifier(class_weight='balanced'),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "XGBRegressor": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
}

model_list = []
f1_list = []

for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate
    train_acc = accuracy_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_cm = confusion_matrix(y_train, y_train_pred)

    test_acc = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_cm = confusion_matrix(y_test, y_test_pred)

    print(name)
    model_list.append(name)

    print('Model performance for Training set')
    print(f"- Accuracy: {train_acc:.4f}")
    print(f"- Precision: {train_precision:.4f}")
    print(f"- Recall (Sensitivity): {train_recall:.4f}")
    print(f"- F1 Score: {train_f1:.4f}")

    print('----------------------------------')

    print('Model performance for Test set:')
    print(f"- Accuracy: {test_acc:.4f}")
    print(f"- Precision: {test_precision:.4f}")
    print(f"- Recall (Sensitivity): {test_recall:.4f}")
    print(f"- F1 Score: {test_f1:.4f}")
    print("Confusion Matrix:\n", test_cm)

    f1_list.append(test_f1)

    print('='*40)
    print('\n')

Logistic Regression
Model performance for Training set
- Accuracy: 0.7436
- Precision: 0.1411
- Recall (Sensitivity): 0.8391
- F1 Score: 0.2415
----------------------------------
Model performance for Test set:
- Accuracy: 0.7352
- Precision: 0.1314
- Recall (Sensitivity): 0.7867
- F1 Score: 0.2252
Confusion Matrix:
 [[1068  390]
 [  16   59]]


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall (Sensitivity): 1.0000
- F1 Score: 1.0000
----------------------------------
Model performance for Test set:
- Accuracy: 0.9191
- Precision: 0.0984
- Recall (Sensitivity): 0.0800
- F1 Score: 0.0882
Confusion Matrix:
 [[1403   55]
 [  69    6]]


Random Forest Classifier
Model performance for Training set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall (Sensitivity): 1.0000
- F1 Score: 1.0000
----------------------------------
Model performance for Test set:
- Accuracy: 0.9511
- Precision: 0.0000
- Recall (Sensitivity): 0.0000
- F1 Score: 0.0000
Confu

### Results

In [13]:
pd.DataFrame(list(zip(model_list, f1_list)), columns=['Model Name', 'f1']).sort_values(by=["f1"],ascending=False)

Unnamed: 0,Model Name,f1
0,Logistic Regression,0.225191
4,XGBRegressor,0.152381
1,Decision Tree,0.088235
5,CatBoosting Classifier,0.068966
3,K-Neighbors Classifier,0.0
2,Random Forest Classifier,0.0
