In [1]:
import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

import optuna
from optuna.integration import LightGBMPruningCallback

from lightgbm import LGBMClassifier

import pickle

In [2]:
stroke_df_with_features = pd.read_csv('healthcare_stroke_data_clean_with_features.csv')

In [3]:
X_feat = stroke_df_with_features.drop('stroke', axis=1)
y_feat = stroke_df_with_features['stroke']

X_train_feat, X_test_feat, y_train_feat, y_test_feat = train_test_split(X_feat, y_feat, test_size=0.2, random_state=42, stratify=y_feat)

X_train_feat.shape, X_test_feat.shape, y_train_feat.shape, y_test_feat.shape

((3927, 12), (982, 12), (3927,), (982,))

In [4]:
numerical_cols = ['age', 'avg_glucose_level', 'bmi']
categorical_cols_feat = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'bmi_category', 'age_group']

In [5]:
numerical_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols_feat)
])

lgbm_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(class_weight="balanced",
            objective="binary", metric="binary_logloss",
            verbosity=-1))
])

In [6]:
def objective(trial):
    n_estimators = trial.suggest_int('classifier__n_estimators', 50, 250, step=20)
    max_depth = trial.suggest_int('classifier__max_depth', 3, 15)
    boosting_type = trial.suggest_categorical('classifier__boosting_type', ["gbdt"])
    lambda_l1 = trial.suggest_float('classifier__lambda_l1', 0.001, 10.0, log=True)
    lambda_l2 = trial.suggest_float('classifier__lambda_l2', 0.001, 10.0, log=True)
    num_leaves = trial.suggest_int('classifier__num_leaves', 10, 200)
    feature_fraction = trial.suggest_float('classifier__feature_fraction', 0.5, .8)
    bagging_fraction = trial.suggest_float('classifier__bagging_fraction', 0.5, .8)
    bagging_freq = trial.suggest_int('classifier__bagging_freq', 1, 7)
    min_child_samples = trial.suggest_int('classifier__min_child_samples', 30, 100)
    scale_pos_weight = trial.suggest_float('classifier__scale_pos_weight', 10, 50)
    learning_rate = trial.suggest_float('classifier__learning_rate', 0.01, 0.1)

    lgbm_model_pipeline.set_params(**{
    'classifier__n_estimators': n_estimators,
    'classifier__max_depth': max_depth,
    'classifier__boosting_type': boosting_type,
    'classifier__lambda_l1': lambda_l1,
    'classifier__lambda_l2': lambda_l2,
    'classifier__num_leaves': num_leaves,
    'classifier__feature_fraction': feature_fraction,
    'classifier__bagging_fraction': bagging_fraction,
    'classifier__bagging_freq': bagging_freq,
    'classifier__min_child_samples': min_child_samples,
    'classifier__scale_pos_weight': scale_pos_weight,
    'classifier__learning_rate': learning_rate
})

    scores = []
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, test_index in kf.split(X_train_feat, y_train_feat):
        X_train, X_val = X_train_feat.iloc[train_index], X_train_feat.iloc[test_index]
        y_train, y_val = y_train_feat.iloc[train_index], y_train_feat.iloc[test_index]


        lgbm_model_pipeline.fit(X_train_feat, y_train_feat)

        preds = lgbm_model_pipeline.predict(X_val)

        score = f1_score(y_val, preds)
        scores.append(score)

    return -np.mean(scores)

study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=100)  

best_params = study.best_params
best_score = -study.best_value

print(best_params, best_score)

[I 2024-04-13 13:32:18,144] A new study created in memory with name: no-name-b45a4775-3d6c-4fd6-a1af-9aec29b9ffce
[I 2024-04-13 13:32:18,913] Trial 0 finished with value: -0.295356812061124 and parameters: {'classifier__n_estimators': 190, 'classifier__max_depth': 6, 'classifier__boosting_type': 'gbdt', 'classifier__lambda_l1': 0.025925774595176813, 'classifier__lambda_l2': 0.058240206973644605, 'classifier__num_leaves': 67, 'classifier__feature_fraction': 0.5974914582924447, 'classifier__bagging_fraction': 0.6975531233511453, 'classifier__bagging_freq': 4, 'classifier__min_child_samples': 66, 'classifier__scale_pos_weight': 30.283324258536716, 'classifier__learning_rate': 0.08010362031156919}. Best is trial 0 with value: -0.295356812061124.
[I 2024-04-13 13:32:19,345] Trial 1 finished with value: -0.13918417913060646 and parameters: {'classifier__n_estimators': 90, 'classifier__max_depth': 4, 'classifier__boosting_type': 'gbdt', 'classifier__lambda_l1': 0.02798096558071742, 'classifie

{'classifier__n_estimators': 70, 'classifier__max_depth': 3, 'classifier__boosting_type': 'gbdt', 'classifier__lambda_l1': 0.00157319233985872, 'classifier__lambda_l2': 0.0015553217741865806, 'classifier__num_leaves': 58, 'classifier__feature_fraction': 0.5512706551945784, 'classifier__bagging_fraction': 0.7448608103450373, 'classifier__bagging_freq': 6, 'classifier__min_child_samples': 92, 'classifier__scale_pos_weight': 44.2729906111573, 'classifier__learning_rate': 0.012953028610957897} 0.10572831706121406


In [7]:
best_params = study.best_params

lgbm_model_pipeline.set_params(**{
    'classifier__n_estimators': best_params['classifier__n_estimators'],
    'classifier__max_depth': best_params['classifier__max_depth'],
    'classifier__boosting_type': best_params['classifier__boosting_type'],
    'classifier__lambda_l1': best_params['classifier__lambda_l1'],
    'classifier__lambda_l2': best_params['classifier__lambda_l2'],
    'classifier__num_leaves': best_params['classifier__num_leaves'],
    'classifier__feature_fraction': best_params['classifier__feature_fraction'],
    'classifier__bagging_fraction': best_params['classifier__bagging_fraction'],
    'classifier__bagging_freq': best_params['classifier__bagging_freq'],
    'classifier__min_child_samples': best_params['classifier__min_child_samples'],
    'classifier__scale_pos_weight': best_params['classifier__scale_pos_weight'],
    'classifier__learning_rate': best_params['classifier__learning_rate']
})

lgbm_model_pipeline.fit(X_train_feat, y_train_feat)

y_pred_feat = lgbm_model_pipeline.predict(X_test_feat)

In [8]:
# Save the model to a file
with open("model.pkl", "wb") as f:
    pickle.dump(lgbm_model_pipeline, f)