In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
    
# SVM
# load train data
X_train = pd.read_csv("data/processed_data/X_train.csv")
y_train = pd.read_csv("data/processed_data/y_train.csv")
X_validation = pd.read_csv("data/processed_data/X_validation.csv")
y_validation = pd.read_csv("data/processed_data/y_validation.csv")


In [2]:
continuous_cols = ['age', 'avg_glucose_level', 'bmi']
discrete_cols = ['hypertension', 'heart_disease', 'stroke']
nominal_cols = ['gender', 'work_type', 'Residence_type', 'smoking_status', 'ever_married', 'diabetic_status', 'weight_status']

In [3]:
!pip install catboost
!pip install lightgbm



In [4]:


# Optimizing the hyperparameters
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

def optimizing_model_parameters(model_name, model, param_grid, X_train, y_train, X_validation, y_validation):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_validation)
    accuracy = accuracy_score(y_validation, y_pred)
    print(f"Optimized Accuracy {model_name}:", accuracy) 
    return best_model

rf_model = RandomForestClassifier()
param_grid = {
    'n_estimators': [ 2, 4, 8, 16, 32, 64, 100],
    'max_depth': [3, 5, None]
}

best_model_RandomForest = optimizing_model_parameters("RandomForest", rf_model, param_grid, X_train, y_train, X_validation, y_validation)

model_KNN = KNeighborsClassifier()
param_grid = {
    'n_neighbors': [1, 2, 4, 8, 16],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

model_KNN = optimizing_model_parameters("KNN", model_KNN, param_grid, X_train, y_train, X_validation, y_validation)

gbc_model = GradientBoostingClassifier()
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [1, 3, 5], 
    'min_samples_leaf': [1, 2, 4],
}

best_model_GradientBoosting = optimizing_model_parameters("GradientBoosting", gbc_model, param_grid, X_train, y_train, X_validation, y_validation)

svm_model = SVC()
param_grid = {
    'C': [1, 2, 4, 8],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [1, 3, 4, 5],
    'gamma': ['scale', 'auto']
}
best_model_SVM = optimizing_model_parameters("SVM", svm_model, param_grid, X_train, y_train, X_validation, y_validation)

catboost_model = CatBoostClassifier()
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],
    'depth': [2, 4, 8, 16],
    'l2_leaf_reg': [1, 3, 8]
}
best_model_CatBoost = optimizing_model_parameters("CatBoost", catboost_model, param_grid, X_train, y_train, X_validation, y_validation)

lgbm_model = LGBMClassifier()
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [2, 4, 8, 16]
}
best_model_LGBM = optimizing_model_parameters("LGBM", lgbm_model, param_grid, X_train, y_train, X_validation, y_validation)

# todo: check params of each model


Optimized Accuracy RandomForest: 0.9795665634674923
Optimized Accuracy KNN: 0.9764705882352941
Optimized Accuracy GradientBoosting: 0.9678018575851394
Optimized Accuracy SVM: 0.9504643962848297
0:	learn: 0.5669550	total: 56.3ms	remaining: 56.3s
1:	learn: 0.4755957	total: 63.9ms	remaining: 31.9s
2:	learn: 0.4046659	total: 70.1ms	remaining: 23.3s
3:	learn: 0.3507740	total: 76.8ms	remaining: 19.1s
4:	learn: 0.3043508	total: 82.3ms	remaining: 16.4s
5:	learn: 0.2635932	total: 89.1ms	remaining: 14.8s
6:	learn: 0.2354312	total: 96.1ms	remaining: 13.6s
7:	learn: 0.2234336	total: 103ms	remaining: 12.7s
8:	learn: 0.2113314	total: 109ms	remaining: 12s
9:	learn: 0.2036998	total: 135ms	remaining: 13.4s
10:	learn: 0.1981110	total: 146ms	remaining: 13.1s
11:	learn: 0.1909293	total: 155ms	remaining: 12.8s
12:	learn: 0.1859150	total: 176ms	remaining: 13.4s
13:	learn: 0.1823213	total: 181ms	remaining: 12.8s
14:	learn: 0.1803520	total: 230ms	remaining: 15.1s
15:	learn: 0.1757055	total: 276ms	remaining: 1

KeyboardInterrupt: 

In [None]:
# Save the best model
import joblib
joblib.dump(best_model_RandomForest, "models/best_model_RandomForest.pkl")
joblib.dump(model_KNN, "models/best_model_KNN.pkl")
joblib.dump(best_model_GradientBoosting, "models/best_model_GradientBoosting.pkl")
joblib.dump(best_model_SVM, "models/best_model_SVM.pkl")
joblib.dump(best_model_CatBoost, "models/best_model_CatBoost.pkl")
joblib.dump(best_model_LGBM, "models/best_model_LGBM.pkl")

