In [31]:
import pandas as pd
from sklearn.metrics import accuracy_score
    
# Load the pre-processed training split (features, then labels).
X_train, y_train = (
    pd.read_csv("data/processed_data/X_train.csv"),
    pd.read_csv("data/processed_data/y_train.csv"),
)

# Load the pre-processed validation split (features, then labels).
X_validation, y_validation = (
    pd.read_csv("data/processed_data/X_validation.csv"),
    pd.read_csv("data/processed_data/y_validation.csv"),
)


In [32]:
# Column names grouped by the kind of variable they hold.
continuous_cols = [
    "age",
    "avg_glucose_level",
    "bmi",
]

# NOTE(review): 'stroke' is listed with the discrete columns; confirm whether
# it is the prediction target rather than an input feature.
discrete_cols = [
    "hypertension",
    "heart_disease",
    "stroke",
]

nominal_cols = [
    "gender",
    "work_type",
    "Residence_type",
    "smoking_status",
    "ever_married",
    "diabetic_status",
    "weight_status",
]

In [33]:
!pip install catboost
!pip install lightgbm



In [None]:


# Optimizing the hyperparameters
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

def optimizing_model_parameters(model_name, model, param_grid, X_train, y_train, X_validation, y_validation):
    """Grid-search ``model`` over ``param_grid`` and report validation accuracy.

    Args:
        model_name: Label used in the printed accuracy message.
        model: Estimator handed to ``GridSearchCV``.
        param_grid: Hyperparameter grid handed to ``GridSearchCV``.
        X_train: Training features used to fit the search.
        y_train: Training labels; must expose ``.values`` because it is
            flattened with ``.values.ravel()`` before fitting.
        X_validation: Features the selected estimator predicts on.
        y_validation: Labels compared against those predictions.

    Returns:
        The ``best_estimator_`` selected by the 5-fold, accuracy-scored search.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    # Flatten the label container to a 1-D vector before fitting.
    flat_labels = y_train.values.ravel()
    search.fit(X_train, flat_labels)

    tuned_model = search.best_estimator_
    validation_predictions = tuned_model.predict(X_validation)
    validation_accuracy = accuracy_score(y_validation, validation_predictions)
    print(f"Optimized Accuracy {model_name}:", validation_accuracy)
    return tuned_model

# Random forest: search over forest size, tree depth and the minimum
# sample counts required to split a node / form a leaf.
rf_model = RandomForestClassifier()
param_grid = dict(
    n_estimators=[1, 2, 4, 8, 16, 32, 64, 100, 200],
    max_depth=[2, 4, 8, 16, 32, 64, 100, 200],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

best_model_RandomForest = optimizing_model_parameters(
    model_name="RandomForest",
    model=rf_model,
    param_grid=param_grid,
    X_train=X_train,
    y_train=y_train,
    X_validation=X_validation,
    y_validation=y_validation,
)

# K-nearest neighbours: search over neighbourhood size, vote weighting and
# the neighbour-lookup algorithm.
knn_model = KNeighborsClassifier()
param_grid = {
    'n_neighbors': [1, 2, 4, 8, 16, 32, 64, 100, 200],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Name the tuned estimator like every other model (best_model_<Name>) instead
# of overwriting the variable that held the untuned estimator.
best_model_KNN = optimizing_model_parameters("KNN", knn_model, param_grid, X_train, y_train, X_validation, y_validation)

# Backward-compatible alias: `model_KNN` keeps pointing at the tuned estimator,
# exactly as it did before, for any code that still uses the old name.
model_KNN = best_model_KNN

# Gradient boosting: search over number of boosting stages, learning rate,
# tree depth and the minimum sample counts for splits / leaves.
gbc_model = GradientBoostingClassifier()
param_grid = dict(
    n_estimators=[1, 2, 4, 8, 16, 32, 64, 100, 200],
    learning_rate=[0.1, 0.2, 0.3, 0.4, 0.5],
    max_depth=[2, 4, 8, 16, 32, 64, 100, 200],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

best_model_GradientBoosting = optimizing_model_parameters(
    model_name="GradientBoosting",
    model=gbc_model,
    param_grid=param_grid,
    X_train=X_train,
    y_train=y_train,
    X_validation=X_validation,
    y_validation=y_validation,
)

# Support vector machine.
# `degree` is only used by the 'poly' kernel and `gamma` is ignored by the
# 'linear' kernel, so a single flat grid refits many functionally identical
# models (720 candidates). Splitting the grid per kernel covers the same
# distinct configurations with 225 candidates.
svm_model = SVC()
svm_C_values = [1, 2, 4, 8, 16, 32, 64, 100, 200]
svm_gamma_values = ['scale', 'auto']
param_grid = [
    # Linear kernel: only C matters.
    {'kernel': ['linear'], 'C': svm_C_values},
    # RBF / sigmoid kernels: C and gamma matter, degree is ignored.
    {'kernel': ['rbf', 'sigmoid'], 'C': svm_C_values, 'gamma': svm_gamma_values},
    # Polynomial kernel: the only one that reads `degree`.
    {
        'kernel': ['poly'],
        'C': svm_C_values,
        'degree': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'gamma': svm_gamma_values,
    },
]
best_model_SVM = optimizing_model_parameters("SVM", svm_model, param_grid, X_train, y_train, X_validation, y_validation)

# CatBoost: search over number of boosting iterations, learning rate,
# tree depth and L2 leaf regularisation.
catboost_model = CatBoostClassifier()
param_grid = {
    'iterations': [1, 2, 4, 8, 16, 32, 64, 100, 200],
    'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
    # CatBoost rejects tree depths above 16, so larger values would only
    # produce failed fits during the search; keep the grid within the limit.
    'depth': [2, 4, 8, 16],
    'l2_leaf_reg': [1, 2, 4, 8, 16, 32, 64, 100, 200]
}
best_model_CatBoost = optimizing_model_parameters("CatBoost", catboost_model, param_grid, X_train, y_train, X_validation, y_validation)

# LightGBM: search over number of boosting rounds, learning rate, tree depth
# and the minimum number of samples per leaf.
lgbm_model = LGBMClassifier()
param_grid = dict(
    n_estimators=[1, 2, 4, 8, 16, 32, 64, 100, 200],
    learning_rate=[0.1, 0.2, 0.3, 0.4, 0.5],
    max_depth=[2, 4, 8, 16, 32, 64, 100, 200],
    min_child_samples=[1, 2, 4, 8, 16, 32, 64, 100, 200],
)
best_model_LGBM = optimizing_model_parameters(
    model_name="LGBM",
    model=lgbm_model,
    param_grid=param_grid,
    X_train=X_train,
    y_train=y_train,
    X_validation=X_validation,
    y_validation=y_validation,
)

# TODO: review the hyperparameter grid of each model


In [None]:
# Save the tuned model of each family to disk.
import os

import joblib

MODELS_DIR = "models"
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing into it.
os.makedirs(MODELS_DIR, exist_ok=True)

# Map the name used in each file name to the tuned estimator to persist.
models_to_save = {
    "RandomForest": best_model_RandomForest,
    "KNN": model_KNN,
    "GradientBoosting": best_model_GradientBoosting,
    "SVM": best_model_SVM,
    "CatBoost": best_model_CatBoost,
    "LGBM": best_model_LGBM,
}
for saved_name, tuned_estimator in models_to_save.items():
    joblib.dump(tuned_estimator, f"{MODELS_DIR}/best_model_{saved_name}.pkl")

