In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
   
# Load the pre-split, processed feature/label tables shared by every model
# tuned in this notebook. Order of the paths matches the unpacking targets.
X_train, y_train, X_validation, y_validation = map(
    pd.read_csv,
    (
        "data/processed_data/X_train.csv",
        "data/processed_data/y_train.csv",
        "data/processed_data/X_validation.csv",
        "data/processed_data/y_validation.csv",
    ),
)


In [2]:
import json
import os

from scipy.stats import randint, uniform

# Optimizing the hyperparameters
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

def optimizing_model_parameters(model_name, model, param_distributions, X_train, y_train, X_validation, y_validation,
                                n_iter=50, cv=5, random_state=None, output_dir="models"):
    """Tune ``model`` with a randomized search and record the outcome.

    Runs ``RandomizedSearchCV`` (accuracy scoring, ``n_jobs=-1``) on the training
    data, scores the search's ``best_estimator_`` on both the training and the
    validation set, prints a short report, and writes the chosen parameters and
    both accuracies to ``<output_dir>/<model_name>.json``.

    Parameters
    ----------
    model_name : str
        Label used in the printed report and as the JSON file stem.
    model : estimator
        Estimator handed to the search.
    param_distributions : dict
        Search space, passed to ``RandomizedSearchCV`` unchanged.
    X_train, X_validation
        Feature tables used for fitting and for the validation score.
    y_train
        Training labels; must expose ``.values.ravel()`` (e.g. a pandas
        DataFrame or Series).
    y_validation
        Validation labels, passed to ``accuracy_score`` as given.
    n_iter : int, default 50
        Number of sampled parameter settings.
    cv : int, default 5
        Cross-validation setting forwarded to the search.
    random_state : int or None, default None
        Seed forwarded to the search. ``None`` keeps the previous unseeded
        behaviour; pass an int to make the sampled candidates repeatable.
    output_dir : str, default "models"
        Directory for the JSON report; created if it does not exist.

    Returns
    -------
    The ``best_estimator_`` found by the search.
    """

    def _json_default(value):
        # json cannot encode NumPy scalars/arrays; both expose tolist(), which
        # yields plain Python values. Anything else is stored as its repr so the
        # report is still written instead of failing after the search finished.
        if hasattr(value, "tolist"):
            return value.tolist()
        return repr(value)

    # Flatten the label column once and reuse it for fitting and train scoring.
    y_train_flat = y_train.values.ravel()

    rand_search = RandomizedSearchCV(
        model,
        param_distributions,
        cv=cv,
        scoring='accuracy',
        n_jobs=-1,
        n_iter=n_iter,
        random_state=random_state,
    )
    rand_search.fit(X_train, y_train_flat)
    best_model = rand_search.best_estimator_

    accuracy = accuracy_score(y_validation, best_model.predict(X_validation))
    train_accuracy = accuracy_score(y_train_flat, best_model.predict(X_train))

    # Enhanced output
    print(f"\nOptimized Parameters for {model_name}:")
    print(rand_search.best_params_)
    print(f"Optimized Accuracy (train) ({model_name}): {train_accuracy}")
    print(f"Optimized Accuracy (validation) ({model_name}): {accuracy}")

    # Create the target directory up front: a missing folder would otherwise
    # raise only after the (expensive) search has already completed.
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, f"{model_name}.json"), "w") as f:
        json.dump(
            [rand_search.best_params_, {"accuracy_validation": accuracy}, {"accuracy_train": train_accuracy}],
            f,
            default=_json_default,
        )

    return best_model

# --- Random forest: number of trees and tree depth ---
rf_model = RandomForestClassifier()
param_grid = dict(
    n_estimators=randint(2, 200),
    max_depth=randint(1, 10),
)
best_model_RandomForest = optimizing_model_parameters(
    "RandomForest", rf_model, param_grid,
    X_train, y_train, X_validation, y_validation,
)

# --- k-nearest neighbours: neighbourhood size, vote weighting, index type ---
model_KNN = KNeighborsClassifier()
param_grid = dict(
    n_neighbors=randint(1, 100),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)
# NOTE: model_KNN is rebound here from the untuned estimator to the tuned one;
# the name is kept because a later cell saves model_KNN.
model_KNN = optimizing_model_parameters(
    "KNN", model_KNN, param_grid,
    X_train, y_train, X_validation, y_validation,
)

# --- Gradient boosting ---
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so the learning
# rate below ranges over [0.001, 0.701].
gbc_model = GradientBoostingClassifier()
param_grid = dict(
    n_estimators=randint(50, 250),
    learning_rate=uniform(0.001, 0.7),
    max_depth=randint(1, 20),
    min_samples_leaf=randint(1, 20),
)
best_model_GradientBoosting = optimizing_model_parameters(
    "GradientBoosting", gbc_model, param_grid,
    X_train, y_train, X_validation, y_validation,
)

# --- Support vector machine ---
# Same (loc, scale) convention: C is drawn from [0.01, 10.01].
svm_model = SVC()
param_grid = dict(
    C=uniform(0.01, 10),
    kernel=['rbf', 'linear'],
)
best_model_SVM = optimizing_model_parameters(
    "SVM", svm_model, param_grid,
    X_train, y_train, X_validation, y_validation,
)


Optimized Parameters for RandomForest:
{'max_depth': 9, 'n_estimators': 43}
Optimized Accuracy (train) (RandomForest): 0.9612823292550721
Optimized Accuracy (validation) (RandomForest): 0.9541795665634675

Optimized Parameters for KNN:
{'algorithm': 'kd_tree', 'n_neighbors': 76, 'weights': 'distance'}
Optimized Accuracy (train) (KNN): 1.0
Optimized Accuracy (validation) (KNN): 0.9808049535603716

Optimized Parameters for GradientBoosting:
{'learning_rate': 0.18885196844001045, 'max_depth': 17, 'min_samples_leaf': 10, 'n_estimators': 57}
Optimized Accuracy (train) (GradientBoosting): 1.0
Optimized Accuracy (validation) (GradientBoosting): 0.9801857585139319

Optimized Parameters for SVM:
{'C': 3.384762876875148, 'kernel': 'rbf'}
Optimized Accuracy (train) (SVM): 0.9540034071550255
Optimized Accuracy (validation) (SVM): 0.9492260061919504


In [3]:
# Save the tuned estimator of each model family
import joblib
# One pickle per tuned estimator. The final call is left as a bare expression
# so the cell still displays the list of files joblib wrote for it.
joblib.dump(value=best_model_RandomForest, filename="models/best_model_RandomForest.pkl")
joblib.dump(value=model_KNN, filename="models/best_model_KNN.pkl")
joblib.dump(value=best_model_GradientBoosting, filename="models/best_model_GradientBoosting.pkl")
joblib.dump(value=best_model_SVM, filename="models/best_model_SVM.pkl")



['models/best_model_SVM.pkl']