In [15]:
import joblib
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [12]:
# Read the prepared dataset from disk.
df = pd.read_csv('../data/final_df.csv')

# Split the frame into the label column (y) and every remaining column (X).
y = df['target']
X = df.drop(columns='target')

In [13]:
# Search space for the random forest: every combination of the values below
# is evaluated (3 * 4 * 3 * 3 = 108 candidates).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search scored by accuracy with 3-fold cross-validation,
# running on all available CPUs with verbose progress output.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X, y)

# Report the winning parameter combination and its cross-validation score.
print("Best Params:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Params: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best Accuracy: 0.8114478114478114
Best Params: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best Accuracy: 0.8114478114478114


In [14]:
# Define Hyperparameter Distribution for RandomizedSearchCV
# Each entry is a list of discrete candidate values; the search samples
# n_iter combinations out of the 3 * 4 * 3 * 3 = 108 possible ones.
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Randomized Search
# random_state is fixed on the search itself (not only on the estimator):
# without it a different subset of candidates is drawn on every run, so the
# reported best parameters and score would not be reproducible after a
# kernel restart.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    scoring='accuracy',
    n_iter=50,        # Number of parameter settings sampled
    cv=3,             # 3-fold cross-validation
    verbose=2,
    n_jobs=-1,        # Use all CPUs
    random_state=42   # Make the sampled candidates reproducible
)

random_search.fit(X, y)

# Evaluate Best Model
print("Best Params:", random_search.best_params_)
print("Best Accuracy:", random_search.best_score_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Params: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': None}
Best Accuracy: 0.8114478114478114
Best Params: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': None}
Best Accuracy: 0.8114478114478114


In [17]:
# Persist the best estimator found by the grid search to disk with joblib.
best_grid_model = grid_search.best_estimator_
joblib.dump(best_grid_model, '../models/best_model_grid.pkl')

['../models/best_model_grid.pkl']