In [None]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import ensemble

In [None]:
# Load the cellphone dataset; the path is resolved relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='./Cellphone.csv')

In [None]:
# Split the frame into the target vector ('Price') and the feature matrix
# (every other column), both as NumPy arrays.
y = df['Price'].values
X = df.drop(columns='Price').values
# Display the two shapes as a quick sanity check that the row counts match.
X.shape, y.shape

In [None]:
# Base estimator shared by the hyperparameter searches below.
# random_state is fixed so that cross-validation scores are reproducible between runs;
# n_jobs=-1 lets the forest use all available cores.
# NOTE(review): 'Price' is treated as a class label here - if it is a continuous value,
# a regressor (and a regression metric) would be the appropriate choice. TODO confirm.
model = ensemble.RandomForestClassifier(n_jobs=-1, random_state=42)

# Grid Search

Grid search is a popular hyperparameter optimization technique used in machine learning to find the best combination of hyperparameters for a model. It exhaustively evaluates every combination of the parameter values that we provide, scores each one with cross-validation, and keeps the combination with the best score (equivalently, the lowest loss).

In [None]:
# Candidate values for each hyperparameter; the grid search tries every combination.
parameter_grid = dict(
    n_estimators=[200, 300, 500, 700, 900],
    max_depth=[4, 6, 8, 10, 12, 14],
    # criterion=['gini', 'entropy'],  # currently left out of the grid
)

# 5 x 6 = 30 combinations, each one scored by accuracy with 3-fold cross-validation.
grid_search = model_selection.GridSearchCV(
    model,
    parameter_grid,
    cv=3,  # depends on the dataset size (2-5 for 100 datapoints, 5-10 for 1000, etc)
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X, y)

# Best mean cross-validated accuracy and the full parameter set of the refitted best model.
grid_search.best_score_, grid_search.best_estimator_.get_params()

# Random Search

Here, we give a range of values for each hyperparameter to be tested. Instead of trying every combination, the search randomly samples a fixed number of combinations from these ranges to find a good set of hyperparameters.

In [None]:
# Value ranges to sample from; unlike the grid search, only a random subset of the
# combinations is evaluated.
# NOTE: this cell reuses (and therefore overwrites) the names `parameter_grid` and
# `grid_search` from the grid-search cell above.
parameter_grid = {
    "n_estimators": np.arange(100, 1000, 100),  # 100, 200, ..., 900
    "max_depth": np.arange(3, 20, 2),           # 3, 5, ..., 19
    "criterion": ['gini', 'entropy'],
}

grid_search = model_selection.RandomizedSearchCV(
    estimator=model,
    param_distributions=parameter_grid,
    n_iter=10,        # number of sampled combinations (the library default, made explicit)
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    cv=3,
    random_state=42,  # fix the sampled candidates so re-running the cell is reproducible
)
grid_search.fit(X, y)

# Best mean cross-validated accuracy and the full parameter set of the refitted best model.
grid_search.best_score_, grid_search.best_estimator_.get_params()

# Optuna

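Optuna is a hyperparameter optimisation framework: a *study* repeatedly calls an objective function, each *trial* suggests a set of hyperparameters, and the study keeps track of the trial with the lowest returned value. I don't fully understand how it works yet, so the cell below is a small example that chooses between an SVR and a random forest regressor.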

In [2]:
import sklearn
# model_selection and metrics are imported explicitly: the objective uses both, and
# relying on them being loaded as a side effect of other sklearn imports is fragile.
from sklearn import datasets, ensemble, metrics, model_selection, svm
import optuna


def _load_housing_split():
    """Fetch the California housing data and return (X_train, X_val, y_train, y_val).

    The split uses random_state=0, so it is identical on every call.
    """
    features, target = datasets.fetch_california_housing(return_X_y=True)
    return model_selection.train_test_split(features, target, random_state=0)


# Load and split once, outside the objective: the data does not depend on the trial,
# so re-fetching it for every trial is wasted work. The names are prefixed with
# `housing_` so they do not clash with other `X` / `y` variables in the notebook.
housing_X_train, housing_X_val, housing_y_train, housing_y_val = _load_housing_split()


# Define an objective function to be minimized.
def objective(trial):
    """Train one candidate regressor and return its validation mean squared error.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to suggest the hyperparameters for this run.

    Returns
    -------
    float
        Mean squared error on the validation split (lower is better).
    """
    # Invoke suggest methods of a Trial object to generate hyperparameters.
    # NOTE: the parameter is named 'classifier' although it selects a regressor; the
    # name is kept so that the keys reported in study.best_params stay the same.
    regressor_name = trial.suggest_categorical('classifier', ['SVR', 'RandomForest'])
    if regressor_name == 'SVR':
        # C is searched on a log scale because the range spans many orders of magnitude.
        svr_c = trial.suggest_float('svr_c', 1e-10, 1e10, log=True)
        regressor_obj = svm.SVR(C=svr_c)
    else:
        rf_max_depth = trial.suggest_int('rf_max_depth', 2, 32)
        # Seed the forest so that a trial's score depends only on its hyperparameters.
        regressor_obj = ensemble.RandomForestRegressor(max_depth=rf_max_depth, random_state=0)

    regressor_obj.fit(housing_X_train, housing_y_train)
    y_pred = regressor_obj.predict(housing_X_val)

    error = metrics.mean_squared_error(housing_y_val, y_pred)

    return error  # An objective value linked with the Trial object.


# Create a new study. 'minimize' is the default direction, stated explicitly here;
# the sampler is seeded so the sequence of suggested hyperparameters is reproducible.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=3)  # Invoke optimization of the objective function.

[I 2024-07-06 20:53:28,552] A new study created in memory with name: no-name-b1359a50-2cc6-43d1-8bb8-1f14972b0c5c
[I 2024-07-06 20:53:35,460] Trial 0 finished with value: 0.2745601016490047 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 25}. Best is trial 0 with value: 0.2745601016490047.
[I 2024-07-06 20:53:42,054] Trial 1 finished with value: 0.27337328754356205 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 19}. Best is trial 1 with value: 0.27337328754356205.
[I 2024-07-06 20:53:48,908] Trial 2 finished with value: 0.27420680109724266 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 22}. Best is trial 1 with value: 0.27337328754356205.
