# Hyperparameter Optimization

In [5]:
import pandas as pd
import numpy as np

from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import preprocessing 
from sklearn import decomposition 
from sklearn import pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [6]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [7]:
df_train.shape


(2000, 21)

In [8]:
df_test.shape

(1000, 21)

In [9]:
df_train.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [10]:
df_test.columns

Index(['id', 'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc',
       'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc',
       'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi'],
      dtype='object')

In [11]:
X = df_train.drop('price_range', axis=1).values
y = df_train.price_range.values

## Random Forest Classifier 

In [12]:

class RandomForestGridSearch:
    def __init__(self, n_jobs=-1):  #-1 uses all the cpu to its max
        self.classifier = RandomForestClassifier(n_jobs=n_jobs)
        self.param_grid = {
            'n_estimators': [100, 200, 300, 400],
            'max_depth': [1, 3, 5, 7],
            'criterion': ['gini', 'entropy']
        }
        self.model = None

    def fit(self, X, y, scoring='accuracy', verbose=10, n_jobs=1, cv=5):
        self.model = GridSearchCV(
            estimator=self.classifier,
            param_grid=self.param_grid,
            scoring=scoring, #all the classes have the same number of samples so we use accuracy
            verbose=verbose,
            n_jobs=n_jobs,
            cv=cv
        )
        self.model.fit(X, y)

    def print_best_results(self):
        print("Best Score:", self.model.best_score_)
        print("Best Parameters:", self.model.best_estimator_.get_params())

# Usage

random_forest_search = RandomForestGridSearch(n_jobs=-1)
random_forest_search.fit(X, y)
random_forest_search.print_best_results()


Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV 1/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............


[CV 1/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.578 total time=   0.2s
[CV 2/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 2/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.583 total time=   0.2s
[CV 3/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 3/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.613 total time=   0.2s
[CV 4/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 4/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.522 total time=   0.3s
[CV 5/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 5/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.568 total time=   0.1s
[CV 1/5; 2/32] START criterion=gini, max_depth=1, n_estimators=200..............
[CV 1/5; 2/32] END criterion=gini, max_depth=1, n_estimators=200;, score=0.605 total time=   0.3s
[CV 2/5

## Random Search Parameters  

In [13]:
class MyRandomForestRandomizedSearch:
    def __init__(self, n_jobs=-1): #-1 uses all the cpu to its max
        self.classifier = RandomForestClassifier(n_jobs=n_jobs)
        self.param_dist = {
            'n_estimators': np.arange(100, 1500, 100),
            'max_depth': np.arange(1, 20),
            'criterion': ['gini', 'entropy']
        }
        self.model = None

    def fit(self, X, y, scoring='accuracy', verbose=10, n_jobs=1, cv=5, n_iter=10):
        self.model = RandomizedSearchCV(
            estimator=self.classifier,
            param_distributions=self.param_dist,
            scoring=scoring,
            verbose=verbose,
            n_jobs=n_jobs,
            cv=cv,
            n_iter=n_iter #do random search 10 times
        )
        self.model.fit(X, y)

    def print_best_results(self):
        print("Best Score:", self.model.best_score_)
        print("Best Parameters:", self.model.best_estimator_.get_params())

# Usage
random_forest_random_search = MyRandomForestRandomizedSearch(n_jobs=-1)
random_forest_random_search.fit(X, y)
random_forest_random_search.print_best_results()


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START criterion=gini, max_depth=11, n_estimators=1200............
[CV 1/5; 1/10] END criterion=gini, max_depth=11, n_estimators=1200;, score=0.880 total time=   3.5s
[CV 2/5; 1/10] START criterion=gini, max_depth=11, n_estimators=1200............
[CV 2/5; 1/10] END criterion=gini, max_depth=11, n_estimators=1200;, score=0.887 total time=   3.3s
[CV 3/5; 1/10] START criterion=gini, max_depth=11, n_estimators=1200............
[CV 3/5; 1/10] END criterion=gini, max_depth=11, n_estimators=1200;, score=0.892 total time=   3.2s
[CV 4/5; 1/10] START criterion=gini, max_depth=11, n_estimators=1200............
[CV 4/5; 1/10] END criterion=gini, max_depth=11, n_estimators=1200;, score=0.868 total time=   3.1s
[CV 5/5; 1/10] START criterion=gini, max_depth=11, n_estimators=1200............
[CV 5/5; 1/10] END criterion=gini, max_depth=11, n_estimators=1200;, score=0.860 total time=   3.2s
[CV 1/5; 2/10] START criterion=gin

### Grid/Random Search With Pipeline

In [14]:
class MyRandomForestPipeline:
    def __init__(self, n_iter=10):
        self.scaler = preprocessing.StandardScaler()
        self.pca = decomposition.PCA()
        self.rf = ensemble.RandomForestClassifier(n_jobs= -1)
        self.classifier = pipeline.Pipeline([
            ('scaling', self.scaler),
            ('pca', self.pca),
            ('rf', self.rf)
        ])
        self.param_grid = {
            'pca__n_components': np.arange(5, 10),
            'rf__n_estimators': np.arange(100, 1500, 100),
            'rf__max_depth': np.arange(1, 20),
            'rf__criterion': ['gini', 'entropy']
        }
        self.model = None
        self.n_iter = n_iter

    def fit(self, X, y, scoring='accuracy', verbose=10, n_jobs=-1, cv=5):
        self.model = RandomizedSearchCV(
            estimator=self.classifier,
            param_distributions=self.param_grid,
            scoring=scoring,
            verbose=verbose,
            n_jobs=n_jobs,
            cv=cv,
            n_iter=self.n_iter
        )
        self.model.fit(X, y)

    def print_best_results(self):
        print("Best Score:", self.model.best_score_)
        print("Best Parameters:", self.model.best_estimator_.get_params())

# Usage

my_pipeline = MyRandomForestPipeline(n_iter=10)
my_pipeline.fit(X, y)
my_pipeline.print_best_results()


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Score: 0.40700000000000003
Best Parameters: {'memory': None, 'steps': [('scaling', StandardScaler()), ('pca', PCA(n_components=8)), ('rf', RandomForestClassifier(max_depth=12, n_estimators=1400, n_jobs=-1))], 'verbose': False, 'scaling': StandardScaler(), 'pca': PCA(n_components=8), 'rf': RandomForestClassifier(max_depth=12, n_estimators=1400, n_jobs=-1), 'scaling__copy': True, 'scaling__with_mean': True, 'scaling__with_std': True, 'pca__copy': True, 'pca__iterated_power': 'auto', 'pca__n_components': 8, 'pca__n_oversamples': 10, 'pca__power_iteration_normalizer': 'auto', 'pca__random_state': None, 'pca__svd_solver': 'auto', 'pca__tol': 0.0, 'pca__whiten': False, 'rf__bootstrap': True, 'rf__ccp_alpha': 0.0, 'rf__class_weight': None, 'rf__criterion': 'gini', 'rf__max_depth': 12, 'rf__max_features': 'sqrt', 'rf__max_leaf_nodes': None, 'rf__max_samples': None, 'rf__min_impurity_decrease': 0.0, 'rf__min_samples_leaf': 1, 'rf

### Bayesian Optimization with Gaussian Process

In [15]:
"""""
from functools import partial
from sklearn import metrics, ensemble, model_selection
from sklearn import metrics, ensemble, model_selection
import numpy as np
from functools import partial
"""""

"""""
cant import skopt for this functions 
def optimize(params, param_names, x, y):
    params = dict(zip(param_names, params))
    model = ensemble.RandomForestClassifier(**params)
    kf = model_selection.StratifiedKFold(n_splits= 5)
    accuracies = []
    for idx in kf.split( X= x, y=y):
        train_idx, test_idx = idx[0], idx[1]
        xtrain =x[train_idx]
        ytrain = y[train_idx]
        
        xtest = x[test_idx]
        ytest = y[test_idx]
        
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_acc)
        
    return -1.0 * np.mean(accuracies)


param_space = [
    space.Integer(3,15, name='max_depth'),
    space.Integer(100, 600, name='n_estimators'),
    space.Categorical(['gini', 'entropy'], name='criterion'),
    space.Real(0.01, 1, prior='uniform' ,name='max_features')
]
params_names = ['max_depth', 'n_estimators', 'criterion', 'max_features']

optimization_function = partial(
    optimize,
    param_names= param_names,
    x= X,
    y = y
)

result = gp_minimize(
    optimization_function,
    dimensions = param_space,
    n_calls= 15,
    n_random_starts=10,
    verbose=10,
)

print(
    dict(
        param_names,
        results.x
    )
)







same function without the use of skopt






def optimize(params, param_names, x, y):
    model_params = {
        param_names[i]: params[i] for i in range(len(param_names))
    }
    model = ensemble.RandomForestClassifier(**model_params)
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for idx in kf.split(X=x, y=y):
        train_idx, test_idx = idx[0], idx[1]
        xtrain, ytrain = x[train_idx], y[train_idx]
        xtest, ytest = x[test_idx], y[test_idx]
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_acc)
    return -1.0 * np.mean(accuracies)

def grid_search(param_space, param_names, x, y, n_points=10):
    best_params = None
    best_score = float('-inf')

    for _ in range(n_points):
        params = [np.random.uniform(low, high) if isinstance(low, float) else np.random.randint(low, high + 1) for low, high in param_space]
        score = optimize(params, param_names, x, y)

        if score > best_score:
            best_score = score
            best_params = params

    return best_params

param_space = [
    (3, 15),           # max_depth
    (100, 600),        # n_estimators
    (0.01, 1.0),        # max_features
    (0, 1)             # For categorical criterion; 0 corresponds to 'gini' and 1 to 'entropy'
]

param_names = ['max_depth', 'n_estimators', 'max_features', 'criterion']

best_params = grid_search(param_space, param_names, X, y, n_points=15)
print({param_names[i]: best_params[i] for i in range(len(param_names))})
"""

'""\ncant import skopt for this functions \ndef optimize(params, param_names, x, y):\n    params = dict(zip(param_names, params))\n    model = ensemble.RandomForestClassifier(**params)\n    kf = model_selection.StratifiedKFold(n_splits= 5)\n    accuracies = []\n    for idx in kf.split( X= x, y=y):\n        train_idx, test_idx = idx[0], idx[1]\n        xtrain =x[train_idx]\n        ytrain = y[train_idx]\n        \n        xtest = x[test_idx]\n        ytest = y[test_idx]\n        \n        model.fit(xtrain, ytrain)\n        preds = model.predict(xtest)\n        fold_acc = metrics.accuracy_score(ytest, preds)\n        accuracies.append(fold_acc)\n        \n    return -1.0 * np.mean(accuracies)\n\n\nparam_space = [\n    space.Integer(3,15, name=\'max_depth\'),\n    space.Integer(100, 600, name=\'n_estimators\'),\n    space.Categorical([\'gini\', \'entropy\'], name=\'criterion\'),\n    space.Real(0.01, 1, prior=\'uniform\' ,name=\'max_features\')\n]\nparams_names = [\'max_depth\', \'n_estim

## Optuna

In [20]:
import optuna 
from functools import partial

In [22]:
# Typo on inserting the wrong XXXXXX


"""""
import optuna
from sklearn import metrics, ensemble, model_selection
from functools import partial

def optimize(trial, x, y):
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    n_estimators = trial.suggest_int('n_estimators', 100, 1500)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    max_features = trial.suggest_uniform('max_features', 0.01, 1.0)

    model = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        criterion=criterion
    )
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for idx in kf.split(X=x, y=y):  ###X=X is the correct version, change all x to X
        train_idx, test_idx = idx[0], idx[1]
        xtrain = x[train_idx]
        ytrain = y[train_idx]

        xtest = x[test_idx]
        ytest = y[test_idx]

        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_acc)

    return -1.0 * np.mean(accuracies)

optimization_function = partial(optimize, x=X, y=y)

class OptunaRandomForestSearch:
    def __init__(self, n_jobs=-1):
        self.n_jobs = n_jobs
        self.best_params = None

    def optimize(self, n_trials=50):
        study = optuna.create_study(direction='minimize')
        study.optimize(optimization_function, n_trials=n_trials)

        self.best_params = study.best_params

    def print_best_results(self):
        print("Best Parameters:", self.best_params)

# Usage
# Assuming X, y, and OptunaRandomForestSearch are defined
optuna_random_forest_search = OptunaRandomForestSearch(n_jobs=-1)
optuna_random_forest_search.optimize(n_trials=50)
optuna_random_forest_search.print_best_results()
"""""

[I 2024-01-22 19:51:23,123] A new study created in memory with name: no-name-d97a290b-59d5-45eb-a249-defec51161c5
  max_features = trial.suggest_uniform('max_features', 0.01, 1.0)
[I 2024-01-22 19:52:13,468] Trial 0 finished with value: -0.8875 and parameters: {'criterion': 'gini', 'n_estimators': 1364, 'max_depth': 7, 'max_features': 0.4103747794299047}. Best is trial 0 with value: -0.8875.
  max_features = trial.suggest_uniform('max_features', 0.01, 1.0)
[I 2024-01-22 19:52:33,771] Trial 1 finished with value: -0.7765000000000001 and parameters: {'criterion': 'gini', 'n_estimators': 1297, 'max_depth': 9, 'max_features': 0.025532469337571882}. Best is trial 0 with value: -0.8875.
  max_features = trial.suggest_uniform('max_features', 0.01, 1.0)
[I 2024-01-22 19:53:18,939] Trial 2 finished with value: -0.7675 and parameters: {'criterion': 'gini', 'n_estimators': 1411, 'max_depth': 3, 'max_features': 0.957543904918412}. Best is trial 0 with value: -0.8875.
  max_features = trial.suggest

Best Parameters: {'criterion': 'entropy', 'n_estimators': 1230, 'max_depth': 13, 'max_features': 0.7604498728878826}


In [23]:

import optuna
from sklearn import metrics, ensemble, model_selection
from functools import partial

def optimize(trial, X, y):  # Corrected X here
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    n_estimators = trial.suggest_int('n_estimators', 100, 1500)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    max_features = trial.suggest_uniform('max_features', 0.01, 1.0)

    model = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        criterion=criterion
    )
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for idx in kf.split(X=X, y=y):  # Corrected X here
        train_idx, test_idx = idx[0], idx[1]
        xtrain = X[train_idx]  # Corrected X here
        ytrain = y[train_idx]

        xtest = X[test_idx]  # Corrected X here
        ytest = y[test_idx]

        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_acc)

    return -1.0 * np.mean(accuracies)

optimization_function = partial(optimize, X=X, y=y)

class OptunaRandomForestSearch:
    def __init__(self, n_jobs=-1):
        self.n_jobs = n_jobs
        self.best_params = None

    def optimize(self, n_trials=50):
        study = optuna.create_study(direction='minimize')
        study.optimize(optimization_function, n_trials=n_trials)

        self.best_params = study.best_params

    def print_best_results(self):
        print("Best Parameters:", self.best_params)

# Usage
# Assuming X, y, and OptunaRandomForestSearch are defined
optuna_random_forest_search = OptunaRandomForestSearch(n_jobs=-1)
optuna_random_forest_search.optimize(n_trials=50)
optuna_random_forest_search.print_best_results()


[I 2024-01-22 21:21:13,316] A new study created in memory with name: no-name-71ee025d-b5ba-4776-9537-b0ae80f3b138
  max_features = trial.suggest_uniform('max_features', 0.01, 1.0)
[I 2024-01-22 21:21:39,070] Trial 0 finished with value: -0.9019999999999999 and parameters: {'criterion': 'gini', 'n_estimators': 459, 'max_depth': 13, 'max_features': 0.5396741128923808}. Best is trial 0 with value: -0.9019999999999999.
  max_features = trial.suggest_uniform('max_features', 0.01, 1.0)
[I 2024-01-22 21:22:41,906] Trial 1 finished with value: -0.9029999999999999 and parameters: {'criterion': 'entropy', 'n_estimators': 1190, 'max_depth': 10, 'max_features': 0.4117219344873095}. Best is trial 1 with value: -0.9029999999999999.
  max_features = trial.suggest_uniform('max_features', 0.01, 1.0)
[I 2024-01-22 21:23:10,782] Trial 2 finished with value: -0.772 and parameters: {'criterion': 'entropy', 'n_estimators': 1280, 'max_depth': 12, 'max_features': 0.09489436141348773}. Best is trial 1 with val

Best Parameters: {'criterion': 'entropy', 'n_estimators': 1368, 'max_depth': 12, 'max_features': 0.5692850291373351}


#### Optimization History Plot


In [24]:
optuna.visualization.plot_optimization_history(study)

NameError: name 'study' is not defined

In [None]:
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
1.Best Parameters: {'criterion': 'entropy', 'n_estimators': 1230, 'max_depth': 13, 'max_features': 0.7604498728878826}
2.Best Parameters: {'criterion': 'entropy', 'n_estimators': 1368, 'max_depth': 12, 'max_features': 0.5692850291373351}

In [None]:
optuna.visualization.plot_slice(study, params = ['criterion',
                                                 'entropy',
                                                 'n_estimators',
                                                 'max_depth',
                                                 'min_samples_split',
                                                 'max_features',
                                                 ])

In [None]:
optuna.visualization.plot_param_importances()

#### Assigning the Best Hyperparameters

In [None]:
best_criterion = best_params['criterion']##
best_criterion

In [None]:
best_entropy = best_params['entropy']
best_entropy

In [None]:
34
best_n_estimators = best_params['n_estimators']##
best_n_estimators

In [None]:
best_max_depth = best_params['max_depth']##
best_max_depth

In [None]:

best_min_samples_split = best_params['min_samples_split']
best_min_samples_split

In [None]:
best_max_features = best_params['max_features']
best_max_features

## Ramdom Forest Model with The Best HyperParatmeters

In [None]:

class RandomForestGridSearch:
    def __init__(self, n_jobs=-1):  #-1 uses all the cpu to its max
        self.classifier = RandomForestClassifier(n_jobs=n_jobs)
        self.param_grid = {
            'n_estimators': best_n_estimators,
            'max_depth': best_max_depth,
            'criterion': best_criterion
        }
        self.model = None

    def fit(self, X, y, scoring='accuracy', verbose=10, n_jobs=1, cv=5):
        self.model = GridSearchCV(
            estimator=self.classifier,
            param_grid=self.param_grid,
            scoring=scoring, #all the classes have the same number of samples so we use accuracy
            verbose=verbose,
            n_jobs=n_jobs,
            cv=cv
        )
        self.model.fit(X, y)

    def print_best_results(self):
        print("Best Score:", self.model.best_score_)
        print("Best Parameters:", self.model.best_estimator_.get_params())

# Usage

random_forest_search = RandomForestGridSearch(n_jobs=-1)
random_forest_search.fit(X, y)
random_forest_search.print_best_results()