# Hyperparameter Optimization

In [11]:
import pandas as pd
import numpy as np

from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import preprocessing 
from sklearn import decomposition 
from sklearn import pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [5]:
df_train.shape


(2000, 21)

In [6]:
df_test.shape

(1000, 21)

In [7]:
df_train.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [8]:
df_test.columns

Index(['id', 'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc',
       'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc',
       'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi'],
      dtype='object')

In [9]:
X = df_train.drop('price_range', axis=1).values
y = df_train.price_range.values

## Random Forest Classifier 

In [12]:

class RandomForestGridSearch:
    def __init__(self, n_jobs=-1):  #-1 uses all the cpu to its max
        self.classifier = RandomForestClassifier(n_jobs=n_jobs)
        self.param_grid = {
            'n_estimators': [100, 200, 300, 400],
            'max_depth': [1, 3, 5, 7],
            'criterion': ['gini', 'entropy']
        }
        self.model = None

    def fit(self, X, y, scoring='accuracy', verbose=10, n_jobs=1, cv=5):
        self.model = GridSearchCV(
            estimator=self.classifier,
            param_grid=self.param_grid,
            scoring=scoring, #all the classes have the same number of samples so we use accuracy
            verbose=verbose,
            n_jobs=n_jobs,
            cv=cv
        )
        self.model.fit(X, y)

    def print_best_results(self):
        print("Best Score:", self.model.best_score_)
        print("Best Parameters:", self.model.best_estimator_.get_params())

# Usage

random_forest_search = RandomForestGridSearch(n_jobs=-1)
random_forest_search.fit(X, y)
random_forest_search.print_best_results()


Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV 1/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 1/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.565 total time=   0.1s
[CV 2/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 2/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.568 total time=   0.1s
[CV 3/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 3/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.615 total time=   0.1s
[CV 4/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 4/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.593 total time=   0.1s
[CV 5/5; 1/32] START criterion=gini, max_depth=1, n_estimators=100..............
[CV 5/5; 1/32] END criterion=gini, max_depth=1, n_estimators=100;, score=0.578 total time=   0.1s
[CV 1/5; 2/32] START criterion=gini, max_de

## Random Search Parameters  

In [13]:
class MyRandomForestRandomizedSearch:
    def __init__(self, n_jobs=-1): #-1 uses all the cpu to its max
        self.classifier = RandomForestClassifier(n_jobs=n_jobs)
        self.param_dist = {
            'n_estimators': np.arange(100, 1500, 100),
            'max_depth': np.arange(1, 20),
            'criterion': ['gini', 'entropy']
        }
        self.model = None

    def fit(self, X, y, scoring='accuracy', verbose=10, n_jobs=1, cv=5, n_iter=10):
        self.model = RandomizedSearchCV(
            estimator=self.classifier,
            param_distributions=self.param_dist,
            scoring=scoring,
            verbose=verbose,
            n_jobs=n_jobs,
            cv=cv,
            n_iter=n_iter #do random search 10 times
        )
        self.model.fit(X, y)

    def print_best_results(self):
        print("Best Score:", self.model.best_score_)
        print("Best Parameters:", self.model.best_estimator_.get_params())

# Usage
random_forest_random_search = MyRandomForestRandomizedSearch(n_jobs=-1)
random_forest_random_search.fit(X, y)
random_forest_random_search.print_best_results()


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START criterion=entropy, max_depth=2, n_estimators=200...........
[CV 1/5; 1/10] END criterion=entropy, max_depth=2, n_estimators=200;, score=0.695 total time=   0.3s
[CV 2/5; 1/10] START criterion=entropy, max_depth=2, n_estimators=200...........
[CV 2/5; 1/10] END criterion=entropy, max_depth=2, n_estimators=200;, score=0.677 total time=   0.3s
[CV 3/5; 1/10] START criterion=entropy, max_depth=2, n_estimators=200...........
[CV 3/5; 1/10] END criterion=entropy, max_depth=2, n_estimators=200;, score=0.713 total time=   0.3s
[CV 4/5; 1/10] START criterion=entropy, max_depth=2, n_estimators=200...........
[CV 4/5; 1/10] END criterion=entropy, max_depth=2, n_estimators=200;, score=0.690 total time=   0.3s
[CV 5/5; 1/10] START criterion=entropy, max_depth=2, n_estimators=200...........
[CV 5/5; 1/10] END criterion=entropy, max_depth=2, n_estimators=200;, score=0.690 total time=   0.3s
[CV 1/5; 2/10] START criterio

### Grid/Random Search With Pipeline

In [18]:
class MyRandomForestPipeline:
    def __init__(self, n_iter=10):
        self.scaler = preprocessing.StandardScaler()
        self.pca = decomposition.PCA()
        self.rf = ensemble.RandomForestClassifier(n_jobs= -1)
        self.classifier = pipeline.Pipeline([
            ('scaling', self.scaler),
            ('pca', self.pca),
            ('rf', self.rf)
        ])
        self.param_grid = {
            'pca__n_components': np.arange(5, 10),
            'rf__n_estimators': np.arange(100, 1500, 100),
            'rf__max_depth': np.arange(1, 20),
            'rf__criterion': ['gini', 'entropy']
        }
        self.model = None
        self.n_iter = n_iter

    def fit(self, X, y, scoring='accuracy', verbose=10, n_jobs=-1, cv=5):
        self.model = RandomizedSearchCV(
            estimator=self.classifier,
            param_distributions=self.param_grid,
            scoring=scoring,
            verbose=verbose,
            n_jobs=n_jobs,
            cv=cv,
            n_iter=self.n_iter
        )
        self.model.fit(X, y)

    def print_best_results(self):
        print("Best Score:", self.model.best_score_)
        print("Best Parameters:", self.model.best_estimator_.get_params())

# Usage

my_pipeline = MyRandomForestPipeline(n_iter=10)
my_pipeline.fit(X, y)
my_pipeline.print_best_results()


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Score: 0.413
Best Parameters: {'memory': None, 'steps': [('scaling', StandardScaler()), ('pca', PCA(n_components=8)), ('rf', RandomForestClassifier(criterion='entropy', max_depth=2, n_estimators=1200,
                       n_jobs=-1))], 'verbose': False, 'scaling': StandardScaler(), 'pca': PCA(n_components=8), 'rf': RandomForestClassifier(criterion='entropy', max_depth=2, n_estimators=1200,
                       n_jobs=-1), 'scaling__copy': True, 'scaling__with_mean': True, 'scaling__with_std': True, 'pca__copy': True, 'pca__iterated_power': 'auto', 'pca__n_components': 8, 'pca__n_oversamples': 10, 'pca__power_iteration_normalizer': 'auto', 'pca__random_state': None, 'pca__svd_solver': 'auto', 'pca__tol': 0.0, 'pca__whiten': False, 'rf__bootstrap': True, 'rf__ccp_alpha': 0.0, 'rf__class_weight': None, 'rf__criterion': 'entropy', 'rf__max_depth': 2, 'rf__max_features': 'sqrt', 'rf__max_leaf_nodes': None, 'rf__max_sample

### Bayesian Optimization with Gaussian Process

In [4]:
from functools import partial
import skopt

ModuleNotFoundError: No module named 'skopt'

In [23]:
def optimize(params, param_names, x, y):
    params = dict(zip(param_names, params))
    model = ensemble.RandomForestClassifier(**params)
    kf = model_selection.StratifiedKFold(n_splits= 5)
    accuracies = []
    for idx in kf.split( X= x, y=y):
        train_idx, test_idx = idx[0], idx[1]
        xtrain =x[train_idx]
        ytrain = y[train_idx]
        
        xtest = x[test_idx]
        ytest = y[test_idx]
        
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_acc)
        
    return -1.0 * np.mean(accuracies)


param_space = [
    space.Integer(3,15, name='max_depth'),
    space.Integer(100, 600, name='n_estimators'),
    space.Categorical(['gini', 'entropy'], name='criterion'),
    space.Real(0.01, 1, prior='uniform' ,name='max_features')
]
params_names = ['max_depth', 'n_estimators', 'criterion', 'max_features']

optimization_function = partial(
    optimize,
    param_names= param_names,
    x= X,
    y = y
)

result = gp_minimize(
    optimization_function,
    dimensions = param_space,
    n_calls= 15,
    n_random_starts=10,
    verbose=10,
)

print(
    dict(
        param_names,
        results.x
    )
)

NameError: name 'space' is not defined