### HYPERPARAMETER OPTIMIZATION WITH HYPEROPT

In [67]:
# import packages 
import warnings

import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from hyperopt.pyll.base import scope
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# NOTE(review): this silences *every* warning for the whole kernel session,
# including deprecation warnings from the libraries used below -- consider
# narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [68]:
# Read the mobile-price CSV (path is relative to the notebook's working
# directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="data/mobile_price_data.csv")

In [69]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [70]:
# Dimensions of the frame as a (rows, columns) tuple.
data.shape

(2000, 21)

In [71]:
# Column names as a plain Python list (features plus the price_range target).
data.columns.tolist()

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [72]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
data.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,...,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507,1.5
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,...,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076,1.118314
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,...,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,...,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0,0.75
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,...,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,1.5
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,...,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,2.25
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,...,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,3.0


In [73]:
# Print the index range plus each column's non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [74]:
# Count missing entries per column; a column of zeros means nothing to impute.
data.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [75]:
# Separate the target column from the predictors and convert both to
# NumPy arrays: X holds every column except price_range, y holds price_range.
y = data["price_range"].to_numpy()
X = data.drop(columns=["price_range"]).to_numpy()

In [76]:
# Standardize the feature matrix: learn the scaling from X, then apply it
# to the same X. The fitted scaler is kept in `scaler`.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [84]:
# define parameter space
#
# NOTE: parameters declared with hp.choice are reported by fmin() as the
# *index* of the selected option, not the option itself.

space = {
    "n_estimators": hp.choice("n_estimators", [100, 200, 300, 400, 500, 600]),
    # hp.quniform samples floats (e.g. 11.0). max_depth must be an integer for
    # RandomForestClassifier, so cast the sampled value inside the search space.
    "max_depth": scope.int(hp.quniform("max_depth", 1, 15, 1)),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
}

In [85]:
# define objective function

def hyperparameter_tuning(params, random_state=42):
    """Objective for hyperopt: negative mean cross-validated accuracy.

    Parameters
    ----------
    params : dict
        Keyword arguments for RandomForestClassifier, as sampled from the
        search space.
    random_state : int, default 42
        Seed for the forest, used unless ``params`` already provides one.
        Without it, two evaluations of the same ``params`` give different
        scores, which adds noise to the search.

    Returns
    -------
    dict
        ``{"loss": -accuracy, "status": STATUS_OK}``. The accuracy is negated
        because hyperopt minimizes the loss.
    """
    # Work on a copy so the dict handed in by the caller is never mutated.
    params = dict(params)

    # hp.quniform produces floats (e.g. 11.0); the forest needs an int depth.
    if params.get("max_depth") is not None:
        params["max_depth"] = int(params["max_depth"])

    params.setdefault("random_state", random_state)

    clf = RandomForestClassifier(**params, n_jobs=-1)
    acc = cross_val_score(clf, X_scaled, y, scoring="accuracy").mean()
    return {"loss": -acc, "status": STATUS_OK}

In [86]:
# Fine tune the model
# `trials` records every evaluation so the search can be inspected afterwards.
trials = Trials()

# NOTE(review): the search itself is unseeded; fmin accepts an `rstate`
# argument for reproducible runs -- check the expected type for the
# installed hyperopt version before adding it.
best = fmin(
    fn=hyperparameter_tuning,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

# `best` holds the raw sampled values: for hp.choice parameters that is the
# index into the option list (n_estimators == 2 means the third option),
# not the option itself.
print("Best: {}".format(best))

# Map the raw result back through the search space to get the actual
# hyperparameter values, ready to pass to RandomForestClassifier.
best_params = space_eval(space, best)
print("Best (decoded): {}".format(best_params))

100%|█████████████████████████████████████████████████████████| 100/100 [10:30<00:00,  6.30s/trial, best loss: -0.8915]
Best: {'criterion': 1, 'max_depth': 11.0, 'n_estimators': 2}


In [96]:
# A list of dictionaries representing everything about the search: one entry
# per trial with its id ('tid'), 'state', the objective's 'result', the
# sampled values under 'misc' -> 'vals', and book/refresh timestamps.

trials.trials

[{'state': 2,
  'tid': 0,
  'spec': None,
  'result': {'loss': -0.8790000000000001, 'status': 'ok'},
  'misc': {'tid': 0,
   'cmd': ('domain_attachment', 'FMinIter_Domain'),
   'workdir': None,
   'idxs': {'criterion': [0], 'max_depth': [0], 'n_estimators': [0]},
   'vals': {'criterion': [1], 'max_depth': [10.0], 'n_estimators': [3]}},
  'exp_key': None,
  'owner': None,
  'version': 0,
  'book_time': datetime.datetime(2020, 9, 8, 14, 19, 31, 375000),
  'refresh_time': datetime.datetime(2020, 9, 8, 14, 19, 39, 277000)},
 {'state': 2,
  'tid': 1,
  'spec': None,
  'result': {'loss': -0.877, 'status': 'ok'},
  'misc': {'tid': 1,
   'cmd': ('domain_attachment', 'FMinIter_Domain'),
   'workdir': None,
   'idxs': {'criterion': [1], 'max_depth': [1], 'n_estimators': [1]},
   'vals': {'criterion': [0], 'max_depth': [13.0], 'n_estimators': [0]}},
  'exp_key': None,
  'owner': None,
  'version': 0,
  'book_time': datetime.datetime(2020, 9, 8, 14, 19, 39, 285000),
  'refresh_time': datetime.date

In [97]:
# A list of the dictionaries returned by the objective function during the
# search, one per trial: {'loss': ..., 'status': ...}.
trials.results

[{'loss': -0.8790000000000001, 'status': 'ok'},
 {'loss': -0.877, 'status': 'ok'},
 {'loss': -0.768, 'status': 'ok'},
 {'loss': -0.8205, 'status': 'ok'},
 {'loss': -0.8720000000000001, 'status': 'ok'},
 {'loss': -0.883, 'status': 'ok'},
 {'loss': -0.8554999999999999, 'status': 'ok'},
 {'loss': -0.8789999999999999, 'status': 'ok'},
 {'loss': -0.595, 'status': 'ok'},
 {'loss': -0.8765000000000001, 'status': 'ok'},
 {'loss': -0.877, 'status': 'ok'},
 {'loss': -0.8775000000000001, 'status': 'ok'},
 {'loss': -0.776, 'status': 'ok'},
 {'loss': -0.882, 'status': 'ok'},
 {'loss': -0.8744999999999999, 'status': 'ok'},
 {'loss': -0.825, 'status': 'ok'},
 {'loss': -0.757, 'status': 'ok'},
 {'loss': -0.8765000000000001, 'status': 'ok'},
 {'loss': -0.674, 'status': 'ok'},
 {'loss': -0.843, 'status': 'ok'},
 {'loss': -0.8579999999999999, 'status': 'ok'},
 {'loss': -0.876, 'status': 'ok'},
 {'loss': -0.866, 'status': 'ok'},
 {'loss': -0.881, 'status': 'ok'},
 {'loss': -0.8825, 'status': 'ok'},
 {'los

In [98]:
# A list of losses (one float per 'ok' trial), in trial order; these are the
# same numbers as the 'loss' entries of trials.results.
trials.losses()

[-0.8790000000000001,
 -0.877,
 -0.768,
 -0.8205,
 -0.8720000000000001,
 -0.883,
 -0.8554999999999999,
 -0.8789999999999999,
 -0.595,
 -0.8765000000000001,
 -0.877,
 -0.8775000000000001,
 -0.776,
 -0.882,
 -0.8744999999999999,
 -0.825,
 -0.757,
 -0.8765000000000001,
 -0.674,
 -0.843,
 -0.8579999999999999,
 -0.876,
 -0.866,
 -0.881,
 -0.8825,
 -0.8879999999999999,
 -0.8895,
 -0.8875,
 -0.8805,
 -0.8855000000000001,
 -0.873,
 -0.8880000000000001,
 -0.8865000000000001,
 -0.8875,
 -0.8869999999999999,
 -0.8915,
 -0.8780000000000001,
 -0.8885000000000002,
 -0.8825,
 -0.8785000000000001,
 -0.8614999999999998,
 -0.8795,
 -0.8755,
 -0.8835,
 -0.867,
 -0.8800000000000001,
 -0.8819999999999999,
 -0.8779999999999999,
 -0.885,
 -0.853,
 -0.8799999999999999,
 -0.8100000000000002,
 -0.8379999999999999,
 -0.8879999999999999,
 -0.8695,
 -0.8870000000000001,
 -0.8815,
 -0.8855000000000001,
 -0.8795,
 -0.5504999999999999,
 -0.8860000000000001,
 -0.7849999999999999,
 -0.8844999999999998,
 -0.861000000000

In [95]:
# A list of status strings, one per trial ('ok' marks a completed evaluation).
trials.statuses() 

['ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok',
 'ok']