### HYPERPARAMETER OPTIMIZATION WITH GRIDSERCHCV

In [1]:
# import packages 
import numpy as np 
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics 
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler 

import warnings
warnings.filterwarnings("ignore")

In [2]:
# load data 
data = pd.read_csv("data/mobile_price_data.csv")

In [3]:
#read data 
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
#show shape
data.shape

(2000, 21)

In [5]:
#show list of columns 
list(data.columns)

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [6]:
# show data properties
data.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,...,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507,1.5
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,...,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076,1.118314
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,...,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,...,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0,0.75
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,...,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,1.5
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,...,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,2.25
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,...,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,3.0


In [7]:
# show data information 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [8]:
# check if it has missing values 
data.isnull().sum() 

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [9]:
# split data into features and target 
X = data.drop("price_range", axis=1).values 
y = data.price_range.values

In [10]:
# standardize the feature variables 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [11]:
#Create classifier 
rf_classifier = RandomForestClassifier(n_jobs=-1) 

In [12]:
# set different parameter values to tune
param_grid = {
    "n_estimators": [100, 200, 300, 400],
    "max_depth": [1, 3, 5, 7, 9],
    "criterion": ["gini", "entropy"],
}

In [13]:
# set gridsearch
model = GridSearchCV(
    estimator=rf_classifier, param_grid=param_grid, cv=5, verbose=2, n_jobs=1
)

In [14]:
# train the model with gridserchCV 
model.fit(X_scaled,y)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] criterion=gini, max_depth=1, n_estimators=100 ...................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .... criterion=gini, max_depth=1, n_estimators=100, total=   5.2s
[CV] criterion=gini, max_depth=1, n_estimators=100 ...................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.1s remaining:    0.0s


[CV] .... criterion=gini, max_depth=1, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=1, n_estimators=100 ...................
[CV] .... criterion=gini, max_depth=1, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=1, n_estimators=100 ...................
[CV] .... criterion=gini, max_depth=1, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=1, n_estimators=100 ...................
[CV] .... criterion=gini, max_depth=1, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=1, n_estimators=200 ...................
[CV] .... criterion=gini, max_depth=1, n_estimators=200, total=   0.7s
[CV] criterion=gini, max_depth=1, n_estimators=200 ...................
[CV] .... criterion=gini, max_depth=1, n_estimators=200, total=   0.7s
[CV] criterion=gini, max_depth=1, n_estimators=200 ...................
[CV] .... criterion=gini, max_depth=1, n_estimators=200, total=   0.6s
[CV] criterion=gini, max_depth=1, n_estimators=200 ...................
[CV] .

[CV] .... criterion=gini, max_depth=5, n_estimators=400, total=   1.2s
[CV] criterion=gini, max_depth=7, n_estimators=100 ...................
[CV] .... criterion=gini, max_depth=7, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=7, n_estimators=100 ...................
[CV] .... criterion=gini, max_depth=7, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=7, n_estimators=100 ...................
[CV] .... criterion=gini, max_depth=7, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=7, n_estimators=100 ...................
[CV] .... criterion=gini, max_depth=7, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=7, n_estimators=100 ...................
[CV] .... criterion=gini, max_depth=7, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=7, n_estimators=200 ...................
[CV] .... criterion=gini, max_depth=7, n_estimators=200, total=   0.7s
[CV] criterion=gini, max_depth=7, n_estimators=200 ...................
[CV] .

[CV] . criterion=entropy, max_depth=1, n_estimators=400, total=   1.1s
[CV] criterion=entropy, max_depth=1, n_estimators=400 ................
[CV] . criterion=entropy, max_depth=1, n_estimators=400, total=   1.1s
[CV] criterion=entropy, max_depth=1, n_estimators=400 ................
[CV] . criterion=entropy, max_depth=1, n_estimators=400, total=   1.1s
[CV] criterion=entropy, max_depth=3, n_estimators=100 ................
[CV] . criterion=entropy, max_depth=3, n_estimators=100, total=   0.4s
[CV] criterion=entropy, max_depth=3, n_estimators=100 ................
[CV] . criterion=entropy, max_depth=3, n_estimators=100, total=   0.4s
[CV] criterion=entropy, max_depth=3, n_estimators=100 ................
[CV] . criterion=entropy, max_depth=3, n_estimators=100, total=   0.4s
[CV] criterion=entropy, max_depth=3, n_estimators=100 ................
[CV] . criterion=entropy, max_depth=3, n_estimators=100, total=   0.4s
[CV] criterion=entropy, max_depth=3, n_estimators=100 ................
[CV] .

[CV] . criterion=entropy, max_depth=7, n_estimators=400, total=   2.0s
[CV] criterion=entropy, max_depth=7, n_estimators=400 ................
[CV] . criterion=entropy, max_depth=7, n_estimators=400, total=   1.5s
[CV] criterion=entropy, max_depth=7, n_estimators=400 ................
[CV] . criterion=entropy, max_depth=7, n_estimators=400, total=   1.5s
[CV] criterion=entropy, max_depth=7, n_estimators=400 ................
[CV] . criterion=entropy, max_depth=7, n_estimators=400, total=   1.5s
[CV] criterion=entropy, max_depth=7, n_estimators=400 ................
[CV] . criterion=entropy, max_depth=7, n_estimators=400, total=   1.7s
[CV] criterion=entropy, max_depth=9, n_estimators=100 ................
[CV] . criterion=entropy, max_depth=9, n_estimators=100, total=   0.9s
[CV] criterion=entropy, max_depth=9, n_estimators=100 ................
[CV] . criterion=entropy, max_depth=9, n_estimators=100, total=   0.5s
[CV] criterion=entropy, max_depth=9, n_estimators=100 ................
[CV] .

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  3.2min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1), n_jobs=1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 3, 5, 7, 9],
                         'n_estimators': [100, 200, 300, 400]},
             verbose=2)

In [15]:
# print the best score and estimator 
print(model.best_score_)
print(model.best_estimator_.get_params())

0.882
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 9, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 300, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
