# HYPER-PARAMETER TUNING
this is following up to my previous notebook about keras-tuner

## Imports

***drive mount***

In [1]:
from google.colab import drive 
drive.mount("/content/drive")

Mounted at /content/drive


***general imports***

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
from IPython.display import Image

***tensorflow imports***

In [3]:
import tensorflow as tf 

***sklearn imports***

In [4]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import metrics 

## Load Data

***load data .csv file***

In [5]:
# assign dataset path 
dataset_path = "/content/drive/MyDrive/Youtube/6 - hyperparameter tuning/dataset/dataset.csv"

# read the dataset .csv file 
dataset = pd.read_csv(dataset_path)

# check the data
dataset.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


***info about dataset***

In [6]:
dataset.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,...,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507,1.5
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,...,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076,1.118314
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,...,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,...,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0,0.75
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,...,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,1.5
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,...,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,2.25
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,...,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,3.0


***making the target vector and feature matrix***

In [7]:
# assigning the target vector (y) 
target = dataset.price_range.values + 1

# assigning the feature matrix (X)
feature = dataset.drop('price_range', axis = 1)

# checking the type of target array
type(target)

numpy.ndarray

## Pre-Process Data

***scaling features***

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
feature_scaled = scaler.fit_transform(feature)

feature_scaled.shape

(2000, 20)

***split data into train/test sets***

In [9]:
X_train, X_test, y_train, y_test = train_test_split(feature_scaled, target, test_size = 0.2, random_state=222)

## Build Model Architecture

***load model from sklearn***

In [10]:
from sklearn.ensemble import RandomForestClassifier

reg_model = RandomForestClassifier(n_estimators = 50, random_state = 45) 

## Train The Model

***train our model***

In [11]:
reg_model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=50, random_state=45)

## Evaluate Model Performance

***evaluate the model with .score***

In [12]:
reg_model.score(X_test, y_test)

0.8625

## Hyper-Parameter Tuning (Random Search CV)
Random Search has a wider approach than Grid Search, So we narrow down our choices by Random Search, then use Grid Search to find the best options

In [13]:
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint


rf = RandomForestClassifier()

# Look at parameters used by our current forest
print("Parameters currently in use:\n")
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


***The most important hyperparameters to tune are these:***

- n_estimators = number of trees in the forest 
- max_features = max_number of features cosidered for splitting a node
- max_depth = max number of levels in each decision tree 
- min_samples_split = min number of data points placed in a node before the node is split
- min_samples_leaf = min number of data points allowed in a leaf node 
- bootstrap = method for sampling data points (with or without replacment



***we will now try to narrow our options from the randomized search cv***

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


***now, we instantiate the random search and fit it like any other Sklearn model***

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

# what are the best parameters ?
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'n_estimators': 400,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 100,
 'bootstrap': True}

## Hyper-Parameter Tuning (Grid Search CV)
Grid Search CV is a hyper-parameter tuning method, what we do essentially is give a couple of experimental parameters to our grid search, and the algorithm tried to find the best combination of those parameters