# GridSearchCV

GridSearchCV is a procedure that tries every combination of the hyperparameter values you supply and selects the combination with the best cross-validated score.

It works with most scikit-learn estimators, so it can be applied to many Machine Learning algorithms.

Consider the implementation below and note the accuracy at the end!!

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Social Network Ads dataset from the working directory.
data = pd.read_csv(filepath_or_buffer="Social_Network_Ads.csv")
# Preview the first five rows.
data.head(5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [3]:
# Encode Gender numerically (Male -> 0, Female -> 1).
# The result is assigned back to the column rather than calling an inplace
# method on data['Gender']: an inplace call on a column selection is chained
# assignment, which recent pandas versions warn about and which does not
# update `data` at all when Copy-on-Write is enabled.
# astype(int) guarantees an integer column regardless of how replace() handles
# the object dtype, and keeps the cell safe to re-run.
data['Gender'] = data['Gender'].replace({'Male': 0, 'Female': 1}).astype(int)

In [4]:
# Inputs: the columns at positions 1, 2 and 3 of `data`.
# Output: the last column of `data`.
x = data.iloc[:, [1, 2, 3]].values
y = data[data.columns[-1]].values

In [5]:
from sklearn.preprocessing import StandardScaler

# EstimatedSalary and Age are on very different scales, so the features are
# standardized before modelling.
scaler = StandardScaler()

In [6]:
# Standardize the feature columns held in x.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed in a later cell, so test rows influence the scaling statistics;
# consider fitting on the training rows only.
x = scaler.fit(x).transform(x)
x

array([[-1.02020406, -1.78179743, -1.49004624],
       [-1.02020406, -0.25358736, -1.46068138],
       [ 0.98019606, -1.11320552, -0.78528968],
       ...,
       [ 0.98019606,  1.17910958, -1.46068138],
       [-1.02020406, -0.15807423, -1.07893824],
       [ 0.98019606,  1.08359645, -0.99084367]])

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (test_size=0.2); random_state=1 fixes
# the shuffle so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [8]:
# create a classifier object
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed (matching the seed used for the train/test split) so
# that the fitted tree -- and every clone of this estimator made by a grid
# search -- is the same on each run. Without it, results can vary between
# executions of the notebook.
clf = DecisionTreeClassifier(random_state=1)

In [9]:
# Train the classifier on the training split -- model training happens here.
clf.fit(x_train, y_train)

# Print a few attributes of the fitted classifier on one line.
fitted_attrs = ("class_weight", "criterion", "max_depth", "max_features_")
print(*(getattr(clf, attr) for attr in fitted_attrs))

None gini None 3


In [10]:
# Predict a label for every held-out test row; the result is stored in y_pred.
y_pred = clf.predict(X=x_test)

In [11]:
from sklearn.metrics import accuracy_score

# Measure accuracy: compare the predicted labels with the true test labels.
# The recorded run scored 0.775 (77.5%) while using all the selected columns as-is.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.775

Now we use GridSearchCV to find the best possible values for these parameters and get the best accuracy.

In [12]:
# Hyperparameters to tune: "criterion" and "max_depth".
# GridSearchCV takes them as a dictionary mapping each parameter name to the
# list of candidate values. The two lists span a 2D grid; every cell of that
# grid is scored and the best-scoring parameters are returned.
depth_candidates = list(range(1, 8)) + [None]  # 1..7, plus None
param_dist = dict(
    criterion=["gini", "entropy"],
    max_depth=depth_candidates,
)

In [13]:
from sklearn.model_selection import GridSearchCV

# "CV" stands for Cross-Validation (covered in more detail later); cv=10 asks
# for 10 folds and n_jobs=-1 runs the search in parallel.
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_dist,
    cv=10,
    n_jobs=-1,
)

In [14]:
# Run the grid search, using the training split only.
grid.fit(X=x_train, y=y_train)

In [15]:
grid.best_estimator_
# shows the estimator configured with the best parameter combination found by the search

In [18]:
grid.best_params_
# returns the best parameter combination from the grid
# therefore, in the recorded run, the best estimator uses criterion = "gini" and max_depth = 2

{'criterion': 'gini', 'max_depth': 2}

In [19]:
grid.best_score_
# returns the best mean cross-validated score found by the search (cv=10, computed on
# x_train only), so it is not directly comparable to the test-set accuracy measured earlier

0.91875