# Grid Search 
# Helps decide which model to choose: a linear model (SVM) or a non-linear model (kernel SVM)

In [1]:
# Import Libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Read the Social Network Ads CSV into a DataFrame and preview its first rows
dataset = pd.read_csv(filepath_or_buffer='Social_Network_Ads.csv')
dataset.head(n=5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


In [3]:
# Set the independent variables (X) and the dependent variable (y), where y = f(X).
## X must always be a 2-D feature matrix and y a 1-D target vector.
# Columns are selected by label instead of by position, so the selection
# keeps pointing at the intended columns even if the CSV column order changes.
X = dataset[['Age', 'EstimatedSalary']].values
y = dataset['Purchased'].values

In [5]:
# Split the dataset into a training set (75%) and a test set (25%).
# random_state is fixed so that the split is reproducible between runs.
# train_test_split is already imported in the imports cell at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [6]:
# Feature scaling: learn the scaling statistics from the training set only,
# then apply that same fitted scaler to both the training and the test set.
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler().fit(X_train)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)

In [7]:
# Fit a Support Vector Classifier (SVC) on the scaled training set.
# The kernel can be swapped to compare models: 'linear', 'rbf', 'poly'.
from sklearn.svm import SVC
classifier = SVC(
    random_state=42,
    kernel='rbf',
)
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False)

In [8]:
# Predict the class labels for the test set and display them
y_predict = classifier.predict(X=X_test)
y_predict

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1], dtype=int64)

In [9]:
# Compare the true test labels with the predictions in a confusion matrix
from sklearn.metrics import confusion_matrix
cm_results = confusion_matrix(y_true=y_test, y_pred=y_predict)
cm_results

array([[64,  4],
       [ 3, 29]], dtype=int64)

In [10]:
# Apply k-fold cross-validation (10 folds) on the training set; one score per fold
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
from sklearn.model_selection import cross_val_score
accuracy_1 = cross_val_score(classifier, X_train, y_train, cv=10)

In [11]:
# Mean of the per-fold scores: the overall k-fold cross-validation estimate
accuracy_1.mean()

0.90053021876158679

In [12]:
# Standard deviation of the per-fold scores: shows how much the result varies between folds
accuracy_1.std()

0.063889573566262847

In [13]:
# Apply grid search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV

# Reference signature: class sklearn.svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None)

In [15]:
# Apply grid search to find the best model and the best parameters

# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html


from sklearn.model_selection import GridSearchCV

# Two parameter grids are searched, one per model family:
#   1) linear kernel, varying C
#   2) non-linear (rbf) kernel, varying C and gamma
c_values = [1, 10, 100, 1000]
parameters = [
    {'C': c_values, 'kernel': ['linear']},
    {'C': c_values, 'kernel': ['rbf'], 'gamma': [0.5, 0.1, 0.01, 0.001, 0.00001]},
]
# n_jobs is set to parallelise the search; worthwhile for large datasets, at the cost of CPU usage
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

In [16]:
# Best score found by the grid search (scoring='accuracy', 10-fold CV on the training set)
best_accuracy = grid_search.best_score_
best_accuracy

0.90000000000000002

In [17]:
# Parameter combination (from the searched grids) that achieved the best score
best_parameters = grid_search.best_params_
best_parameters

{'C': 1, 'gamma': 0.5, 'kernel': 'rbf'}