# Grid Search to Tune Hyperparameters

In [1]:
# regular import
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Global matplotlib settings. The 'bmh' style sheet is currently disabled:
# plt.style.use('bmh')
plt.rcParams.update({'font.size': 11})

### Data Loading and Train Test Split

In [3]:
# Load the breast cancer Wisconsin (wdbc) data set from the UCI repository.
# The file is read without a header row, so columns get integer labels.
data_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
df = pd.read_csv(
    data_uri,
    header=None,
)

In [4]:
# Preview the first two rows; columns carry integer labels because the file was read with header=None.
df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902


In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# Separate the target column (label 1) from the feature columns (labels 2 onward).
# Column 0 is left out of both (it looks like a record ID -- confirm).
targets = df[1].to_numpy()
features = df.loc[:, 2:].to_numpy()

In [7]:
# Encode the class labels as integers. The encoder is fitted on the raw label
# column taken straight from `df` (rather than on `targets` itself) so that
# re-running this cell does not re-encode already-encoded integers, which would
# turn `le.classes_` into [0, 1] and break later le.transform(["M", "B"]) calls.
le = LabelEncoder()
targets = le.fit_transform(df.loc[:, 1].values)
le.classes_

array(['B', 'M'], dtype=object)

In [8]:
# Sanity-check the fitted encoding: "M" maps to 1 and "B" maps to 0.
le.transform(["M", "B"])

array([1, 0])

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the samples as a test set. The split is stratified on the
# encoded targets so both parts keep the class proportions, and the seed is
# fixed so the split is reproducible.
feat_train, feat_test, tar_train, tar_test = train_test_split(
    features, targets, test_size=0.2, stratify=targets, random_state=1
)

### Building Pipeline

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

#### A. Tuning Hyperparameters with Grid Search

In [16]:
# Standardize the features, then classify with a support vector machine.
# make_pipeline names each step after its lowercased class name, so the
# classifier's parameters are addressed with the 'svc__' prefix.
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))

In [12]:
# Candidate values (powers of ten from 1e-4 to 1e3) shared by C and gamma.
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Two sub-grids: a linear kernel tuned over C only, and an RBF kernel tuned
# over both C and gamma.
param_grid = [
    dict(svc__kernel=['linear'], svc__C=param_range),
    dict(svc__kernel=['rbf'], svc__C=param_range, svc__gamma=param_range),
]

In [17]:
# Exhaustive search over param_grid, scored by accuracy with 10-fold
# cross-validation on the training split; n_jobs=-1 uses all available cores.
# fit() returns the fitted search object, so construction and fitting are chained.
gs = GridSearchCV(
    pipe_svc,
    param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(feat_train, tar_train)

# Best mean cross-validated score and the parameter combination that achieved it.
print(gs.best_score_)
print(gs.best_params_)

0.9846153846153847
{'svc__C': 100.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}


In [18]:
# GridSearchCV refits the best parameter combination on the whole training
# split by default (refit=True), so best_estimator_ is already trained.
# A further clf.fit(feat_train, tar_train) would only repeat that work.
clf = gs.best_estimator_

# Score the tuned pipeline on the held-out test split.
print("Test accuracy: {:.3f}".format(clf.score(feat_test, tar_test)))

Test accuracy: 0.974


# References
[1] Python Machine Learning by Sebastian Raschka.