## Classifier and HPO 

### Load Data

In [41]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# In a notebook, __doc__ is the module docstring that IPython generates
# automatically; this line just echoes it (first line of the cell output).
print(__doc__)

# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, features) matrix:
n_samples = len(digits.images)
# -1 lets reshape infer the per-sample feature count from the image size.
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Split the dataset in two equal parts: a development set used for the
# cross-validated search and an evaluation set used only for the final report.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation.
# Two separate grids: the RBF kernel is searched over gamma x C, while the
# linear kernel is searched over C only.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# One full grid search is run per metric listed here.
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # The scoring string becomes 'precision_macro' or 'recall_macro';
    # cv=5 requests 5 cross-validation splits of the development set.
    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    # One line per parameter combination; the +/- figure is twice the
    # standard deviation of the cross-validation scores.
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Predict on the held-out half with the fitted search object and compare
    # against the true labels.
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality.

Automatically created module for IPython interactive environment
# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}

Grid scores on development set:

0.986 (+/-0.016) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.959 (+/-0.029) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.026) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.025) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.025) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.975 (+/-0.014) for {'C': 1, 'kernel': 'linear'}
0.975 (+/-0.014) for {'C': 10, 'kernel': 'linear'}
0.975 (+/-0.014) for {'C': 100, 'kernel': 'linear'}
0.975 (+/-0.014) for {'C': 1000, 'kernel': 'linear'}

Detailed classification report:

The model 

AdaBoost

In [None]:
# Random-search space: boosting algorithm variant, number of estimators
# (integer draws from 50..500) and learning rate (float draws from 0.01..2).
param_dist = dict(
    algorithm=['SAMME.R', 'SAMME'],
    n_estimators=sp_randint(50, 500),
    learning_rate=sp_randfloat(0.01, 2),
)

Decision Tree

In [None]:
# Random-search space: split criterion plus integer draws for the minimum
# samples needed to split a node (2..20) and to form a leaf (1..20).
param_dist = dict(
    criterion=['gini', 'entropy'],
    min_samples_split=sp_randint(2, 20),
    min_samples_leaf=sp_randint(1, 20),
)

SVC

In [27]:
# Random-search space: regularisation penalty, loss function, and float draws
# for the stopping tolerance (1e-5..0.1) and the C parameter (0.03..32768).
# NOTE(review): 'penalty' and 'loss' look like LinearSVC options rather than
# SVC ones -- confirm which estimator this space is meant for.
param_dist = dict(
    penalty=['l1', 'l2'],
    loss=['hinge', 'squared_hinge'],
    tol=sp_randfloat(0.00001, 0.1),
    C=sp_randfloat(0.03, 32768),
)

1

In [38]:
def sp_randint(a,b,n_iter_search=20):
    """Return a list of random integers to use as hyper-parameter candidates.

    Parameters
    ----------
    a, b : int
        Lower and upper bounds of the sampling range. Both endpoints are
        inclusive, following ``random.randint``.
    n_iter_search : int, default 20
        Number of values to draw.

    Returns
    -------
    list of int
        ``n_iter_search`` independent draws from ``[a, b]``; an empty list
        when ``n_iter_search`` is 0.
    """
    # Imported locally so the helper also works when this cell is executed
    # before any cell that imports ``random`` at notebook level.
    import random
    return [random.randint(a, b) for _ in range(n_iter_search)]

In [39]:
import random
def sp_randfloat(a,b,n_iter_search=20):
    """Return a list of random floats to use as hyper-parameter candidates.

    Draws ``n_iter_search`` values (20 by default) with
    ``random.uniform(a, b)`` and returns them in the order they were drawn.
    """
    draws = []
    for _ in range(n_iter_search):
        draws.append(random.uniform(a, b))
    return draws

In [None]:
#Extra Tree
# Random-search space: split criterion, fraction of features considered per
# split (float draws from 0.0..1.0), and integer draws for the minimum samples
# needed to split a node (2..20) and to form a leaf (1..20).
# The estimator parameter is spelled 'max_features' (plural); a misspelled key
# is rejected as an invalid parameter when the search is fitted.
param_dist = {'criterion':['gini','entropy'],
             'max_features':sp_randfloat(0.0,1.0),
             'min_samples_split':sp_randint(2,20),
             'min_samples_leaf':sp_randint(1,20)}

In [None]:
#Bernoulli
# Random-search space: float draws for the smoothing parameter alpha
# (0.01..2) and both settings of fit_prior.
# alpha is sampled through the shared sp_randfloat helper (20 draws by
# default), like the other search spaces, instead of relying on a
# notebook-level n_iter_search variable.
param_dist = {'alpha': sp_randfloat(0.01, 2),
             'fit_prior':[True,False]}

## Compare Importance

In [42]:
import pandas as pd
# Start with an empty results table: one column per SVC hyper-parameter
# plus the score obtained for that combination.
df = pd.DataFrame(
    data=None,
    columns=['kernel', 'gamma', 'C', 'score'],
)

In [None]:
# Grid scores for precision, copied from the search output above:
# 0.986 (+/-0.016) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
# 0.959 (+/-0.029) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
# 0.988 (+/-0.017) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
# 0.982 (+/-0.026) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
# 0.988 (+/-0.017) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
# 0.982 (+/-0.025) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
# 0.988 (+/-0.017) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
# 0.982 (+/-0.025) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
# 0.975 (+/-0.014) for {'C': 1, 'kernel': 'linear'}
# 0.975 (+/-0.014) for {'C': 10, 'kernel': 'linear'}
# 0.975 (+/-0.014) for {'C': 100, 'kernel': 'linear'}
# 0.975 (+/-0.014) for {'C': 1000, 'kernel': 'linear'}

In [45]:
# Add the first grid-search result as a new row.
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row frame and concatenated; ignore_index renumbers the rows from 0.
df = pd.concat(
    [df, pd.DataFrame([{'C': 1, 'gamma': 0.001, 'kernel': 'rbf', 'score': 0.986}])],
    ignore_index=True,
)

Unnamed: 0,kernel,gamma,C,score
0,rbf,0.001,1,0.986


In [58]:
# Rebuild the table with every combination from the grid search, in the same
# order as the printed grid scores: the eight RBF combinations (C outer,
# gamma inner) followed by the four linear ones. The linear kernel has no
# gamma in the grid, so 0 is used as a placeholder value.
df = pd.DataFrame(
    [{'C': c_value, 'gamma': gamma_value, 'kernel': 'rbf'}
     for c_value in (1, 10, 100, 1000)
     for gamma_value in (0.001, 0.0001)]
    + [{'C': c_value, 'gamma': 0, 'kernel': 'linear'}
       for c_value in (1, 10, 100, 1000)]
)

In [59]:
# Mean precision scores transcribed from the grid-search output, in row order:
# the eight RBF combinations first, then the four linear ones (all 0.975).
df['score'] = (
    [0.986, 0.959, 0.988, 0.982, 0.988, 0.982, 0.988, 0.982]
    + [0.975] * 4
)

In [60]:
# Display the assembled table of hyper-parameter combinations and scores.
df

Unnamed: 0,C,gamma,kernel,score
0,1,0.001,rbf,0.986
1,1,0.0001,rbf,0.959
2,10,0.001,rbf,0.988
3,10,0.0001,rbf,0.982
4,100,0.001,rbf,0.988
5,100,0.0001,rbf,0.982
6,1000,0.001,rbf,0.988
7,1000,0.0001,rbf,0.982
8,1,0.0,linear,0.975
9,10,0.0,linear,0.975


In [56]:
# Spread of the score within each value of C.
df.groupby('C')['score'].std()

C
1       0.019092
10      0.004243
100     0.004243
1000    0.004243
Name: score, dtype: float64

In [68]:
# Average of the per-C standard deviations, typed in by hand from the groupby
# output above: 0.019092 for C=1 and 0.004243 for each of C=10, 100 and 1000.
(0.019092 + 0.004243*3) / 4

0.00795525

In [57]:
# Spread of the score within each value of gamma.
df.groupby('gamma')['score'].std()

gamma
0.0001    0.0115
0.0010    0.0010
Name: score, dtype: float64

In [67]:
# Average of the two per-gamma standard deviations, typed in by hand from the
# groupby output above: 0.0115 for gamma=0.0001 and 0.0010 for gamma=0.001.
(0.0115 + 0.0010) / 2

0.00625

In [72]:
# Average score for each kernel.
df.groupby('kernel')['score'].mean()

kernel
linear    0.975000
rbf       0.981875
Name: score, dtype: float64

In [73]:
# Spread of the score within each kernel.
df.groupby('kernel')['score'].std()

kernel
linear    0.000000
rbf       0.009658
Name: score, dtype: float64

In [66]:
# Average of the per-kernel standard deviations, computed directly from the
# table.
df.groupby('kernel')['score'].std().mean()

0.00482876426073115

Entropy

In [74]:
from sklearn.metrics import log_loss
# Log loss for four labelled samples; each row of y_pred holds one predicted
# value per class for the corresponding sample.
log_loss(
    y_true=["spam", "ham", "ham", "spam"],
    y_pred=[[.1, .9], [.9, .1], [.8, .2], [.35, .65]],
)

0.21616187468057912

In [80]:
# NOTE(review): the second argument holds the values 1 and 3, which are not
# probabilities in [0, 1]; log_loss presumably expects predicted
# probabilities there -- confirm this call is a deliberate experiment.
log_loss([1,2],[1,3])

17.26978799617044