# Test Gridsearch
with different kernels and different configurations

# Setup
If STree is not already installed, uncomment the STree install line in the next cell before running it.

In [None]:
#
# Google Colab setup
#
#!pip install git+https://github.com/doctorado-ml/stree
!pip install pandas

In [None]:
import random
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, train_test_split
from stree import Stree

In [None]:
# Download and unpack the credit card dataset only when the CSV is not already on disk.
if not os.path.isfile('data/creditcard.csv'):
    # --content-disposition lets the server name the downloaded file;
    # --no-check-certificate turns off TLS certificate validation.
    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download
    # NOTE(review): presumably the download is saved as creditcard.tgz and
    # unpacks to data/creditcard.csv - confirm against the archive contents.
    !tar xzf creditcard.tgz

In [None]:
# Seed shared by the train/test split and by every estimator in this notebook.
random_state = 1

def load_creditcard(n_examples=0):
    """Load the credit card dataset and return a stratified train/test split.

    The data is read from ``data/creditcard.csv``. The ``Class`` column is the
    label (1 = fraud, 0 = valid); the ``Class``, ``Time`` and ``Amount``
    columns are dropped from the features. Class balance is printed both for
    the full file and for the selected subset.

    Parameters
    ----------
    n_examples : int, default 0
        * ``> 0``: keep only the first ``n_examples`` rows.
        * ``== 0``: keep every row.
        * ``< 0``: keep every positive row plus ``-n_examples`` negative rows
          drawn at random without replacement. The draw is seeded with the
          module-level ``random_state`` so the subset is reproducible.

    Returns
    -------
    tuple
        ``(Xtrain, Xtest, ytrain, ytest)`` as NumPy arrays, from a shuffled,
        stratified 70/30 split seeded with the module-level ``random_state``.

    Raises
    ------
    ValueError
        If ``-n_examples`` is larger than the number of negative rows.
    """
    df = pd.read_csv('data/creditcard.csv')
    n_fraud = int((df.Class == 1).sum())
    n_valid = int((df.Class == 0).sum())
    print("Fraud: {0:.3f}% {1}".format(n_fraud * 100 / df.shape[0], n_fraud))
    print("Valid: {0:.3f}% {1}".format(n_valid * 100 / df.shape[0], n_valid))
    # Use plain NumPy arrays for both X and y so positional indexing means the
    # same thing for each (a pandas Series rejects 2-D slices and indexes by label).
    y = df.Class.values
    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values
    if n_examples > 0:
        # Take first n_examples samples
        X = X[:n_examples, :]
        y = y[:n_examples]
    elif n_examples < 0:
        # Take all the positive samples with a number of random negatives.
        positives = np.flatnonzero(y == 1)
        negatives = np.flatnonzero(y == 0)
        # Sample only among the negatives so no positive row is duplicated,
        # and use a private seeded generator so the subset is reproducible.
        rng = random.Random(random_state)
        sampled_negatives = rng.sample(negatives.tolist(), -n_examples)
        keep = np.concatenate([positives, sampled_negatives])
        X = X[keep]
        y = y[keep]
    print("X.shape", X.shape, " y.shape", y.shape)
    print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
    print("Valid: {0:.3f}% {1}".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)
    return Xtrain, Xtest, ytrain, ytest

# Pick ONE of the following ways of building the dataset:
#   load_creditcard(-1000) -> all true samples + 1000 of the others
#   load_creditcard(5000)  -> the first 5000 samples
#   load_creditcard(0)     -> all the samples
data = load_creditcard(-1000)

# Unpack the split into the names used by the rest of the notebook.
Xtrain, Xtest, ytrain, ytest = data[0], data[1], data[2], data[3]

# Tests

In [None]:
# AdaBoostClassifier calls its weak learner "estimator" from scikit-learn 1.2 on
# and "base_estimator" in older releases. The key that sets the learner and the
# "<key>__" prefix that tunes it must use the same name, so take whichever one
# the installed version exposes.
estimator_key = (
    'estimator'
    if 'estimator' in AdaBoostClassifier().get_params(deep=False)
    else 'base_estimator'
)


def _stree_grid(kernel, **kernel_params):
    """Build the AdaBoost + Stree parameter grid for a single kernel.

    ``kernel_params`` maps kernel-specific Stree hyperparameter names
    (e.g. ``degree``, ``gamma``) to the list of values to try.
    """
    # Stree hyperparameters shared by every kernel, plus the kernel-specific ones.
    stree_params = {
        'split_criteria': ['max_samples', 'impurity'],
        'tol': [.1, 1e-02],
        'max_depth': [3, 5, 7],
        'C': [1, 7, 55],
        'kernel': [kernel],
    }
    stree_params.update(kernel_params)
    # AdaBoost-level parameters; each grid gets its own Stree instance.
    grid = {
        estimator_key: [Stree(random_state=random_state)],
        'n_estimators': [10, 25],
        'learning_rate': [.5, 1],
    }
    for name, values in stree_params.items():
        grid[estimator_key + '__' + name] = values
    return grid


# One grid per kernel so kernel-specific parameters are only combined with
# the kernel they apply to.
parameters = [
    _stree_grid('linear'),
    _stree_grid('poly', degree=[3, 5, 7]),
    _stree_grid('rbf', gamma=[.1, 1, 10]),
]

In [None]:
# Display the parameters of a default-constructed Stree; these are the names
# that can be tuned through the nested "<estimator>__<name>" grid keys.
Stree().get_params()

In [None]:
# Boosted ensemble whose weak learner and hyperparameters come from `parameters`.
clf = AdaBoostClassifier(algorithm="SAMME", random_state=random_state)
# Exhaustive search over every grid, using all available cores and keeping
# the training scores alongside the validation scores.
grid = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    n_jobs=-1,
    verbose=5,
    return_train_score=True,
)
grid.fit(Xtrain, ytrain)

In [None]:
# Report the winning configuration of the grid search.
best_summary = (
    ("Best estimator: ", grid.best_estimator_),
    ("Best hyperparameters: ", grid.best_params_),
    ("Best accuracy: ", grid.best_score_),
)
for label, value in best_summary:
    print(label, value)

Best estimator:  AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=Stree(C=55, max_depth=7, random_state=1,
                                        split_criteria='max_samples', tol=0.1),
                   learning_rate=0.5, n_estimators=25, random_state=1)
Best hyperparameters:  {'base_estimator': Stree(C=55, max_depth=7, random_state=1, split_criteria='max_samples', tol=0.1), 'estimator__C': 55, 'estimator__kernel': 'linear', 'estimator__max_depth': 7, 'estimator__split_criteria': 'max_samples', 'estimator__tol': 0.1, 'learning_rate': 0.5, 'n_estimators': 25}

Best accuracy:  0.9511777695988222