In [1]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import load_iris
from stree import Stree

In [2]:
import os
if not os.path.isfile('data/creditcard.csv'):
    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download
    !tar xzf creditcard.tgz

In [3]:
random_state=1

def load_creditcard(n_examples=0):
    import pandas as pd
    import numpy as np
    import random
    df = pd.read_csv('data/creditcard.csv')
    print("Fraud: {0:.3f}% {1}".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))
    print("Valid: {0:.3f}% {1}".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))
    y = df.Class
    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values
    if n_examples > 0:
        # Take first n_examples samples
        X = X[:n_examples, :]
        y = y[:n_examples, :]
    else:
        # Take all the positive samples with a number of random negatives
        if n_examples < 0:
            Xt = X[(y == 1).ravel()]
            yt = y[(y == 1).ravel()]
            indices = random.sample(range(X.shape[0]), -1 * n_examples)
            X = np.append(Xt, X[indices], axis=0)
            y = np.append(yt, y[indices], axis=0)
    print("X.shape", X.shape, " y.shape", y.shape)
    print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
    print("Valid: {0:.3f}% {1}".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)
    return Xtrain, Xtest, ytrain, ytest

# data = load_creditcard(-5000) # Take all true samples + 5000 of the others
# data = load_creditcard(5000)  # Take the first 5000 samples
data = load_creditcard(-1000) # Take all the samples

Xtrain = data[0]
Xtest = data[1]
ytrain = data[2]
ytest = data[3]

Fraud: 0.244% 196
Valid: 99.755% 80234
X.shape (1196, 28)  y.shape (1196,)
Fraud: 16.555% 198
Valid: 83.445% 998


In [4]:
parameters = {
    'base_estimator': [Stree()],
    'n_estimators': [50, 100, 150],
    'learning_rate': [.5, 1],
    'base_estimator__tol': [.1,  1e-02],
    'base_estimator__max_depth': [5, 7],
    'base_estimator__C': [1, 3]
}
#'max_depth': [3, 5]

In [5]:
LinearSVC().get_params(), DecisionTreeClassifier().get_params()

({'C': 1.0,
  'class_weight': None,
  'dual': True,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'loss': 'squared_hinge',
  'max_iter': 1000,
  'multi_class': 'ovr',
  'penalty': 'l2',
  'random_state': None,
  'tol': 0.0001,
  'verbose': 0},
 {'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'gini',
  'max_depth': None,
  'max_features': None,
  'max_leaf_nodes': None,
  'min_impurity_decrease': 0.0,
  'min_impurity_split': None,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'presort': 'deprecated',
  'random_state': None,
  'splitter': 'best'})

In [6]:
random_state=2020
clf = AdaBoostClassifier(random_state=random_state)
grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)
grid.fit(Xtrain, ytrain)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   48.6s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   57.5s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  1

GridSearchCV(estimator=AdaBoostClassifier(random_state=2020), n_jobs=-1,
             param_grid={'base_estimator': [Stree(C=1, max_depth=7, tol=0.1)],
                         'base_estimator__C': [1, 3],
                         'base_estimator__max_depth': [5, 7],
                         'base_estimator__tol': [0.1, 0.01],
                         'learning_rate': [0.5, 1],
                         'n_estimators': [50, 100, 150]},
             return_train_score=True, verbose=10)

In [7]:
grid.best_estimator_, grid.best_score_

(AdaBoostClassifier(base_estimator=Stree(C=1, max_depth=7, tol=0.1),
                    learning_rate=0.5, random_state=2020),
 0.9808810949529512)

In [8]:
grid.cv_results_["params"]

[{'base_estimator': Stree(C=1, max_depth=7, tol=0.1),
  'base_estimator__C': 1,
  'base_estimator__max_depth': 5,
  'base_estimator__tol': 0.1,
  'learning_rate': 0.5,
  'n_estimators': 50},
 {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),
  'base_estimator__C': 1,
  'base_estimator__max_depth': 5,
  'base_estimator__tol': 0.1,
  'learning_rate': 0.5,
  'n_estimators': 100},
 {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),
  'base_estimator__C': 1,
  'base_estimator__max_depth': 5,
  'base_estimator__tol': 0.1,
  'learning_rate': 0.5,
  'n_estimators': 150},
 {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),
  'base_estimator__C': 1,
  'base_estimator__max_depth': 5,
  'base_estimator__tol': 0.1,
  'learning_rate': 1,
  'n_estimators': 50},
 {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),
  'base_estimator__C': 1,
  'base_estimator__max_depth': 5,
  'base_estimator__tol': 0.1,
  'learning_rate': 1,
  'n_estimators': 100},
 {'base_estimator': Stree(C=1, max_depth=7, to

In [9]:
import pandas as pd
res = pd.concat([pd.DataFrame(grid.cv_results_["params"]),pd.DataFrame(grid.cv_results_["mean_test_score"], columns=["Accuracy"])], axis=1)

#print(res.sort_values(['Accuracy'], ascending=False))
print(res)

base_estimator  base_estimator__C  base_estimator__max_depth  \
0                                  1                          5   
1                                  1                          5   
2                                  1                          5   
3                                  1                          5   
4                                  1                          5   
5                                  1                          5   
6                                  1                          5   
7                                  1                          5   
8                                  1                          5   
9                                  1                          5   
10                                 1                          5   
11                                 1                          5   
12                                 1                          7   
13                                 1                          7  

In [10]:
grid.get_params()

{'cv': None,
 'error_score': nan,
 'estimator__algorithm': 'SAMME.R',
 'estimator__base_estimator': None,
 'estimator__learning_rate': 1.0,
 'estimator__n_estimators': 50,
 'estimator__random_state': 2020,
 'estimator': AdaBoostClassifier(random_state=2020),
 'iid': 'deprecated',
 'n_jobs': -1,
 'param_grid': {'base_estimator': [Stree(C=1, max_depth=7, tol=0.1)],
  'n_estimators': [50, 100, 150],
  'learning_rate': [0.5, 1],
  'base_estimator__tol': [0.1, 0.01],
  'base_estimator__max_depth': [5, 7],
  'base_estimator__C': [1, 3]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': True,
 'scoring': None,
 'verbose': 10}