In [1]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import load_iris
from stree import Stree

In [2]:
#X, y = load_iris(return_X_y=True)
#y[y==2] = 0

In [3]:
random_state=1

def load_creditcard(n_examples=0):
    import pandas as pd
    import numpy as np
    import random
    df = pd.read_csv('data/creditcard.csv')
    print("Fraud: {0:.3f}% {1}".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))
    print("Valid: {0:.3f}% {1}".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))
    y = df.Class
    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values
    if n_examples > 0:
        # Take first n_examples samples
        X = X[:n_examples, :]
        y = y[:n_examples, :]
    else:
        # Take all the positive samples with a number of random negatives
        if n_examples < 0:
            Xt = X[(y == 1).ravel()]
            yt = y[(y == 1).ravel()]
            indices = random.sample(range(X.shape[0]), -1 * n_examples)
            X = np.append(Xt, X[indices], axis=0)
            y = np.append(yt, y[indices], axis=0)
    print("X.shape", X.shape, " y.shape", y.shape)
    print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
    print("Valid: {0:.3f}% {1}".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)
    return Xtrain, Xtest, ytrain, ytest

# data = load_creditcard(-5000) # Take all true samples + 5000 of the others
# data = load_creditcard(5000)  # Take the first 5000 samples
data = load_creditcard(-1000) # Take all the samples

Xtrain = data[0]
Xtest = data[1]
ytrain = data[2]
ytest = data[3]

Fraud: 0.244% 196
Valid: 99.755% 80234
X.shape (1196, 28)  y.shape (1196,)
Fraud: 16.472% 197
Valid: 83.528% 999


In [4]:
c = Stree(max_depth=2)
c.fit(Xtrain, ytrain)
print(c)

root
root - Down - Leaf class=1.0 belief=0.976000 counts=(array([0., 1.]), array([  3, 122]))
root - Up - Leaf class=0.0 belief=0.977528 counts=(array([0., 1.]), array([696,  16]))



In [5]:
#'base_estimator': [DecisionTreeClassifier(max_depth=1), Stree(max_depth=2), Stree(max_depth=3)],
parameters = {
    'base_estimator': [LinearSVC(), Stree(max_depth=2), Stree(max_depth=3)],
    'n_estimators': [20, 50, 100, 150],
    'learning_rate': [.5, 1, 1.5] 
}

In [6]:
#parameters = {
#    'base_estimator': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=5), Stree(), Stree(C=.1), Stree(C=.01), Stree(C=3)],
#    'n_estimators': [20, 50, 100, 150],
#    'learning_rate': [.5, 1, 1.5]           
#}

In [7]:
from inspect import signature
print(signature(c.fit))

(X: numpy.ndarray, y: numpy.ndarray, sample_weight: <built-in function array> = None) -> 'Stree'


In [8]:
from sklearn.utils.validation import _check_sample_weight

In [9]:
random_state=2020
clf = AdaBoostClassifier(random_state=random_state)
grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)
grid.fit(Xtrain, ytrain)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1671s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0413s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Batch computation too slow (7.7880s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:   48.9s
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:  

GridSearchCV(estimator=AdaBoostClassifier(random_state=2020), n_jobs=-1,
             param_grid={'base_estimator': [LinearSVC(), Stree(max_depth=2),
                                            Stree(max_depth=3)],
                         'learning_rate': [0.5, 1, 1.5],
                         'n_estimators': [20, 50, 100, 150]},
             return_train_score=True, verbose=10)

In [10]:
print(grid.best_estimator_)

AdaBoostClassifier(base_estimator=Stree(max_depth=2), learning_rate=0.5,
                   n_estimators=150, random_state=2020)


AdaBoostClassifier(base_estimator=Stree(max_depth=3), learning_rate=0.5,
                   n_estimators=20, random_state=2020)