# Test AdaBoost with different configurations

# Setup
Uncomment the next cell if STree is not already installed

In [1]:
#
# Google Colab setup
#
#!pip install git+https://github.com/doctorado-ml/stree

In [2]:
import time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import load_iris
from stree import Stree

In [3]:
import os
if not os.path.isfile('data/creditcard.csv'):
    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download
    !tar xzf creditcard.tgz

In [4]:
random_state=1

def load_creditcard(n_examples=0):
    import pandas as pd
    import numpy as np
    import random
    df = pd.read_csv('data/creditcard.csv')
    print("Fraud: {0:.3f}% {1}".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))
    print("Valid: {0:.3f}% {1}".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))
    y = df.Class
    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values
    if n_examples > 0:
        # Take first n_examples samples
        X = X[:n_examples, :]
        y = y[:n_examples, :]
    else:
        # Take all the positive samples with a number of random negatives
        if n_examples < 0:
            Xt = X[(y == 1).ravel()]
            yt = y[(y == 1).ravel()]
            indices = random.sample(range(X.shape[0]), -1 * n_examples)
            X = np.append(Xt, X[indices], axis=0)
            y = np.append(yt, y[indices], axis=0)
    print("X.shape", X.shape, " y.shape", y.shape)
    print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
    print("Valid: {0:.3f}% {1}".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)
    return Xtrain, Xtest, ytrain, ytest

# data = load_creditcard(-1000) # Take all true samples + 1000 of the others
# data = load_creditcard(5000)  # Take the first 5000 samples
# data = load_creditcard(0) # Take all the samples
data = load_creditcard(-100000)

Xtrain = data[0]
Xtest = data[1]
ytrain = data[2]
ytest = data[3]

Fraud: 0.173% 492
Valid: 99.827% 284315
X.shape (100492, 28)  y.shape (100492,)
Fraud: 0.659% 662
Valid: 99.341% 99830


# Tests

## STree alone on the whole dataset and linear kernel

In [5]:
now = time.time()
clf = Stree(max_depth=3, random_state=random_state)
clf.fit(Xtrain, ytrain)
print("Score Train: ", clf.score(Xtrain, ytrain))
print("Score Test: ", clf.score(Xtest, ytest))
print(f"Took {time.time() - now:.2f} seconds")

Score Train:  0.9985499829409757
Score Test:  0.998407854584052
Took 39.45 seconds


## Different kernels with different configuations

In [6]:
n_estimators = 10
C = 7
max_depth = 3

In [7]:
for kernel in ['linear', 'rbf', 'poly']:
    now = time.time()
    clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)
    clf.fit(Xtrain, ytrain)
    score_train = clf.score(Xtrain, ytrain)
    score_test = clf.score(Xtest, ytest)
    print(f"Kernel: {kernel}\tTime: {time.time() - now:.2f} seconds\tScore Train: {score_train:.7f}\tScore Test: {score_test:.7f}")

Kernel: linear	Time: 87.00 seconds	Score Train: 0.9982372	Score Test: 0.9981425
Kernel: rbf	Time: 60.60 seconds	Score Train: 0.9934181	Score Test: 0.9933992
Kernel: poly	Time: 88.08 seconds	Score Train: 0.9937450	Score Test: 0.9938968


## Test algorithm SAMME in AdaBoost to check speed/accuracy

In [8]:
for kernel in ['linear', 'rbf', 'poly']:
    now = time.time()
    clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state, algorithm="SAMME")
    clf.fit(Xtrain, ytrain)
    score_train = clf.score(Xtrain, ytrain)
    score_test = clf.score(Xtest, ytest)
    print(f"Kernel: {kernel}\tTime: {time.time() - now:.2f} seconds\tScore Train: {score_train:.7f}\tScore Test: {score_test:.7f}")

Kernel: linear	Time: 58.75 seconds	Score Train: 0.9980524	Score Test: 0.9978771
Kernel: rbf	Time: 12.49 seconds	Score Train: 0.9934181	Score Test: 0.9933992
Kernel: poly	Time: 97.85 seconds	Score Train: 0.9972137	Score Test: 0.9971806
