In [1]:
#
# Google Colab setup
#
#!pip install git+https://github.com/doctorado-ml/stree

In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification, load_iris, load_wine
from sklearn.model_selection import train_test_split
from stree import Stree
import time

In [3]:
import os
# Fetch and unpack the dataset only when the CSV this notebook reads is missing,
# so re-running the cell does not download it again.
if not os.path.isfile('data/creditcard.csv'):
    # --content-disposition saves the file under the server-supplied name
    # (presumably creditcard.tgz, which the next line expects -- TODO confirm).
    # NOTE(review): --no-check-certificate turns off TLS certificate validation
    # for any HTTPS hop; confirm this is still required for this host.
    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download
    # Presumably unpacks to data/creditcard.csv, the path tested above -- verify.
    !tar xzf creditcard.tgz

In [4]:
random_state = 1


def load_creditcard(n_examples=0):
    """Load the credit-card CSV and return a stratified 70/30 train/test split.

    Reads ``data/creditcard.csv``, uses the ``Class`` column as the label
    (1 is reported as fraud, 0 as valid) and every remaining column except
    ``Time`` and ``Amount`` as features. The class balance is printed for the
    full file and again for the subset that is actually split.

    Parameters
    ----------
    n_examples : int, default 0
        * ``> 0``  -- keep only the first ``n_examples`` rows.
        * ``== 0`` -- keep every row.
        * ``< 0``  -- keep every ``Class == 1`` row plus ``-n_examples`` rows
          drawn without replacement from the ``Class == 0`` rows (capped at
          the number of such rows). The draw is seeded with the notebook-level
          ``random_state`` so repeated runs select the same subset.

    Returns
    -------
    Xtrain, Xtest, ytrain, ytest
        Result of ``train_test_split`` with ``train_size=0.7``, shuffled and
        stratified on the labels. Features are NumPy arrays; labels are a
        NumPy array when ``n_examples < 0`` and a pandas Series otherwise.
    """
    import random

    df = pd.read_csv('data/creditcard.csv')
    total_rows = df.shape[0]
    n_fraud = int((df.Class == 1).sum())
    n_valid = int((df.Class == 0).sum())
    print("Fraud: {0:.3f}% {1}".format(n_fraud * 100 / total_rows, n_fraud))
    print("Valid: {0:.3f}% {1}".format(n_valid * 100 / total_rows, n_valid))
    y = df.Class
    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values
    if n_examples > 0:
        # Take the first n_examples samples. y is one-dimensional, so it is
        # sliced positionally with a single index (a 2-D key raises on a Series).
        X = X[:n_examples, :]
        y = y.iloc[:n_examples]
    elif n_examples < 0:
        # Take all the positive samples plus a random draw of negatives only;
        # sampling from every row could pick positives a second time.
        labels = y.values
        positive_idx = np.flatnonzero(labels == 1)
        negative_idx = np.flatnonzero(labels == 0)
        n_negatives = min(-n_examples, negative_idx.size)
        # Dedicated seeded generator: reproducible without touching the
        # global ``random`` state.
        rng = random.Random(random_state)
        picked = rng.sample(range(negative_idx.size), n_negatives)
        keep = np.concatenate([positive_idx, negative_idx[picked]])
        X = X[keep]
        y = labels[keep]
    print("X.shape", X.shape, " y.shape", y.shape)
    print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
    print("Valid: {0:.3f}% {1}".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)
    return Xtrain, Xtest, ytrain, ytest

# Alternative calls:
# data = load_creditcard(-5000) # Take all true samples + 5000 of the others
# data = load_creditcard(5000)  # Take the first 5000 samples
data = load_creditcard(-1000)  # Take all true samples + 1000 of the others

# load_creditcard returns (Xtrain, Xtest, ytrain, ytest) in that order
Xtrain, Xtest, ytrain, ytest = data

Fraud: 0.173% 492
Valid: 99.827% 284315
X.shape (1492, 28)  y.shape (1492,)
Fraud: 32.976% 492
Valid: 67.024% 1000


In [5]:
# Fit one Stree per value of C, report its accuracy on both splits and print
# the resulting tree; the total wall-clock time for the sweep is printed last.
# `clf` keeps the classifier fitted with the last C for the cells below.
t = time.time()
for C in (.001, .01, 1, 5, 17):
    clf = Stree(C=C, random_state=random_state)
    clf.fit(Xtrain, ytrain)
    print(f"************** C={C} ****************************")
    train_accuracy = clf.score(Xtrain, ytrain)
    print(f"Classifier's accuracy (train): {train_accuracy:.4f}")
    test_accuracy = clf.score(Xtest, ytest)
    print(f"Classifier's accuracy (test) : {test_accuracy:.4f}")
    print(clf)
    print("**************************************************")
elapsed = time.time() - t
print(f"{elapsed:.4f} secs")

************** C=0.001 ****************************
Classifier's accuracy (train): 0.9579
Classifier's accuracy (test) : 0.9509
root
root - Down, <cgaf> - Leaf class=1 belief=0.987013 counts=(array([0, 1]), array([  4, 304]))
root - Up, <cgaf> - Leaf class=0 belief=0.945652 counts=(array([0, 1]), array([696,  40]))

**************************************************
************** C=0.01 ****************************
Classifier's accuracy (train): 0.9579
Classifier's accuracy (test) : 0.9509
root
root - Down, <cgaf> - Leaf class=1 belief=0.990196 counts=(array([0, 1]), array([  3, 303]))
root - Up, <cgaf> - Leaf class=0 belief=0.944444 counts=(array([0, 1]), array([697,  41]))

**************************************************
************** C=1 ****************************
Classifier's accuracy (train): 0.9693
Classifier's accuracy (test) : 0.9576
root
root - Down
root - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([311]))
root - Down - Up, <pure> - Lea

In [6]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
# NOTE(review): the scaler is created but never fitted or applied in this cell;
# kept because a later cell may use it -- confirm, or scale Xtrain/Xtest here.
scaler = StandardScaler()
# The wrapped estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases; the
# positional form works with both. LinearSVC is seeded for reproducible fits.
cclf = CalibratedClassifierCV(LinearSVC(random_state=random_state), cv=5)
cclf.fit(Xtrain, ytrain)
res = cclf.predict_proba(Xtest)
# res has shape (n_samples, n_classes): one calibrated probability per class,
# with columns ordered as in cclf.classes_

In [7]:
# check iterator: materialise the whole traversal into a list, then print each node
nodes = list(clf)
for node in nodes:
    print(node)

root
root - Down
root - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([301]))
root - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([13]))
root - Up
root - Up - Down
root - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([17]))
root - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))
root - Up - Up
root - Up - Up - Down
root - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))
root - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))
root - Up - Up - Up
root - Up - Up - Up - Down
root - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))
root - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))
root - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.967376 counts=(array([0, 1]), array([68

In [8]:
# check iterator again: walk the classifier directly, without building a list first
for node in clf:
    print(node)

root
root - Down
root - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([301]))
root - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([13]))
root - Up
root - Up - Down
root - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([17]))
root - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))
root - Up - Up
root - Up - Up - Down
root - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))
root - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))
root - Up - Up - Up
root - Up - Up - Up - Down
root - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))
root - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))
root - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.967376 counts=(array([0, 1]), array([68

In [9]:
# Run scikit-learn's estimator checks against a default-constructed Stree
# to verify that the classifier behaves as a sklearn estimator
from sklearn.utils.estimator_checks import check_estimator
stree_under_test = Stree()
check_estimator(stree_under_test)

In [10]:
# Run the checks one at a time, printing each check's running number and
# callable before executing it, so a failure can be traced to a specific check
c = 0
checks = check_estimator(Stree(), generate_only=True)
for c, check in enumerate(checks, start=1):
    # each item is indexed as [0] = argument, [1] = callable to invoke on it
    estimator, run_check = check
    print(c, run_check)
    run_check(estimator)

1 functools.partial(<function check_no_attributes_set_in_init at 0x12d18e0e0>, 'Stree')
2 functools.partial(<function check_estimators_dtypes at 0x12d185200>, 'Stree')
3 functools.partial(<function check_fit_score_takes_y at 0x12d1850e0>, 'Stree')
4 functools.partial(<function check_sample_weights_pandas_series at 0x12d17eb00>, 'Stree')
5 functools.partial(<function check_sample_weights_not_an_array at 0x12d17ec20>, 'Stree')
6 functools.partial(<function check_sample_weights_list at 0x12d17ed40>, 'Stree')
7 functools.partial(<function check_sample_weights_invariance at 0x12d17ee60>, 'Stree')
8 functools.partial(<function check_estimators_fit_returns_self at 0x12d189200>, 'Stree')
9 functools.partial(<function check_estimators_fit_returns_self at 0x12d189200>, 'Stree', readonly_memmap=True)
10 functools.partial(<function check_complex_data at 0x12d181050>, 'Stree')
11 functools.partial(<function check_dtype_object at 0x12d17ef80>, 'Stree')
12 functools.partial(<function check_estimators