# Test sample_weight, kernels, C, sklearn estimator

# Setup
Uncomment the next cell if STree is not already installed

In [1]:
#
# Google Colab setup
#
#!pip install git+https://github.com/doctorado-ml/stree

In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.estimator_checks import check_estimator
from sklearn.datasets import make_classification, load_iris, load_wine
from sklearn.model_selection import train_test_split
from stree import Stree
import time

In [3]:
import os
if not os.path.isfile('data/creditcard.csv'):
    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download
    !tar xzf creditcard.tgz

In [4]:
random_state=1

def load_creditcard(n_examples=0):
    import pandas as pd
    import numpy as np
    import random
    df = pd.read_csv('data/creditcard.csv')
    print("Fraud: {0:.3f}% {1}".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))
    print("Valid: {0:.3f}% {1}".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))
    y = df.Class
    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values
    if n_examples > 0:
        # Take first n_examples samples
        X = X[:n_examples, :]
        y = y[:n_examples, :]
    else:
        # Take all the positive samples with a number of random negatives
        if n_examples < 0:
            Xt = X[(y == 1).ravel()]
            yt = y[(y == 1).ravel()]
            indices = random.sample(range(X.shape[0]), -1 * n_examples)
            X = np.append(Xt, X[indices], axis=0)
            y = np.append(yt, y[indices], axis=0)
    print("X.shape", X.shape, " y.shape", y.shape)
    print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
    print("Valid: {0:.3f}% {1}".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state)
    return Xtrain, Xtest, ytrain, ytest

data = load_creditcard(-5000) # Take all true samples with up to 5000 of the others
# data = load_creditcard(5000)  # Take the first 5000 samples
# data = load_creditcard(-1000) # Take 1000 samples

Xtrain = data[0]
Xtest = data[1]
ytrain = data[2]
ytest = data[3]
_, data = np.unique(ytrain, return_counts=True)
wtrain = (data[1] / np.sum(data),  data[0] / np.sum(data))
_, data = np.unique(ytest, return_counts=True)
wtest = (data[1] / np.sum(data),  data[0] / np.sum(data))
# Set weights inverse to its count class in dataset
weights = np.ones(Xtrain.shape[0],)
weights[ytrain==0] = wtrain[0]
weights[ytrain==1] = wtrain[1]
weights_test = np.ones(Xtest.shape[0],)
weights_test[ytest==0] = wtest[0]
weights_test[ytest==1] = wtest[1]
print(weights[:4], weights_test[:4])

Fraud: 0.173% 492
Valid: 99.827% 284315
X.shape (5492, 28)  y.shape (5492,)
Fraud: 9.086% 499
Valid: 90.914% 4993
[0.09157128 0.09157128 0.09157128 0.09157128] [0.08919903 0.08919903 0.08919903 0.08919903]


# Tests

## Test sample_weights
Compute accuracy with weights in samples. The weights are set based on the inverse of the number of samples of each class

In [5]:
C = 23
print("Accuracy of Train without weights", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtrain, ytrain))
print("Accuracy of Train with    weights", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtrain, ytrain))
print("Accuracy of Tests without weights", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtest, ytest))
print("Accuracy of Tests with    weights", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtest, ytest))

Accuracy of Train without weights 0.9875130072840791
Accuracy of Train with    weights 0.9919354838709677
Accuracy of Tests without weights 0.9866504854368932
Accuracy of Tests with    weights 0.9872572815533981


## Test accuracy with different kernels
Compute accuracy on train and test set with default hyperparmeters of every kernel

In [6]:
random_state=1
for kernel in ['linear', 'rbf', 'poly']:
    now = time.time()
    clf = Stree(C=7, kernel=kernel, random_state=random_state).fit(Xtrain, ytrain)
    accuracy_train = clf.score(Xtrain, ytrain)
    accuracy_test = clf.score(Xtest, ytest)
    time_spent = time.time() - now
    print(f"Time: {time_spent:.2f}s\tKernel: {kernel}\tAccuracy_train: {accuracy_train}\tAccuracy_test: {accuracy_test}")


Time: 1.45s	Kernel: linear	Accuracy_train: 0.9854318418314256	Accuracy_test: 0.9842233009708737
Time: 0.50s	Kernel: rbf	Accuracy_train: 0.9940166493236212	Accuracy_test: 0.9908980582524272
Time: 0.42s	Kernel: poly	Accuracy_train: 0.9945369406867846	Accuracy_test: 0.9872572815533981


## Test diferent values of C

In [7]:
t = time.time()
for C in (.001, .01, 1, 5, 17):
    clf = Stree(C=C, kernel="linear", random_state=random_state)
    clf.fit(Xtrain, ytrain)
    print(f"************** C={C} ****************************")
    print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
    print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
    print(clf)
    print(f"**************************************************")
print(f"{time.time() - t:.4f} secs")

************** C=0.001 ****************************
Classifier's accuracy (train): 0.9826
Classifier's accuracy (test) : 0.9854
root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1664
root - Down, <cgaf> - Leaf class=1 belief= 0.986348 impurity=0.0269 counts=(array([0, 1]), array([  4, 289]))
root - Up, <cgaf> - Leaf class=0 belief= 0.982259 impurity=0.0349 counts=(array([0, 1]), array([3488,   63]))

**************************************************
************** C=0.01 ****************************
Classifier's accuracy (train): 0.9826
Classifier's accuracy (test) : 0.9860
root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1664
root - Down, <cgaf> - Leaf class=1 belief= 0.986348 impurity=0.0269 counts=(array([0, 1]), array([  4, 289]))
root - Up, <cgaf> - Leaf class=0 belief= 0.982259 impurity=0.0349 counts=(array([0, 1]), array([348

## Test iterator
Check different weays of using the iterator

In [8]:
#check iterator
for i in list(clf):
    print(i)

root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1664
root - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0524
root - Down - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([287]))
root - Down - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.3200
root - Down - Up - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([2]))
root - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([8]))
root - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0349
root - Up - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 2

In [9]:
#check iterator again
for i in clf:
    print(i)

root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1664
root - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0524
root - Down - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([287]))
root - Down - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.3200
root - Down - Up - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([2]))
root - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([8]))
root - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0349
root - Up - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 2

## Test STree is a sklearn estimator

In [10]:
# Make checks one by one
c = 0
checks = check_estimator(Stree(), generate_only=True)
for check in checks:
    c += 1
    print(c, check[1])
    check[1](check[0])

1 functools.partial(<function check_no_attributes_set_in_init at 0x11f161ef0>, 'Stree')
2 functools.partial(<function check_estimators_dtypes at 0x11f15e050>, 'Stree')
3 functools.partial(<function check_fit_score_takes_y at 0x11f158ef0>, 'Stree')
4 functools.partial(<function check_sample_weights_pandas_series at 0x11f155830>, 'Stree')
5 functools.partial(<function check_sample_weights_not_an_array at 0x11f155950>, 'Stree')
6 functools.partial(<function check_sample_weights_list at 0x11f155a70>, 'Stree')
7 functools.partial(<function check_sample_weights_shape at 0x11f155b90>, 'Stree')
8 functools.partial(<function check_sample_weights_invariance at 0x11f155cb0>, 'Stree')
9 functools.partial(<function check_estimators_fit_returns_self at 0x11f161050>, 'Stree')
10 functools.partial(<function check_estimators_fit_returns_self at 0x11f161050>, 'Stree', readonly_memmap=True)
11 functools.partial(<function check_complex_data at 0x11f155e60>, 'Stree')
12 functools.partial(<function check_dt

In [11]:
# Check if the classifier is a sklearn estimator
check_estimator(Stree())

## Compare to SVM

In [12]:
svc = SVC(C=7, kernel='rbf', gamma=.001, random_state=random_state)
clf = Stree(C=17, kernel='rbf', gamma=.001, random_state=random_state)
svc.fit(Xtrain, ytrain)
clf.fit(Xtrain, ytrain)
print("== Not Weighted ===")
print("SVC train score ..:", svc.score(Xtrain, ytrain))
print("STree train score :", clf.score(Xtrain, ytrain))
print("SVC test score ...:", svc.score(Xtest, ytest))
print("STree test score .:", clf.score(Xtest, ytest))
svc.fit(Xtrain, ytrain, weights)
clf.fit(Xtrain, ytrain, weights)
print("==== Weighted =====")
print("SVC train score ..:", svc.score(Xtrain, ytrain))
print("STree train score :", clf.score(Xtrain, ytrain))
print("SVC test score ...:", svc.score(Xtest, ytest))
print("STree test score .:", clf.score(Xtest, ytest))
print("*SVC test score ..:", svc.score(Xtest, ytest, weights_test))
print("*STree test score :", clf.score(Xtest, ytest, weights_test))

== Not Weighted ===
SVC train score ..: 0.9823100936524454
STree train score : 0.9830905306971904
SVC test score ...: 0.9842233009708737
STree test score .: 0.9860436893203883
==== Weighted =====
SVC train score ..: 0.9799687825182102
STree train score : 0.9807492195629552
SVC test score ...: 0.9848300970873787
STree test score .: 0.9830097087378641
*SVC test score ..: 0.9487167285301864
*STree test score : 0.9538538933228189


In [13]:
print(clf)

root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1664
root - Down, <cgaf> - Leaf class=1 belief= 0.888268 impurity=0.1985 counts=(array([0, 1]), array([ 40, 318]))
root - Up, <cgaf> - Leaf class=0 belief= 0.990247 impurity=0.0193 counts=(array([0, 1]), array([3452,   34]))



## Test max_features

In [14]:
for max_features in [None, "auto", "log2", 7, .5, .1, .7]:
    now = time.time()
    print("*"*40)
    clf = Stree(random_state=random_state, max_features=max_features)
    clf.fit(Xtrain, ytrain)
    print(f"max_features {max_features} = {clf.max_features_}")
    print("Train score :", clf.score(Xtrain, ytrain))
    print("Test score .:", clf.score(Xtest, ytest))
    print(f"Took {time.time() - now:.2f} seconds")

****************************************
max_features None = 28
Train score : 0.9841311134235172
Test score .: 0.9848300970873787
Took 0.35 seconds
****************************************
max_features auto = 5
Train score : 0.981009365244537
Test score .: 0.9830097087378641
Took 0.71 seconds
****************************************
max_features log2 = 4
Train score : 0.9763267429760666
Test score .: 0.9775485436893204
Took 0.32 seconds
****************************************
max_features 7 = 7
Train score : 0.9750260145681582
Test score .: 0.9775485436893204
Took 2.35 seconds
****************************************
max_features 0.5 = 14
Train score : 0.9820499479708636
Test score .: 0.9824029126213593
Took 84.80 seconds
****************************************
max_features 0.1 = 2
Train score : 0.9513527575442248
Test score .: 0.9526699029126213
Took 0.25 seconds
****************************************
max_features 0.7 = 19
Train score : 0.9841311134235172
Test score .: 0.983009708