In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification, load_iris, load_wine
from trees.Stree import Stree
import time

In [2]:
import os, urllib.request
file_name = 'data/creditcard.csv'
if not os.path.isfile(file_name):
    url = 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv'
    urllib.request.urlretrieve(url, file_name)

In [3]:
def load_creditcard(n_examples=0):
    df = pd.read_csv('data/creditcard.csv')
    print("*Original Fraud: {0:.3f}% {1}".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))
    print("*Original Valid: {0:.3f}% {1}".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))
    y = np.expand_dims(df.Class.values, axis=1)
    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values
    #Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)
    #return Xtrain, Xtest, ytrain, ytest
    if n_examples > 0:
        X = X[:n_examples, :]
        y = y[:n_examples, :]
    else:
        if n_examples < 0:
            Xt = X[(y == 1).ravel()]
            yt = y[(y == 1).ravel()]
            indices = random.sample(range(X.shape[0]), -1 * n_examples)
            X = np.append(Xt, X[indices], axis=0)
            y = np.append(yt, y[indices], axis=0)
    print("X.shape", X.shape, " y.shape", y.shape)
    print("-Generated Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
    print("-Generated Valid: {0:.3f}% {1}".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))
    return X, y

random_state = 1


# Datasets

#X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, 
#                    n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
#                    class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=random_state)

#X, y = load_wine(return_X_y=True)
#X, y = load_iris(return_X_y=True)
#y[y==2]=0

X, y = load_creditcard()

*Original Fraud: 0.173% 492
*Original Valid: 99.827% 284315
X.shape (284807, 28)  y.shape (284807, 1)
-Generated Fraud: 0.173% 492
-Generated Valid: 99.827% 284315


In [4]:
t = time.time()
clf = Stree(C=.01, random_state=random_state, use_predictions=False)
clf.fit(X, y)
print(clf)
print(f"{time.time() - t:.4f} secs")

root
root - Down
root - Down - Down, <cgaf> - Leaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242,    103]))
root - Down - Up
root - Down - Up - Down
root - Down - Up - Down - Down, <cgaf> - Leaf class=0 belief=0.857143 counts=(array([0, 1]), array([18,  3]))
root - Down - Up - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))
root - Down - Up - Up
root - Down - Up - Up - Down, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))
root - Down - Up - Up - Up, <cgaf> - Leaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))
root - Up
root - Up - Down
root - Up - Down - Down
root - Up - Down - Down - Down, <cgaf> - Leaf class=0 belief=0.920000 counts=(array([0, 1]), array([23,  2]))
root - Up - Down - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))
root - Up - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))
root - Up - Up, <cgaf> - Leaf class=1 beli

In [5]:
t = time.time()
clf.score(X, y)
print(f"{time.time() - t:.4f} secs")

Accuracy: 0.999512
0.2389 secs


In [15]:
print(clf)

root
root - Down
root - Down - Down, <cgaf> - Leaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242,    103]))
root - Down - Up
root - Down - Up - Down
root - Down - Up - Down - Down, <cgaf> - Leaf class=0 belief=0.857143 counts=(array([0, 1]), array([18,  3]))
root - Down - Up - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))
root - Down - Up - Up
root - Down - Up - Up - Down, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))
root - Down - Up - Up - Up, <cgaf> - Leaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))
root - Up
root - Up - Down
root - Up - Down - Down
root - Up - Down - Down - Down, <cgaf> - Leaf class=0 belief=0.920000 counts=(array([0, 1]), array([23,  2]))
root - Up - Down - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))
root - Up - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))
root - Up - Up, <cgaf> - Leaf class=1 beli

In [14]:
def printvec(node):
    if node._vector is None:
        print(node._title)
        print(node._clf.coef_)
    if node.is_leaf():
        return
    printvec(node.get_down())
    printvec(node.get_up())
printvec(clf._tree)

root - Down - Up - Down - Up, <pure>


AttributeError: 'NoneType' object has no attribute 'coef_'

In [6]:
t = time.time()
yp = clf.predict_proba(X)
print(yp.shape)
print(f"{time.time() - t:.4f} secs")

[[ 0.01272453 -0.00869691  0.00261326  0.013119    0.00200886 -0.00177167
  -0.0059729  -0.01137639 -0.01553846 -0.02868033  0.01536246 -0.02983645
  -0.00347147 -0.03159505 -0.0020723  -0.02189612 -0.03363292 -0.01214814
   0.00413005 -0.00207565  0.00679726  0.00138744 -0.00159652  0.00035469
  -0.00397646 -0.00277537  0.01026267  0.00918578]] root - Down - Down, <cgaf> [-1.02881772]
[[ 0.04405501  0.06724601  0.07316834  0.01356261 -0.01851765  0.00286983
   0.01749505  0.02031383  0.00613145  0.00963547 -0.02055478  0.01969194
  -0.03824876  0.02122459  0.07457148  0.00967706  0.04688633 -0.00361551
   0.00927978  0.01916087 -0.01800233 -0.00642752  0.01972691 -0.00539432
  -0.02530949 -0.00125887 -0.00248638 -0.01560347]] root - Down - Up - Down - Down, <cgaf> [-0.01578117]
None root - Down - Up - Down - Up, <pure> 0.0


TypeError: 'NoneType' object is not subscriptable

# outcomes without optimization executing predict_proba. 87 seconds
(284807, 2)
87.5212 secs

In [7]:
t = time.time()
clf2 = LinearSVC(C=.01, random_state=random_state)
clf2.fit(X, y)
print(clf2.score(X, y))
print(f"{time.time() - t:.4f} secs")

0.9991397683343457
13.6326 secs


In [8]:
t = time.time()
clf3 = DecisionTreeClassifier(random_state=random_state)
clf3.fit(X, y)
print(clf3.score(X, y))
print(f"{time.time() - t:.4f} secs")

1.0
18.8308 secs


from sklearn.utils.estimator_checks import check_estimator
clf = Stree()
check_estimator(clf)

In [9]:
import numpy as np
a = np.array([-1, 2, 3, -4])
(np.abs(a) - a.mean()) / a.std()

array([0.36514837, 0.73029674, 1.09544512, 1.46059349])

In [10]:
a = clf2.decision_function(X)
b = clf2._predict_proba_lr(X)
print(a[:4], b[:4])
print(yp[:4])

[-1.05990681 -1.05181137 -0.99985686 -1.04574888] [[0.74267274 0.25732726]
 [0.74112258 0.25887742]
 [0.73103044 0.26896956]
 [0.73995773 0.26004227]]


NameError: name 'yp' is not defined