# Compare STree with different estimators

# Setup
Uncomment the next cell if STree is not already installed

In [1]:
#
# Google Colab setup
#
#!pip install git+https://github.com/doctorado-ml/stree

In [2]:
import datetime, time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from stree import Stree

In [3]:
import os
if not os.path.isfile('data/creditcard.csv'):
    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download
    !tar xzf creditcard.tgz

# Tests

In [4]:
print(datetime.date.today(), time.strftime("%H:%M:%S"))

2021-01-14 11:30:51


## Load dataset and normalize values

In [5]:
# Load Dataset
df = pd.read_csv('data/creditcard.csv')
df.shape
random_state = 2020

In [6]:
print("Fraud: {0:.3f}% {1}".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))
print("Valid: {0:.3f}% {1:,}".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))

Fraud: 0.173% 492
Valid: 99.827% 284,315


In [7]:
# Normalize Amount
from sklearn.preprocessing import RobustScaler
values = RobustScaler().fit_transform(df.Amount.values.reshape(-1, 1))
df['Amount_Scaled'] = values

In [8]:
# Remove unneeded features
y = df.Class.values
X = df.drop(['Class', 'Time', 'Amount'], axis=1).values
print(f"X shape: {X.shape}\ny shape: {y.shape}")

X shape: (284807, 29)
y shape: (284807,)


## Build the models

In [9]:
# Divide dataset
train_size = .7
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=train_size, shuffle=True, random_state=random_state, stratify=y)

In [10]:
# Linear Tree
linear_tree = DecisionTreeClassifier(random_state=random_state)

In [11]:
# Naive Bayes
naive_bayes = GaussianNB()

In [12]:
# Stree
stree = Stree(random_state=random_state, C=.01, max_iter=1e3)

In [13]:
# Neural Network
mlp = MLPClassifier(random_state=random_state, alpha=1)

In [14]:
# SVC (linear)
svc = LinearSVC(random_state=random_state, C=.01, max_iter=1e3)

## Do the test

In [15]:
def try_model(name, model):
    print(f"************************** {name} **********************")
    now = time.time()
    model.fit(Xtrain, ytrain)
    spent = time.time() - now
    print(f"Train Model {name} took: {spent:.4} seconds")
    predict = model.predict(Xtrain)
    predictt = model.predict(Xtest)
    print(f"=========== {name} - Train {Xtrain.shape[0]:,} samples =============",)
    print(classification_report(ytrain, predict, digits=6))
    print(f"=========== {name} - Test {Xtest.shape[0]:,} samples =============")
    print(classification_report(ytest, predictt, digits=6))
    print("Confusion Matrix in Train")
    print(confusion_matrix(ytrain, predict))
    print("Confusion Matrix in Test")
    print(confusion_matrix(ytest, predictt))
    return f1_score(ytest, predictt), spent

In [16]:
# Train & Test models
models = {
    'Linear Tree':linear_tree, 'Naive Bayes': naive_bayes, 'Stree (SVM Tree)': stree,  
    'Neural Network': mlp, 'SVC (linear)': svc
}

best_f1 = 0
outcomes = []
for name, model in models.items():
    f1, time_spent = try_model(name, model)
    outcomes.append((name, f1, time_spent))
    if f1 > best_f1:
        best_model = name
        best_time = time_spent
        best_f1 = f1

************************** Linear Tree **********************
Train Model Linear Tree took: 10.25 seconds
              precision    recall  f1-score   support

           0   1.000000  1.000000  1.000000    199020
           1   1.000000  1.000000  1.000000       344

    accuracy                       1.000000    199364
   macro avg   1.000000  1.000000  1.000000    199364
weighted avg   1.000000  1.000000  1.000000    199364

              precision    recall  f1-score   support

           0   0.999578  0.999613  0.999596     85295
           1   0.772414  0.756757  0.764505       148

    accuracy                       0.999192     85443
   macro avg   0.885996  0.878185  0.882050     85443
weighted avg   0.999184  0.999192  0.999188     85443

Confusion Matrix in Train
[[199020      0]
 [     0    344]]
Confusion Matrix in Test
[[85262    33]
 [   36   112]]
************************** Naive Bayes **********************
Train Model Naive Bayes took: 0.09943 seconds
              p



Train Model Stree (SVM Tree) took: 28.47 seconds
              precision    recall  f1-score   support

           0   0.999623  0.999864  0.999744    199020
           1   0.908784  0.781977  0.840625       344

    accuracy                       0.999488    199364
   macro avg   0.954204  0.890921  0.920184    199364
weighted avg   0.999467  0.999488  0.999469    199364

              precision    recall  f1-score   support

           0   0.999637  0.999918  0.999777     85295
           1   0.943548  0.790541  0.860294       148

    accuracy                       0.999555     85443
   macro avg   0.971593  0.895229  0.930036     85443
weighted avg   0.999540  0.999555  0.999536     85443

Confusion Matrix in Train
[[198993     27]
 [    75    269]]
Confusion Matrix in Test
[[85288     7]
 [   31   117]]
************************** Neural Network **********************
Train Model Neural Network took: 9.76 seconds
              precision    recall  f1-score   support

           0  



Train Model SVC (linear) took: 8.207 seconds
              precision    recall  f1-score   support

           0   0.999237  0.999859  0.999548    199020
           1   0.872727  0.558140  0.680851       344

    accuracy                       0.999097    199364
   macro avg   0.935982  0.778999  0.840199    199364
weighted avg   0.999018  0.999097  0.998998    199364

              precision    recall  f1-score   support

           0   0.999344  0.999894  0.999619     85295
           1   0.910891  0.621622  0.738956       148

    accuracy                       0.999239     85443
   macro avg   0.955117  0.810758  0.869287     85443
weighted avg   0.999191  0.999239  0.999168     85443

Confusion Matrix in Train
[[198992     28]
 [   152    192]]
Confusion Matrix in Test
[[85286     9]
 [   56    92]]


In [17]:
print("*"*110)
print(f"*The best f1 model is {best_model}, with a f1 score: {best_f1:.4} in {best_time:.6} seconds with {train_size:,} samples in train dataset")
print("*"*110)
for name, f1, time_spent in outcomes:
    print(f"Model: {name}\t Time: {time_spent:6.2f} seconds\t f1: {f1:.4}")

**************************************************************************************************************
*The best f1 model is Stree (SVM Tree), with a f1 score: 0.8603 in 28.4743 seconds with 0.7 samples in train dataset
**************************************************************************************************************
Model: Linear Tree	 Time:  10.25 seconds	 f1: 0.7645
Model: Naive Bayes	 Time:   0.10 seconds	 f1: 0.1154
Model: Stree (SVM Tree)	 Time:  28.47 seconds	 f1: 0.8603
Model: Neural Network	 Time:   9.76 seconds	 f1: 0.7381
Model: SVC (linear)	 Time:   8.21 seconds	 f1: 0.739


In [18]:
stree.get_params()

{'C': 0.01,
 'criterion': 'entropy',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'linear',
 'max_depth': None,
 'max_features': None,
 'max_iter': 1000.0,
 'min_samples_split': 0,
 'random_state': 2020,
 'split_criteria': 'impurity',
 'splitter': 'random',
 'tol': 0.0001}