# Compare STree with different estimators

# Setup
Uncomment the `pip install` line for STree in the next cell if STree is not already installed.

In [None]:
#
# Google Colab setup
#
# Uncomment the following line to install STree from its GitHub repository
#!pip install git+https://github.com/doctorado-ml/stree
# pandas is installed unconditionally every time this cell runs
!pip install pandas

In [None]:
import datetime, time
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from stree import Stree

In [None]:
# Fetch and unpack the dataset only when the CSV is not already on disk
if not os.path.isfile('data/creditcard.csv'):
    # NOTE(review): plain http plus --no-check-certificate means the download is not authenticated — confirm this is acceptable
    # presumably --content-disposition makes wget save the file as creditcard.tgz — verify
    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download
    # presumably the archive unpacks to data/creditcard.csv — verify
    !tar xzf creditcard.tgz

# Tests

In [None]:
# Print the current local date and time of this run
now = datetime.datetime.now()
print(now.date(), now.strftime("%H:%M:%S"))

## Load dataset and normalize values

In [None]:
# Load Dataset
# Seed reused by the train/test split and by every seeded estimator
random_state = 2020
df = pd.read_csv('data/creditcard.csv')
# Keep the shape as the last expression of the cell so the notebook displays it
df.shape

In [None]:
# Report the share and absolute number of samples in each class
n_rows = df.shape[0]
n_fraud = df.Class[df.Class == 1].count()
n_valid = df.Class[df.Class == 0].count()
print("Fraud: {0:.3f}% {1}".format(n_fraud * 100 / n_rows, n_fraud))
print("Valid: {0:.3f}% {1:,}".format(n_valid * 100 / n_rows, n_valid))

In [None]:
# Scale the Amount column with RobustScaler and store it as Amount_Scaled
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
# The scaler is fed a single-column 2-D array
amount_2d = df.Amount.values.reshape(-1, 1)
values = scaler.fit_transform(amount_2d)
df['Amount_Scaled'] = values

In [None]:
# Split the frame into the feature matrix X and the target vector y;
# the label and the raw Time / Amount columns are left out of X
dropped_columns = ['Class', 'Time', 'Amount']
X = df.drop(columns=dropped_columns).values
y = df['Class'].values
print(f"X shape: {X.shape}\ny shape: {y.shape}")

## Build the models

In [None]:
# Shuffled, stratified train/test split; train_size is the fraction used for training
train_size = 0.7
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    train_size=train_size,
    stratify=y,
    shuffle=True,
    random_state=random_state,
)

In [None]:
# Decision tree classifier (reported under the label 'Linear Tree'), seeded for reproducibility
linear_tree = DecisionTreeClassifier(random_state=random_state)

In [None]:
# Gaussian Naive Bayes built with no arguments (library defaults)
naive_bayes = GaussianNB()

In [None]:
# Stree, configured with the same C and max_iter as the linear SVC baseline
stree = Stree(
    kernel="liblinear",
    multiclass_strategy="ovr",
    C=0.01,
    max_iter=1000,
    random_state=random_state,
)

In [None]:
# Neural network: multi-layer perceptron classifier with alpha=1 and the shared seed
mlp = MLPClassifier(random_state=random_state, alpha=1)

In [None]:
# Linear support vector classifier used as a baseline
svc = LinearSVC(
    C=0.01,
    max_iter=1000,
    random_state=random_state,
)

## Run the tests

In [None]:
def try_model(name, model):
    """Fit ``model`` on the training split and print its evaluation.

    Uses the notebook-level ``Xtrain``, ``ytrain``, ``Xtest`` and ``ytest``.
    Prints the fit time, then a classification report and a confusion
    matrix for both the train and the test split.

    Args:
        name: Label shown in the printed headers.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        Tuple ``(f1, spent)``: the F1 score on the test split and the
        number of seconds spent inside ``fit``.
    """
    print(f"************************** {name} **********************")
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # interval is not affected by system clock adjustments
    start = time.perf_counter()
    model.fit(Xtrain, ytrain)
    spent = time.perf_counter() - start
    print(f"Train Model {name} took: {spent:.4} seconds")
    train_pred = model.predict(Xtrain)
    test_pred = model.predict(Xtest)
    print(f"=========== {name} - Train {Xtrain.shape[0]:,} samples =============")
    print(classification_report(ytrain, train_pred, digits=6))
    print(f"=========== {name} - Test {Xtest.shape[0]:,} samples =============")
    print(classification_report(ytest, test_pred, digits=6))
    print("Confusion Matrix in Train")
    print(confusion_matrix(ytrain, train_pred))
    print("Confusion Matrix in Test")
    print(confusion_matrix(ytest, test_pred))
    return f1_score(ytest, test_pred), spent

In [None]:
# Train & Test models
# Display label -> estimator; the labels are reused in the printed reports
models = {
    'Linear Tree': linear_tree,
    'Naive Bayes': naive_bayes,
    'Stree    ': stree,
    'Neural Network': mlp,
    'SVC (linear)': svc,
}

# Bind the "best" trackers up front so they always exist after the loop,
# even when no model reaches an F1 score above zero
best_model = None
best_f1 = 0.0
best_time = 0.0
outcomes = []
for name, model in models.items():
    f1, time_spent = try_model(name, model)
    outcomes.append((name, f1, time_spent))
    # The first model always becomes the initial best; later models replace
    # it only on a strictly higher F1, so ties keep the earliest model
    if best_model is None or f1 > best_f1:
        best_model = name
        best_time = time_spent
        best_f1 = f1

In [None]:
# Summary: best model first, then one line per evaluated model
banner = "*" * 110
print(banner)
# train_size is the split fraction (0.7), not a row count, so the number of
# training samples is taken from Xtrain itself
print(f"*The best f1 model is {best_model}, with a f1 score: {best_f1:.4} in {best_time:.6} seconds with {Xtrain.shape[0]:,} samples in train dataset")
print(banner)
for name, f1, time_spent in outcomes:
    print(f"Model: {name}\t Time: {time_spent:6.2f} seconds\t f1: {f1:.4}")

In [None]:
# Display the parameters reported by the Stree estimator
stree.get_params()