In [1]:
import datetime, time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from trees.Stree import Stree

In [2]:
import os
if not os.path.isfile('data/creditcard.csv'):
    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download
    !tar xzf creditcard.tgz

In [3]:
print(datetime.date.today(), time.strftime("%H:%M:%S"))

2020-05-17 16:15:24


In [4]:
# Load Dataset
df = pd.read_csv('data/creditcard.csv')
df.shape
random_state = 2020

In [5]:
print("Fraud: {0:.3f}% {1}".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))
print("Valid: {0:.3f}% {1:,}".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))

Fraud: 0.173% 492
Valid: 99.827% 284,315


In [6]:
# Normalize Amount
from sklearn.preprocessing import RobustScaler
values = RobustScaler().fit_transform(df.Amount.values.reshape(-1, 1))
df['Amount_Scaled'] = values

In [7]:
# Remove unneeded features
y = df.Class.values
X = df.drop(['Class', 'Time', 'Amount'], axis=1).values
print(f"X shape: {X.shape}\ny shape: {y.shape}")

X shape: (284807, 29)
y shape: (284807,)


In [8]:
# Divide dataset
train_size = .7
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=train_size, shuffle=True, random_state=random_state, stratify=y)

In [9]:
# Linear Tree
linear_tree = tree.DecisionTreeClassifier(random_state=random_state)

In [10]:
# Random Forest
random_forest = RandomForestClassifier(random_state=random_state, n_jobs=-1, n_estimators=100)

In [11]:
# Stree
stree = Stree(random_state=random_state, C=.01)

In [12]:
# AdaBoost
adaboost = AdaBoostClassifier(random_state=random_state)

In [13]:
# Gradient Boosting
gradient = GradientBoostingClassifier(random_state=random_state)

In [14]:
def try_model(name, model):
    print(f"************************** {name} **********************")
    now = time.time()
    model.fit(Xtrain, ytrain)
    spent = time.time() - now
    print(f"Train Model {name} took: {spent:.4} seconds")
    predict = model.predict(Xtrain)
    predictt = model.predict(Xtest)
    print(f"=========== {name} - Train {Xtrain.shape[0]:,} samples =============",)
    print(classification_report(ytrain, predict, digits=6))
    print(f"=========== {name} - Test {Xtest.shape[0]:,} samples =============")
    print(classification_report(ytest, predictt, digits=6))
    print("Confusion Matrix in Train")
    print(confusion_matrix(ytrain, predict))
    print("Confusion Matrix in Test")
    print(confusion_matrix(ytest, predictt))
    return f1_score(ytest, predictt), spent

In [15]:
# Train & Test models
models = {
    'Linear Tree':linear_tree, 'Random Forest': random_forest, 'Stree (SVM Tree)': stree,  
    'AdaBoost model': adaboost, 'Gradient Boost.': gradient
}

best_f1 = 0
outcomes = []
for name, model in models.items():
    f1, time_spent = try_model(name, model)
    outcomes.append((name, f1, time_spent))
    if f1 > best_f1:
        best_model = name
        best_time = time_spent
        best_f1 = f1

************************** Linear Tree **********************
Train Model Linear Tree took: 14.13 seconds
              precision    recall  f1-score   support

           0   1.000000  1.000000  1.000000    199020
           1   1.000000  1.000000  1.000000       344

    accuracy                       1.000000    199364
   macro avg   1.000000  1.000000  1.000000    199364
weighted avg   1.000000  1.000000  1.000000    199364

              precision    recall  f1-score   support

           0   0.999578  0.999613  0.999596     85295
           1   0.772414  0.756757  0.764505       148

    accuracy                       0.999192     85443
   macro avg   0.885996  0.878185  0.882050     85443
weighted avg   0.999184  0.999192  0.999188     85443

Confusion Matrix in Train
[[199020      0]
 [     0    344]]
Confusion Matrix in Test
[[85262    33]
 [   36   112]]
************************** Random Forest **********************


KeyboardInterrupt: 

In [1]:
print("*"*132)
print(f"*The best f1 model is {best_model}, with a f1 score: {best_f1:.4} in {best_time:.6} seconds with {train_size:,} samples in train dataset")
print("*"*132)
for name, f1, time_spent in outcomes:
    print(f"Model: {name}\t Time: {time_spent:6.2f} seconds\t f1: {f1:.4}")

************************************************************************************************************************************


NameError: name 'best_model' is not defined

************************************************************************************************************************************
*The best f1 model is Random Forest, with a f1 score: 0.8815 in 218.966 seconds with 0.7 samples in train dataset
************************************************************************************************************************************
Model: Linear Tree	     Time:  23.05 seconds	 f1: 0.7645
Model: Random Forest	 Time: 218.97 seconds	 f1: 0.8815
Model: Stree (SVM Tree)	 Time:  49.45 seconds	 f1: 0.8467
Model: AdaBoost model	 Time:  73.83 seconds	 f1: 0.7509
Model: Gradient Boost.	 Time: 388.69 seconds	 f1: 0.5259
Model: Neural Network	 Time:  25.47 seconds	 f1: 0.8328