In [None]:
if 'google.colab' in str(get_ipython()):
    GITHUB_TOKEN = ""
    !rm -rf anti_money_laundering
    !git clone https://{GITHUB_TOKEN}@github.com/FedericoBruzzone/anti_money_laundering.git
    !mv anti_money_laundering/* 
    !rm -rf anti_money_laundering

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

import time
import pandas as pd

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

from src.utils.kaggle_config            import setup_kaggle
from src.utils.kaggle_config            import download_dataset

from src.utils.datasets_handler         import (get_train_and_test,
                                                get_X_and_Y,
                                                print_dataset,
                                                label_encoder,
                                                split_timestamp)
from src.utils.performance_measures     import calculate_performances
from src.utils.dataset_sampling_methods import (oversampling,
                                                undersampling,
                                                bootstrap_sampling)

from src.utils.print_utils              import (printLBlue, printGreen)

from src.decision_tree.decision_tree    import CustomDecisionTree
from src.decision_tree.ID3              import DecisionTreeID3
from src.decision_tree.C45              import DecisionTreeC45
from src.decision_tree.entropy_type     import EntropyType
from src.decision_tree.criterion_type   import CriterionType

from IPython.display import Image, display

# Verbosity level forwarded to the dataset helpers. Falls back to 0 when the VERBOSE
# variable is not defined (e.g. no .env file), instead of crashing on int(None).
VERBOSE = int(os.getenv('VERBOSE', '0'))
# Forwarded as the `view` argument of create_dot_files; None when VIEW is not defined.
# NOTE(review): the diabetes cells pass view="" explicitly — confirm None is accepted too.
VIEW = os.getenv('VIEW')

# Kaggle setup runs first, then both datasets used by this notebook are downloaded.
setup_kaggle()

print("Downloading dataset...")
for kaggle_dataset in (
    "iammustafatz/diabetes-prediction-dataset",
    "ealtman2019/ibm-transactions-for-anti-money-laundering-aml",
):
    download_dataset(kaggle_dataset)
print("Done.")

# CSV file names passed to get_train_and_test in the sections below.
diabetes = "diabetes_prediction_dataset.csv"
hi_small_trans = "HI-Small_Trans.csv"

Downloading dataset...
Done.


## Preliminary test: Diabetes Dataset

In [7]:
# Build the train/test frames from the diabetes CSV; get_X_and_Y yields the (X, y)
# pair that the tree cells below fit on and evaluate against.
df_train, df_test = get_train_and_test(diabetes, verbose=VERBOSE)
X_train, y_train = get_X_and_Y(df_train, verbose=VERBOSE)
X_test, y_test = get_X_and_Y(df_test, verbose=VERBOSE)
# NOTE(review): train and test are encoded by two independent calls and the second
# return value is discarded — confirm that label_encoder maps a given category to the
# same code in both frames.
X_train, _ = label_encoder(X_train, ['gender', 'smoking_history'])
X_test,  _ = label_encoder(X_test, ['gender', 'smoking_history'])

### ID3

In [8]:
print("---------------------- ID3 --------------------------")

# The timed region spans the construction of the tree and the call to fit().
fit_started_at = time.time()
decision_tree: DecisionTreeID3 = DecisionTreeID3(
    max_depth=10,
    num_thresholds_numerical_attr=6,
)
decision_tree.fit(X_train, y_train)
fit_finished_at = time.time()

# Export the fitted tree (dot file + PNG) without opening a viewer argument.
decision_tree.create_dot_files(filename="tree-id3-diabetes", generate_png=True, view="")

print("\nPerformances: ")
predictions = list(decision_tree.predict_test(X_test))
fit_seconds = fit_finished_at - fit_started_at
print(f"Fit time: {fit_seconds} seconds")
calculate_performances(predictions, y_test, "id3", verbose=True)
print("-------------------------- END ID3 --------------------------")

---------------------- ID3 --------------------------
PARAMETERS:
	MAX DEPTH: 10
	NUM THRESHOLDS NUMERICAL ATTR: 6


Performances: 
Fit time: 2.4182631969451904 seconds
    F1 score: 0.582247     Accuracy: 0.950350
   Precision: 1.000000       Recall: 0.410682
          TP:      692           TN:    18315
          FP:        0           FN:      993
         TPR: 0.410682          FPR: 0.000000
-------------------------- END ID3 --------------------------


In [None]:
# Show the PNG named after the "tree-id3-diabetes" export requested in the ID3 cell.
id3_diabetes_png = './dot_figs/tree-id3-diabetes.png'
display(Image(filename=id3_diabetes_png))

### Custom

In [None]:
print("-------------------------- CUSTOM --------------------------")

# The timed region spans the construction of the tree and the call to fit().
fit_started_at = time.time()
decision_tree = CustomDecisionTree(
    criterion=EntropyType.SHANNON,
    type_criterion=CriterionType.BEST,
    max_depth=10,
    min_samples_split=20,
    num_thresholds_numerical_attr=6,
)
decision_tree.fit(X_train, y_train)
fit_finished_at = time.time()

# Export the fitted tree (dot file + PNG) with an empty viewer argument.
decision_tree.create_dot_files(filename="tree-custom-diabetes", generate_png=True, view="")

print("\nPerformances: ")
predictions = list(decision_tree.predict_test(X_test))
fit_seconds = fit_finished_at - fit_started_at
print(f"Fit time: {fit_seconds} seconds")
calculate_performances(predictions, y_test, "custom", verbose=True)
print("-------------------------- END CUSTOM --------------------------")

In [None]:
# Show the PNG named after the "tree-custom-diabetes" export requested in the Custom cell.
custom_diabetes_png = './dot_figs/tree-custom-diabetes.png'
display(Image(filename=custom_diabetes_png))

## IBM Money Laundering Dataset

In [16]:
# These two frames are the untouched starting point for every experiment cell below.
original_df_train, original_df_test = get_train_and_test(hi_small_trans, verbose=VERBOSE)

# Class balance of the target column, training split first and test split second.
for split_frame in (original_df_train, original_df_test):
    print(split_frame["Is Laundering"].value_counts())

print("Length of training set:", len(original_df_train), "    Length of test set:", len(original_df_test))

original_df_train.head()

Is Laundering
0    4058540
1       4136
Name: count, dtype: int64
Is Laundering
0    1014628
1       1041
Name: count, dtype: int64
Length of training set: 4062676     Length of test set: 1015669


Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
1068097,2022/09/01 22:48,318886,80B3EA590,24161,80B3EA770,3582.88,US Dollar,3582.88,US Dollar,Cheque,0
3824191,2022/09/08 03:57,14,80FD57700,43049,80FD69960,175991.6,Shekel,175991.6,Shekel,Wire,0
2416035,2022/09/05 05:31,235509,80CE5C840,235843,80D5A5E70,15451.15,Brazil Real,15451.15,Brazil Real,Cheque,0
126628,2022/09/01 00:04,2591,800602EA0,1299,800CBA7A0,3009.82,Euro,3009.82,Euro,Cheque,0
3797765,2022/09/08 02:36,2947,8044D4D50,2627,80B8F74D0,5236.26,US Dollar,5236.26,US Dollar,Credit Card,0


In [3]:
# split_timestamp is called for its side effect only (its return value is ignored):
# the frame displayed below carries Date / Hour / Minute columns where the Timestamp
# column used to be.
# NOTE(review): because the frames are changed in place, re-running this cell without
# reloading the data will probably fail — confirm, and guard if so.
split_timestamp(original_df_train)
split_timestamp(original_df_test)

original_df_train.head()

Unnamed: 0,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Date,Hour,Minute,Is Laundering
1068097,318886,80B3EA590,24161,80B3EA770,3582.88,US Dollar,3582.88,US Dollar,Cheque,2022/09/01,22,48,0
3824191,14,80FD57700,43049,80FD69960,175991.6,Shekel,175991.6,Shekel,Wire,2022/09/08,3,57,0
2416035,235509,80CE5C840,235843,80D5A5E70,15451.15,Brazil Real,15451.15,Brazil Real,Cheque,2022/09/05,5,31,0
126628,2591,800602EA0,1299,800CBA7A0,3009.82,Euro,3009.82,Euro,Cheque,2022/09/01,0,4,0
3797765,2947,8044D4D50,2627,80B8F74D0,5236.26,US Dollar,5236.26,US Dollar,Credit Card,2022/09/08,2,36,0


In [4]:
# Columns handed to label_encoder, identical for both splits.
aml_encoded_columns = (
    'Date',
    'Account',
    'Account.1',
    'Receiving Currency',
    'Payment Currency',
    'Payment Format',
)

# NOTE(review): each split is encoded by its own call and the second return value is
# discarded — confirm that the same value (e.g. an account id) gets the same code in
# the training and in the test frame.
original_df_train, _ = label_encoder(original_df_train, list(aml_encoded_columns))
original_df_test, _ = label_encoder(original_df_test, list(aml_encoded_columns))

### ID3

In [5]:
hp_n_thresholds_values = [2, 4, 6]
hp_max_depth_values = [4, 8, 12]

def id3_experiment(df_train, df_test, type,
                   n_thresholds_values=tuple(hp_n_thresholds_values),
                   max_depth_values=tuple(hp_max_depth_values)):
    """Fit and evaluate a DecisionTreeID3 for every (thresholds, max depth) pair.

    For each combination the tree is fitted on ``df_train``, exported through
    ``create_dot_files``, used to predict ``df_test`` and scored with
    ``calculate_performances``; fit and predict durations are printed in minutes.

    Args:
        df_train: frame the trees are fitted on (split with ``get_X_and_Y``).
        df_test: frame the trees are evaluated on (split with ``get_X_and_Y``).
        type: label of the preprocessing variant, embedded in the exported file name.
            (The name shadows the builtin; it is kept so existing calls keep working.)
        n_thresholds_values: values tried for ``num_thresholds_numerical_attr``.
        max_depth_values: values tried for ``max_depth``.

    The two grids default to a snapshot of the lists defined in this cell, taken when
    the function is defined, so rebinding the ``hp_*`` globals in a later cell does not
    silently change the grid explored here.
    """
    X_train, y_train = get_X_and_Y(df_train, verbose=VERBOSE)
    X_test, y_test = get_X_and_Y(df_test, verbose=VERBOSE)

    for hp_n_thresholds in n_thresholds_values:
        printGreen(f"Number of thresholds for num attr: {hp_n_thresholds}")

        for hp_max_depth in max_depth_values:
            printGreen(f"   Max depth: {hp_max_depth}")

            fit_start = time.perf_counter()
            decision_tree: DecisionTreeID3 = DecisionTreeID3(max_depth=hp_max_depth, num_thresholds_numerical_attr=hp_n_thresholds)
            decision_tree.fit(X_train, y_train)
            fit_seconds = time.perf_counter() - fit_start

            # The depth is part of the name: otherwise every depth of the inner loop
            # overwrites the export of the previous one.
            decision_tree.create_dot_files(filename=f"tree-id3-{type}-{hp_n_thresholds}-{hp_max_depth}", generate_png=True, view=VIEW)

            print("PERFORMANCES:")
            # Only the prediction itself is timed (not the export above nor the scoring below).
            predict_start = time.perf_counter()
            predictions = list(decision_tree.predict_test(X_test))
            predict_seconds = time.perf_counter() - predict_start

            calculate_performances(predictions, y_test, "id3", verbose=True)

            print("\nFit time: %.2f minutes" % (fit_seconds / 60))
            print("Predict time: %.2f minutes" % (predict_seconds / 60))


In [6]:
printLBlue("Preprocessing: Undersampling")
# The test split is used as loaded; only the training split goes through undersampling.
df_test = original_df_test
df_train = undersampling(original_df_train, VERBOSE=False)
id3_experiment(df_train, df_test, "undersampling")

[94mPreprocessing: Undersampling[0m
[32mNumber of thresholds for num attr: 2[0m
[32m   Max depth: 4[0m
PARAMETERS:
	MAX DEPTH: 4
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.014815     Accuracy: 0.882675
   Precision: 0.007472       Recall: 0.860711
          TP:      896           TN:   895610
          FP:   119018           FN:      145
         TPR: 0.860711          FPR: 0.117302

Fit time: 0.02 minutes
Predict time: 1.76 minutes
[32m   Max depth: 8[0m
PARAMETERS:
	MAX DEPTH: 8
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.014815     Accuracy: 0.882675
   Precision: 0.007472       Recall: 0.860711
          TP:      896           TN:   895610
          FP:   119018           FN:      145
         TPR: 0.860711          FPR: 0.117302

Fit time: 0.01 minutes
Predict time: 1.80 minutes
[32m   Max depth: 12[0m
PARAMETERS:
	MAX DEPTH: 12
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.014747     Accuracy: 0.881858
   Pre

KeyboardInterrupt: 

In [7]:
printLBlue("Preprocessing: Oversampling")
# The test split is used as loaded; only the training split goes through oversampling.
df_test = original_df_test
df_train = oversampling(original_df_train, VERBOSE=False)
id3_experiment(df_train, df_test, "oversampling")

[94mPreprocessing: Oversampling[0m
[32mNumber of thresholds for num attr: 2[0m
[32m   Max depth: 4[0m
PARAMETERS:
	MAX DEPTH: 4
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.014815     Accuracy: 0.882675
   Precision: 0.007472       Recall: 0.860711
          TP:      896           TN:   895610
          FP:   119018           FN:      145
         TPR: 0.860711          FPR: 0.117302

Fit time: 3.22 minutes
Predict time: 2.15 minutes
[32m   Max depth: 8[0m
PARAMETERS:
	MAX DEPTH: 8
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.014815     Accuracy: 0.882675
   Precision: 0.007472       Recall: 0.860711
          TP:      896           TN:   895610
          FP:   119018           FN:      145
         TPR: 0.860711          FPR: 0.117302

Fit time: 2.89 minutes
Predict time: 2.05 minutes
[32m   Max depth: 12[0m
PARAMETERS:
	MAX DEPTH: 12
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.014815     Accuracy: 0.882675
   Prec

In [8]:
printLBlue("Without preprocessing")
# Baseline run: both splits are passed on exactly as loaded, with no resampling step.
df_train = original_df_train
df_test = original_df_test
id3_experiment(df_train, df_test, "wo_preprocessing")

[94mWithout preprocessing[0m
[32mNumber of thresholds for num attr: 2[0m
[32m   Max depth: 4[0m
PARAMETERS:
	MAX DEPTH: 4
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.000000     Accuracy: 0.998975
   Precision: 0.000000       Recall: 0.000000
          TP:        0           TN:  1014628
          FP:        0           FN:     1041
         TPR: 0.000000          FPR: 0.000000

Fit time: 4.57 minutes
Predict time: 2.15 minutes
[32m   Max depth: 8[0m
PARAMETERS:
	MAX DEPTH: 8
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.000000     Accuracy: 0.998975
   Precision: 0.000000       Recall: 0.000000
          TP:        0           TN:  1014628
          FP:        0           FN:     1041
         TPR: 0.000000          FPR: 0.000000

Fit time: 4.09 minutes
Predict time: 1.98 minutes
[32m   Max depth: 12[0m
PARAMETERS:
	MAX DEPTH: 12
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.000000     Accuracy: 0.998975
   Precision:

### Custom Algorithm

In [5]:
hp_max_depth_values = [4, 7, 10]
hp_n_thresholds_values = [2, 4, 6]
hp_min_samples_split_values = [2, 100]


def custom_alg_experiment(df_train, df_test, type,
                          max_depth_values=tuple(hp_max_depth_values),
                          n_thresholds_values=tuple(hp_n_thresholds_values),
                          min_samples_split_values=tuple(hp_min_samples_split_values)):
    """Fit and evaluate a CustomDecisionTree over the full hyper-parameter grid.

    For each (max depth, thresholds, min samples split) combination the tree is fitted
    on ``df_train``, exported through ``create_dot_files``, used to predict ``df_test``
    and scored with ``calculate_performances``; fit and predict durations are printed
    in minutes.

    Args:
        df_train: frame the trees are fitted on (split with ``get_X_and_Y``).
        df_test: frame the trees are evaluated on (split with ``get_X_and_Y``).
        type: label of the preprocessing variant, embedded in the exported file name.
            (The name shadows the builtin; it is kept so existing calls keep working.)
        max_depth_values: values tried for ``max_depth``.
        n_thresholds_values: values tried for ``num_thresholds_numerical_attr``.
        min_samples_split_values: values tried for ``min_samples_split``.

    The grids default to a snapshot of the lists defined in this cell, taken when the
    function is defined, so rebinding the ``hp_*`` globals elsewhere in the notebook
    does not silently change the grid explored here.
    """
    X_train, y_train = get_X_and_Y(df_train, verbose=VERBOSE)
    X_test, y_test = get_X_and_Y(df_test, verbose=VERBOSE)

    for hp_max_depth in max_depth_values:
        printGreen(f"Max depth: {hp_max_depth}")

        for hp_n_thresholds in n_thresholds_values:
            printGreen(f"Number of thresholds for num attr: {hp_n_thresholds}")

            for hp_min_samples_split in min_samples_split_values:
                printGreen(f"Min samples split: {hp_min_samples_split}")

                fit_start = time.perf_counter()
                decision_tree = CustomDecisionTree(max_depth=hp_max_depth,
                                                   min_samples_split=hp_min_samples_split,
                                                   num_thresholds_numerical_attr=hp_n_thresholds)
                decision_tree.fit(X_train, y_train)
                fit_seconds = time.perf_counter() - fit_start

                # The depth is part of the name: otherwise each pass of the outer loop
                # overwrites the exports written for the previous depth.
                decision_tree.create_dot_files(filename=f"tree-custom-{type}-{hp_n_thresholds}-{hp_min_samples_split}-{hp_max_depth}", generate_png=True, view=VIEW)

                print("PERFORMANCES:")
                # Only the prediction itself is timed (not the export above nor the scoring below).
                predict_start = time.perf_counter()
                predictions = list(decision_tree.predict_test(X_test))
                predict_seconds = time.perf_counter() - predict_start

                calculate_performances(predictions, y_test, "custom", verbose=True)

                print("\nFit time: %.2f minutes" % (fit_seconds / 60))
                print("Predict time: %.2f minutes" % (predict_seconds / 60))


In [6]:
print("\nPreprocessing: Undersampling")
# The test split is used as loaded; only the training split goes through undersampling.
df_test = original_df_test
df_train = undersampling(original_df_train, VERBOSE=False)
custom_alg_experiment(df_train, df_test, "undersampling")


Preprocessing: Undersampling
[32mMax depth: 4[0m
[32mNumber of thresholds for num attr: 2[0m
[32mMin samples split: 2[0m
PARAMETERS:
	CRITERION: Entropy
	TYPE CRITERION: Best
	MAX DEPTH: 4
	MIN SAMPLES SPLIT: 2
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.011703     Accuracy: 0.842853
   Precision: 0.005889       Recall: 0.907781
          TP:      945           TN:   855115
          FP:   159513           FN:       96
         TPR: 0.907781          FPR: 0.157213

Fit time: 0.07 minutes
Predict time: 1.82 minutes
[32mMin samples split: 100[0m
PARAMETERS:
	CRITERION: Entropy
	TYPE CRITERION: Best
	MAX DEPTH: 4
	MIN SAMPLES SPLIT: 100
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.011703     Accuracy: 0.842853
   Precision: 0.005889       Recall: 0.907781
          TP:      945           TN:   855115
          FP:   159513           FN:       96
         TPR: 0.907781          FPR: 0.157213

Fit time: 0.07 minutes
Predict time: 1.78 minute

In [7]:
print("\nPreprocessing: Oversampling")
# The test split is used as loaded; only the training split goes through oversampling.
df_test = original_df_test
df_train = oversampling(original_df_train, VERBOSE=False)
custom_alg_experiment(df_train, df_test, "oversampling")


Preprocessing: Oversampling
[32mMax depth: 4[0m
[32mNumber of thresholds for num attr: 2[0m
[32mMin samples split: 2[0m
PARAMETERS:
	CRITERION: Entropy
	TYPE CRITERION: Best
	MAX DEPTH: 4
	MIN SAMPLES SPLIT: 2
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.022659     Accuracy: 0.928995
   Precision: 0.011492       Recall: 0.803074
          TP:      836           TN:   942715
          FP:    71913           FN:      205
         TPR: 0.803074          FPR: 0.070876

Fit time: 14.38 minutes
Predict time: 2.37 minutes
[32mMin samples split: 100[0m
PARAMETERS:
	CRITERION: Entropy
	TYPE CRITERION: Best
	MAX DEPTH: 4
	MIN SAMPLES SPLIT: 100
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.022659     Accuracy: 0.928995
   Precision: 0.011492       Recall: 0.803074
          TP:      836           TN:   942715
          FP:    71913           FN:      205
         TPR: 0.803074          FPR: 0.070876

Fit time: 14.34 minutes
Predict time: 2.36 minut

KeyboardInterrupt: 

### C4.5


In [None]:

def c45_experiment(df_train, df_test, type, max_depths=(6, 9, 12)):
    """Fit and evaluate a DecisionTreeC45 for every maximum depth in ``max_depths``.

    For each depth the tree is fitted on ``df_train``, exported through
    ``create_dot_files``, used to predict ``df_test`` and scored with
    ``calculate_performances``; fit and predict durations are printed in minutes.

    Args:
        df_train: frame the trees are fitted on (split with ``get_X_and_Y``).
        df_test: frame the trees are evaluated on (split with ``get_X_and_Y``).
        type: label of the preprocessing variant, embedded in the exported file name.
            (The name shadows the builtin; it is kept so existing calls keep working.)
        max_depths: values tried for ``max_depth``; defaults to the grid this
            function previously hard-coded.
    """
    X_train, y_train = get_X_and_Y(df_train, verbose=VERBOSE)
    X_test, y_test = get_X_and_Y(df_test, verbose=VERBOSE)

    for max_depth in max_depths:
        printGreen(f"Max depth: {max_depth}")

        fit_start = time.perf_counter()
        decision_tree = DecisionTreeC45(max_depth=max_depth)
        decision_tree.fit(X_train, y_train)
        fit_seconds = time.perf_counter() - fit_start

        # The depth is part of the name: otherwise every iteration writes to the same
        # file and only the export of the last depth survives.
        decision_tree.create_dot_files(filename=f"tree-c45-{type}-{max_depth}", generate_png=True, view=VIEW)

        print("PERFORMANCES:")
        # Only the prediction itself is timed (not the export above nor the scoring below).
        predict_start = time.perf_counter()
        predictions = list(decision_tree.predict_test(X_test))
        predict_seconds = time.perf_counter() - predict_start

        calculate_performances(predictions, y_test, "c45", verbose=True)

        print("\nFit time: %.2f minutes" % (fit_seconds / 60))
        print("Predict time: %.2f minutes" % (predict_seconds / 60))


In [None]:
printLBlue("Preprocessing: Undersampling")
# The test split is used as loaded; only the training split goes through undersampling.
df_test = original_df_test
df_train = undersampling(original_df_train, VERBOSE=False)
c45_experiment(df_train, df_test, "undersampling")

In [None]:
printLBlue("Preprocessing: Oversampling")
# The test split is used as loaded; only the training split goes through oversampling.
df_test = original_df_test
df_train = oversampling(original_df_train, VERBOSE=False)
c45_experiment(df_train, df_test, "oversampling")