In [None]:
if 'google.colab' in str(get_ipython()):
    GITHUB_TOKEN = ""
    !rm -rf anti_money_laundering
    !git clone https://{GITHUB_TOKEN}@github.com/FedericoBruzzone/anti_money_laundering.git
    !mv anti_money_laundering/* 
    !rm -rf anti_money_laundering

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

import time
import pandas as pd

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

from src.utils.kaggle_config            import setup_kaggle
from src.utils.kaggle_config            import download_dataset

from src.utils.datasets_handler         import (get_train_and_test,
                                                get_X_and_Y,
                                                print_dataset,
                                                label_encoder,
                                                split_timestamp)
from src.utils.performance_measures     import calculate_performances
from src.utils.dataset_sampling_methods import (oversampling,
                                                undersampling,
                                                bootstrap_sampling)

from src.utils.print_utils              import (printLBlue, printGreen)

from src.decision_tree.decision_tree    import CustomDecisionTree
from src.decision_tree.ID3              import DecisionTreeID3
from src.decision_tree.C45              import DecisionTreeC45
from src.decision_tree.entropy_type     import EntropyType
from src.decision_tree.criterion_type   import CriterionType

from IPython.display import Image, display

# Runtime configuration read from the environment (load_dotenv() was called above).
# VERBOSE falls back to 0 when the variable is missing or empty, instead of
# crashing with int(None) / int("").
VERBOSE = int(os.getenv('VERBOSE') or 0)
# None when unset; forwarded as `view=` to create_dot_files in the experiment cells.
VIEW = os.getenv('VIEW')

setup_kaggle()
print("Downloading dataset...") 
download_dataset("iammustafatz/diabetes-prediction-dataset")
download_dataset("ealtman2019/ibm-transactions-for-anti-money-laundering-aml")
print("Done.")

# CSV file names passed to get_train_and_test() in the cells below.
hi_small_trans = "HI-Small_Trans.csv"
diabetes = "diabetes_prediction_dataset.csv"

## Preliminary test: Diabetes Dataset

In [None]:
# Load the diabetes train/test split, separate features from labels, then run
# the two categorical feature columns through label_encoder.
diabetes_categorical_columns = ('gender', 'smoking_history')

df_train, df_test = get_train_and_test(diabetes, verbose=VERBOSE)
(X_train, y_train), (X_test, y_test) = (
    get_X_and_Y(df_train, verbose=VERBOSE),
    get_X_and_Y(df_test, verbose=VERBOSE),
)

# NOTE(review): train and test are encoded by two independent calls and the
# second return value is discarded; if label_encoder fits a fresh mapping per
# call, the same category could receive different codes in each split — confirm.
X_train, _ = label_encoder(X_train, list(diabetes_categorical_columns))
X_test, _ = label_encoder(X_test, list(diabetes_categorical_columns))

### ID3

In [None]:
# Fit an ID3 tree on the diabetes training split, export it, and report the
# test-set performance together with the fit duration.
id3_params = {"max_depth": 10, "num_thresholds_numerical_attr": 6}

print("---------------------- ID3 --------------------------")
start_time = time.time()
decision_tree = DecisionTreeID3(**id3_params)
decision_tree.fit(X_train, y_train)
end_time = time.time()
fit_seconds = end_time - start_time

# Export the fitted tree (dot + PNG); the following cell displays the PNG,
# presumably written under ./dot_figs.
decision_tree.create_dot_files(filename="tree-id3-diabetes", generate_png=True, view="")

print()
print("Performances: ")
predictions = [label for label in decision_tree.predict_test(X_test)]
print(f"Fit time: {fit_seconds} seconds")
calculate_performances(predictions, y_test, "id3", verbose=True)
print("-------------------------- END ID3 --------------------------")

In [None]:
# Show the PNG exported for the ID3 tree trained on the diabetes dataset.
id3_diabetes_png = './dot_figs/tree-id3-diabetes.png'
display(Image(filename=id3_diabetes_png))

### Custom

In [None]:
# Fit the custom decision tree on the diabetes training split, export it, and
# report the test-set performance together with the fit duration.
custom_params = {
    "criterion": EntropyType.SHANNON,
    "type_criterion": CriterionType.BEST,
    "max_depth": 10,
    "min_samples_split": 20,
    "num_thresholds_numerical_attr": 6,
}

print("-------------------------- CUSTOM --------------------------")
start_time = time.time()
decision_tree = CustomDecisionTree(**custom_params)
decision_tree.fit(X_train, y_train)
end_time = time.time()
fit_seconds = end_time - start_time

# Export the fitted tree (dot + PNG); the following cell displays the PNG,
# presumably written under ./dot_figs.
decision_tree.create_dot_files(filename="tree-custom-diabetes", generate_png=True, view="")

print()
print("Performances: ")
predictions = [label for label in decision_tree.predict_test(X_test)]
print(f"Fit time: {fit_seconds} seconds")
calculate_performances(predictions, y_test, "custom", verbose=True)
print("-------------------------- END CUSTOM --------------------------")

In [None]:
# Show the PNG exported for the custom tree trained on the diabetes dataset.
custom_diabetes_png = './dot_figs/tree-custom-diabetes.png'
display(Image(filename=custom_diabetes_png))

## IBM Money Laundering Dataset

In [None]:
# Load the IBM AML split and inspect it: label distribution per split, split
# sizes, and the first training rows.
original_df_train, original_df_test = get_train_and_test(hi_small_trans, verbose=VERBOSE)

for aml_split in (original_df_train, original_df_test):
    print(aml_split["Is Laundering"].value_counts())

n_train_rows = len(original_df_train)
n_test_rows = len(original_df_test)
print("Length of training set:", n_train_rows, "    Length of test set:", n_test_rows)

original_df_train.head()

In [None]:
# Apply split_timestamp to both AML frames. Its return value is not used, so it
# presumably modifies each frame in place.
# NOTE(review): if so, re-running this cell on already-split frames may fail — confirm.
for aml_frame in (original_df_train, original_df_test):
    split_timestamp(aml_frame)

original_df_train.head()

In [None]:
# Run the non-numeric AML columns through label_encoder, once per split.
# NOTE(review): the two splits are encoded by independent calls and the second
# return value is discarded; if label_encoder fits a fresh mapping per call,
# train and test codes may not agree — confirm.
aml_categorical_columns = (
    'Date',
    'Account',
    'Account.1',
    'Receiving Currency',
    'Payment Currency',
    'Payment Format',
)
original_df_train, _ = label_encoder(original_df_train, list(aml_categorical_columns))
original_df_test, _ = label_encoder(original_df_test, list(aml_categorical_columns))

### ID3

In [None]:
hp_n_thresholds_values = [2, 4, 6]
hp_max_depth_values = [4, 8, 12]

def id3_experiment(df_train, df_test, type,
                   n_thresholds_values=tuple(hp_n_thresholds_values),
                   max_depth_values=tuple(hp_max_depth_values)):
    """Train and evaluate DecisionTreeID3 over a grid of hyper-parameters.

    For every (number of thresholds, max depth) pair the tree is fitted on
    ``df_train``, exported through ``create_dot_files``, used to predict
    ``df_test``, and scored with ``calculate_performances``. Fit and predict
    durations are printed in minutes.

    Args:
        df_train: Training frame, split into features/labels by ``get_X_and_Y``.
        df_test: Test frame, split the same way.
        type: Label embedded in the exported file names (e.g. the sampling
            strategy). The name shadows the builtin but is kept so existing
            callers keep working.
        n_thresholds_values: Values tried for ``num_thresholds_numerical_attr``.
            The default is captured when the function is defined, so later
            cells that rebind ``hp_n_thresholds_values`` do not silently change
            this grid.
        max_depth_values: Values tried for ``max_depth``; default captured the
            same way.
    """
    X_train, y_train = get_X_and_Y(df_train, verbose=VERBOSE)
    X_test, y_test = get_X_and_Y(df_test, verbose=VERBOSE)

    for hp_n_thresholds in n_thresholds_values:
        printGreen(f"Number of thresholds for num attr: {hp_n_thresholds}")

        for hp_max_depth in max_depth_values:
            printGreen(f"   Max depth: {hp_max_depth}")

            fit_start = time.time()
            decision_tree = DecisionTreeID3(max_depth=hp_max_depth,
                                            num_thresholds_numerical_attr=hp_n_thresholds)
            decision_tree.fit(X_train, y_train)
            fit_seconds = time.time() - fit_start

            # The max depth is part of the file name; without it every depth
            # tried for the same threshold count overwrote the previous export.
            decision_tree.create_dot_files(
                filename=f"tree-id3-{type}-{hp_n_thresholds}-{hp_max_depth}",
                generate_png=True,
                view=VIEW)

            print("PERFORMANCES:")
            # Time only the prediction itself; the export above and the metric
            # computation below are excluded from the reported predict time.
            predict_start = time.time()
            predictions = list(decision_tree.predict_test(X_test))
            predict_seconds = time.time() - predict_start

            calculate_performances(predictions, y_test, "id3", verbose=True)

            print("\nFit time: %.2f minutes" % (fit_seconds / 60))
            print("Predict time: %.2f minutes" % (predict_seconds / 60))


In [None]:
# ID3 grid trained on the undersampled training frame; the test frame is the
# original one.
printLBlue("Preprocessing: Undersampling")
df_test = original_df_test
df_train = undersampling(original_df_train, VERBOSE=False)
id3_experiment(df_train, df_test, type="undersampling")

In [None]:
# ID3 grid trained on the oversampled training frame; the test frame is the
# original one.
printLBlue("Preprocessing: Oversampling")
df_test = original_df_test
df_train = oversampling(original_df_train, VERBOSE=False)
id3_experiment(df_train, df_test, type="oversampling")

In [None]:
# ID3 grid trained on the training frame as loaded, with no resampling.
printLBlue("Without preprocessing")
df_train = original_df_train
df_test = original_df_test
id3_experiment(df_train, df_test, type="wo_preprocessing")

### Custom Algorithm

In [16]:
hp_max_depth_values = [4, 7, 10]
hp_n_thresholds_values = [2, 4, 6]
hp_min_samples_split_values = [2, 100]


def custom_alg_experiment(df_train, df_test, type, start_index=0, end_index=18,
                          max_depth_values=tuple(hp_max_depth_values),
                          n_thresholds_values=tuple(hp_n_thresholds_values),
                          min_samples_split_values=tuple(hp_min_samples_split_values)):
    """Train and evaluate CustomDecisionTree over a slice of a hyper-parameter grid.

    Combinations are enumerated in nested order (max depth, number of
    thresholds, min samples split) and numbered from 0. Only combinations whose
    index lies in ``[start_index, end_index]`` (both inclusive) are run, which
    lets a long grid be split across several cells.

    Args:
        df_train: Training frame, split into features/labels by ``get_X_and_Y``.
        df_test: Test frame, split the same way.
        type: Label embedded in the exported file names. The name shadows the
            builtin but is kept so existing callers keep working.
        start_index: Index of the first combination to run.
        end_index: Index of the last combination to run (inclusive).
        max_depth_values: Values tried for ``max_depth``. Defaults are captured
            when the function is defined, so rebinding the ``hp_*`` globals in
            another cell does not silently change this grid.
        n_thresholds_values: Values tried for ``num_thresholds_numerical_attr``.
        min_samples_split_values: Values tried for ``min_samples_split``.
    """
    X_train, y_train = get_X_and_Y(df_train, verbose=VERBOSE)
    X_test, y_test = get_X_and_Y(df_test, verbose=VERBOSE)

    combo_index = -1

    for hp_max_depth in max_depth_values:
        for hp_n_thresholds in n_thresholds_values:
            for hp_min_samples_split in min_samples_split_values:
                combo_index += 1
                if not (start_index <= combo_index <= end_index):
                    continue

                printGreen(f"Max depth: {hp_max_depth}")
                printGreen(f"Number of thresholds for num attr: {hp_n_thresholds}")
                printGreen(f"Min samples split: {hp_min_samples_split}")

                fit_start = time.time()
                decision_tree = CustomDecisionTree(max_depth=hp_max_depth,
                                                   min_samples_split=hp_min_samples_split,
                                                   num_thresholds_numerical_attr=hp_n_thresholds)
                decision_tree.fit(X_train, y_train)
                fit_seconds = time.time() - fit_start

                # The max depth is part of the file name; without it, runs that
                # differ only in depth overwrote each other's export.
                decision_tree.create_dot_files(
                    filename=f"tree-custom-{type}-{hp_n_thresholds}-{hp_min_samples_split}-{hp_max_depth}",
                    generate_png=True,
                    view=VIEW)

                print("PERFORMANCES:")
                # Time only the prediction; the export above and the metric
                # computation below are excluded from the reported predict time.
                predict_start = time.time()
                predictions = list(decision_tree.predict_test(X_test))
                predict_seconds = time.time() - predict_start

                calculate_performances(predictions, y_test, "custom", verbose=True)

                print("\nFit time: %.2f minutes" % (fit_seconds / 60))
                print("Predict time: %.2f minutes" % (predict_seconds / 60))


In [None]:
# Full custom-tree grid trained on the undersampled training frame; the test
# frame is the original one.
print("\nPreprocessing: Undersampling")
df_test = original_df_test
df_train = undersampling(original_df_train, VERBOSE=False)
custom_alg_experiment(df_train, df_test, type="undersampling")

In [None]:
# First part of the custom-tree grid (combination indices 0-11) on the
# oversampled training frame; the remaining indices run in the next cell.
print("\nPreprocessing: Oversampling")
df_test = original_df_test
df_train = oversampling(original_df_train, VERBOSE=False)
first_grid_slice = {"start_index": 0, "end_index": 11}
custom_alg_experiment(df_train, df_test, "oversampling", **first_grid_slice)

In [None]:
# Remaining custom-tree combinations (indices 12-18), using the df_train and
# df_test already bound in the kernel by an earlier cell.
remaining_grid_slice = {"start_index": 12, "end_index": 18}
custom_alg_experiment(df_train, df_test, "oversampling", **remaining_grid_slice)

### C4.5


In [None]:

def c45_experiment(df_train, df_test, type, max_depths=(6, 9, 12)):
    """Train and evaluate DecisionTreeC45 for several maximum depths.

    For each depth the tree is fitted on ``df_train``, exported through
    ``create_dot_files``, used to predict ``df_test``, and scored with
    ``calculate_performances``. Fit and predict durations are printed in minutes.

    Args:
        df_train: Training frame, split into features/labels by ``get_X_and_Y``.
        df_test: Test frame, split the same way.
        type: Label embedded in the exported file names. The name shadows the
            builtin but is kept so existing callers keep working.
        max_depths: Values tried for ``max_depth``; defaults to the depths that
            were previously hard-coded in the body.
    """
    X_train, y_train = get_X_and_Y(df_train, verbose=VERBOSE)
    X_test, y_test = get_X_and_Y(df_test, verbose=VERBOSE)

    for max_depth in max_depths:
        printGreen(f"Max depth: {max_depth}")

        fit_start = time.time()
        decision_tree = DecisionTreeC45(max_depth=max_depth)
        decision_tree.fit(X_train, y_train)
        fit_seconds = time.time() - fit_start

        # The max depth is part of the file name; without it every depth wrote
        # to the same file and only the last export survived.
        decision_tree.create_dot_files(filename=f"tree-c45-{type}-{max_depth}",
                                       generate_png=True,
                                       view=VIEW)

        print("PERFORMANCES:")
        # Time only the prediction; the export above and the metric computation
        # below are excluded from the reported predict time.
        predict_start = time.time()
        predictions = list(decision_tree.predict_test(X_test))
        predict_seconds = time.time() - predict_start

        calculate_performances(predictions, y_test, "c45", verbose=True)

        print("\nFit time: %.2f minutes" % (fit_seconds / 60))
        print("Predict time: %.2f minutes" % (predict_seconds / 60))


In [None]:
# C4.5 depth sweep trained on the undersampled training frame; the test frame
# is the original one.
printLBlue("Preprocessing: Undersampling")
df_test = original_df_test
df_train = undersampling(original_df_train, VERBOSE=False)
c45_experiment(df_train, df_test, type="undersampling")

In [None]:
# C4.5 depth sweep trained on the oversampled training frame; the test frame
# is the original one.
printLBlue("Preprocessing: Oversampling")
df_test = original_df_test
df_train = oversampling(original_df_train, VERBOSE=False)
c45_experiment(df_train, df_test, type="oversampling")