In [7]:
# Notebook configuration; these values are written into .env by the Colab-only cell below.
# Kaggle API credentials: left blank here; fill them in locally and do not commit real values.
KAGGLE_USER=""
KAGGLE_KEY=""
# Passed as `view=` to create_dot_files in the experiment functions.
VIEW="default-viewer"
# Kept as a string because it is written verbatim into .env; parsed with int() after load_dotenv().
VERBOSE="0"

In [1]:
# Colab-only bootstrap: fetch the project repository into the working directory and install it.
if 'google.colab' in str(get_ipython()):
    # Only referenced by the commented-out authenticated clone below; keep it empty in shared copies.
    GITHUB_TOKEN = ""
    # Remove any checkout left over from a previous run of this cell.
    !rm -rf anti-money-laundering
    #!git clone https://{GITHUB_TOKEN}@github.com/FedericoBruzzone/anti-money-laundering.git
    !git clone https://github.com/FedericoBruzzone/anti-money-laundering.git
    # Move the checkout's contents (dot-entries first, then regular entries) into the current directory.
    # NOTE(review): the `.*` glob may also match `.` and `..`, for which mv reports an error — confirm this is harmless here.
    !mv anti-money-laundering/.* .
    !mv anti-money-laundering/* .
    !rm -rf ./anti-money-laundering
    # Editable install of the project now located in the current directory.
    !pip install -e .

In [9]:
# Colab-only: create .env from the template and append the values from the configuration cell.
if 'google.colab' in str(get_ipython()):
    !cp ./.env.example ./.env
    with open('.env', 'r') as f:
        mod = f.read().splitlines()
        # Drop the first two and the last two lines of the template.
        # NOTE(review): presumably these are the placeholder entries re-added below — confirm against .env.example.
        mod = mod[2:-2]
        mod.append(f'KAGGLE_USER={KAGGLE_USER}')
        mod.append(f'KAGGLE_KEY={KAGGLE_KEY}')
        mod.append(f'VIEW={VIEW}')
        mod.append(f'VERBOSE={VERBOSE}')
    # Rewrite .env with one entry per line.
    with open('.env', 'w') as f:
        for i in mod:
            f.write(i + '\n')

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

import time
import pandas as pd

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

from src.utils.kaggle_config            import setup_kaggle
from src.utils.kaggle_config            import download_dataset

from src.utils.datasets_handler         import (get_train_and_test,
                                                get_X_and_Y,
                                                print_dataset,
                                                label_encoder,
                                                split_timestamp)
from src.utils.performance_measures     import calculate_performances
from src.utils.dataset_sampling_methods import (oversampling,
                                                undersampling,
                                                bootstrap_sampling)

from src.utils.print_utils              import (printLBlue, printGreen)

from src.decision_tree.decision_tree    import CustomDecisionTree
from src.decision_tree.ID3              import DecisionTreeID3
from src.decision_tree.C45              import DecisionTreeC45
from src.decision_tree.entropy_type     import EntropyType
from src.decision_tree.criterion_type   import CriterionType

from IPython.display import Image, display

# Runtime settings read from the environment (populated by load_dotenv() above).
# Fall back to 0 when VERBOSE is unset or empty: os.getenv() then returns None or "",
# and int() would raise TypeError / ValueError respectively.
VERBOSE = int(os.getenv('VERBOSE') or 0)
# Passed as `view=` to create_dot_files; None when the variable is not set.
VIEW = os.getenv('VIEW')

# CSV file names handed to get_train_and_test in the sections below.
diabetes = "diabetes_prediction_dataset.csv"
hi_small_trans = "HI-Small_Trans.csv"

# Configure Kaggle access, then fetch both datasets used by this notebook.
setup_kaggle()
print("Downloading dataset...")
download_dataset("iammustafatz/diabetes-prediction-dataset")
download_dataset("ealtman2019/ibm-transactions-for-anti-money-laundering-aml")
print("Done.")

Downloading dataset...
Done.


## Preliminary test: Diabetes Dataset

In [3]:
# Load the diabetes dataset as a train/test pair and split each part into X and y.
df_train, df_test = get_train_and_test(diabetes, verbose=VERBOSE)
X_train, y_train = get_X_and_Y(df_train, verbose=VERBOSE)
X_test, y_test = get_X_and_Y(df_test, verbose=VERBOSE)
# Label-encode the two listed columns; the second return value is discarded.
# NOTE(review): train and test are encoded by separate calls — confirm label_encoder
# produces the same value-to-code mapping for both, otherwise the codes are not comparable.
X_train, _ = label_encoder(X_train, ['gender', 'smoking_history'])
X_test,  _ = label_encoder(X_test, ['gender', 'smoking_history'])

### ID3

In [4]:
# Fit an ID3 tree on the diabetes training split, export its graph, and report test performance.
print("---------------------- ID3 --------------------------")    
# start_time / end_time bracket only the constructor call and fit().
start_time = time.time()
decision_tree: DecisionTreeID3 = DecisionTreeID3(max_depth=10, 
                                                 num_thresholds_numerical_attr=6)
decision_tree.fit(X_train, y_train)
end_time = time.time()
# view="" here, unlike the experiment functions further down, which pass VIEW.
decision_tree.create_dot_files(filename="tree-id3-diabetes", generate_png=True, view="")
print()
print("Performances: ")
# list() materialises whatever iterable predict_test returns.
predictions = list(decision_tree.predict_test(X_test))
print(f"Fit time: {end_time - start_time} seconds") 
calculate_performances(predictions, y_test, "id3", verbose=True)
print("-------------------------- END ID3 --------------------------")

---------------------- ID3 --------------------------
PARAMETERS:
	MAX DEPTH: 10
	NUM THRESHOLDS NUMERICAL ATTR: 6


Performances: 
Fit time: 2.076502561569214 seconds
    F1 score: 0.582247     Accuracy: 0.950350
   Precision: 1.000000       Recall: 0.410682
          TP:      692           TN:    18315
          FP:        0           FN:      993
         TPR: 0.410682          FPR: 0.000000
-------------------------- END ID3 --------------------------


In [None]:
# Render the ID3 diabetes tree image inline.
# NOTE(review): presumably the PNG produced by create_dot_files(filename="tree-id3-diabetes", generate_png=True) — confirm the output directory.
display(Image(filename='./dot_figs/tree-id3-diabetes.png'))

### Custom

In [None]:
# Fit the custom decision tree on the diabetes training split, export its graph, and report test performance.
print("-------------------------- CUSTOM --------------------------")
# start_time / end_time bracket only the constructor call and fit().
start_time = time.time()
# Rebinds `decision_tree`, which previously held the ID3 model.
decision_tree = CustomDecisionTree(criterion=EntropyType.SHANNON, 
                                    type_criterion=CriterionType.BEST, 
                                    max_depth=10, 
                                    min_samples_split=20,
                                    num_thresholds_numerical_attr=6)
decision_tree.fit(X_train, y_train)
end_time = time.time()
decision_tree.create_dot_files(filename="tree-custom-diabetes", generate_png=True, view="")
print()
print("Performances: ") 
# list() materialises whatever iterable predict_test returns.
predictions = list(decision_tree.predict_test(X_test))
print(f"Fit time: {end_time - start_time} seconds")
calculate_performances(predictions, y_test, "custom", verbose=True)
print("-------------------------- END CUSTOM --------------------------")

In [None]:
# Render the custom-tree diabetes image inline.
# NOTE(review): presumably the PNG produced by create_dot_files(filename="tree-custom-diabetes", generate_png=True) — confirm the output directory.
display(Image(filename='./dot_figs/tree-custom-diabetes.png'))

## IBM Money Laundering Dataset

In [2]:
# Load the IBM AML transactions as a train/test pair; these "original" frames are the
# starting point for every sampling variant in the experiment cells below.
original_df_train, original_df_test = get_train_and_test(hi_small_trans, verbose=VERBOSE)

# Class balance of the target column in each split.
print(original_df_train["Is Laundering"].value_counts())
print(original_df_test["Is Laundering"].value_counts())

print("Length of training set:", len(original_df_train), "    Length of test set:", len(original_df_test))

# Last expression of the cell: rendered as the cell output.
original_df_train.head()

Is Laundering
0    4058540
1       4136
Name: count, dtype: int64
Is Laundering
0    1014628
1       1041
Name: count, dtype: int64
Length of training set: 4062676     Length of test set: 1015669


Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
1068097,2022/09/01 22:48,318886,80B3EA590,24161,80B3EA770,3582.88,US Dollar,3582.88,US Dollar,Cheque,0
3824191,2022/09/08 03:57,14,80FD57700,43049,80FD69960,175991.6,Shekel,175991.6,Shekel,Wire,0
2416035,2022/09/05 05:31,235509,80CE5C840,235843,80D5A5E70,15451.15,Brazil Real,15451.15,Brazil Real,Cheque,0
126628,2022/09/01 00:04,2591,800602EA0,1299,800CBA7A0,3009.82,Euro,3009.82,Euro,Cheque,0
3797765,2022/09/08 02:36,2947,8044D4D50,2627,80B8F74D0,5236.26,US Dollar,5236.26,US Dollar,Credit Card,0


In [3]:
# split_timestamp is called for its effect on the frames (its return value is not used).
# In the output below, `Timestamp` is gone and `Date`, `Hour`, `Minute` are present.
# NOTE(review): because the frames are modified in place, re-running this cell without
# reloading the data operates on already-split frames — confirm that is safe.
split_timestamp(original_df_train)
split_timestamp(original_df_test)

original_df_train.head()

Unnamed: 0,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Date,Hour,Minute,Is Laundering
1068097,318886,80B3EA590,24161,80B3EA770,3582.88,US Dollar,3582.88,US Dollar,Cheque,2022/09/01,22,48,0
3824191,14,80FD57700,43049,80FD69960,175991.6,Shekel,175991.6,Shekel,Wire,2022/09/08,3,57,0
2416035,235509,80CE5C840,235843,80D5A5E70,15451.15,Brazil Real,15451.15,Brazil Real,Cheque,2022/09/05,5,31,0
126628,2591,800602EA0,1299,800CBA7A0,3009.82,Euro,3009.82,Euro,Cheque,2022/09/01,0,4,0
3797765,2947,8044D4D50,2627,80B8F74D0,5236.26,US Dollar,5236.26,US Dollar,Credit Card,2022/09/08,2,36,0


In [4]:
# Label-encode the listed columns of both splits; the second return value is discarded.
# NOTE(review): train and test are encoded by separate calls — confirm label_encoder assigns
# the same code to the same value in both (e.g. account identifiers), otherwise a tree fitted
# on the training codes is evaluated on unrelated test codes.
original_df_train, _ = label_encoder(original_df_train, ['Date', 'Account', 'Account.1', 'Receiving Currency', 'Payment Currency', 'Payment Format'])
original_df_test, _ = label_encoder(original_df_test, ['Date', 'Account', 'Account.1', 'Receiving Currency', 'Payment Currency', 'Payment Format'])

### ID3

In [5]:
# Hyperparameter grid for the ID3 experiments; id3_experiment reads these module-level
# names when it is called, not when it is defined.
# NOTE: both names are reassigned in the "Custom Algorithm" section further down, so calling
# id3_experiment after that cell has run uses that section's values instead.
hp_n_thresholds_values = [2, 4, 6]
hp_max_depth_values = [4, 8, 12]

def id3_experiment(df_train, df_test, type):
    """Fit and evaluate an ID3 tree for every combination in the ID3 grid.

    Iterates over the module-level ``hp_n_thresholds_values`` (outer loop) and
    ``hp_max_depth_values`` (inner loop). For each combination the tree is
    fitted on ``df_train``, exported through ``create_dot_files``, and
    evaluated on ``df_test`` with ``calculate_performances``.

    Args:
        df_train: Training frame, split into X and y with ``get_X_and_Y``.
        df_test: Test frame, split the same way.
        type: Label for the preprocessing variant (e.g. "undersampling");
            used only in the exported file name. The parameter name shadows
            the ``type`` builtin and is kept for caller compatibility.

    Notes:
        * The exported file name now ends with the max depth, so each
          combination gets its own artifact instead of overwriting the one
          produced for the previous depth.
        * "Predict time" covers only ``predict_test``; it previously ran from
          the end of fitting and therefore also counted graph export and
          metric computation.
    """
    X_train, y_train = get_X_and_Y(df_train, verbose=VERBOSE)
    X_test, y_test = get_X_and_Y(df_test, verbose=VERBOSE)

    for hp_n_thresholds in hp_n_thresholds_values:
        printGreen(f"Number of thresholds for num attr: {hp_n_thresholds}")

        for hp_max_depth in hp_max_depth_values:
            printGreen(f"   Max depth: {hp_max_depth}")

            # Time construction + fit only.
            fit_start = time.time()
            decision_tree: DecisionTreeID3 = DecisionTreeID3(max_depth=hp_max_depth, num_thresholds_numerical_attr=hp_n_thresholds)
            decision_tree.fit(X_train, y_train)
            fit_seconds = time.time() - fit_start

            # One file per (variant, thresholds, depth) combination.
            decision_tree.create_dot_files(filename=f"tree-id3-{type}-{hp_n_thresholds}-{hp_max_depth}", generate_png=True, view=VIEW)

            print("PERFORMANCES:")
            # Time prediction only; list() forces the full result before the clock stops.
            predict_start = time.time()
            predictions = list(decision_tree.predict_test(X_test))
            predict_seconds = time.time() - predict_start

            calculate_performances(predictions, y_test, "id3", verbose=True)

            print("\nFit time: %.2f minutes" % (fit_seconds / 60))
            print("Predict time: %.2f minutes" % (predict_seconds / 60))


In [6]:
# ID3 on an undersampled training set; the test set is used unchanged.
printLBlue("Preprocessing: Undersampling")
df_test = original_df_test
df_train = undersampling(original_df_train, VERBOSE=False)
id3_experiment(df_train=df_train, df_test=df_test, type="undersampling")

[94mPreprocessing: Undersampling[0m
[32mNumber of thresholds for num attr: 2[0m
[32m   Max depth: 4[0m
PARAMETERS:
	MAX DEPTH: 4
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.014815     Accuracy: 0.882675
   Precision: 0.007472       Recall: 0.860711
          TP:      896           TN:   895610
          FP:   119018           FN:      145
         TPR: 0.860711          FPR: 0.117302

Fit time: 0.01 minutes
Predict time: 1.66 minutes
[32m   Max depth: 8[0m
PARAMETERS:
	MAX DEPTH: 8
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.014815     Accuracy: 0.882675
   Precision: 0.007472       Recall: 0.860711
          TP:      896           TN:   895610
          FP:   119018           FN:      145
         TPR: 0.860711          FPR: 0.117302

Fit time: 0.01 minutes
Predict time: 1.74 minutes
[32m   Max depth: 12[0m
PARAMETERS:
	MAX DEPTH: 12
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.014747     Accuracy: 0.881858
   Pre

In [8]:
# ID3 on an oversampled training set; the test set is used unchanged.
printLBlue("Preprocessing: Oversampling")
df_test = original_df_test
df_train = oversampling(original_df_train, VERBOSE=False)
id3_experiment(df_train=df_train, df_test=df_test, type="oversampling")

[94mPreprocessing: Oversampling[0m
[32mNumber of thresholds for num attr: 2[0m
[32m   Max depth: 4[0m
PARAMETERS:
	MAX DEPTH: 4
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.014815     Accuracy: 0.882675
   Precision: 0.007472       Recall: 0.860711
          TP:      896           TN:   895610
          FP:   119018           FN:      145
         TPR: 0.860711          FPR: 0.117302

Fit time: 3.31 minutes
Predict time: 1.62 minutes
[32m   Max depth: 8[0m
PARAMETERS:
	MAX DEPTH: 8
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.014815     Accuracy: 0.882675
   Precision: 0.007472       Recall: 0.860711
          TP:      896           TN:   895610
          FP:   119018           FN:      145
         TPR: 0.860711          FPR: 0.117302

Fit time: 2.94 minutes
Predict time: 1.50 minutes
[32m   Max depth: 12[0m
PARAMETERS:
	MAX DEPTH: 12
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.014815     Accuracy: 0.882675
   Prec

In [7]:
# ID3 baseline: train directly on the original, unsampled training set.
printLBlue("Without preprocessing")
df_train = original_df_train
df_test = original_df_test
id3_experiment(df_train=df_train, df_test=df_test, type="wo_preprocessing")

[94mWithout preprocessing[0m
[32mNumber of thresholds for num attr: 2[0m
[32m   Max depth: 4[0m
PARAMETERS:
	MAX DEPTH: 4
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.000000     Accuracy: 0.998975
   Precision: 0.000000       Recall: 0.000000
          TP:        0           TN:  1014628
          FP:        0           FN:     1041
         TPR: 0.000000          FPR: 0.000000

Fit time: 4.20 minutes
Predict time: 1.61 minutes
[32m   Max depth: 8[0m
PARAMETERS:
	MAX DEPTH: 8
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.000000     Accuracy: 0.998975
   Precision: 0.000000       Recall: 0.000000
          TP:        0           TN:  1014628
          FP:        0           FN:     1041
         TPR: 0.000000          FPR: 0.000000

Fit time: 3.89 minutes
Predict time: 1.49 minutes
[32m   Max depth: 12[0m
PARAMETERS:
	MAX DEPTH: 12
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.000000     Accuracy: 0.998975
   Precision:

### Custom Algorithm

In [9]:
# Hyperparameter grid for the custom-tree experiments: 4 x 3 x 2 = 24 combinations, which
# custom_alg_experiment numbers 0..23 with max depth varying slowest (6 combinations per depth).
# NOTE: hp_max_depth_values and hp_n_thresholds_values reuse the names of the ID3 grid
# defined earlier in the notebook and replace its values from this cell onwards.
hp_max_depth_values = [4, 7, 10, 20]
hp_n_thresholds_values = [2, 4, 6]
hp_min_samples_split_values = [2, 100]


def custom_alg_experiment(df_train, df_test, type, start_index=0, end_index=24): 
    """Fit and evaluate a CustomDecisionTree for a slice of the hyperparameter grid.

    The grid is the Cartesian product of the module-level
    ``hp_max_depth_values``, ``hp_n_thresholds_values`` and
    ``hp_min_samples_split_values`` (in that nesting order, max depth varying
    slowest). Combinations are numbered from 0; only those whose index ``k``
    satisfies ``start_index <= k <= end_index`` are run, which lets a long
    sweep be split across several cells.

    Args:
        df_train: Training frame, split into X and y with ``get_X_and_Y``.
        df_test: Test frame, split the same way.
        type: Label for the preprocessing variant; used only in the exported
            file name. The parameter name shadows the ``type`` builtin and is
            kept for caller compatibility.
        start_index: First grid index to run (inclusive).
        end_index: Last grid index to run (inclusive). Values past the end of
            the grid are harmless.

    Notes:
        * The exported file name now ends with the max depth, so runs that
          differ only in depth no longer overwrite each other's artifact.
        * "Predict time" covers only ``predict_test``; it previously ran from
          the end of fitting and therefore also counted graph export and
          metric computation.
    """
    from itertools import product

    X_train, y_train = get_X_and_Y(df_train, verbose=VERBOSE)
    X_test, y_test = get_X_and_Y(df_test, verbose=VERBOSE)

    # product() yields combinations in the same order as the equivalent nested loops.
    grid = product(hp_max_depth_values, hp_n_thresholds_values, hp_min_samples_split_values)

    for k, (hp_max_depth, hp_n_thresholds, hp_min_samples_split) in enumerate(grid):
        if k < start_index:
            continue
        if k > end_index:
            # Indices only grow, so nothing further can be in range.
            break

        printGreen(f"Max depth: {hp_max_depth}")
        printGreen(f"Number of thresholds for num attr: {hp_n_thresholds}")
        printGreen(f"Min samples split: {hp_min_samples_split}")

        # Time construction + fit only.
        fit_start = time.time()
        decision_tree = CustomDecisionTree(max_depth=hp_max_depth,
                                           min_samples_split=hp_min_samples_split,
                                           num_thresholds_numerical_attr=hp_n_thresholds)
        decision_tree.fit(X_train, y_train)
        fit_seconds = time.time() - fit_start

        # One file per (variant, thresholds, min split, depth) combination.
        decision_tree.create_dot_files(filename=f"tree-custom-{type}-{hp_n_thresholds}-{hp_min_samples_split}-{hp_max_depth}", generate_png=True, view=VIEW)

        print("PERFORMANCES:")
        # Time prediction only; list() forces the full result before the clock stops.
        predict_start = time.time()
        predictions = list(decision_tree.predict_test(X_test))
        predict_seconds = time.time() - predict_start

        calculate_performances(predictions, y_test, "custom", verbose=True)

        print("\nFit time: %.2f minutes" % (fit_seconds / 60))
        print("Predict time: %.2f minutes" % (predict_seconds / 60))


In [10]:
# Custom tree on an undersampled training set, full hyperparameter grid.
print("\nPreprocessing: Undersampling")
df_test = original_df_test
df_train = undersampling(original_df_train, VERBOSE=False)
custom_alg_experiment(df_train=df_train, df_test=df_test, type="undersampling")


Preprocessing: Undersampling
[32mMax depth: 4[0m
[32mNumber of thresholds for num attr: 2[0m
[32mMin samples split: 2[0m
PARAMETERS:
	CRITERION: Entropy
	TYPE CRITERION: Best
	MAX DEPTH: 4
	MIN SAMPLES SPLIT: 2
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.011703     Accuracy: 0.842853
   Precision: 0.005889       Recall: 0.907781
          TP:      945           TN:   855115
          FP:   159513           FN:       96
         TPR: 0.907781          FPR: 0.157213

Fit time: 0.08 minutes
Predict time: 1.21 minutes
[32mMax depth: 4[0m
[32mNumber of thresholds for num attr: 2[0m
[32mMin samples split: 100[0m
PARAMETERS:
	CRITERION: Entropy
	TYPE CRITERION: Best
	MAX DEPTH: 4
	MIN SAMPLES SPLIT: 100
	NUM THRESHOLDS NUMERICAL ATTR: 2

PERFORMANCES:
    F1 score: 0.011703     Accuracy: 0.842853
   Precision: 0.005889       Recall: 0.907781
          TP:      945           TN:   855115
          FP:   159513           FN:       96
         TPR: 0.907781     

In [None]:
# Custom tree on an oversampled training set, first slice of the grid (indices 0..11 inclusive).
print("\nPreprocessing: Oversampling")
df_test = original_df_test
df_train = oversampling(original_df_train, VERBOSE=False)
custom_alg_experiment(df_train=df_train, df_test=df_test, type="oversampling", start_index=0, end_index=11)

In [None]:
# Custom tree on an oversampled training set, second slice of the grid.
# end_index is inclusive, and the following cell starts at index 18, so this slice must
# stop at 17; with end_index=18 combination 18 would be trained twice.
# NOTE(review): each slice cell calls oversampling() again — if that sampling is random and
# unseeded, the slices are fitted on different training samples; confirm.
df_train, df_test = original_df_train, original_df_test
df_train = oversampling(df_train, VERBOSE=False)
custom_alg_experiment(df_train, df_test, "oversampling", start_index=12, end_index=17)

In [11]:
# Custom tree on an oversampled training set, last slice of the grid (indices 18 and up;
# the upper bound 24 is past the last grid index, so the slice simply runs to the end).
df_test = original_df_test
df_train = oversampling(original_df_train, VERBOSE=False)
custom_alg_experiment(df_train=df_train, df_test=df_test, type="oversampling", start_index=18, end_index=24)

[32mMax depth: 20[0m
[32mNumber of thresholds for num attr: 2[0m
[32mMin samples split: 2[0m
PARAMETERS:
	CRITERION: Entropy
	TYPE CRITERION: Best
	MAX DEPTH: 20
	MIN SAMPLES SPLIT: 2
	NUM THRESHOLDS NUMERICAL ATTR: 2



dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.119816 to fit


PERFORMANCES:
    F1 score: 0.026557     Accuracy: 0.965787
   Precision: 0.013677       Recall: 0.455331
          TP:      474           TN:   980446
          FP:    34182           FN:      567
         TPR: 0.455331          FPR: 0.033689

Fit time: 55.74 minutes
Predict time: 2.65 minutes
[32mMax depth: 20[0m
[32mNumber of thresholds for num attr: 2[0m
[32mMin samples split: 100[0m
PARAMETERS:
	CRITERION: Entropy
	TYPE CRITERION: Best
	MAX DEPTH: 20
	MIN SAMPLES SPLIT: 100
	NUM THRESHOLDS NUMERICAL ATTR: 2



dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.119816 to fit


PERFORMANCES:
    F1 score: 0.026557     Accuracy: 0.965787
   Precision: 0.013677       Recall: 0.455331
          TP:      474           TN:   980446
          FP:    34182           FN:      567
         TPR: 0.455331          FPR: 0.033689

Fit time: 49.65 minutes
Predict time: 2.73 minutes
[32mMax depth: 20[0m
[32mNumber of thresholds for num attr: 4[0m
[32mMin samples split: 2[0m
PARAMETERS:
	CRITERION: Entropy
	TYPE CRITERION: Best
	MAX DEPTH: 20
	MIN SAMPLES SPLIT: 2
	NUM THRESHOLDS NUMERICAL ATTR: 4



dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.117254 to fit


PERFORMANCES:
    F1 score: 0.024872     Accuracy: 0.963406
   Precision: 0.012785       Recall: 0.455331
          TP:      474           TN:   978028
          FP:    36600           FN:      567
         TPR: 0.455331          FPR: 0.036072

Fit time: 57.27 minutes
Predict time: 2.56 minutes
[32mMax depth: 20[0m
[32mNumber of thresholds for num attr: 4[0m
[32mMin samples split: 100[0m
PARAMETERS:
	CRITERION: Entropy
	TYPE CRITERION: Best
	MAX DEPTH: 20
	MIN SAMPLES SPLIT: 100
	NUM THRESHOLDS NUMERICAL ATTR: 4



dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.117254 to fit


PERFORMANCES:
    F1 score: 0.024872     Accuracy: 0.963406
   Precision: 0.012785       Recall: 0.455331
          TP:      474           TN:   978028
          FP:    36600           FN:      567
         TPR: 0.455331          FPR: 0.036072

Fit time: 57.44 minutes
Predict time: 2.56 minutes
[32mMax depth: 20[0m
[32mNumber of thresholds for num attr: 6[0m
[32mMin samples split: 2[0m
PARAMETERS:
	CRITERION: Entropy
	TYPE CRITERION: Best
	MAX DEPTH: 20
	MIN SAMPLES SPLIT: 2
	NUM THRESHOLDS NUMERICAL ATTR: 6



dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.138289 to fit


PERFORMANCES:
    F1 score: 0.024460     Accuracy: 0.964109
   Precision: 0.012581       Recall: 0.439001
          TP:      457           TN:   978759
          FP:    35869           FN:      584
         TPR: 0.439001          FPR: 0.035352

Fit time: 64.89 minutes
Predict time: 2.89 minutes
[32mMax depth: 20[0m
[32mNumber of thresholds for num attr: 6[0m
[32mMin samples split: 100[0m
PARAMETERS:
	CRITERION: Entropy
	TYPE CRITERION: Best
	MAX DEPTH: 20
	MIN SAMPLES SPLIT: 100
	NUM THRESHOLDS NUMERICAL ATTR: 6



dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.138289 to fit


PERFORMANCES:
    F1 score: 0.024460     Accuracy: 0.964109
   Precision: 0.012581       Recall: 0.439001
          TP:      457           TN:   978759
          FP:    35869           FN:      584
         TPR: 0.439001          FPR: 0.035352

Fit time: 72.33 minutes
Predict time: 2.81 minutes


### C4.5


In [5]:
# Hyperparameter grid for the C4.5 experiments: 3 x 2 = 6 combinations, which c45_experiment
# numbers 0..5 with max depth varying slowest.
max_depths = [4, 8, 12]
# Same name as the list defined in the "Custom Algorithm" section; this assignment rebinds it.
hp_min_samples_split_values = [2, 100]

def c45_experiment(df_train, df_test, type, start_index=0, end_index=6):
    """Fit and evaluate a DecisionTreeC45 for a slice of the hyperparameter grid.

    The grid is the Cartesian product of the module-level ``max_depths`` and
    ``hp_min_samples_split_values`` (max depth varying slowest). Combinations
    are numbered from 0; only those whose index ``k`` satisfies
    ``start_index <= k <= end_index`` are run.

    Args:
        df_train: Training frame, split into X and y with ``get_X_and_Y``.
        df_test: Test frame, split the same way.
        type: Label for the preprocessing variant; used only in the exported
            file name. The parameter name shadows the ``type`` builtin and is
            kept for caller compatibility.
        start_index: First grid index to run (inclusive).
        end_index: Last grid index to run (inclusive). Values past the end of
            the grid are harmless.

    Notes:
        * The exported file name now includes max depth and min samples
          split; previously every combination for a given ``type`` wrote to
          the same file and only the last tree survived.
        * "Predict time" covers only ``predict_test``; it previously ran from
          the end of fitting and therefore also counted graph export and
          metric computation.
    """
    from itertools import product

    X_train, y_train = get_X_and_Y(df_train, verbose=VERBOSE)
    X_test, y_test = get_X_and_Y(df_test, verbose=VERBOSE)

    # product() yields combinations in the same order as the equivalent nested loops.
    grid = product(max_depths, hp_min_samples_split_values)

    for k, (max_depth, min_samples_split) in enumerate(grid):
        if k < start_index:
            continue
        if k > end_index:
            # Indices only grow, so nothing further can be in range.
            break

        printGreen(f"Max depth: {max_depth}")
        printGreen(f"Min samples split: {min_samples_split}")

        # Time construction + fit only.
        fit_start = time.time()
        decision_tree = DecisionTreeC45(max_depth=max_depth, min_samples_split=min_samples_split)
        decision_tree.fit(X_train, y_train)
        fit_seconds = time.time() - fit_start

        # One file per (variant, depth, min split) combination.
        decision_tree.create_dot_files(filename=f"tree-c45-{type}-{max_depth}-{min_samples_split}", generate_png=True, view=VIEW)

        print("PERFORMANCES:")
        # Time prediction only; list() forces the full result before the clock stops.
        predict_start = time.time()
        predictions = list(decision_tree.predict_test(X_test))
        predict_seconds = time.time() - predict_start

        calculate_performances(predictions, y_test, "c45", verbose=True)

        print("\nFit time: %.2f minutes" % (fit_seconds / 60))
        print("Predict time: %.2f minutes" % (predict_seconds / 60))


In [18]:
# C4.5 on an undersampled training set, full hyperparameter grid.
printLBlue("Preprocessing: Undersampling")
df_test = original_df_test
df_train = undersampling(original_df_train, VERBOSE=False)
c45_experiment(df_train=df_train, df_test=df_test, type="undersampling")

[94mPreprocessing: Undersampling[0m
[32mMax depth: 4[0m
[32mMin samples split: 2[0m
PARAMETERS:
	MAX DEPTH: 4

PERFORMANCES:
    F1 score: 0.013796     Accuracy: 0.869793
   Precision: 0.006952       Recall: 0.888569
          TP:      925           TN:   882497
          FP:   132131           FN:      116
         TPR: 0.888569          FPR: 0.130226

Fit time: 0.07 minutes
Predict time: 1.36 minutes
[32mMax depth: 4[0m
[32mMin samples split: 100[0m
PARAMETERS:
	MAX DEPTH: 4

PERFORMANCES:
    F1 score: 0.013600     Accuracy: 0.869175
   Precision: 0.006853       Recall: 0.879923
          TP:      916           TN:   881878
          FP:   132750           FN:      125
         TPR: 0.879923          FPR: 0.130836

Fit time: 0.07 minutes
Predict time: 1.32 minutes
[32mMax depth: 8[0m
[32mMin samples split: 2[0m
PARAMETERS:
	MAX DEPTH: 8

PERFORMANCES:
    F1 score: 0.012974     Accuracy: 0.861574
   Precision: 0.006535       Recall: 0.887608
          TP:      924     

In [19]:
# C4.5 on an oversampled training set, first slice of the grid (indices 0..3 inclusive).
printLBlue("Preprocessing: Oversampling")
df_test = original_df_test
df_train = oversampling(original_df_train, VERBOSE=False)
c45_experiment(df_train=df_train, df_test=df_test, type="oversampling", start_index=0, end_index=3)

[94mPreprocessing: Oversampling[0m
[32mMax depth: 4[0m
[32mMin samples split: 2[0m
PARAMETERS:
	MAX DEPTH: 4

PERFORMANCES:
    F1 score: 0.015387     Accuracy: 0.888608
   Precision: 0.007764       Recall: 0.849183
          TP:      884           TN:   901648
          FP:   112980           FN:      157
         TPR: 0.849183          FPR: 0.111351

Fit time: 49.64 minutes
Predict time: 1.80 minutes
[32mMax depth: 4[0m
[32mMin samples split: 100[0m
PARAMETERS:
	MAX DEPTH: 4

PERFORMANCES:
    F1 score: 0.015387     Accuracy: 0.888608
   Precision: 0.007764       Recall: 0.849183
          TP:      884           TN:   901648
          FP:   112980           FN:      157
         TPR: 0.849183          FPR: 0.111351

Fit time: 49.48 minutes
Predict time: 1.78 minutes
[32mMax depth: 8[0m
[32mMin samples split: 2[0m
PARAMETERS:
	MAX DEPTH: 8

PERFORMANCES:
    F1 score: 0.014815     Accuracy: 0.882675
   Precision: 0.007472       Recall: 0.860711
          TP:      896    

In [6]:
# C4.5 on an oversampled training set, remaining slice of the grid (indices 4 and up;
# the upper bound 6 is past the last grid index, so the slice simply runs to the end).
printLBlue("Preprocessing: Oversampling")
df_test = original_df_test
df_train = oversampling(original_df_train, VERBOSE=False)
c45_experiment(df_train=df_train, df_test=df_test, type="oversampling", start_index=4, end_index=6)

[94mPreprocessing: Oversampling[0m
[32mMax depth: 12[0m
[32mMin samples split: 2[0m
PARAMETERS:
	MAX DEPTH: 12

PERFORMANCES:
    F1 score: 0.014815     Accuracy: 0.882675
   Precision: 0.007472       Recall: 0.860711
          TP:      896           TN:   895610
          FP:   119018           FN:      145
         TPR: 0.860711          FPR: 0.117302

Fit time: 47.06 minutes
Predict time: 1.60 minutes
[32mMax depth: 12[0m
[32mMin samples split: 100[0m
PARAMETERS:
	MAX DEPTH: 12

PERFORMANCES:
    F1 score: 0.014815     Accuracy: 0.882675
   Precision: 0.007472       Recall: 0.860711
          TP:      896           TN:   895610
          FP:   119018           FN:      145
         TPR: 0.860711          FPR: 0.117302

Fit time: 47.33 minutes
Predict time: 1.54 minutes
