In [1]:
if 'google.colab' in str(get_ipython()):
    GITHUB_TOKEN = ""
    !rm -rf anti_money_laundering
    !git clone https://{GITHUB_TOKEN}@github.com/FedericoBruzzone/anti_money_laundering.git
    !mv anti_money_laundering/.* .
    !mv anti_money_laundering/* .
    !rm -rf ./anti_money_laundering

In [2]:
# import os
# from dotenv import load_dotenv

# load_dotenv()

# os.environ['KAGGLE_USERNAME'] = os.getenv("KAGGLE_USER")
# os.environ['KAGGLE_KEY'] = os.getenv('KAGGLE_KEY')
# !cd datasets
# !kaggle datasets download -d --unzip {os.getenv('KAGGLE_DATASET_LINK')}
# !cd ..

In [3]:
def create_trees(partition_elements, verbose=False):
    list_series = []
    for element in partition_elements:
        series_tmp = pd.Series(element.asDict())
        list_series.append(series_tmp)

    part_df = pd.DataFrame(list_series, columns=COLUMNS_NAME)
    X_train, y_train = get_X_and_Y(part_df, verbose=VERBOSE)
    X_train, _ = label_encoder(X_train, ['Timestamp', 'Account', 'Account.1', 'Receiving Currency', 'Payment Currency', 'Payment Format'])

    decision_tree: DecisionTreeID3 = DecisionTreeID3(max_depth=8,
                                                     num_thresholds_numerical_attr=2,
                                                     VERBOSE=False)

    decision_tree.fit(X_train, y_train)

    if verbose:
        ctx = TaskContext()
        decision_tree.create_dot_files(filename="tree" + str(ctx.partitionId()),
                                       generate_png=True,
                                       view="default-viewer")
    yield decision_tree

def predict_trees(new_line):
    def wrap(tree):
        prediction = tree.predict(new_line)
        return prediction
    return wrap

def predict_trees_all(X_test):
    def wrap(tree):
        predictions = tree.predict_test_no_gen(X_test)
        return predictions
    return wrap

In [4]:
import os
from dotenv import load_dotenv
load_dotenv()

import time
import pandas as pd

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

from src.utils.kaggle_config            import setup_kaggle
from src.utils.kaggle_config            import download_dataset

from src.utils.datasets_handler         import (get_train_and_test,
                                                get_X_and_Y,
                                                print_dataset,
                                                label_encoder,
                                                split_timestamp)
from src.utils.performance_measures     import calculate_performances
from src.utils.dataset_sampling_methods import (oversampling,
                                                undersampling,
                                                bootstrap_sampling)

from src.utils.print_utils              import (printLBlue, printGreen)

from src.decision_tree.decision_tree    import CustomDecisionTree
from src.decision_tree.ID3              import DecisionTreeID3
from src.decision_tree.C45              import DecisionTreeC45
from src.decision_tree.entropy_type     import EntropyType
from src.decision_tree.criterion_type   import CriterionType

from IPython.display import Image, display

from src.utils.spark_config import get_spark_session
from pyspark import TaskContext
import pandas as pd

VERBOSE = int(os.getenv('VERBOSE'))
VIEW = os.getenv('VIEW')

setup_kaggle()
print("Downloading dataset...") 
download_dataset("iammustafatz/diabetes-prediction-dataset")
download_dataset("ealtman2019/ibm-transactions-for-anti-money-laundering-aml")
print("Done.")

hi_small_trans = "HI-Small_Trans.csv"
diabetes = "diabetes_prediction_dataset.csv"

Downloading dataset...
Done.


In [5]:
name = "AntiMoneyLaundering"

spark = get_spark_session(name, VERBOSE)

df_train, df_test = get_train_and_test(hi_small_trans, verbose=VERBOSE)

df_train_p = df_train[df_train["Is Laundering"] == 1]
df_train_n = df_train[df_train["Is Laundering"] == 0]
df_train_n = df_train_n.sample(frac=0.1, random_state=2)
df_train = pd.concat([df_train_p, df_train_n])

df_train = oversampling(df_train, VERBOSE=False)
df_train = bootstrap_sampling(df_train)
print("len(df_train):", len(df_train))

COLUMNS_NAME: list = df_train.columns.tolist()
X_train, y_train = get_X_and_Y(df_train, verbose=VERBOSE)
X_test, y_test = get_X_and_Y(df_test, verbose=VERBOSE)

# X_train, _ = label_encoder(X_train, ['Timestamp', 'Account', 'Account.1', 'Receiving Currency', 'Payment Currency', 'Payment Format'])
X_test, _ = label_encoder(X_test, ['Timestamp', 'Account', 'Account.1', 'Receiving Currency', 'Payment Currency', 'Payment Format'])

23/09/11 16:15:48 WARN Utils: Your hostname, federicobruzzone resolves to a loopback address: 127.0.1.1; using 192.168.1.10 instead (on interface wlo1)
23/09/11 16:15:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/11 16:15:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


len(df_train): 670558


In [6]:
df = spark.createDataFrame(df_train)

print("Printing spark dataframe...")
df.show()

rdd = df.rdd

Printing spark dataframe...
+----------------+---------+---------+-------+---------+---------------+------------------+-----------+-----------------+--------------+-------------+
|       Timestamp|From Bank|  Account|To Bank|Account.1|Amount Received|Receiving Currency|Amount Paid| Payment Currency|Payment Format|Is Laundering|
+----------------+---------+---------+-------+---------+---------------+------------------+-----------+-----------------+--------------+-------------+
|2022/09/02 13:32|     3698|801997D20|  11974|801998AE0|        4109.34|         US Dollar|    4109.34|        US Dollar|           ACH|            1|
|2022/09/06 08:59|    20486|807EF78D0|  19477|80750D550|      127652.54|              Yuan|  127652.54|             Yuan|           ACH|            1|
|2022/09/07 05:35|   228101|80A48A710|  23289|808839F70|       12934.66|          UK Pound|   12934.66|         UK Pound|           ACH|            1|
|2022/09/08 09:41|      222|811D80C30|    121|8000E1590|        51


[Stage 0:>                                                          (0 + 1) / 1]

                                                                                

In [7]:
def map_to_column_value_pairs(row):
    return [(i, row[i]) for i in range(len(row))]

def count_values(a, b):
    return a + b

predictions = rdd.mapPartitions(create_trees, False) \
                 .map(predict_trees_all(X_test)) \
                 .flatMap(map_to_column_value_pairs) \
                 .map(lambda x: (x, 1)) \
                 .reduceByKey(count_values) \
                 .map(lambda x: (x[0][0], [(x[0][1], x[1])])) \
                 .reduceByKey(count_values) \
                 .map(lambda x: (x[0], max(x[1], key=lambda item: item[1]))) \
                 .map(lambda x: x[1][0]) \
                 .collect()

predictions

                                                                                

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,


In [10]:
calculate_performances(predictions, y_test, verbose=True)

    F1 score: 0.002109     Accuracy: 0.998136
   Precision: 0.002336       Recall: 0.001921
          TP:        2           TN:  1013774
          FP:      854           FN:     1039
         TPR: 0.001921          FPR: 0.000842


(<function src.utils.performance_measures.accuracy(tp, tn, n_y) -> float>,
 <function src.utils.performance_measures.f1_score(tp, fn, fp) -> float>)

In [9]:
spark.stop()