<a href="https://colab.research.google.com/github/EdgarTorresF/proyecto_data_science/blob/main/Modelo_Deteccion_de_Fraudes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from datetime import date, datetime, timedelta
import os
import math

import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import matplotlib as mpl

import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the combined transactions table from a Feather file, then preview a
# reproducible (random_state=0) random sample of 10 rows.
transactions_df = pd.read_feather("combined_file.feather")
transactions_df.sample(10, random_state=0)

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO
1733953,450193,2018-05-17 18:28:48,4079,5282,20.19,4040928,46,0,0
241598,778848,2018-06-21 07:34:06,4755,525,66.87,7025646,81,0,0
1643723,944853,2018-07-08 11:51:34,1636,1064,33.54,8509894,98,0,0
1064505,1667786,2018-09-21 21:39:04,2471,1876,27.98,15025144,173,0,0
1030163,722992,2018-06-15 10:31:19,1360,5365,58.82,6517879,75,0,0
109238,1288642,2018-08-13 10:48:16,2825,7670,17.24,11616496,134,0,0
873682,308350,2018-05-03 05:51:22,3584,9548,35.59,2785882,32,0,0
335594,1456766,2018-08-30 23:49:17,3285,5928,59.79,13132157,151,0,0
1589071,1139048,2018-07-28 16:47:09,3158,8486,28.23,10255629,118,0,0
436905,370241,2018-05-09 13:04:12,2743,7764,96.02,3330252,38,0,0


In [None]:
# Print the index range, column dtypes and memory usage of the loaded table.
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754155 entries, 0 to 1754154
Data columns (total 9 columns):
 #   Column             Dtype         
---  ------             -----         
 0   TRANSACTION_ID     int64         
 1   TX_DATETIME        datetime64[ns]
 2   CUSTOMER_ID        int64         
 3   TERMINAL_ID        int64         
 4   TX_AMOUNT          float64       
 5   TX_TIME_SECONDS    int64         
 6   TX_TIME_DAYS       int64         
 7   TX_FRAUD           int64         
 8   TX_FRAUD_SCENARIO  int64         
dtypes: datetime64[ns](1), float64(1), int64(7)
memory usage: 120.4 MB


In [None]:
# Summary statistics for every column of the dataset
transactions_df.describe()

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO
count,1754155.0,1754155,1754155.0,1754155.0,1754155.0,1754155.0,1754155.0,1754155.0,1754155.0
mean,877077.0,2018-07-01 11:20:33.708571904,2504.011,4996.733,53.6323,7903234.0,90.9726,0.008369272,0.01882388
min,0.0,2018-04-01 00:00:31,0.0,0.0,0.0,31.0,0.0,0.0,0.0
25%,438538.5,2018-05-16 14:40:46.500000,1252.0,2502.0,21.01,3940846.0,45.0,0.0,0.0
50%,877077.0,2018-07-01 11:11:10,2506.0,4994.0,44.64,7902670.0,91.0,0.0,0.0
75%,1315616.0,2018-08-16 08:01:01.500000,3765.0,7495.0,76.95,11865660.0,137.0,0.0,0.0
max,1754154.0,2018-09-30 23:59:57,4999.0,9999.0,2628.0,15811200.0,182.0,1.0,3.0
std,506381.1,,1445.987,2886.101,42.32649,4565172.0,52.83709,0.09110012,0.2113263


In [None]:
# Count missing values per column
transactions_df.isna().sum()

Unnamed: 0,0
TRANSACTION_ID,0
TX_DATETIME,0
CUSTOMER_ID,0
TERMINAL_ID,0
TX_AMOUNT,0
TX_TIME_SECONDS,0
TX_TIME_DAYS,0
TX_FRAUD,0
TX_FRAUD_SCENARIO,0


In [None]:
# Count rows that are exact duplicates of an earlier row
transactions_df.duplicated().sum()

0

In [None]:
# Count legitimate vs. fraudulent transactions and report each share of the total
class_counts = np.bincount(transactions_df["TX_FRAUD"])
not_fraud_count, fraud_count = class_counts
total_count = class_counts.sum()

report = (
    f"Data:\n"
    f"    Total: {total_count}\n"
    f"    Not Fraud: {not_fraud_count} ({100 * not_fraud_count / total_count:.2f}% of total)\n"
    f"    Fraud: {fraud_count} ({100 * fraud_count / total_count:.2f}% of total)\n"
)
print(report)

Data:
    Total: 1754155
    Not Fraud: 1739474 (99.16% of total)
    Fraud: 14681 (0.84% of total)



In [None]:
# Balanced preview sample: 1000 legitimate (TX_FRAUD == 0) followed by 1000
# fraudulent (TX_FRAUD == 1) transactions, each drawn with a fixed seed.
df = pd.concat(
    [
        transactions_df[transactions_df["TX_FRAUD"] == fraud_flag].sample(
            1000, random_state=0
        )
        for fraud_flag in (0, 1)
    ]
)

# Overlaid, semi-transparent amount histograms per class, with a box plot on top.
fig = px.histogram(
    df,
    x="TX_AMOUNT",
    color="TX_FRAUD",
    marginal="box",
    title="Transaction count for different amounts",
)
fig.update_traces(opacity=0.75).update_layout(barmode="overlay")
fig.show()


In [None]:
# Start from an empty frame; feature columns are added to it column by column.
cleaned_df = pd.DataFrame()

In [None]:
# Base features copied or derived directly from the raw transaction columns.
cleaned_df["amount"] = transactions_df["TX_AMOUNT"]
cleaned_df["is_fraud"] = transactions_df["TX_FRAUD"]
# pandas weekday: Monday=0 ... Sunday=6, so >= 5 flags Saturday and Sunday.
cleaned_df["is_weekend"] = transactions_df["TX_DATETIME"].dt.weekday >= 5
# "Night" here means the hour component is 0-6, i.e. 00:00:00 through 06:59:59.
cleaned_df["is_night"] = transactions_df["TX_DATETIME"].dt.hour <= 6

In [None]:
def rolling_amount_feature(transactions, group_column, window, aggregation):
    """Aggregate TX_AMOUNT over a trailing time window within each group.

    Args:
        transactions: frame with TX_DATETIME, TX_AMOUNT and ``group_column``.
        group_column: column whose values define the groups (e.g. "CUSTOMER_ID").
        window: pandas time-offset string for the rolling window (e.g. "7d").
        aggregation: name of the rolling method to call ("count" or "mean").

    Returns:
        Series of aggregated TX_AMOUNT values carrying the original row index
        of ``transactions``, so it aligns on assignment to another frame.
    """

    def _aggregate_group(group):
        # Time-based rolling windows require the "on" column to be sorted.
        ordered = group.sort_values("TX_DATETIME")[["TX_DATETIME", "TX_AMOUNT"]]
        return getattr(ordered.rolling(window, on="TX_DATETIME"), aggregation)()

    return transactions.groupby(group_column, group_keys=False).apply(
        _aggregate_group, include_groups=False
    )["TX_AMOUNT"]


# Per-customer transaction counts and average amounts over trailing 1-, 7- and
# 30-day windows. Columns are created in the same order as before: the three
# count columns first, then the three average columns.
for feature_prefix, aggregation in (
    ("customer_num_transactions", "count"),
    ("customer_avg_amount", "mean"),
):
    for window_days in (1, 7, 30):
        cleaned_df[f"{feature_prefix}_{window_days}_day"] = rolling_amount_feature(
            transactions_df, "CUSTOMER_ID", f"{window_days}d", aggregation
        )


In [None]:
DAY_DELAY = 7


def get_count_risk_rolling_window(
    terminal_transactions, window_size, delay_period=DAY_DELAY
):
    """Add a delayed, windowed fraud-rate column to one terminal's transactions.

    For every row, the fraud rate is computed over the ``window_size`` days that
    precede the most recent ``delay_period`` days: totals over the trailing
    ``delay_period + window_size`` days minus totals over the trailing
    ``delay_period`` days.

    Args:
        terminal_transactions: frame with TX_DATETIME and TX_FRAUD columns.
        window_size: length of the risk window, in days.
        delay_period: number of most recent days to exclude, in days.

    Returns:
        A copy of the input sorted by TX_DATETIME with an added ``fraud_risk``
        column; rows with no transactions in the window get 0.
    """
    # sort_values returns a new frame, so the caller's frame is not modified.
    ordered = terminal_transactions.sort_values("TX_DATETIME")

    def _fraud_sum_and_count(num_days):
        # Trailing time window of num_days days ending at each transaction.
        fraud_flags = ordered.rolling(f"{num_days}d", on="TX_DATETIME")["TX_FRAUD"]
        return fraud_flags.sum(), fraud_flags.count()

    delay_frauds, delay_total = _fraud_sum_and_count(delay_period)
    full_frauds, full_total = _fraud_sum_and_count(delay_period + window_size)

    # 0 / 0 (an empty window) yields NaN, which is mapped to a risk of 0.
    risk = (full_frauds - delay_frauds) / (full_total - delay_total)
    ordered["fraud_risk"] = risk.fillna(0)

    return ordered


# Per-terminal features over trailing 1-, 7- and 30-day windows. Columns are
# created in the same order as before: the three transaction-count columns
# first, then the three fraud-risk columns.
TERMINAL_WINDOW_DAYS = (1, 7, 30)

# Number of transactions seen at the terminal within each trailing window.
for window_days in TERMINAL_WINDOW_DAYS:
    window = f"{window_days}d"
    cleaned_df[f"terminal_num_transactions_{window_days}_day"] = transactions_df.groupby(
        "TERMINAL_ID", group_keys=False
    ).apply(
        # Time-based rolling windows require the "on" column to be sorted.
        lambda group: group.sort_values("TX_DATETIME")[["TX_DATETIME", "TX_AMOUNT"]]
        .rolling(window, on="TX_DATETIME")
        .count(),
        include_groups=False,
    )["TX_AMOUNT"]

# Delayed fraud rate at the terminal. get_count_risk_rolling_window sorts each
# group by TX_DATETIME itself, so no pre-sorting is done here, and the delay is
# taken from DAY_DELAY instead of a repeated literal.
for window_days in TERMINAL_WINDOW_DAYS:
    cleaned_df[f"terminal_fraud_risk_{window_days}_day"] = transactions_df.groupby(
        "TERMINAL_ID", group_keys=False
    ).apply(
        lambda group: get_count_risk_rolling_window(group, window_days, DAY_DELAY),
        include_groups=False,
    )["fraud_risk"]


In [None]:
# Bookkeeping columns: carried along for time/customer-based splitting and
# sorting; they are not part of the model's feature column list.
cleaned_df["day"] = transactions_df["TX_TIME_DAYS"]
cleaned_df["datetime"] = transactions_df["TX_DATETIME"]
cleaned_df["customer_id"] = transactions_df["CUSTOMER_ID"]
cleaned_df["id"] = transactions_df["TRANSACTION_ID"]

In [None]:
# Preview the engineered features: 5 fraudulent and 5 legitimate rows, each
# drawn with a fixed seed, then shuffled together by the final sample(10).
pd.concat(
    # show some fraudulent and non-fraudulent transactions
    [
        cleaned_df[cleaned_df["is_fraud"] == 1].sample(5, random_state=0),
        cleaned_df[cleaned_df["is_fraud"] == 0].sample(5, random_state=0),
    ]
).sample(10, random_state=0)

Unnamed: 0,amount,is_fraud,is_weekend,is_night,customer_num_transactions_1_day,customer_num_transactions_7_day,customer_num_transactions_30_day,customer_avg_amount_1_day,customer_avg_amount_7_day,customer_avg_amount_30_day,terminal_num_transactions_1_day,terminal_num_transactions_7_day,terminal_num_transactions_30_day,terminal_fraud_risk_1_day,terminal_fraud_risk_7_day,terminal_fraud_risk_30_day,day,datetime,customer_id,id
695361,45.6,1,True,False,5.0,17.0,74.0,72.238,58.062941,59.366486,5.0,13.0,40.0,1.0,1.0,0.351351,139,2018-08-18 12:36:13,4606,1337564
237487,12.18,0,True,False,5.0,34.0,111.0,16.414,16.072059,15.184865,1.0,6.0,23.0,0.0,0.0,0.0,56,2018-05-27 16:08:57,2989,545060
344019,58.92,1,True,False,2.0,14.0,61.0,43.08,86.172143,89.290328,3.0,4.0,25.0,0.0,1.0,0.625,41,2018-05-12 18:08:01,2754,401990
286604,10.19,0,False,False,1.0,2.0,6.0,10.19,8.855,7.93,3.0,11.0,15.0,0.0,0.0,0.0,12,2018-04-13 18:41:20,4490,123727
1037717,44.79,1,False,False,1.0,13.0,72.0,44.79,69.1,77.758194,1.0,5.0,18.0,0.0,0.0,0.0,68,2018-06-08 07:11:08,1278,653940
951252,11.66,0,True,False,5.0,20.0,53.0,15.542,14.8465,14.298113,1.0,5.0,9.0,0.0,0.0,0.0,20,2018-04-21 07:11:14,3047,193354
1499648,38.53,0,True,False,2.0,28.0,112.0,57.365,74.033571,66.179911,3.0,10.0,29.0,0.0,0.0,0.0,48,2018-05-19 11:19:30,155,465107
1401113,61.1,1,False,True,4.0,15.0,74.0,73.31,64.962,66.948784,2.0,6.0,33.0,1.0,1.0,0.315789,74,2018-06-14 06:32:19,2882,711162
432852,62.88,1,False,True,3.0,19.0,75.0,85.943333,68.16,68.287467,1.0,10.0,40.0,0.0,1.0,0.555556,38,2018-05-09 06:32:55,1905,366188
1582257,19.76,0,True,True,6.0,18.0,63.0,16.923333,12.206667,12.492381,1.0,6.0,17.0,0.0,0.0,0.0,118,2018-07-28 05:04:55,747,1132234


In [None]:
def get_train_test_set(
    df,
    start_date_training,
    delta_train=7,
    delta_delay=DAY_DELAY,
    delta_test=7,
    random_state=0,
):
    """Split transactions into a training period and a later, delayed test period.

    The training set is every row whose ``datetime`` falls in
    ``[start_date_training, start_date_training + delta_train days)``. The test
    set covers ``delta_test`` consecutive values of the ``day`` column, starting
    ``delta_train + delta_delay`` days after the first training day. Rows of
    customers already known to be defrauded are dropped from each test day.

    Args:
        df: frame with ``datetime``, ``day``, ``is_fraud``, ``customer_id`` and
            ``id`` columns.
        start_date_training: first datetime (inclusive) of the training period.
        delta_train: length of the training period, in days.
        delta_delay: gap between the training and test periods, in days.
        delta_test: length of the test period, in days.
        random_state: accepted but not used by this function.

    Returns:
        Tuple ``(train_df, test_df)``, both sorted by ascending ``id``.
    """
    # Training rows: half-open datetime interval starting at start_date_training.
    end_date_training = start_date_training + timedelta(days=delta_train)
    in_training_period = (df["datetime"] >= start_date_training) & (
        df["datetime"] < end_date_training
    )
    train_df = df[in_training_period]

    # Customers with at least one fraud in the training period are known up front.
    training_frauds = train_df[train_df["is_fraud"] == 1]
    known_defrauded_customers = set(training_frauds["customer_id"])

    # Test days are addressed through the relative "day" column rather than datetimes.
    first_training_day = train_df["day"].min()

    daily_test_frames = []
    for offset in range(delta_test):
        test_day = first_training_day + delta_train + delta_delay + offset
        # Frauds from this earlier day become known before the test day is scored.
        newly_known_day = first_training_day + delta_train + offset - 1

        newly_defrauded = df[(df["day"] == newly_known_day) & (df["is_fraud"] == 1)]
        known_defrauded_customers |= set(newly_defrauded["customer_id"])

        # Keep only test-day rows of customers not yet known to be defrauded.
        test_day_rows = df[df["day"] == test_day]
        already_known = test_day_rows["customer_id"].isin(known_defrauded_customers)
        daily_test_frames.append(test_day_rows[~already_known])

    test_df = pd.concat(daily_test_frames)

    # Both sets are returned ordered by ascending transaction ID.
    return (train_df.sort_values("id"), test_df.sort_values("id"))


# First split: train on the 21 days starting 2018-07-25; test_df holds the
# default 7 test days that follow the default delay period.
train_df, test_df = get_train_test_set(
    cleaned_df, datetime(2018, 7, 25), delta_train=21
)
# Second split: re-split that 21-day training frame with the default windows
# (7 training days, delay, 7 "test" days); the returned "test" part is used as
# the validation set and train_df is narrowed to the first 7 days.
train_df, val_df = get_train_test_set(train_df, datetime(2018, 7, 25))

In [None]:
# Target column (kept as a list so the label arrays are 2-D, one column wide).
label_columns = ["is_fraud"]

# Model inputs: raw amount, calendar flags, then customer and terminal aggregates.
feature_columns = [
    "amount",
    "is_weekend",
    "is_night",
    "customer_num_transactions_1_day",
    "customer_num_transactions_7_day",
    "customer_num_transactions_30_day",
    "customer_avg_amount_1_day",
    "customer_avg_amount_7_day",
    "customer_avg_amount_30_day",
    "terminal_num_transactions_1_day",
    "terminal_num_transactions_7_day",
    "terminal_num_transactions_30_day",
    "terminal_fraud_risk_1_day",
    "terminal_fraud_risk_7_day",
    "terminal_fraud_risk_30_day",
]

# Convert each split to NumPy arrays, in train / validation / test order.
train_labels, val_labels, test_labels = (
    np.array(split_df[label_columns]) for split_df in (train_df, val_df, test_df)
)
train_features, val_features, test_features = (
    np.array(split_df[feature_columns]) for split_df in (train_df, val_df, test_df)
)

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to the validation and test features.
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
val_features, test_features = (
    scaler.transform(split_features)
    for split_features in (val_features, test_features)
)

# Report the shape of every label and feature array.
for caption, split_array in (
    ('Training labels shape:', train_labels),
    ('Validation labels shape:', val_labels),
    ('Test labels shape:', test_labels),
    ('Training features shape:', train_features),
    ('Validation features shape:', val_features),
    ('Test features shape:', test_features),
):
    print(caption, split_array.shape)

Training labels shape: (67240, 1)
Validation labels shape: (58264, 1)
Test labels shape: (50321, 1)
Training features shape: (67240, 15)
Validation features shape: (58264, 15)
Test features shape: (50321, 15)


In [None]:
# Weight each class inversely to its frequency, scaled by total / 2.
# NOTE(review): not_fraud_count, fraud_count and total_count were computed from
# the full transactions_df, not from the training split — confirm this is intended.
weight_for_not_fraud = (1.0 / not_fraud_count) * total_count / 2.0
weight_for_fraud = (1.0 / fraud_count) * total_count / 2.0

# Keys are the class labels found in is_fraud (0 = not fraud, 1 = fraud).
class_weight = {0: weight_for_not_fraud, 1: weight_for_fraud}

class_weight

{0: 0.5042199538481172, 1: 59.74235406307473}

In [None]:
# Initialise the output bias to log(fraud / not_fraud) so the untrained network
# starts out predicting roughly the base fraud rate, which speeds up training.
# see https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#optional_set_the_correct_initial_bias
output_bias = tf.keras.initializers.Constant(np.log([fraud_count / not_fraud_count]))

# Declare the input shape once with an explicit Input object instead of passing
# `input_shape` to the Dense layers: Keras warns against that form, and the
# argument was redundant on the second Dense layer.
model = keras.Sequential(
    [
        keras.Input(shape=(train_features.shape[-1],)),
        keras.layers.Dense(500, activation="relu"),
        keras.layers.Dense(500, activation="relu"),
        keras.layers.Dropout(0.2),
        # Single sigmoid unit: output is interpreted as the fraud probability.
        keras.layers.Dense(1, activation="sigmoid", bias_initializer=output_bias),
    ]
)

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.BinaryCrossentropy(),
    # Metric names matter: later cells look up "prc", "val_prc", etc. by name.
    metrics=[
        keras.metrics.Precision(name="precision"),
        keras.metrics.Recall(name="recall"),
        keras.metrics.AUC(name="auc"),
        keras.metrics.AUC(name="prc", curve="PR"),
    ],
)
model.summary()


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



In [None]:
BATCH_SIZE = 64

# Stop when the validation PR-AUC ("val_prc") has not improved for 10 epochs
# and roll the model back to the weights of its best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_prc", verbose=1, patience=10, mode="max", restore_best_weights=True
)

# Train for at most 40 epochs; class_weight re-weights the loss per class.
training_history = model.fit(
    train_features,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=40,
    callbacks=[early_stopping],
    validation_data=(val_features, val_labels),
    class_weight=class_weight,
)


Epoch 1/40
[1m1051/1051[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - auc: 0.7955 - loss: 0.8823 - prc: 0.3411 - precision: 0.0843 - recall: 0.6445 - val_auc: 0.8467 - val_loss: 0.1650 - val_prc: 0.5507 - val_precision: 0.1431 - val_recall: 0.6909
Epoch 2/40
[1m1051/1051[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - auc: 0.9039 - loss: 0.3670 - prc: 0.5258 - precision: 0.0916 - recall: 0.7963 - val_auc: 0.8583 - val_loss: 0.3541 - val_prc: 0.4874 - val_precision: 0.0410 - val_recall: 0.7403
Epoch 3/40
[1m1051/1051[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - auc: 0.9070 - loss: 0.3630 - prc: 0.4923 - precision: 0.0830 - recall: 0.7932 - val_auc: 0.8381 - val_loss: 0.2541 - val_prc: 0.4345 - val_precision: 0.0863 - val_recall: 0.7117
Epoch 4/40
[1m1051/1051[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - auc: 0.9067 - loss: 0.3433 - prc: 0.5465 - precision: 0.1087 - recall: 0.8021 - val_auc: 0.8574 - val_lo

In [None]:
res = []  # NOTE(review): not used in this cell; kept in case a later cell relies on it

# (history key, y-axis title) for each metric recorded during training.
metrics_to_plot = [
    ("loss", "Loss"),
    ("precision", "Precision"),
    ("recall", "Recall"),
    ("auc", "Area under ROC curve"),
    ("prc", "Area under PR curve"),
]

# One standalone figure per metric, stacked visually by removing the margins.
# The make_subplots figure that used to be created here was overwritten on the
# first loop iteration without ever being shown, so it has been removed.
for position, (metric, name) in enumerate(metrics_to_plot):
    fig = go.Figure(
        data=[
            go.Scatter(
                x=training_history.epoch,
                y=training_history.history[metric],
                mode="lines",
                name="Training",
            ),
            go.Scatter(
                x=training_history.epoch,
                y=training_history.history["val_" + metric],
                mode="lines",
                line={"dash": "dash"},
                name="Validation",
            ),
        ]
    )
    fig.update_yaxes(title=name)
    fig.update_xaxes(title="Epoch")

    if position == 0:
        # Only the first figure carries the overall title, so it needs top room.
        fig.update_layout(
            height=250, title="Training history", margin={"b": 0, "t": 50}
        )
    else:
        fig.update_layout(height=200, margin={"b": 0, "t": 0})
    fig.show()

In [None]:
train_predictions = model.predict(train_features, batch_size=BATCH_SIZE)
test_predictions = model.predict(test_features, batch_size=BATCH_SIZE)

predictions_df = pd.DataFrame(
    {"Prediction": train_predictions.ravel(), "Label": train_labels.ravel()}
)


def _sample_up_to(frame, max_rows):
    """Sample ``max_rows`` rows with a fixed seed, or every row if fewer exist."""
    # Capping avoids the ValueError that sample() raises when asked for more
    # rows than the frame holds (e.g. fewer than 500 frauds in the training set).
    return frame.sample(min(max_rows, len(frame)), random_state=0)


# Plot a subsample: up to 5000 legitimate and up to 500 fraudulent predictions.
predictions_df = pd.concat(
    [
        _sample_up_to(predictions_df[predictions_df["Label"] == 0], 5000),
        _sample_up_to(predictions_df[predictions_df["Label"] == 1], 500),
    ]
)

# Plotly Express' `labels` argument renames columns, not values, so the class
# names are applied by mapping the Label values on a plotting copy instead.
# predictions_df itself keeps its numeric Label column.
fig = px.histogram(
    predictions_df.assign(
        Label=predictions_df["Label"].map({0: "Legitimate", 1: "Fraudulent"})
    ),
    x="Prediction",
    title="Prediction values",
    color="Label",
    marginal="box",
)
fig.update_traces(opacity=0.75)
fig.update_layout(barmode="overlay")
fig.show()

[1m1051/1051[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [None]:
def make_roc_df(name, predictions, labels):
    """Build a tidy frame of ROC curve points for one dataset.

    Args:
        name: value stored in the ``Dataset`` column of every row.
        predictions: predicted scores passed to ``roc_curve``.
        labels: true labels passed to ``roc_curve``.

    Returns:
        DataFrame with ``fp`` and ``tp`` (the rates multiplied by 100, i.e. as
        percentages) and ``Dataset`` columns.
    """
    false_positive_rate, true_positive_rate, _thresholds = sklearn.metrics.roc_curve(
        labels, predictions
    )
    curve_columns = {
        "fp": false_positive_rate * 100,
        "tp": true_positive_rate * 100,
        "Dataset": name,
    }
    return pd.DataFrame(curve_columns)


# ROC points for the training and test sets, stacked into one long frame so
# Plotly Express can draw one line per "Dataset" value.
roc_df = pd.concat(
    [
        make_roc_df("Training", train_predictions, train_labels),
        make_roc_df("Test", test_predictions, test_labels),
    ]
)

fig = px.line(
    roc_df,
    title="ROC Curve",
    x="fp",
    y="tp",
    color="Dataset",
    labels={"fp": "False Positives (%)", "tp": "True Positives (%)"},
)
# Zoom the y-axis to the 60-100% true-positive range.
fig.update_yaxes(range=[60, 100])
# Trace names come from the "Dataset" values, so the selector must match "Test"
# exactly; the previous lowercase "test" matched nothing and the dashed style
# was never applied.
fig.update_traces(line={"dash": "dash"}, selector={"name": "Test"})
fig.show()