In [2]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
import xgboost as xgb  # For using the XGBoost algorithm
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix  # For evaluating model performance

# Load the preprocessed training and testing datasets from Parquet files
# Parquet is a columnar storage file format optimized for use with big data processing frameworks
train_df = pd.read_parquet("preprocessed_train_data.parquet")  # Load the training dataset
test_df = pd.read_parquet("preprocessed_test_data.parquet")  # Load the testing dataset

# Define features (independent variables) and target (dependent variable)
# Features are all columns except the target column "FlagImpaye"
X_train = train_df.drop(columns=["FlagImpaye"])  # Features for the training set
y_train = train_df["FlagImpaye"].astype(int)  # Target for the training set, converted to numeric (int)

X_test = test_df.drop(columns=["FlagImpaye"])  # Features for the testing set
y_test = test_df["FlagImpaye"].astype(int)  # Target for the testing set, converted to numeric (int)

#1. Cost-Sensitive Learning with XGBoost
XGBoost provides the scale_pos_weight parameter to handle imbalanced datasets. This parameter adjusts the weight of the positive class (fraudulent transactions) relative to the negative class (non-fraudulent transactions).

In [None]:
# Calculate scale_pos_weight for cost-sensitive learning
# The scale_pos_weight parameter is used to handle class imbalance in binary classification.
# It adjusts the weight of the positive class (fraudulent transactions) relative to the negative class (non-fraudulent transactions).
# This is calculated as the ratio of the number of negative samples to the number of positive samples.
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

# Train XGBoost model with cost-sensitive learning
# Initialize the XGBoost classifier with specific parameters:
# - scale_pos_weight: Adjusts the balance between classes based on the calculated ratio.
# - objective="binary:logistic": Specifies that this is a binary classification problem with logistic loss.
# - eval_metric="logloss": Sets the evaluation metric to log loss, which is commonly used for binary classification.
# - random_state=42: Ensures reproducibility by fixing the random seed.
model_xgb = xgb.XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42
)

# Fit the XGBoost model to the training data
# This trains the model using the features (X_train) and target variable (y_train).
model_xgb.fit(X_train, y_train)

# Evaluate the model
# Use the trained model to make predictions on the test set (X_test).
y_pred = model_xgb.predict(X_test)

# Print a classification report to evaluate model performance
# The classification report includes metrics such as precision, recall, F1-score, and support for each class.
# These metrics help assess how well the model performs, especially for imbalanced datasets.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.68      0.81    740838
           1       0.02      0.67      0.04      6573

    accuracy                           0.68    747411
   macro avg       0.51      0.67      0.42    747411
weighted avg       0.99      0.68      0.80    747411



#2. Cost-Sensitive Learning with LightGBM
LightGBM allows us to specify the is_unbalance parameter or provide instance weights directly.

In [None]:
import lightgbm as lgb

# Create LightGBM datasets
# LightGBM uses its own Dataset format for training and evaluation.
# `X_train` contains the feature matrix for the training set, and `y_train` contains the corresponding labels.
train_data = lgb.Dataset(X_train, label=y_train)

# Similarly, create a dataset for the test set. The `reference` parameter ensures that the test set aligns with the training set's structure.
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define parameters for cost-sensitive learning
# These parameters configure the LightGBM model for binary classification with cost-sensitive adjustments.
params = {
    "objective": "binary",  # Specifies that this is a binary classification problem.
    "metric": "binary_logloss",  # Use binary log loss as the evaluation metric.
    "is_unbalance": True,  # Automatically adjusts for class imbalance by weighting classes inversely proportional to their frequency.
    "boosting_type": "gbdt",  # Gradient Boosting Decision Tree (GBDT) is the default boosting method.
    "random_state": 42  # Set a random seed for reproducibility.
}

# Train LightGBM model
# The `train` function trains the model using the specified parameters and training dataset.
model_lgb = lgb.train(params, train_data)

# Evaluate the model
# Predict probabilities for the test set. LightGBM outputs probabilities for the positive class by default.
y_pred = model_lgb.predict(X_test)

# Convert predicted probabilities into binary predictions (0 or 1).
# A threshold of 0.5 is used: if the predicted probability is greater than 0.5, classify as 1; otherwise, classify as 0.
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]

# Print a classification report to evaluate the model's performance.
# This includes metrics like precision, recall, F1-score, and support for each class.
print(classification_report(y_test, y_pred_binary))

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Number of positive: 23346, number of negative: 3865122
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.104900 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2107
[LightGBM] [Info] Number of data points in the train set: 3888468, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.006004 -> initscore=-5.109323
[LightGBM] [Info] Start training from score -5.109323
              precision    recall  f1-score   support

           0       0.99      0.60      0.75    740838
           1       0.01      0.66      0.03      6573

    accuracy                           0.60    747411
   macro avg       0.50      0.63      0.39    747411
weighted avg       0.99      0.60      0.74    747411



#Cost-Sensitive Learning with XGBoost + Bayesian Optimization
Bayesian optimization can be used to fine-tune hyperparameters, including scale_pos_weight. We'll use the hyperopt library for this purpose.

In [4]:
from hyperopt import fmin, tpe, hp, Trials
from sklearn.metrics import roc_auc_score

# Define the objective function for Bayesian optimization
def objective(params):
    """
    This function defines the objective to minimize during Bayesian optimization.
    It trains an XGBoost model using the given hyperparameters and evaluates its performance
    on the test set using the ROC-AUC score. The negative AUC is returned because `fmin`
    minimizes the objective function.
    """
    # Create an XGBoost classifier with the current set of hyperparameters
    model = xgb.XGBClassifier(
        scale_pos_weight=params['scale_pos_weight'],  # Weight for positive class (cost-sensitive learning)
        max_depth=int(params['max_depth']),          # Maximum depth of the tree
        learning_rate=params['learning_rate'],       # Step size shrinkage used in updates
        n_estimators=int(params['n_estimators']),    # Number of boosting rounds
        random_state=42                              # Fixed random seed for reproducibility
    )

    # Train the model on the training data
    model.fit(X_train, y_train)

    # Predict probabilities for the positive class on the test set
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Calculate the ROC-AUC score and return its negative value (to minimize)
    return -roc_auc_score(y_test, y_pred_proba)

# Define the search space for hyperparameters
space = {
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),  # Uniform distribution for scale_pos_weight
    'max_depth': hp.quniform('max_depth', 3, 10, 1),           # Quantized uniform distribution for max_depth
    'learning_rate': hp.loguniform('learning_rate', -5, 0),   # Log-uniform distribution for learning_rate
    'n_estimators': hp.quniform('n_estimators', 50, 500, 50)   # Quantized uniform distribution for n_estimators
}

# Perform Bayesian optimization
trials = Trials()  # Object to store details of each evaluation
best_params = fmin(
    fn=objective,            # Objective function to minimize
    space=space,             # Search space for hyperparameters
    algo=tpe.suggest,        # Tree-structured Parzen Estimator (TPE) algorithm for optimization
    max_evals=20,            # Maximum number of evaluations
    trials=trials            # Store trial information
)

# Train the final model with the best parameters found by Bayesian optimization
best_model = xgb.XGBClassifier(
    scale_pos_weight=best_params['scale_pos_weight'],  # Use the best scale_pos_weight
    max_depth=int(best_params['max_depth']),          # Convert max_depth to integer
    learning_rate=best_params['learning_rate'],       # Use the best learning_rate
    n_estimators=int(best_params['n_estimators']),    # Convert n_estimators to integer
    random_state=42                                    # Fixed random seed for reproducibility
)
best_model.fit(X_train, y_train)  # Train the model on the full training set

# Evaluate the final model on the test set
y_pred = best_model.predict(X_test)  # Predict class labels for the test set
print(classification_report(y_test, y_pred))  # Print a detailed classification report

100%|██████████| 20/20 [36:21<00:00, 109.06s/trial, best loss: -0.7466774058138965]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    740838
           1       0.10      0.16      0.12      6573

    accuracy                           0.98    747411
   macro avg       0.55      0.57      0.56    747411
weighted avg       0.98      0.98      0.98    747411



# Second Part
## Calculating Margin

This function calculates the financial impact (margin or loss) of a single transaction based on the actual and predicted labels. It incorporates a cost-sensitive approach by assigning different costs to false positives and false negatives, depending on the transaction amount. This allows the model to prioritize decisions that minimize financial losses while maximizing profits.

In [5]:
def calculate_margin(actual, predicted, transaction_amount):
    """
    Calculate the margin for a single transaction based on the cost matrix.

    Parameters:
        actual (int): Actual class (0 = non-fraudulent, 1 = fraudulent).
        predicted (int): Predicted class (0 = accepted, 1 = rejected).
        transaction_amount (float): Transaction amount.

    Returns:
        float: Margin generated or lost.
    """
    # Define the margin rate for good (non-fraudulent) transactions
    r = 0.05  # Margin rate for good transactions (e.g., 5% profit margin)

    # Case 1: True Negative (TN)
    # The transaction is non-fraudulent (actual == 0) and was correctly predicted as non-fraudulent (predicted == 0).
    # In this case, we earn a margin proportional to the transaction amount.
    if actual == 0 and predicted == 0:
        return r * transaction_amount

    # Case 2: False Positive (FP)
    # The transaction is non-fraudulent (actual == 0) but was incorrectly predicted as fraudulent (predicted == 1).
    # This results in a loss due to rejecting a valid transaction. The loss is proportional to the margin rate.
    elif actual == 0 and predicted == 1:
        return -0.7 * r * transaction_amount  # Loss is 70% of the potential margin

    # Case 3: False Negative (FN)
    # The transaction is fraudulent (actual == 1) but was incorrectly predicted as non-fraudulent (predicted == 0).
    # The loss depends on the transaction amount, as larger fraudulent transactions result in greater losses.
    elif actual == 1 and predicted == 0:
        if transaction_amount <= 20:
            return 0  # No loss for very small transactions (below €20)
        elif transaction_amount <= 50:
            return -0.2 * transaction_amount  # 20% loss for transactions up to €50
        elif transaction_amount <= 100:
            return -0.3 * transaction_amount  # 30% loss for transactions up to €100
        elif transaction_amount <= 200:
            return -0.5 * transaction_amount  # 50% loss for transactions up to €200
        else:
            return -0.8 * transaction_amount  # 80% loss for transactions above €200

    # Case 4: True Positive (TP)
    # The transaction is fraudulent (actual == 1) and was correctly predicted as fraudulent (predicted == 1).
    # In this case, there is no gain or loss because the fraudulent transaction was successfully blocked.
    elif actual == 1 and predicted == 1:
        return 0  # No gain or loss

In [6]:
# Initialize total margin
# This variable will accumulate the total margin across all transactions.
total_margin = 0

# Loop through all predictions
# Iterate over each transaction in the test dataset to calculate the margin for each one.
for i in range(len(y_test)):
    # Retrieve the actual and predicted labels for the current transaction
    # `y_test.iloc[i]` gives the true label (0 or 1) for the transaction.
    # `y_pred[i]` gives the predicted label (0 or 1) from the model.
    actual = y_test.iloc[i]
    predicted = y_pred[i]

    # Retrieve the transaction amount for the current transaction
    # `X_test.iloc[i]["Montant"]` extracts the transaction amount (e.g., monetary value)
    # associated with the current transaction.
    transaction_amount = X_test.iloc[i]["Montant"]

    # Calculate margin for the current transaction
    # Call the `calculate_margin` function, which computes the margin based on:
    # - The actual label (whether the transaction was fraudulent or not),
    # - The predicted label (whether the model correctly identified it),
    # - The transaction amount (to weigh the impact of the prediction).
    total_margin += calculate_margin(actual, predicted, transaction_amount)

# Print the total margin
# After processing all transactions, display the accumulated total margin.
print(f"Total Margin: {total_margin}")

Total Margin: 1963703.8396503185


In [7]:
# Predict probabilities for the positive class
# The `predict_proba` method of the XGBoost model returns a 2D array where:
# - The first column corresponds to the probability of the negative class (class 0).
# - The second column corresponds to the probability of the positive class (class 1).
# By using `[:, 1]`, we extract only the probabilities for the positive class (class 1).
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

In [8]:
import numpy as np

# Define a range of thresholds between 0.1 and 0.9, with 50 evenly spaced values
# This will be used to evaluate different decision thresholds for classification
thresholds = np.linspace(0.1, 0.9, 50)

# Initialize variables to track the best threshold and the corresponding margin
# `best_threshold` stores the threshold that yields the highest total margin
# `best_margin` is initialized to negative infinity to ensure any valid margin will replace it
best_threshold = 0
best_margin = float('-inf')

# Evaluate each threshold in the defined range
for threshold in thresholds:
    # Convert predicted probabilities (`y_pred_proba`) into binary predictions (0 or 1)
    # based on the current threshold. If the probability >= threshold, predict 1; otherwise, predict 0.
    y_pred = (y_pred_proba >= threshold).astype(int)

    # Initialize a variable to calculate the total margin for the current threshold
    total_margin = 0

    # Iterate through each test sample to calculate the margin
    for i in range(len(y_test)):
        # Get the actual class label (0 or 1) for the current sample
        actual = y_test.iloc[i]

        # Get the predicted class label (0 or 1) for the current sample
        predicted = y_pred[i]

        # Get the transaction amount (e.g., monetary value) for the current sample
        transaction_amount = X_test.iloc[i]["Montant"]

        # Calculate the margin for the current sample using a custom function `calculate_margin`
        # This function should take into account the actual label, predicted label, and transaction amount
        total_margin += calculate_margin(actual, predicted, transaction_amount)

    # Check if the total margin for the current threshold is better than the best margin so far
    # If yes, update the best margin and the corresponding threshold
    if total_margin > best_margin:
        best_margin = total_margin
        best_threshold = threshold

# Print the best threshold and the corresponding total margin
print(f"Best Threshold: {best_threshold}")
print(f"Best Total Margin: {best_margin}")

Best Threshold: 0.7040816326530612
Best Total Margin: 2008401.9618003068


# Comparing Margins for Fraud Detection Strategies A/B Testing

In [9]:
# Group A: No fraud detection (all transactions are accepted)
group_a_margin = 0
for i in range(len(y_test)):
    actual = y_test.iloc[i]  # Actual label (0: Non-Fraud, 1: Fraud)
    transaction_amount = X_test.iloc[i]["Montant"]  # Transaction amount for the current instance
    group_a_margin += calculate_margin(actual, 0, transaction_amount)  # Always accept (predicted=0)

# Explanation:
# In this group, we simulate a scenario where no fraud detection is applied.
# All transactions are accepted regardless of whether they are fraudulent or not.
# The margin is calculated based on the actual label and the transaction amount.

# Group B: Fraud detection applied
group_b_margin = 0
for i in range(len(y_test)):
    actual = y_test.iloc[i]  # Actual label (0: Non-Fraud, 1: Fraud)
    predicted = y_pred[i]  # Predicted label from the fraud detection model (0: Non-Fraud, 1: Fraud)
    transaction_amount = X_test.iloc[i]["Montant"]  # Transaction amount for the current instance
    group_b_margin += calculate_margin(actual, predicted, transaction_amount)

# Explanation:
# In this group, we apply the fraud detection model to predict whether a transaction is fraudulent.
# The margin is calculated based on the actual label, the predicted label, and the transaction amount.
# This simulates the impact of using a fraud detection system.

# Print the results for comparison
print(f"Group A Margin (No Fraud Detection): {group_a_margin}")
print(f"Group B Margin (With Fraud Detection): {group_b_margin}")

# Explanation:
# The margins for both groups are compared to evaluate the financial impact of applying fraud detection.
# A higher margin in Group B indicates that the fraud detection model improves profitability by reducing losses due to fraud.

Group A Margin (No Fraud Detection): 1941851.6945002985
Group B Margin (With Fraud Detection): 1982455.2089003082


## Simulating Transaction Fraud Rates and Calculating Margins

In [10]:
import random

# Define a function to calculate margin based on actual fraud, predicted fraud, and transaction amount
def calculate_margin(actual, predicted, transaction_amount):
    """
    Calculate the margin for a transaction based on actual fraud status, predicted fraud status,
    and the transaction amount.

    Parameters:
        actual (int): Actual fraud status (1 for fraud, 0 for non-fraud).
        predicted (int): Predicted fraud status (1 for fraud, 0 for non-fraud).
        transaction_amount (float): The transaction amount.

    Returns:
        float: The calculated margin.
    """
    # Example margin logic (customize as needed)
    if actual == 1 and predicted == 1:  # Correctly predicted fraud
        return -transaction_amount * 0.1  # Assume a 10% loss due to fraud prevention
    elif actual == 1 and predicted == 0:  # Missed fraud
        return -transaction_amount  # Full loss of transaction amount
    elif actual == 0 and predicted == 1:  # False positive
        return -transaction_amount * 0.05  # Assume a 5% cost for false positives
    else:  # Correctly predicted non-fraud
        return transaction_amount * 0.02  # Assume a 2% profit for successful transactions


# Simulate transactions with varying fraud rates
fraud_rates = [0.01, 0.05, 0.1, 0.2]  # Example fraud rates (1%, 5%, 10%, 20%)
results = []  # Store results for each fraud rate simulation

for fraud_rate in fraud_rates:
    # Simulate test labels (y_test) based on the current fraud rate
    simulated_y_test = [
        1 if random.random() < fraud_rate else 0 for _ in range(len(y_test))
    ]
    simulated_margin = 0  # Initialize the total margin for this simulation

    # Iterate through each transaction in the test set
    for i in range(len(simulated_y_test)):
        actual = simulated_y_test[i]  # Simulated actual fraud status
        predicted = y_pred[i]  # Model's predicted fraud status
        transaction_amount = X_test.iloc[i]["Montant"]  # Transaction amount from test data

        # Calculate the margin for the current transaction and add it to the total margin
        simulated_margin += calculate_margin(actual, predicted, transaction_amount)

    # Store the results for the current fraud rate
    results.append((fraud_rate, simulated_margin))

# Print the results of the simulations
print("Simulation Results:")
for fraud_rate, margin in results:
    print(f"Fraud Rate: {fraud_rate}, Simulated Margin: {margin}")

Fraud Rate: 0.01, Simulated Margin: 2109040.7784003373
Fraud Rate: 0.05, Simulated Margin: 1274532.9787500459
Fraud Rate: 0.1, Simulated Margin: 242417.2435499819
Fraud Rate: 0.2, Simulated Margin: -1858523.518149816
