In [13]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.stats import uniform, randint

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import (
    train_test_split,
    RandomizedSearchCV,
    StratifiedKFold,
)
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

RAW_DATA_PATH = "data.xlsx"

CAT_FEATURES = ["country", "card", "PSP"]

CYCLICAL_FEATURES = {"day": 31, "dow": 7, "hour": 24}

PSP_COSTS = {
    "Moneycard": {"success": 5, "failure": 2},
    "Goldcard": {"success": 10, "failure": 5},
    "UK_Card": {"success": 3, "failure": 1},
    "Simplecard": {"success": 1, "failure": 0.5},
}

PARAM_DIST = {
    "learning_rate": uniform(0.01, 0.2),
    "max_iter": randint(100, 500),
    "max_depth": randint(3, 10),
    "l2_regularization": uniform(0, 1),
    "min_samples_leaf": randint(20, 100),
}

In [14]:
def engineer_features(data: pd.DataFrame, ohc: OneHotEncoder) -> pd.DataFrame:
    """Drop duplicates and generate Features."""
    data = data.copy()
    data = data.drop_duplicates()

    # Informationen aus Zeitstempel extrahieren
    data["month"] = data.loc[:, "tmsp"].dt.month.astype("int64")
    data["week"] = data.loc[:, "tmsp"].dt.isocalendar().week.astype("int64")
    data["day"] = data.loc[:, "tmsp"].dt.day.astype("int64")
    data["dow"] = data.loc[:, "tmsp"].dt.dayofweek.astype("int64")
    data["hour"] = data.loc[:, "tmsp"].dt.hour.astype("int64")
    data["second"] = data.loc[:, "tmsp"].dt.second.astype("int64")
    data["is_weekend"] = data["dow"] >= 5
    data["is_business_hours"] = (data["hour"] >= 8) & (data["hour"] < 20)

    # Zeit-Features zyklisch kodieren
    # week und month nicht zyklisch kodieren da kein Zyklusübergang
    for key, value in CYCLICAL_FEATURES.items():
        data[f"{key}_sin"] = np.sin(2 * np.pi * data[key] / value)
        data[f"{key}_cos"] = np.cos(2 * np.pi * data[key] / value)

    # Kosten
    data["cost_if_success"] = data["PSP"].map(lambda psp: PSP_COSTS[psp]["success"])
    data["cost_if_failure"] = data["PSP"].map(lambda psp: PSP_COSTS[psp]["failure"])

    # Wiederholte Transaktionsversuche aufgrund fehlgeschlagener Transaktionen
    data["timedelta"] = data["tmsp"].diff().dt.total_seconds().fillna(0).astype("int64")
    cols_to_compare = ["country", "amount", "3D_secured", "card"]
    data["is_retry"] = (data[cols_to_compare] == data[cols_to_compare].shift(1)).all(
        axis=1
    )

    # Anzahl kontinuierlicher Retry Versuche
    retry_groups = (~data["is_retry"]).cumsum()
    data["retry_count"] = (
        data.groupby(retry_groups)["is_retry"].cumsum().astype("int64")
    )

    # Wechsel PSP bei Retry
    data["PSP_switch"] = data.groupby(retry_groups)["PSP"].transform(
        lambda x: (x != x.shift()).fillna(False).cumsum() > 0
    )

    # kategorische Merkmale encodieren
    encoded_array = ohc.transform(data[CAT_FEATURES])
    encoded_columns = ohc.get_feature_names_out(CAT_FEATURES)
    encoded_df = pd.DataFrame(encoded_array, columns=encoded_columns, index=data.index)
    data = pd.concat([data, encoded_df], axis=1)

    # Timestamp und nicht kategorische features entfernen
    data = data.drop(columns=["tmsp", "PSP", "country", "card"], axis=1)

    return data

In [15]:
def train_ohc_encoder(data: pd.DataFrame) -> OneHotEncoder:
    """Trains and saves a OneHotEncoder for categorical features."""
    one_hot_encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoder.fit_transform(data)
    return one_hot_encoder


def train_decision_tree(
    x_train: pd.DataFrame,
    y_train: pd.Series,
) -> DecisionTreeClassifier:
    """Trains a Decision Tree Classifier."""
    decision_tree_model = DecisionTreeClassifier(max_depth=5, random_state=42)
    decision_tree_model.fit(x_train, y_train)

    return decision_tree_model


def train_hgboost(
    x_train: pd.DataFrame,
    y_train: pd.Series,
) -> HistGradientBoostingClassifier:
    """Trains a HGBoost Classifier."""
    hgboost_model = HistGradientBoostingClassifier(
        max_leaf_nodes=30, learning_rate=0.05, random_state=42
    )
    hgboost_model.fit(x_train, y_train)

    return hgboost_model


def calculate_success_probability(model, features: pd.DataFrame) -> float:
    """Calculates the success probability for a given model and features."""
    return model.predict_proba(features)[:, 1]

In [16]:
def evaluate_technical_performance(
    model, x_test: pd.DataFrame, y_test: pd.DataFrame
) -> None:
    """Evaluates the model on the test set by calculating accuracy."""
    y_pred_proba = calculate_success_probability(model, x_test)
    score = roc_auc_score(y_test, y_pred_proba)
    print(f"ROC AUC Score: {score:.4f}")


def _calculate_actual_costs(choices, y_true) -> float:
    """
    Helper function to calculate the total actual cost for a series of PSP choices
    based on the true transaction outcomes.
    """
    total_cost = 0

    for index, psp_choice in choices.items():
        if psp_choice in PSP_COSTS:
            cost_dict = PSP_COSTS[psp_choice]
            # Use the true outcome (y_true) to determine the actual cost
            actual_cost = (
                cost_dict["success"] if y_true.loc[index] else cost_dict["failure"]
            )
            total_cost += actual_cost
    return total_cost


def evaluate_business_impact(
    model, x_test: pd.DataFrame, y_test: pd.DataFrame, original_data: pd.DataFrame
) -> None:
    """Evaluates and compares the financial outcome of the model's routing strategy
    against the legacy system's strategy on the test set."""
    all_model_columns = x_test.columns.tolist()
    expected_costs_df = pd.DataFrame(index=x_test.index)

    for psp in PSP_COSTS:
        simulated_features = x_test.copy()

        for col in all_model_columns:
            if col.startswith("PSP_"):
                simulated_features[col] = 0
        simulated_features[f"PSP_{psp}"] = 1
        simulated_features = simulated_features.reindex(
            columns=all_model_columns, fill_value=0
        )
        prob_success = calculate_success_probability(model, simulated_features)

        expected_costs_df[psp] = (
            prob_success * PSP_COSTS[psp]["success"]
            + (1 - prob_success) * PSP_COSTS[psp]["failure"]
        )

    # Calculate Model Strategy Cost
    model_choices = expected_costs_df.idxmin(axis=1)
    total_cost_model = _calculate_actual_costs(model_choices, y_test)

    # Calculate Legacy System Cost
    legacy_choices = original_data.loc[x_test.index, "PSP"]
    total_cost_legacy = _calculate_actual_costs(legacy_choices, y_test)

    # Report the Financial Outcome
    savings = total_cost_legacy - total_cost_model
    savings_percent = (
        (savings / total_cost_legacy) * 100 if total_cost_legacy > 0 else 0
    )

    print(f"  Legacy System Cost: {total_cost_legacy:,.2f} €")
    print(f"  Model Strategy Cost: {total_cost_model:,.2f} €")
    print(f"  Savings: {savings:,.2f} € ({savings_percent:.2f}%)")

In [23]:
def find_best_hgb_model(
    x_train: pd.DataFrame, y_train: pd.DataFrame
) -> HistGradientBoostingClassifier:
    """Run randomized search to find the best hyperparameters."""
    hgboost_model = HistGradientBoostingClassifier(random_state=42)
    cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    random_search = RandomizedSearchCV(
        estimator=hgboost_model,
        param_distributions=PARAM_DIST,
        n_iter=1,
        cv=cv_strategy,
        scoring="roc_auc",
        n_jobs=-1,
        random_state=42,
        verbose=0,
    )

    random_search.fit(x_train, y_train)

    print(f"Beste Parameter gefunden: {random_search.best_params_}")
    print(f"Bester CV AUC-Score: {random_search.best_score_:.4f}")

    final_model = HistGradientBoostingClassifier(
        **random_search.best_params_, random_state=42
    )
    final_model.fit(x_train, y_train)

    return final_model

In [24]:
raw_data = pd.read_excel(RAW_DATA_PATH, index_col=0)

In [25]:
ohc = train_ohc_encoder(data=raw_data[CAT_FEATURES])

In [26]:
processed_data = engineer_features(data=raw_data, ohc=ohc)

In [27]:
y = processed_data["success"]
X = processed_data.drop(columns=["success"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [33]:
# Train models
dcm = train_decision_tree(x_train=X_train, y_train=y_train)
hgbm = train_hgboost(x_train=X_train, y_train=y_train)
oghbm = find_best_hgb_model(x_train=X_train, y_train=y_train)
models_to_evaluate = {
    "decision_tree_model": dcm,
    "hgboost_model": hgbm,
    "optimized_hgboost_model": oghbm,
}


print("\n--- Model Evaluation ---")
for name, model in models_to_evaluate.items():
    print(f"\nEvaluating {name}")
    evaluate_technical_performance(model=model, x_test=X_test, y_test=y_test)
    evaluate_business_impact(
        model=model, x_test=X_test, y_test=y_test, original_data=raw_data
    )
print("\n--- Evaluation complete ---")

Beste Parameter gefunden: {'l2_regularization': 0.3745401188473625, 'learning_rate': 0.20014286128198325, 'max_depth': 5, 'max_iter': 171, 'min_samples_leaf': 80}
Bester CV AUC-Score: 0.6750

--- Model Evaluation ---

Evaluating decision_tree_model
ROC AUC Score: 0.6600
  Legacy System Cost: 17,927.00 €
  Model Strategy Cost: 6,074.00 €
  Savings: 11,853.00 € (66.12%)

Evaluating hgboost_model
ROC AUC Score: 0.6790
  Legacy System Cost: 17,927.00 €
  Model Strategy Cost: 6,074.00 €
  Savings: 11,853.00 € (66.12%)

Evaluating optimized_hgboost_model
ROC AUC Score: 0.6757
  Legacy System Cost: 17,927.00 €
  Model Strategy Cost: 6,074.00 €
  Savings: 11,853.00 € (66.12%)

--- Evaluation complete ---


In [34]:
# Train models
dcm = train_decision_tree(x_train=X_train, y_train=y_train)
hgbm = train_hgboost(x_train=X_train, y_train=y_train)
models_to_evaluate = {
    "decision_tree_model": dcm,
    "hgboost_model": hgbm,
    "optimized_hgboost_model": oghbm,
}


print("\n--- Model Evaluation ---")
dcm = train_decision_tree(x_train=X_train, y_train=y_train)
evaluate_technical_performance(model=dcm, x_test=X_test, y_test=y_test)
evaluate_business_impact(
    model=dcm, x_test=X_test, y_test=y_test, original_data=raw_data
)
print("\n--- Evaluation complete ---")

print("\n--- Model Evaluation ---")
hgbm = train_hgboost(x_train=X_train, y_train=y_train)
evaluate_technical_performance(model=hgbm, x_test=X_test, y_test=y_test)
evaluate_business_impact(
    model=hgbm, x_test=X_test, y_test=y_test, original_data=raw_data
)
print("\n--- Evaluation complete ---")

print("\n--- Model Evaluation ---")
oghbm = find_best_hgb_model(x_train=X_train, y_train=y_train)
evaluate_technical_performance(model=oghbm, x_test=X_test, y_test=y_test)
evaluate_business_impact(
    model=oghbm, x_test=X_test, y_test=y_test, original_data=raw_data
)
print("\n--- Evaluation complete ---")


--- Model Evaluation ---
ROC AUC Score: 0.6600
  Legacy System Cost: 17,927.00 €
  Model Strategy Cost: 6,074.00 €
  Savings: 11,853.00 € (66.12%)

--- Evaluation complete ---

--- Model Evaluation ---
ROC AUC Score: 0.6790
  Legacy System Cost: 17,927.00 €
  Model Strategy Cost: 6,074.00 €
  Savings: 11,853.00 € (66.12%)

--- Evaluation complete ---

--- Model Evaluation ---
Beste Parameter gefunden: {'l2_regularization': 0.3745401188473625, 'learning_rate': 0.20014286128198325, 'max_depth': 5, 'max_iter': 171, 'min_samples_leaf': 80}
Bester CV AUC-Score: 0.6750
ROC AUC Score: 0.6757
  Legacy System Cost: 17,927.00 €
  Model Strategy Cost: 6,074.00 €
  Savings: 11,853.00 € (66.12%)

--- Evaluation complete ---


In [35]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.stats import uniform, randint

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import (
    train_test_split,
    RandomizedSearchCV,
    StratifiedKFold,
)
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

RAW_DATA_PATH = "data.xlsx"

CAT_FEATURES = ["country", "card", "PSP"]

CYCLICAL_FEATURES = {"day": 31, "dow": 7, "hour": 24}

PSP_COSTS = {
    "Moneycard": {"success": 5, "failure": 2},
    "Goldcard": {"success": 10, "failure": 5},
    "UK_Card": {"success": 3, "failure": 1},
    "Simplecard": {"success": 1, "failure": 0.5},
}

PARAM_DIST = {
    "learning_rate": uniform(0.01, 0.2),
    "max_iter": randint(100, 500),
    "max_depth": randint(3, 10),
    "l2_regularization": uniform(0, 1),
    "min_samples_leaf": randint(20, 100),
}


def engineer_features(data: pd.DataFrame, ohc: OneHotEncoder) -> pd.DataFrame:
    """Drop duplicates and generate Features."""
    data = data.copy()
    data = data.drop_duplicates()

    # Informationen aus Zeitstempel extrahieren
    data["month"] = data.loc[:, "tmsp"].dt.month.astype("int64")
    data["week"] = data.loc[:, "tmsp"].dt.isocalendar().week.astype("int64")
    data["day"] = data.loc[:, "tmsp"].dt.day.astype("int64")
    data["dow"] = data.loc[:, "tmsp"].dt.dayofweek.astype("int64")
    data["hour"] = data.loc[:, "tmsp"].dt.hour.astype("int64")
    data["second"] = data.loc[:, "tmsp"].dt.second.astype("int64")
    data["is_weekend"] = data["dow"] >= 5
    data["is_business_hours"] = (data["hour"] >= 8) & (data["hour"] < 20)

    # Zeit-Features zyklisch kodieren
    # week und month nicht zyklisch kodieren da kein Zyklusübergang
    for key, value in CYCLICAL_FEATURES.items():
        data[f"{key}_sin"] = np.sin(2 * np.pi * data[key] / value)
        data[f"{key}_cos"] = np.cos(2 * np.pi * data[key] / value)

    # Kosten
    data["cost_if_success"] = data["PSP"].map(lambda psp: PSP_COSTS[psp]["success"])
    data["cost_if_failure"] = data["PSP"].map(lambda psp: PSP_COSTS[psp]["failure"])

    # Wiederholte Transaktionsversuche aufgrund fehlgeschlagener Transaktionen
    data["timedelta"] = data["tmsp"].diff().dt.total_seconds().fillna(0).astype("int64")
    cols_to_compare = ["country", "amount", "3D_secured", "card"]
    data["is_retry"] = (data[cols_to_compare] == data[cols_to_compare].shift(1)).all(
        axis=1
    )

    # Anzahl kontinuierlicher Retry Versuche
    retry_groups = (~data["is_retry"]).cumsum()
    data["retry_count"] = (
        data.groupby(retry_groups)["is_retry"].cumsum().astype("int64")
    )

    # Wechsel PSP bei Retry
    data["PSP_switch"] = data.groupby(retry_groups)["PSP"].transform(
        lambda x: (x != x.shift()).fillna(False).cumsum() > 0
    )

    # kategorische Merkmale encodieren
    encoded_array = ohc.transform(data[CAT_FEATURES])
    encoded_columns = ohc.get_feature_names_out(CAT_FEATURES)
    encoded_df = pd.DataFrame(encoded_array, columns=encoded_columns, index=data.index)
    data = pd.concat([data, encoded_df], axis=1)

    # Timestamp und nicht kategorische features entfernen
    data = data.drop(columns=["tmsp", "PSP", "country", "card"], axis=1)

    return data


def train_ohc_encoder(data: pd.DataFrame) -> OneHotEncoder:
    """Trains and saves a OneHotEncoder for categorical features."""
    one_hot_encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoder.fit_transform(data)
    return one_hot_encoder


def train_decision_tree(
    x_train: pd.DataFrame,
    y_train: pd.Series,
) -> DecisionTreeClassifier:
    """Trains a Decision Tree Classifier."""
    decision_tree_model = DecisionTreeClassifier(max_depth=5, random_state=42)
    decision_tree_model.fit(x_train, y_train)

    return decision_tree_model


def train_hgboost(
    x_train: pd.DataFrame,
    y_train: pd.Series,
) -> HistGradientBoostingClassifier:
    """Trains a HGBoost Classifier."""
    hgboost_model = HistGradientBoostingClassifier(
        max_leaf_nodes=30, learning_rate=0.05, random_state=42
    )
    hgboost_model.fit(x_train, y_train)

    return hgboost_model


def calculate_success_probability(model, features: pd.DataFrame) -> float:
    """Calculates the success probability for a given model and features."""
    return model.predict_proba(features)[:, 1]


def evaluate_technical_performance(
    model, x_test: pd.DataFrame, y_test: pd.DataFrame
) -> None:
    """Evaluates the model on the test set by calculating accuracy."""
    y_pred_proba = calculate_success_probability(model, x_test)
    score = roc_auc_score(y_test, y_pred_proba)
    print(f"ROC AUC Score: {score:.4f}")


def _calculate_actual_costs(choices, y_true) -> float:
    """
    Helper function to calculate the total actual cost for a series of PSP choices
    based on the true transaction outcomes.
    """
    total_cost = 0

    for index, psp_choice in choices.items():
        if psp_choice in PSP_COSTS:
            cost_dict = PSP_COSTS[psp_choice]
            # Use the true outcome (y_true) to determine the actual cost
            actual_cost = (
                cost_dict["success"] if y_true.loc[index] else cost_dict["failure"]
            )
            total_cost += actual_cost
    return total_cost


def evaluate_business_impact(
    model, x_test: pd.DataFrame, y_test: pd.DataFrame, original_data: pd.DataFrame
) -> None:
    """Evaluates and compares the financial outcome of the model's routing strategy
    against the legacy system's strategy on the test set."""
    all_model_columns = x_test.columns.tolist()
    expected_costs_df = pd.DataFrame(index=x_test.index)

    for psp in PSP_COSTS:
        simulated_features = x_test.copy()

        for col in all_model_columns:
            if col.startswith("PSP_"):
                simulated_features[col] = 0
        simulated_features[f"PSP_{psp}"] = 1
        simulated_features = simulated_features.reindex(
            columns=all_model_columns, fill_value=0
        )
        prob_success = calculate_success_probability(model, simulated_features)

        expected_costs_df[psp] = (
            prob_success * PSP_COSTS[psp]["success"]
            + (1 - prob_success) * PSP_COSTS[psp]["failure"]
        )

    # Calculate Model Strategy Cost
    model_choices = expected_costs_df.idxmin(axis=1)
    total_cost_model = _calculate_actual_costs(model_choices, y_test)

    # Calculate Legacy System Cost
    legacy_choices = original_data.loc[x_test.index, "PSP"]
    total_cost_legacy = _calculate_actual_costs(legacy_choices, y_test)

    # Report the Financial Outcome
    savings = total_cost_legacy - total_cost_model
    savings_percent = (
        (savings / total_cost_legacy) * 100 if total_cost_legacy > 0 else 0
    )

    print(f"  Legacy System Cost: {total_cost_legacy:,.2f} €")
    print(f"  Model Strategy Cost: {total_cost_model:,.2f} €")
    print(f"  Savings: {savings:,.2f} € ({savings_percent:.2f}%)")


def find_best_hgb_model(
    x_train: pd.DataFrame, y_train: pd.DataFrame
) -> HistGradientBoostingClassifier:
    """Run randomized search to find the best hyperparameters."""
    hgboost_model = HistGradientBoostingClassifier(random_state=42)
    cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    random_search = RandomizedSearchCV(
        estimator=hgboost_model,
        param_distributions=PARAM_DIST,
        n_iter=1,
        cv=cv_strategy,
        scoring="roc_auc",
        n_jobs=-1,
        random_state=42,
        verbose=0,
    )

    random_search.fit(x_train, y_train)

    print(f"Beste Parameter gefunden: {random_search.best_params_}")
    print(f"Bester CV AUC-Score: {random_search.best_score_:.4f}")

    final_model = HistGradientBoostingClassifier(
        **random_search.best_params_, random_state=42
    )
    final_model.fit(x_train, y_train)

    return final_model


raw_data = pd.read_excel(RAW_DATA_PATH, index_col=0)
ohc = train_ohc_encoder(data=raw_data[CAT_FEATURES])
processed_data = engineer_features(data=raw_data, ohc=ohc)
y = processed_data["success"]
X = processed_data.drop(columns=["success"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Train models
dcm = train_decision_tree(x_train=X_train, y_train=y_train)
hgbm = train_hgboost(x_train=X_train, y_train=y_train)
oghbm = find_best_hgb_model(x_train=X_train, y_train=y_train)
models_to_evaluate = {
    "decision_tree_model": dcm,
    "hgboost_model": hgbm,
    "optimized_hgboost_model": oghbm,
}


print("\n--- Model Evaluation ---")
for name, model in models_to_evaluate.items():
    print(f"\nEvaluating {name}")
    evaluate_technical_performance(model=model, x_test=X_test, y_test=y_test)
    evaluate_business_impact(
        model=model, x_test=X_test, y_test=y_test, original_data=raw_data
    )
print("\n--- Evaluation complete ---")

Beste Parameter gefunden: {'l2_regularization': 0.3745401188473625, 'learning_rate': 0.20014286128198325, 'max_depth': 5, 'max_iter': 171, 'min_samples_leaf': 80}
Bester CV AUC-Score: 0.6750

--- Model Evaluation ---

Evaluating decision_tree_model
ROC AUC Score: 0.6600
  Legacy System Cost: 17,927.00 €
  Model Strategy Cost: 6,074.00 €
  Savings: 11,853.00 € (66.12%)

Evaluating hgboost_model
ROC AUC Score: 0.6790
  Legacy System Cost: 17,927.00 €
  Model Strategy Cost: 6,074.00 €
  Savings: 11,853.00 € (66.12%)

Evaluating optimized_hgboost_model
ROC AUC Score: 0.6757
  Legacy System Cost: 17,927.00 €
  Model Strategy Cost: 6,074.00 €
  Savings: 11,853.00 € (66.12%)

--- Evaluation complete ---
