In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Cost charged per transaction for each PSP, split by transaction outcome.
# Written as (PSP, cost on success, cost on failure) rows.
COSTS = {
    psp: {"success": on_success, "failed": on_failure}
    for psp, on_success, on_failure in (
        ("Moneycard", 5, 2),
        ("Goldcard", 10, 5),
        ("UK_Card", 3, 1),
        ("Simplecard", 1, 0.5),
    )
}
# Time columns that get a sin/cos encoding, mapped to the period used as divisor.
TIME_FEATURES = dict(day=31, dow=7, hour=24)

# Read the raw transactions and discard the "Unnamed: 0" column
# (presumably an exported row index -- confirm against the workbook).
df = pd.read_excel("data.xlsx").drop(columns=["Unnamed: 0"])


def process_data(processed: pd.DataFrame = df) -> pd.DataFrame:
    """Drop duplicates and generate features.

    Args:
        processed: Raw transactions. The function reads the columns ``tmsp``
            (accessed through ``.dt``, so it must be datetime-like), ``PSP``,
            ``success``, ``country``, ``amount``, ``3D_secured`` and ``card``.
            Defaults to the module-level ``df`` as it was bound when this
            function was defined.

    Returns:
        A new DataFrame (the input is not modified) without exact duplicate
        rows, with the derived time, cost and retry features added, with
        ``country`` and ``card`` one-hot encoded, and with the original
        ``tmsp``, ``country`` and ``card`` columns removed.

    Raises:
        KeyError: If ``PSP`` contains a value that has no entry in ``COSTS``.

    Note:
        ``timedelta``, ``is_retry``, ``retry_count`` and ``PSP_switch`` compare
        each row with the row directly before it, so the rows are assumed to
        be in chronological order -- TODO confirm against the data source.
    """
    processed = processed.drop_duplicates().copy()

    # Extract calendar components from the timestamp
    tmsp = processed["tmsp"]
    processed["month"] = tmsp.dt.month.astype("int64")
    processed["week"] = tmsp.dt.isocalendar().week.astype("int64")
    processed["day"] = tmsp.dt.day.astype("int64")
    processed["dow"] = tmsp.dt.dayofweek.astype("int64")
    processed["hour"] = tmsp.dt.hour.astype("int64")
    processed["second"] = tmsp.dt.second.astype("int64")
    processed["is_weekend"] = processed["dow"] >= 5
    processed["is_business_hours"] = (processed["hour"] >= 8) & (
        processed["hour"] < 20
    )

    # Encode the time features cyclically (sin/cos over their period).
    # week and month are left as they are: per the original author's note
    # there is no cycle wrap-around for them.
    for feature, period in TIME_FEATURES.items():
        angle = 2 * np.pi * processed[feature] / period
        processed[f"{feature}_sin"] = np.sin(angle)
        processed[f"{feature}_cos"] = np.cos(angle)

    # Cost of each transaction, looked up by PSP and outcome.
    # Fail loudly on unknown PSPs instead of silently producing NaN costs.
    unknown_psps = set(processed["PSP"].unique()) - set(COSTS)
    if unknown_psps:
        raise KeyError(
            f"No cost defined for PSP(s): {sorted(map(str, unknown_psps))}"
        )
    cost_if_success = processed["PSP"].map(
        {psp: cost["success"] for psp, cost in COSTS.items()}
    )
    cost_if_failed = processed["PSP"].map(
        {psp: cost["failed"] for psp, cost in COSTS.items()}
    )
    processed["cost"] = np.where(
        processed["success"].astype(bool), cost_if_success, cost_if_failed
    )

    # Repeated transaction attempts: a row counts as a retry when it matches
    # the previous row on all compared columns.
    processed["timedelta"] = (
        processed["tmsp"].diff().dt.total_seconds().fillna(0).astype("int64")
    )
    cols_to_compare = ["country", "amount", "3D_secured", "card"]
    processed["is_retry"] = (
        processed[cols_to_compare] == processed[cols_to_compare].shift(1)
    ).all(axis=1)

    # Number of consecutive retry attempts; every non-retry row opens a new group
    retry_groups = (~processed["is_retry"]).cumsum()
    processed["retry_count"] = (
        processed.groupby(retry_groups)["is_retry"].cumsum().astype("int64")
    )

    # PSP switch during a retry chain: flagged on the retry whose PSP differs
    # from the previous attempt and on every later retry of the same chain.
    # The running count per retry group implements the row-to-row carry-over
    # that a single vectorised shift of the column itself cannot express.
    switched_now = processed["is_retry"] & (
        processed["PSP"] != processed["PSP"].shift(1)
    )
    processed["PSP_switch"] = (
        switched_now.astype("int64").groupby(retry_groups).cumsum().gt(0)
    )

    # TODO: number of consecutive failures across different transactions
    # (section header only, not implemented).

    # One-hot encode the categorical features
    cat_features = processed[["country", "card"]]
    one_hot_encoder = OneHotEncoder(sparse_output=False)
    encoded_array = one_hot_encoder.fit_transform(cat_features)
    encoded_columns = one_hot_encoder.get_feature_names_out(cat_features.columns)
    encoded_df = pd.DataFrame(
        encoded_array, columns=encoded_columns, index=processed.index
    )
    processed = pd.concat([processed, encoded_df], axis=1)

    # Remove the timestamp and the raw categorical columns that were encoded above
    processed = processed.drop(columns=["tmsp", "country", "card"])

    return processed


# Replace the raw frame with its feature-engineered version.
df = process_data(df)

In [5]:
# Display the processed feature table.
df

Unnamed: 0,amount,success,PSP,3D_secured,month,week,day,dow,hour,second,...,timedelta,is_retry,retry_count,PSP_switch,country_Austria,country_Germany,country_Switzerland,card_Diners,card_Master,card_Visa
0,89,0,UK_Card,0,1,1,1,1,0,11,...,0,False,0,False,0.0,1.0,0.0,0.0,0.0,1.0
1,89,1,UK_Card,0,1,1,1,1,0,17,...,6,True,1,False,0.0,1.0,0.0,0.0,0.0,1.0
2,238,0,UK_Card,1,1,1,1,1,0,49,...,92,False,0,False,0.0,1.0,0.0,1.0,0.0,0.0
3,238,1,UK_Card,1,1,1,1,1,0,13,...,24,True,1,False,0.0,1.0,0.0,1.0,0.0,0.0
4,124,0,Simplecard,0,1,1,1,1,0,33,...,80,False,0,False,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50324,415,0,UK_Card,0,2,9,28,3,23,39,...,123,False,0,False,0.0,0.0,1.0,0.0,0.0,1.0
50325,91,0,UK_Card,0,2,9,28,3,23,48,...,69,False,0,False,1.0,0.0,0.0,0.0,1.0,0.0
50326,91,0,UK_Card,0,2,9,28,3,23,4,...,16,True,1,False,1.0,0.0,0.0,0.0,1.0,0.0
50327,91,0,UK_Card,0,2,9,28,3,23,36,...,32,True,2,False,1.0,0.0,0.0,0.0,1.0,0.0


In [6]:
# "cost" is computed from PSP and success in process_data, so keeping it as a
# feature would leak the target; drop it together with the label columns.
X = df.drop(columns=["PSP", "success", "cost"])
y = df["PSP"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
# Decision tree classifier; random_state is fixed for reproducible fits.
model = tree.DecisionTreeClassifier(random_state=42)

In [8]:
# Train on the training split, then predict PSP labels for the held-out split.
preds = model.fit(X_train, y_train).predict(X_test)

In [9]:
# Probability of the second class in model.classes_ for the first test sample.
# "PSP" is the target and not a column of X_test; predict_proba needs a 2-D
# frame with the training columns, so one row is selected with a list index.
model.predict_proba(X_test.iloc[[0]])[0, 1]