In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
import sys

print(sys.executable)

C:\Users\Erik\PycharmProjects\fallstudie_model_engineering\.venv\Scripts\python.exe


In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from src.config import RAW_DATA_PATH, CAT_FEATURES
from src.features import engineer_features, create_categorial_features
from src.predictions import _get_all_predictions, _calculate_strategy_kpis, min_expected_cost, max_success_prob, evaluate_business_strategies

from src.models import (
    train_ohc_encoder,
    train_decision_tree,
    train_hgboost,
    tune_hyperparameters,
)
from src.metrics import (
    get_scores,
    plot_confusion_matrix,
    plot_multiple_precision_recall_curves,
    find_best_f1_threshold,
)

In [7]:
raw_data = pd.read_excel(RAW_DATA_PATH, index_col=0)

In [8]:
# Split features and target
data = raw_data.drop_duplicates()
y = data["success"]
X = data.drop(columns=["success"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [9]:
# Engineer Features
X_train = create_categorial_features(data=X_train)
X_test = create_categorial_features(data=X_test)
ohc = train_ohc_encoder(data=X_train[CAT_FEATURES])
X_train = engineer_features(data=X_train, encoder=ohc)
X_test = engineer_features(data=X_test, encoder=ohc)

Train Models

In [10]:
dtm = train_decision_tree(x_train=X_train, y_train=y_train)

In [11]:
hgbm = train_hgboost(x_train=X_train, y_train=y_train)

Exception in thread Thread-4 (_readerthread):
Traceback (most recent call last):
  File [35m"C:\Users\Erik\AppData\Local\Programs\Python\Python313\Lib\threading.py"[0m, line [35m1041[0m, in [35m_bootstrap_inner[0m
    [31mself.run[0m[1;31m()[0m
    [31m~~~~~~~~[0m[1;31m^^[0m
  File [35m"C:\Users\Erik\PycharmProjects\fallstudie_model_engineering\.venv\Lib\site-packages\ipykernel\ipkernel.py"[0m, line [35m788[0m, in [35mrun_closure[0m
    [31m_threading_Thread_run[0m[1;31m(self)[0m
    [31m~~~~~~~~~~~~~~~~~~~~~[0m[1;31m^^^^^^[0m
  File [35m"C:\Users\Erik\AppData\Local\Programs\Python\Python313\Lib\threading.py"[0m, line [35m992[0m, in [35mrun[0m
    [31mself._target[0m[1;31m(*self._args, **self._kwargs)[0m
    [31m~~~~~~~~~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
  File [35m"C:\Users\Erik\AppData\Local\Programs\Python\Python313\Lib\subprocess.py"[0m, line [35m1609[0m, in [35m_readerthread[0m
    buffer.append([31mfh.read[0m[1;31m()[

In [12]:
ohgbm = tune_hyperparameters(x_train=X_train, y_train=y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Beste Parameter gefunden: {'l2_regularization': np.float64(0.8607305832563434), 'learning_rate': np.float64(0.011390426106238142), 'max_depth': 10, 'max_iter': 236, 'min_samples_leaf': 81}


In [13]:
models_to_evaluate = {
    "decision_tree_model": dtm,
    "hgboost_model": hgbm,
    "optimized_hgboost_model": ohgbm,
    "optimized_hgboost_model_1": ohgbm,
}

In [None]:
scores = []
for name, model in models_to_evaluate.items():
    precision, recall, accuracy, f1, roc_auc, cm = get_scores(
        name=name, model=model, y_true=y_test, x_test=X_test, threshold=0.511
    )
    scores.append(
        {
            "model": name,
            "precision": round(precision, 4),
            "recall": round(recall, 4),
            "accuracy": round(accuracy, 4),
            "f1": round(f1, 4),
            "roc-auc": round(roc_auc, 4),
        }
    )
scores = pd.DataFrame(scores)
print(scores)

In [None]:
plot_confusion_matrix(x_test=X_test, y_test=y_test, models=models_to_evaluate)

In [None]:
plot_multiple_precision_recall_curves(
    models=models_to_evaluate, x_test=X_test, y_test=y_test
)

In [None]:
find_best_f1_threshold(x_test=X_test, y_test=y_test, model=ohgbm)

In [None]:
from src.metrics import feature_importance

feature_importance = get_feature_importance(
    models_to_evaluate=models_to_evaluate, x_test=X_test, y_test=y_test
)

In [None]:
feature_importance.dtm.sort_values(ascending=False).head(20)

In [16]:
predictions_df = _get_all_predictions(
    model=ohgbm, x_test=X_test, original_data=raw_data, encoder=ohc
)

In [17]:
predictions_df

Unnamed: 0,original_psp,Moneycard,Goldcard,UK_Card,Simplecard
16866,Simplecard,0.599146,0.688460,0.291215,0.339504
6304,Simplecard,0.240194,0.692742,0.576616,0.462857
43346,Simplecard,0.494667,0.701169,0.248044,0.282984
19952,Simplecard,0.510574,0.666648,0.573469,0.517232
18012,UK_Card,0.454884,0.672690,0.582936,0.352807
...,...,...,...,...,...
32021,Moneycard,0.537555,0.648268,0.271000,0.528781
20378,UK_Card,0.510880,0.728761,0.274349,0.315591
36486,UK_Card,0.663155,0.662544,0.571805,0.371024
13658,UK_Card,0.669966,0.656115,0.567911,0.317281


In [18]:
_calculate_strategy_kpis(choices=predictions_df["original_psp"], y_test=y_test)

(20.683488972779653, 17927.0)

In [19]:
test = min_expected_cost(predictions_df)
test

16866    Simplecard
6304     Simplecard
43346    Simplecard
19952    Simplecard
18012    Simplecard
            ...    
32021    Simplecard
20378    Simplecard
36486    Simplecard
13658    Simplecard
13149    Simplecard
Length: 10066, dtype: object

In [21]:
predictions_df

Unnamed: 0,original_psp,Moneycard,Goldcard,UK_Card,Simplecard
16866,Simplecard,0.599146,0.688460,0.291215,0.339504
6304,Simplecard,0.240194,0.692742,0.576616,0.462857
43346,Simplecard,0.494667,0.701169,0.248044,0.282984
19952,Simplecard,0.510574,0.666648,0.573469,0.517232
18012,UK_Card,0.454884,0.672690,0.582936,0.352807
...,...,...,...,...,...
32021,Moneycard,0.537555,0.648268,0.271000,0.528781
20378,UK_Card,0.510880,0.728761,0.274349,0.315591
36486,UK_Card,0.663155,0.662544,0.571805,0.371024
13658,UK_Card,0.669966,0.656115,0.567911,0.317281


In [24]:
best_probs = max_success_prob(predictions_df)
best_probs.value_counts()

Goldcard      9311
Moneycard      540
UK_Card        174
Simplecard      41
Name: count, dtype: int64

In [25]:
predictions_df.describe()

Unnamed: 0,Moneycard,Goldcard,UK_Card,Simplecard
count,10066.0,10066.0,10066.0,10066.0
mean,0.497541,0.700226,0.461688,0.41132
std,0.112137,0.076501,0.142458,0.103403
min,0.208231,0.544829,0.209079,0.231028
25%,0.419127,0.651965,0.290392,0.324833
50%,0.517576,0.682351,0.558758,0.38255
75%,0.565375,0.724367,0.577083,0.476587
max,0.762069,0.843515,0.664094,0.713105


In [27]:
test = evaluate_business_strategies(ohgbm, X_test, y_test, data, ohc)

In [28]:
test

Unnamed: 0,Total Cost (â‚¬),Success Rate (%)
Legacy System,20.683489,17927.0
Cost-Optimized Model,20.683489,6074.0
Success-Optimized Model,20.683489,57751.5
