In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell executes, so edits to the project's src package apply without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Print the path of the Python interpreter running this kernel, as a quick
# check that the intended environment is active.
print(sys.executable)

C:\Users\Erik\PycharmProjects\fallstudie_model_engineering\.venv\Scripts\python.exe


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from src.config import RAW_DATA_PATH, CAT_FEATURES, DECISION_TREE_PATH, HGBOOST_OPTIMIZED_PATH, HGBOOST_PATH
from src.features import engineer_features, create_categorial_features
from src.predictions import _get_all_predictions, _calculate_strategy_kpis, min_expected_cost, max_success_prob, evaluate_business_strategies

from src.models import (
    train_ohc_encoder,
    train_decision_tree,
    train_hgboost,
    tune_hyperparameters,
)
from src.metrics import (
    get_scores,
    plot_confusion_matrix,
    plot_multiple_precision_recall_curves,
    find_best_f1_threshold,
)

In [None]:
# Read the raw dataset from the Excel file at RAW_DATA_PATH; the first column
# of the sheet is used as the DataFrame index.
raw_data = pd.read_excel(RAW_DATA_PATH, index_col=0)

In [None]:
# Remove duplicate rows, then separate the "success" target from the
# remaining feature columns.
data = raw_data.drop_duplicates()
X, y = data.drop(columns="success"), data["success"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Feature engineering, applied identically to both splits.
# Step 1: derive the categorical features for train and test.
X_train, X_test = [
    create_categorial_features(data=split) for split in (X_train, X_test)
]

# Step 2: train the encoder on the training split's CAT_FEATURES columns only,
# so nothing from the test split influences the encoder.
ohc = train_ohc_encoder(data=X_train[CAT_FEATURES])

# Step 3: build the final feature frames with the trained encoder.
# NOTE(review): this cell overwrites X_train / X_test, so it is meant to run
# exactly once after the split cell — re-run the split cell before re-running it.
X_train, X_test = [
    engineer_features(data=split, encoder=ohc) for split in (X_train, X_test)
]

Train Models

In [None]:
# Train the decision-tree model on the engineered training features and labels.
decision_tree_model = train_decision_tree(x_train=X_train, y_train=y_train)

In [None]:
# Train the hgboost model on the same engineered training features and labels.
hgboost_model = train_hgboost(x_train=X_train, y_train=y_train)

In [None]:
# Run the hyperparameter search on the training split. The result is used as a
# fitted model further down (it is registered in models_to_evaluate).
# NOTE(review): presumably the most expensive cell of the notebook — confirm runtime.
hgboost_optimized_model = tune_hyperparameters(x_train=X_train, y_train=y_train)

In [None]:
# Registry of the trained models; each key is the label under which the model
# appears in the evaluation cells below.
models_to_evaluate = dict(
    decision_tree_model=decision_tree_model,
    hgboost_model=hgboost_model,
    optimized_hgboost_model=hgboost_optimized_model,
)

In [None]:
# Score every registered model on the test split with one shared decision
# threshold and gather the rounded metrics into a single comparison table.
metric_columns = ("precision", "recall", "accuracy", "f1", "roc-auc")
score_rows = []
for model_name, fitted_model in models_to_evaluate.items():
    # get_scores returns six values; the last one (presumably the confusion
    # matrix) is not part of the table.
    precision, recall, accuracy, f1, roc_auc, _cm = get_scores(
        name=model_name, model=fitted_model, y_true=y_test, x_test=X_test, threshold=0.511
    )
    rounded = {
        column: round(value, 4)
        for column, value in zip(
            metric_columns, (precision, recall, accuracy, f1, roc_auc)
        )
    }
    score_rows.append({"model": model_name, **rounded})

scores = pd.DataFrame(score_rows)
print(scores)

In [None]:
# Plot the confusion matrices of all registered models on the test split.
plot_confusion_matrix(x_test=X_test, y_test=y_test, models=models_to_evaluate)

In [None]:
# Plot the precision-recall curves of all registered models on the test split,
# for a side-by-side comparison.
plot_multiple_precision_recall_curves(
    models=models_to_evaluate, x_test=X_test, y_test=y_test
)

In [None]:
# Search for the best F1 decision threshold of the tuned model on the test split.
# The tuned model is bound to hgboost_optimized_model by the hyperparameter-tuning
# cell; no variable called ohgboost_model is defined in this notebook, so the
# previous reference raised NameError on a fresh kernel.
find_best_f1_threshold(x_test=X_test, y_test=y_test, model=hgboost_optimized_model)

In [None]:
# Import the helper under the name that is actually called below. The cell
# previously imported `feature_importance` but called `get_feature_importance`,
# which raised NameError on a fresh kernel.
# NOTE(review): confirm the function is exported as get_feature_importance in src.metrics.
from src.metrics import get_feature_importance

# Compute feature importances for every registered model on the test split.
# The result is read with one attribute per model name in the following cell.
feature_importance = get_feature_importance(
    models_to_evaluate=models_to_evaluate, x_test=X_test, y_test=y_test
)

In [None]:
# Show the 20 largest importance values for the decision-tree model, in descending order.
# NOTE(review): assumes feature_importance exposes one column per model name — confirm.
feature_importance.decision_tree_model.sort_values(ascending=False).head(20)

In [None]:
# Build the prediction frame for the test split with the tuned model.
# The tuned model is bound to hgboost_optimized_model by the hyperparameter-tuning
# cell; the previously referenced ohgboost_model is never defined in this
# notebook and raised NameError on a fresh kernel.
# NOTE(review): _get_all_predictions is a private helper of src.predictions.
predictions_df = _get_all_predictions(
    model=hgboost_optimized_model, x_test=X_test, original_data=raw_data, encoder=ohc
)

In [None]:
# Display the prediction frame for a visual check.
predictions_df

In [None]:
# Compute the strategy KPIs for the choices stored in the "original_psp" column,
# evaluated against the test labels (presumably the historical baseline — confirm).
# NOTE(review): _calculate_strategy_kpis is a private helper of src.predictions.
_calculate_strategy_kpis(choices=predictions_df["original_psp"], y_test=y_test)

In [None]:
# Apply the min_expected_cost strategy to the prediction frame and display the result.
# NOTE(review): `test` is a scratch name that a later cell rebinds to a different result.
test = min_expected_cost(predictions_df)
test

In [None]:
# Display the prediction frame again after the min_expected_cost call
# (presumably to check whether that call modified it — confirm).
predictions_df

In [None]:
# Apply the max_success_prob strategy and count how often each resulting value occurs.
best_probs = max_success_prob(predictions_df)
best_probs.value_counts()

In [None]:
# Summary statistics of the prediction frame.
predictions_df.describe()

In [None]:
# Evaluate the business strategies with the tuned model on the test split.
# The tuned model is bound to hgboost_optimized_model by the hyperparameter-tuning
# cell; the previously referenced ohgboost_model is never defined in this
# notebook and raised NameError on a fresh kernel.
# NOTE(review): this call passes the de-duplicated `data`, whereas
# _get_all_predictions above receives `raw_data` — confirm which one is intended.
test = evaluate_business_strategies(hgboost_optimized_model, X_test, y_test, data, ohc)

In [None]:
# Display the result of the business-strategy evaluation.
test

In [7]:
from src.main import main