In [None]:
# Function that stops the script automatically if the wrong lakehouse is attached as the default destination

def lakehouse_gatekeep(expected_db = "ml_curate_lakehouse"):
    """Stop the notebook when the wrong default lakehouse is attached.

    Reads the current database from ``spark.catalog`` and compares it with
    ``expected_db``. On a match a confirmation line is printed and the
    function returns ``None``.

    Args:
        expected_db: Name the current database must have for the run to continue.

    Raises:
        RuntimeError: If the current database differs from ``expected_db``.
    """
    db = spark.catalog.currentDatabase()

    if db != expected_db:
        # The message parts are joined by implicit string concatenation, so the
        # space between the two sentences has to be written out explicitly.
        raise RuntimeError(
            f"Forkert Lakehouse er tilkoblet som default: '{db}'. "
            f"Det rigtige lakehouse er: '{expected_db}'. Sæt default lakehouse til '{expected_db}' i Explorer i venstre side."
        )

    print(f"Det korrekte lakehouse: '{db}' er valgt. scriptet fortsættes")

# Run the check right away; it raises if the default lakehouse is not the expected one.
lakehouse_gatekeep()

In [None]:
#### This notebook is used to test different algorithms for the win/lose model that is running in the active pipeline ####
#### It is intended to compare the algorithms in order to validate that choosing XGB over the other algorithms was the correct choice. ####

# The following algorithms are tested in this notebook:
    # - XGB
    # - LightGBM
    # - CatBoost

# I've chosen decision-tree-based algorithms, more specifically ones that are very good at handling the structure that is present in this data:
# a mix of categorical variables (catvars) and numerical variables (numvars).

# Some better-known tree-based algorithms, such as AdaBoost and RandomForest, have been excluded from the get-go because of their poor handling of categorical variables and their need for one-hot encoding.
# One-hot encoding would simply explode the dataset due to the sheer number of levels the categories have.

In [None]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV

import logging
logging.getLogger().setLevel(logging.CRITICAL)




In [None]:
#### Data retrieval and preparation ####

# Read the Delta table through Spark and collect it into a pandas DataFrame.
solutions_ml = spark.read.format("delta").load("cantshow")
solutions_ml = solutions_ml.toPandas()

# The lakehouse import/export automatically changed categorical variables to
# `object` columns, so every object column is converted back to `category`.
for col in solutions_ml.select_dtypes(include = 'object').columns:
    solutions_ml[col] = solutions_ml[col].astype('category')

# These three are cast explicitly (presumably they are not object-typed and
# are therefore missed by the loop above — TODO confirm).
solutions_ml['Is_EOQ'] = solutions_ml['Is_EOQ'].astype('category')
solutions_ml['Quarter'] = solutions_ml['Quarter'].astype('category')
solutions_ml['YearName'] = solutions_ml['YearName'].astype('category')

# Feature matrix: everything except the target, identifiers and the columns
# listed below.
X = solutions_ml.drop(columns=['OpportunityState',
'OpportunityStatusGroup',
'OpportunityNumber',
'CloseDate',
'IndustryName',
'OpportunitySalesTeam',
'AnnualContractValueDKK',
'MarginValueDKK',
'YearName'
# Deliberately NOT dropped (kept as features): 'Title', 'Site', 'Department'.
# NOTE(review): 'OpportunityState_num' is not dropped either; if that column
# exists it is presumably a numeric copy of the target and would leak the
# label into X — confirm.
])

# Opportunity identifiers, kept separately from the features.
opp_id = solutions_ml[['OpportunityNumber']]

# Encode the target as integers. LabelEncoder expects a 1-D array, so the
# column is passed as a Series; a one-column DataFrame also works but triggers
# a DataConversionWarning. The encoder numbers the classes in sorted order;
# the intended mapping is won = 1 and lost = 0 (check `le.classes_` to confirm).
le = LabelEncoder()

y = le.fit_transform(solutions_ml['OpportunityState'])

# Give y the same index as X so label-based operations on the two stay aligned.
y = pd.Series(y, index=X.index)

# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,y,
    test_size = 0.3,
    stratify = y,
    random_state = 123
)




In [None]:
# Inspect the dtype of every feature column in X.
X.dtypes

# Showcasing the performance of the algorithms prior to hyperparameter tuning

In [None]:
##################
#### CatBoost ####
##################

# Names of the categorical feature columns; CatBoost gets them via `cat_features`.
cat_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# Untuned baseline model.
# NOTE(review): learning_rate = 1 is unusually high; presumably deliberate for
# a quick baseline — confirm.
sales_cb = CatBoostClassifier(
    iterations = 100,
    learning_rate = 1,
    verbose = True
)

# Fit the model. The test split is passed as eval_set only so that its metric
# is logged per iteration. CatBoost switches `use_best_model` on by default
# when an eval_set is given, which would choose the number of trees on the
# test data and bias the test scores reported afterwards, so it is disabled.
sales_cb.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_test, y_test),
    use_best_model=False
)



In [None]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import pandas as pd

#### Validating the baseline CatBoost model ####

# Probability of the class encoded as 1 (second column of predict_proba)
y_train_proba = sales_cb.predict_proba(X_train)[:, 1]
y_test_proba  = sales_cb.predict_proba(X_test)[:, 1]

# Convert probabilities to hard 0/1 predictions at the decision threshold
threshold = 0.5
y_train_pred = (y_train_proba >= threshold).astype(int)
y_test_pred  = (y_test_proba  >= threshold).astype(int)

# Classification reports for the training and the test split
print("Træningsdata:\n", classification_report(y_train, y_train_pred))
print("Testdata:\n", classification_report(y_test, y_test_pred))

# Confusion matrix for the test split
cm = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')

# ------------------------------
# Feature Importance (CatBoost)
# ------------------------------

# CatBoost exposes its built-in feature importance via get_feature_importance()
model_featureimportance = sales_cb.get_feature_importance()

# One row per feature, most important first
grid_fe = pd.DataFrame({
    "Importance": model_featureimportance,
    "Feature": X_train.columns
}).sort_values("Importance", ascending=False)

print(grid_fe)

# ------------------------------
# Permutation Feature Importance
# ------------------------------

# Computed on the test split with 10 shuffles per feature. No `scoring` is
# passed, so scikit-learn uses the estimator's own default scorer.
perm_imp_grid = permutation_importance(
    sales_cb,
    X_test,
    y_test,
    n_repeats=10,
    random_state=123
)

# Mean and standard deviation over the repeats, most important first
perm_imp_grid_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': perm_imp_grid.importances_mean,
    'Std': perm_imp_grid.importances_std
}).sort_values(by='Importance', ascending=False)

# Horizontal bars with the standard deviation as error bars; the y-axis is
# inverted so that the most important feature ends up at the top.
plt.figure(figsize=(8, 5))
plt.barh(
    perm_imp_grid_df['Feature'],
    perm_imp_grid_df['Importance'],
    xerr=perm_imp_grid_df['Std']
)
plt.gca().invert_yaxis()
plt.title('Permutation Feature Importance')
plt.xlabel('Mean Importance')
plt.show()

In [None]:
##################
#### LightGBM ####
##################

# Untuned baseline: LGBMClassifier constructed without any arguments.
sales_lgb = lgb.LGBMClassifier()

# `cat_features` is the column-name list built in the CatBoost baseline cell,
# so that cell has to run first. The test split is passed as eval_set with AUC
# as the metric; no early stopping is requested in this call.
sales_lgb.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    categorical_feature= cat_features
)





In [None]:
#### LightGBM ####


#### Validating the baseline LightGBM model ####

# Probability of the class encoded as 1 (second column of predict_proba)
y_train_proba = sales_lgb.predict_proba(X_train)[:, 1]
y_test_proba  = sales_lgb.predict_proba(X_test)[:, 1]

# Convert probabilities to hard 0/1 predictions at the decision threshold
threshold = 0.5
y_train_pred = (y_train_proba >= threshold).astype(int)
y_test_pred  = (y_test_proba  >= threshold).astype(int)

# Classification reports for the training and the test split
print("Træningsdata:\n", classification_report(y_train, y_train_pred))
print("Testdata:\n", classification_report(y_test, y_test_pred))

# Confusion matrix for the test split
cm = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')

# ------------------------------
# Feature Importance (LightGBM)
# ------------------------------

# LightGBM exposes its built-in feature importance as the feature_importances_ attribute
model_featureimportance = sales_lgb.feature_importances_

# One row per feature, most important first
grid_fe = pd.DataFrame({
    "Importance": model_featureimportance,
    "Feature": X_train.columns
}).sort_values("Importance", ascending=False)

print(grid_fe)

# ------------------------------
# Permutation Feature Importance
# ------------------------------

# Computed on the test split with 10 shuffles per feature. No `scoring` is
# passed, so scikit-learn uses the estimator's own default scorer.
perm_imp_grid = permutation_importance(
    sales_lgb,
    X_test,
    y_test,
    n_repeats=10,
    random_state=123
)

# Mean and standard deviation over the repeats, most important first
perm_imp_grid_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': perm_imp_grid.importances_mean,
    'Std': perm_imp_grid.importances_std
}).sort_values(by='Importance', ascending=False)

# Horizontal bars with the standard deviation as error bars; the y-axis is
# inverted so that the most important feature ends up at the top.
plt.figure(figsize=(8, 5))
plt.barh(
    perm_imp_grid_df['Feature'],
    perm_imp_grid_df['Importance'],
    xerr=perm_imp_grid_df['Std']
)
plt.gca().invert_yaxis()
plt.title('Permutation Feature Importance')
plt.xlabel('Mean Importance')
plt.show()


In [None]:
#################
#### XGBoost ####
#################

from xgboost import XGBClassifier

# Untuned baseline model. XGBoost calls the number of boosting rounds
# `n_estimators`; `iterations` is the CatBoost spelling, which XGBoost does not
# recognise, so the requested 10 rounds were silently replaced by the default.
# NOTE(review): learning_rate = 1 is unusually high; presumably deliberate for
# a quick baseline — confirm.
sales_xgb = XGBClassifier(
    n_estimators = 10,
    learning_rate = 1,
    enable_categorical = True
)

sales_xgb.fit(X_train, y_train)




In [None]:
# XGBoost

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import pandas as pd

#### Validating the baseline XGBoost model ####

# Probability of the class encoded as 1 (second column of predict_proba)
y_train_proba = sales_xgb.predict_proba(X_train)[:, 1]
y_test_proba  = sales_xgb.predict_proba(X_test)[:, 1]

# Convert probabilities to hard 0/1 predictions at the decision threshold
threshold = 0.5
y_train_pred = (y_train_proba >= threshold).astype(int)
y_test_pred  = (y_test_proba  >= threshold).astype(int)

# Classification reports for the training and the test split
print("Træningsdata:\n", classification_report(y_train, y_train_pred))
print("Testdata:\n", classification_report(y_test, y_test_pred))

# Confusion matrix for the test split
cm = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')

# ------------------------------
# Feature Importance (XGBoost)
# ------------------------------

# XGBClassifier has no get_feature_importance() method (that is the CatBoost
# API); its built-in importance is the feature_importances_ attribute.
model_featureimportance = sales_xgb.feature_importances_

# One row per feature, most important first
grid_fe = pd.DataFrame({
    "Importance": model_featureimportance,
    "Feature": X_train.columns
}).sort_values("Importance", ascending=False)

print(grid_fe)

# ------------------------------
# Permutation Feature Importance
# ------------------------------

# Computed on the test split with 10 shuffles per feature. No `scoring` is
# passed, so scikit-learn uses the estimator's own default scorer.
perm_imp_grid = permutation_importance(
    sales_xgb,
    X_test,
    y_test,
    n_repeats=10,
    random_state=123
)

# Mean and standard deviation over the repeats, most important first
perm_imp_grid_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': perm_imp_grid.importances_mean,
    'Std': perm_imp_grid.importances_std
}).sort_values(by='Importance', ascending=False)

# Horizontal bars with the standard deviation as error bars; the y-axis is
# inverted so that the most important feature ends up at the top.
plt.figure(figsize=(8, 5))
plt.barh(
    perm_imp_grid_df['Feature'],
    perm_imp_grid_df['Importance'],
    xerr=perm_imp_grid_df['Std']
)
plt.gca().invert_yaxis()
plt.title('Permutation Feature Importance')
plt.xlabel('Mean Importance')
plt.show()

# Testing the following algorithm/model: CatBoost

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Names (not indices) of the categorical feature columns
cat_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# Base estimator; the hyperparameters listed in param_distributions are set
# per candidate by the search.
# NOTE(review): the final CatBoost model further down is built with
# auto_class_weights='Balanced', but this search runs without class weights —
# confirm whether the search should use the same setting.
cb_model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    verbose=False,
    random_seed=1234
)

# Candidate values sampled by the randomized search
param_distributions = {
    "iterations": [300, 500, 700, 900],
    "learning_rate": np.linspace(0.01, 0.2, 10),
    "depth": [3, 4, 5, 6],
    "l2_leaf_reg": [1, 3, 5, 7, 10],
    "rsm": [0.5, 0.7, 0.85, 1.0],
    "bagging_temperature": [0, 0.5, 1, 2],
    "random_strength": [1, 2, 3, 5]
}

# 30 random candidates, 5-fold CV, ranked by balanced accuracy.
# random_state pins which candidates are drawn so the search is reproducible,
# matching the LightGBM and XGBoost searches in this notebook.
random_search = RandomizedSearchCV(
    estimator=cb_model,
    param_distributions=param_distributions,
    n_iter=30,
    scoring="balanced_accuracy",
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=1234
)

# cat_features is forwarded to CatBoostClassifier.fit for every candidate/fold
random_search.fit(X_train, y_train, cat_features=cat_features)

print("Best params:", random_search.best_params_)
print("Best balanced CV accuracy:", random_search.best_score_)


In [None]:
#### CatBoost ####

# Names of the categorical feature columns
cat_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# CatBoost model with hard-coded hyperparameters (presumably copied from a
# previous run of the randomized search — TODO confirm) plus balanced class weights.
sales_cb = CatBoostClassifier(
    iterations=700,
    learning_rate=0.157,
    depth=4,
    l2_leaf_reg = 1,
    random_seed=1234,
    loss_function="Logloss",
    eval_metric="Accuracy",
    verbose=False,
    rsm = 0.5,
    bagging_temperature = 0.5,
    random_strength = 3,
    auto_class_weights='Balanced'
)

# The test split is passed as eval_set for monitoring only. CatBoost switches
# `use_best_model` on by default when an eval_set is given, which would choose
# the number of trees on the test data and bias the test scores reported
# afterwards, so it is disabled explicitly.
sales_cb.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_test, y_test),
    use_best_model=False
)

In [None]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import pandas as pd

#### Validating the tuned CatBoost model ####

# Probability of the class encoded as 1 (second column of predict_proba)
y_train_proba = sales_cb.predict_proba(X_train)[:, 1]
y_test_proba  = sales_cb.predict_proba(X_test)[:, 1]

# Convert probabilities to hard 0/1 predictions at the decision threshold
threshold = 0.5
y_train_pred = (y_train_proba >= threshold).astype(int)
y_test_pred  = (y_test_proba  >= threshold).astype(int)

# Classification reports for the training and the test split
print("Træningsdata:\n", classification_report(y_train, y_train_pred))
print("Testdata:\n", classification_report(y_test, y_test_pred))

# Confusion matrix for the test split
cm = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')

# ------------------------------
# Feature Importance (CatBoost)
# ------------------------------

# CatBoost exposes its built-in feature importance via get_feature_importance()
model_featureimportance = sales_cb.get_feature_importance()

# One row per feature, most important first
grid_fe = pd.DataFrame({
    "Importance": model_featureimportance,
    "Feature": X_train.columns
}).sort_values("Importance", ascending=False)

print(grid_fe)

# ------------------------------
# Permutation Feature Importance
# ------------------------------

# Computed on the test split with 10 shuffles per feature. No `scoring` is
# passed, so scikit-learn uses the estimator's own default scorer.
perm_imp_grid = permutation_importance(
    sales_cb,
    X_test,
    y_test,
    n_repeats=10,
    random_state=123
)

# Mean and standard deviation over the repeats, most important first
perm_imp_grid_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': perm_imp_grid.importances_mean,
    'Std': perm_imp_grid.importances_std
}).sort_values(by='Importance', ascending=False)

# Horizontal bars with the standard deviation as error bars; the y-axis is
# inverted so that the most important feature ends up at the top.
plt.figure(figsize=(8, 5))
plt.barh(
    perm_imp_grid_df['Feature'],
    perm_imp_grid_df['Importance'],
    xerr=perm_imp_grid_df['Std']
)
plt.gca().invert_yaxis()
plt.title('Permutation Feature Importance')
plt.xlabel('Mean Importance')
plt.show()

In [None]:
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score

#### Decision-threshold tuning for the CatBoost model (10-fold stratified CV) ####

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

# Candidate thresholds 0.10, 0.15, ..., 0.90; one list of fold scores per threshold
thresholds = np.arange(0.1, 0.91, 0.05)
threshold_scores = {t: [] for t in thresholds}

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    # Fold-local names: the notebook-wide X_train / y_train hold the train/test
    # split and must not be overwritten, because later cells train on them and
    # the folds drawn from X contain rows of the test split.
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # Fit an unfitted copy with the same hyperparameters so that `sales_cb`
    # itself stays the model that was trained on the training split.
    fold_model = clone(sales_cb)
    fold_model.fit(X_train_fold, y_train_fold, cat_features=cat_features)

    y_val_proba = fold_model.predict_proba(X_val_fold)[:, 1]

    # Evaluate each threshold on this fold's validation part
    for t in thresholds:
        y_val_pred = (y_val_proba >= t).astype(int)
        score = balanced_accuracy_score(y_val_fold, y_val_pred)
        threshold_scores[t].append(score)

    print(f"Fold {fold} complete.")

# Mean balanced accuracy per threshold across the folds, and the best one
mean_scores = {t: np.mean(scores) for t, scores in threshold_scores.items()}
best_threshold = max(mean_scores, key=mean_scores.get)
best_score = mean_scores[best_threshold]

print("\n Threshold Tuning Results:")
for t in sorted(mean_scores):
    print(f"Threshold {t:.2f}: Mean Balanced Accuracy = {mean_scores[t]:.4f}")

print(f"\n Best Threshold: {best_threshold:.2f} with Mean Balanced Accuracy = {best_score:.4f}")

In [None]:
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score

#### 10-fold stratified CV of the CatBoost configuration at a fixed threshold ####

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
threshold = 0.5

val_scores = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train_fold = X.iloc[train_idx].copy()
    X_val_fold   = X.iloc[val_idx].copy()
    y_train_fold = y.iloc[train_idx].copy()
    y_val_fold   = y.iloc[val_idx].copy()

    # Fit an unfitted copy with the same hyperparameters; refitting `sales_cb`
    # itself would leave the notebook with a model trained on the last fold.
    fold_model = clone(sales_cb)
    fold_model.fit(X_train_fold, y_train_fold, cat_features=cat_features)

    y_train_proba = fold_model.predict_proba(X_train_fold)[:, 1]
    y_val_proba   = fold_model.predict_proba(X_val_fold)[:, 1]

    y_train_pred = (y_train_proba >= threshold).astype(int)
    y_val_pred   = (y_val_proba >= threshold).astype(int)

    # The gap between training and validation score indicates overfitting
    train_score = balanced_accuracy_score(y_train_fold, y_train_pred)
    val_score   = balanced_accuracy_score(y_val_fold, y_val_pred)

    val_scores.append(val_score)

    print(f"Fold {fold}:")
    print(f"    Training Balanced Accuracy:   {train_score:.4f}")
    print(f"    Validation Balanced Accuracy: {val_score:.4f}\n")

# Show mean CV score
print(f"Mean Validation Balanced Accuracy: {np.mean(val_scores):.4f}")

# Testing the following algorithm/model: LightGBM

In [None]:
#### LightGBM RandomizedSearch to check different hyperparameter combinations ####

# LightGBM reads categorical features from the pandas `category` dtype.
# DataFrame.astype returns a new frame, which avoids assigning column by column
# into X_train (a slice of X) and keeps the cell idempotent.
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
X_train = X_train.astype({c: "category" for c in cat_cols})


lgb_model = lgb.LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    random_state=1234,
    n_jobs=-1,
    class_weight = "balanced",
    # LightGBM only applies `subsample` when subsample_freq > 0 (default 0 =
    # bagging disabled). Resample at every iteration so that the `subsample`
    # values searched below actually have an effect. Keep subsample_freq=1 on
    # any model that reuses the parameters found by this search.
    subsample_freq=1
)

param_distributions = {
    "n_estimators": [300, 500, 700, 900],           # counterpart of CatBoost 'iterations'
    "learning_rate": np.linspace(0.01, 0.2, 10),
    "max_depth": [3, 4, 5, 6],                      # counterpart of CatBoost 'depth'
    "reg_alpha": [0, 1, 3, 5, 7],                   # L1 regularization
    "reg_lambda": [0, 1, 3, 5, 7],                  # L2 regularization, counterpart of CatBoost 'l2_leaf_reg'
    "subsample": [0.7, 0.85, 1.0],                  # row subsampling (needs subsample_freq > 0)
    "colsample_bytree": [0.5, 0.7, 0.85, 1.0],      # column subsampling, counterpart of CatBoost 'rsm'
    "min_child_samples": [10, 20, 50],
    # A tree of depth 6 has at most 2**6 = 64 leaves, so 127 cannot be reached
    # with the max_depth values above.
    "num_leaves": [15, 31, 63, 127]
}

random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_distributions,
    n_iter=30,                    # Number of random combinations
    scoring="balanced_accuracy",
    cv=5,
    verbose=2,
    n_jobs=1,                     # the estimator itself already uses n_jobs=-1
    random_state=1234
)


random_search.fit(X_train, y_train)

print("Best params:", random_search.best_params_)
print("Best CV score:", random_search.best_score_)


In [None]:
import lightgbm as lgb

# Make sure the categorical features carry the pandas `category` dtype in both
# splits before they are handed to LightGBM.
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
for c in cat_cols:
    X_train[c] = X_train[c].astype('category')
    X_test[c] = X_test[c].astype('category')

# LightGBM model with hard-coded hyperparameters (presumably copied from a
# previous run of the randomized search — TODO confirm).
sales_lgb = lgb.LGBMClassifier(
    n_estimators=500,        # number of boosting rounds
    learning_rate = 0.1788,
    max_depth=6,
    reg_lambda = 7,          # L2 regularization (counterpart of CatBoost l2_leaf_reg)
    reg_alpha =7,        # L1 regularization
    # NOTE(review): subsample_freq is not set here; with LightGBM's default of
    # 0, bagging is disabled and `subsample` is not applied — confirm intent.
    subsample=0.7,           # row subsampling fraction
    colsample_bytree=0.85,    # column subsampling (counterpart of CatBoost rsm)
    min_child_samples=50,
    num_leaves = 63,
    random_state=1234,
    objective='binary',
    boosting_type='gbdt',
    n_jobs=-1,
    class_weight = "balanced"
)

# The test split is passed as eval_set with AUC as the metric; no early
# stopping is requested in this call.
sales_lgb.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    categorical_feature=cat_cols
)



In [None]:
#### LightGBM ####

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import pandas as pd

#### Validating the tuned LGBM model ####

# Probability of the class encoded as 1 (second column of predict_proba)
y_train_proba = sales_lgb.predict_proba(X_train)[:, 1]
y_test_proba  = sales_lgb.predict_proba(X_test)[:, 1]

# Convert probabilities to hard 0/1 predictions at the decision threshold
threshold = 0.5
y_train_pred = (y_train_proba >= threshold).astype(int)
y_test_pred  = (y_test_proba  >= threshold).astype(int)

# Classification reports for the training and the test split
print("Træningsdata:\n", classification_report(y_train, y_train_pred))
print("Testdata:\n", classification_report(y_test, y_test_pred))

# Confusion matrix for the test split
cm = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')

# ------------------------------
# Feature Importance (LightGBM)
# ------------------------------

# LightGBM exposes its built-in feature importance as the feature_importances_ attribute
model_featureimportance = sales_lgb.feature_importances_

# One row per feature, most important first
grid_fe = pd.DataFrame({
    "Importance": model_featureimportance,
    "Feature": X_train.columns
}).sort_values("Importance", ascending=False)

print(grid_fe)

# ------------------------------
# Permutation Feature Importance
# ------------------------------

# Computed on the test split with 10 shuffles per feature. No `scoring` is
# passed, so scikit-learn uses the estimator's own default scorer.
perm_imp_grid = permutation_importance(
    sales_lgb,
    X_test,
    y_test,
    n_repeats=10,
    random_state=123
)

# Mean and standard deviation over the repeats, most important first
perm_imp_grid_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': perm_imp_grid.importances_mean,
    'Std': perm_imp_grid.importances_std
}).sort_values(by='Importance', ascending=False)

# Horizontal bars with the standard deviation as error bars; the y-axis is
# inverted so that the most important feature ends up at the top.
plt.figure(figsize=(8, 5))
plt.barh(
    perm_imp_grid_df['Feature'],
    perm_imp_grid_df['Importance'],
    xerr=perm_imp_grid_df['Std']
)
plt.gca().invert_yaxis()
plt.title('Permutation Feature Importance')
plt.xlabel('Mean Importance')
plt.show()


In [None]:
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score

#### 10-fold stratified CV of the LightGBM configuration at a fixed threshold ####

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
threshold = 0.5

val_scores = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train_fold = X.iloc[train_idx].copy()
    X_val_fold   = X.iloc[val_idx].copy()
    y_train_fold = y.iloc[train_idx].copy()
    y_val_fold   = y.iloc[val_idx].copy()

    # Fit an unfitted copy with the same hyperparameters; refitting `sales_lgb`
    # itself would leave the notebook with a model trained on the last fold.
    fold_model = clone(sales_lgb)
    fold_model.fit(X_train_fold, y_train_fold, categorical_feature=cat_cols)

    y_train_proba = fold_model.predict_proba(X_train_fold)[:, 1]
    y_val_proba   = fold_model.predict_proba(X_val_fold)[:, 1]

    y_train_pred = (y_train_proba >= threshold).astype(int)
    y_val_pred   = (y_val_proba >= threshold).astype(int)

    # The gap between training and validation score indicates overfitting
    train_score = balanced_accuracy_score(y_train_fold, y_train_pred)
    val_score   = balanced_accuracy_score(y_val_fold, y_val_pred)

    val_scores.append(val_score)

    print(f"Fold {fold}:")
    print(f"    Training Balanced Accuracy:   {train_score:.4f}")
    print(f"    Validation Balanced Accuracy: {val_score:.4f}\n")

# Show mean CV score
print(f"Mean Validation Balanced Accuracy: {np.mean(val_scores):.4f}")

# Testing the following algorithm/model: XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_sample_weight
import numpy as np

# Base estimator; the hyperparameters in the search space are set per candidate.
sales_xg = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=1234,
    tree_method="hist",
    enable_categorical=True,
    verbosity=0,
)

# Candidate values sampled by the randomized search.
param_distributions = dict(
    n_estimators=[200, 400, 600, 800],
    learning_rate=np.linspace(0.01, 0.2, 10),
    max_depth=[3, 4, 5, 6],
    min_child_weight=[1, 2, 3, 5],
    subsample=[0.5, 0.7, 0.9, 1.0],
    colsample_bytree=[0.5, 0.7, 0.85, 1.0],
    colsample_bylevel=[0.5, 0.7, 1.0],
    colsample_bynode=[0.5, 0.7, 0.85, 1.0],
    gamma=[0, 0.1, 0.3, 0.5, 0.7, 1.0],
    reg_alpha=[0, 0.1, 0.5, 1.0],
    reg_lambda=[1, 2, 5, 10],
)

# 30 random candidates, 5-fold CV, ranked by balanced accuracy.
random_search = RandomizedSearchCV(
    estimator=sales_xg,
    param_distributions=param_distributions,
    n_iter=30,
    scoring="balanced_accuracy",
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=1234,
)

# Balanced per-row weights derived from the class frequencies in y_train;
# they are handed to the search as a fit parameter.
weights = compute_sample_weight(class_weight="balanced", y=y_train)
random_search.fit(X_train, y_train, sample_weight=weights)

print("Best params:", random_search.best_params_)
print("Best CV balanced accuracy:", random_search.best_score_)

In [None]:
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight

# Recompute the balanced sample weights from the current y_train instead of
# relying on a `weights` variable left behind by another cell: sample_weight
# must have exactly one entry per training row.
weights = compute_sample_weight(class_weight="balanced", y=y_train)

# XGBoost model with hard-coded hyperparameters (presumably copied from a
# previous run of the randomized search — TODO confirm).
# `use_label_encoder` is no longer passed: it is deprecated in XGBoost and has
# no effect on the integer 0/1 labels used here.
sales_xgb = xgb.XGBClassifier(
    n_estimators=400,
    learning_rate=0.1788,
    max_depth=5,
    min_child_weight=2,
    subsample=1,
    colsample_bytree=0.5,
    colsample_bylevel = 1,
    colsample_bynode = 0.5,
    gamma=0.3,
    reg_lambda=10,
    reg_alpha=1,

    objective="binary:logistic",
    eval_metric="logloss",
    random_state=1234,
    n_jobs=-1,
    enable_categorical = True
)

# The test split is passed as eval_set for monitoring only; no early stopping
# is requested, so it does not influence the fitted model.
sales_xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=False,
    sample_weight = weights
)


In [None]:
#### XGBoost ####

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import pandas as pd

#### Validating the tuned XGB model ####

# Probability of the class encoded as 1 (second column of predict_proba)
y_train_proba = sales_xgb.predict_proba(X_train)[:, 1]
y_test_proba  = sales_xgb.predict_proba(X_test)[:, 1]

# Convert probabilities to hard 0/1 predictions at the decision threshold
threshold = 0.5
y_train_pred = (y_train_proba >= threshold).astype(int)
y_test_pred  = (y_test_proba  >= threshold).astype(int)

# Classification reports for the training and the test split
print("Træningsdata:\n", classification_report(y_train, y_train_pred))
print("Testdata:\n", classification_report(y_test, y_test_pred))

# Confusion matrix for the test split
cm = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')

# ------------------------------
# Feature Importance (XGBoost)
# ------------------------------

# XGBoost exposes its built-in feature importance as the feature_importances_ attribute
model_featureimportance = sales_xgb.feature_importances_

# One row per feature, most important first
grid_fe = pd.DataFrame({
    "Importance": model_featureimportance,
    "Feature": X_train.columns
}).sort_values("Importance", ascending=False)

print(grid_fe)

# ------------------------------
# Permutation Feature Importance
# ------------------------------

# Computed on the test split with 10 shuffles per feature. No `scoring` is
# passed, so scikit-learn uses the estimator's own default scorer.
perm_imp_grid = permutation_importance(
    sales_xgb,
    X_test,
    y_test,
    n_repeats=10,
    random_state=123
)

# Mean and standard deviation over the repeats, most important first
perm_imp_grid_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': perm_imp_grid.importances_mean,
    'Std': perm_imp_grid.importances_std
}).sort_values(by='Importance', ascending=False)

# Horizontal bars with the standard deviation as error bars; the y-axis is
# inverted so that the most important feature ends up at the top.
plt.figure(figsize=(8, 5))
plt.barh(
    perm_imp_grid_df['Feature'],
    perm_imp_grid_df['Importance'],
    xerr=perm_imp_grid_df['Std']
)
plt.gca().invert_yaxis()
plt.title('Permutation Feature Importance')
plt.xlabel('Mean Importance')
plt.show()


In [None]:
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.utils.class_weight import compute_sample_weight

#### Decision-threshold tuning for the XGBoost model (10-fold stratified CV) ####

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

# Candidate thresholds 0.10, 0.15, ..., 0.90; one list of fold scores per threshold
thresholds = np.arange(0.1, 0.91, 0.05)
threshold_scores = {t: [] for t in thresholds}

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    # Fold-local names: the notebook-wide X_train / y_train hold the train/test
    # split and must not be overwritten, because other cells train on them and
    # the folds drawn from X contain rows of the test split.
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # The XGBoost model is trained with balanced sample weights everywhere else
    # in this notebook, so the threshold is tuned for the same weighted setup.
    weights_fold = compute_sample_weight(class_weight="balanced", y=y_train_fold)

    # Fit an unfitted copy with the same hyperparameters so that `sales_xgb`
    # itself stays the model that was trained on the training split.
    fold_model = clone(sales_xgb)
    fold_model.fit(X_train_fold, y_train_fold, sample_weight=weights_fold)

    y_val_proba = fold_model.predict_proba(X_val_fold)[:, 1]

    # Evaluate each threshold on this fold's validation part
    for t in thresholds:
        y_val_pred = (y_val_proba >= t).astype(int)
        score = balanced_accuracy_score(y_val_fold, y_val_pred)
        threshold_scores[t].append(score)

    print(f"Fold {fold} complete.")

# Mean balanced accuracy per threshold across the folds, and the best one
mean_scores = {t: np.mean(scores) for t, scores in threshold_scores.items()}
best_threshold = max(mean_scores, key=mean_scores.get)
best_score = mean_scores[best_threshold]

print("\n Threshold Tuning Results:")
for t in sorted(mean_scores):
    print(f"Threshold {t:.2f}: Mean Balanced Accuracy = {mean_scores[t]:.4f}")

print(f"\n Best Threshold: {best_threshold:.2f} with Mean Balanced Accuracy = {best_score:.4f}")

In [None]:
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.utils.class_weight import compute_sample_weight

#### 10-fold stratified CV of the XGBoost configuration at a fixed threshold ####

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
threshold = 0.5

val_scores = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train_fold = X.iloc[train_idx].copy()
    X_val_fold   = X.iloc[val_idx].copy()
    y_train_fold = y.iloc[train_idx].copy()
    y_val_fold   = y.iloc[val_idx].copy()

    # Balanced weights are recomputed per fold from that fold's training labels
    weights_fold = compute_sample_weight(class_weight="balanced", y=y_train_fold)

    # Fit an unfitted copy with the same hyperparameters. Refitting `sales_xgb`
    # itself would leave the notebook with a model trained on the last fold,
    # which contains most rows of the test split, and the importance plots
    # drawn from `sales_xgb` afterwards would describe that model.
    fold_model = clone(sales_xgb)
    fold_model.fit(X_train_fold, y_train_fold, sample_weight=weights_fold)

    y_train_proba = fold_model.predict_proba(X_train_fold)[:, 1]
    y_val_proba   = fold_model.predict_proba(X_val_fold)[:, 1]

    y_train_pred = (y_train_proba >= threshold).astype(int)
    y_val_pred   = (y_val_proba >= threshold).astype(int)

    # The gap between training and validation score indicates overfitting
    train_score = balanced_accuracy_score(y_train_fold, y_train_pred)
    val_score   = balanced_accuracy_score(y_val_fold, y_val_pred)

    val_scores.append(val_score)

    print(f"Fold {fold}:")
    print(f"    Training Balanced Accuracy:   {train_score:.4f}")
    print(f"    Validation Balanced Accuracy: {val_score:.4f}\n")

# Show mean CV score
print(f"Mean Validation Balanced Accuracy: {np.mean(val_scores):.4f}")

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

def plot_xgb_importance(model, importance_type="gain", top_n=20):
    """Draw a horizontal bar chart of a model's highest-ranked feature importances.

    Args:
        model: Fitted estimator exposing ``get_booster().get_score(...)``.
        importance_type: Importance measure forwarded to ``get_score``.
        top_n: Number of top-ranked features to draw.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    scores = model.get_booster().get_score(importance_type=importance_type)

    # One row per feature returned by the booster, highest score first.
    ranking = pd.DataFrame(
        {"Feature": list(scores.keys()), "Importance": list(scores.values())}
    )
    ranking = ranking.sort_values("Importance", ascending=False)
    top_rows = ranking.head(top_n)

    plt.figure(figsize=(10, 8))
    sns.barplot(
        data=top_rows,
        x="Importance",
        y="Feature",
        palette=sns.color_palette("tab20"),
    )

    heading = f"Variablernes vigtighed til modellen ({importance_type})"
    plt.title(heading, fontsize=14, weight="bold")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()

# Plot the 20 highest-ranked features (default importance type: gain) of the model held in `sales_xgb`.
plot_xgb_importance(sales_xgb, top_n=20)


In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance

def plot_xgb_with_permutation(model, X_test, y_test, importance_type="gain", top_n=20, title=None):
    """Plot built-in and permutation feature importance side by side.

    The left panel shows the ``top_n`` features ranked by the booster's own
    importance score; the right panel shows the permutation importance of the
    same features in the same order (the panels share the y-axis).

    Args:
        model: Fitted estimator exposing ``get_booster().get_score(...)`` and
            usable with ``sklearn.inspection.permutation_importance``.
        X_test: Feature DataFrame the permutation importance is computed on.
        y_test: Labels belonging to ``X_test``.
        importance_type: Importance measure forwarded to ``get_score``; it is
            also shown in the left panel's title and axis label.
        top_n: Number of top-ranked features to draw.
        title: Figure title. Defaults to the headline used for the original
            analysis.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    if title is None:
        title = "DaysBetweenCreateClose er den klart mest betydelige variabel i modellen"

    imp = model.get_booster().get_score(importance_type=importance_type)

    # Top-N features by the booster's importance score, highest first
    df_gain = (
        pd.DataFrame({"Feature": list(imp.keys()), "Gain": list(imp.values())})
        .sort_values("Gain", ascending=False)
        .head(top_n)
    )
    top_features = df_gain["Feature"].tolist()

    perm = permutation_importance(
        model, X_test, y_test,
        n_repeats=10,
        random_state=123,
        n_jobs=-1
    )

    # Keep only the top-N features and put them in the same order as the left
    # panel; .loc with the ordered list does both the filtering and the sorting.
    df_perm = (
        pd.DataFrame({"Feature": X_test.columns, "Permutation": perm.importances_mean})
        .set_index("Feature")
        .loc[top_features]
        .reset_index()
    )

    fig, axes = plt.subplots(1, 2, figsize=(18, 8), sharey=True)
    fig.suptitle(title, fontsize=18, weight="bold", y=1.02)

    # Built-in importance plot; labels follow the requested importance type
    sns.barplot(
        data=df_gain,
        x="Gain", y="Feature",
        palette=sns.color_palette("tab20"),
        ax=axes[0]
    )
    axes[0].set_title(f"XGBoost variabel vigtighed ({importance_type})", fontsize=14, weight="bold")
    axes[0].set_xlabel(f"Bidrag til modelpræcision ({importance_type})")
    axes[0].set_ylabel("Variabel")

    # Permutation plot
    sns.barplot(
        data=df_perm,
        x="Permutation", y="Feature",
        palette=sns.color_palette("tab20"),
        ax=axes[1]
    )
    axes[1].set_title("Permutation feature importance (XGBoost model)", fontsize=14, weight="bold")
    axes[1].set_xlabel("gennemsnitlig reduktion i modelperformance ved permutation")
    axes[1].set_ylabel("")

    plt.tight_layout()
    plt.show()


# Compare built-in (gain) and permutation importance for the model held in
# `sales_xgb`; the permutation step is computed on the test split.
plot_xgb_with_permutation(sales_xgb, X_test, y_test, top_n=20)
