### XGBoost PIPELINE

In [18]:
import joblib
from datetime import datetime
import numpy as np
import os
from sklearn.model_selection import train_test_split

import sys
# NOTE(review): absolute, machine-specific path. The `ml_pipe` imports below
# only resolve on a machine that has the repository at exactly this location;
# consider installing the package or deriving the path from a configurable
# project root instead.
sys.path.insert(0, '/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/')

from ml_pipe.data.database.mongodb import MongoDb
from ml_pipe.data.featureEngineering.featureEngineering import featureEngineering

from ml_pipe.models.xgboost.model import XGBoostModel

In [19]:
def run_xgboost_pipeline(test_size=0.3, random_state=42):
    """Train, evaluate and persist an XGBoost model on the career data.

    The pipeline loads every document of the ``CareerData`` MongoDB
    collection, converts them to features and labels, flattens each sample
    into a single row, splits the rows into a training and a validation set,
    trains an ``XGBoostModel``, evaluates it on the validation set and dumps
    the fitted estimator (``model.model``) with joblib to
    ``saved_models/xgboost_model_<timestamp>.joblib``.

    Args:
        test_size: Share (or absolute number) of samples held out for
            validation; forwarded unchanged to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` so that the
            split is reproducible between runs.

    Returns:
        None. Progress messages and the path of the saved model are printed.
        If feature extraction yields no samples, a message is printed and
        nothing is trained or saved.
    """
    print("Starte XGBoost-Pipeline...")

    # Fetch all raw documents from MongoDB.
    mongo = MongoDb()
    raw_docs = mongo.get({}, "CareerData")

    # Feature engineering.
    fe = featureEngineering()
    X, y = fe.extract_features_and_labels(raw_docs)

    if len(X) == 0:
        print("Keine gültigen Daten für Training gefunden.")
        return

    # Tensor -> numpy, one flat feature row per sample.
    # `reshape` is used instead of `view` because `view` raises on
    # non-contiguous tensors and the previous expression required exactly
    # three dimensions; for a contiguous 3-D tensor the result is identical.
    # Assumes X and y are torch tensors whose first dimension indexes the
    # samples - TODO confirm against `extract_features_and_labels`.
    X_np = X.reshape(len(X), -1).numpy()
    y_np = y.numpy().ravel()

    # Train/validation split.
    # NOTE(review): the split is not stratified; if the labels are imbalanced,
    # consider passing `stratify=y_np` so both sets keep the class ratio.
    X_train, X_val, y_train, y_val = train_test_split(
        X_np, y_np, test_size=test_size, random_state=random_state
    )

    print(f"Trainingsdaten: {X_train.shape}, Validierungsdaten: {X_val.shape}")

    # Train the model.
    model = XGBoostModel()
    model.train(X_train, y_train)

    # Evaluate on the held-out validation set.
    model.evaluate(X_val, y_val, show_report=True)

    # Save the fitted estimator under a timestamped name so that earlier
    # models are never overwritten.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_path = f"saved_models/xgboost_model_{timestamp}.joblib"
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(model.model, model_path)
    print(f"Modell gespeichert unter: {model_path}")

# Run the whole pipeline when this cell executes. Every run retrains the model
# and writes a new timestamped file below `saved_models/`.
run_xgboost_pipeline()

Starte XGBoost-Pipeline...
Trainingsdaten: (589, 51), Validierungsdaten: (253, 51)
F1 Score:     0.7688
Accuracy:     0.6482
Klassifikationsbericht:
              precision    recall  f1-score   support

         0.0       0.33      0.22      0.26        72
         1.0       0.73      0.82      0.77       181

    accuracy                           0.65       253
   macro avg       0.53      0.52      0.52       253
weighted avg       0.61      0.65      0.63       253

Modell gespeichert unter: saved_models/xgboost_model_20250410_110704.joblib


Parameters: { "use_label_encoder" } are not used.



In [22]:
def predict(input_sequence, model_path="saved_models/xgboost_model_20250410_110704.joblib", expected_steps=51,
            features_per_step=3, threshold=0.5):
    """Score one career sequence with a saved model.

    The sequence is truncated or zero-padded at the end to
    ``expected_steps // features_per_step`` steps, flattened into a single
    row and passed to the model's ``predict_proba``.

    Args:
        input_sequence: Sliceable sequence (list, tuple or array) of steps,
            each holding exactly ``features_per_step`` numeric values. The
            caller's object is never modified.
        model_path: Path of the joblib file to load. The default points at
            one specific, timestamped training run.
        expected_steps: Despite the name, this is the total number of
            flattened features of the model input (51 = 17 steps x 3
            features by default). Any remainder of the division by
            ``features_per_step`` is dropped.
        features_per_step: Number of values per step.
        threshold: Probability above which the status is "wechselbereit".

    Returns:
        Tuple ``(prob, status)``: ``prob`` is the float taken from column 1
        of ``predict_proba`` for the single input row, ``status`` is
        "wechselbereit" if ``prob > threshold``, otherwise
        "bleibt wahrscheinlich".

    Raises:
        ValueError: If ``features_per_step`` is smaller than 1 or a step does
            not contain exactly ``features_per_step`` numeric values.
    """
    if features_per_step < 1:
        raise ValueError("features_per_step must be a positive integer")

    step_count = expected_steps // features_per_step

    # Start from an all-zero matrix so missing trailing steps are zero-padded;
    # surplus steps beyond `step_count` are dropped by the slice.
    # NOTE(review): padding is appended after the real steps - confirm that
    # the training features are padded on the same side.
    features = np.zeros((step_count, features_per_step), dtype=np.float32)
    for index, step in enumerate(input_sequence[:step_count]):
        values = np.asarray(step, dtype=np.float32)
        if values.shape != (features_per_step,):
            raise ValueError(
                f"step {index} must contain exactly {features_per_step} values, "
                f"got shape {values.shape}"
            )
        features[index] = values

    # Flatten to a single sample row of shape (1, step_count * features_per_step).
    input_array = features.reshape(1, -1)

    # NOTE: joblib.load unpickles the file and can execute arbitrary code;
    # only load model files from a trusted source.
    model = joblib.load(model_path)

    prob = model.predict_proba(input_array)[0][1]
    status = "wechselbereit" if prob > threshold else "bleibt wahrscheinlich"
    return float(prob), status

In [23]:
# Example call: three career steps with three features each. `predict` is
# called with its default `model_path`, so this cell needs that exact saved
# model file to exist.
example_input = [[24, 2, 1], [36, 3, 1], [12, 3, 1]]  # e.g. [duration in months, level, industry]
prob, status = predict(example_input)
print(f"Wechselwahrscheinlichkeit: {prob:.2f} → Einschätzung: {status}")

Wechselwahrscheinlichkeit: 0.90 → Einschätzung: wechselbereit
