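### XGBoost Pipeline

Trains an XGBoost classifier on the MongoDB training data, evaluates it on a hold-out validation split, saves the fitted model to disk, and finally runs a sample prediction for a single LinkedIn profile.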

In [2]:
import joblib
from datetime import datetime
import numpy as np
import os
from sklearn.model_selection import train_test_split

import sys
sys.path.insert(0, '/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/')

from backend.ml_pipe.data.database.mongodb import MongoDb
from backend.ml_pipe.data.featureEngineering.feature_engineering_xgb import prepare_xgb_data

from backend.ml_pipe.models.xgboost.model import XGBoostModel

In [3]:
def run_xgboost_pipeline(
    collection: str = "training_data2",
    test_size: float = 0.3,
    random_state: int = 42,
    model_dir: str = "saved_models",
) -> None:
    """Train, evaluate and persist an XGBoost model built from MongoDB documents.

    The pipeline fetches every document of ``collection`` through
    ``MongoDb.get``, converts the documents into features and labels with
    ``prepare_xgb_data``, splits them into a training and a validation part,
    fits an ``XGBoostModel`` on the training part, evaluates it on the
    validation part and finally dumps ``model.model`` with joblib under a
    timestamped file name inside ``model_dir``.

    All arguments default to the values that used to be hard-coded, so calling
    the function without arguments behaves as before.

    Args:
        collection: Name of the MongoDB collection the training documents are
            read from.
        test_size: Forwarded to ``train_test_split``; share of the samples
            held out for validation.
        random_state: Forwarded to ``train_test_split`` so the split is
            reproducible.
        model_dir: Directory the model file is written to. It is created when
            it does not exist yet. A relative path is resolved against the
            current working directory.

    Returns:
        None. Progress information and the path of the saved model are
        printed. When feature engineering yields no samples, a message is
        printed and nothing is trained or saved.
    """
    print("Starte XGBoost-Pipeline...")

    # Fetch the raw training documents from MongoDB (empty filter = all documents).
    mongo = MongoDb()
    raw_docs = mongo.get({}, collection)

    # Feature engineering for XGBoost.
    X, y = prepare_xgb_data(raw_docs)

    print(f"Shape von X: {X.shape}")
    print(f"Shape von y: {y.shape}")

    # Nothing to train on: stop before train_test_split is called.
    if len(X) == 0:
        print("Keine gültigen Daten für Training gefunden.")
        return

    # Train/validation split.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    print(f"Trainingsdaten: {X_train.shape}, Validierungsdaten: {X_val.shape}")

    # Train the model.
    model = XGBoostModel()
    model.train(X_train, y_train)

    # Evaluate on the held-out validation data.
    model.evaluate(X_val, y_val, show_report=True)

    # Save the model under a timestamped name so earlier runs are not overwritten.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_path = os.path.join(model_dir, f"xgboost_model_{timestamp}.joblib")
    if model_dir:
        # os.makedirs("") would raise, so only create a non-empty directory.
        os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model.model, model_path)
    print(f"Modell gespeichert unter: {model_path}")

# Execute the full pipeline; the function prints its progress and the path of the saved model.
run_xgboost_pipeline()

Starte XGBoost-Pipeline...
Shape von X: (550, 12)
Shape von y: (550,)
Trainingsdaten: (385, 12), Validierungsdaten: (165, 12)
F1 Score:     0.8214
Accuracy:     0.8182
Klassifikationsbericht:
              precision    recall  f1-score   support

         0.0       0.79      0.85      0.81        78
         1.0       0.85      0.79      0.82        87

    accuracy                           0.82       165
   macro avg       0.82      0.82      0.82       165
weighted avg       0.82      0.82      0.82       165

Modell gespeichert unter: saved_models/xgboost_model_20250502_121211.joblib


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [1]:
import sys
import json
sys.path.insert(0, '/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/')

from backend.ml_pipe.models.xgboost.predict import predict

# LinkedIn profile data as a raw JSON string (note the 'r' prefix before the string literal)
linkedin_data_str = r'''{"skills":["Multitasking","Kundenservice","Interpersonelle Fähigkeiten","Kaltakquise","Hubspot CRM","Customer-Relationship-Management (CRM)"],"firstName":"Darya","lastName":"Chernuska","profilePicture":"https://media.licdn.com/dms/image/v2/D4E03AQE0yuZ6cg8f4A/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1670856025914?e=1749686400&v=beta&t=jI1mkiVnkD7teWPncsg8QtKAwZKB-az53_4ny7C7XvI","linkedinProfile":"https://www.linkedin.com/in/daryachernuska","education":[{"duration":"01/01/2017 - 01/01/2022","institution":"Ludwig-Maximilians-Universität München","endDate":"01/01/2022","degree":"","startDate":"01/01/2017"}],"providerId":"ACoAAD0rz_IBI0XfqqBDUscwHoFwuOqJa_c5T2I","workExperience":[{"duration":"01/03/2023 - Present","endDate":"Present","companyInformation":{"employee_count":515,"activities":["Telefonie","Internet","Vernetzung","Rechenzentrum","Glasfaser","Highspeed-Internet","Business-Internet","SIP-Trunk","Cloud-Lösungen","Connect-Cloud","Connect-LAN","Premium IP","Internet + Telefonie","Lösungen für Geschäftskunden"],"name":"M-net Telekommunikations GmbH","description":"Als regionaler Telekommunikationsanbieter versorgt M-net große Teile Bayerns, den Großraum Ulm sowie weite Teile des hessischen Landkreises Main-Kinzig mit zukunftssicherer Kommunikationstechnologie.","industry":["Telecommunications"]},"description":"","company":"M-net Telekommunikations GmbH","location":"München, Bayern, Deutschland · Hybrid","position":"Disponentin","startDate":"01/03/2023"},{"duration":"01/08/2022 - 01/12/2022","endDate":"01/12/2022","companyInformation":{"employee_count":2048,"activities":["HR Software","HR Management","Recruitung","Employee Management","Applicant Tracking System","Employee Selfservice","Time-Off Management","Cloud Software","Onboarding and Offboarding","HR Reporting","Performance Management","Payroll","HR","HR Tech","Human Resources"],"name":"Personio","description":"Personio's Intelligent HR Platform helps 
small and medium-sized organizations unlock the power of people by making complicated, time-consuming tasks simple and efficient.","industry":["Software Development"]},"description":"","company":"Personio","location":"München, Bayern, Deutschland","position":"Sales Development Representative","startDate":"01/08/2022"},{"duration":"01/11/2017 - 01/07/2022","endDate":"01/07/2022","companyInformation":{"employee_count":662,"activities":["Scandinavian design","Furniture","Design","Product design","Retail","Web","Steelcase partner","Wholesale","B2B","Contract sales","Online","Digital","Creativity"],"name":"BOLIA","description":"Our collection is inspired by the vivid Scandinavian nature","industry":["Retail Furniture and Home Furnishings"]},"description":"","company":"Bolia.com","location":"München, Bayern, Deutschland","position":"Sales Consultant","startDate":"01/11/2017"},{"duration":"01/10/2015 - 01/11/2017","endDate":"01/11/2017","companyInformation":{},"description":"","company":"Pepperminds","location":"München, Bayern, Deutschland","position":"Senior Team Lead","startDate":"01/10/2015"}],"location":"Munich, Bavaria, Germany","certifications":[],"headline":"-","languageSkills":{}}'''

# Parse the pasted profile JSON, run the XGBoost prediction and print a report.
try:
    # Convert the JSON string into a dictionary.
    # strict=False tolerates raw control characters inside string values (for
    # example a line break that slipped into the pasted literal), which strict
    # parsing rejects with "Invalid control character".
    profile_data = json.loads(linkedin_data_str, strict=False)
except json.JSONDecodeError as e:
    print(f"JSON Fehler: {str(e)}")
    print(f"Fehler an Position: {e.pos}")
    print(f"Zeile: {e.lineno}, Spalte: {e.colno}")
else:
    # Runs only when parsing succeeded. Kept outside the `try` so that an
    # error raised by predict() is never reported as a JSON problem.
    # NOTE(review): the recorded run of this cell ended with
    # KeyError: 'features'. None of the keys read below is 'features', so the
    # error presumably originates inside predict() - verify the input/model
    # format it expects.
    result = predict(profile_data)

    print("\nVorhersageergebnis:")
    print(f"Status: {result['status']}")
    print(f"Konfidenz: {result['confidence'][0]:.2%}")

    print("\nEmpfehlungen:")
    for rec in result['recommendations']:
        print(f"- {rec}")

    print("\nFeature-Wichtigkeiten:")
    for exp in result['explanations']:
        print(f"- {exp['feature']}: {exp['impact_percentage']:.1f}% - {exp['description']}")

KeyError: 'features'