### TFT PIPELINE

In [1]:
import torch
import numpy as np
from datetime import datetime
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
import sys
sys.path.insert(0, '/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/')

from backend.ml_pipe.data.database.mongodb import MongoDb
from backend.ml_pipe.data.dataModule.dataModule import DataModule
from backend.ml_pipe.models.tft.model import TFTModel


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def run_pipeline():
    """Train, evaluate and save the career-prediction model end to end.

    The pipeline
      1. loads the data through ``DataModule`` (backed by ``MongoDb``),
      2. builds a ``TFTModel`` sized from the datamodule's feature dimensions,
      3. trains with checkpointing and early stopping on ``val_loss``,
      4. restores the best checkpoint and evaluates it on the test split,
      5. writes the weights plus metadata to ``saved_models/`` as a ``.pt`` file.

    Restoring the best checkpoint before testing and saving keeps the test
    metrics, the exported ``model_state_dict`` and the stored ``best_val_loss``
    consistent with each other. If no checkpoint was written, the weights from
    the end of training are used and ``best_val_loss`` is stored as ``None``.

    Returns:
        tuple: ``(model, trainer)`` -- the model (switched to eval mode) and the
        ``Trainer`` that was used for fitting and testing.
    """
    print("Starte Karriere-Vorhersage Pipeline...")

    # Initialise the data source
    print("\nLade Daten aus MongoDB...")
    mongo = MongoDb()
    datamodule = DataModule(mongo, batch_size=32)
    datamodule.setup()

    print("\nDatensatzgrößen:")
    print(f"- Training: {len(datamodule.train_data)} Kandidaten")
    print(f"- Validierung: {len(datamodule.val_data)} Kandidaten")
    print(f"- Test: {len(datamodule.test_data)} Kandidaten")

    # Initialise the model
    print("\nInitialisiere CareerLSTM Modell...")
    model = TFTModel(
        sequence_features=datamodule.sequence_dim,  # per-timestep feature count, reported by the datamodule
        global_features=datamodule.global_dim,      # global feature count, reported by the datamodule
        hidden_size=128,
        num_layers=2,
        dropout=0.2,
        bidirectional=True,
        lr=1e-3
    )

    # Callbacks: keep only the single best checkpoint and stop once val_loss stalls
    checkpoint_cb = ModelCheckpoint(
        monitor="val_loss",
        save_top_k=1,
        mode="min",
        filename="career_lstm-{epoch:02d}-{val_loss:.2f}"
    )

    early_stopping = EarlyStopping(
        monitor="val_loss",
        patience=5,
        mode="min",
        min_delta=0.001
    )

    # Logger
    logger = TensorBoardLogger("lightning_logs", name="career_lstm")

    # Trainer setup
    trainer = Trainer(
        max_epochs=50,
        logger=logger,
        callbacks=[checkpoint_cb, early_stopping],
        accelerator="auto",  # automatic accelerator detection
        devices=1,
        enable_progress_bar=True,
        log_every_n_steps=10
    )

    # Training
    print("\nStarte Training...")
    trainer.fit(model, datamodule=datamodule)

    # Read the checkpoint bookkeeping right after fitting. Both values are
    # empty/None when no checkpoint was written during training.
    best_ckpt_path = checkpoint_cb.best_model_path or None
    best_score = checkpoint_cb.best_model_score
    best_val_loss = best_score.item() if best_score is not None else None

    # Evaluation on the test data. Early stopping leaves `model` with the
    # weights of the last epoch, which are `patience` epochs past the best
    # one; passing the checkpoint path makes Lightning load the best weights
    # into `model` first. With `None` the current weights are used.
    print("\nEvaluiere Modell auf Testdaten...")
    trainer.test(model, datamodule=datamodule, ckpt_path=best_ckpt_path)

    # Hand the model back in eval mode (dropout disabled).
    model.eval()

    # Save the model
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_path = f"saved_models/career_lstm_{timestamp}.pt"

    print("\nSpeichere Modell...")
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    torch.save({
        'model_state_dict': model.state_dict(),
        'hyperparameters': model.hparams,
        'sequence_dim': datamodule.sequence_dim,
        'global_dim': datamodule.global_dim,
        'training_metrics': {
            'best_val_loss': best_val_loss,
        }
    }, model_path)

    print(f"\nModell gespeichert unter: {model_path}")
    if best_val_loss is not None:
        print(f"Beste Validation Loss: {best_val_loss:.4f}")
    else:
        # No checkpoint score is available, so there is nothing to format.
        print("Beste Validation Loss: n/a")

    return model, trainer

# Run the whole pipeline when this code executes as the main module, and keep
# the trained model and the trainer available as top-level variables.
if __name__ == "__main__":
    model, trainer = run_pipeline()

Starte Karriere-Vorhersage Pipeline...

Lade Daten aus MongoDB...


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Datensatz aufgeteilt in:
- Training: 385 Einträge
- Validierung: 82 Einträge
- Test: 83 Einträge

Feature-Dimensionen:
- Sequenz-Features pro Zeitschritt: 6
- Globale Features: 3

Datensatzgrößen:
- Training: 385 Kandidaten
- Validierung: 82 Kandidaten
- Test: 83 Kandidaten

Initialisiere CareerLSTM Modell...

Starte Training...



  | Name             | Type               | Params | Mode 
----------------------------------------------------------------
0 | loss_fn          | BCELoss            | 0      | train
1 | sequence_encoder | LSTM               | 534 K  | train
2 | attention        | MultiheadAttention | 263 K  | train
3 | layer_norm       | LayerNorm          | 512    | train
4 | global_encoder   | Sequential         | 17.0 K | train
5 | fusion_layer     | Sequential         | 57.5 K | train
6 | career_predictor | Sequential         | 2.1 K  | train
----------------------------------------------------------------
874 K     Trainable params
0         Non-trainable params
874 K     Total params
3.500     Total estimated model params size (MB)
25        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:419: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:419: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Epoch 15: 100%|██████████| 13/13 [00:45<00:00,  0.29it/s, v_num=13, train_loss=0.00732, train_acc=1.000, val_loss=0.261, val_acc=0.915]

Evaluiere Modell auf Testdaten...


/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:419: Consider setting `persistent_workers=True` in 'test_dataloader' to speed up the dataloader worker initialization.


Testing DataLoader 0: 100%|██████████| 3/3 [00:00<00:00,  6.43it/s]



Speichere Modell...

Modell gespeichert unter: saved_models/career_lstm_20250502_111126.pt
Beste Validation Loss: 0.2287


In [3]:
import sys
import json
sys.path.insert(0, '/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/')

from backend.ml_pipe.models.tft.predict import predict

# LinkedIn profile data as a raw JSON string. The 'r' prefix matters: it keeps backslash escapes literal so json.loads receives them unchanged.
linkedin_data_str = r'''{"skills":["Multitasking","Kundenservice","Interpersonelle Fähigkeiten","Kaltakquise","Hubspot CRM","Customer-Relationship-Management (CRM)"],"firstName":"Darya","lastName":"Chernuska","profilePicture":"https://media.licdn.com/dms/image/v2/D4E03AQE0yuZ6cg8f4A/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1670856025914?e=1749686400&v=beta&t=jI1mkiVnkD7teWPncsg8QtKAwZKB-az53_4ny7C7XvI","linkedinProfile":"https://www.linkedin.com/in/daryachernuska","education":[{"duration":"01/01/2017 - 01/01/2022","institution":"Ludwig-Maximilians-Universität München","endDate":"01/01/2022","degree":"","startDate":"01/01/2017"}],"providerId":"ACoAAD0rz_IBI0XfqqBDUscwHoFwuOqJa_c5T2I","workExperience":[{"duration":"01/03/2023 - Present","endDate":"Present","companyInformation":{"employee_count":515,"activities":["Telefonie","Internet","Vernetzung","Rechenzentrum","Glasfaser","Highspeed-Internet","Business-Internet","SIP-Trunk","Cloud-Lösungen","Connect-Cloud","Connect-LAN","Premium IP","Internet + Telefonie","Lösungen für Geschäftskunden"],"name":"M-net Telekommunikations GmbH","description":"Als regionaler Telekommunikationsanbieter versorgt M-net große Teile Bayerns, den Großraum Ulm sowie weite Teile des hessischen Landkreises Main-Kinzig mit zukunftssicherer Kommunikationstechnologie.","industry":["Telecommunications"]},"description":"","company":"M-net Telekommunikations GmbH","location":"München, Bayern, Deutschland · Hybrid","position":"Disponentin","startDate":"01/03/2023"},{"duration":"01/08/2022 - 01/12/2022","endDate":"01/12/2022","companyInformation":{"employee_count":2048,"activities":["HR Software","HR Management","Recruitung","Employee Management","Applicant Tracking System","Employee Selfservice","Time-Off Management","Cloud Software","Onboarding and Offboarding","HR Reporting","Performance Management","Payroll","HR","HR Tech","Human Resources"],"name":"Personio","description":"Personio's Intelligent HR Platform helps 
small and medium-sized organizations unlock the power of people by making complicated, time-consuming tasks simple and efficient.","industry":["Software Development"]},"description":"","company":"Personio","location":"München, Bayern, Deutschland","position":"Sales Development Representative","startDate":"01/08/2022"},{"duration":"01/11/2017 - 01/07/2022","endDate":"01/07/2022","companyInformation":{"employee_count":662,"activities":["Scandinavian design","Furniture","Design","Product design","Retail","Web","Steelcase partner","Wholesale","B2B","Contract sales","Online","Digital","Creativity"],"name":"BOLIA","description":"Our collection is inspired by the vivid Scandinavian nature","industry":["Retail Furniture and Home Furnishings"]},"description":"","company":"Bolia.com","location":"München, Bayern, Deutschland","position":"Sales Consultant","startDate":"01/11/2017"},{"duration":"01/10/2015 - 01/11/2017","endDate":"01/11/2017","companyInformation":{},"description":"","company":"Pepperminds","location":"München, Bayern, Deutschland","position":"Senior Team Lead","startDate":"01/10/2015"}],"location":"Munich, Bavaria, Germany","certifications":[],"headline":"-","languageSkills":{}}'''

# Parse the profile JSON, run the prediction and print a readable report.
# The whole sequence stays inside the try block, so a json.JSONDecodeError
# raised at any point is reported by the handler at the bottom.
try:
    # Turn the JSON string into a dictionary and hand it to the predictor
    profile_data = json.loads(linkedin_data_str)
    result = predict(profile_data)

    # Headline: status plus the first confidence entry as a percentage
    print("\nVorhersageergebnis:")
    print(f"Status: {result['status']}")
    first_confidence = result['confidence'][0]
    print(f"Konfidenz: {first_confidence:.2%}")

    # One bullet per recommendation
    print("\nEmpfehlungen:")
    for recommendation in result['recommendations']:
        print(f"- {recommendation}")

    # One bullet per feature explanation: name, impact in percent, description
    print("\nFeature-Wichtigkeiten:")
    for explanation in result['explanations']:
        feature_name = explanation['feature']
        impact = explanation['impact_percentage']
        description = explanation['description']
        print(f"- {feature_name}: {impact:.1f}% - {description}")

except json.JSONDecodeError as decode_error:
    # Report what went wrong and where in the JSON text it happened
    print(f"JSON Fehler: {decode_error}")
    print(f"Fehler an Position: {decode_error.pos}")
    print(f"Zeile: {decode_error.lineno}, Spalte: {decode_error.colno}")


Vorhersageergebnis:
Status: sehr wahrscheinlich wechselbereit
Konfidenz: 70.93%

Empfehlungen:
- Der Kandidat zeigt starke Anzeichen für einen bevorstehenden Wechsel.
- Aktive Ansprache empfohlen.
- Wechselwahrscheinlichkeit: 70.9%
- • Zeit seit Beginn: Dieses Feature beeinflusst die Wechselwahrscheinlichkeit zu 44.6% mit. (44.6% Einfluss)
- • Zeit bis Ende: Dieses Feature beeinflusst die Wechselwahrscheinlichkeit zu 25.8% mit. (25.8% Einfluss)
- • Beschäftigungsdauer: Dieses Feature beeinflusst die Wechselwahrscheinlichkeit zu 18.8% mit. (18.8% Einfluss)

Feature-Wichtigkeiten:
- Zeit seit Beginn: 44.6% - Dieses Feature beeinflusst die Wechselwahrscheinlichkeit zu 44.6% mit.
- Zeit bis Ende: 25.8% - Dieses Feature beeinflusst die Wechselwahrscheinlichkeit zu 25.8% mit.
- Beschäftigungsdauer: 18.8% - Dieses Feature beeinflusst die Wechselwahrscheinlichkeit zu 18.8% mit.
- Berufserfahrung: 6.2% - Dieses Feature beeinflusst die Wechselwahrscheinlichkeit zu 6.2% mit.
- Position Level: 2.

  checkpoint = torch.load(model_path)
