### TFT PIPELINE

In [1]:
import torch
import numpy as np
from datetime import datetime
import os
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
import sys
sys.path.insert(0, '/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/')

from backend.ml_pipe.data.dataModule.tft.dataModule import CareerDataModule
from backend.ml_pipe.models.tft.model import TFTModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def run_pipeline():
    print("Starte Karriere-Vorhersage Pipeline...")
    
    # Datenmodule initialisieren
    print("\nLade Daten aus MongoDB...")
    datamodule = CareerDataModule(batch_size=32)
    datamodule.setup()
    
    print(f"\nDatensatzgrößen:")
    print(f"- Training: {len(datamodule.train_data)} Kandidaten")
    print(f"- Validierung: {len(datamodule.val_data)} Kandidaten")
    print(f"- Test: {len(datamodule.test_data)} Kandidaten")
    
    # Modell initialisieren
    print("\nInitialisiere TFT Modell...")
    model = TFTModel(
        sequence_features=2,    # Position + Wechselzeitraum
        hidden_size=64,
        dropout=0.1
    )
    
    # Callbacks
    checkpoint_cb = ModelCheckpoint(
        monitor="val_loss",
        save_top_k=1,
        mode="min",
        filename="tft-{epoch:02d}-{val_loss:.2f}"
    )
    
    early_stopping = EarlyStopping(
        monitor="val_loss",
        patience=5,
        mode="min",
        min_delta=0.001
    )
    
    # Logger
    logger = TensorBoardLogger("lightning_logs", name="tft")
    
    # Trainer Setup
    trainer = Trainer(
        max_epochs=2,
        logger=logger,
        callbacks=[checkpoint_cb, early_stopping],
        accelerator="auto",  # Automatische GPU-Erkennung
        devices=1,
        enable_progress_bar=True,
        log_every_n_steps=10
    )
    
    # Training
    print("\nStarte Training...")
    trainer.fit(model, datamodule=datamodule)
    
    # Evaluation auf Testdaten
    print("\nEvaluiere Modell auf Testdaten...")
    test_results = trainer.test(model, datamodule=datamodule)
    
    # Modell speichern
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_path = f"saved_models/tft_{timestamp}.pt"
    
    print("\nSpeichere Modell...")
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    torch.save({
        'model_state_dict': model.state_dict(),
        'hyperparameters': model.hparams,
        'position_to_idx': datamodule.train_dataset.position_to_idx,
        'training_metrics': {
            'best_val_loss': checkpoint_cb.best_model_score.item(),
        }
    }, model_path)
    
    print(f"\nModell gespeichert unter: {model_path}")
    print(f"Beste Validation Loss: {checkpoint_cb.best_model_score:.4f}")
    
    return model, trainer

if __name__ == "__main__":
    model, trainer = run_pipeline()

Starte Karriere-Vorhersage Pipeline...

Lade Daten aus MongoDB...


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


{'_id': '681c75686814a40634015e68', 'aktuelle_position': 'Sales Manager Europe', 'zeitpunkt': '01/05/2025', 'label': 1, 'wechselzeitraum': 0}

Datensatz aufgeteilt in:
- Training: 2174 Einträge
- Validierung: 466 Einträge
- Test: 467 Einträge

Datensatzgrößen:
- Training: 2174 Kandidaten
- Validierung: 466 Kandidaten
- Test: 467 Kandidaten

Initialisiere TFT Modell...

Starte Training...



  | Name               | Type             | Params | Mode 
----------------------------------------------------------------
0 | position_embedding | Embedding        | 32.0 K | train
1 | classifier         | Sequential       | 2.4 K  | train
2 | loss_fn            | CrossEntropyLoss | 0      | train
----------------------------------------------------------------
34.4 K    Trainable params
0         Non-trainable params
34.4 K    Total params
0.138     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


{'_id': '681c75686814a40634015e68', 'aktuelle_position': 'Sales Manager Europe', 'zeitpunkt': '01/05/2025', 'label': 1, 'wechselzeitraum': 0}

Datensatz aufgeteilt in:
- Training: 2174 Einträge
- Validierung: 466 Einträge
- Test: 467 Einträge
Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:419: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:419: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Epoch 1: 100%|██████████| 68/68 [00:48<00:00,  1.39it/s, v_num=8, train_loss=0.888, train_acc=0.633, val_loss=0.831, val_acc=0.680]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 68/68 [00:49<00:00,  1.39it/s, v_num=8, train_loss=0.888, train_acc=0.633, val_loss=0.831, val_acc=0.680]

Evaluiere Modell auf Testdaten...
{'_id': '681c75686814a40634015e68', 'aktuelle_position': 'Sales Manager Europe', 'zeitpunkt': '01/05/2025', 'label': 1, 'wechselzeitraum': 0}

Datensatz aufgeteilt in:
- Training: 2174 Einträge
- Validierung: 466 Einträge
- Test: 467 Einträge


/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:419: Consider setting `persistent_workers=True` in 'test_dataloader' to speed up the dataloader worker initialization.


Testing DataLoader 0: 100%|██████████| 15/15 [00:00<00:00, 30.02it/s]



Speichere Modell...

Modell gespeichert unter: saved_models/tft_20250508_151352.pt
Beste Validation Loss: 0.8308


In [None]:
import json
from datetime import datetime
import torch
import sys

# Optional: Pfad zum Projekt hinzufügen, falls nötig
sys.path.insert(0, '/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/')

from backend.ml_pipe.models.tft.model import TFTModel

# 1. LinkedIn-Profil-JSON laden (hier als String, sonst aus Datei/DB)
profile_str = r'''{"skills":["Multitasking","Kundenservice","Interpersonelle Fähigkeiten","Kaltakquise","Hubspot CRM","Customer-Relationship-Management (CRM)"],"firstName":"Darya","lastName":"Chernuska","profilePicture":"https://media.licdn.com/dms/image/v2/D4E03AQE0yuZ6cg8f4A/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1670856025914?e=1749686400&v=beta&t=jI1mkiVnkD7teWPncsg8QtKAwZKB-az53_4ny7C7XvI","linkedinProfile":"https://www.linkedin.com/in/daryachernuska","education":[{"duration":"01/01/2017 - 01/01/2022","institution":"Ludwig-Maximilians-Universität München","endDate":"01/01/2022","degree":"","startDate":"01/01/2017"}],"providerId":"ACoAAD0rz_IBI0XfqqBDUscwHoFwuOqJa_c5T2I","workExperience":[{"duration":"01/03/2023 - Present","endDate":"Present","companyInformation":{"employee_count":515,"activities":["Telefonie","Internet","Vernetzung","Rechenzentrum","Glasfaser","Highspeed-Internet","Business-Internet","SIP-Trunk","Cloud-Lösungen","Connect-Cloud","Connect-LAN","Premium IP","Internet + Telefonie","Lösungen für Geschäftskunden"],"name":"M-net Telekommunikations GmbH","description":"Als regionaler Telekommunikationsanbieter versorgt M-net große Teile Bayerns, den Großraum Ulm sowie weite Teile des hessischen Landkreises Main-Kinzig mit zukunftssicherer Kommunikationstechnologie.","industry":["Telecommunications"]},"description":"","company":"M-net Telekommunikations GmbH","location":"München, Bayern, Deutschland · Hybrid","position":"Disponentin","startDate":"01/03/2023"},{"duration":"01/08/2022 - 01/12/2022","endDate":"01/12/2022","companyInformation":{"employee_count":2048,"activities":["HR Software","HR Management","Recruitung","Employee Management","Applicant Tracking System","Employee Selfservice","Time-Off Management","Cloud Software","Onboarding and Offboarding","HR Reporting","Performance Management","Payroll","HR","HR Tech","Human Resources"],"name":"Personio","description":"Personio's Intelligent HR Platform helps small and medium-sized organizations unlock the power of people by making complicated, time-consuming tasks simple and efficient.","industry":["Software Development"]},"description":"","company":"Personio","location":"München, Bayern, Deutschland","position":"Sales Development Representative","startDate":"01/08/2022"},{"duration":"01/11/2017 - 01/07/2022","endDate":"01/07/2022","companyInformation":{"employee_count":662,"activities":["Scandinavian design","Furniture","Design","Product design","Retail","Web","Steelcase partner","Wholesale","B2B","Contract sales","Online","Digital","Creativity"],"name":"BOLIA","description":"Our collection is inspired by the vivid Scandinavian nature","industry":["Retail Furniture and Home Furnishings"]},"description":"","company":"Bolia.com","location":"München, Bayern, Deutschland","position":"Sales Consultant","startDate":"01/11/2017"},{"duration":"01/10/2015 - 01/11/2017","endDate":"01/11/2017","companyInformation":{},"description":"","company":"Pepperminds","location":"München, Bayern, Deutschland","position":"Senior Team Lead","startDate":"01/10/2015"}],"location":"Munich, Bavaria, Germany","certifications":[],"headline":"-","languageSkills":{}}'''

# Füge hier den JSON-String aus deiner CSV ein

profile = json.loads(profile_str)
work_experience = profile["workExperience"]

# 2. Hilfsfunktion für Datums-Parsing
def parse_date(date_str):
    if date_str == "Present":
        return datetime.now()
    try:
        return datetime.strptime(date_str, "%d/%m/%Y")
    except Exception:
        try:
            return datetime.strptime(date_str, "%m/%Y")
        except Exception:
            try:
                return datetime.strptime(date_str, "%Y")
            except Exception:
                return None

# 3. Nach Startdatum sortieren (neueste zuerst)
work_experience_sorted = sorted(
    work_experience,
    key=lambda x: parse_date(x["startDate"]) or datetime(1900, 1, 1),
    reverse=True
)

# 4. Aktuellste Position extrahieren
last_position = work_experience_sorted[0]
positionsname = last_position["position"]
start = parse_date(last_position["startDate"])
end = parse_date(last_position["endDate"])
if end is None:
    end = datetime.now()
wechselzeitraum = (end.year - start.year) * 12 + (end.month - start.month)

print("Positionsname:", positionsname)
print("Wechselzeitraum (Monate):", wechselzeitraum)

# 5. Mapping laden
with open("/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/backend/ml_pipe/data/dataModule/tft/position_to_idx.json", "r") as f:
    position_to_idx = json.load(f)

if positionsname not in position_to_idx:
    print(f"Achtung: Position '{positionsname}' nicht im Mapping! Fallback auf 'UNK' oder Index 0.")
    pos_idx = position_to_idx.get("UNK", 0)
else:
    pos_idx = position_to_idx[positionsname]


# 6. Eingabevektor für das Modell bauen
x_seq = torch.tensor(
    [[position_to_idx[positionsname], wechselzeitraum]],
    dtype=torch.float32
)

print("Eingabevektor für Modell:", x_seq)

# 7. Modell laden
model = TFTModel(sequence_features=2, hidden_size=64, dropout=0.1)
model.load_state_dict(
    torch.load("saved_models/tft_20250508_150223.pt")["model_state_dict"]
)
model.eval()

# 8. Vorhersage
with torch.no_grad():
    pred = model(x_seq)
    pred_class = torch.argmax(pred, dim=1).item()
    print("Vorhergesagte Klasse:", pred_class)
    print("Softmax-Wahrscheinlichkeiten:", torch.softmax(pred, dim=1).numpy())

Positionsname: Disponentin
Wechselzeitraum (Monate): 26


KeyError: 'Disponentin'