### TFT PIPELINE

In [4]:
import torch
import numpy as np
from datetime import datetime
import os
from  lightning.pytorch import Trainer
from  lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from  lightning.pytorch.loggers import TensorBoardLogger
import sys
sys.path.insert(0, '/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/')

from backend.ml_pipe.data.dataModule.tft.dataModule import CareerDataModule
from backend.ml_pipe.models.tft.model import TFTModel
from  lightning.pytorch.loggers import CSVLogger
from pytorch_forecasting import TimeSeriesDataSet, GroupNormalizer, TemporalFusionTransformer, QuantileLoss


In [5]:
from backend.ml_pipe.data.database.mongodb import MongoDb

def load_data_from_mongodb():
    mongo_client = MongoDb(user='florianrunkel', password='ur04mathesis', db_name='Database')
    result = mongo_client.get_all('time_dataset')
    raw_data = result.get('data', [])
    # In DataFrame umwandeln
    import pandas as pd
    df = pd.DataFrame(raw_data)
    return df

In [17]:
import torch
from datetime import datetime
from  lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
from  lightning.pytorch.loggers import TensorBoardLogger
import  lightning.pytorch as pl
import os
from pytorch_forecasting.metrics import QuantileLoss
from pytorch_forecasting.metrics import MAE, RMSE

def run_pipeline():
    print("Starte Karriere-Vorhersage Pipeline...")

    # 1. Daten laden
    print("Lade Daten aus MongoDB...")
    df = load_data_from_mongodb()

    # 2. DataModule initialisieren
    max_encoder_length = 30
    max_prediction_length = 7
    datamodule = CareerDataModule(df, batch_size=64, max_encoder_length=max_encoder_length, max_prediction_length=max_prediction_length)
    datamodule.setup()
    print(f"Training: {len(datamodule.training)} Kandidaten")
    print(f"Validation: {len(datamodule.validation)} Kandidaten")

    # 3. Modell initialisieren
    print("Initialisiere TFT Modell...")
    model = TemporalFusionTransformer.from_dataset(
        datamodule.training_dataset,
        learning_rate=0.03,
        hidden_size=32,
        attention_head_size=2,
        dropout=0.1,
        hidden_continuous_size=16,
        output_size=7,  # Quantile output
        loss=QuantileLoss(),
        log_interval=10,
        reduce_on_plateau_patience=4,
    )
    print(f"Number of parameters in network: {model.size()/1e3:.1f}k")

    # 4. Trainer initialisieren
    early_stop_callback = EarlyStopping(
        monitor="val_loss",
        min_delta=1e-4,
        patience=10,
        verbose=False,
        mode="min"
    )

    lr_logger = LearningRateMonitor()

    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath="checkpoints",
        filename="tft-best-{epoch:02d}-{val_loss:.2f}",
        save_top_k=1,
        mode="min",
    )

    logger = TensorBoardLogger(save_dir="logs")

    trainer = Trainer(
        max_epochs=50,
        accelerator="gpu" if torch.cuda.is_available() else "cpu",
        devices=1,
        gradient_clip_val=0.1,
        log_every_n_steps=10,
        callbacks=[lr_logger, checkpoint_callback],
        logger=logger,
    )

    # 5. Training starten
    print("Starte Training...")
    trainer.fit(
        model,
        train_dataloaders=datamodule.train_dataloader(),
        val_dataloaders=datamodule.val_dataloader(),
    )

    # 6. Bestes Modell laden
    best_model_path = checkpoint_callback.best_model_path
    print(f"Bestes Modell: {best_model_path}")

    best_model = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

    # Evaluation auf Validierungsdaten
    val_dataloader = datamodule.val_dataloader()
    
    # Vorhersagen und tatsächliche Werte sammeln
    actuals = torch.cat([y[0] for x, y in iter(val_dataloader)])
    predictions = best_model.predict(val_dataloader)

    # Alle Tensoren auf dasselbe Gerät bringen
    actuals = actuals.to(predictions.device)

    # Metriken initialisieren
    from pytorch_forecasting.metrics import MAE, RMSE
    mae_metric = MAE().to(predictions.device)
    rmse_metric = RMSE().to(predictions.device)

    # Metriken berechnen
    mae = mae_metric(predictions, actuals)
    rmse = rmse_metric(predictions, actuals)

    print("\nModell-Evaluation auf Validierungsdaten:")
    print(f"MAE: {mae:.2f} Tage")
    print(f"RMSE: {rmse:.2f} Tage")

    # Speichere das Trainingsdataset
    torch.save(datamodule.training_dataset, "/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/backend/ml_pipe/models/tft/saved_models/training_dataset.pt")
    print("Trainingsdataset gespeichert unter: training_dataset.pt")

    # 9. Modell speichern
    print("Speichere Modell...")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_path = f"./saved_models/tft_{timestamp}.ckpt"
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    trainer.save_checkpoint(model_path)
    print(f"Modell gespeichert unter: {model_path}")

    return best_model, trainer

# Pipeline ausführen
best_model, trainer = run_pipeline()

Starte Karriere-Vorhersage Pipeline...
Lade Daten aus MongoDB...
Train: 50294 Val: 24954
Training: 50294 Kandidaten
Validation: 24954 Kandidaten
Initialisiere TFT Modell...


/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3

Number of parameters in network: 100.2k
Starte Training...
Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:419: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:419: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Epoch 49: 100%|██████████| 98/98 [00:27<00:00,  3.61it/s, v_num=28, train_loss_step=52.30, val_loss=122.0, train_loss_epoch=52.30]

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 98/98 [00:27<00:00,  3.60it/s, v_num=28, train_loss_step=52.30, val_loss=122.0, train_loss_epoch=52.30]
Bestes Modell: /Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/backend/ml_pipe/models/tft/checkpoints/tft-best-epoch=01-val_loss=99.52.ckpt


/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:419: Consider setting `persistent_workers=True` in 'predict_dataloader' to speed up the dataloader worker initialization.



Modell-Evaluation auf Validierungsdaten:
MAE: 190.76 Tage
RMSE: 368.66 Tage
Trainingsdataset gespeichert unter: training_dataset.pt
Speichere Modell...
Modell gespeichert unter: ./saved_models/tft_20250515_163355.ckpt


In [5]:
import torch
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
import json
from rapidfuzz import process, fuzz

# 1. Trainingsdataset laden
training_dataset = torch.load("/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/backend/ml_pipe/models/tft/saved_models/training_dataset.pt")

# 2. Position Mapping laden
with open("/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/backend/ml_pipe/data/featureEngineering/position_level.json", "r") as f:
    position_entries = json.load(f)

# Dict: Position (klein) -> (Level, Branche)
position_map = {entry["position"].lower(): (entry["level"], entry["branche"]) for entry in position_entries}
all_positions = list(position_map.keys())

# Level-Zahl auf String
level_map = {
    1: "Entry", 2: "Junior", 3: "Professional", 4: "Senior", 5: "Lead", 6: "Manager", 7: "Director", 8: "C-Level"
}

def map_position_fuzzy(pos, threshold=65):
    pos_clean = pos.lower().strip()
    if pos_clean in position_map:
        level, branche = position_map[pos_clean]
        match = pos_clean
        score = 100  # Maximale Ähnlichkeit, da exakter Treffer
    else:
        match, score, _ = process.extractOne(pos_clean, all_positions, scorer=fuzz.ratio)
        if score >= threshold:
            level, branche = position_map[match]
        else:
            return (None, None, None)
    # Level-Zahl auf String mappen
    level_str = level_map.get(level, str(level)) if isinstance(level, int) else str(level)
    return (match, level_str, branche)

# 3. LinkedIn-Profil verarbeiten
profile_data = {
    "workExperience": [
        {
            "position": "Senior Software Engineer",
            "company": "Tech Corp",
            "startDate": "01/01/2023",
            "endDate": "Present"
        },
        {
            "position": "Software Engineer",
            "company": "Startup GmbH",
            "startDate": "01/01/2021",
            "endDate": "31/12/2022"
        },
        {
            "position": "Junior Developer",
            "company": "IT Solutions",
            "startDate": "01/01/2020",
            "endDate": "31/12/2020"
        }
    ]
}

# 4. Daten für Vorhersage vorbereiten
time_points = []
experiences = sorted(
    profile_data['workExperience'],
    key=lambda x: datetime.strptime(x['startDate'], "%d/%m/%Y"),
    reverse=True
)

for i, exp in enumerate(experiences):
    start_date = datetime.strptime(exp['startDate'], "%d/%m/%Y")
    end_date = datetime.now() if exp['endDate'] == "Present" else datetime.strptime(exp['endDate'], "%d/%m/%Y")
    
    # Position mappen
    mapped_pos, level_str, branche = map_position_fuzzy(exp['position'])
    if mapped_pos is None:
        print(f"Warnung: Position '{exp['position']}' konnte nicht gemappt werden")
        continue
    
    # Erstelle 8 Zeitpunkte pro Position
    for j in range(8):
        timepoint = start_date + timedelta(days=int((end_date - start_date).days * (j + 1) / 8))
        
        # Berechne Features
        berufserfahrung = (timepoint - datetime.strptime(experiences[-1]['startDate'], "%d/%m/%Y")).days
        anzahl_wechsel = sum(1 for e in experiences if e['endDate'] != "Present" and datetime.strptime(e['endDate'], "%d/%m/%Y") <= timepoint)
        anzahl_jobs = sum(1 for e in experiences if datetime.strptime(e['startDate'], "%d/%m/%Y") <= timepoint)
        
        # Berechne durchschnittliche Jobdauer
        dauer_liste = []
        for e in experiences:
            s = datetime.strptime(e['startDate'], "%d/%m/%Y")
            e_date = datetime.now() if e['endDate'] == "Present" else datetime.strptime(e['endDate'], "%d/%m/%Y")
            if s < e_date and e_date <= timepoint:
                dauer_liste.append((e_date - s).days)
        durchschnittsdauer = sum(dauer_liste) / len(dauer_liste) if dauer_liste else 0
        
        # Erstelle DataFrame-Zeile
        row = {
            "profile_id": "predict_profile",
            "time_idx": i * 8 + j,
            "label": 0,  # Wird vorhergesagt
            "berufserfahrung_bis_zeitpunkt": berufserfahrung,
            "anzahl_wechsel_bisher": anzahl_wechsel,
            "anzahl_jobs_bisher": anzahl_jobs,
            "durchschnittsdauer_bisheriger_jobs": durchschnittsdauer,
            "zeitpunkt": timepoint.timestamp(),
            "aktuelle_position": exp['position'],
            "mapped_position": mapped_pos,
            "level_str": level_str,
            "branche": branche,
            "weekday": timepoint.weekday(),
            "weekday_sin": np.sin(2 * np.pi * timepoint.weekday() / 7),
            "weekday_cos": np.cos(2 * np.pi * timepoint.weekday() / 7),
            "month": timepoint.month,
            "month_sin": np.sin(2 * np.pi * timepoint.month / 12),
            "month_cos": np.cos(2 * np.pi * timepoint.month / 12)
        }
        time_points.append(row)

# 5. DataFrame erstellen
df = pd.DataFrame(time_points)

# 6. Vorhersage-Dataset erstellen
prediction_dataset = TimeSeriesDataSet.from_dataset(
    training_dataset,
    df,
    predict=True,
    stop_randomization=True,
    target_normalizer=None 
)

# 7. Modell laden
model = TemporalFusionTransformer.load_from_checkpoint("/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/backend/ml_pipe/models/tft/saved_models/tft_20250515_153035.ckpt")

# 8. Vorhersage machen
dataloader = prediction_dataset.to_dataloader(train=False, batch_size=1)
predictions = model.predict(dataloader)

# 9. Ergebnisse ausgeben
for i, pred in enumerate(predictions):
    print(f"Tag {i+1}: {float(pred[0]):.2f} Tage bis zum nächsten Jobwechsel")

  training_dataset = torch.load("/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/backend/ml_pipe/models/tft/saved_models/training_dataset.pt")
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck.

Tag 1: 0.50 Tage bis zum nächsten Jobwechsel


In [2]:
from predict import read_linkedin_profile, predict_next_job_change

# LinkedIn-Profil als String
linkedin_data_str = r'''{"skills":["Multitasking","Kundenservice","Interpersonelle Fähigkeiten","Kaltakquise","Hubspot CRM","Customer-Relationship-Management (CRM)"],"firstName":"Darya","lastName":"Chernuska","profilePicture":"https://media.licdn.com/dms/image/v2/D4E03AQE0yuZ6cg8f4A/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1670856025914?e=1749686400&v=beta&t=jI1mkiVnkD7teWPncsg8QtKAwZKB-az53_4ny7C7XvI","linkedinProfile":"https://www.linkedin.com/in/daryachernuska","education":[{"duration":"01/01/2017 - 01/01/2022","institution":"Ludwig-Maximilians-Universität München","endDate":"01/01/2022","degree":"","startDate":"01/01/2017"}],"providerId":"ACoAAD0rz_IBI0XfqqBDUscwHoFwuOqJa_c5T2I","workExperience":[{"duration":"01/03/2023 - Present","endDate":"Present","companyInformation":{"employee_count":515,"activities":["Telefonie","Internet","Vernetzung","Rechenzentrum","Glasfaser","Highspeed-Internet","Business-Internet","SIP-Trunk","Cloud-Lösungen","Connect-Cloud","Connect-LAN","Premium IP","Internet + Telefonie","Lösungen für Geschäftskunden"],"name":"M-net Telekommunikations GmbH","description":"Als regionaler Telekommunikationsanbieter versorgt M-net große Teile Bayerns, den Großraum Ulm sowie weite Teile des hessischen Landkreises Main-Kinzig mit zukunftssicherer Kommunikationstechnologie.","industry":["Telecommunications"]},"description":"","company":"M-net Telekommunikations GmbH","location":"München, Bayern, Deutschland · Hybrid","position":"Disponentin","startDate":"01/03/2023"},{"duration":"01/08/2022 - 01/12/2022","endDate":"01/12/2022","companyInformation":{"employee_count":2048,"activities":["HR Software","HR Management","Recruitung","Employee Management","Applicant Tracking System","Employee Selfservice","Time-Off Management","Cloud Software","Onboarding and Offboarding","HR Reporting","Performance Management","Payroll","HR","HR Tech","Human Resources"],"name":"Personio","description":"Personio's Intelligent HR Platform helps small and medium-sized organizations unlock the power of people by making complicated, time-consuming tasks simple and efficient.","industry":["Software Development"]},"description":"","company":"Personio","location":"München, Bayern, Deutschland","position":"Sales Development Representative","startDate":"01/08/2022"},{"duration":"01/11/2017 - 01/07/2022","endDate":"01/07/2022","companyInformation":{"employee_count":662,"activities":["Scandinavian design","Furniture","Design","Product design","Retail","Web","Steelcase partner","Wholesale","B2B","Contract sales","Online","Digital","Creativity"],"name":"BOLIA","description":"Our collection is inspired by the vivid Scandinavian nature","industry":["Retail Furniture and Home Furnishings"]},"description":"","company":"Bolia.com","location":"München, Bayern, Deutschland","position":"Sales Consultant","startDate":"01/11/2017"},{"duration":"01/10/2015 - 01/11/2017","endDate":"01/11/2017","companyInformation":{},"description":"","company":"Pepperminds","location":"München, Bayern, Deutschland","position":"Senior Team Lead","startDate":"01/10/2015"}],"location":"Munich, Bavaria, Germany","certifications":[],"headline":"-","languageSkills":{}}'''

# Profil einlesen
profile_data = read_linkedin_profile(linkedin_data_str)

if profile_data:
    # Vorhersage machen
    predictions = predict_next_job_change(profile_data)
    
    if predictions:
        # Ergebnisse ausgeben
        for pred in predictions:
            print(pred)

  training_dataset = torch.load("/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/backend/ml_pipe/models/tft/saved_models/training_dataset.pt")


Warnung: Position 'Disponentin' konnte nicht gemappt werden


/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_work


Vorhersage für Tag 1:
  Median: 0.5 Tage (0.0 Monate)
  Untere Schranke: 0.5 Tage (0.0 Monate)
  Obere Schranke: 0.6 Tage (0.0 Monate)
  Unsicherheit: 0.1 Tage
  Interpretation: Sehr wahrscheinlicher Jobwechsel innerhalb des nächsten Monats
{'tag': 1, 'vorhersage': {'median': 0.5074630379676819, 'untere_schranke': 0.4871053397655487, 'obere_schranke': 0.5656402111053467, 'unsicherheit': 0.07853487133979797}}
