### GRU PIPELINE

In [1]:
from sklearn.metrics import f1_score
from pytorch_lightning import Trainer
from datetime import datetime
import torch
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

import sys
# NOTE(review): machine-specific absolute path. Presumably required so that the
# `backend` package imported below can be resolved; the notebook will not run
# on another machine unless this directory exists there. Consider deriving it
# from a relative path or an environment variable.
sys.path.insert(0, '/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/')

from backend.ml_pipe.data.database.mongodb import MongoDb
from backend.ml_pipe.data.dataModule.dataModule import DataModule
from backend.ml_pipe.models.gru.model import GRUModel

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def run_pipeline(
    batch_size=32,
    input_size=6,
    hidden_size=64,
    num_layers=3,
    dropout=0.3,
    lr=0.001,
    max_epochs=10,
    save_dir="saved_models",
    seed=None,
):
    """Train, test and save a GRU model on the MongoDB-backed dataset.

    All arguments default to the values that were previously hard-coded, so
    calling ``run_pipeline()`` without arguments behaves as before (apart from
    additionally returning the checkpoint path).

    Args:
        batch_size: Forwarded to ``DataModule(mongo, batch_size=...)``.
        input_size: Forwarded to ``GRUModel(input_size=...)``. The notebook's
            own output shows a ``RuntimeError`` from the GRU when this does
            not match the last dimension of the batches, so keep it equal to
            the number of per-timestep features the data module yields.
        hidden_size: Forwarded to ``GRUModel(hidden_size=...)``.
        num_layers: Forwarded to ``GRUModel(num_layers=...)``.
        dropout: Forwarded to ``GRUModel(dropout=...)``.
        lr: Forwarded to ``GRUModel(lr=...)``.
        max_epochs: Forwarded to ``Trainer(max_epochs=...)``.
        save_dir: Directory in which the ``state_dict`` file is written; it is
            created if it does not exist.
        seed: Optional integer. When given, ``seed_everything`` is called
            before any data or model object is created so that repeated runs
            are comparable. ``None`` (default) leaves seeding untouched.

    Returns:
        The path of the saved ``state_dict`` file as a string.
    """
    if seed is not None:
        # Imported locally so the notebook's import cell does not need to change.
        from pytorch_lightning import seed_everything

        seed_everything(seed, workers=True)

    # Initialise the data source.
    mongo = MongoDb()
    # Original note: uses the same collection as the TFT pipeline.
    datamodule = DataModule(mongo, batch_size=batch_size)
    datamodule.setup()

    model = GRUModel(
        input_size=input_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
        lr=lr,
    )

    # Trainer setup.
    trainer = Trainer(
        max_epochs=max_epochs,
        enable_checkpointing=True,
        logger=True,
        enable_model_summary=True,
        log_every_n_steps=2,
        accelerator="auto",
        devices="auto",
    )

    # Training.
    trainer.fit(model, datamodule=datamodule)

    # Evaluate on the test split.
    trainer.test(model, datamodule=datamodule)

    # Put the model into inference state before exporting its weights.
    model.eval()
    model.freeze()

    # Create the target directory (if missing) and save only the weights.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    os.makedirs(save_dir, exist_ok=True)
    model_path = os.path.join(save_dir, f"gru_model_{timestamp}.pt")
    torch.save(model.state_dict(), model_path)

    print(f"Modell gespeichert unter: {model_path}")
    return model_path

# Execute the pipeline: fits the model, runs the test loop and writes the
# weights to disk (see run_pipeline above).
run_pipeline()

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Datensatz aufgeteilt in:
- Training: 385 Einträge
- Validierung: 82 Einträge
- Test: 83 Einträge

Feature-Dimensionen:
- Sequenz-Features pro Zeitschritt: 6
- Globale Features: 3
<class 'list'>
[{'_id': '681488237d298233cb653a6b', 'features': {'total_positions': 4, 'career_sequence': [{'level': 6, 'branche': 1, 'duration_months': 15, 'time_since_start': 15, 'time_until_end': 0, 'is_current': 1}, {'level': 3, 'branche': 1, 'duration_months': 23, 'time_since_start': 39, 'time_until_end': 16, 'is_current': 0}, {'level': 6, 'branche': 1, 'duration_months': 23, 'time_since_start': 63, 'time_until_end': 40, 'is_current': 0}, {'level': 2, 'branche': 1, 'duration_months': 42, 'time_since_start': 106, 'time_until_end': 64, 'is_current': 0}], 'company_changes': 2, 'total_experience_years': 8.58, 'location_changes': 0, 'unique_locations': 1, 'avg_position_duration_months': 25.75, 'highest_degree': 3, 'current_position': {'level': 2, 'branche': 1, 'duration_months': 42, 'time_since_start': 106}, 


  | Name    | Type              | Params | Mode 
------------------------------------------------------
0 | gru     | GRU               | 136 K  | train
1 | dropout | Dropout           | 0      | train
2 | fc      | Linear            | 65     | train
3 | loss_fn | BCEWithLogitsLoss | 0      | train
------------------------------------------------------
136 K     Trainable params
0         Non-trainable params
136 K     Total params
0.546     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:419: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: input.size(-1) must be equal to input_size. Expected 385, got 6

In [14]:
def predict(input_sequence, model_path="saved_models/gru_model_20250415_113500.pt",
            input_size=3, hidden_size=64, num_layers=4, dropout=0.2, lr=0.01):
    """Score one career sequence with a saved GRU model.

    Args:
        input_sequence: Nested list of numbers, one inner list per timestep.
            The inline comment of the original code names the three default
            features as [duration, level, branche].
        model_path: Path to a file written with ``torch.save``. It may hold a
            plain ``state_dict`` or a dict with a ``'model_state_dict'`` entry.
        input_size: Features per timestep; must match the checkpoint.
        hidden_size: ``GRUModel`` hidden size; must match the checkpoint.
        num_layers: Number of GRU layers; must match the checkpoint.
        dropout: Forwarded to ``GRUModel``.
        lr: Forwarded to ``GRUModel``.

    The architecture defaults are the values that were previously hard-coded
    here. They differ from what ``run_pipeline`` currently trains
    (input_size=6, num_layers=3), so pass matching values when loading a
    checkpoint produced by that pipeline.

    Returns:
        ``(pred_value, status)``: the model output as a float and a German
        label derived from fixed thresholds (0.7 / 0.5 / 0.3).

    Raises:
        ValueError: If ``input_sequence`` is empty or its last dimension does
            not equal ``input_size``.
    """
    # Validate the input before touching the file system so that shape
    # problems are reported clearly instead of as an opaque GRU RuntimeError.
    input_tensor = torch.tensor(input_sequence, dtype=torch.float32)
    if input_tensor.numel() == 0:
        raise ValueError("input_sequence must contain at least one timestep")
    if input_tensor.size(-1) != input_size:
        raise ValueError(
            f"expected {input_size} features per timestep, got {input_tensor.size(-1)}"
        )
    input_tensor = input_tensor.unsqueeze(0)  # add the batch dimension

    # Build the model with the architecture the checkpoint was trained with.
    model = GRUModel(
        input_size=input_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
        lr=lr,
    )

    # map_location makes a checkpoint saved on an accelerator (the training
    # log shows MPS) loadable anywhere; weights_only avoids unpickling
    # arbitrary objects and silences torch's warning about the old default.
    checkpoint = torch.load(model_path, map_location="cpu", weights_only=True)

    # Accept either a wrapped checkpoint or a bare state_dict.
    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
        state_dict = checkpoint['model_state_dict']
    else:
        state_dict = checkpoint
    model.load_state_dict(state_dict)

    model.eval()

    with torch.no_grad():
        pred = model(input_tensor)

    # NOTE(review): the training summary in this notebook lists
    # BCEWithLogitsLoss. If GRUModel.forward returns raw logits, this value is
    # not a probability and torch.sigmoid should be applied before the
    # thresholds below - confirm against GRUModel.forward.
    pred_value = float(pred.item())

    return pred_value, _interpret_switch_probability(pred_value)


def _interpret_switch_probability(pred_value):
    """Map a model score to the German readiness label returned by predict."""
    if pred_value > 0.7:
        return "sehr wahrscheinlich wechselbereit"
    if pred_value > 0.5:
        return "wahrscheinlich wechselbereit"
    if pred_value > 0.3:
        return "möglicherweise wechselbereit"
    return "bleibt wahrscheinlich"

In [20]:
# Smoke test for an SDR -> AE (Sales Development Representative to Account
# Executive) transition.
prediction_input = [
    # Sales Development Representative, level 1, Sales (12).
    # NOTE(review): predict() documents the feature order as
    # [duration, level, branche], which would make 64 the duration, but the
    # original note here said "12 months" - confirm which is intended.
    # NOTE(review): the original comment also described a second position
    # (Account Executive: 1 month, level 2, Sales (12)) that has no matching
    # row in this list - presumably removed; confirm.
    [64, 1, 12],
]

# Uses predict()'s default checkpoint path and default architecture.
prob, status = predict(prediction_input)
print(f"Wechselwahrscheinlichkeit: {prob:.2f} → Einschätzung: {status}")

Wechselwahrscheinlichkeit: 0.51 → Einschätzung: wahrscheinlich wechselbereit


  checkpoint = torch.load(model_path)
