### GRU PIPELINE

In [1]:
from sklearn.metrics import f1_score
from pytorch_lightning import Trainer
from datetime import datetime
import torch
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pytorch_lightning as pl

import sys
sys.path.insert(0, '/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/')

from backend.ml_pipe.data.database.mongodb import MongoDb
from backend.ml_pipe.data.dataModule.gru.dataModule import DataModule
from backend.ml_pipe.models.gru.model import GRUModel


  from .autonotebook import tqdm as notebook_tqdm


In [None]:

def run_pipeline():
    # Datenquelle initialisieren
    mongo = MongoDb()
    datamodule = DataModule(mongo, batch_size=32)
    datamodule.setup()
    print(datamodule.train_dataloader())
    
    # Modell mit neuen Parametern initialisieren
    model = GRUModel(
        seq_input_size=8,      # Features pro Zeitschritt (level, branche, duration_months, etc.)
        hidden_size=128,       # Größerer Hidden Layer für komplexere Muster
        num_layers=2,          # 2 GRU-Schichten
        dropout=0.3,           # Dropout gegen Overfitting
        lr=0.001              # Lernrate
    )

    # Trainer Setup
    trainer = Trainer(
        max_epochs=10,                    # Mehr Epochen für besseres Lernen
        enable_checkpointing=True,
        logger=True,
        enable_model_summary=True,
        log_every_n_steps=2,
        accelerator="auto",
        devices="auto",
        callbacks=[
            pl.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=5,
                mode='min'
            ),
            pl.callbacks.ModelCheckpoint(
                monitor='val_loss',
                mode='min',
                save_top_k=3
            )
        ]
    )

    # Training
    trainer.fit(model, datamodule=datamodule)

    # Testdaten durchlaufen
    trainer.test(model, datamodule=datamodule)

    model.eval()
    model.freeze()
    
    # Modell speichern
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_path = f"saved_models/gru_model_{timestamp}.pt"

    # Ordner anlegen (falls nicht vorhanden) und Modell speichern
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    torch.save(model.state_dict(), model_path)

    print(f"Modell gespeichert unter: {model_path}")

# Pipeline ausführen
run_pipeline()

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | gru     | GRU     | 151 K  | train
1 | fc_out  | Linear  | 129    | train
2 | loss_fn | MSELoss | 0      | train
--------------------------------------------
151 K     Trainable params
0         Non-trainable params
151 K     Total params
0.607     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode



Datensatz aufgeteilt in:
- Training: 83686 Einträge
- Validierung: 17932 Einträge
- Test: 17934 Einträge
<torch.utils.data.dataloader.DataLoader object at 0x33409fe30>
                                                                           

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:420: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Epoch 6: 100%|██████████| 2616/2616 [00:48<00:00, 53.62it/s, v_num=11, train_loss=8.8e+4, val_loss=2.66e+5] 


/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:420: Consider setting `persistent_workers=True` in 'test_dataloader' to speed up the dataloader worker initialization.


Testing DataLoader 0: 100%|██████████| 561/561 [00:01<00:00, 415.58it/s]


Modell gespeichert unter: saved_models/gru_model_20250520_180845.pt


In [1]:
import sys
import json
sys.path.insert(0, '/Users/florianrunkel/Documents/02_Uni/04_Masterarbeit/masterthesis/')

from backend.ml_pipe.models.gru.predict import predict

# LinkedIn Profildaten als Raw-String (beachte das 'r' vor dem String)
linkedin_data_str = r'''{"skills":["Multitasking","Kundenservice","Interpersonelle Fähigkeiten","Kaltakquise","Hubspot CRM","Customer-Relationship-Management (CRM)"],"firstName":"Darya","lastName":"Chernuska","profilePicture":"https://media.licdn.com/dms/image/v2/D4E03AQE0yuZ6cg8f4A/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1670856025914?e=1749686400&v=beta&t=jI1mkiVnkD7teWPncsg8QtKAwZKB-az53_4ny7C7XvI","linkedinProfile":"https://www.linkedin.com/in/daryachernuska","education":[{"duration":"01/01/2017 - 01/01/2022","institution":"Ludwig-Maximilians-Universität München","endDate":"01/01/2022","degree":"","startDate":"01/01/2017"}],"providerId":"ACoAAD0rz_IBI0XfqqBDUscwHoFwuOqJa_c5T2I","workExperience":[{"duration":"01/03/2023 - Present","endDate":"Present","companyInformation":{"employee_count":515,"activities":["Telefonie","Internet","Vernetzung","Rechenzentrum","Glasfaser","Highspeed-Internet","Business-Internet","SIP-Trunk","Cloud-Lösungen","Connect-Cloud","Connect-LAN","Premium IP","Internet + Telefonie","Lösungen für Geschäftskunden"],"name":"M-net Telekommunikations GmbH","description":"Als regionaler Telekommunikationsanbieter versorgt M-net große Teile Bayerns, den Großraum Ulm sowie weite Teile des hessischen Landkreises Main-Kinzig mit zukunftssicherer Kommunikationstechnologie.","industry":["Telecommunications"]},"description":"","company":"M-net Telekommunikations GmbH","location":"München, Bayern, Deutschland · Hybrid","position":"Disponentin","startDate":"01/03/2023"},{"duration":"01/08/2022 - 01/12/2022","endDate":"01/12/2022","companyInformation":{"employee_count":2048,"activities":["HR Software","HR Management","Recruitung","Employee Management","Applicant Tracking System","Employee Selfservice","Time-Off Management","Cloud Software","Onboarding and Offboarding","HR Reporting","Performance Management","Payroll","HR","HR Tech","Human Resources"],"name":"Personio","description":"Personio's Intelligent HR Platform helps small and medium-sized organizations unlock the power of people by making complicated, time-consuming tasks simple and efficient.","industry":["Software Development"]},"description":"","company":"Personio","location":"München, Bayern, Deutschland","position":"Sales Development Representative","startDate":"01/08/2022"},{"duration":"01/11/2017 - 01/07/2022","endDate":"01/07/2022","companyInformation":{"employee_count":662,"activities":["Scandinavian design","Furniture","Design","Product design","Retail","Web","Steelcase partner","Wholesale","B2B","Contract sales","Online","Digital","Creativity"],"name":"BOLIA","description":"Our collection is inspired by the vivid Scandinavian nature","industry":["Retail Furniture and Home Furnishings"]},"description":"","company":"Bolia.com","location":"München, Bayern, Deutschland","position":"Sales Consultant","startDate":"01/11/2017"},{"duration":"01/10/2015 - 01/11/2017","endDate":"01/11/2017","companyInformation":{},"description":"","company":"Pepperminds","location":"München, Bayern, Deutschland","position":"Senior Team Lead","startDate":"01/10/2015"}],"location":"Munich, Bavaria, Germany","certifications":[],"headline":"-","languageSkills":{}}'''

try:
    # JSON-String in Dictionary umwandeln
    profile_data = json.loads(linkedin_data_str)
    
    # Vorhersage machen
    result = predict(profile_data, with_llm_explanation=True)
    print("\nVorhersageergebnis:")
    print(f"Tage bis zum Wechsel: {result['confidence']}")
    print(f"Status: {result['status']}")
    print("\nEmpfehlungen:")
    for rec in result['recommendations']:
        print(f"- {rec}")
    print("\nFeature-Wichtigkeiten:")
    for exp in result['explanations']:
        print(f"- {exp['feature']}: {exp['impact_percentage']:.1f}% - {exp['description']}")
        
except json.JSONDecodeError as e:
    print(f"JSON Fehler: {str(e)}")
    print(f"Fehler an Position: {e.pos}")
    print(f"Zeile: {e.lineno}, Spalte: {e.colno}")

  from .autonotebook import tqdm as notebook_tqdm


{'skills': ['Multitasking', 'Kundenservice', 'Interpersonelle Fähigkeiten', 'Kaltakquise', 'Hubspot CRM', 'Customer-Relationship-Management (CRM)'], 'firstName': 'Darya', 'lastName': 'Chernuska', 'profilePicture': 'https://media.licdn.com/dms/image/v2/D4E03AQE0yuZ6cg8f4A/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1670856025914?e=1749686400&v=beta&t=jI1mkiVnkD7teWPncsg8QtKAwZKB-az53_4ny7C7XvI', 'linkedinProfile': 'https://www.linkedin.com/in/daryachernuska', 'education': [{'duration': '01/01/2017 - 01/01/2022', 'institution': 'Ludwig-Maximilians-Universität München', 'endDate': '01/01/2022', 'degree': '', 'startDate': '01/01/2017'}], 'providerId': 'ACoAAD0rz_IBI0XfqqBDUscwHoFwuOqJa_c5T2I', 'workExperience': [{'duration': '01/03/2023 - Present', 'endDate': 'Present', 'companyInformation': {'employee_count': 515, 'activities': ['Telefonie', 'Internet', 'Vernetzung', 'Rechenzentrum', 'Glasfaser', 'Highspeed-Internet', 'Business-Internet', 'SIP-Trunk', 'Cloud-

  checkpoint = torch.load(model_path, map_location=torch.device('cpu'))



Vorhersageergebnis:
Tage bis zum Wechsel: 240.94363403320312
Status: langfristig

Empfehlungen:
- Jobwechsel in weiterer Zukunft (> 6 Monate)

Feature-Wichtigkeiten:
- Berufserfahrung: 100.0% - Die Gesamtberufserfahrung bis zum aktuellen Zeitpunkt


