In [1]:
# notebooks/prophet_gridsearch_parallel_optimized.ipynb

import pandas as pd
import itertools
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt
from dateutil.relativedelta import relativedelta
from joblib import Parallel, delayed
import multiprocessing
from tqdm import tqdm
from lightgbm import LGBMRegressor
import numpy as np
import os
from random import seed, sample

# --------------------------------------
# 1. Load data
# --------------------------------------
# Main table: parse the index as datetimes, convert it to daily frequency
# with asfreq and mirror the index into a regular "ds" column.
df = pd.read_parquet("../data/processed/sickness_table.parquet")
_parsed_index = pd.to_datetime(df.index)
df.index = _parsed_index
df = df.asfreq("D")
df = df.assign(ds=df.index)

# Holiday table with the column names passed to Prophet(holidays=...) further
# down; every holiday gets a window of two days before and two days after.
_holidays_raw = pd.read_parquet("../data/processed/holiday.parquet")
holiday_df = pd.DataFrame(
    {
        "holiday": _holidays_raw["name"],
        "ds": _holidays_raw["date"],
        "lower_window": -2,
        "upper_window": 2,
    }
)

# --------------------------------------
# 2. Prepare the feature grid
# --------------------------------------
# Maps each feature-group name to the feature columns that belong to it.
grid = dict(
    season=["season_1", "season_2", "season_3"],
    quarter=["quarter_1", "quarter_2", "quarter_3"],
    month_start_end=["is_month_start", "is_month_end"],
    cyclic_encoding_weekday=["weekday_sin", "weekday_cos"],
    cyclic_encoding_month=["month_sin", "month_cos"],
    weekend=["is_weekend"],
    holiday=["holiday", "holiday_before", "holiday_after"],
    holiday_sig=[
        "holiday_significant",
        "holiday_significant_before",
        "holiday_significant_after",
    ],
    holiday_window=["holiday_window"],
    holiday_effects=[
        "holiday_cluster",
        "holiday_density_7d",
        "holiday_on_monday_or_friday",
        "holiday_cluster_and_bruecke",
        "holiday_and_weekday_cat",
    ],
    lag_target_short=["lag_target_bereitschaft_1", "lag_target_bereitschaft_2"],
    roll_target_short=[
        "roll_target_bereitschaft_mean_7",
        "roll_target_bereitschaft_std_7",
        "roll_target_bereitschaft_min_7",
        "roll_target_bereitschaft_max_7",
    ],
)

keys = list(grid)

# Families of mutually exclusive groups: a combination may contain at most
# one member of each family.
holiday_groups = {"holiday", "holiday_sig", "holiday_window"}
season_groups = {"season", "quarter"}
week_groups = {"cyclic_encoding_weekday", "weekend"}
_exclusive_families = (holiday_groups, season_groups, week_groups)

# Walk through all non-empty subsets of the group names, smallest first.
_candidate_subsets = itertools.chain.from_iterable(
    itertools.combinations(keys, size) for size in range(1, len(keys) + 1)
)

all_combinations = []
for combo in _candidate_subsets:
    chosen = set(combo)
    if any(len(chosen & family) > 1 for family in _exclusive_families):
        continue  # more than one member of an exclusive family -> skip

    all_combinations.append({
        # The label is sorted alphabetically; the feature list keeps grid order.
        "groups": "+".join(sorted(combo)),
        "features": [column for name in combo for column in grid[name]],
    })

df_combinations = pd.DataFrame(all_combinations)

# --------------------------------------
# 3. Parameter grid & CV splits
# --------------------------------------

# Part 1: fixed combinations - a baseline plus four variants that each change
# exactly one parameter of the baseline.
_baseline_params = {
    "learning_rate": 0.1,
    "max_depth": 8,
    "n_estimators": 300,
    "num_leaves": 15,
    "reg_alpha": 0.1,
    "reg_lambda": 0.0,
}
fixed_params = [
    {**_baseline_params, **override}
    for override in (
        {},
        {"max_depth": 6},
        {"num_leaves": 10},
        {"learning_rate": 0.05},
        {"reg_lambda": 0.1},
    )
]

# Part 2: ten combinations drawn at random (fixed seed) from a wider grid.
param_grid_random = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.15],
    max_depth=[4, 6, 8],
    n_estimators=[200, 300],
    num_leaves=[10, 15, 25],
    reg_alpha=[0.0, 0.1],
    reg_lambda=[0.0, 0.1],
)
param_names = list(param_grid_random)
full_space = list(itertools.product(*param_grid_random.values()))
seed(42)
random_10 = sample(full_space, 10)
random_params = [dict(zip(param_names, values)) for values in random_10]

# Part 3: extreme combinations that span the range of the search space.
# Each tuple follows the order of param_names.
extreme_params = [
    dict(zip(param_names, values))
    for values in (
        (0.15, 4, 100, 10, 0.0, 0.0),
        (0.1, 10, 300, 50, 0.3, 0.0),
        (0.05, 4, 200, 10, 0.2, 0.2),
        (0.01, 8, 400, 31, 0.1, 0.1),
        (0.1, 2, 100, 7, 0.0, 0.0),
    )
]

# Final list. Only the five fixed combinations are active; random_params and
# extreme_params are built above but not appended (all three parts together
# would yield 20 combinations).
all_params = fixed_params

# CV folds: each fold trains on everything up to the 15th of a month and is
# tested on the complete following calendar month. The first cut-off lies at
# least min_train_days after the first observation.
min_train_days = 730
split_start = df["ds"].min() + pd.Timedelta(days=min_train_days)
split_end = df["ds"].max()

_one_month = relativedelta(months=1)

# First cut-off: the 15th of split_start's month, or of the month after it
# when split_start already lies past the 15th.
current = pd.Timestamp(year=split_start.year, month=split_start.month, day=15)
if split_start.day > 15:
    current = current + _one_month

splits = []
while True:
    next_cutoff = current + _one_month
    if not (next_cutoff <= split_end):
        break

    train_end = current
    test_start = next_cutoff.replace(day=1)
    test_end = test_start + pd.offsets.MonthEnd(0)
    # Only keep folds whose test month is fully covered by the data.
    if test_end <= split_end:
        splits.append((train_end, test_start, test_end))

    current = next_cutoff

# --------------------------------------
# 5. Pre-compute the Prophet forecasts
# --------------------------------------
# Memo for predict_calls, keyed by the training cut-off date.
prophet_cache = dict()

# Copy of the data with the "calls" column renamed to "y", the column that
# predict_calls selects as the fit target.
df_calls_input = df.rename(columns={"calls": "y"})

# Stored Prophet search result; the first row is used as the model
# configuration (presumably the file is sorted best-first - TODO confirm).
best_models = pd.read_parquet("../models/prophet/best_params.parquet")
best_model = best_models.iloc[0]

def predict_calls(train_end):
    """Return the Prophet forecast of ``calls`` for the month after *train_end*.

    A Prophet model configured from the module-level ``best_model`` row is
    fitted on all rows of ``df_calls_input`` up to and including ``train_end``
    and then predicts every day from the first to the last day of the
    following month. Results are memoised in ``prophet_cache`` so that each
    cut-off date is fitted only once.

    Args:
        train_end: Last timestamp (inclusive) of the training window.

    Returns:
        DataFrame with the columns ``ds`` and ``calls_pred``, indexed by the
        forecast dates.
    """
    try:
        return prophet_cache[train_end]
    except KeyError:
        pass  # not computed yet -> fit below

    regressors = list(best_model["features"])

    # Forecast horizon: first to last day of the month following train_end.
    month_start = (train_end + pd.offsets.MonthBegin(1)).replace(day=1)
    month_end = month_start + pd.offsets.MonthEnd(0)
    horizon = pd.date_range(start=month_start, end=month_end, freq="D")

    history = df_calls_input.loc[:train_end].copy()
    fit_frame = history[["ds", "y"] + regressors].copy()

    # Holidays are only passed when the stored group label ends with "+H".
    use_holidays = best_model["groups"].endswith("+H")
    model = Prophet(
        weekly_seasonality=True,
        yearly_seasonality=True,
        holidays=holiday_df if use_holidays else None,
        **best_model["params"],
    )
    for name in regressors:
        model.add_regressor(name)
    model.fit(fit_frame)

    # Regressor values for the horizon are taken from df itself.
    future = df.loc[horizon][["ds"] + regressors].copy()
    prediction = model.predict(future)

    result = future.copy()
    result["calls_pred"] = prediction["yhat"].values
    result = result[["ds", "calls_pred"]]

    prophet_cache[train_end] = result
    return result

# One Prophet forecast per CV fold, keyed by the fold's training cut-off.
calls_forecasts = {
    fold[0]: predict_calls(fold[0])
    for fold in tqdm(splits, desc="calls_pred vorbereiten")
}

# --------------------------------------
# 6. Evaluation with intermediate storage
# --------------------------------------
# NOTE(review): not referenced anywhere else in the visible code.
partial_results = list()

def evaluate_combination(group_string, features, params):
    """Cross-validate one feature set / LightGBM parameter set over all folds.

    For every ``(train_end, test_start, test_end)`` in the module-level
    ``splits`` a quantile ``LGBMRegressor`` (``alpha=0.9``) is fitted on the
    rows of ``df`` up to ``train_end``. The test period is then forecast one
    day at a time: ``lag_target_bereitschaft_*`` and
    ``roll_target_bereitschaft_*`` features are recomputed from a history that
    is extended with the model's own earlier predictions, every other feature
    is read from the test rows. During training ``calls_pred`` is the observed
    ``calls`` column; at test time it is the forecast stored in
    ``calls_forecasts``.

    Args:
        group_string: Label of the feature-group combination (used for
            reporting only).
        features: List of feature column names used in addition to
            ``calls_pred``.
        params: Keyword arguments forwarded to ``LGBMRegressor``.

    Returns:
        A dict with ``groups``, ``features``, ``params`` and the fold-averaged
        ``mae``, ``rmse``, ``under`` and ``over`` - or ``None`` when any fold
        raised an exception or ``splits`` is empty.
    """
    fold_metrics = []

    for train_end, test_start, test_end in splits:
        try:
            # Invariant for the whole fold, so computed once here rather than
            # once per test day.
            model_cols = ["calls_pred"] + features
            static_feats = [f for f in features if not f.startswith(("lag_", "roll_"))]

            df_calls_pred = calls_forecasts[train_end]

            # Training uses the observed calls as "calls_pred".
            df_train = df.loc[:train_end].copy()
            df_train["calls_pred"] = df_train["calls"]

            # Testing uses the forecast calls merged in by date.
            df_test = df.loc[test_start:test_end].copy()
            df_test = df_test.merge(df_calls_pred, on="ds", how="left")

            input_cols = ["target_bereitschaft"] + model_cols
            train_indexed = df_train[["ds"] + input_cols].set_index("ds")
            df_test_indexed = df_test[["ds"] + input_cols].set_index("ds")

            X_train = train_indexed[model_cols]
            y_train = train_indexed["target_bereitschaft"]

            model = LGBMRegressor(
                objective='quantile', alpha=0.9,
                random_state=42, verbose=-1, **params
            )
            model.fit(X_train, y_train)

            # Target history; grows by one prediction per forecast day.
            y_hist = y_train.copy()
            preds = []

            for date in df_test_indexed.index:
                row = pd.DataFrame(index=[date])

                # Dynamic lag / rolling features derived from y_hist
                for feat in features:
                    if feat.startswith("lag_target_bereitschaft"):
                        try:
                            lag = int(feat.split("_")[-1])
                            # Dates missing from the history yield NaN.
                            row[feat] = y_hist.get(date - pd.Timedelta(days=lag), np.nan)
                        except Exception as exc:
                            print(
                                f"[Warnung] Fehler bei Lag-Feature: {feat} "
                                f"({type(exc).__name__}: {exc})"
                            )

                    elif feat.startswith("roll_target_bereitschaft"):
                        try:
                            # Name layout: ..._<stat>_<window>
                            parts = feat.split("_")
                            stat = parts[-2]
                            win = int(parts[-1])
                            ts = date - pd.Timedelta(days=1)
                            series = y_hist.shift(1).rolling(window=win, min_periods=1)
                            if stat in ("mean", "std", "min", "max"):
                                row[feat] = getattr(series, stat)().get(ts, np.nan)
                        except Exception as exc:
                            print(
                                f"[Warnung] Fehler bei Rolling-Feature: {feat} "
                                f"({type(exc).__name__}: {exc})"
                            )

                # Static features are copied from the test rows
                for feat in static_feats:
                    row[feat] = df_test_indexed.at[date, feat]

                # Insert calls_pred
                row["calls_pred"] = df_test_indexed.at[date, "calls_pred"]

                # Select the model columns in training order.
                # NOTE(review): on a one-row frame bfill/ffill have no
                # neighbouring row to copy from, so NaNs appear to reach the
                # model unchanged - confirm this is intended.
                row = row[model_cols].bfill().ffill()

                # Predict and feed the prediction back into the history
                pred = model.predict(row)[0]
                preds.append((date, pred))
                y_hist.loc[date] = pred

            df_preds = pd.DataFrame(preds, columns=["ds", "yhat"]).set_index("ds")
            df_test_indexed["yhat"] = df_preds["yhat"]

            # Indicator columns: prediction below / above the actual target
            df_test_indexed["under"] = (df_test_indexed["yhat"] < df_test_indexed["target_bereitschaft"]).astype(int)
            df_test_indexed["over"] = (df_test_indexed["yhat"] > df_test_indexed["target_bereitschaft"]).astype(int)

            fold_metrics.append({
                "mae": mean_absolute_error(df_test_indexed["target_bereitschaft"], df_test_indexed["yhat"]),
                "rmse": sqrt(mean_squared_error(df_test_indexed["target_bereitschaft"], df_test_indexed["yhat"])),
                "under": df_test_indexed["under"].mean(),
                "over": df_test_indexed["over"].mean()
            })

        except Exception as exc:
            # Best effort: one failing fold discards the whole combination,
            # but the cause is reported instead of being swallowed.
            print(
                f"[Fehler] Gruppe={group_string} | Params={params} | "
                f"{type(exc).__name__}: {exc}"
            )
            return None

    if not fold_metrics:
        return None

    avg = pd.DataFrame(fold_metrics).mean().to_dict()
    return {
        "groups": group_string,
        "features": features,
        "params": params,
        "mae": avg["mae"],
        "rmse": avg["rmse"],
        "under": avg["under"],
        "over": avg["over"]
    }

# --------------------------------------
# 7. Start the parallel evaluation
# --------------------------------------
# One task per (feature combination, parameter set) pair, spread over all
# CPU cores. The progress bar advances per feature combination as the tasks
# are consumed by the pool.
num_cores = multiprocessing.cpu_count()
_combo_rows = tqdm(df_combinations.iterrows(), total=len(df_combinations))
_tasks = (
    delayed(evaluate_combination)(combo["groups"], combo["features"], params)
    for _, combo in _combo_rows
    for params in all_params
)
results = Parallel(n_jobs=num_cores, batch_size=1)(_tasks)


# --------------------------------------
# 8. Save the results
# --------------------------------------
# Keep only the combinations whose evaluation succeeded (failed ones are None).
results = [r for r in results if r is not None]
if not results:
    # An empty frame has no metric columns; fail here with a clear message
    # instead of an opaque KeyError('mae') in the score computation below.
    raise RuntimeError(
        "No feature/parameter combination was evaluated successfully - "
        "nothing to score or save."
    )
results_df = pd.DataFrame(results)

# Output locations
results_path = os.path.abspath("../results/lgbm/cv_metrics_full.parquet")
models_path = os.path.abspath("../models/lgbm/best_params.parquet")

# Create the target directories if they do not exist yet
os.makedirs(os.path.dirname(results_path), exist_ok=True)
os.makedirs(os.path.dirname(models_path), exist_ok=True)
display(results_df)

# Composite score: 30 % MAE + 30 % RMSE + 40 % share of under-forecast days
# (in percent). Sorting is ascending, so the lowest score comes first.
results_df["score"] = (
    results_df["mae"] * 0.3 +
    results_df["rmse"] * 0.3 +
    results_df["under"] * 100 * 0.4
)
best_models = results_df.sort_values("score")

# Persist the full metrics table and the score-sorted table
results_df.to_parquet(results_path, index=False)
best_models.to_parquet(models_path, index=False)
display(best_models.head())

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.
calls_pred vorbereiten:   0%|          | 0/12 [00:00<?, ?it/s]15:39:28 - cmdstanpy - INFO - Chain [1] start processing
15:39:28 - cmdstanpy - INFO - Chain [1] done processing
calls_pred vorbereiten:   8%|▊         | 1/12 [00:00<00:01,  7.70it/s]15:39:28 - cmdstanpy - INFO - Chain [1] start processing
15:39:29 - cmdstanpy - INFO - Chain [1] done processing
15:39:29 - cmdstanpy - INFO - Chain [1] start processing
15:39:29 - cmdstanpy - INFO - Chain [1] done processing
calls_pred vorbereiten:  25%|██▌       | 3/12 [00:00<00:00,  9.45it/s]15:39:29 - cmdstanpy - INFO - Chain [1] start processing
15:39:29 - cmdstanpy - INFO - Chain [1] done processing
15:39:29 - cmdstanpy - INFO - Chain [1] start processing
15:39:29 - cmdstanpy - INFO - Chain [1] done processing
calls_pred vorbereiten:  42%|████▏     | 5/12 [00:00<00:00,  9.79it/s]15:39:29 - cmdstanpy - INFO - Chain [1] start processin

Unnamed: 0,groups,features,params,mae,rmse,under,over
0,season,"[season_1, season_2, season_3]","{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",98.463911,140.143300,0.208877,0.791123
1,season,"[season_1, season_2, season_3]","{'learning_rate': 0.1, 'max_depth': 6, 'n_esti...",98.819280,139.870530,0.203411,0.796589
2,season,"[season_1, season_2, season_3]","{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",98.349393,139.502285,0.203322,0.796678
3,season,"[season_1, season_2, season_3]","{'learning_rate': 0.05, 'max_depth': 8, 'n_est...",98.365123,139.656489,0.203411,0.796589
4,season,"[season_1, season_2, season_3]","{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",98.464584,140.299798,0.206100,0.793900
...,...,...,...,...,...,...,...
5750,cyclic_encoding_month+holiday_effects+holiday_...,"[quarter_1, quarter_2, quarter_3, is_month_sta...","{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",87.199119,128.310762,0.248816,0.751184
5751,cyclic_encoding_month+holiday_effects+holiday_...,"[quarter_1, quarter_2, quarter_3, is_month_sta...","{'learning_rate': 0.1, 'max_depth': 6, 'n_esti...",86.954909,128.169325,0.225781,0.774219
5752,cyclic_encoding_month+holiday_effects+holiday_...,"[quarter_1, quarter_2, quarter_3, is_month_sta...","{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",89.163606,129.765742,0.221179,0.778821
5753,cyclic_encoding_month+holiday_effects+holiday_...,"[quarter_1, quarter_2, quarter_3, is_month_sta...","{'learning_rate': 0.05, 'max_depth': 8, 'n_est...",89.748083,128.823798,0.214939,0.785061


Unnamed: 0,groups,features,params,mae,rmse,under,over,score
1636,holiday+lag_target_short+roll_target_short+season,"[season_1, season_2, season_3, holiday, holida...","{'learning_rate': 0.1, 'max_depth': 6, 'n_esti...",84.606324,123.592374,0.219739,0.780261,71.249164
2984,holiday_window+lag_target_short+month_start_en...,"[season_1, season_2, season_3, is_month_start,...","{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",83.889692,122.773388,0.233845,0.766155,71.352739
2954,holiday+lag_target_short+month_start_end+roll_...,"[season_1, season_2, season_3, is_month_start,...","{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",83.48237,122.889572,0.23951,0.76049,71.491972
4561,holiday_effects+holiday_window+lag_target_shor...,"[season_1, season_2, season_3, is_month_start,...","{'learning_rate': 0.1, 'max_depth': 6, 'n_esti...",85.471355,124.488825,0.214273,0.785727,71.55897
4514,holiday+lag_target_short+month_start_end+roll_...,"[season_1, season_2, season_3, is_month_start,...","{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",83.551898,123.030407,0.240175,0.759825,71.581706
