In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PowerTransformer
import re

In [None]:
# Load the modelling dataset; the path is relative to the working directory.
data_path = '../data/fix/feature_selected_reg_full.csv'
df = pd.read_csv(data_path)

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.multioutput import RegressorChain
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import re

# -------------------------
# Targets
# -------------------------
# Every target column handled by this pipeline, in the order they are modelled.
recession_targets = [
    "recession_probability",
    "1_month_recession_probability",
    "3_month_recession_probability",
    "6_month_recession_probability",
]

# The first three go through the RegressorChain; the last one gets its own model.
*chain_targets, target_6m = recession_targets

# -------------------------
# Split train/test
# -------------------------
# Chronological hold-out: rows dated before split_date are used for training,
# rows dated on or after it are kept for testing.
split_date = "2020-01-01"
df["date"] = pd.to_datetime(df["date"])

is_before_split = df["date"] < split_date
is_from_split = df["date"] >= split_date
train_df = df.loc[is_before_split].copy()
test_df = df.loc[is_from_split].copy()

# Features are every column except the targets and the date itself.
non_feature_columns = recession_targets + ["date"]
X_train = train_df.drop(columns=non_feature_columns)
X_test = test_df.drop(columns=non_feature_columns)
y_train = train_df[recession_targets]
y_test = test_df[recession_targets]

# -------------------------
# Clean NaNs
# -------------------------
def clean_data(X_or_y):
    """Return a cleaned copy of a feature or target frame.

    Infinite values are converted to NaN, gaps are forward-filled and then
    back-filled, and anything still missing afterwards (for example a
    column that is entirely NaN) is set to 0.
    """
    without_inf = X_or_y.replace([np.inf, -np.inf], np.nan)
    return without_inf.ffill().bfill().fillna(0)

# Features and targets of both splits go through the same cleaning routine.
X_train, X_test = clean_data(X_train), clean_data(X_test)
y_train, y_test = clean_data(y_train), clean_data(y_test)

# -------------------------
# Sanitize column names for LightGBM
# -------------------------
def sanitize_columns(df):
    """Rename the frame's columns in place and return the same frame.

    Each run of characters other than ASCII letters, digits or underscore
    is collapsed into a single underscore.
    """
    cleaned_names = []
    for name in df.columns:
        cleaned_names.append(re.sub(r'[^A-Za-z0-9_]+', '_', name))
    df.columns = cleaned_names
    return df

# Both feature frames are renamed the same way so their columns stay aligned.
X_train, X_test = sanitize_columns(X_train), sanitize_columns(X_test)

# -------------------------
# Logit transform
# -------------------------
# Keeps scaled values strictly inside (0, 1) so the logit stays finite.
epsilon = 1e-6


def logit_transform(y):
    """Convert values on a 0-100 scale to log-odds.

    The input is divided by 100 and clipped to [epsilon, 1 - epsilon]
    before the logit is taken, so inputs of 0 and 100 map to finite values.
    """
    proportion = np.clip(y / 100, epsilon, 1 - epsilon)
    odds = proportion / (1 - proportion)
    return np.log(odds)

def inv_logit_transform(y_logit):
    """Convert log-odds back to the 0-100 scale.

    Applies the logistic function 1 / (1 + exp(-x)) and multiplies the
    result by 100.
    """
    decay = np.exp(-y_logit)
    return (1 / (1 + decay)) * 100

# -------------------------
# Step 1: RegressorChain for first 3 targets
# -------------------------
# LGBMWrapper (a LightGBM wrapper compatible with sklearn, used as the
# RegressorChain base estimator) is imported from the project's `api` package.
# The parent of the working directory is appended to sys.path first so the
# package can be found.
import sys
import os
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(parent_dir)
from api.lgbm_wrapper import LGBMWrapper

# LightGBM parameters, shared by the chain's base learner and the 6-month model.
lgb_params = {
    "objective": "regression",
    # No built-in evaluation metric is computed during training.
    "metric": "None",
    "max_depth": 5,
    "learning_rate": 0.05,
    "subsample": 0.9,
    # LightGBM only performs row subsampling when the bagging frequency is
    # non-zero; with the default of 0 the "subsample" value above would be
    # silently ignored. Re-sample the rows at every iteration.
    "subsample_freq": 1,
    "colsample_bytree": 0.9,
    "seed": 42,
    "verbose": -1,
}

# The chain targets are modelled on the logit scale.
y_train_chain = logit_transform(y_train[chain_targets].values)

# Wrap the LightGBM base learner in a RegressorChain and fit it on all
# three chain targets at once.
chain_base = LGBMWrapper(params=lgb_params, num_boost_round=500)
chain_model = RegressorChain(chain_base)
chain_model.fit(X_train, y_train_chain)

# Predict on the test features, map back to the 0-100 scale and label the
# columns / rows to match the test frame.
preds_chain_logit = chain_model.predict(X_test)
preds_chain_pct = inv_logit_transform(preds_chain_logit)
preds_chain_df = pd.DataFrame(
    data=preds_chain_pct,
    index=X_test.index,
    columns=chain_targets,
).clip(lower=0, upper=100)

# -------------------------
# Step 2: Train 6-month target separately
# -------------------------
# The 6-month model sees the original features plus the chain's three
# predictions (on the 0-100 scale) as extra columns.
# NOTE(review): the training-side columns are in-sample chain predictions
# while the test-side ones are out-of-sample -- confirm this mismatch is
# acceptable.
X_train_6m = X_train.copy()
X_train_6m[chain_targets] = inv_logit_transform(chain_model.predict(X_train))

X_test_6m = X_test.copy()
X_test_6m[chain_targets] = preds_chain_df

# Fit on the logit of the 6-month target, as is done for the chain targets.
y_train_6m = logit_transform(y_train[target_6m].values)
dtrain_6m = lgb.Dataset(data=X_train_6m, label=y_train_6m)
model_6m = lgb.train(params=lgb_params, train_set=dtrain_6m, num_boost_round=500)

# Predict, map back to the 0-100 scale and clip to that range.
pred_6m_logit = model_6m.predict(X_test_6m)
pred_6m_pct = inv_logit_transform(pred_6m_logit)
pred_6m = np.clip(pred_6m_pct, 0, 100)

# -------------------------
# Combine predictions
# -------------------------
# New frame holding the chain predictions plus the 6-month prediction as
# the last column; preds_chain_df itself is left unchanged.
preds_test_final = preds_chain_df.assign(**{target_6m: pred_6m})

# -------------------------
# Evaluation
# -------------------------
def evaluate(y_true, y_pred, dataset_name="Dataset"):
    """Print MAE, RMSE and R2 for every entry of ``recession_targets``.

    Args:
        y_true: Observed values, indexable by each target name.
        y_pred: Predicted values, indexable by the same target names.
        dataset_name: Label shown in the report header.

    Returns:
        None. The metrics are written to standard output only.
    """
    print(f"\n=== Evaluation on {dataset_name} ===")
    for target in recession_targets:
        observed = y_true[target]
        predicted = y_pred[target]
        mae = mean_absolute_error(observed, predicted)
        # mean_squared_error returns the MSE; take the square root so the
        # value printed under the "RMSE" label really is a root-mean-square
        # error (and is in the same units as the MAE).
        rmse = mean_squared_error(observed, predicted) ** 0.5
        r2 = r2_score(observed, predicted)
        print(f"{target}: MAE={mae:.4f}, RMSE={rmse:.4f}, R2={r2:.4f}")

# Report the test metrics, then show the last rows and summary statistics
# of the predictions.
evaluate(y_test, preds_test_final, "Test")
print(preds_test_final.tail())
print(preds_test_final.describe())
# -------------------------
# Plot
# -------------------------
# One figure per target: actual vs predicted over the test rows. The x-axis
# is the test frame's row index, not the date column.
row_positions = test_df.index
for target in recession_targets:
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot(row_positions, y_test[target], label="Test Actual")
    ax.plot(row_positions, preds_test_final[target], label="Test Predicted", linestyle="--")
    ax.set_title(f"{target}: Actual vs Predicted (LightGBM + RegressorChain)")
    ax.legend()
    fig.tight_layout()
    plt.show()


In [None]:
# -------------------------
# Export trained models as pickle files
# -------------------------
import pickle
import os


# Pickle the fitted RegressorChain (first 3 targets) into the current
# working directory.
chain_model_path = "lgbm_recession_chain_model.pkl"
with open(chain_model_path, "wb") as model_file:
    pickle.dump(chain_model, model_file)

# Saving the 6-month LightGBM model is currently disabled; the code is kept below for reference.
# with open("lgbm_recession_6m_model.pkl", "wb") as f:
#     pickle.dump(model_6m, f)

# print("✅ Models saved in the 'models' folder:")
# print(" - models/lgbm_recession_chain_model.pkl")
# print(" - models/lgbm_recession_6m_model.pkl")
