In [9]:
%run Imports.ipynb
name = 'Kred'

In [10]:
from pygam import LinearGAM, LogisticGAM, PoissonGAM, GammaGAM, s

In [11]:
import time
from copy import deepcopy
from itertools import product

from sklearn.model_selection import ParameterGrid, cross_val_predict
from sklearn.metrics import (
    confusion_matrix, accuracy_score, f1_score, roc_auc_score,
    mean_squared_error, mean_absolute_error, r2_score
)


In [12]:
# -------------------------------------------------------------------------
# 1. PARAM GRID  –  now also hosts `cv` and `verbose`
# -------------------------------------------------------------------------
param_grid = {
    "model_cls": [LinearGAM, LogisticGAM, PoissonGAM, GammaGAM],
    "n_splines": [406],          # constant but explicit
    "lam":       [1, 10, 50, 60, 75, 85, 90, 100],
    "max_iter":  [50, 100, 150, 200, 300, 400, 500],
    "cv":        [5],            # ← here
    "verbose":   [True],         # ← and here
}

# 1) Read in Files

In [13]:
key_featsubgroups = pd.read_json('../pickle/2_FS/' + name + '/key_featsubgroups.json', orient='records', lines=True)
df = pd.read_json('../pickle/2_FS/' + name + '/2_df_new_.json', orient='records', lines=True)

KeyboardInterrupt: 

In [5]:
print(df[target].value_counts()/df.shape[0])
print('df_shape: ', df.shape)

arrears
1   0.646
0   0.354
Name: count, dtype: float64
df_shape:  (129457, 418)


In [14]:
# 1️⃣  read the archive
#     ─────────────────
# * `allow_pickle=True` is **required** if any value in the original
#   dictionary was not a pure NumPy array (e.g. a list, a Python object,
#   a scalar, a scikit-learn model wrapped in an object array, …).
npz = np.load("../pickle/4_Model_Optuna/results_dict_updated.npz",
              allow_pickle=True)

# 2️⃣  turn it back into an ordinary Python dict
#     ──────────────────────────────────────────
# * Each entry in an .npz file is a NumPy array.
# * If the array is 0-D (`shape == ()`) and has dtype=object,
#   call `.item()` to extract the original Python object / scalar.
results_dict = {
    key: (arr.item()        # unwrap 0-D object array
          if arr.shape == () and arr.dtype == object
          else arr)         # leave normal arrays as-is
    for key, arr in npz.items()
}

In [15]:
type(results_dict)

dict

In [16]:
results_dict

AttributeError: 'XGBModel' object has no attribute 'feature_weights'

# 2) Create Model prediction functions

## 2.1) Split dataset into train/testing while excluding demographic features

In [None]:
def split_data_4(df, key_featsubgroups=key_featsubgroups, target=target, test_size=0.2, random_state=42):
    """
    Splits the dataset into training and testing sets while excluding demographic features.

    Parameters:
    df (DataFrame): The dataset containing features and target variable.
    key_featsubgroups (DataFrame): A mapping of feature subgroups.
    target (str): The name of the target variable.
    test_size (float, optional): The proportion of the dataset to allocate for testing. Default is 0.2.
    random_state (int, optional): Random seed for reproducibility. Default is 42.

    Returns:
    tuple: X_train, X_test, y_train, y_test (training and testing datasets)
    """

    # Extract demographic features
    demo_feat = key_featsubgroups.loc[key_featsubgroups['subgroup'] == 'demo', 'list_features'].values[0]
    print("Demographic Features:", demo_feat)

    # Separate features (X) and target variable (y), excluding demographic features
    X = df.drop(columns=[target] + demo_feat)
    y = df[target]

    # Split the dataset into training (80%) and testing (20%) sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Print dataset shapes
    print(f"Training Features Shape: {X_train.shape}")
    print(f"Training Labels Shape: {y_train.shape}")
    print(f"Testing Features Shape: {X_test.shape}")
    print(f"Testing Labels Shape: {y_test.shape}")

    return X_train, X_test, y_train, y_test

## 2.2) Model Training and prediction

In [None]:
# -------------------------------------------------------------------------
# 2. helper: detect regression vs. binary classification
# -------------------------------------------------------------------------
def _problem_type(y):
    u = np.unique(y)
    return "binary" if len(u) == 2 and u.dtype.kind in "ifu" else "regression"

# -------------------------------------------------------------------------
# 3. MAIN FUNCTION
# -------------------------------------------------------------------------
def gam(param_grid: dict, df, results_dict):
    """
    Exhaustive grid-search over GAM variants.
    `cv` and `verbose` are treated like any other hyper-parameter and
    must live in `param_grid`.
    Returns
    -------
    best_params : dict
    results_dict : dict   (updated in place)
    """
    X_train, X_test, y_train, y_test = split_data_4(df)   # your own splitter
    task = _problem_type(y_train)
    n_features = X_train.shape[1]

    # cache the 406-spline term list once
    term_cache = {406: sum(s(i, n_splines=406) for i in range(n_features))}

    best_score  = -np.inf if task == "binary" else np.inf
    best_params = None
    best_model  = None
    t0 = time.time()

    for params in ParameterGrid(param_grid):
        model_cls = params["model_cls"]
        cv        = params["cv"]
        verbose   = params["verbose"]

        # skip incompatible combos
        if task == "binary" and model_cls is not LogisticGAM:
            continue
        if task == "regression" and model_cls is LogisticGAM:
            continue

        model = model_cls(
            term_cache[params["n_splines"]],
            lam=params["lam"],
            max_iter=params["max_iter"]
        )

        if verbose:
            print(f"[GAM] {model_cls.__name__} | lam={params['lam']} "
                  f"| max_iter={params['max_iter']} | cv={cv}")

        model.fit(X_train, y_train)
        y_pred_cv   = cross_val_predict(model, X_train, y_train, cv=cv)
        y_pred_test = model.predict(X_test)

        if task == "binary":
            score = roc_auc_score(y_test, model.predict_mu(X_test))
        else:
            score = mean_squared_error(y_test, y_pred_test, squared=False)

        is_better = score > best_score if task == "binary" else score < best_score
        if is_better:
            best_score, best_params = score, deepcopy(params)
            best_model = deepcopy(model)
            if verbose:
                tag = "ROC-AUC" if task == "binary" else "RMSE"
                print(f" → new best ({tag}={score:.4f})")

    # ----------------- final evaluation & logging -----------------
    best_model.fit(X_train, y_train)
    y_train_pred = cross_val_predict(best_model, X_train, y_train,
                                     cv=best_params["cv"])
    y_test_pred  = best_model.predict(X_test)

    entry_key = f"gam_best_{best_model.__class__.__name__}"
    log = {
        "estimator": deepcopy(best_model),
        "best_params": best_params,
        "fit_time_s": round(time.time() - t0, 2),
    }

    if task == "binary":
        log.update({
            "cfm_train": confusion_matrix(y_train, y_train_pred),
            "cfm_test":  confusion_matrix(y_test,  y_test_pred),
            "train_accuracy": accuracy_score(y_train, y_train_pred),
            "test_accuracy":  accuracy_score(y_test,  y_test_pred),
            "test_roc_auc":   roc_auc_score(y_test, best_model.predict_mu(X_test)),
        })
    else:
        log.update({
            "rmse_test": mean_squared_error(y_test, y_test_pred, squared=False),
            "mae_test":  mean_absolute_error(y_test, y_test_pred),
            "r2_test":   r2_score(y_test, y_test_pred),
        })

    results_dict[entry_key] = log

    if best_params["verbose"]:
        metric = "ROC-AUC" if task == "binary" else "RMSE"
        print(f"\n✓ Stored best GAM under '{entry_key}' "
              f"({metric}={best_score:.4f}, "
              f"time={log['fit_time_s']:.1f}s)")

    return best_params, results_dict

# 6) Run GAM

In [None]:
best_params, results_dict = gam(param_grid, df, {})