In [1]:
# Execute the shared Imports notebook in this kernel. Names used below without a
# visible import (pd, np, pickle, target, train_test_split) are presumably
# defined there — TODO confirm.
%run Imports.ipynb
# Dataset identifier; used below as the sub-folder name under ../pickle/2_FS/.
name = 'Kred'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
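# NOTE: the GAM cells further down use LinearGAM, LogisticGAM, PoissonGAM, GammaGAM and s;
# uncomment this import unless Imports.ipynb already provides those names.
# from pygam import LinearGAM, LogisticGAM, PoissonGAM, GammaGAM, s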

In [11]:
import time
from copy import deepcopy
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score
import json


# 1) Read in Files

In [4]:
# NOTE(review): not referenced by any visible cell, and the value looks like a
# truncated '../pickle' — confirm whether this cell is still needed.
string = '../pickl'

In [5]:
# Load the feature-subgroup mapping and the dataset from the 2_FS pickle folder of this dataset.
key_featsubgroups = pd.read_pickle(f'../pickle/2_FS/{name}/key_featsubgroups.pkl')
df = pd.read_pickle(f'../pickle/2_FS/{name}/2_df_new_.pkl')

In [8]:
# Export the dataset as JSON Lines (one record per line) next to the pickle.
df.to_json(
    f'../pickle/2_FS/{name}/2_df_new_.json',
    orient='records',
    lines=True,
)

In [9]:
# Export the feature-subgroup mapping as JSON Lines (one record per line) next to the pickle.
key_featsubgroups.to_json(
    f'../pickle/2_FS/{name}/key_featsubgroups.json',
    orient='records',
    lines=True,
)

In [6]:
# Relative frequency of each target value, then the frame dimensions.
print(df[target].value_counts() / len(df))
print(f'df_shape:  {df.shape}')

arrears
1   0.646
0   0.354
Name: count, dtype: float64
df_shape:  (129457, 418)


In [7]:
# Restore the previously saved results dictionary.
# pickle.load executes arbitrary code from the file, so only load files you trust.
with open('../pickle/4_Model_Optuna/results_dict_updated.pkl', mode='rb') as results_file:
    results_dict = pickle.load(results_file)

In [10]:
# Inspect the container type of the object restored from the pickle file.
type(results_dict)

dict

In [13]:
# Write every top-level entry of results_dict as a named array in one compressed .npz archive.
npz_path = "../pickle/4_Model_Optuna/results_dict_updated.npz"
np.savez_compressed(npz_path, **results_dict)
print(f"Wrote {npz_path}")

Wrote ../pickle/4_Model_Optuna/results_dict_updated.npz


# 2) Create Model prediction functions

## 2.1) Split dataset into train/testing while excluding demographic features

In [None]:
def split_data_4(df, key_featsubgroups=key_featsubgroups, target=target, test_size=0.2, random_state=42):
    """
    Build a train/test split of the data with the demographic columns removed.

    Parameters:
    df (DataFrame): Frame holding the feature columns and the target column.
    key_featsubgroups (DataFrame): Subgroup lookup; the first row whose 'subgroup'
        equals 'demo' supplies the demographic column names via 'list_features'.
    target (str): Name of the target column.
    test_size (float, optional): Fraction of rows held out for testing. Default is 0.2.
    random_state (int, optional): Seed handed to the splitter. Default is 42.

    Returns:
    tuple: X_train, X_test, y_train, y_test
    """

    # Column names registered under the 'demo' subgroup
    is_demo = key_featsubgroups['subgroup'] == 'demo'
    demographic_cols = key_featsubgroups.loc[is_demo, 'list_features'].values[0]
    print("Demographic Features:", demographic_cols)

    # Labels are the target column; features are everything except target and demographics
    labels = df[target]
    features = df.drop(columns=[target] + demographic_cols)

    # Hold out `test_size` of the rows for testing
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    # Report the shape of each of the four pieces
    pieces = (
        ("Training Features", X_train),
        ("Training Labels", y_train),
        ("Testing Features", X_test),
        ("Testing Labels", y_test),
    )
    for label, piece in pieces:
        print(f"{label} Shape: {piece.shape}")

    return X_train, X_test, y_train, y_test

## 2.2) Model Training and prediction

In [None]:
# Lookup from the integer model selector (1-4) to the GAM estimator class.
model_gam = dict(
    enumerate((LinearGAM, LogisticGAM, PoissonGAM, GammaGAM), start=1)
)

In [None]:
def gam(df: pd.DataFrame, model_numb: int, results_dict) -> dict:
    """
    Description:
    -----------
        Trains a Generalized Additive Model (GAM) with one spline term per feature
        column, evaluates it, and stores the model and its metrics in results_dict.
        The data is split with split_data_4 (demographic features excluded), the model
        is fitted on the training part, out-of-fold predictions are produced on the
        training part (5 folds), and the held-out part is used for the test metrics.

        NOTE(review): all metrics below are binary-classification metrics and ROC-AUC
        is computed from predict_proba, so presumably only LogisticGAM (model_numb=2)
        is meaningful here — confirm before using the other options.

    Parameters:
    -----------
        df (pd.DataFrame):      The input DataFrame containing features and target variable.
        model_numb (int):       An integer indicating the type of GAM to use:
                                1 for LinearGAM, 2 for LogisticGAM, 3 for PoissonGAM, 4 for GammaGAM.
                                Any other value raises KeyError.
        results_dict (dict):    A dictionary to store the results of the model training and evaluation.
                                It is updated in place under the key "GAM".

    Returns:
    --------
        dict: The same results_dict, with results_dict["GAM"] holding the trained GAM
              model and its evaluation metrics.

    """

    start_time = time.time()
    # choosing model based on model_numb
    model_gam = {
        1: LinearGAM,
        2: LogisticGAM,
        3: PoissonGAM,
        4: GammaGAM
    }

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = split_data_4(df)
    _, n = X_train.shape

    # Build one spline term per feature column (0 .. n-1). Terms are combined with
    # `+` first and then handed to the estimator: the estimator object itself cannot
    # be extended with `+=`, and the previous range(1, n-1) skipped the last column.
    terms = s(0)
    for i in range(1, n):
        terms = terms + s(i)

    # Initialize and train the GAM model
    gam_model = model_gam[model_numb](terms)
    gam_model.fit(X_train, y_train)

    # Perform cross-validation with CV=5 on training data
    # NOTE(review): relies on the GAM estimator being clonable by scikit-learn — confirm.
    y_train_pred = cross_val_predict(gam_model, X_train, y_train, cv=5)

    # Make predictions on the test set
    y_test_pred = gam_model.predict(X_test)

    # Compute confusion matrices
    cfm_train = confusion_matrix(y_train, y_train_pred)
    cfm_test = confusion_matrix(y_test, y_test_pred)

    # Compute accuracy scores
    accs_train = accuracy_score(y_train, y_train_pred)
    accs_test = accuracy_score(y_test, y_test_pred)

    # Compute F1-scores for both classes (0 and 1)
    f1s_train_p1 = f1_score(y_train, y_train_pred, pos_label=1)
    f1s_train_p0 = f1_score(y_train, y_train_pred, pos_label=0)
    f1s_test_p1 = f1_score(y_test, y_test_pred, pos_label=1)
    f1s_test_p0 = f1_score(y_test, y_test_pred, pos_label=0)

    # Compute ROC-AUC score for the test data.
    # The GAM returns the positive-class probability as a 1-D array; a 2-D
    # (n_samples, 2) layout is still accepted in case another estimator is plugged in.
    test_proba = np.asarray(gam_model.predict_proba(X_test))
    if test_proba.ndim == 2:
        test_proba = test_proba[:, 1]
    test_ras = roc_auc_score(y_test, test_proba)

    # Extract feature importances.
    # explain_global / explain_local are InterpretML-style methods that the GAM
    # estimators do not provide; store None instead of crashing after training.
    explain_global = getattr(gam_model, "explain_global", None)
    explain_local = getattr(gam_model, "explain_local", None)
    global_importance = explain_global().data() if callable(explain_global) else None
    local_importance = explain_local(X_test, y_test).data() if callable(explain_local) else None

    total_time = (time.time() - start_time) / 60
    print(f"GAM model training completed in {total_time:.2f} minutes")

    # Store the computed metrics in the results dictionary
    results_dict["GAM"] = {
        "classifier": deepcopy(gam_model),
        "cfm_train": cfm_train,
        "cfm_test": cfm_test,
        "train_accuracy": accs_train,
        "test_accuracy": accs_test,
        "train F1-score label 1": f1s_train_p1,
        "train F1-score label 0": f1s_train_p0,
        "test F1-score label 1": f1s_test_p1,
        "test F1-score label 0": f1s_test_p0,
        "test roc auc score": test_ras,
        "default_params": gam_model.get_params(),  # Save the default parameters used
        "global_importance": global_importance,
        "local_importance": local_importance,
        "time_m": total_time
    }

    return results_dict

In [None]:
def gam_tuning(df: pd.DataFrame, model_numb: int, results_dict, best_params) -> dict:
    """
    Description:
    -----------
        Trains a Generalized Additive Model (GAM) configured with tuned parameters,
        using one spline term per feature column, evaluates it, and stores the model
        and its metrics in results_dict.
        The data is split with split_data_4 (demographic features excluded), the model
        is fitted on the training part, out-of-fold predictions are produced on the
        training part (5 folds), and the held-out part is used for the test metrics.

        NOTE(review): all metrics below are binary-classification metrics and ROC-AUC
        is computed from predict_proba, so presumably only LogisticGAM (model_numb=2)
        is meaningful here — confirm before using the other options.
        NOTE(review): results are written to results_dict["GAM"], the same key the
        untuned gam() uses, so a tuned run replaces that entry — confirm this is intended.

    Parameters:
    -----------
        df (pd.DataFrame):      The input DataFrame containing features and target variable.
        model_numb (int):       An integer indicating the type of GAM to use:
                                1 for LinearGAM, 2 for LogisticGAM, 3 for PoissonGAM, 4 for GammaGAM.
                                Any other value raises KeyError.
        results_dict (dict):    A dictionary to store the results of the model training and evaluation.
                                It is updated in place under the key "GAM".
        best_params (dict):     Keyword arguments forwarded to the GAM constructor
                                (e.g. the outcome of a hyper-parameter search).
                                None is treated as "no extra parameters".

    Returns:
    --------
        dict: The same results_dict, with results_dict["GAM"] holding the trained GAM
              model and its evaluation metrics.

    """

    start_time = time.time()
    # choosing model based on model_numb
    model_gam = {
        1: LinearGAM,
        2: LogisticGAM,
        3: PoissonGAM,
        4: GammaGAM
    }

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = split_data_4(df)
    _, n = X_train.shape

    # Build one spline term per feature column (0 .. n-1). Terms are combined with
    # `+` first and then handed to the estimator: the estimator object itself cannot
    # be extended with `+=`, and the previous range(1, n-1) skipped the last column.
    terms = s(0)
    for i in range(1, n):
        terms = terms + s(i)

    # Initialize and train the GAM model with the tuned parameters
    gam_model = model_gam[model_numb](terms, **(best_params or {}))
    gam_model.fit(X_train, y_train)

    # Perform cross-validation with CV=5 on training data
    # NOTE(review): relies on the GAM estimator being clonable by scikit-learn — confirm.
    y_train_pred = cross_val_predict(gam_model, X_train, y_train, cv=5)

    # Make predictions on the test set
    y_test_pred = gam_model.predict(X_test)

    # Compute confusion matrices
    cfm_train = confusion_matrix(y_train, y_train_pred)
    cfm_test = confusion_matrix(y_test, y_test_pred)

    # Compute accuracy scores
    accs_train = accuracy_score(y_train, y_train_pred)
    accs_test = accuracy_score(y_test, y_test_pred)

    # Compute F1-scores for both classes (0 and 1)
    f1s_train_p1 = f1_score(y_train, y_train_pred, pos_label=1)
    f1s_train_p0 = f1_score(y_train, y_train_pred, pos_label=0)
    f1s_test_p1 = f1_score(y_test, y_test_pred, pos_label=1)
    f1s_test_p0 = f1_score(y_test, y_test_pred, pos_label=0)

    # Compute ROC-AUC score for the test data.
    # The GAM returns the positive-class probability as a 1-D array; a 2-D
    # (n_samples, 2) layout is still accepted in case another estimator is plugged in.
    test_proba = np.asarray(gam_model.predict_proba(X_test))
    if test_proba.ndim == 2:
        test_proba = test_proba[:, 1]
    test_ras = roc_auc_score(y_test, test_proba)

    # Extract feature importances.
    # explain_global / explain_local are InterpretML-style methods that the GAM
    # estimators do not provide; store None instead of crashing after training.
    explain_global = getattr(gam_model, "explain_global", None)
    explain_local = getattr(gam_model, "explain_local", None)
    global_importance = explain_global().data() if callable(explain_global) else None
    local_importance = explain_local(X_test, y_test).data() if callable(explain_local) else None

    total_time = (time.time() - start_time) / 60
    print(f"GAM model training completed in {total_time:.2f} minutes")

    # Store the computed metrics in the results dictionary
    results_dict["GAM"] = {
        "classifier": deepcopy(gam_model),
        "cfm_train": cfm_train,
        "cfm_test": cfm_test,
        "train_accuracy": accs_train,
        "test_accuracy": accs_test,
        "train F1-score label 1": f1s_train_p1,
        "train F1-score label 0": f1s_train_p0,
        "test F1-score label 1": f1s_test_p1,
        "test F1-score label 0": f1s_test_p0,
        "test roc auc score": test_ras,
        # Key name kept for compatibility; the values are the parameters actually
        # used by this model, i.e. including the tuned best_params.
        "default_params": gam_model.get_params(),
        "global_importance": global_importance,
        "local_importance": local_importance,
        "time_m": total_time
    }

    return results_dict

# 6) Run GAM