<a href="https://colab.research.google.com/github/Benetti-Hub/Benetti-Hub-Kaggle-Home-Credit-Risk/blob/main/1_feature_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Selection with Adversarial Validation and Univariate ROC-AUC

One of the primary challenges in this competition was identifying features that remained stable over time. To address this, I employed adversarial validation to pinpoint the most crucial features. This method hinges on the principle that the distributions of the training and test sets should be similar. When these distributions align, the features significant for the training set are likely to be equally important for the test set.

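In addition, I utilized the ROC-AUC score to further refine feature selection. The ROC-AUC score is a metric that evaluates a model's ability to distinguish between classes. Here, it is computed twice for each feature: once to measure how well the feature alone predicts the target (the univariate score), and once to measure how well the feature alone separates one block of weeks from the rest of the data (the adversarial score).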

The feature selection methods employed here aim to maximize the ROC-AUC while keeping the computational cost to a minimum. Many features are correlated or redundant, but LightGBM is capable of ignoring this redundancy. In a production environment, a more rigorous feature selection process, such as step-forward feature selection, SHAP values, or L1 regularization, would be applied to ensure the optimal set of features is utilized.

In [1]:
%%capture
# Install the packages this notebook needs; %%capture keeps pip's log out of
# the cell output.
# NOTE(review): polars is upgraded here but is not imported in the cells of
# this notebook — confirm it is still required.
!pip install --upgrade polars
!pip install --upgrade pandas
!pip install lightgbm
# presumably the engine behind pd.read_parquet / DataFrame.to_parquet — verify
!pip install pyarrow

In [2]:
import os
import pickle
from multiprocessing import Pool

import lightgbm as lgb
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [3]:
# Mount Google Drive at /content/drive so files under it can be read and written.
drive.mount("/content/drive")

# Set up the working directory for this notebook; it is created if missing and
# every input/output path below is built from DRIVE_PATH.
DRIVE_PATH = "/content/drive/MyDrive/Projects/KaggleDefaults"
os.makedirs(DRIVE_PATH, exist_ok=True)

Mounted at /content/drive


In [4]:
def cast_cat(data: pd.DataFrame) -> pd.DataFrame:
    """Convert every object-dtype column of ``data`` to ``category``.

    The frame is modified in place and the same object is returned, which
    lets the function be used inside a ``.pipe(...)`` chain.
    """
    object_columns = data.select_dtypes(include="object").columns
    as_category = data[object_columns].astype("category")
    # Write the converted columns back onto the caller's frame.
    data[object_columns] = as_category
    return data


class TimeSeriesSplitter:
    """Scikit-learn style splitter that batches clusters of time series
    data together for cross-validation.

    The distinct values of ``group`` are cut, in order of first appearance,
    into ``n_splits`` consecutive chunks, and every chunk defines one fold.
    """

    def __init__(self, n_splits=5):
        """Store the number of folds to produce."""
        self.n_splits = n_splits

    def split(self, X, y, group):
        """Yield one pair of positional row-index arrays per fold.

        Args:
            X: Feature frame; only its length is used.
            y: Unused; kept so the call looks like a scikit-learn splitter.
            group: Group label of every row (pandas Series or any 1-D
                array-like), in the same row order as ``X``.

        Yields:
            ``(in_chunk, out_of_chunk)``: two integer numpy arrays of row
            positions (0-based, suitable for ``.iloc`` and numpy indexing).
            The first holds the rows whose group belongs to the current
            chunk, the second holds every other row.

        Raises:
            ValueError: If ``X`` and ``group`` have different lengths.
        """
        group = pd.Series(group)
        if len(group) != len(X):
            raise ValueError(
                f"X has {len(X)} rows but group has {len(group)} entries"
            )

        for bounds in np.array_split(group.unique(), self.n_splits):
            # Work on the raw boolean array so the result is positional and
            # does not depend on the index labels of X or group.
            mask = group.isin(bounds).to_numpy()

            yield np.flatnonzero(mask), np.flatnonzero(~mask)

    def get_n_splits(self):
        """Return the number of folds ``split`` will yield."""
        return self.n_splits


def eval_performance(feat: list[str] | str) -> tuple[float, ...]:
    """Score one feature (or a list of features) on week-based folds.

    Reads the module-level ``train_data`` frame and, for each of the 5 folds
    produced by ``TimeSeriesSplitter`` over ``train_data["WEEK_NUM"]``, fits
    two small LightGBM classifiers that see only the selected column(s):

    * Adversarial: rows in the second index set yielded by the splitter are
      labelled 1 and all other rows 0. The data is then split 67/33 at random
      and the ROC-AUC of predicting that label on the held-out third is
      stored.
    * Univariate: the model is fitted on the first index set against
      ``train_data["target"]`` and its ROC-AUC on the second index set is
      stored.

    The index sets are applied positionally (``.iloc`` and numpy indexing).

    Args:
        feat: A single column name, or a list of column names to evaluate
            together.

    Returns:
        A flat tuple of 10 scores: the 5 adversarial ROC-AUCs followed by the
        5 univariate ROC-AUCs, both in fold order.
    """
    params = {
        "max_depth": 3,
        "n_estimators": 10,
        "verbose": -1,
        "random_state": 42,
        "n_jobs": 1,  # one thread per model; parallelism comes from the caller
    }
    uni_model = lgb.LGBMClassifier(**params)
    adv_model = lgb.LGBMClassifier(**params)

    # Always select with a flat list of labels so `data` is a DataFrame;
    # nesting an incoming list inside another list is not a valid selection.
    columns = list(feat) if isinstance(feat, list) else [feat]
    data = train_data[columns]
    target = train_data["target"]

    cv = TimeSeriesSplitter(n_splits=5)
    adv_scores = np.zeros(cv.get_n_splits())
    uni_scores = np.zeros(cv.get_n_splits())
    for i, (train_idx, valid_idx) in enumerate(
        cv.split(data, target, train_data["WEEK_NUM"])
    ):
        # Adversarial validation: can the feature tell the two row sets apart?
        target_adv = np.zeros(data.shape[0])
        target_adv[valid_idx] = 1
        X_train, X_valid, y_adv_train, y_adv_valid = train_test_split(
            data, target_adv, test_size=0.33, random_state=42, shuffle=True
        )
        adv_model.fit(X_train, y_adv_train)
        adv_scores[i] = roc_auc_score(
            y_adv_valid, adv_model.predict_proba(X_valid)[:, 1]
        )

        # Univariate: how well does the feature alone predict the target?
        X_train, X_valid = data.iloc[train_idx], data.iloc[valid_idx]
        uni_model.fit(X_train, target.iloc[train_idx])
        uni_scores[i] = roc_auc_score(
            target.iloc[valid_idx], uni_model.predict_proba(X_valid)[:, 1]
        )

    return (*adv_scores, *uni_scores)

In [5]:
# Load the training frame from Drive and cast its object columns to category
# via cast_cat. eval_performance reads this frame as a module-level global.
train_data = pd.read_parquet(f"{DRIVE_PATH}/train_data_pandas.parquet").pipe(cast_cat)

In [6]:
# Per-feature scores are cached on Drive; the scoring pass is only run when
# the cache file is missing.
file_path = f"{DRIVE_PATH}/cat_best.parquet"
if os.path.isfile(file_path):
    univ_results = pd.read_parquet(file_path)
else:
    # Every column except the identifier, the time key and the label is scored.
    feat_cols = sorted(set(train_data.columns) - {"case_id", "WEEK_NUM", "target"})
    # The context manager shuts the worker processes down once scoring is done.
    # NOTE(review): eval_performance reads the global train_data, so workers
    # must inherit it (true with the fork start method) — confirm elsewhere.
    with Pool() as pool:
        # list(...) drains tqdm completely, so the bar reaches 100% and closes;
        # imap preserves input order, so results line up with feat_cols.
        scores = dict(
            zip(
                feat_cols,
                list(
                    tqdm(pool.imap(eval_performance, feat_cols), total=len(feat_cols))
                ),
            )
        )
    # Column layout matches the tuple returned by eval_performance:
    # 5 adversarial scores followed by 5 univariate scores.
    adv_cols = [f"adv_{c}" for c in range(5)]
    uni_cols = [f"uni_{c}" for c in range(5)]
    univ_results = (
        pd.DataFrame.from_dict(scores, orient="index", columns=[*adv_cols, *uni_cols])
        .reset_index()  # the column name moves into a regular "index" column
        .assign(
            **{
                "adv_avg": lambda x: np.mean(x[adv_cols], axis=1),
                "uni_avg": lambda x: np.mean(x[uni_cols], axis=1),
                "uni_max": lambda x: np.max(x[uni_cols], axis=1),
                "uni_min": lambda x: np.min(x[uni_cols], axis=1),
                # Last underscore-separated token of the column name.
                "feature": lambda x: x["index"].str.split("_").str[-1],
            }
        )
        .sort_values(by="uni_min", ascending=False)
    )
    univ_results.to_parquet(file_path)

100%|█████████▉| 4507/4508 [20:32<00:00,  3.66it/s]


In [7]:
# Rank all scored columns by mean univariate ROC-AUC, then keep those whose
# univariate mean exceeds 0.51 and whose adversarial mean stays below 0.8.
univ_reduced = univ_results.sort_values(by="uni_avg", ascending=False)
univ_reduced = univ_reduced.loc[
    (univ_reduced["uni_avg"] > 0.51) & (univ_reduced["adv_avg"] < 0.8)
]

# Persist the retained column names, best univariate score first.
with open(f"{DRIVE_PATH}/to_keep.pkl", "wb") as f:
    pickle.dump(univ_reduced["index"].to_list(), f)


# Everything that was scored but not retained, in alphabetical order.
to_drop = sorted(set(univ_results["index"]).difference(univ_reduced["index"]))
with open(f"{DRIVE_PATH}/to_drop.pkl", "wb") as f:
    pickle.dump(to_drop, f)