# Feature Selection and Suggestion

In [None]:
# imports
import re
import json
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer
from typing import Optional, Tuple, Dict, List, Any
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split  
from sklearn.feature_selection import SelectFromModel 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix
import os
from main import analyze


In [None]:
# Build the analysis object with its default settings. At this point in the
# file `analyze` is the name imported from `main`; a class with the same name
# is defined further down and rebinds it.
analysis:analyze = analyze()

In [None]:
# Analysis class: stores the run configuration, loads the data, and builds the modeling target
class analyze:
    """Load the project data, derive the modeling target, and rank features.

    Constructing an instance reads the CSV at ``data_source`` and the Excel
    workbook at ``data_dictionary_source``, prints both shapes, and replaces
    ``self.data`` with the output of :meth:`create_target`.  :meth:`model`
    then runs L1-based feature selection followed by a tree ensemble and
    prints evaluation metrics plus the top feature importances.
    """

    def __init__(self,
                 min_child_samples:Optional[int] = 5,
                 min_split_gain:Optional[float] = 0.0,
                 min_child_weight:Optional[Any] = 1e-3,
                 data_source: str = os.path.join(os.getcwd(),"Project Data.csv"), # analysis data
                 data_dictionary_source:str = os.path.join(os.getcwd(),"Project Data Dictionary.xlsx"), # data dictionary
                 output_prefix:str = "iteration", # appends to indicate output
                 target_col:str = "success", # modeling target (1=good on C1B, 0=bad on C1B)
                 exclude_col:Optional[List[str]] = None, # columns to exclude from modeling
                 top_n_features:int = 10, # how many top features to print at the end
                 test_size:float=0.30,
                 random_state:int=42,
                 penalty:str="l1",
                 solver:str="liblinear",
                 max_iter:int=3000,
                 class_weight:str="balanced",
                 n_estimators:int=1000,
                 learning_rate:float=0.03,
                 num_leaves:int=64,
                 subsample:float=0.8,
                 colsample_bytree:float=0.8,
                 reg_lambda:float=1.0,
                 n_jobs:int=-1,
                 ) -> None:
        """Store the configuration, load both tables, and build the target.

        Args:
            min_child_samples: LightGBM ``min_child_samples``; ``None`` leaves
                the library default in place.
            min_split_gain: LightGBM ``min_split_gain``; ``None`` leaves the
                library default in place.
            min_child_weight: LightGBM ``min_child_weight``; ``None`` leaves
                the library default in place.
            data_source: Path of the CSV file holding the analysis data.
            data_dictionary_source: Path of the Excel data dictionary.
            output_prefix: Prefix stored for labelling outputs (not read by
                any method of this class).
            target_col: Name of the target column created by
                :meth:`create_target` and predicted by :meth:`model`.
            exclude_col: Columns dropped from the predictors; ``None`` means
                ``['ssn_ssn']``.
            top_n_features: Number of top importances printed by :meth:`model`.
            test_size: Hold-out fraction passed to ``train_test_split``.
            random_state: Seed for the split and the final classifier.
            penalty: Penalty of the feature-selection ``LogisticRegression``.
            solver: Solver of the feature-selection ``LogisticRegression``.
            max_iter: Iteration cap of the feature-selection
                ``LogisticRegression``.
            class_weight: ``class_weight`` for the logistic regression and the
                final classifier.
            n_estimators: LightGBM ``n_estimators``.
            learning_rate: LightGBM ``learning_rate``.
            num_leaves: LightGBM ``num_leaves``.
            subsample: LightGBM ``subsample``.
            colsample_bytree: LightGBM ``colsample_bytree``.
            reg_lambda: LightGBM ``reg_lambda``.
            n_jobs: Parallelism passed to the final classifier.
        """
        self.data_source:str = data_source
        self.data_dictionary_source:str = data_dictionary_source
        self.output_prefix:str = output_prefix
        self.target_col:str = target_col
        # A fresh list per instance: a list literal in the signature would be
        # shared (and mutable) across every instance.
        self.exclude_col:List[str] = ['ssn_ssn'] if exclude_col is None else list(exclude_col)
        self.top_n_features:int = top_n_features
        self.test_size = test_size
        self.random_state = random_state
        self.penalty = penalty
        self.solver = solver
        self.max_iter = max_iter
        self.class_weight = class_weight
        self.min_child_samples = min_child_samples
        self.min_child_weight = min_child_weight
        self.min_split_gain = min_split_gain
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.num_leaves = num_leaves
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.reg_lambda = reg_lambda
        self.n_jobs = n_jobs

        self.data_table()
        print(f"Data shape: {self.data.shape}")
        self.data_dictionary_table()
        print(f"Data dictionary shape: {self.data_dictionary.shape}")
        self.data = self.create_target(self.data)
        # Additional cleaning of data columns? Unnecessary? Seems the dataset is currently pretty clean.
        # Keep only rows that have a target for training - David
            # we should only have success == 1 for the training of the model
        # NOTE: STEP 2 — DICTIONARY-DRIVEN CLEANING (ONE DICTIONARY)
            # May not need to happen here.
        # NOTE: STEP 3 — MODEL: L1 feature selection → LightGBM (or RF) → Top N features
            # start here!
                # NOTE: we need to also do grid-search
                    # Or, rather, we need to iterate through all the options in the parameters
                        # to find the optimal condition

    def data_table(self) -> None:
        """Read the CSV at ``self.data_source`` into ``self.data``."""
        self.data:pd.DataFrame = (
            pd.read_csv(self.data_source)
        )

    def data_dictionary_table(self) -> None:
        """Read the Excel file at ``self.data_dictionary_source`` into ``self.data_dictionary``."""
        self.data_dictionary:pd.DataFrame = (
            pd.read_excel(self.data_dictionary_source)
        )

    def create_target(self,
                      data:pd.DataFrame
                      ) -> pd.DataFrame:
        """Return a copy of ``data`` with the target column in place of its two source flags.

        The target is ``True`` where ``active_account`` equals ``True`` and
        ``delinquent_account`` equals ``False``.  It is stored under
        ``self.target_col``; both source columns are dropped.  The input frame
        is left unmodified.

        Args:
            data: Frame containing ``active_account`` and ``delinquent_account``.

        Returns:
            A new frame without the two flag columns and with the target column.
        """
        # `.eq` keeps the original `== True` / `== False` semantics, so missing
        # values compare unequal and yield a False target.
        target:pd.Series = data['active_account'].eq(True) & data['delinquent_account'].eq(False)
        payload:pd.DataFrame = data.drop(columns=['active_account','delinquent_account'])
        payload[self.target_col] = target
        return payload

    @staticmethod
    def _make_one_hot_encoder() -> Any:
        """Build a sparse, unknown-tolerant OneHotEncoder on new and old scikit-learn."""
        try:
            return OneHotEncoder(handle_unknown="ignore", sparse_output=True)
        except TypeError:
            # older sklearn spells the keyword `sparse`
            return OneHotEncoder(handle_unknown="ignore", sparse=True)

    def _make_classifier(self) -> Tuple[Any, str]:
        """Return ``(classifier, display_name)``: LightGBM, or RandomForest when LightGBM is not installed."""
        try:
            from lightgbm import LGBMClassifier
        except ImportError:
            # Only a missing package triggers the fallback; any other failure
            # should surface rather than silently switch models.
            from sklearn.ensemble import RandomForestClassifier
            clf = RandomForestClassifier(
                n_estimators=500,
                min_samples_leaf=2,
                class_weight=self.class_weight,
                n_jobs=self.n_jobs,
                random_state=self.random_state,
            )
            return clf, "RandomForest"

        lgbm_params:Dict[str, Any] = dict(
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            num_leaves=self.num_leaves,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            reg_lambda=self.reg_lambda,
            class_weight=self.class_weight,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
        )
        # Helps find more splits, reduce warnings. A value of None is omitted
        # so LightGBM falls back to its own default for that parameter.
        for name in ("min_child_samples", "min_split_gain", "min_child_weight"):
            value = getattr(self, name)
            if value is not None:
                lgbm_params[name] = value
        return LGBMClassifier(**lgbm_params), "LightGBM"

    def model(self) -> None:
        """Fit the selection + classification pipeline on ``self.data`` and print the results.

        Steps: split predictors/target, impute + scale numeric columns and
        impute + one-hot encode object columns, make a stratified train/test
        split, keep the features chosen by an L1 ``LogisticRegression`` inside
        ``SelectFromModel(threshold="median")``, fit the final classifier, and
        print ROC AUC, PR AUC, a classification report and confusion matrix at
        a 0.5 threshold, and the top ``self.top_n_features`` importances.
        """
        table:pd.DataFrame = self.data
        y:pd.Series = table[self.target_col]
        X:pd.DataFrame = table.drop(columns=self.exclude_col+[self.target_col])

        # Separate numeric vs categorical future functionality
        cat_cols = [c for c in X.columns if X[c].dtype == object]
        cat_lookup = set(cat_cols)
        num_cols = [c for c in X.columns if c not in cat_lookup]

        # Imputes numericals
        num_pipe = Pipeline([
            ("impute", SimpleImputer(strategy="median")),
            ("scale", StandardScaler(with_mean=True, with_std=True)),
        ])
        # Imputes categoricals, then one-hot encodes them
        cat_pipe = Pipeline([
            ("impute", SimpleImputer(strategy="most_frequent")),
            ("onehot", self._make_one_hot_encoder()),
        ])
        # For columnar data in particular
        pre = ColumnTransformer(
            transformers=[
                ("num", num_pipe, num_cols),
                ("cat", cat_pipe, cat_cols),
            ],
            remainder="drop"
        )
        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=y,
        )

        # --- Fit the preprocessor on the training split only, then do L1 selection on the preprocessed matrix ---
        Xtr_pre = pre.fit_transform(X_train, y_train)
        Xte_pre = pre.transform(X_test)

        # L1 logistic for feature selection (wrapped inside SelectFromModel; no pipeline step for plain LogisticRegression)
        lasso = LogisticRegression(
            penalty=self.penalty,
            solver=self.solver,
            max_iter=self.max_iter,
            class_weight=self.class_weight,
        )
        selector = SelectFromModel(
            estimator=lasso,
            threshold="median",
        )
        selector.fit(Xtr_pre, y_train)

        Xtr_sel = selector.transform(Xtr_pre)
        Xte_sel = selector.transform(Xte_pre)
        print(f"Selected features (post-L1): {Xtr_sel.shape[1]}")

        # Final model: LightGBM (fallback to RF if LGBM not installed)
        clf, model_name = self._make_classifier()
        clf.fit(Xtr_sel, y_train)
        proba = clf.predict_proba(Xte_sel)[:, 1]

        roc = roc_auc_score(y_test, proba)
        pr_auc = average_precision_score(y_test, proba)
        print(f"\n{model_name} ROC AUC: {roc:.4f} | PR AUC: {pr_auc:.4f}")

        print("\n= Report @ 0.5 =")
        pred = (proba >= 0.5).astype(int)
        print(classification_report(y_test, pred, digits=3))
        print("Confusion matrix @ 0.5:\n", confusion_matrix(y_test, pred))

        # ------ Feature names and Top-N importances ------
        # Names follow the ColumnTransformer order: numeric columns first,
        # then the one-hot expanded categorical columns.
        num_feature_names = list(num_cols)
        cat_feature_names = []
        if cat_cols:
            oh = pre.named_transformers_["cat"].named_steps["onehot"]
            cat_feature_names = list(oh.get_feature_names_out(cat_cols))
        all_pre_names = num_feature_names + cat_feature_names

        # Mask from selector
        sel_mask = selector.get_support()
        selected_feature_names = [n for n, keep in zip(all_pre_names, sel_mask) if keep]

        # Importances
        importances = getattr(clf, "feature_importances_", None)
        if importances is not None and len(importances) == len(selected_feature_names):
            order = np.argsort(importances)[::-1]
            print(f"\nTop {self.top_n_features} predictors:")
            for i in order[:self.top_n_features]:
                print(f"{selected_feature_names[i]:45s}  {importances[i]:.6f}")
        else:
            print("\n[Note] Could not align importances with selected feature names. "
                "This can happen if the classifier lacks 'feature_importances_' or "
                "feature name lengths mismatch.")
    
# Create the analysis class object (its `__init__` loads both tables and builds
# the target column) and keep a module-level handle on the prepared DataFrame.
analysis:analyze = analyze()
table:pd.DataFrame = analysis.data

Data shape: (4700, 31)
Data dictionary shape: (31, 11)


In [None]:
# End-to-end modeling cell: preprocess -> L1 feature selection -> tree ensemble
# -> hold-out metrics -> top-N feature importances. Settings are read from the
# `analysis` object so this cell and the class configuration cannot drift apart.
y:pd.Series = table[analysis.target_col] # is bool fine?
X:pd.DataFrame = table.drop(columns=analysis.exclude_col+[analysis.target_col])

# Separate numeric vs categorical (objects)
cat_cols = [c for c in X.columns if X[c].dtype == object]
num_cols = [c for c in X.columns if c not in cat_cols]

# OneHotEncoder: handle new/old sklearn
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
except TypeError:
    # older sklearn spells the keyword `sparse`
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)

# Numeric columns: median imputation, then standardization
num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler(with_mean=True, with_std=True)),
])

# Categorical columns: most-frequent imputation, then one-hot encoding
cat_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", ohe),
])

pre = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop"
)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=analysis.test_size,
    random_state=analysis.random_state,
    stratify=y,
)

# --- Fit the preprocessor on the training split only, then do L1 selection on the preprocessed matrix ---
Xtr_pre = pre.fit_transform(X_train, y_train)
Xte_pre = pre.transform(X_test)

# L1 logistic for feature selection (wrapped inside SelectFromModel; no pipeline step for plain LogisticRegression)
lasso = LogisticRegression(
    penalty=analysis.penalty,
    solver=analysis.solver,
    max_iter=analysis.max_iter,
    class_weight=analysis.class_weight,
)

selector = SelectFromModel(
    estimator=lasso,
    threshold="median",
)
selector.fit(Xtr_pre, y_train)

Xtr_sel = selector.transform(Xtr_pre)
Xte_sel = selector.transform(Xte_pre)
print(f"Selected features (post-L1): {Xtr_sel.shape[1]}")

# Final model: LightGBM (fallback to RF if LGBM not installed)
try:
    from lightgbm import LGBMClassifier
except ImportError:
    # Only a missing package triggers the fallback; any other failure should
    # surface rather than silently switch models.
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(
        n_estimators=500,
        min_samples_leaf=2,
        class_weight=analysis.class_weight,
        n_jobs=analysis.n_jobs,
        random_state=analysis.random_state,
    )
    model_name = "RandomForest"
else:
    # NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0;
    # it is left at the library default here, so confirm whether row
    # subsampling is actually intended.
    clf = LGBMClassifier(
        n_estimators=analysis.n_estimators,
        learning_rate=analysis.learning_rate,
        num_leaves=analysis.num_leaves,
        subsample=analysis.subsample,
        colsample_bytree=analysis.colsample_bytree,
        reg_lambda=analysis.reg_lambda,
        class_weight=analysis.class_weight,
        n_jobs=analysis.n_jobs,
        random_state=analysis.random_state,
    )
    model_name = "LightGBM"

clf.fit(Xtr_sel, y_train)
proba = clf.predict_proba(Xte_sel)[:, 1]

# NOTE(review): the recorded run of this cell reports ROC AUC and PR AUC of
# 1.0000 on the hold-out split; a perfect score usually points to target
# leakage among the predictors — verify before trusting the ranking.
roc = roc_auc_score(y_test, proba)
pr_auc = average_precision_score(y_test, proba)
print(f"\n{model_name} ROC AUC: {roc:.4f} | PR AUC: {pr_auc:.4f}")

print("\n= Report @ 0.5 =")
pred = (proba >= 0.5).astype(int)
print(classification_report(y_test, pred, digits=3))
print("Confusion matrix @ 0.5:\n", confusion_matrix(y_test, pred))

# ------ Feature names and Top-N importances ------
# Names follow the ColumnTransformer order: numeric columns first, then the
# one-hot expanded categorical columns.
num_feature_names = list(num_cols)

cat_feature_names = []
if cat_cols:
    oh = pre.named_transformers_["cat"].named_steps["onehot"]
    cat_feature_names = list(oh.get_feature_names_out(cat_cols))

all_pre_names = num_feature_names + cat_feature_names

# Mask from selector
sel_mask = selector.get_support()
selected_feature_names = [n for n, keep in zip(all_pre_names, sel_mask) if keep]

# Importances
importances = getattr(clf, "feature_importances_", None)
if importances is not None and len(importances) == len(selected_feature_names):
    order = np.argsort(importances)[::-1]
    print(f"\nTop {analysis.top_n_features} predictors:")
    for i in order[:analysis.top_n_features]:
        print(f"{selected_feature_names[i]:45s}  {importances[i]:.6f}")
else:
    print("\n[Note] Could not align importances with selected feature names. "
          "This can happen if the classifier lacks 'feature_importances_' or "
          "feature name lengths mismatch.")


Selected features (post-L1): 29




[LightGBM] [Info] Number of positive: 2533, number of negative: 757
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001016 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1541
[LightGBM] [Info] Number of data points in the train set: 3290, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000

LightGBM ROC AUC: 1.0000 | PR AUC: 1.0000

= Report @ 0.5 =
              precision    recall  f1-score   support

       False      1.000     1.000     1.000       324
        True      1.000     1.000     1.000      1086

    accuracy                          1.000      1410
   macro avg      1.000     1.000     1.000      1410
weighted avg      1.000     1.000     1.000      1410

Confusion matrix @ 0.5:
 [[ 324    0]
 [   0 1086]]

Top 1

