# Feature Selection and Suggestion

In [9]:
# imports
import re
import json
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer
from typing import Optional, Tuple, Dict, List
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split  
from sklearn.feature_selection import SelectFromModel 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix
import os
from typing import List, Dict, Tuple


In [10]:
# variable class
class analyze:
    """Load the project data and its data dictionary, then derive the modeling target.

    Constructing an instance reads both source files, prints their shapes, and
    replaces ``self.data`` with a frame in which the target column has been added
    and the two columns it is derived from (``active_account`` and
    ``delinquent_account``) have been removed.

    Attributes:
        data_source: Path of the CSV file read into ``self.data``.
        data_dictionary_source: Path of the Excel file read into ``self.data_dictionary``.
        output_prefix: Prefix kept for naming outputs (not used inside this class).
        target_col: Name of the target column created by ``create_target``.
        exclude_col: Column names to leave out of modeling (a per-instance list).
        top_n_features: Number of top features to report (not used inside this class).
        data: The analysis table, including the target column.
        data_dictionary: The data dictionary table.
    """

    # NOTE: the default paths are computed once, when the class is defined, from the
    # working directory at that moment.
    def __init__(self,
                 data_source: str = os.path.join(os.getcwd(),"Project Data.csv"), # analysis data
                 data_dictionary_source: str = os.path.join(os.getcwd(),"Project Data Dictionary.xlsx"), # data dictionary
                 output_prefix: str = "iteration", # appends to indicate output
                 target_col: str = "success", # modeling target (1=good on C1B, 0=bad on C1B)
                 exclude_col: Optional[List[str]] = None, # columns to exclude from modeling; None -> ['ssn_ssn']
                 top_n_features: int = 10, # how many top features to print at the end
                 ):
        """Store the configuration, load both tables and build the target column.

        Args:
            data_source: CSV file holding the analysis data.
            data_dictionary_source: Excel file holding the data dictionary.
            output_prefix: Prefix used to label outputs.
            target_col: Name given to the derived target column.
            exclude_col: Columns to exclude from modeling. ``None`` (the default)
                means ``['ssn_ssn']``; a single string is treated as one column name.
            top_n_features: How many top features to print at the end.
        """
        self.data_source: str = data_source
        self.data_dictionary_source: str = data_dictionary_source
        self.output_prefix: str = output_prefix
        self.target_col: str = target_col
        # Build a fresh list per instance: a list literal in the signature would be
        # shared by every instance, so appending to one would change them all.
        if exclude_col is None:
            self.exclude_col: List[str] = ['ssn_ssn']
        elif isinstance(exclude_col, str):
            self.exclude_col = [exclude_col]
        else:
            self.exclude_col = list(exclude_col)
        self.top_n_features: int = top_n_features

        self.data_table()
        print(f"Data shape: {self.data.shape}")
        self.data_dictionary_table()
        print(f"Data dictionary shape: {self.data_dictionary.shape}")
        self.data = self.create_target(self.data)
        # Open questions / planned steps (kept from the original notes):
        # - Additional cleaning of data columns? Possibly unnecessary; the dataset
        #   currently looks fairly clean.
        # - Keep only rows that have a target for training - David
        #     (we should only have success == 1 for the training of the model)
        # - NOTE: STEP 2 - DICTIONARY-DRIVEN CLEANING (ONE DICTIONARY)
        #     May not need to happen here.
        # - NOTE: STEP 3 - MODEL: L1 feature selection -> LightGBM (or RF) -> Top N features
        #     Start here. A grid search is also needed, i.e. iterate over all the
        #     parameter options to find the optimal condition.

    def data_table(self) -> None:
        """Read the CSV at ``self.data_source`` into ``self.data``."""
        self.data: pd.DataFrame = pd.read_csv(self.data_source)

    def data_dictionary_table(self) -> None:
        """Read the Excel file at ``self.data_dictionary_source`` into ``self.data_dictionary``."""
        self.data_dictionary: pd.DataFrame = pd.read_excel(self.data_dictionary_source)

    def create_target(self,
                      data: pd.DataFrame
                      ) -> pd.DataFrame:
        """Return a copy of ``data`` with the target added and its source columns removed.

        The target is True where ``active_account`` equals True and
        ``delinquent_account`` equals False, and False otherwise. A missing value
        in either column compares as not-equal, so such rows get False.

        Args:
            data: Frame containing ``active_account`` and ``delinquent_account``.
                It is not modified.

        Returns:
            A new frame without the two source columns and with the boolean
            target stored under ``self.target_col``.

        Raises:
            KeyError: If either source column is absent from ``data``.
        """
        source_cols = ['active_account', 'delinquent_account']
        missing = [col for col in source_cols if col not in data.columns]
        if missing:
            raise KeyError(f"create_target requires column(s) {missing}")
        target = data['active_account'].eq(True) & data['delinquent_account'].eq(False)
        # drop() returns a new frame and assign() adds to that copy, so the
        # caller's frame is left untouched.
        return data.drop(columns=source_cols).assign(**{self.target_col: target})
    
# Instantiate with the default settings; this reads both source files and
# prints their shapes.
analysis: analyze = analyze()


Data shape: (4700, 31)
Data dictionary shape: (31, 11)


In [12]:
# Work on the prepared analysis table and list the dtype of every column.
table: pd.DataFrame = analysis.data
print(table.dtypes)

ident_monitor_opt         float64
Num_Bk_Accts              float64
income                    float64
bk_accts_ssn              float64
cells_ssn                 float64
dls_ssn                   float64
emails_ssn                float64
hmphones_ssn              float64
addrs_ssn                 float64
ssn_ssn                   float64
zips_ssn                  float64
empl_ssn_6Mo              float64
pday_inq_15days           float64
dti_score                 float64
days_from_registration    float64
days_from_login           float64
asset_score               float64
alt_risk_score            float64
seg_id                    float64
stability_score           float64
max_bkaccts               float64
auto_inq_1dy              float64
auto_inq_7dy              float64
auto_inq_72hr             float64
cc_inq_72hr               float64
cc_inq_10dy               float64
pl_inq_72hr               float64
pl_inq_90dy               float64
alt_risk_score_2          float64
success       

In [None]:
# Labels: a boolean target is fine, scikit-learn treats it as a two-class label.
y: pd.Series = table[analysis.target_col]

# Features: remove the excluded columns AND the target itself. Leaving the target
# in X would leak the answer into the model and make every metric meaningless.
_excluded = [analysis.exclude_col] if isinstance(analysis.exclude_col, str) else list(analysis.exclude_col)
_non_feature_cols = list(dict.fromkeys([*_excluded, analysis.target_col]))  # de-duplicated, order kept
X: pd.DataFrame = table.drop(columns=_non_feature_cols)
assert analysis.target_col not in X.columns, "target leaked into the feature matrix"

# Separate numeric vs categorical (objects)
cat_cols = [c for c in X.columns if X[c].dtype == object]
num_cols = [c for c in X.columns if c not in cat_cols]

# One-hot encoder: newer scikit-learn takes `sparse_output`, older releases take
# `sparse`; try the new keyword first and fall back on TypeError.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)  # older sklearn

# Numeric columns: fill gaps with the median, then standardize
# (StandardScaler centers and scales by default).
num_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
cat_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", ohe),
])

# Route each column group through its pipeline; any column not listed is dropped.
pre = ColumnTransformer(
    [("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)],
    remainder="drop",
)

# Hold out 30% of the rows for testing, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Learn the preprocessing from the training rows only, then apply the fitted
# transformer to both splits; L1 selection runs on these preprocessed matrices.
Xtr_pre = pre.fit_transform(X_train, y_train)
Xte_pre = pre.transform(X_test)

# L1-penalized logistic regression used only to rank features for SelectFromModel
# (it is not a pipeline step and is not the final classifier).
lasso = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    max_iter=3000,
    class_weight="balanced",
    random_state=42,  # liblinear shuffles the data; seed it like the split and final model
)

# Keep the features whose absolute coefficient is at or above the median.
# NOTE(review): if the L1 penalty zeroes more than half of the coefficients the
# median is 0 and every feature passes - confirm the selected count printed below.
selector = SelectFromModel(estimator=lasso, threshold="median")
selector.fit(Xtr_pre, y_train)

Xtr_sel = selector.transform(Xtr_pre)
Xte_sel = selector.transform(Xte_pre)
print(f"Selected features (post-L1): {Xtr_sel.shape[1]}")

# Final model: LightGBM, falling back to RandomForest only when LightGBM cannot be
# loaded. The try block covers just the import, so a mistake in the
# hyper-parameters below raises instead of silently switching model family.
try:
    from lightgbm import LGBMClassifier
except (ImportError, OSError) as exc:
    # ImportError: package not installed; OSError: its native library failed to load.
    from sklearn.ensemble import RandomForestClassifier
    print(f"LightGBM unavailable ({exc!r}); falling back to RandomForest")
    clf = RandomForestClassifier(
        n_estimators=500,
        min_samples_leaf=2,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42
    )
    model_name = "RandomForest"
else:
    clf = LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        num_leaves=64,
        subsample=0.8,
        subsample_freq=1,  # row subsampling is ignored by LightGBM unless this is > 0
        colsample_bytree=0.8,
        reg_lambda=1.0,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42
    )
    model_name = "LightGBM"

