# Imports And Settings

In [63]:
import category_encoders as ce
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

# Loading

In [14]:
%%time
# Read the cleaned dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    "../data/cleaned_dataset.csv",
    index_col=0,
)

CPU times: user 2.31 s, sys: 246 ms, total: 2.56 s
Wall time: 2.56 s


# Dropping Useless Columns

In [15]:
# Remove the columns that are not used for modelling.
# errors="ignore" keeps this cell idempotent: re-running it once the columns
# are already gone no longer raises a KeyError.
df = df.drop(columns=['LoanNr_ChkDgt', 'Name'], errors="ignore")

# Separating Features and Target

In [16]:
# List the remaining columns (features and target)
df.columns

Index(['City', 'State', 'Bank', 'BankState', 'SameState', 'NAICS',
       'ApprovalMonth', 'ApprovalDoW', 'Recession', 'Term', 'NewExist',
       'NoEmp', 'CreateJob', 'RetainedJob', 'Franchise', 'UrbanRural',
       'RevLineCr', 'LowDoc', 'GrAppv', 'SBA_Appv', 'MIS_Status'],
      dtype='object')

In [17]:
# Target: the `MIS_Status` column; features: every other column.
# Both are independent copies, so `df` itself is left untouched.
y = df["MIS_Status"].copy()
X = df.drop(columns="MIS_Status")

# Hold-Out

In [21]:
# Class proportions of the target
y.value_counts(normalize=True)

MIS_Status
P I F     0.824377
CHGOFF    0.175623
Name: proportion, dtype: float64

In [25]:
# Stratified hold-out: 5% test set, stratified on y so both splits keep the
# same class proportions. random_state pins the shuffle so the split (and
# everything fitted on it) is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.05,
    stratify=y,
    random_state=42,
)

In [26]:
# Shapes of the four splits, in the order they were returned
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((852267, 20), (44857, 20), (852267,), (44857,))

# Pipeline

## Preprocessing

In [34]:
# Number of distinct values per feature
X.nunique()

City             32562
State               51
Bank              5801
BankState           57
SameState            2
NAICS               21
ApprovalMonth       12
ApprovalDoW          7
Recession            2
Term               412
NewExist             2
NoEmp              598
CreateJob          246
RetainedJob        357
Franchise            2
UrbanRural           3
RevLineCr            3
LowDoc               2
GrAppv           22101
SBA_Appv         38288
dtype: int64

In [64]:
# Assign every feature to exactly one preprocessing group.

# Numeric features
num_cols = ["Term", "NoEmp", "CreateJob", "RetainedJob",
            "GrAppv", "SBA_Appv"]

# Boolean features: selected from X (not df) so the target column can never
# end up among the features; converted to a list like the other groups.
bin_cols = X.select_dtypes("bool").columns.tolist()

# Low-cardinality nominal features
nom_cols = ["State", "BankState", "NAICS",
            "UrbanRural", "RevLineCr", "LowDoc"]

# Periodic (calendar) features
cyc_cols = ["ApprovalMonth", "ApprovalDoW"]

# High-cardinality features handled with binary encoding
bin_enc_cols = ["Bank", "City"]

In [65]:
# Every feature must belong to exactly one group. Comparing names (not only
# counts) also catches typos and columns listed in two groups at once.
grouped_cols = [*num_cols, *bin_cols, *nom_cols, *cyc_cols, *bin_enc_cols]
assert len(grouped_cols) == len(set(grouped_cols)), "a column is listed in more than one group"
assert set(grouped_cols) == set(X.columns), "column groups do not match the features of X"

### "Simple" Nominal and Numerical Treatments

In [56]:
# Two-level features collapse to a single 0/1 column (drop="if_binary").
ohe_bin = OneHotEncoder(drop="if_binary", sparse_output=False)
# Nominal features keep every level: no drop="first" with a nonlinear model.
# handle_unknown="ignore" encodes a category that was not seen during fit as
# all zeros instead of raising at transform time (e.g. on the hold-out set).
ohe_nom = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
# Numeric features are centred and scaled to unit variance.
std_scl = StandardScaler()

### Cyclical Encoding

In [52]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    """Encode a periodic column (day of week, month, ...) as a sin/cos pair.

    Each value ``v`` of ``column_name`` is mapped to
    ``sin(2 * pi * v / cycle_length)`` and ``cos(2 * pi * v / cycle_length)``.

    Parameters
    ----------
    column_name : str
        Column to encode. It is removed from the output and replaced by
        ``<column_name>_sin`` and ``<column_name>_cos``.
    cycle_length : int or float
        Number of steps in one full cycle (e.g. 7 for days of the week).
    """

    def __init__(self, column_name, cycle_length):
        self.column_name = column_name
        self.cycle_length = cycle_length

    def fit(self, X, y=None):
        """Nothing is learned; only the input column names are remembered.

        The names are needed by ``get_feature_names_out`` when it is called
        without ``input_features``.
        """
        columns = getattr(X, "columns", None)
        if columns is not None:
            self.feature_names_in_ = np.array(columns, dtype=object)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``column_name`` replaced by its sin/cos pair.

        All other columns are kept, in their original order; the two new
        columns come last. ``X`` itself is not modified.
        """
        angle = 2 * np.pi * X[self.column_name] / self.cycle_length
        # drop() returns a new frame, so the caller's data is never mutated
        encoded = X.drop(columns=[self.column_name])
        encoded[f'{self.column_name}_sin'] = np.sin(angle)
        encoded[f'{self.column_name}_cos'] = np.cos(angle)
        return encoded

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, in the order produced by ``transform``.

        Columns other than ``column_name`` are passed through by ``transform``,
        so they are reported here too. When neither ``input_features`` nor
        fitted column names are available, only the sin/cos pair is returned.
        """
        if input_features is None:
            input_features = getattr(self, "feature_names_in_", [self.column_name])
        passthrough = [name for name in input_features if name != self.column_name]
        return np.array(
            passthrough + [f'{self.column_name}_sin', f'{self.column_name}_cos'],
            dtype=object,
        )

In [53]:
# One encoder per periodic feature: a 7-step cycle for the day of week and a
# 12-step cycle for the month.
cyc_dow = CyclicalEncoder(column_name="ApprovalDoW", cycle_length=7)
cyc_mth = CyclicalEncoder(column_name="ApprovalMonth", cycle_length=12)

### Binary Encoding for `Bank` and `City`

In [66]:
# Binary encoder restricted to the columns listed in `bin_enc_cols`
bin_enc = ce.BinaryEncoder(
    cols=bin_enc_cols,
)

In [67]:
# Fit the binary encoder on the training split and encode it, to inspect the result
X_train_bin = bin_enc.fit_transform(X_train)

In [68]:
# Preview the binary-encoded training frame
X_train_bin

Unnamed: 0,City_0,City_1,City_2,City_3,City_4,City_5,City_6,City_7,City_8,City_9,...,NewExist,NoEmp,CreateJob,RetainedJob,Franchise,UrbanRural,RevLineCr,LowDoc,GrAppv,SBA_Appv
49057,0,0,0,0,0,0,0,0,0,0,...,False,56,0,0,False,M,N,N,300000.0,207000.0
227232,0,0,0,0,0,0,0,0,0,0,...,False,11,6,11,False,M,Unknown,N,409900.0,307425.0
239854,0,0,0,0,0,0,0,0,0,0,...,False,18,9,9,False,M,N,N,295000.0,295000.0
668835,0,0,0,0,0,0,0,0,0,0,...,False,15,0,0,False,M,N,N,200000.0,150000.0
99148,0,0,0,0,0,0,0,0,0,0,...,False,2,2,4,False,U,Y,N,12000.0,6000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511659,0,0,0,0,0,0,0,1,1,1,...,False,9,0,0,False,M,N,N,81000.0,70470.0
441030,0,0,0,1,1,1,1,1,1,0,...,False,4,0,4,False,U,Unknown,N,120000.0,102000.0
198596,0,0,1,1,1,0,0,0,0,0,...,False,3,2,3,False,U,Y,N,25000.0,12500.0
748654,0,0,0,1,1,0,1,0,1,1,...,False,120,0,0,False,M,N,Y,220000.0,154000.0


In [69]:
# Column names after binary encoding
X_train_bin.columns

Index(['City_0', 'City_1', 'City_2', 'City_3', 'City_4', 'City_5', 'City_6',
       'City_7', 'City_8', 'City_9', 'City_10', 'City_11', 'City_12',
       'City_13', 'City_14', 'State', 'Bank_0', 'Bank_1', 'Bank_2', 'Bank_3',
       'Bank_4', 'Bank_5', 'Bank_6', 'Bank_7', 'Bank_8', 'Bank_9', 'Bank_10',
       'Bank_11', 'Bank_12', 'BankState', 'SameState', 'NAICS',
       'ApprovalMonth', 'ApprovalDoW', 'Recession', 'Term', 'NewExist',
       'NoEmp', 'CreateJob', 'RetainedJob', 'Franchise', 'UrbanRural',
       'RevLineCr', 'LowDoc', 'GrAppv', 'SBA_Appv'],
      dtype='object')

### Feature Hashing for `City`

In [70]:
# Set aside for the moment: `City` is binary-encoded together with `Bank` above.

In [73]:
# Route each column group to its own transformer. Output names are kept
# un-prefixed (verbose_feature_names_out=False).
preproc = ColumnTransformer(
    [
        ("num", std_scl, num_cols),
        ("bin", ohe_bin, bin_cols),
        ("nom", ohe_nom, nom_cols),
        ("cyc_mth", cyc_mth, ["ApprovalMonth"]),
        ("cyc_dow", cyc_dow, ["ApprovalDoW"]),
        ("bin_enc", bin_enc, bin_enc_cols),
    ],
    verbose_feature_names_out=False,
)
# Make transform / fit_transform return DataFrames rather than arrays
preproc.set_output(transform="pandas")

In [75]:
# Fit the preprocessing on the training split only, then transform it
X_train_tr = preproc.fit_transform(X_train)

In [76]:
# Preview the preprocessed training features
X_train_tr

Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,GrAppv,SBA_Appv,SameState_True,Recession_True,NewExist_True,Franchise_True,...,City_5,City_6,City_7,City_8,City_9,City_10,City_11,City_12,City_13,City_14
49057,0.116457,0.608226,-0.035721,-0.045540,0.378083,0.250932,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
227232,2.398551,-0.005474,-0.010512,0.000606,0.766322,0.690834,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
239854,1.637853,0.089990,0.002093,-0.007784,0.360420,0.636408,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,1
668835,-0.492102,0.049077,-0.035721,-0.045540,0.024818,0.001249,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
99148,-0.339962,-0.128214,-0.027318,-0.028760,-0.639321,-0.629529,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511659,-0.644241,-0.032749,-0.035721,-0.045540,-0.395568,-0.347124,1.0,0.0,0.0,0.0,...,0,0,1,1,1,1,0,1,1,1
441030,2.398551,-0.100938,-0.035721,-0.028760,-0.257794,-0.209010,0.0,1.0,0.0,0.0,...,1,1,1,1,0,1,1,0,1,0
198596,-1.176730,-0.114576,-0.027318,-0.032955,-0.593397,-0.601056,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
748654,0.116457,1.481043,-0.035721,-0.045540,0.095471,0.018771,1.0,0.0,0.0,0.0,...,0,1,0,1,1,0,1,0,0,1


## Estimator