# Imports And Settings

In [47]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Loading

In [14]:
%%time
# Read the cleaned dataset CSV; index_col=0 makes the first CSV column the
# DataFrame index instead of a regular column.
df = pd.read_csv("../data/cleaned_dataset.csv", index_col=0)

CPU times: user 2.31 s, sys: 246 ms, total: 2.56 s
Wall time: 2.56 s


# Dropping Useless Columns

In [15]:
# Remove the two columns that are not used as features.
# errors="ignore" keeps this cell idempotent: because `df` is rebound in place,
# running the cell a second time would otherwise raise a KeyError on the
# already-removed columns.
df = df.drop(columns=['LoanNr_ChkDgt', 'Name'], errors="ignore")

# Separating Features and Target

In [16]:
# Display the column labels of `df` (features plus the target).
df.columns

Index(['City', 'State', 'Bank', 'BankState', 'SameState', 'NAICS',
       'ApprovalMonth', 'ApprovalDoW', 'Recession', 'Term', 'NewExist',
       'NoEmp', 'CreateJob', 'RetainedJob', 'Franchise', 'UrbanRural',
       'RevLineCr', 'LowDoc', 'GrAppv', 'SBA_Appv', 'MIS_Status'],
      dtype='object')

In [17]:
# Target: an independent copy of the MIS_Status column.
y = df["MIS_Status"].copy()
# Features: every other column, in a new frame so `df` itself stays untouched.
X = df.drop(columns=["MIS_Status"])

# Hold-Out

In [21]:
# Relative frequency of each target class (normalize=True returns proportions
# rather than raw counts).
y.value_counts(normalize=True)

MIS_Status
P I F     0.824377
CHGOFF    0.175623
Name: proportion, dtype: float64

In [25]:
# Stratified hold-out: 5% of the rows are kept aside as a test set.
# `stratify=y` is passed explicitly (it is not the default) so that both splits
# keep approximately the same class proportions as `y`.
# `random_state` is fixed so the split is identical on every
# Restart Kernel -> Run All; without it each run yields a different test set.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.05,
                                                    stratify=y,
                                                    random_state=42)

In [26]:
# Shapes of the four objects returned by the split, shown as one tuple.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((852267, 20), (44857, 20), (852267,), (44857,))

# Pipeline

In [34]:
# Number of distinct values in each feature column.
X.nunique()

City             32562
State               51
Bank              5801
BankState           57
SameState            2
NAICS               21
ApprovalMonth       12
ApprovalDoW          7
Recession            2
Term               412
NewExist             2
NoEmp              598
CreateJob          246
RetainedJob        357
Franchise            2
UrbanRural           3
RevLineCr            3
LowDoc               2
GrAppv           22101
SBA_Appv         38288
dtype: int64

In [44]:
# Partition the feature columns into groups; each group is declared once here.

# Listed by hand.
num_cols = [
    "Term",
    "NoEmp",
    "CreateJob",
    "RetainedJob",
    "GrAppv",
    "SBA_Appv",
]

# Detected from the dtype instead of being listed by hand.
bin_cols = df.select_dtypes(include="bool").columns

nom_cols = [
    "State",
    "BankState",
    "NAICS",
    "UrbanRural",
    "RevLineCr",
    "LowDoc",
]

cyc_cols = [
    "ApprovalMonth",
    "ApprovalDoW",
]

# Single-column groups kept as lists, like the other groups.
bin_enc_cols = ["Bank"]

feat_hash_cols = ["City"]

In [45]:
# Every feature must belong to exactly one group. Comparing lengths alone is not
# enough: the totals would still match if one column were listed twice while
# another one was forgotten, so duplicates and coverage are checked as well.
grouped_cols = [*num_cols, *bin_cols, *nom_cols, *cyc_cols, *bin_enc_cols, *feat_hash_cols]

assert len(grouped_cols) == X.shape[1], "column groups do not add up to the number of features"
assert len(set(grouped_cols)) == len(grouped_cols), "a column appears in more than one group"
assert set(grouped_cols) == set(X.columns), "column groups do not match the columns of X"

## "Simple" Nominal And Numerical Treatments

In [46]:
# Scaler with default settings.
std_scl = StandardScaler()

# One-hot encoders returning dense arrays (sparse_output=False).
# drop="first" removes the first category of every encoded feature.
ohe_nom = OneHotEncoder(sparse_output=False, drop="first")
# drop="if_binary" removes a category only for features with two categories.
ohe_bin = OneHotEncoder(sparse_output=False, drop="if_binary")

## Cyclical Encoding

In [52]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    """Encode a periodic column (day of week, month, ...) as a sine/cosine pair.

    The column ``column_name`` is replaced by two columns,
    ``<column_name>_sin`` and ``<column_name>_cos``, computed from the angle
    ``2 * pi * value / cycle_length``. Every other column of the input is
    passed through unchanged, ahead of the two new columns.

    Parameters
    ----------
    column_name : str
        Label of the column to encode. The input given to ``transform`` must
        support ``X[column_name]`` and ``X.drop(columns=...)`` (e.g. a pandas
        DataFrame).
    cycle_length : int or float
        Number of steps in one full cycle.
    """

    def __init__(self, column_name, cycle_length):
        self.column_name = column_name
        self.cycle_length = cycle_length

    def fit(self, X, y=None):
        """Record the input column labels; nothing is learned from the values.

        The labels are kept so that ``get_feature_names_out`` can describe the
        real output of ``transform`` even when ``X`` has more than one column.
        Returns ``self``.
        """
        columns = getattr(X, "columns", None)
        # Reset on every fit so labels from a previous fit are never reused.
        self.input_columns_ = None if columns is None else list(columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``column_name`` replaced by its sin/cos pair.

        ``X`` itself is not modified. A ``KeyError`` from the column lookup
        propagates when ``column_name`` is absent.
        """
        values = X[self.column_name]
        encoded = {
            f'{self.column_name}_sin': np.sin(2 * np.pi * values / self.cycle_length),
            f'{self.column_name}_cos': np.cos(2 * np.pi * values / self.cycle_length),
        }
        # drop() and assign() both return new frames, so no in-place mutation
        # is needed and the caller's data stays untouched.
        return X.drop(columns=[self.column_name]).assign(**encoded)

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, in the order produced by ``transform``.

        Parameters
        ----------
        input_features : sequence of str, optional
            Input column labels. When omitted, the labels recorded by ``fit``
            are used; if none are available, only the two generated names are
            returned.

        Returns
        -------
        numpy.ndarray of object
            Pass-through column names followed by ``<column_name>_sin`` and
            ``<column_name>_cos``.
        """
        if input_features is None:
            input_features = getattr(self, "input_columns_", None)

        if input_features is None:
            names = []
        else:
            # The encoded column itself does not survive transform().
            names = [name for name in input_features if name != self.column_name]

        for new_name in (f'{self.column_name}_sin', f'{self.column_name}_cos'):
            # An existing column with the same name is overwritten in place by
            # transform(), so it must not be reported twice.
            if new_name not in names:
                names.append(new_name)

        return np.array(names, dtype=object)

In [53]:
# One encoder per cyclical column: cycle length 12 for ApprovalMonth and
# 7 for ApprovalDoW.
cyc_mth = CyclicalEncoder(column_name="ApprovalMonth", cycle_length=12)
cyc_dow = CyclicalEncoder(column_name="ApprovalDoW", cycle_length=7)