# Imports And Settings

In [36]:
import category_encoders as ce
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Loading

In [2]:
%%time
df = pd.read_csv("../data/cleaned_dataset.csv", index_col=0)

CPU times: user 3.13 s, sys: 458 ms, total: 3.58 s
Wall time: 3.6 s


# Droping Useless Columns

In [3]:
df = df.drop(columns=['LoanNr_ChkDgt', 'Name'])

# Separating Features and Target

In [4]:
df.columns

Index(['City', 'State', 'Bank', 'BankState', 'SameState', 'NAICS',
       'ApprovalMonth', 'ApprovalDoW', 'Recession', 'Term', 'NewExist',
       'NoEmp', 'CreateJob', 'RetainedJob', 'Franchise', 'UrbanRural',
       'RevLineCr', 'LowDoc', 'GrAppv', 'SBA_Appv', 'MIS_Status'],
      dtype='object')

In [5]:
X = df.copy()
y = X.pop("MIS_Status")

# Hold-Out

In [6]:
y.value_counts(normalize=True)

MIS_Status
P I F     0.824377
CHGOFF    0.175623
Name: proportion, dtype: float64

In [7]:
# Stratify with y by default
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.05,
                                                    stratify=y)

In [8]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((852267, 20), (44857, 20), (852267,), (44857,))

# Model 1: BINARY ENCODING

In [9]:
# Splitting columns
num_cols = ["Term", "NoEmp", "CreateJob", "RetainedJob",
            "GrAppv", "SBA_Appv"]

bin_cols = df.select_dtypes("bool").columns

nom_cols = ["State", "BankState", "NAICS",
            "UrbanRural", "RevLineCr", "LowDoc"]

cyc_cols = ["ApprovalMonth", "ApprovalDoW"]

bin_enc_cols = ["Bank", "City"]

In [10]:
assert len(num_cols) + len(bin_cols) + len(nom_cols) + len(cyc_cols) + len(bin_enc_cols) == X.shape[1]

## "Simple" Nominal And Numerical Treats

In [11]:
ohe_bin = OneHotEncoder(drop="if_binary", sparse_output=False)
ohe_nom = OneHotEncoder(sparse_output=False)  # No drop="first" with a nonlinear model
std_scl = StandardScaler()

## Cyclical Encoding for Time-Related Features

In [12]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    """Meant to encode time data with cycles (days of week, month...)"""
    def __init__(self, column_name, cycle_length):
        self.column_name = column_name
        self.cycle_length = cycle_length
    
    def fit(self, X, y=None):
        # No fitting needed, implemented for compatibility with sklearn's API
        return self
        
    def transform(self, X, y=None):
        # Apply cyclical encoding directly without needing to fit
        X = X.copy()
        values = X[self.column_name]
        # Create the cyclical features
        X[f'{self.column_name}_sin'] = np.sin(2 * np.pi * values / self.cycle_length)
        X[f'{self.column_name}_cos'] = np.cos(2 * np.pi * values / self.cycle_length)
        # Drop the original column
        X.drop(columns=[self.column_name], inplace=True)
        return X
    
    def get_feature_names_out(self, input_features=None):
        # Generate names for the output features
        return np.array(
          [f'{self.column_name}_sin', f'{self.column_name}_cos'], dtype=object
        )

In [13]:
cyc_dow = CyclicalEncoder("ApprovalDoW", 7)
cyc_mth = CyclicalEncoder("ApprovalMonth", 12)

## Binary Encoding for `Bank` and `City`

In [14]:
bin_enc = ce.BinaryEncoder(cols=bin_enc_cols)

In [15]:
X_train_bin = bin_enc.fit_transform(X_train)
X_train_bin

Unnamed: 0,City_0,City_1,City_2,City_3,City_4,City_5,City_6,City_7,City_8,City_9,...,NewExist,NoEmp,CreateJob,RetainedJob,Franchise,UrbanRural,RevLineCr,LowDoc,GrAppv,SBA_Appv
702917,0,0,0,0,0,0,0,0,0,0,...,False,6,1,6,False,U,Unknown,N,50000.0,25000.0
197965,0,0,0,0,0,0,0,0,0,0,...,False,8,1,8,False,M,Unknown,N,310000.0,232500.0
62584,0,0,0,0,0,0,0,0,0,0,...,False,1,3,1,False,U,Y,N,24000.0,12000.0
209827,0,0,0,0,0,0,0,0,0,0,...,True,0,1,0,False,U,Y,N,15000.0,7500.0
457392,0,0,0,0,0,0,0,0,0,0,...,False,4,0,4,False,R,Y,N,40000.0,20000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221087,0,0,0,0,0,0,0,0,0,0,...,False,2,0,2,False,U,Y,N,40000.0,20000.0
521445,1,0,1,0,1,0,0,0,0,1,...,False,12,4,8,False,U,Y,N,100000.0,50000.0
441446,0,0,0,0,0,1,1,0,1,0,...,True,6,0,0,True,M,N,N,278900.0,237065.0
634727,0,0,0,1,0,0,1,1,1,1,...,False,1,0,1,False,U,Unknown,N,12000.0,6000.0


In [16]:
X_train_bin.columns

Index(['City_0', 'City_1', 'City_2', 'City_3', 'City_4', 'City_5', 'City_6',
       'City_7', 'City_8', 'City_9', 'City_10', 'City_11', 'City_12',
       'City_13', 'City_14', 'State', 'Bank_0', 'Bank_1', 'Bank_2', 'Bank_3',
       'Bank_4', 'Bank_5', 'Bank_6', 'Bank_7', 'Bank_8', 'Bank_9', 'Bank_10',
       'Bank_11', 'Bank_12', 'BankState', 'SameState', 'NAICS',
       'ApprovalMonth', 'ApprovalDoW', 'Recession', 'Term', 'NewExist',
       'NoEmp', 'CreateJob', 'RetainedJob', 'Franchise', 'UrbanRural',
       'RevLineCr', 'LowDoc', 'GrAppv', 'SBA_Appv'],
      dtype='object')

In [18]:
preproc1 = ColumnTransformer(
    transformers = [
        ("num", std_scl, num_cols),
        ("bin", ohe_bin, bin_cols),
        ("nom", ohe_nom, nom_cols),
        ("cyc_mth", cyc_mth, ["ApprovalMonth"]),
        ("cyc_dow", cyc_dow, ["ApprovalDoW"]),
        ("bin_enc", bin_enc, bin_enc_cols),
    ],
    verbose_feature_names_out=False
)
preproc1.set_output(transform="pandas")

In [20]:
X_train_tr = preproc1.fit_transform(X_train)
X_train_tr

Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,GrAppv,SBA_Appv,SameState_True,Recession_True,NewExist_True,Franchise_True,...,City_5,City_6,City_7,City_8,City_9,City_10,City_11,City_12,City_13,City_14
702917,-1.151280,-0.074118,-0.031572,-0.020442,-0.504702,-0.545958,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
197965,1.371057,-0.046681,-0.031572,-0.012064,0.412577,0.361942,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
62584,-0.340076,-0.142710,-0.023182,-0.041386,-0.596430,-0.602838,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,1
209827,-0.644277,-0.156429,-0.031572,-0.045574,-0.628182,-0.622527,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,1,0,0
457392,-1.290705,-0.101555,-0.035766,-0.028819,-0.539982,-0.567835,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221087,-0.340076,-0.128992,-0.035766,-0.037197,-0.539982,-0.567835,0.0,1.0,0.0,0.0,...,0,0,0,0,0,1,1,0,0,1
521445,-0.796378,0.008193,-0.018987,-0.012064,-0.328302,-0.436572,0.0,0.0,0.0,0.0,...,0,0,0,0,1,1,0,0,0,1
441446,1.333032,-0.074118,-0.035766,-0.045574,0.302856,0.381916,0.0,1.0,1.0,1.0,...,1,1,0,1,0,0,0,0,0,0
634727,-0.644277,-0.142710,-0.035766,-0.041386,-0.638766,-0.629091,0.0,0.0,0.0,0.0,...,0,1,1,1,1,1,0,0,1,1


## Estimator

### Train & Score

In [21]:
rfc1 = RandomForestClassifier(random_state=0)

In [25]:
model1 = make_pipeline(preproc1, rfc1)

In [26]:
model1

#### On Accuracy

In [27]:
%%time
model1.fit(X_train, y_train)
model1.score(X_test, y_test)

CPU times: user 4min 49s, sys: 1.93 s, total: 4min 51s
Wall time: 4min 51s


0.9309583788483403

#### On $F_1$ Macro

In [37]:
y_pred = model1.predict(X_test)
f1_score(y_test, y_pred, average="macro")

0.8672263993607658

### Feature Importances

#### Basic Analysis

In [38]:
importances = model1[-1].feature_importances_

In [39]:
indices = np.argsort(importances)[::-1]
feat_labels = X_train_tr.columns
feat_labels

Index(['Term', 'NoEmp', 'CreateJob', 'RetainedJob', 'GrAppv', 'SBA_Appv',
       'SameState_True', 'Recession_True', 'NewExist_True', 'Franchise_True',
       ...
       'City_5', 'City_6', 'City_7', 'City_8', 'City_9', 'City_10', 'City_11',
       'City_12', 'City_13', 'City_14'],
      dtype='object', length=179)

In [40]:
for f in range(X_train_tr.shape[1]):
    print("%2d) %-*s %f" % (f + 1,
                            85,
                            feat_labels[indices[f]],
                            importances[indices[f]]))

 1) Term                                                                                  0.326549
 2) SBA_Appv                                                                              0.048372
 3) GrAppv                                                                                0.044261
 4) NoEmp                                                                                 0.027507
 5) RetainedJob                                                                           0.026933
 6) ApprovalMonth_sin                                                                     0.019874
 7) ApprovalMonth_cos                                                                     0.019787
 8) SameState_True                                                                        0.017567
 9) ApprovalDoW_sin                                                                       0.015848
10) Recession_True                                                                        0.015193
11) Create

In [41]:
feat_labels[indices]

Index(['Term', 'SBA_Appv', 'GrAppv', 'NoEmp', 'RetainedJob',
       'ApprovalMonth_sin', 'ApprovalMonth_cos', 'SameState_True',
       'ApprovalDoW_sin', 'Recession_True',
       ...
       'BankState_WY', 'BankState_Missing', 'NAICS_utilities', 'BankState_PR',
       'NAICS_public_administration',
       'NAICS_management_of_companies_and_entreproses', 'BankState_EN',
       'BankState_GU', 'BankState_AN', 'BankState_VI'],
      dtype='object', length=179)

In [42]:
pd.set_option("display.max_rows", None)

df_imp = (
    pd.DataFrame({
        "feature": feat_labels[indices],
        "importance %": importances
    })
    .set_index("feature")
    .mul(100)
)

In [44]:
df_imp.head(20)

Unnamed: 0_level_0,importance %
feature,Unnamed: 1_level_1
Term,32.654892
SBA_Appv,2.750687
GrAppv,1.374682
NoEmp,2.693301
RetainedJob,4.426118
ApprovalMonth_sin,4.83717
ApprovalMonth_cos,1.756672
SameState_True,1.519271
ApprovalDoW_sin,0.741072
Recession_True,0.298814


In [35]:
styled_df_imp = df_imp.style.format("{:.3f}")
styled_df_imp

Unnamed: 0_level_0,importance %
feature,Unnamed: 1_level_1
Term,32.655
SBA_Appv,2.751
GrAppv,1.375
NoEmp,2.693
RetainedJob,4.426
ApprovalMonth_sin,4.837
ApprovalMonth_cos,1.757
SameState_True,1.519
ApprovalDoW_sin,0.741
Recession_True,0.299


In [None]:
# Get back to default settings
pd.reset_option("display.max_rows")
pd.reset_option("display.max_columns")

#### Elaborated Analysis

In [None]:
df_imp.head()

In [None]:
df_imp.query("feature.str.contains('Bank')").sum()

In [None]:
def get_agg_importance(feat: str) -> float:
    """
    Get an aggregated feature importance for an encoded feature
    """
    print(f"Aggregated Importance for feature {feat}")
    return df_imp.query("feature.str.startswith(@feat + '_')").sum()

In [None]:
get_agg_importance("Bank")

In [None]:
bin_cols

In [None]:
cols_to_agg = [bin_cols, nom_cols, cyc_cols, bin_enc_cols]

In [None]:
for col_type in cols_to_agg:
    for col in col_type:
        print(get_agg_importance(col))

#### Results Within a Dataset

In [None]:
df_num = (df_imp
 # .reset_index()
 .query("not feature.str.contains('_')")
)
df_num

In [None]:
df_sba = (df_imp
 # .reset_index()
 .query("feature == 'SBA_Appv'")
)
df_sba

In [None]:
df_nom = (df_imp
 .query("feature.str.contains('_') and not feature.str.contains('SBA')")
 .rename(index=lambda x: x.split("_")[0])
 .groupby("feature").sum()
 # .reset_index()
)
df_nom        

In [None]:
df_imp_agg = (pd.concat([df_num, df_sba, df_nom])
 .sort_values("importance %", ascending=False)
)
df_imp_agg

## Preprocessing 2 [TARGET ENCODING]

### Based on `category_encoders`

In [45]:
lab_enc = LabelEncoder()

In [46]:
y_train = lab_enc.fit_transform(y_train)
y_test = lab_enc.transform(y_test)

In [47]:
tar_enc = ce.TargetEncoder(cols=bin_enc_cols)

In [48]:
preproc2 = ColumnTransformer(
    transformers = [
        ("num", std_scl, num_cols),
        ("bin", ohe_bin, bin_cols),
        ("nom", ohe_nom, nom_cols),
        ("cyc_mth", cyc_mth, ["ApprovalMonth"]),
        ("cyc_dow", cyc_dow, ["ApprovalDoW"]),
        ("bin_enc", tar_enc, bin_enc_cols),
    ],
    verbose_feature_names_out=False
)
preproc2.set_output(transform="pandas")

In [49]:
%%time
X_train_tr = preproc2.fit_transform(X_train, y_train)

CPU times: user 5.79 s, sys: 868 ms, total: 6.65 s
Wall time: 6.66 s


In [51]:
X_train_tr

Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,GrAppv,SBA_Appv,SameState_True,Recession_True,NewExist_True,Franchise_True,...,RevLineCr_Unknown,RevLineCr_Y,LowDoc_N,LowDoc_Y,ApprovalMonth_sin,ApprovalMonth_cos,ApprovalDoW_sin,ApprovalDoW_cos,Bank,City
1276,2.398962,0.007790,-0.035747,-0.045592,2.918333,2.624791,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.000000e+00,6.123234e-17,0.781831,0.623490,0.877483,0.790927
662596,-1.100759,-0.113652,-0.023153,-0.045592,-0.628047,-0.622200,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,-1.000000e+00,-1.836970e-16,-0.433884,-0.900969,0.890943,0.767705
631321,-0.339950,-0.059678,-0.035747,-0.045592,-0.416323,-0.376117,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,8.660254e-01,-5.000000e-01,0.433884,-0.900969,0.586372,0.855527
472488,-0.948597,-0.019197,-0.035747,-0.045592,-0.257530,-0.208780,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,-8.660254e-01,-5.000000e-01,0.000000,1.000000,0.852033,0.885943
431489,-0.796435,-0.059678,-0.035747,-0.045592,-0.587467,-0.552990,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,-5.000000e-01,-8.660254e-01,0.781831,0.623490,0.825279,0.826087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
792754,1.638153,0.439582,-0.010558,0.113642,0.956355,1.374905,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.224647e-16,-1.000000e+00,0.974928,-0.222521,1.000000,0.948052
274096,-0.809115,-0.059678,-0.035747,-0.045592,0.007125,0.070115,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,5.000000e-01,-8.660254e-01,-0.433884,-0.900969,0.858655,0.809430
726006,-0.187788,-0.073171,-0.035747,-0.045592,0.324712,0.404789,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,-1.000000e+00,-1.836970e-16,0.974928,-0.222521,0.724203,0.885943
724051,-0.263869,0.034777,-0.035747,-0.045592,0.236493,0.198078,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.224647e-16,-1.000000e+00,-0.433884,-0.900969,0.785441,0.857545


# Estimator

## Train & Score

In [52]:
rfc = RandomForestClassifier(random_state=0)

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [53]:
model2 = make_pipeline(preproc2, rfc)

In [54]:
model2

In [56]:
%%time
model2.fit(X_train, y_train)
model2.score(X_test, y_test)

CPU times: user 4min 24s, sys: 1.44 s, total: 4min 26s
Wall time: 4min 26s


0.9337004257975344