# Imports And Settings

In [57]:
import category_encoders as ce
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Loading

In [58]:
%%time
df = pd.read_csv("../data/cleaned_dataset.csv", index_col=0)

CPU times: user 2.56 s, sys: 212 ms, total: 2.78 s
Wall time: 2.78 s


# Droping Useless Columns

In [59]:
df = df.drop(columns=['LoanNr_ChkDgt', 'Name'])

# Separating Features and Target

In [60]:
df.columns

Index(['City', 'State', 'Bank', 'BankState', 'SameState', 'NAICS',
       'ApprovalMonth', 'ApprovalDoW', 'Recession', 'Term', 'NewExist',
       'NoEmp', 'CreateJob', 'RetainedJob', 'Franchise', 'UrbanRural',
       'RevLineCr', 'LowDoc', 'GrAppv', 'SBA_Appv', 'MIS_Status'],
      dtype='object')

In [61]:
X = df.copy()
y = X.pop("MIS_Status")

# Hold-Out

In [62]:
y.value_counts(normalize=True)

MIS_Status
P I F     0.824377
CHGOFF    0.175623
Name: proportion, dtype: float64

In [63]:
# Stratify with y by default
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.05,
                                                    stratify=y)

In [64]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((852267, 20), (44857, 20), (852267,), (44857,))

# Model 1: BINARY ENCODING

In [65]:
# Splitting columns
num_cols = ["Term", "NoEmp", "CreateJob", "RetainedJob",
            "GrAppv", "SBA_Appv"]

bin_cols = df.select_dtypes("bool").columns

nom_cols = ["State", "BankState", "NAICS",
            "UrbanRural", "RevLineCr", "LowDoc"]

cyc_cols = ["ApprovalMonth", "ApprovalDoW"]

bin_enc_cols = ["Bank", "City"]

In [66]:
assert len(num_cols) + len(bin_cols) + len(nom_cols) + len(cyc_cols) + len(bin_enc_cols) == X.shape[1]

## "Simple" Nominal And Numerical Treats

In [67]:
ohe_bin = OneHotEncoder(drop="if_binary", sparse_output=False)
ohe_nom = OneHotEncoder(sparse_output=False)  # No drop="first" with a nonlinear model
std_scl = StandardScaler()

## Cyclical Encoding for Time-Related Features

In [68]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    """Meant to encode time data with cycles (days of week, month...)"""
    def __init__(self, column_name, cycle_length):
        self.column_name = column_name
        self.cycle_length = cycle_length
    
    def fit(self, X, y=None):
        # No fitting needed, implemented for compatibility with sklearn's API
        return self
        
    def transform(self, X, y=None):
        # Apply cyclical encoding directly without needing to fit
        X = X.copy()
        values = X[self.column_name]
        # Create the cyclical features
        X[f'{self.column_name}_sin'] = np.sin(2 * np.pi * values / self.cycle_length)
        X[f'{self.column_name}_cos'] = np.cos(2 * np.pi * values / self.cycle_length)
        # Drop the original column
        X.drop(columns=[self.column_name], inplace=True)
        return X
    
    def get_feature_names_out(self, input_features=None):
        # Generate names for the output features
        return np.array(
          [f'{self.column_name}_sin', f'{self.column_name}_cos'], dtype=object
        )

In [69]:
cyc_dow = CyclicalEncoder("ApprovalDoW", 7)
cyc_mth = CyclicalEncoder("ApprovalMonth", 12)

## Binary Encoding for `Bank` and `City`

In [70]:
bin_enc = ce.BinaryEncoder(cols=bin_enc_cols)

In [71]:
X_train_bin = bin_enc.fit_transform(X_train)
X_train_bin

Unnamed: 0,City_0,City_1,City_2,City_3,City_4,City_5,City_6,City_7,City_8,City_9,...,NewExist,NoEmp,CreateJob,RetainedJob,Franchise,UrbanRural,RevLineCr,LowDoc,GrAppv,SBA_Appv
478736,0,0,0,0,0,0,0,0,0,0,...,False,45,0,0,False,M,N,N,46250.0,39313.0
325150,0,0,0,0,0,0,0,0,0,0,...,False,20,0,20,False,U,Unknown,N,324000.0,243000.0
777971,0,0,0,0,0,0,0,0,0,0,...,True,4,0,0,False,M,N,Y,25000.0,22500.0
597469,0,0,0,0,0,0,0,0,0,0,...,False,10,0,0,False,M,N,N,110000.0,99000.0
97431,0,0,0,0,0,0,0,0,0,0,...,False,3,1,4,False,U,Y,N,20000.0,10000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690934,0,0,0,1,0,1,0,0,0,0,...,False,5,0,5,False,U,Y,N,50000.0,25000.0
201226,0,0,0,0,1,0,1,0,1,1,...,False,2,5,0,False,U,Unknown,N,243000.0,243000.0
177046,0,1,1,1,0,0,1,0,1,0,...,False,0,0,9,False,U,Unknown,N,10500.0,5250.0
87136,0,0,1,1,0,0,1,1,1,0,...,False,1,0,1,False,U,N,N,15000.0,7500.0


In [72]:
X_train_bin.columns

Index(['City_0', 'City_1', 'City_2', 'City_3', 'City_4', 'City_5', 'City_6',
       'City_7', 'City_8', 'City_9', 'City_10', 'City_11', 'City_12',
       'City_13', 'City_14', 'State', 'Bank_0', 'Bank_1', 'Bank_2', 'Bank_3',
       'Bank_4', 'Bank_5', 'Bank_6', 'Bank_7', 'Bank_8', 'Bank_9', 'Bank_10',
       'Bank_11', 'Bank_12', 'BankState', 'SameState', 'NAICS',
       'ApprovalMonth', 'ApprovalDoW', 'Recession', 'Term', 'NewExist',
       'NoEmp', 'CreateJob', 'RetainedJob', 'Franchise', 'UrbanRural',
       'RevLineCr', 'LowDoc', 'GrAppv', 'SBA_Appv'],
      dtype='object')

In [73]:
preproc1 = ColumnTransformer(
    transformers = [
        ("num", std_scl, num_cols),
        ("bin", ohe_bin, bin_cols),
        ("nom", ohe_nom, nom_cols),
        ("cyc_mth", cyc_mth, ["ApprovalMonth"]),
        ("cyc_dow", cyc_dow, ["ApprovalDoW"]),
        ("bin_enc", bin_enc, bin_enc_cols),
    ],
    verbose_feature_names_out=False
)
preproc.set_output(transform="pandas")

In [74]:
X_train_tr = preproc.fit_transform(X_train)
X_train_tr

Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,GrAppv,SBA_Appv,SameState_True,Recession_True,NewExist_True,Franchise_True,...,City_5,City_6,City_7,City_8,City_9,City_10,City_11,City_12,City_13,City_14
478736,-1.329139,0.447549,-0.035687,-0.045571,-0.517749,-0.483185,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
325150,2.398175,0.114103,-0.035687,0.038348,0.461790,0.407879,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
777971,-0.644531,-0.099303,-0.035687,-0.045571,-0.592692,-0.556736,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,1
597469,-0.340260,-0.019275,-0.035687,-0.045571,-0.292923,-0.222074,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
97431,-1.138970,-0.112640,-0.031484,-0.028787,-0.610325,-0.611420,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690934,-1.215038,-0.085965,-0.035687,-0.024591,-0.504524,-0.545799,1.0,0.0,0.0,0.0,...,1,0,0,0,0,1,1,0,0,1
201226,1.637499,-0.125978,-0.014670,-0.045571,0.176128,0.407879,1.0,0.0,0.0,0.0,...,0,1,0,1,1,0,1,0,0,0
177046,-0.733276,-0.152654,-0.035687,-0.007808,-0.643829,-0.632199,0.0,1.0,0.0,0.0,...,0,1,0,1,0,1,1,0,1,0
87136,-1.037547,-0.139316,-0.035687,-0.041375,-0.627959,-0.622356,0.0,0.0,0.0,0.0,...,0,1,1,1,0,1,1,0,1,0


## Estimator

### Train & Score

In [75]:
rfc1 = RandomForestClassifier(random_state=0)

In [76]:
model1 = make_pipeline(preproc, rfc)

In [77]:
model1

In [78]:
%%time
model1.fit(X_train, y_train)
model1.score(X_test, y_test)

CPU times: user 4min 34s, sys: 1.6 s, total: 4min 35s
Wall time: 4min 35s


0.9335443743451413

### Feature Importances

#### Basic Analysis

In [26]:
importances = model[-1].feature_importances_

In [27]:
indices = np.argsort(importances)[::-1]
feat_labels = X_train_tr.columns
feat_labels

Index(['Term', 'NoEmp', 'CreateJob', 'RetainedJob', 'GrAppv', 'SBA_Appv',
       'SameState_True', 'Recession_True', 'NewExist_True', 'Franchise_True',
       ...
       'City_5', 'City_6', 'City_7', 'City_8', 'City_9', 'City_10', 'City_11',
       'City_12', 'City_13', 'City_14'],
      dtype='object', length=179)

In [28]:
for f in range(X_train_tr.shape[1]):
    print("%2d) %-*s %f" % (f + 1,
                            85,
                            feat_labels[indices[f]],
                            importances[indices[f]]))

 1) Term                                                                                  0.323055
 2) SBA_Appv                                                                              0.049117
 3) GrAppv                                                                                0.045355
 4) NoEmp                                                                                 0.027290
 5) RetainedJob                                                                           0.026619
 6) ApprovalMonth_sin                                                                     0.020063
 7) ApprovalMonth_cos                                                                     0.019941
 8) SameState_True                                                                        0.017359
 9) ApprovalDoW_sin                                                                       0.016025
10) Recession_True                                                                        0.015070
11) Create

In [29]:
feat_labels[indices]

Index(['Term', 'SBA_Appv', 'GrAppv', 'NoEmp', 'RetainedJob',
       'ApprovalMonth_sin', 'ApprovalMonth_cos', 'SameState_True',
       'ApprovalDoW_sin', 'Recession_True',
       ...
       'BankState_WY', 'BankState_Missing', 'NAICS_utilities', 'BankState_PR',
       'NAICS_public_administration',
       'NAICS_management_of_companies_and_entreproses', 'BankState_GU',
       'BankState_EN', 'BankState_AN', 'BankState_VI'],
      dtype='object', length=179)

In [30]:
pd.set_option("display.max_rows", None)

df_imp = (
    pd.DataFrame({
        "feature": feat_labels[indices],
        "importance %": importances
    })
    .set_index("feature")
    .mul(100)
)

In [31]:
df_imp.head()

Unnamed: 0_level_0,importance %
feature,Unnamed: 1_level_1
Term,32.305494
SBA_Appv,2.729038
GrAppv,1.407182
NoEmp,2.661943
RetainedJob,4.535477


In [32]:
styled_df_imp = df_imp.style.format("{:.3f}")
styled_df_imp

Unnamed: 0_level_0,importance %
feature,Unnamed: 1_level_1
Term,32.305
SBA_Appv,2.729
GrAppv,1.407
NoEmp,2.662
RetainedJob,4.535
ApprovalMonth_sin,4.912
ApprovalMonth_cos,1.736
SameState_True,1.507
ApprovalDoW_sin,0.766
Recession_True,0.305


In [33]:
# Get back to default settings
pd.reset_option("display.max_rows")
pd.reset_option("display.max_columns")

#### Elaborated Analysis

In [34]:
df_imp.head()

Unnamed: 0_level_0,importance %
feature,Unnamed: 1_level_1
Term,32.305494
SBA_Appv,2.729038
GrAppv,1.407182
NoEmp,2.661943
RetainedJob,4.535477


In [35]:
df_imp.query("feature.str.contains('Bank')").sum()

importance %    27.905568
dtype: float64

In [36]:
def get_agg_importance(feat: str) -> float:
    """
    Get an aggregated feature importance for an encoded feature
    """
    print(f"Aggregated Importance for feature {feat}")
    return df_imp.query("feature.str.startswith(@feat + '_')").sum()

In [37]:
get_agg_importance("Bank")

Aggregated Importance for feature Bank


importance %    2.170446
dtype: float64

In [38]:
bin_cols

Index(['SameState', 'Recession', 'NewExist', 'Franchise'], dtype='object')

In [39]:
cols_to_agg = [bin_cols, nom_cols, cyc_cols, bin_enc_cols]

In [40]:
for col_type in cols_to_agg:
    for col in col_type:
        print(get_agg_importance(col))

Aggregated Importance for feature SameState
importance %    1.507005
dtype: float64
Aggregated Importance for feature Recession
importance %    0.30514
dtype: float64
Aggregated Importance for feature NewExist
importance %    0.073241
dtype: float64
Aggregated Importance for feature Franchise
importance %    0.127641
dtype: float64
Aggregated Importance for feature State
importance %    12.275847
dtype: float64
Aggregated Importance for feature BankState
importance %    25.735122
dtype: float64
Aggregated Importance for feature NAICS
importance %    4.496921
dtype: float64
Aggregated Importance for feature UrbanRural
importance %    0.309867
dtype: float64
Aggregated Importance for feature RevLineCr
importance %    0.105644
dtype: float64
Aggregated Importance for feature LowDoc
importance %    0.028102
dtype: float64
Aggregated Importance for feature ApprovalMonth
importance %    6.647644
dtype: float64
Aggregated Importance for feature ApprovalDoW
importance %    0.837652
dtype: floa

#### Results Within a Dataset

In [41]:
df_num = (df_imp
 # .reset_index()
 .query("not feature.str.contains('_')")
)
df_num

Unnamed: 0_level_0,importance %
feature,Unnamed: 1_level_1
Term,32.305494
GrAppv,1.407182
NoEmp,2.661943
RetainedJob,4.535477
CreateJob,0.032924


In [42]:
df_sba = (df_imp
 # .reset_index()
 .query("feature == 'SBA_Appv'")
)
df_sba

Unnamed: 0_level_0,importance %
feature,Unnamed: 1_level_1
SBA_Appv,2.729038


In [43]:
df_nom = (df_imp
 .query("feature.str.contains('_') and not feature.str.contains('SBA')")
 .rename(index=lambda x: x.split("_")[0])
 .groupby("feature").sum()
 # .reset_index()
)
df_nom        

Unnamed: 0_level_0,importance %
feature,Unnamed: 1_level_1
ApprovalDoW,0.837652
ApprovalMonth,6.647644
Bank,2.170446
BankState,25.735122
City,1.707668
Franchise,0.127641
LowDoc,0.028102
NAICS,4.496921
NewExist,0.073241
Recession,0.30514


In [44]:
df_imp_agg = (pd.concat([df_num, df_sba, df_nom])
 .sort_values("importance %", ascending=False)
)
df_imp_agg

Unnamed: 0_level_0,importance %
feature,Unnamed: 1_level_1
Term,32.305494
BankState,25.735122
State,12.275847
ApprovalMonth,6.647644
RetainedJob,4.535477
NAICS,4.496921
SBA_Appv,2.729038
NoEmp,2.661943
Bank,2.170446
City,1.707668


## Preprocessing 2 [TARGET ENCODING]

### Based on `category_encoders`

In [45]:
lab_enc = LabelEncoder()

In [46]:
y_train = lab_enc.fit_transform(y_train)
y_test = lab_enc.transform(y_test)

In [47]:
tar_enc = ce.TargetEncoder(cols=bin_enc_cols)

In [48]:
preproc2 = ColumnTransformer(
    transformers = [
        ("num", std_scl, num_cols),
        ("bin", ohe_bin, bin_cols),
        ("nom", ohe_nom, nom_cols),
        ("cyc_mth", cyc_mth, ["ApprovalMonth"]),
        ("cyc_dow", cyc_dow, ["ApprovalDoW"]),
        ("bin_enc", tar_enc, bin_enc_cols),
    ],
    verbose_feature_names_out=False
)
preproc2.set_output(transform="pandas")

In [49]:
%%time
X_train_tr = preproc2.fit_transform(X_train, y_train)

CPU times: user 5.79 s, sys: 868 ms, total: 6.65 s
Wall time: 6.66 s


In [51]:
X_train_tr

Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,GrAppv,SBA_Appv,SameState_True,Recession_True,NewExist_True,Franchise_True,...,RevLineCr_Unknown,RevLineCr_Y,LowDoc_N,LowDoc_Y,ApprovalMonth_sin,ApprovalMonth_cos,ApprovalDoW_sin,ApprovalDoW_cos,Bank,City
1276,2.398962,0.007790,-0.035747,-0.045592,2.918333,2.624791,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.000000e+00,6.123234e-17,0.781831,0.623490,0.877483,0.790927
662596,-1.100759,-0.113652,-0.023153,-0.045592,-0.628047,-0.622200,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,-1.000000e+00,-1.836970e-16,-0.433884,-0.900969,0.890943,0.767705
631321,-0.339950,-0.059678,-0.035747,-0.045592,-0.416323,-0.376117,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,8.660254e-01,-5.000000e-01,0.433884,-0.900969,0.586372,0.855527
472488,-0.948597,-0.019197,-0.035747,-0.045592,-0.257530,-0.208780,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,-8.660254e-01,-5.000000e-01,0.000000,1.000000,0.852033,0.885943
431489,-0.796435,-0.059678,-0.035747,-0.045592,-0.587467,-0.552990,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,-5.000000e-01,-8.660254e-01,0.781831,0.623490,0.825279,0.826087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
792754,1.638153,0.439582,-0.010558,0.113642,0.956355,1.374905,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.224647e-16,-1.000000e+00,0.974928,-0.222521,1.000000,0.948052
274096,-0.809115,-0.059678,-0.035747,-0.045592,0.007125,0.070115,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,5.000000e-01,-8.660254e-01,-0.433884,-0.900969,0.858655,0.809430
726006,-0.187788,-0.073171,-0.035747,-0.045592,0.324712,0.404789,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,-1.000000e+00,-1.836970e-16,0.974928,-0.222521,0.724203,0.885943
724051,-0.263869,0.034777,-0.035747,-0.045592,0.236493,0.198078,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.224647e-16,-1.000000e+00,-0.433884,-0.900969,0.785441,0.857545


# Estimator

## Train & Score

In [52]:
rfc = RandomForestClassifier(random_state=0)

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [53]:
model2 = make_pipeline(preproc2, rfc)

In [54]:
model2

In [56]:
%%time
model2.fit(X_train, y_train)
model2.score(X_test, y_test)

CPU times: user 4min 24s, sys: 1.44 s, total: 4min 26s
Wall time: 4min 26s


0.9337004257975344