# Imports

In [1]:
from typing import List

import category_encoders as ce
import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Data Loading

In [2]:
%%time
df = pd.read_csv("../data/cleaned_dataset.csv", index_col=0)

CPU times: user 2.46 s, sys: 276 ms, total: 2.73 s
Wall time: 2.75 s


# Droping Useless Columns

In [3]:
df = df.drop(columns=['LoanNr_ChkDgt', 'Name'])

# Separating Features and Target

In [4]:
X = df.copy()
y = X.pop("MIS_Status")

# Hold-Out

In [5]:
# Stratify with y by default
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.05,
                                                    stratify=y,
                                                    random_state=42)

# Model1: Target Encoding With `RandomForestClassifier`

## Encoding Target

In [6]:
lbl_enc = LabelEncoder()

y_train = lbl_enc.fit_transform(y_train)
y_test = lbl_enc.transform(y_test)

## Preprocessing

In [7]:
# Splitting columns
num_cols = ["Term", "NoEmp", "CreateJob", "RetainedJob",
            "GrAppv", "SBA_Appv"]

bin_cols = df.select_dtypes("bool").columns

nom_cols = ["UrbanRural", "RevLineCr", "LowDoc"]

cyc_cols = ["ApprovalMonth", "ApprovalDoW"]

tgt_cols = ["State", "BankState", "NAICS", "Bank", "City"]

## "Simple" Nominal And Numerical Processes

In [8]:
std_scl = StandardScaler()
ohe_bin = OneHotEncoder(drop="if_binary",
                        sparse_output=False,
                        handle_unknown="ignore")
ohe_nom = OneHotEncoder(sparse_output=False,  # No drop="first" with a nonlinear model
                        handle_unknown="ignore")

## Cyclical Encoding

In [9]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    """Meant to encode time data with cycles (days of week, month...)"""
    def __init__(self, column_name, cycle_length):
        self.column_name = column_name
        self.cycle_length = cycle_length

    def fit(self, X, y=None):
        # No fitting needed, implemented for compatibility with sklearn's API
        return self

    def transform(self, X, y=None):
        # Apply cyclical encoding directly without needing to fit
        X = X.copy()
        values = X[self.column_name]
        # Create the cyclical features
        X[f'{self.column_name}_sin'] = np.sin(2 * np.pi * values / self.cycle_length)
        X[f'{self.column_name}_cos'] = np.cos(2 * np.pi * values / self.cycle_length)
        # Drop the original column
        X.drop(columns=[self.column_name], inplace=True)
        return X

    def get_feature_names_out(self, input_features=None):
        # Generate names for the output features
        return np.array(
          [f'{self.column_name}_sin', f'{self.column_name}_cos'], dtype=object
        )

In [10]:
cyc_dow = CyclicalEncoder("ApprovalDoW", 7)
cyc_mth = CyclicalEncoder("ApprovalMonth", 12)

## Target Encoding for `Bank`, `City`, `State`, `BankState`

In [11]:
tgt_enc = ce.TargetEncoder(cols=tgt_cols)

In [12]:
%%time
# Test
X_train_tgt = tgt_enc.fit_transform(X_train, y_train)
X_train_tgt

CPU times: user 3.99 s, sys: 745 ms, total: 4.73 s
Wall time: 4.74 s


Unnamed: 0,City,State,Bank,BankState,SameState,NAICS,ApprovalMonth,ApprovalDoW,Recession,Term,NewExist,NoEmp,CreateJob,RetainedJob,Franchise,UrbanRural,RevLineCr,LowDoc,GrAppv,SBA_Appv
52158,0.776280,0.820586,0.732058,0.779467,False,0.772481,7,0,False,82,False,10,0,0,False,M,Unknown,N,540000.0,405000.0
374932,0.858277,0.800739,0.872016,0.831345,True,0.766984,3,4,False,64,False,8,0,0,False,U,N,N,70000.0,52500.0
716138,0.806424,0.814921,0.938389,0.917150,False,0.916683,8,3,False,84,False,15,0,0,False,M,N,N,120000.0,105600.0
192181,0.789573,0.802920,0.893810,0.706389,False,0.793372,3,2,True,120,False,5,2,5,False,R,N,N,210000.0,105000.0
140792,0.796371,0.814921,1.000000,0.779467,True,0.845807,5,1,False,240,False,12,8,0,False,U,Unknown,N,422000.0,422000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448689,0.847227,0.833083,0.928313,0.872163,True,0.909197,11,4,False,13,False,5,0,5,False,R,N,N,41000.0,20500.0
755744,0.927152,0.925362,0.925305,0.945354,True,0.772481,12,2,False,60,False,5,0,0,False,M,N,N,50000.0,44000.0
53966,0.878378,0.814921,0.829540,0.779467,True,0.751065,7,0,False,120,False,1,0,0,False,M,N,Y,50000.0,40000.0
576823,0.878613,0.772574,0.904315,0.777417,True,0.845807,9,0,False,120,False,78,0,0,False,U,N,N,1000000.0,750000.0


In [13]:
preproc1 = ColumnTransformer(
    transformers = [
        ("num", std_scl, num_cols),
        ("bin", ohe_bin, bin_cols),
        ("nom", ohe_nom, nom_cols),
        ("cyc_mth", cyc_mth, ["ApprovalMonth"]),
        ("cyc_dow", cyc_dow, ["ApprovalDoW"]),
        ("tgt_enc", tgt_enc, tgt_cols),
    ],
    verbose_feature_names_out=False
)
preproc1.set_output(transform="pandas")

In [14]:
%%time
# Test
X_train_tr = preproc1.fit_transform(X_train, y_train)
X_train_tr

CPU times: user 4.67 s, sys: 512 ms, total: 5.18 s
Wall time: 5.19 s


Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,GrAppv,SBA_Appv,SameState_True,Recession_True,NewExist_True,Franchise_True,...,LowDoc_Y,ApprovalMonth_sin,ApprovalMonth_cos,ApprovalDoW_sin,ApprovalDoW_cos,State,BankState,NAICS,Bank,City
52158,-0.365827,-0.019158,-0.035608,-0.045527,1.224734,1.117235,0.0,0.0,0.0,0.0,...,0.0,-5.000000e-01,-8.660254e-01,0.000000,1.000000,0.820586,0.779467,0.772481,0.732058,0.776280
374932,-0.593990,-0.045825,-0.035608,-0.045527,-0.434501,-0.425997,1.0,0.0,0.0,0.0,...,0.0,1.000000e+00,6.123234e-17,-0.433884,-0.900969,0.800739,0.831345,0.766984,0.872016,0.858277
716138,-0.340475,0.047508,-0.035608,-0.045527,-0.257986,-0.193527,0.0,0.0,0.0,0.0,...,0.0,-8.660254e-01,-5.000000e-01,0.433884,-0.900969,0.814921,0.917150,0.916683,0.938389,0.806424
192181,0.115851,-0.085824,-0.027137,-0.024389,0.059739,-0.196154,0.0,1.0,0.0,0.0,...,0.0,1.000000e+00,6.123234e-17,0.974928,-0.222521,0.802920,0.706389,0.793372,0.893810,0.789573
140792,1.636939,0.007508,-0.001724,-0.045527,0.808160,1.191661,1.0,0.0,0.0,0.0,...,0.0,5.000000e-01,-8.660254e-01,0.781831,0.623490,0.814921,0.779467,0.845807,1.000000,0.796371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448689,-1.240452,-0.085824,-0.035608,-0.024389,-0.536879,-0.566092,1.0,0.0,0.0,0.0,...,0.0,-5.000000e-01,8.660254e-01,-0.433884,-0.900969,0.833083,0.872163,0.909197,0.928313,0.847227
755744,-0.644693,-0.085824,-0.035608,-0.045527,-0.505106,-0.463210,1.0,0.0,0.0,0.0,...,0.0,-2.449294e-16,1.000000e+00,0.974928,-0.222521,0.925362,0.945354,0.772481,0.925305,0.927152
53966,0.115851,-0.139157,-0.035608,-0.045527,-0.505106,-0.480722,1.0,0.0,0.0,0.0,...,1.0,-5.000000e-01,-8.660254e-01,0.000000,1.000000,0.814921,0.779467,0.751065,0.829540,0.878378
576823,0.115851,0.887501,-0.035608,-0.045527,2.848666,2.627633,1.0,0.0,0.0,0.0,...,0.0,-1.000000e+00,-1.836970e-16,0.000000,1.000000,0.772574,0.777417,0.845807,0.904315,0.878613


## Estimator

### Train

In [15]:
rfc1 = RandomForestClassifier(random_state=0)

model1 = make_pipeline(preproc1, rfc1)
model1

### Accuracy

In [16]:
%%time
model1.fit(X_train, y_train)
model1.score(X_test, y_test)

CPU times: user 3min 3s, sys: 884 ms, total: 3min 4s
Wall time: 3min 4s


0.9415029984171924

### $F_1$ Macro

In [17]:
y_pred = model1.predict(X_test)
f1_score(y_test, y_pred, average="macro")

0.8923730139876496

### Feature Importances

In [18]:
importances = model1[-1].feature_importances_
indices = np.argsort(importances)[::-1]
feat_labels = X_train_tr.columns
tups = [(feat_labels[i], importances[i]) for i in indices]
df_imp = pd.DataFrame(tups, columns=["feature", "importance"])
df_imp

Unnamed: 0,feature,importance
0,Term,0.400459
1,Bank,0.105542
2,City,0.060206
3,SBA_Appv,0.054517
4,BankState,0.048428
5,GrAppv,0.044554
6,State,0.034481
7,NAICS,0.028265
8,NoEmp,0.028243
9,RetainedJob,0.025398


In [19]:
def select_feats_by_importance(thresh: float) -> List[str]:
    """Return a list of features by decreasing feature importance"""
    return (
        df_imp
        .query("importance >= @thresh")
        .loc[:, "feature"]
        .to_list()
    )

def select_feats_by_number(n: int) -> List[str]:
    """Return a list of n most important features"""
    return df_imp.loc[:(n - 1), "feature"]

### Keeping Only Features Over 1% Importance

In [20]:
kept_feats = select_feats_by_importance(0.01)
kept_feats

['Term',
 'Bank',
 'City',
 'SBA_Appv',
 'BankState',
 'GrAppv',
 'State',
 'NAICS',
 'NoEmp',
 'RetainedJob',
 'ApprovalMonth_sin',
 'ApprovalMonth_cos',
 'UrbanRural_M',
 'ApprovalDoW_sin',
 'Recession_True',
 'SameState_True',
 'CreateJob',
 'ApprovalDoW_cos']

In [21]:
select_feats_by_number(10)

0           Term
1           Bank
2           City
3       SBA_Appv
4      BankState
5         GrAppv
6          State
7          NAICS
8          NoEmp
9    RetainedJob
Name: feature, dtype: object

# Model2: `SelectFromModel` After Preprocessing

## Pipeline

In [22]:
rfc = RandomForestClassifier(random_state=0)

In [23]:
sfm = SelectFromModel(rfc, threshold=0.01)

In [24]:
preproc_select = make_pipeline(preproc1, sfm)
preproc_select.set_output(transform="pandas")

In [25]:
%%time
X_train_sel = preproc_select.fit_transform(X_train, y_train)
X_train_sel

CPU times: user 3min 10s, sys: 788 ms, total: 3min 11s
Wall time: 3min 11s


Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,GrAppv,SBA_Appv,SameState_True,Recession_True,UrbanRural_M,ApprovalMonth_sin,ApprovalMonth_cos,ApprovalDoW_sin,ApprovalDoW_cos,State,BankState,NAICS,Bank,City
52158,-0.365827,-0.019158,-0.035608,-0.045527,1.224734,1.117235,0.0,0.0,1.0,-5.000000e-01,-8.660254e-01,0.000000,1.000000,0.820586,0.779467,0.772481,0.732058,0.776280
374932,-0.593990,-0.045825,-0.035608,-0.045527,-0.434501,-0.425997,1.0,0.0,0.0,1.000000e+00,6.123234e-17,-0.433884,-0.900969,0.800739,0.831345,0.766984,0.872016,0.858277
716138,-0.340475,0.047508,-0.035608,-0.045527,-0.257986,-0.193527,0.0,0.0,1.0,-8.660254e-01,-5.000000e-01,0.433884,-0.900969,0.814921,0.917150,0.916683,0.938389,0.806424
192181,0.115851,-0.085824,-0.027137,-0.024389,0.059739,-0.196154,0.0,1.0,0.0,1.000000e+00,6.123234e-17,0.974928,-0.222521,0.802920,0.706389,0.793372,0.893810,0.789573
140792,1.636939,0.007508,-0.001724,-0.045527,0.808160,1.191661,1.0,0.0,0.0,5.000000e-01,-8.660254e-01,0.781831,0.623490,0.814921,0.779467,0.845807,1.000000,0.796371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448689,-1.240452,-0.085824,-0.035608,-0.024389,-0.536879,-0.566092,1.0,0.0,0.0,-5.000000e-01,8.660254e-01,-0.433884,-0.900969,0.833083,0.872163,0.909197,0.928313,0.847227
755744,-0.644693,-0.085824,-0.035608,-0.045527,-0.505106,-0.463210,1.0,0.0,1.0,-2.449294e-16,1.000000e+00,0.974928,-0.222521,0.925362,0.945354,0.772481,0.925305,0.927152
53966,0.115851,-0.139157,-0.035608,-0.045527,-0.505106,-0.480722,1.0,0.0,1.0,-5.000000e-01,-8.660254e-01,0.000000,1.000000,0.814921,0.779467,0.751065,0.829540,0.878378
576823,0.115851,0.887501,-0.035608,-0.045527,2.848666,2.627633,1.0,0.0,0.0,-1.000000e+00,-1.836970e-16,0.000000,1.000000,0.772574,0.777417,0.845807,0.904315,0.878613


In [26]:
model = make_pipeline(preproc1, sfm, rfc)

## $F_1$ Macro

In [27]:
%%time
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred, average="macro")

CPU times: user 6min 17s, sys: 816 ms, total: 6min 17s
Wall time: 6min 17s


0.8909402350240652

In [28]:
y_pred = model.predict(X_test)
f1_score(y_test, y_pred, average="macro")

0.8909402350240652

# `RandomizedSearch`

In [29]:
model.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('num', StandardScaler(),
                                    ['Term', 'NoEmp', 'CreateJob', 'RetainedJob',
                                     'GrAppv', 'SBA_Appv']),
                                   ('bin',
                                    OneHotEncoder(drop='if_binary',
                                                  handle_unknown='ignore',
                                                  sparse_output=False),
                                    Index(['SameState', 'Recession', 'NewExist', 'Franchise'], dtype='object')),
                                   ('nom',
                                    OneHotEncoder(handle_unknown='ignore',
                                                  sparse_output=False),
                                    ['UrbanRu...vLineCr', 'LowDoc']),
                                   ('cyc_mth',
                                    CyclicalEncoder(column_name='App

In [30]:
param_dist = {
    "randomforestclassifier__max_features": ['sqrt', 'log2', None] + list(np.linspace(0.1, 1.0, 10)),
    "randomforestclassifier__min_samples_leaf": sp_randint(1, 20),
    "randomforestclassifier__bootstrap": [True, False],
    "randomforestclassifier__n_estimators": sp_randint(100, 2_000)
}

In [31]:
%%time
random_search = RandomizedSearchCV(model,
                                   param_distributions=param_dist,
                                   n_iter=100,
                                   cv=5,
                                   verbose=1,
                                   n_jobs=-1,
                                   random_state=42)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


PicklingError: Could not pickle the task to send it to the workers.