In [None]:
# Mount Google Drive at /content/drive; the dataset is read from under this
# mount point further down. Only works inside Google Colab.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
!pip install category-encoders



# Imports

In [None]:
import category_encoders as ce
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Loading From Drive

In [None]:
%%time
# Load the cleaned loan dataset from the mounted Drive folder.
# NOTE(review): the preview of X_train_bin further down shows an "Unnamed: 0"
# column, which suggests the CSV was written with its index — confirm, and
# consider dropping that column or reading with index_col=0.
df = pd.read_csv("/content/drive/MyDrive/loan_approval (1)/cleaned_dataset.csv")

CPU times: user 3.19 s, sys: 664 ms, total: 3.85 s
Wall time: 7.47 s


# Dropping Useless Columns

In [None]:
# Identifier-like columns: removed before modelling.
df = df.drop(["LoanNr_ChkDgt", "Name"], axis="columns")

# Separating Features and Target

In [None]:
# Target is the loan status; every remaining column is a candidate feature.
y = df["MIS_Status"].copy()
X = df.drop(columns=["MIS_Status"])

# Hold-Out

In [None]:
# Share of each target class; the classes are imbalanced, which is why the
# hold-out split below is stratified on y.
y.value_counts(normalize=True)

P I F     0.824377
CHGOFF    0.175623
Name: MIS_Status, dtype: float64

In [None]:
# Keep 5% of the rows as a hold-out set. Stratifying on y is requested
# explicitly (it is not the default) so both splits keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
    stratify=y,
)

# Model 1: Binary Encoding

In [None]:
# Column groups, one per preprocessing strategy.

# Numeric columns -> standard scaling
num_cols = [
    "Term",
    "NoEmp",
    "CreateJob",
    "RetainedJob",
    "GrAppv",
    "SBA_Appv",
]

# Boolean columns, detected from the dtypes
bin_cols = df.select_dtypes(include="bool").columns

# Low-cardinality nominal columns -> one-hot encoding
nom_cols = [
    "State",
    "BankState",
    "NAICS",
    "UrbanRural",
    "RevLineCr",
    "LowDoc",
]

# Calendar columns -> sin/cos encoding
cyc_cols = ["ApprovalMonth", "ApprovalDoW"]

# High-cardinality nominal columns -> binary encoding
bin_enc_cols = ["Bank", "City"]

## "Simple" Nominal and Numerical Treatments

In [None]:
# Boolean columns: a single indicator column per feature.
ohe_bin = OneHotEncoder(
    drop="if_binary",
    handle_unknown="ignore",
    sparse_output=False,
)

# Nominal columns: keep every level (no drop="first" with a nonlinear model).
ohe_nom = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
)

# Numeric columns: zero mean, unit variance.
std_scl = StandardScaler()

## Cyclical Encoding

In [None]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    """Encode one cyclical column (day of week, month, ...) as a sin/cos pair.

    Each value of ``column_name`` is mapped to the angle
    ``2 * pi * value / cycle_length``. The original column is removed and two
    columns, ``<column_name>_sin`` and ``<column_name>_cos``, are appended
    after the remaining columns of the input.

    Parameters
    ----------
    column_name : str
        Name of the column to encode; ``X`` must be indexable by this name.
    cycle_length : int or float
        Number of steps in one full cycle (e.g. 7 for days of the week,
        12 for months).
    """

    def __init__(self, column_name, cycle_length):
        self.column_name = column_name
        self.cycle_length = cycle_length

    def fit(self, X, y=None):
        """Remember the input column names; nothing is learned from the data.

        The names are only used by ``get_feature_names_out`` when it is called
        without ``input_features``.
        """
        columns = getattr(X, "columns", None)
        if columns is not None:
            self.feature_names_in_ = np.asarray(columns, dtype=object)
        elif hasattr(self, "feature_names_in_"):
            # Refit on unnamed data: do not keep stale names around.
            del self.feature_names_in_
        return self

    def transform(self, X, y=None):
        """Return a new frame with the cyclical column replaced by sin/cos.

        ``X`` itself is left untouched. Raises ``KeyError`` if ``column_name``
        is not a column of ``X``.
        """
        values = X[self.column_name]
        angle = 2 * np.pi * values / self.cycle_length
        encoded = {
            f'{self.column_name}_sin': np.sin(angle),
            f'{self.column_name}_cos': np.cos(angle),
        }
        # drop() and assign() both return new objects, so no explicit copy or
        # in-place mutation is needed.
        return X.drop(columns=[self.column_name]).assign(**encoded)

    def get_feature_names_out(self, input_features=None):
        """Names of the columns produced by ``transform``, in output order.

        Columns other than ``column_name`` are passed through unchanged, so
        they are listed first, followed by the ``_sin`` and ``_cos`` names.
        When ``input_features`` is omitted, the names recorded by ``fit`` are
        used; if none were recorded, only the two encoded names are returned.
        """
        if input_features is None:
            input_features = getattr(self, "feature_names_in_", [self.column_name])
        passthrough = [name for name in input_features if name != self.column_name]
        return np.array(
            passthrough + [f'{self.column_name}_sin', f'{self.column_name}_cos'],
            dtype=object,
        )

In [None]:
# One encoder per calendar column, with the length of its cycle.
cyc_dow = CyclicalEncoder(column_name="ApprovalDoW", cycle_length=7)
cyc_mth = CyclicalEncoder(column_name="ApprovalMonth", cycle_length=12)

## Binary Encoding for `Bank` and `City`

In [None]:
# Binary encoder restricted to the columns listed in bin_enc_cols
# ("Bank" and "City").
bin_enc = ce.BinaryEncoder(cols=bin_enc_cols)

In [None]:
%%time
# Test: fit the binary encoder on its own on the training split and preview
# the result. X_train_bin is only inspected here; the model below goes through
# the ColumnTransformer instead.
X_train_bin = bin_enc.fit_transform(X_train)
X_train_bin

CPU times: user 3.25 s, sys: 1.65 s, total: 4.91 s
Wall time: 4.94 s


Unnamed: 0.1,Unnamed: 0,City_0,City_1,City_2,City_3,City_4,City_5,City_6,City_7,City_8,...,NewExist,NoEmp,CreateJob,RetainedJob,Franchise,UrbanRural,RevLineCr,LowDoc,GrAppv,SBA_Appv
114053,114825,0,0,0,0,0,0,0,0,0,...,True,2,1,3,False,R,Y,N,50000.0,25000.0
469035,470373,0,0,0,0,0,0,0,0,0,...,True,2,2,0,True,U,Unknown,N,112000.0,95200.0
138089,138928,0,0,0,0,0,0,0,0,0,...,False,2,1,2,False,U,N,N,50000.0,25000.0
268271,269412,0,0,0,0,0,0,0,0,0,...,False,57,30,0,False,U,Unknown,N,277000.0,277000.0
827977,829901,0,0,0,0,0,0,0,0,0,...,False,2,0,2,False,U,Unknown,N,200000.0,150000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506196,507561,0,0,0,0,0,0,1,1,0,...,False,1,0,1,False,U,Y,N,20000.0,10000.0
4159,4173,0,0,0,0,0,1,1,0,1,...,True,1,0,0,False,U,Y,N,10000.0,5000.0
730820,732592,0,0,0,0,0,0,0,1,0,...,False,15,0,0,False,M,N,N,162500.0,113750.0
671888,673575,0,0,1,0,0,0,0,0,1,...,False,2,0,2,False,U,Y,N,65000.0,32500.0


In [None]:
# Route each column group to its transformer; columns not listed in any group
# are not forwarded. Output is requested as a pandas DataFrame so the feature
# names stay attached to the transformed data.
preproc1 = ColumnTransformer(
    [
        ("num", std_scl, num_cols),
        ("bin", ohe_bin, bin_cols),
        ("nom", ohe_nom, nom_cols),
        ("cyc_mth", cyc_mth, ["ApprovalMonth"]),
        ("cyc_dow", cyc_dow, ["ApprovalDoW"]),
        ("bin_enc", bin_enc, bin_enc_cols),
    ],
    verbose_feature_names_out=False,
).set_output(transform="pandas")
preproc1

In [None]:
%%time
# Test: run the full preprocessing on the training split and preview it.
# X_train_tr is reused in the feature-importance section for its column names.
X_train_tr = preproc1.fit_transform(X_train)
X_train_tr

CPU times: user 7.7 s, sys: 2.12 s, total: 9.82 s
Wall time: 12.8 s


Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,GrAppv,SBA_Appv,SameState_True,Recession_True,NewExist_True,Franchise_True,...,City_5,City_6,City_7,City_8,City_9,City_10,City_11,City_12,City_13,City_14
114053,-0.340110,-0.126271,-0.031416,-0.032909,-0.504800,-0.545689,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
469035,-0.340110,-0.126271,-0.027194,-0.045551,-0.286155,-0.238955,0.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,1,0
138089,-0.631680,-0.126271,-0.031416,-0.037123,-0.504800,-0.545689,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,1
268271,1.637494,0.609539,0.091018,-0.045551,0.295723,0.555406,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
827977,0.876877,-0.126271,-0.035638,-0.037123,0.024180,0.000489,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506196,-0.340110,-0.139649,-0.035638,-0.041337,-0.610596,-0.611230,1.0,1.0,0.0,0.0,...,0,1,1,0,1,1,1,0,1,1
4159,-1.138758,-0.139649,-0.035638,-0.045551,-0.645862,-0.633077,0.0,0.0,1.0,0.0,...,1,1,0,1,1,0,1,0,1,0
730820,2.398111,0.047648,-0.035638,-0.045551,-0.108065,-0.157902,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,0,1,1,0
671888,-0.340110,-0.126271,-0.035638,-0.037123,-0.451902,-0.512918,0.0,0.0,0.0,0.0,...,0,0,0,1,0,1,1,0,0,1


## Estimator

### Train & Score

In [None]:
# Random forest with a fixed seed so runs are repeatable.
rfc1 = RandomForestClassifier(random_state=0)

# Full model: preprocessing followed by the classifier.
model1 = make_pipeline(preproc1, rfc1)
model1

In [None]:
%%time
model1.fit(X_train, y_train)
model1.score(X_test, y_test)

### Feature Importances

In [None]:
# Importances of the fitted forest (last step of the pipeline), one value per
# preprocessed feature, in the column order of the preprocessed frame.
importances = model1[-1].feature_importances_
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
# Column names of the preprocessed training frame computed in the test cell above.
feat_labels = X_train_tr.columns

assert len(feat_labels) == len(importances), (
    "preprocessed columns and forest importances are out of sync"
)

# Show every feature when the table is displayed.
pd.set_option("display.max_rows", None)

# Both columns must be reordered with the same indices; otherwise each
# importance ends up next to the wrong feature name.
df_imp = (
    pd.DataFrame({
        "feature": feat_labels[indices],
        "importance %": importances[indices],
    })
    .set_index("feature")
    .mul(100)
)

In [None]:
# Feature importances of model1, expressed in percent.
df_imp