# Imports

In [30]:
from catboost import CatBoostClassifier, Pool
import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint
from scipy.stats import loguniform, uniform
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [2]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    """Meant to encode time data with cycles (days of week, month...)"""
    def __init__(self, column_name, cycle_length):
        self.column_name = column_name
        self.cycle_length = cycle_length

    def fit(self, X, y=None):
        # No fitting needed, implemented for compatibility with sklearn's API
        return self

    def transform(self, X, y=None):
        # Apply cyclical encoding directly without needing to fit
        X = X.copy()
        values = X[self.column_name]
        # Create the cyclical features
        X[f'{self.column_name}_sin'] = np.sin(2 * np.pi * values / self.cycle_length)
        X[f'{self.column_name}_cos'] = np.cos(2 * np.pi * values / self.cycle_length)
        # Drop the original column
        X.drop(columns=[self.column_name], inplace=True)
        return X

    def get_feature_names_out(self, input_features=None):
        # Generate names for the output features
        return np.array(
          [f'{self.column_name}_sin', f'{self.column_name}_cos'], dtype=object
        )

# Data Loading

In [3]:
%%time
df = pd.read_csv("../data/cleaned_dataset.csv", index_col=0)

CPU times: user 5.05 s, sys: 610 ms, total: 5.66 s
Wall time: 5.71 s


# Droping Useless Columns

In [4]:
df = df.drop(columns=['LoanNr_ChkDgt', 'Name'])

# Separating Features and Target

In [5]:
X = df.copy()
y = X.pop("MIS_Status")

# Hold-Out

In [6]:
# Stratify with y by default
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.05,
                                                    stratify=y,
                                                    random_state=42)

# Model

## Preprocessing

In [7]:
# Splitting columns
num_cols = ["Term", "NoEmp", "CreateJob", "RetainedJob",
            "GrAppv", "SBA_Appv"]

cyc_cols = ["ApprovalMonth", "ApprovalDoW"]

nom_cols = ["Bank", "BankState", "City", "Franchise", "LowDoc", "NAICS",
            "NewExist", "Recession", "RevLineCr", "SameState", "State", "UrbanRural"]

In [8]:
std_scl = StandardScaler()
cyc_dow = CyclicalEncoder("ApprovalDoW", 7)
cyc_mth = CyclicalEncoder("ApprovalMonth", 12)

In [17]:
preproc = ColumnTransformer(
    transformers = [
        ("num", std_scl, num_cols),
        ("cyc_mth", cyc_mth, ["ApprovalMonth"]),
        ("cyc_dow", cyc_dow, ["ApprovalDoW"]),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False
)
preproc.set_output(transform="pandas")

In [18]:
X_train_tr = preproc.fit_transform(X_train)
X_train_tr

Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,GrAppv,SBA_Appv,ApprovalMonth_sin,ApprovalMonth_cos,ApprovalDoW_sin,ApprovalDoW_cos,...,Bank,BankState,SameState,NAICS,Recession,NewExist,Franchise,UrbanRural,RevLineCr,LowDoc
52158,-0.365827,-0.019158,-0.035608,-0.045527,1.224734,1.117235,-5.000000e-01,-8.660254e-01,0.000000,1.000000,...,"READYCAP LENDING, LLC",CA,False,retail_trade,False,False,False,M,Unknown,N
374932,-0.593990,-0.045825,-0.035608,-0.045527,-0.434501,-0.425997,1.000000e+00,6.123234e-17,-0.433884,-0.900969,...,MANUFACTURERS & TRADERS TR CO,NY,True,construction,False,False,False,U,N,N
716138,-0.340475,0.047508,-0.035608,-0.045527,-0.257986,-0.193527,-8.660254e-01,-5.000000e-01,0.433884,-0.900969,...,LOANS FROM OLD CLOSED LENDERS,DC,False,unknown,False,False,False,M,N,N
192181,0.115851,-0.085824,-0.027137,-0.024389,0.059739,-0.196154,1.000000e+00,6.123234e-17,0.974928,-0.222521,...,BRANCH BK. & TR CO,NC,False,arts_entertainment_and_recreation,True,False,False,R,N,N
140792,1.636939,0.007508,-0.001724,-0.045527,0.808160,1.191661,5.000000e-01,-8.660254e-01,0.781831,0.623490,...,CDC SMALL BUS. FINAN CORP,CA,True,manufacturing,False,False,False,U,Unknown,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448689,-1.240452,-0.085824,-0.035608,-0.024389,-0.536879,-0.566092,-5.000000e-01,8.660254e-01,-0.433884,-0.900969,...,FIRST FINANCIAL BANK,AR,True,agriculture_forestry_fishing_and_hunting,False,False,False,R,N,N
755744,-0.644693,-0.085824,-0.035608,-0.045527,-0.505106,-0.463210,-2.449294e-16,1.000000e+00,0.974928,-0.222521,...,MERCHANTS BANK,VT,True,retail_trade,False,False,False,M,N,N
53966,0.115851,-0.139157,-0.035608,-0.045527,-0.505106,-0.480722,-5.000000e-01,-8.660254e-01,0.000000,1.000000,...,ROYAL BUSINESS BANK,CA,True,information,False,False,False,M,N,Y
576823,0.115851,0.887501,-0.035608,-0.045527,2.848666,2.627633,-1.000000e+00,-1.836970e-16,0.000000,1.000000,...,MORTON COMMUNITY BANK,IL,True,manufacturing,False,False,False,U,N,N


In [29]:
nom_indexes = [idx for idx, col in enumerate(X_train_tr.dtypes)
               if col == "object"]
nom_indexes

[10, 11, 12, 13, 15, 19, 20, 21]

## Estimator

In [33]:
train_pool = Pool(data=X_train_tr,
                  label=y_train, 
                  cat_features=nom_indexes,
                  feature_names=X_train_tr.columns.to_list())

In [34]:
cb = CatBoostClassifier(cat_features=nom_cols)

In [35]:
model = make_pipeline(preproc, cb)
model

## Training & Score

In [36]:
%%time
# Not performed for memory gain
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred, average="macro")

Learning rate set to 0.183776
0:	learn: 0.5167199	total: 1.8s	remaining: 29m 56s
1:	learn: 0.4155271	total: 3.55s	remaining: 29m 30s
2:	learn: 0.3532100	total: 5.02s	remaining: 27m 47s
3:	learn: 0.3226529	total: 6.54s	remaining: 27m 9s
4:	learn: 0.2912754	total: 8.52s	remaining: 28m 15s
5:	learn: 0.2656632	total: 10.3s	remaining: 28m 32s
6:	learn: 0.2552164	total: 11.7s	remaining: 27m 46s
7:	learn: 0.2473874	total: 13.4s	remaining: 27m 36s
8:	learn: 0.2380813	total: 15.2s	remaining: 27m 57s
9:	learn: 0.2322812	total: 16.9s	remaining: 27m 57s
10:	learn: 0.2273536	total: 18.3s	remaining: 27m 23s
11:	learn: 0.2206226	total: 20.1s	remaining: 27m 33s
12:	learn: 0.2178386	total: 21.4s	remaining: 27m 3s
13:	learn: 0.2153452	total: 22.7s	remaining: 26m 39s
14:	learn: 0.2120373	total: 24.1s	remaining: 26m 24s
15:	learn: 0.2095526	total: 25.6s	remaining: 26m 13s
16:	learn: 0.2075476	total: 27.4s	remaining: 26m 23s
17:	learn: 0.2051247	total: 29.3s	remaining: 26m 37s
18:	learn: 0.2029649	total: 3

0.922854841138847

In [15]:
df.select_dtypes(include="object")

Unnamed: 0,City,State,Bank,BankState,NAICS,UrbanRural,RevLineCr,LowDoc,MIS_Status
0,EVANSVILLE,IN,FIFTH THIRD BANK,OH,retail_trade,M,N,Y,P I F
1,NEW PARIS,IN,1ST SOURCE BANK,IN,accomodation_and_food_services,M,N,Y,P I F
2,BLOOMINGTON,IN,GRANT COUNTY STATE BANK,IN,health_care_and_social_assistance,M,N,N,P I F
3,BROKEN ARROW,OK,1ST NATL BK & TR CO OF BROKEN,OK,unknown,M,N,Y,P I F
4,ORLANDO,FL,FLORIDA BUS. DEVEL CORP,FL,unknown,M,N,N,P I F
...,...,...,...,...,...,...,...,...,...
899159,UPPER ARLINGTON,OH,JPMORGAN CHASE BANK NATL ASSOC,IL,retail_trade,M,Unknown,N,P I F
899160,COLUMBUS,OH,JPMORGAN CHASE BANK NATL ASSOC,IL,retail_trade,M,Y,N,P I F
899161,SANTA MARIA,CA,"RABOBANK, NATIONAL ASSOCIATION",CA,manufacturing,M,N,N,P I F
899162,HONOLULU,HI,BANK OF HAWAII,HI,unknown,M,N,Y,CHGOFF


In [16]:
# (df
#  .select_dtypes(include="object")
#  .map(lambda x: 'AURORA' in x)
#  .sum()
# )