# Imports

In [1]:
# pip install catboost 

In [2]:
from catboost import CatBoostClassifier, Pool
import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint
from scipy.stats import loguniform, uniform
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [3]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    """Meant to encode time data with cycles (days of week, month...)"""
    def __init__(self, column_name, cycle_length):
        self.column_name = column_name
        self.cycle_length = cycle_length

    def fit(self, X, y=None):
        # No fitting needed, implemented for compatibility with sklearn's API
        return self

    def transform(self, X, y=None):
        # Apply cyclical encoding directly without needing to fit
        X = X.copy()
        values = X[self.column_name]
        # Create the cyclical features
        X[f'{self.column_name}_sin'] = np.sin(2 * np.pi * values / self.cycle_length)
        X[f'{self.column_name}_cos'] = np.cos(2 * np.pi * values / self.cycle_length)
        # Drop the original column
        X.drop(columns=[self.column_name], inplace=True)
        return X

    def get_feature_names_out(self, input_features=None):
        # Generate names for the output features
        return np.array(
          [f'{self.column_name}_sin', f'{self.column_name}_cos'], dtype=object
        )

# Data Loading

In [4]:
%%time
df = pd.read_csv("../cleaned_dataset.csv", index_col=0)

CPU times: user 2.47 s, sys: 385 ms, total: 2.85 s
Wall time: 2.85 s


# Droping Useless Columns

In [5]:
df = df.drop(columns=['LoanNr_ChkDgt', 'Name'])
df = df.dropna()

# Separating Features and Target

In [6]:
X = df.copy()
y = X.pop("MIS_Status")

# Hold-Out

In [7]:
# Stratify with y by default
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.05,
                                                    stratify=y,
                                                    random_state=42)

# Model

## Preprocessing

In [8]:
# Splitting columns
# num_cols = ["Term", "NoEmp", "CreateJob", "RetainedJob",
num_cols = ["Term", "NoEmp", "RetainedJob",
            "GrAppv", "SBA_Appv"]

cyc_cols = ["ApprovalMonth", "ApprovalDoW"]

nom_cols = ["BankState", "Franchise", "LowDoc", "NAICS", 'City', 'Bank',
            "NewExist", "Recession", "RevLineCr", "State", "UrbanRural"]

In [9]:
std_scl = StandardScaler()
cyc_dow = CyclicalEncoder("ApprovalDoW", 7)
cyc_mth = CyclicalEncoder("ApprovalMonth", 12)

In [10]:
preproc = ColumnTransformer(
    transformers = [
        ("num", std_scl, num_cols),
        ("cyc_mth", cyc_mth, ["ApprovalMonth"]),
        ("cyc_dow", cyc_dow, ["ApprovalDoW"]),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False
)
preproc.set_output(transform="pandas")

In [11]:
X_train_tr = preproc.fit_transform(X_train)
X_train_tr

Unnamed: 0,Term,NoEmp,RetainedJob,GrAppv,SBA_Appv,ApprovalMonth_sin,ApprovalMonth_cos,ApprovalDoW_sin,ApprovalDoW_cos,City,...,BankState,SameState,NAICS,Recession,NewExist,CreateJob,Franchise,UrbanRural,RevLineCr,LowDoc
52158,-0.365827,-0.019158,-0.045527,1.224734,1.117235,-5.000000e-01,-8.660254e-01,0.000000,1.000000,AURORA,...,CA,False,retail_trade,False,False,0,False,M,Unknown,N
374932,-0.593990,-0.045825,-0.045527,-0.434501,-0.425997,1.000000e+00,6.123234e-17,-0.433884,-0.900969,BUFFALO,...,NY,True,construction,False,False,0,False,U,N,N
716138,-0.340475,0.047508,-0.045527,-0.257986,-0.193527,-8.660254e-01,-5.000000e-01,0.433884,-0.900969,RIVERSIDE,...,DC,False,unknown,False,False,0,False,M,N,N
192181,0.115851,-0.085824,-0.024389,0.059739,-0.196154,1.000000e+00,6.123234e-17,0.974928,-0.222521,ODENTON,...,NC,False,arts_entertainment_and_recreation,True,False,2,False,R,N,N
140792,1.636939,0.007508,-0.045527,0.808160,1.191661,5.000000e-01,-8.660254e-01,0.781831,0.623490,SAN BERNARDINO,...,CA,True,manufacturing,False,False,8,False,U,Unknown,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448689,-1.240452,-0.085824,-0.024389,-0.536879,-0.566092,-5.000000e-01,8.660254e-01,-0.433884,-0.900969,Summers,...,AR,True,agriculture_forestry_fishing_and_hunting,False,False,0,False,R,N,N
755744,-0.644693,-0.085824,-0.045527,-0.505106,-0.463210,-2.449294e-16,1.000000e+00,0.974928,-0.222521,COLCHESTER,...,VT,True,retail_trade,False,False,0,False,M,N,N
53966,0.115851,-0.139157,-0.045527,-0.505106,-0.480722,-5.000000e-01,-8.660254e-01,0.000000,1.000000,YORBA LINDA,...,CA,True,information,False,False,0,False,M,N,Y
576823,0.115851,0.887501,-0.045527,2.848666,2.627633,-1.000000e+00,-1.836970e-16,0.000000,1.000000,ROCKFORD,...,IL,True,manufacturing,False,False,0,False,U,N,N


In [12]:
nom_indexes = [idx for idx, col in enumerate(X_train_tr.dtypes)
               if col == "object"]
nom_indexes

[9, 10, 11, 12, 14, 19, 20, 21]

## Estimator

In [13]:
train_pool = Pool(data=X_train_tr,
                  label=y_train, 
                  cat_features=nom_indexes,
                  feature_names=X_train_tr.columns.to_list())

In [14]:
#cb = CatBoostClassifier(cat_features=nom_cols, eval_metric="TotalF1")

In [15]:
cb = CatBoostClassifier(cat_features=nom_cols, eval_metric="TotalF1")

In [16]:
model = make_pipeline(preproc, cb)
model

## Training & Score

In [17]:
%%time
# Not performed for memory gain
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred, average="macro")

Learning rate set to 0.183776
0:	learn: 0.8824272	total: 751ms	remaining: 12m 29s
1:	learn: 0.8860970	total: 1.32s	remaining: 11m 1s
2:	learn: 0.9001189	total: 1.82s	remaining: 10m 4s
3:	learn: 0.9037058	total: 2.26s	remaining: 9m 22s
4:	learn: 0.9075891	total: 2.88s	remaining: 9m 33s
5:	learn: 0.9118968	total: 3.38s	remaining: 9m 20s
6:	learn: 0.9148758	total: 3.82s	remaining: 9m 2s
7:	learn: 0.9156581	total: 4.34s	remaining: 8m 58s
8:	learn: 0.9160741	total: 4.88s	remaining: 8m 57s
9:	learn: 0.9167686	total: 5.38s	remaining: 8m 52s
10:	learn: 0.9191121	total: 5.81s	remaining: 8m 42s
11:	learn: 0.9195264	total: 6.36s	remaining: 8m 43s
12:	learn: 0.9197399	total: 6.83s	remaining: 8m 38s
13:	learn: 0.9197430	total: 7.39s	remaining: 8m 40s
14:	learn: 0.9217649	total: 7.84s	remaining: 8m 34s
15:	learn: 0.9231963	total: 8.29s	remaining: 8m 29s
16:	learn: 0.9241156	total: 8.76s	remaining: 8m 26s
17:	learn: 0.9243234	total: 9.29s	remaining: 8m 26s
18:	learn: 0.9263402	total: 9.72s	remaining:

0.9240111953258152

0.9240 sans aucun param et en ayant suppr que index et name 

F1 SCORE MACRO (sans aucun param) : 0.922854841138847 (juste suppr index et name)

F1 SCORE MACRO (sans aucun param) : 0.9196447775840155 (juste suppr index, name et city)

F1 SCORE MACRO (sans aucun param) : 0.9100917662983697 (juste suppr index, name bank et city)

In [18]:
df.select_dtypes(include="object")

Unnamed: 0,City,State,Bank,BankState,NAICS,UrbanRural,RevLineCr,LowDoc,MIS_Status
0,EVANSVILLE,IN,FIFTH THIRD BANK,OH,retail_trade,M,N,Y,P I F
1,NEW PARIS,IN,1ST SOURCE BANK,IN,accomodation_and_food_services,M,N,Y,P I F
2,BLOOMINGTON,IN,GRANT COUNTY STATE BANK,IN,health_care_and_social_assistance,M,N,N,P I F
3,BROKEN ARROW,OK,1ST NATL BK & TR CO OF BROKEN,OK,unknown,M,N,Y,P I F
4,ORLANDO,FL,FLORIDA BUS. DEVEL CORP,FL,unknown,M,N,N,P I F
...,...,...,...,...,...,...,...,...,...
899159,UPPER ARLINGTON,OH,JPMORGAN CHASE BANK NATL ASSOC,IL,retail_trade,M,Unknown,N,P I F
899160,COLUMBUS,OH,JPMORGAN CHASE BANK NATL ASSOC,IL,retail_trade,M,Y,N,P I F
899161,SANTA MARIA,CA,"RABOBANK, NATIONAL ASSOCIATION",CA,manufacturing,M,N,N,P I F
899162,HONOLULU,HI,BANK OF HAWAII,HI,unknown,M,N,Y,CHGOFF


In [19]:
# (df
#  .select_dtypes(include="object")
#  .map(lambda x: 'AURORA' in x)
#  .sum()
# )