# Imports

In [17]:
import category_encoders as ce
import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint
from scipy.stats import loguniform, uniform
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PolynomialFeatures, StandardScaler
import xgboost as xgb

In [6]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    """Encode a periodic column (day of week, month, ...) as a sine/cosine pair.

    The column ``column_name`` is replaced by two columns,
    ``<column_name>_sin`` and ``<column_name>_cos``, computed from the angle
    ``2 * pi * value / cycle_length``. Any other column of the input frame is
    kept as-is and placed before the two encoded columns.

    Parameters
    ----------
    column_name : str
        Name of the column holding the cyclical values.
    cycle_length : int or float
        Number of steps in one full cycle (e.g. 7 for days of the week).
    """
    def __init__(self, column_name, cycle_length):
        self.column_name = column_name
        self.cycle_length = cycle_length

    def fit(self, X, y=None):
        """Do nothing; present only for compatibility with sklearn's API."""
        return self

    def transform(self, X, y=None):
        """Return a new frame where ``column_name`` is replaced by sin/cos columns.

        The input frame is not modified.
        """
        angle = 2 * np.pi * X[self.column_name] / self.cycle_length
        # `drop` returns a new frame, so the caller's data is left untouched.
        encoded = X.drop(columns=[self.column_name])
        encoded[f'{self.column_name}_sin'] = np.sin(angle)
        encoded[f'{self.column_name}_cos'] = np.cos(angle)
        return encoded

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, in the order produced by ``transform``.

        Parameters
        ----------
        input_features : sequence of str or None
            Names of the columns given to ``transform``. When provided, every
            name other than ``column_name`` is reported first (those columns
            pass through ``transform`` unchanged), followed by the two encoded
            names. When ``None``, only the two encoded names are returned.

        Returns
        -------
        numpy.ndarray of object
        """
        encoded_names = [f'{self.column_name}_sin', f'{self.column_name}_cos']
        if input_features is None:
            kept_names = []
        else:
            # Mirror `transform`: untouched columns first, encoded pair last.
            kept_names = [name for name in input_features
                          if name != self.column_name]
        return np.array(kept_names + encoded_names, dtype=object)

# Data Loading

In [7]:
%%time
df = pd.read_csv("../data/cleaned_dataset.csv", index_col=0)

CPU times: user 5.26 s, sys: 491 ms, total: 5.75 s
Wall time: 5.82 s


# Dropping Useless Columns

In [8]:
# Remove the columns listed below. This cell reassigns `df`, so running it a
# second time would raise KeyError (the columns are already gone);
# errors="ignore" keeps the cell idempotent.
df = df.drop(columns=['LoanNr_ChkDgt', 'Name'], errors="ignore")

# Separating Features and Target

In [9]:
# Split the frame into the target column (y) and the remaining columns (X).
# Both are independent copies, so `df` itself is left unchanged.
y = df["MIS_Status"].copy()
X = df.drop(columns="MIS_Status")

# Hold-Out

In [10]:
# Hold out 5% of the rows for testing. The split is stratified on y
# (requested explicitly through `stratify=y`) and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    stratify=y,
    random_state=42,
)

# Model

## Preprocessing

In [11]:
# Column groups, one per preprocessing strategy.

# Numerical columns.
num_cols = [
    "Term",
    "NoEmp",
    "CreateJob",
    "RetainedJob",
    "GrAppv",
    "SBA_Appv",
]

# Cyclical (time-like) columns.
# NOTE(review): the visible cells reference these two columns by literal name
# rather than through this list - confirm whether it is still needed.
cyc_cols = [
    "ApprovalMonth",
    "ApprovalDoW",
]

# Nominal (categorical) columns.
nom_cols = [
    "Bank",
    "BankState",
    "City",
    "Franchise",
    "LowDoc",
    "NAICS",
    "NewExist",
    "Recession",
    "RevLineCr",
    "SameState",
    "State",
    "UrbanRural",
]

In [16]:
# Building blocks of the preprocessing / modelling pipeline.

# Degree-2 polynomial expansion.
polynom = PolynomialFeatures(degree=2)

# Standard scaling.
std_scl = StandardScaler()

# Sine/cosine encoders: day of week on a 7-step cycle, month on a 12-step cycle.
cyc_dow = CyclicalEncoder(column_name="ApprovalDoW", cycle_length=7)
cyc_mth = CyclicalEncoder(column_name="ApprovalMonth", cycle_length=12)

# Target encoding of the nominal columns.
cat_enc = ce.TargetEncoder(cols=nom_cols)

# PCA configured with n_components=0.9 (a float in (0, 1) is interpreted by
# scikit-learn as the fraction of variance to retain).
pca = PCA(n_components=0.9)

In [14]:
# Route each column group to its dedicated transformer. Output feature names
# are not prefixed with the transformer name, and the transformer is configured
# to return pandas DataFrames.
preproc = ColumnTransformer(
    [
        ("polynum", polynom, num_cols),
        ("cyc_mth", cyc_mth, ["ApprovalMonth"]),
        ("cyc_dow", cyc_dow, ["ApprovalDoW"]),
        ("cat_enc", cat_enc, nom_cols),
    ],
    verbose_feature_names_out=False,
).set_output(transform="pandas")
# Bare expression so the notebook displays the configured transformer.
preproc

In [19]:
# XGBoost classifier created with the library's default hyperparameters.
xgbc = xgb.XGBClassifier()

In [20]:
# Chain the steps in order: column-wise preprocessing, standard scaling,
# PCA, then the classifier.
model = make_pipeline(preproc, std_scl, pca, xgbc)
# Bare expression so the notebook displays the pipeline.
model

## Training & Score

In [None]:
%%time
# Not performed for memory gain
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred, average="macro")

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [15]:
# Display only the object-dtype columns of df.
df.select_dtypes(include="object")

Unnamed: 0,City,State,Bank,BankState,NAICS,UrbanRural,RevLineCr,LowDoc,MIS_Status
0,EVANSVILLE,IN,FIFTH THIRD BANK,OH,retail_trade,M,N,Y,P I F
1,NEW PARIS,IN,1ST SOURCE BANK,IN,accomodation_and_food_services,M,N,Y,P I F
2,BLOOMINGTON,IN,GRANT COUNTY STATE BANK,IN,health_care_and_social_assistance,M,N,N,P I F
3,BROKEN ARROW,OK,1ST NATL BK & TR CO OF BROKEN,OK,unknown,M,N,Y,P I F
4,ORLANDO,FL,FLORIDA BUS. DEVEL CORP,FL,unknown,M,N,N,P I F
...,...,...,...,...,...,...,...,...,...
899159,UPPER ARLINGTON,OH,JPMORGAN CHASE BANK NATL ASSOC,IL,retail_trade,M,Unknown,N,P I F
899160,COLUMBUS,OH,JPMORGAN CHASE BANK NATL ASSOC,IL,retail_trade,M,Y,N,P I F
899161,SANTA MARIA,CA,"RABOBANK, NATIONAL ASSOCIATION",CA,manufacturing,M,N,N,P I F
899162,HONOLULU,HI,BANK OF HAWAII,HI,unknown,M,N,Y,CHGOFF


In [16]:
# (df
#  .select_dtypes(include="object")
#  .map(lambda x: 'AURORA' in x)
#  .sum()
# )

In [40]:
# Search space for the randomized hyper-parameter search: plain lists are
# candidate values, scipy.stats objects are distributions to sample from.
param_distributions = dict(
    # Discrete candidate values.
    iterations=[100, 200, 400, 800],
    depth=[1, 2, 4],
    # Log-uniform sampling over several orders of magnitude.
    learning_rate=loguniform(0.01, 1.0),
    random_strength=loguniform(1e-9, 10),
    # Discrete candidate values.
    l2_leaf_reg=[2, 4, 8, 16],
    # Uniform over [0, 1].
    bagging_temperature=uniform(0, 1),
    # Random integer with low=1 and high=255 (scipy's randint excludes `high`).
    border_count=sp_randint(1, 255),
)

In [49]:
# NOTE(review): Pool, CatBoostClassifier, X_train_tr and nom_indexes are not
# defined or imported in the visible cells - confirm they are provided
# elsewhere before running this on a fresh kernel.

# Training pool built from the transformed training features and y_train.
# NOTE(review): train_pool is not referenced by the visible search cell -
# confirm whether it is still needed.
train_pool = Pool(
    data=X_train_tr,
    label=y_train,
    cat_features=nom_indexes,
    feature_names=X_train_tr.columns.tolist(),
)

# CatBoost classifier: nominal columns are declared as categorical features
# and "TotalF1" is used as the evaluation metric.
cb = CatBoostClassifier(
    cat_features=nom_cols,
    eval_metric="TotalF1",
)

In [None]:
%%time
search_results = cb.randomized_search(
    param_distributions,
    X=X_train_tr,
    y=y_train,
    cv=5,
    n_iter=20,
    partition_random_seed=42,
    verbose=True
)

0:	learn: 0.8536687	test: 0.8521111	best: 0.8521111 (0)	total: 567ms	remaining: 56.2s
1:	learn: 0.8536687	test: 0.8521111	best: 0.8521111 (0)	total: 1.07s	remaining: 52.4s
2:	learn: 0.8536687	test: 0.8521111	best: 0.8521111 (0)	total: 1.54s	remaining: 49.9s
3:	learn: 0.8536687	test: 0.8521111	best: 0.8521111 (0)	total: 2.13s	remaining: 51.2s
4:	learn: 0.8536687	test: 0.8521111	best: 0.8521111 (0)	total: 2.64s	remaining: 50.2s
5:	learn: 0.8536687	test: 0.8521111	best: 0.8521111 (0)	total: 3.15s	remaining: 49.4s
6:	learn: 0.8536687	test: 0.8521111	best: 0.8521111 (0)	total: 3.61s	remaining: 48s
7:	learn: 0.8536687	test: 0.8521111	best: 0.8521111 (0)	total: 4.14s	remaining: 47.6s
8:	learn: 0.8536687	test: 0.8521111	best: 0.8521111 (0)	total: 4.65s	remaining: 47.1s
9:	learn: 0.8536687	test: 0.8521111	best: 0.8521111 (0)	total: 5.22s	remaining: 47s
10:	learn: 0.8536687	test: 0.8521111	best: 0.8521111 (0)	total: 5.68s	remaining: 46s
11:	learn: 0.8536687	test: 0.8521111	best: 0.8521111 (0)	to