# Imports

In [1]:
from typing import List

import category_encoders as ce
import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Data Loading

In [2]:
%%time
# Load the cleaned dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    "../data/cleaned_dataset.csv",
    index_col=0,
)

CPU times: user 2.85 s, sys: 474 ms, total: 3.32 s
Wall time: 3.35 s


# Dropping Useless Columns

In [3]:
# Remove the two columns that are not used as features.
df = df.drop(['LoanNr_ChkDgt', 'Name'], axis="columns")

# Separating Features and Target

In [4]:
# Target first, then every remaining column as the feature matrix.
# Both are independent copies, so `df` itself is left untouched.
y = df["MIS_Status"].copy()
X = df.drop(columns=["MIS_Status"])

# Hold-Out

In [5]:
# Hold out 5% of the rows for the final evaluation. Stratification is requested
# explicitly (stratify=y) so both splits keep the class balance of the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    stratify=y,
    random_state=42,
)

# Model

## Encoding Target

In [6]:
# Learn the label -> integer mapping on the training target only,
# then apply that same mapping to both splits.
lbl_enc = LabelEncoder().fit(y_train)

y_train = lbl_enc.transform(y_train)
y_test = lbl_enc.transform(y_test)

## Preprocessing

In [7]:
# Feature groups: each list is routed to its own transformer in the
# preprocessing ColumnTransformer.

# Numeric columns (standard-scaled).
num_cols = ["Term", "NoEmp", "CreateJob", "RetainedJob",
            "GrAppv", "SBA_Appv"]

# Boolean columns (one-hot encoded). They are taken from the feature frame `X`
# rather than `df`: `df` still contains the target column, which must never be
# listed as a feature.
bin_cols = X.select_dtypes("bool").columns

# Low-cardinality nominal columns (one-hot encoded).
nom_cols = ["UrbanRural", "RevLineCr", "LowDoc"]

# Periodic columns (sine/cosine encoded).
# NOTE(review): this list is not referenced by the visible cells, which name the
# two columns directly when building the transformers.
cyc_cols = ["ApprovalMonth", "ApprovalDoW"]

# High-cardinality categorical columns (target-encoded).
tgt_cols = ["State", "BankState", "NAICS", "Bank", "City"]

### "Simple" Nominal And Numerical Processes

In [8]:
# Numeric features are standardized.
std_scl = StandardScaler()

# Boolean features: for a two-valued feature one of the two indicator columns
# is dropped (drop="if_binary").
ohe_bin = OneHotEncoder(
    handle_unknown="ignore",
    drop="if_binary",
    sparse_output=True,
)

# Nominal features keep every category: no drop="first", because the
# downstream model is nonlinear.
ohe_nom = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=True,
)

### Cyclical Encoding

In [9]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    """Encode a periodic column (day of week, month, ...) as a sine/cosine pair.

    The source column is replaced by ``<column_name>_sin`` and
    ``<column_name>_cos``, appended after the other columns of the input
    frame, which are passed through unchanged.

    Parameters
    ----------
    column_name : str
        Name of the column of ``X`` holding the periodic values.
    cycle_length : int or float
        Number of steps in one full cycle (e.g. 7 for days of the week).
    """
    def __init__(self, column_name, cycle_length):
        self.column_name = column_name
        self.cycle_length = cycle_length

    def fit(self, X, y=None):
        """Record the input column names; the encoding itself needs no fitting.

        Returns
        -------
        self
        """
        # Remember the input columns so get_feature_names_out() can describe the
        # whole output even when it is called without `input_features`.
        if hasattr(X, "columns"):
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        return self

    def transform(self, X, y=None):
        """Return a new frame where the periodic column is replaced by its sin/cos pair.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the column ``column_name``.
        y : ignored
            Accepted for API compatibility only.

        Returns
        -------
        pandas.DataFrame
            ``X`` without ``column_name``, followed by the ``_sin`` and ``_cos``
            columns. The caller's frame is not modified.
        """
        # Position of each value on the cycle, expressed as an angle in radians.
        angle = 2 * np.pi * X[self.column_name] / self.cycle_length
        # drop() and assign() both return new frames, so no explicit copy or
        # in-place mutation is needed.
        return X.drop(columns=[self.column_name]).assign(**{
            f'{self.column_name}_sin': np.sin(angle),
            f'{self.column_name}_cos': np.cos(angle),
        })

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, in the order produced by ``transform``.

        Parameters
        ----------
        input_features : array-like of str, optional
            Names of the input columns. When omitted, the names recorded by
            ``fit`` are used; if none were recorded, the input is taken to be
            the single column ``column_name``.

        Returns
        -------
        numpy.ndarray of object
            Pass-through column names followed by the ``_sin`` and ``_cos`` names.
        """
        if input_features is None:
            input_features = getattr(self, "feature_names_in_", [self.column_name])
        # Every input column other than the encoded one is passed through by
        # transform(), so it has to appear in the reported names as well.
        passthrough = [name for name in input_features if name != self.column_name]
        return np.array(
          passthrough + [f'{self.column_name}_sin', f'{self.column_name}_cos'],
          dtype=object
        )

In [10]:
# One encoder per periodic feature: a 7-step cycle for the day of week,
# a 12-step cycle for the month.
cyc_dow = CyclicalEncoder(column_name="ApprovalDoW", cycle_length=7)
cyc_mth = CyclicalEncoder(column_name="ApprovalMonth", cycle_length=12)

### Target Encoding for `State`, `BankState`, `NAICS`, `Bank`, `City`

In [11]:
# Target encoder restricted to the high-cardinality categorical columns.
tgt_enc = ce.TargetEncoder(
    cols=tgt_cols,
)

In [12]:
# Route each feature group to its dedicated transformer.
# verbose_feature_names_out=False keeps the output names free of the
# "<step>__" prefix. Columns that belong to no group are not forwarded
# (ColumnTransformer's default `remainder` is "drop").
preproc = ColumnTransformer(
    [
        ("num", std_scl, num_cols),
        ("bin", ohe_bin, bin_cols),
        ("nom", ohe_nom, nom_cols),
        ("cyc_mth", cyc_mth, ["ApprovalMonth"]),
        ("cyc_dow", cyc_dow, ["ApprovalDoW"]),
        ("tgt_enc", tgt_enc, tgt_cols),
    ],
    verbose_feature_names_out=False,
)

## Feature Selection

In [13]:
# Random forest with library-default hyperparameters and a fixed seed.
rfc = RandomForestClassifier(
    random_state=0,
)

In [14]:
# Importance-based feature selector built on the forest, with an
# importance threshold of 0.01.
sfm = SelectFromModel(estimator=rfc, threshold=0.01)

## Estimator

In [15]:
# Full pipeline, in execution order.
model = make_pipeline(
    preproc,  # encode / scale the raw columns
    sfm,      # importance-based feature selection
    rfc,      # final classifier
)
model

## Training & Score

In [None]:
%%time
# Baseline: fit the untuned pipeline, predict the hold-out set and report macro-F1.
y_pred = model.fit(X_train, y_train).predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred, average="macro")

# `RandomizedSearchCV`

In [None]:
# Search space for the pipeline step named "randomforestclassifier"
# (hence the key prefix).
param_dist = {
    # Two named heuristics, None, then ten evenly spaced fractions 0.1 ... 1.0.
    "randomforestclassifier__max_features": [
        'sqrt', 'log2', None, *np.linspace(0.1, 1.0, 10)
    ],
    # Integers drawn from [1, 20).
    "randomforestclassifier__min_samples_leaf": sp_randint(1, 20),
    "randomforestclassifier__bootstrap": [True, False],
    # Integers drawn from [100, 2000).
    "randomforestclassifier__n_estimators": sp_randint(100, 2_000),
}

In [None]:
%%time
# Randomized, cross-validated search over `param_dist`.
# Candidates are ranked by macro-F1, the same metric the notebook reports on the
# hold-out set, instead of the estimator's default score (accuracy).
random_search = RandomizedSearchCV(model,
                                   param_distributions=param_dist,
                                   n_iter=100,
                                   scoring="f1_macro",
                                   cv=5,
                                   verbose=1,
                                   n_jobs=-1,
                                   random_state=42)
random_search.fit(X_train, y_train)

In [None]:
# Score the tuned pipeline on the hold-out set.
# The refit winner is exposed as `best_estimator_`; RandomizedSearchCV has no
# `best_model_` attribute.
y_pred = random_search.best_estimator_.predict(X_test)
f1_score(y_test, y_pred, average="macro")