# Imports

In [1]:
import json
import os

from catboost import CatBoostClassifier, Pool
from numba import cuda
import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint
from scipy.stats import loguniform, uniform
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# GPU Detector

CatBoost can use a GPU as a hardware accelerator when fitting a model or running a randomized search.

This function detects whether your machine has a CUDA-compatible GPU, so that the notebook can decide in which mode (GPU or CPU) CatBoost should run.

In [14]:
def is_gpu_available():
    """Tell whether numba can reach a CUDA-compatible GPU.

    Returns
    -------
    bool
        True when ``cuda.get_current_device()`` succeeds, False when it
        raises ``cuda.CudaSupportError``.
    """
    try:
        # Only the success of the call matters, so the device handle is discarded
        cuda.get_current_device()
    except cuda.CudaSupportError:
        # No CUDA-compatible device is detected
        return False
    return True

# Probe the GPU once; later cells read this flag to set CatBoost's task_type
# to "GPU" or "CPU"
use_gpu = is_gpu_available()

print("Is a CUDA-compatible GPU available?", use_gpu)

Is a CUDA-compatible GPU available? True


# Cyclical Encoding

This class will be used later to encode temporal features extracted from date data to create cyclical features.

In [4]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    """Replace a periodic column (day of week, month...) by a sine/cosine pair.

    Parameters
    ----------
    column_name : str
        Name of the column of the input frame holding the periodic values.
    cycle_length : int or float
        Number of units in one full cycle (e.g. 7 for days of the week).
    """

    def __init__(self, column_name, cycle_length):
        self.column_name = column_name
        self.cycle_length = cycle_length

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy scikit-learn's estimator API."""
        return self

    def transform(self, X, y=None):
        """Return a new frame where the column is swapped for its sin/cos encoding.

        The input frame is left untouched; the two generated columns are named
        ``<column_name>_sin`` and ``<column_name>_cos``.
        """
        source = self.column_name
        # Position on the unit circle, in radians
        angle = 2 * np.pi * X[source] / self.cycle_length
        encoded_columns = {
            f'{source}_sin': np.sin(angle),
            f'{source}_cos': np.cos(angle),
        }
        # drop() already returns a new frame, so X itself is never modified
        return X.drop(columns=[source]).assign(**encoded_columns)

    def get_feature_names_out(self, input_features=None):
        """Return the names of the two generated columns (sine first, then cosine)."""
        generated = [f'{self.column_name}_{suffix}' for suffix in ('sin', 'cos')]
        return np.array(generated, dtype=object)

# Data Loading

> **NOTE**
>
> Data loading could also be accelerated with CUDA, although the gain would be marginal here.

In [5]:
%%time
# Load the cleaned dataset; index_col=0 uses the first CSV column as the row index
df = pd.read_csv("../data/cleaned_dataset.csv", index_col=0)

CPU times: user 2.41 s, sys: 330 ms, total: 2.74 s
Wall time: 2.76 s


# Dropping Useless Columns

In [6]:
# Remove the loan number and name columns
# (presumably pure identifiers with no predictive value — TODO confirm)
df = df.drop(columns=['LoanNr_ChkDgt', 'Name'])

# Separating Features and Target

In [7]:
# Work on a copy so that df keeps its target column;
# pop() removes MIS_Status from X and returns it as the target y
X = df.copy()
y = X.pop("MIS_Status")

# Hold-Out

In [8]:
# Hold out 5% of the rows as a test set, stratified on y so that both splits
# keep the same class proportions; random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.05,
                                                    stratify=y,
                                                    random_state=42)

# Model

## Preprocessing

This first step involves:
- scaling numerical features,
- creating cyclical features from those derived from date,
- letting the other (categorical) features pass through untouched, as they will be handled directly by CatBoost.

In [10]:
# Splitting columns
num_cols = ["Term", "NoEmp", "CreateJob", "RetainedJob",
            "GrAppv", "SBA_Appv"]

cyc_cols = ["ApprovalMonth", "ApprovalDoW"]

# Nominal columns: they are not transformed by the preprocessing step (they go
# through `remainder="passthrough"`), but the CatBoostClassifier instances built
# further down receive this list as `cat_features`, so it has to be defined for
# the notebook to run on a fresh kernel.
nom_cols = ["Bank", "BankState", "City", "Franchise", "LowDoc", "NAICS",
            "NewExist", "Recession", "RevLineCr", "SameState", "State", "UrbanRural"]

# Instantiating transformers
std_scl = StandardScaler()
cyc_dow = CyclicalEncoder("ApprovalDoW", 7)     # 7 days in a week
cyc_mth = CyclicalEncoder("ApprovalMonth", 12)  # 12 months in a year

# Building preprocessing step
preproc = ColumnTransformer(
    transformers = [
        ("num", std_scl, num_cols),
        ("cyc_mth", cyc_mth, ["ApprovalMonth"]),
        ("cyc_dow", cyc_dow, ["ApprovalDoW"]),
    ],
    # Columns not listed above are forwarded as-is
    remainder="passthrough",
    # Keep the original column names (no "<transformer>__" prefix)
    verbose_feature_names_out=False
)
# Return DataFrames rather than NumPy arrays, so column names and dtypes survive
preproc.set_output(transform="pandas")

In [11]:
# Fit the preprocessing on the training split and keep the transformed frame;
# the next cell inspects its dtypes to locate the categorical columns
X_train_tr = preproc.fit_transform(X_train)
# X_train_tr

In [12]:
# CatBoost has to be told which columns are categorical: collect the position
# of every object-dtype column of the preprocessed training frame
nom_indexes = [
    position
    for position, dtype in enumerate(X_train_tr.dtypes)
    if dtype == "object"
]
nom_indexes

[10, 11, 12, 13, 15, 19, 20, 21]

## Estimator

> **NOTE** (Thibaut)
>
> Without setting `eval_metric='TotalF1'`, I was getting better results, which is rather odd; in any case, we can always roll back.

In [18]:
# Pool bundling the preprocessed training frame, its labels and the positions of
# the categorical columns.
# NOTE(review): no cell visible in this notebook consumes train_pool — confirm
# whether it is still needed.
train_pool = Pool(data=X_train_tr,
                  label=y_train, 
                  cat_features=nom_indexes,
                  feature_names=X_train_tr.columns.to_list())

# Classifier used as the final step of the pipeline built in the next cell.
# Categorical columns are given by name, training runs on GPU when one was
# detected, and progress is logged every 100 iterations.
cb = CatBoostClassifier(cat_features=nom_cols,
                        eval_metric="TotalF1",
                        task_type="GPU" if use_gpu else "CPU",
                        verbose=100,
                        random_seed=42)  # Fixed seed, for reproducibility

In [19]:
# Chain the preprocessing ColumnTransformer and the CatBoost classifier
model = make_pipeline(preproc, cb)
model

## Raw Training & Score - Initiate Monitoring

In [30]:
%%time

# The results file doubles as a "baseline already computed" marker: the
# expensive default-parameter fit below only runs when it does not exist yet
res_path = "../data/results.csv"
if not os.path.exists(res_path):    
    # This will be our baseline: fit with default hyperparameters and score
    # the hold-out set with macro-averaged F1
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    raw_score = f1_score(y_test, y_pred, average="macro")
    print(f"F1 macro score: {raw_score:.4f}")
    
    # Retrieving all CatBoost parameters (model[-1] is the classifier step)
    cb_all_params = model[-1].get_all_params()
    
    # Define chosen hyperparameters for tuning
    hyperparams = [
        'iterations',
        'depth',
        'learning_rate',
        'random_strength',
        'bagging_temperature',
        'border_count',
        'l2_leaf_reg',
    ]
    
    # Selecting chosen hyperparameters among all parameters;
    # a parameter CatBoost does not report is recorded as NaN
    cb_raw = {hp: cb_all_params.get(hp, np.nan)
              for hp in hyperparams}
    # Adding f1_macro score to dict
    cb_raw["f1_macro"] = raw_score
    print(cb_raw)
    
    # Dump raw data (for reference); the randomized search reads this file to
    # get its initial score to beat
    with open("../data/catboost_raw.json", "w") as dump_file:
        json.dump(cb_raw, dump_file, indent=4)
    
    # Initialize Results DataFrame with the baseline as its single row
    df_res = pd.DataFrame([cb_raw])
    display(df_res)
    # Save it into a csv file
    df_res.to_csv(res_path)
else:
    print("Raw run has already been proceeded")

Raw run has already been proceeded
CPU times: user 628 µs, sys: 17 µs, total: 645 µs
Wall time: 534 µs


# Randomized Search

## Config & Launch

There is room for improvement in this randomized search: inspecting the results afterwards seems to indicate that it isn't really well randomized.

This should be explored further.

In [27]:
# Search space of the randomized search: lists/arrays are candidate values,
# scipy distributions are sampled from
param_distributions = {
    'iterations': [1_000],                        # single value: not tuned
    'depth': [1, 2, 3, 4, 5, 6, 7, 8],
    'learning_rate': loguniform(0.01, 1.0),       # log-uniform over [0.01, 1]
    'random_strength': loguniform(1e-9, 10),      # log-uniform over [1e-9, 10]
    'bagging_temperature': uniform(0, 1),         # uniform over [0, 1]
    'border_count': np.arange(1, 256),            # integers 1..255
    'l2_leaf_reg': np.arange(2, 31),              # integers 2..30
}

In [28]:
# Same Pool as the one built in the Estimator section (preprocessed training
# frame, labels, positions of the categorical columns).
# NOTE(review): no cell visible in this notebook consumes train_pool — confirm
# whether this cell is still needed.
train_pool = Pool(data=X_train_tr,
                  label=y_train, 
                  cat_features=nom_indexes,
                  feature_names=X_train_tr.columns.to_list())

In [31]:
def run_randomized_searchs(n_runs: int = 1) -> pd.DataFrame:
    """Run `n_runs` CatBoost randomized searches and log the best of each one.

    Each run is a single randomized search that samples 3 parameter sets
    (``n_iter=3``) from the notebook-level ``param_distributions``. The
    classifier left by the search is then scored on the hold-out set with
    macro-averaged F1, and the best parameters plus that score are appended as
    one row to ``../data/results.csv``. Whenever a run beats the best score
    recorded so far, its parameters are written to
    ``../data/catboost_rs.json``.

    Relies on notebook globals: ``param_distributions``, ``nom_cols``,
    ``use_gpu``, ``model``, ``X_train_tr``, ``y_train``, ``X_test`` and
    ``y_test``.

    Parameters
    ----------
    n_runs : int, default 1
        Number of randomized searches to launch.

    Returns
    -------
    pd.DataFrame
        Content of the results file, extended with one row per run.

    Raises
    ------
    FileNotFoundError
        If the results file, or both score files, are missing (i.e. the
        baseline cell has never been run).
    """
    raw_path = "../data/catboost_raw.json"
    rs_path = "../data/catboost_rs.json"
    res_path = "../data/results.csv"

    # Score to beat: best randomized-search score when one has been saved,
    # otherwise the baseline obtained with default parameters
    score_path = rs_path if os.path.exists(rs_path) else raw_path
    with open(score_path, "r") as load_file:
        best_test_score = json.load(load_file)['f1_macro']

    # Loading results' DataFrame
    df_res = pd.read_csv(res_path, index_col=0)

    # The preprocessed test set is the same for every run: compute it once
    X_test_tr = model[:-1].transform(X_test)

    # Rows are collected here and concatenated once after the loop
    new_rows = []

    # Launching runs
    for i in range(1, n_runs + 1):
        print(f" RUN {i} ".center(105, "="))
        # NOTE(review): random_seed and partition_random_seed are identical for
        # every run; if successive runs keep drawing the same values, consider
        # varying them per run — confirm against CatBoost's sampling behaviour.
        cb = CatBoostClassifier(cat_features=nom_cols,
                                eval_metric="TotalF1",
                                task_type="GPU" if use_gpu else "CPU",
                                verbose=100,
                                random_seed=42)

        search_results = cb.randomized_search(
            param_distributions,
            X=X_train_tr,
            y=y_train,
            cv=5,
            n_iter=3,
            partition_random_seed=42,
            verbose=100
        )
        best_params = search_results['params']

        # F_1 macro on the hold-out set
        y_pred = cb.predict(X_test_tr)
        gs_score = f1_score(y_test, y_pred, average="macro")
        best_params['f1_macro'] = gs_score

        # New best score on test?
        if gs_score > best_test_score:
            print("NEW BEST TEST SCORE!")
            # Update best test score
            best_test_score = gs_score
            # Export on JSON
            with open(rs_path, "w") as dump_file:
                json.dump(best_params, dump_file, indent=4)

        new_rows.append(best_params)

    # Add the new rows to results' DataFrame with a fresh 0..n-1 index
    if new_rows:
        df_res = pd.concat([df_res, pd.DataFrame(new_rows)], ignore_index=True)
    else:
        df_res = df_res.reset_index(drop=True)

    df_res.to_csv(res_path)
    return df_res

## Getting Results

In [32]:
%%time
# Default n_runs=1: one randomized search, whose best result is appended to
# the results file; the returned DataFrame is displayed as the cell output
run_randomized_searchs()

0:	learn: 0.8576040	test: 0.8554085	best: 0.8554085 (0)	total: 1.1s	remaining: 18m 21s
100:	learn: 0.9233192	test: 0.9228901	best: 0.9230175 (99)	total: 12.9s	remaining: 1m 54s
200:	learn: 0.9307644	test: 0.9296875	best: 0.9297171 (197)	total: 24.4s	remaining: 1m 36s
300:	learn: 0.9331562	test: 0.9324210	best: 0.9324210 (300)	total: 36.2s	remaining: 1m 24s
400:	learn: 0.9347079	test: 0.9340966	best: 0.9341188 (399)	total: 47.9s	remaining: 1m 11s
500:	learn: 0.9355442	test: 0.9350939	best: 0.9350939 (500)	total: 59.3s	remaining: 59.1s
600:	learn: 0.9361965	test: 0.9357459	best: 0.9358294 (595)	total: 1m 10s	remaining: 47.1s
700:	learn: 0.9369201	test: 0.9365589	best: 0.9365589 (699)	total: 1m 22s	remaining: 35.3s
800:	learn: 0.9374234	test: 0.9368764	best: 0.9369479 (793)	total: 1m 34s	remaining: 23.4s
900:	learn: 0.9378939	test: 0.9374412	best: 0.9374515 (892)	total: 1m 45s	remaining: 11.6s
999:	learn: 0.9384093	test: 0.9379842	best: 0.9380241 (995)	total: 1m 57s	remaining: 0us
bestTes

Unnamed: 0,iterations,depth,learning_rate,random_strength,bagging_temperature,border_count,l2_leaf_reg,f1_macro
0,1000,6,0.023259,1.0,1.0,128,3,0.90627
1,1000,6,0.799928,0.003941,0.688542,116,9,0.917904
