# Imports

In [1]:
import json
from catboost import CatBoostClassifier, Pool
import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint
from scipy.stats import loguniform, uniform
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [2]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    """Meant to encode time data with cycles (days of week, month...)"""
    def __init__(self, column_name, cycle_length):
        self.column_name = column_name
        self.cycle_length = cycle_length

    def fit(self, X, y=None):
        # No fitting needed, implemented for compatibility with sklearn's API
        return self

    def transform(self, X, y=None):
        # Apply cyclical encoding directly without needing to fit
        X = X.copy()
        values = X[self.column_name]
        # Create the cyclical features
        X[f'{self.column_name}_sin'] = np.sin(2 * np.pi * values / self.cycle_length)
        X[f'{self.column_name}_cos'] = np.cos(2 * np.pi * values / self.cycle_length)
        # Drop the original column
        X.drop(columns=[self.column_name], inplace=True)
        return X

    def get_feature_names_out(self, input_features=None):
        # Generate names for the output features
        return np.array(
          [f'{self.column_name}_sin', f'{self.column_name}_cos'], dtype=object
        )

# Data Loading

In [5]:
%%time
df = pd.read_csv("../cleaned_dataset.csv", index_col=0)

CPU times: total: 2.08 s
Wall time: 2.12 s


# Droping Useless Columns

In [6]:
df = df.drop(columns=['LoanNr_ChkDgt', 'Name'])

# Separating Features and Target

In [7]:
X = df.copy()
y = X.pop("MIS_Status")

# Hold-Out

In [8]:
# Stratify with y by default
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.05,
                                                    stratify=y,
                                                    random_state=42)

# Model

## Preprocessing

In [9]:
# Splitting columns
num_cols = ["Term", "NoEmp", "CreateJob", "RetainedJob",
            "GrAppv", "SBA_Appv"]

cyc_cols = ["ApprovalMonth", "ApprovalDoW"]

nom_cols = ["Bank", "BankState", "City", "Franchise", "LowDoc", "NAICS",
            "NewExist", "Recession", "RevLineCr", "SameState", "State", "UrbanRural"]

In [10]:
std_scl = StandardScaler()
cyc_dow = CyclicalEncoder("ApprovalDoW", 7)
cyc_mth = CyclicalEncoder("ApprovalMonth", 12)

In [11]:
preproc = ColumnTransformer(
    transformers = [
        ("num", std_scl, num_cols),
        ("cyc_mth", cyc_mth, ["ApprovalMonth"]),
        ("cyc_dow", cyc_dow, ["ApprovalDoW"]),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False
)
preproc.set_output(transform="pandas")

In [12]:
X_train_tr = preproc.fit_transform(X_train)
# X_train_tr

In [13]:
nom_indexes = [idx for idx, col in enumerate(X_train_tr.dtypes)
               if col == "object"]
nom_indexes

[10, 11, 12, 13, 15, 19, 20, 21]

## Estimator

In [14]:
train_pool = Pool(data=X_train_tr,
                  label=y_train, 
                  cat_features=nom_indexes,
                  feature_names=X_train_tr.columns.to_list())

cb = CatBoostClassifier(cat_features=nom_cols, eval_metric="TotalF1")

CatBoostError: Invalid type for cat_feature[non-default value idx=62762,feature_idx=11]=nan : cat_features must be integer or string, real number values and NaN values should be converted to string.

In [None]:
model = make_pipeline(preproc, cb)
model

## Training & Score

In [None]:
%%time
# Not performed for memory gain
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
raw_score = f1_score(y_test, y_pred, average="macro")
raw_score

Learning rate set to 0.183776
0:	learn: 0.8824272	total: 588ms	remaining: 9m 47s
1:	learn: 0.8860970	total: 1.15s	remaining: 9m 33s
2:	learn: 0.9001189	total: 1.6s	remaining: 8m 51s
3:	learn: 0.9037058	total: 2.02s	remaining: 8m 22s
4:	learn: 0.9075891	total: 2.59s	remaining: 8m 35s
5:	learn: 0.9118968	total: 3.08s	remaining: 8m 31s
6:	learn: 0.9148758	total: 3.55s	remaining: 8m 24s
7:	learn: 0.9156581	total: 4.08s	remaining: 8m 25s
8:	learn: 0.9162997	total: 4.59s	remaining: 8m 25s
9:	learn: 0.9175598	total: 5.14s	remaining: 8m 28s
10:	learn: 0.9186897	total: 5.55s	remaining: 8m 18s
11:	learn: 0.9183625	total: 6.07s	remaining: 8m 19s
12:	learn: 0.9204624	total: 6.5s	remaining: 8m 13s
13:	learn: 0.9220410	total: 6.91s	remaining: 8m 6s
14:	learn: 0.9227746	total: 7.32s	remaining: 8m
15:	learn: 0.9246433	total: 7.73s	remaining: 7m 55s
16:	learn: 0.9247712	total: 8.27s	remaining: 7m 57s
17:	learn: 0.9247765	total: 8.81s	remaining: 8m
18:	learn: 0.9259303	total: 9.25s	remaining: 7m 57s
19:

0.922854841138847

In [None]:
cb_all_params = model[-1].get_all_params()

In [None]:
params = [
    'iterations',
    'depth',
    'learning_rate',
    'random_strength',
    'bagging_temperature',
    'border_count',
    'l2_leaf_reg',
    # 'scale_pos_weight'
]

In [None]:
cb_raw_params = {param: cb_all_params.get(param, "Not found")
                for param in params}
cb_raw_params["f1_macro"] = raw_score
cb_raw_params

{'iterations': 1000,
 'depth': 6,
 'learning_rate': 0.18377600610256195,
 'random_strength': 1,
 'bagging_temperature': 'Not found',
 'border_count': 254,
 'l2_leaf_reg': 3,
 'f1_macro': 0.922854841138847}

In [None]:
with open("../data/catboost_raw_params.json", "w") as dump_file:
    json.dump(cb_raw_params, dump_file, indent=4)

# Randomized Search

## Config & Launch

In [None]:
param_distributions = {
    'iterations': [1_000],
    'depth': np.arange(1, 9),
    'learning_rate': loguniform(0.01, 1.0),
    'random_strength': loguniform(1e-9, 10),
    'bagging_temperature': uniform(0, 1),
    'border_count': np.arange(1, 256),
    'l2_leaf_reg': np.arange(2, 31),
    # 'scale_pos_weight': np.arange(0.01, 10, 0.01),
}

In [None]:
train_pool = Pool(data=X_train_tr,
                  label=y_train, 
                  cat_features=nom_indexes,
                  feature_names=X_train_tr.columns.to_list())

cb = CatBoostClassifier(cat_features=nom_cols,
                        eval_metric="TotalF1")

In [None]:
%%time
search_results = cb.randomized_search(
    param_distributions,
    X=X_train_tr,
    y=y_train,
    cv=5,
    n_iter=3,
    partition_random_seed=42,
    verbose=True
)

0:	learn: 0.8576040	test: 0.8554085	best: 0.8554085 (0)	total: 190ms	remaining: 3m 9s
1:	learn: 0.8640882	test: 0.8629686	best: 0.8629686 (1)	total: 376ms	remaining: 3m 7s
2:	learn: 0.8576040	test: 0.8554085	best: 0.8629686 (1)	total: 558ms	remaining: 3m 5s
3:	learn: 0.8649928	test: 0.8629468	best: 0.8629686 (1)	total: 678ms	remaining: 2m 48s
4:	learn: 0.8625630	test: 0.8603352	best: 0.8629686 (1)	total: 850ms	remaining: 2m 49s
5:	learn: 0.8657564	test: 0.8635552	best: 0.8635552 (5)	total: 978ms	remaining: 2m 42s
6:	learn: 0.8621769	test: 0.8599682	best: 0.8635552 (5)	total: 1.13s	remaining: 2m 40s
7:	learn: 0.8657291	test: 0.8635561	best: 0.8635561 (7)	total: 1.28s	remaining: 2m 39s
8:	learn: 0.8658458	test: 0.8640509	best: 0.8640509 (8)	total: 1.42s	remaining: 2m 36s
9:	learn: 0.8654635	test: 0.8635433	best: 0.8640509 (8)	total: 1.53s	remaining: 2m 31s
10:	learn: 0.8657870	test: 0.8637344	best: 0.8640509 (8)	total: 1.69s	remaining: 2m 31s
11:	learn: 0.8655791	test: 0.8636879	best: 0.

In [None]:
# bestTest = 0.949204703
# bestIteration = 990

best_params = search_results['params']
best_params

In [None]:
# The model is already updated with the best parameters
X_test_tr = model[:-1].transform(X_test)
y_pred = cb.predict(X_test_tr)
gs_score = f1_score(y_test, y_pred, average="macro")

## Getting Results

In [None]:
cb_gs_params = {k, v for k, v in best_params.items()
                if k in params}
cb_gs_params['f1_macro'] = gs_score

In [None]:
with open("../data/catboost_rs_params.json", "w") as dump_file:
    json.dump(cb_gs_params, dump_file, indent=4)

In [None]:
# AJOUT THIBAUT

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

y_test_binary = np.where(y_test == 'P I F', 1, 0)
y_pred_binary = np.where(y_pred == 'P I F', 1, 0)

# Calculer le taux de faux positifs, le taux de vrais positifs et les seuils à partir des prédictions binaires
fpr, tpr, thresholds = roc_curve(y_test_binary, y_pred_proba)

# Calculer l'AUC (Area Under the Curve)
roc_auc = auc(fpr, tpr)

# Tracer la courbe ROC
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='lightseagreen', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()