# Análise de Crédito - Modelo

# Imports

In [35]:
import numpy as np
import typing
import shap
import warnings
import optuna
from utils import *
from time import time
from pathlib import Path
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier



ModuleNotFoundError: No module named 'optuna'

In [2]:
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Parameters

In [22]:
# Column groups that are dropped from the raw frame before modelling.
REMOVE_MISSING = ["loan_term"]
REMOVE_BIAS = ["age", "city", "state", "gender", "education_level", "zip_code", "marital_status"]
REMOVE_ID_COLUMN = ["id"]
REMOVE_UNUSED_COLUMNS = ["informed_purpose", "pre_approved"]

# Input file and name of the target column.
DATASET = Path("../datasets/dataset_cleaned.csv")
TARGET_VARIABLE = "sent_to_analysis"

# Columns fed to the models.
FEATURES = [
    "monthly_income", "collateral_value", "loan_amount", "collateral_debt", "verified_restriction",
    "dishonored_checks", "expired_debts", "banking_debts", "commercial_debts", "protests",
    "informed_restriction", "monthly_payment", "auto_brand", "auto_model", "auto_year",
    "form_completed", "channel", "landing_page", "landing_page_product", "utm_term",
]
CAT_FEATURES = ["auto_brand", "auto_model", "channel", "landing_page", "landing_page_product", "utm_term"]
# The target-encoded columns are the categorical features; a separate list object is
# kept so that editing one does not silently change the other.
TARGET_ENCODER = list(CAT_FEATURES)

# Cross-validation and hyper-parameter search settings.
KFOLD = 5
TOTAL_MODELS_TESTED = 2  # TODO: find a better way to assign this variable
N_TRIALS = 2  # passed to study.optimize(n_trials=...)
TIMEOUT = 100 * 3  # passed to study.optimize(timeout=...)
# Fold splitter shared by the baseline training loop and the Optuna objective.
# Only n_splits is set here; no shuffle or random_state is requested.
K_FOLDS = KFold(n_splits = KFOLD)

In [28]:
# Baseline LightGBM hyper-parameters. optuna_search() merges the best trial's
# parameters into this dict via LGB_PARAMS.update(...).
# NOTE(review): 'verbosity' and 'verbose' look like aliases carrying conflicting
# values, and 'early_stopping_rounds' presumably needs a validation set at fit
# time - confirm both before using this dict to build a model.
LGB_PARAMS = dict(
    objective='binary',
    metric='auc',
    max_depth=-1,
    verbosity=-1,
    n_estimators=100,
    max_bin=1024,
    boosting_type='gbdt',  # alternative: 'dart'
    colsample_bytree=0.5673775386473462,
    eta=0.05446876730023387,
    reg_lambda=10.787843597294561,
    min_child_samples=69,
    random_state=SEED,
    early_stopping_rounds=150,
    verbose=1,
)

# Definitions

In [4]:
# Seed NumPy's global random generator so NumPy-based randomness is reproducible.
np.random.seed(SEED)

In [5]:
class TargetEncoder(BaseEstimator, TransformerMixin):
    """Replace category values with the mean target observed for each category.

    Parameters
    ----------
    cols : list of str, optional
        Columns to encode. When ``None``, every column of the frame handed to
        ``fit`` / ``transform`` is encoded.

    Attributes
    ----------
    target_means_ : dict
        Maps each encoded column name to a Series of per-category target means
        learned by ``fit``.
    """

    def __init__(self, cols=None):
        self.cols = cols
        self.target_means_ = {}

    def _columns(self, X):
        """Return the columns to encode, defaulting to all columns of ``X``."""
        return list(X.columns) if self.cols is None else self.cols

    def fit(self, X, y):
        """Learn the per-category mean of ``y`` for every encoded column.

        ``y`` may be a Series (aligned with ``X`` by index label, as before) or
        any array-like of the same length as ``X`` (aligned by position).
        Returns ``self``.
        """
        X = pd.DataFrame(X)
        if not isinstance(y, pd.Series):
            # Give array-likes X's index; otherwise a fresh RangeIndex would be
            # label-aligned against X's own index during the groupby.
            y = pd.Series(y, index=X.index)
        self.target_means_ = {}
        for col in self._columns(X):
            self.target_means_[col] = y.groupby(X[col]).mean()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the encoded columns mapped to target means.

        Categories that were not seen during ``fit`` become NaN (``Series.map``
        yields NaN for values missing from the mapping). A ``KeyError`` is
        raised for a column that was not fitted.
        """
        X = pd.DataFrame(X)
        X_transformed = X.copy()
        for col in self._columns(X):
            X_transformed[col] = X[col].map(self.target_means_[col])
        return X_transformed

In [6]:
def encoder(df_train: pd.DataFrame, df_test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Target-encode the TARGET_ENCODER columns of a train/test pair.

    The per-category target means are learned from ``df_train`` only and then
    applied to both frames.

    Parameters
    ----------
    df_train : frame containing the TARGET_ENCODER columns and TARGET_VARIABLE.
    df_test : frame containing the TARGET_ENCODER columns.

    Returns
    -------
    (df_train, df_test) : encoded copies; the frames passed in are not modified.
    """
    # Work on copies: callers pass `.iloc` slices, and assigning columns on those
    # would write into (or warn about) the caller's objects.
    df_train = df_train.copy()
    df_test = df_test.copy()

    target_encoder = TargetEncoder(cols=TARGET_ENCODER)
    target_encoder.fit(df_train, df_train[TARGET_VARIABLE])
    df_train[TARGET_ENCODER] = target_encoder.transform(df_train[TARGET_ENCODER])
    df_test[TARGET_ENCODER] = target_encoder.transform(df_test[TARGET_ENCODER])

    return df_train, df_test

In [7]:
def fit_and_validate_model(train: pd.DataFrame, test: pd.DataFrame, model, features_col: typing.List[str], categorical_feature: typing.Optional[typing.List[str]] = None) -> dict[str, typing.Any]:
    """Fit ``model`` on ``train`` and evaluate it on ``test``.

    Parameters
    ----------
    train, test : frames holding ``features_col`` and TARGET_VARIABLE.
    model : estimator exposing ``fit`` and ``predict_proba`` (or at least ``predict``).
    features_col : names of the feature columns.
    categorical_feature : forwarded to ``fit`` only when ``model`` is an LGBMClassifier.

    Returns
    -------
    dict with a single key ``"metrics"`` holding the result of ``calculate_metrics``.
    """
    X_train, y_train = train[features_col], train[TARGET_VARIABLE]
    if isinstance(model, LGBMClassifier):
        model = model.fit(X_train, y_train, categorical_feature=categorical_feature)
    else:
        model = model.fit(X_train, y_train)

    X_test = test[features_col]
    if hasattr(model, "predict_proba"):
        # Ranking metrics (ROC-AUC / PR-AUC) need continuous scores; hard class
        # labels collapse both curves to a single operating point.
        output = model.predict_proba(X_test)[:, 1]
    else:
        # Fallback for estimators without probability estimates.
        output = model.predict(X_test)

    return {"metrics": calculate_metrics(output, test[TARGET_VARIABLE])}

In [8]:
def calculate_metrics(output, real) -> dict[str, float]:
    """Score the predictions ``output`` against the true labels ``real``.

    Returns a dict with two entries:
    ``"pr_auc"`` - area under the precision-recall curve,
    ``"auc"``    - ROC AUC.
    """
    # The thresholds returned alongside the curve are not needed for the area.
    precision, recall, _ = precision_recall_curve(real, output)
    return {
        "pr_auc": auc(recall, precision),
        "auc": roc_auc_score(real, output),
    }

In [29]:
def optuna_search(data):
    """Run an Optuna study for LightGBM and merge the best parameters into LGB_PARAMS.

    The study maximises the value returned by ``objective_lgbm(trial, data)`` and
    stops after N_TRIALS trials or TIMEOUT, whichever comes first. Progress and
    the best trial are printed; LGB_PARAMS is updated in place. Returns ``None``.
    """
    start = time()
    # Optuna study for LightGBM
    study = optuna.create_study(study_name='Study', direction='maximize')
    # NOTE(review): `pbar` is not used inside this function; presumably
    # `tqdm_callback` advances a progress bar - confirm it targets this one.
    with tqdm(total=N_TRIALS, desc="Optimizing", unit="trial") as pbar:
        study.optimize(
            lambda trial: objective_lgbm(trial, data),
            n_trials=N_TRIALS,
            callbacks=[tqdm_callback],
            timeout=TIMEOUT,
        )

    elapsed_minutes = (time() - start) / 60
    print(f'Time spent: {elapsed_minutes:.2f} minutes')

    print('N trials: ', len(study.trials))
    print('Best trial:')
    trial = study.best_trial

    print(f'  Valor: {trial.value}')
    print('  Params: ')
    for key, value in trial.params.items():
        print(f'    {key}: {value}')

    # Keys are the names given to trial.suggest_*.
    LGB_PARAMS.update(trial.params)

In [27]:
def objective_lgbm(trial, data):
    """Optuna objective: mean cross-validated PR-AUC of a LightGBM classifier.

    Parameters
    ----------
    trial : Optuna trial used to sample the hyper-parameters.
    data : frame holding FEATURES and TARGET_VARIABLE; it is split with K_FOLDS.

    Returns
    -------
    float : mean of the per-fold ``"pr_auc"`` values for this trial only.
    """
    import gc  # local import: gc is not imported at the top of this file

    # Reference depth that bounds the num_leaves search range. It must be
    # positive, otherwise 2 ** depth is fractional and not a valid leaf count.
    # The value 6 (32..64 leaves) is a starting point - tune as needed.
    leaves_depth = 6
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': 0,
        'max_bin': 1024,
        'boosting_type': 'gbdt',
        'subsample': trial.suggest_float('subsample', 0.6, 0.8),
        'num_leaves': trial.suggest_int('num_leaves', 2 ** (leaves_depth - 1), 2 ** leaves_depth),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.8),
        'eta': trial.suggest_float('eta', 0.05, 0.1),
        'lambda_l1': trial.suggest_float("reg_alpha", 1e-6, 1e-3),
        'lambda_l2': trial.suggest_float("reg_lambda", 10, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 0.1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10),
        'random_state': SEED,
        # No 'early_stopping_rounds' here: fit_and_validate_model passes no
        # eval_set to fit(), and early stopping needs a validation set.
    }

    # Scores are collected per trial so the result does not depend on, or
    # pollute, the module-level metric lists of the baseline run.
    fold_pr_auc = []
    for i, (train_index, test_index) in enumerate(K_FOLDS.split(data)):

        print(f"Fold {i+1}:")

        df_train, df_test = data.iloc[train_index], data.iloc[test_index]
        df_train, df_test = encoder(df_train, df_test)

        # The sampled parameters must be passed as keyword arguments.
        lgbm = LGBMClassifier(**params)

        artifact_lgbm = fit_and_validate_model(df_train, df_test, lgbm, FEATURES, CAT_FEATURES)
        fold_pr_auc.append(artifact_lgbm["metrics"]["pr_auc"])

        del df_train, df_test, lgbm
        gc.collect()

    return float(np.mean(fold_pr_auc))


# Pre-Processing

In [10]:
# Load the raw frame from DATASET. load_dataset is not defined in this file
# (presumably provided by `from utils import *` - confirm).
df = load_dataset(DATASET)

In [11]:
# Drop the bias, identifier, missing-data and unused column groups that are not used for modelling
df_cleaned = df.drop(columns=REMOVE_BIAS + REMOVE_ID_COLUMN + REMOVE_MISSING + REMOVE_UNUSED_COLUMNS)
df_cleaned.head()

Unnamed: 0,monthly_income,collateral_value,loan_amount,collateral_debt,verified_restriction,dishonored_checks,expired_debts,banking_debts,commercial_debts,protests,...,monthly_payment,auto_brand,auto_model,auto_year,form_completed,sent_to_analysis,channel,landing_page,landing_page_product,utm_term
0,5668.0,24000.0,5000.0,900.0,0.0,0,0,0,0,0,...,161.77,Nissan,LIVINA 1.6 16V Flex Fuel 5p,2011.0,0.0,0.0,search,/emprestimos/solicitar,PersonalLoan,
1,5000.0,14200.0,5000.0,1500.0,0.0,0,0,0,0,0,...,279.56,VW - VolksWagen,Fox City 1.0Mi/ 1.0Mi Total Flex 8V 3p,2004.0,0.0,0.0,direct,/emprestimos,,
2,3000.0,17000.0,8000.0,1060.0,0.0,0,0,0,0,0,...,447.3,Fiat,Palio 1.0 ECONOMY Fire Flex 8V 4p,2010.0,0.0,0.0,affiliates,/emprestimos/garantia-veiculo/solicitar,,
3,7500.0,21000.0,12000.0,0.0,1.0,0,0,0,0,0,...,670.95,GM - Chevrolet,Classic/ Classic LS 1.0 VHC FlexPower 4p,2012.0,0.0,0.0,social,/emprestimos/solicitar,PersonalLoan,
4,3379.0,16500.0,5000.0,0.0,0.0,0,0,0,0,0,...,333.33,GM - Chevrolet,Celta Life/ LS 1.0 MPFI 8V FlexPower 5p,2008.0,0.0,0.0,search,/emprestimos/solicitar,PersonalLoan,


# XGBoost & LightGBM

In [12]:
# Model on the cleaned frame so the dropped bias/ID columns never reach the
# per-fold frames that are pickled further down.
data = df_cleaned.copy(deep=True)

In [14]:
# Per-fold metric accumulators, one list per (model, metric) pair.
xgb_metrics_pr_auc = []
lgbm_metrics_pr_auc = []
xgb_metrics_roc_auc = []
lgbm_metrics_roc_auc = []

# Empty frames that are filled with the collected metrics after the loop.
metrics_pdf_pr = pd.DataFrame()
metrics_pdf_auc = pd.DataFrame()

for i, (train_index, test_index) in enumerate(K_FOLDS.split(data)):

    print(f"Fold {i+1}:")

    df_train, df_test = data.iloc[train_index], data.iloc[test_index]
    # Target-encode once per fold. Encoding the already-encoded frames a second
    # time only re-maps each mean onto itself, so it is not repeated.
    df_train, df_test = encoder(df_train, df_test)

    xgb = XGBClassifier(n_estimators=100, max_depth=10)
    lgbm = LGBMClassifier(n_estimators=100, max_depth=10)

    # Persist the encoded fold data next to the fitted models.
    df_train.to_pickle(f"df_train_fold_{i+1}.pkl")
    df_test.to_pickle(f"df_test_fold_{i+1}.pkl")

    artifact_xgb = fit_and_validate_model(df_train, df_test, xgb, FEATURES)
    # NOTE(review): CAT_FEATURES are numeric target means at this point; confirm
    # that declaring them categorical to LightGBM is intended.
    artifact_lgbm = fit_and_validate_model(df_train, df_test, lgbm, FEATURES, CAT_FEATURES)

    xgb.save_model(f"xgb_model_fold_{i+1}.json")
    lgbm.booster_.save_model(f"lgbm_model_fold_{i+1}.txt")

    xgb_metrics_pr_auc.append(artifact_xgb["metrics"]["pr_auc"])
    lgbm_metrics_pr_auc.append(artifact_lgbm["metrics"]["pr_auc"])
    xgb_metrics_roc_auc.append(artifact_xgb["metrics"]["auc"])
    lgbm_metrics_roc_auc.append(artifact_lgbm["metrics"]["auc"])


Fold 1:
[LightGBM] [Info] Number of positive: 2612, number of negative: 9345
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000335 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1300
[LightGBM] [Info] Number of data points in the train set: 11957, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.218449 -> initscore=-1.274725
[LightGBM] [Info] Start training from score -1.274725
Fold 2:
[LightGBM] [Info] Number of positive: 2475, number of negative: 9482
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000351 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1289
[LightGBM] [Info] Number of data points in the train set: 11957, number of used features: 19
[Light

In [15]:
# Long-format PR-AUC table: the XGBoost folds come first, then the LightGBM folds.
metrics_pdf_pr["value"] = [*xgb_metrics_pr_auc, *lgbm_metrics_pr_auc]
metrics_pdf_pr["metric"] = ["pr_auc"] * (KFOLD * TOTAL_MODELS_TESTED)  # total number of models = 2
metrics_pdf_pr["fold"] = list(range(1, KFOLD + 1)) * TOTAL_MODELS_TESTED
metrics_pdf_pr["model"] = [name for name in ("xgb", "lgbm") for _ in range(KFOLD)]

In [16]:
# Long-format ROC-AUC table: the XGBoost folds come first, then the LightGBM folds.
metrics_pdf_auc["value"] = [*xgb_metrics_roc_auc, *lgbm_metrics_roc_auc]
metrics_pdf_auc["metric"] = ["roc_auc"] * (KFOLD * TOTAL_MODELS_TESTED)  # total number of models = 2
metrics_pdf_auc["fold"] = list(range(1, KFOLD + 1)) * TOTAL_MODELS_TESTED
metrics_pdf_auc["model"] = [name for name in ("xgb", "lgbm") for _ in range(KFOLD)]

In [17]:
# Stack the PR-AUC and ROC-AUC tables row-wise; ignore_index is not set, so each
# part keeps its own row labels.
df_metrics = pd.concat([metrics_pdf_pr, metrics_pdf_auc])

In [18]:
# Write the combined metrics to metrics.csv in the working directory.
# NOTE(review): index=None is presumably treated like index=False (no index column) - confirm.
df_metrics.to_csv("metrics.csv", index=None)

# Optuna 

In [34]:
# Run the hyper-parameter search on the modelling frame; the best parameters are merged into LGB_PARAMS.
optuna_search(data)

NameError: name 'optuna' is not defined