# Carregando base de dados

In [39]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import KFold, train_test_split, ParameterGrid

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

In [40]:
base_dados = pd.read_csv('./base_dados_preprocessada.csv', low_memory=False)

In [41]:
base_dados

Unnamed: 0,BidsPortfolioManager,BidsApi,BidsManual,NewCreditCustomer,VerificationType,Age,Gender,AppliedAmount,Amount,Interest,...,PrincipalOverdueBySchedule,PrincipalPaymentsMade,InterestAndPenaltyPaymentsMade,PrincipalBalance,InterestAndPenaltyBalance,NoOfPreviousLoansBeforeLoan,AmountOfPreviousLoansBeforeLoan,PreviousRepaymentsBeforeLoan,PreviousEarlyRepaymentsCountBeforeLoan,Status
0,970,1150,5.0,0,4.0,53,1.0,2125.0,2125.0,20.97,...,1155.84,969.16,1187.91,1155.84,433.60,1.0,500.0,590.95,0.0,1
1,1295,0,1705.0,0,1.0,50,1.0,3000.0,3000.0,17.12,...,2436.41,563.59,360.07,2436.41,2291.82,1.0,1800.0,445.26,1.0,1
2,2700,565,5835.0,1,4.0,44,0.0,10630.0,9100.0,13.67,...,0.00,6537.00,1708.47,0.00,0.00,0.0,0.0,0.00,0.0,0
3,1115,0,385.0,1,3.0,42,0.0,1500.0,1500.0,40.40,...,1035.27,464.73,355.92,1035.27,2833.83,0.0,0.0,0.00,1.0,1
4,305,0,785.0,1,4.0,34,1.0,1595.0,1090.0,68.39,...,1089.99,0.01,0.00,1089.99,4215.37,0.0,0.0,0.00,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121456,2000,0,0.0,1,1.0,38,1.0,2000.0,2000.0,33.79,...,2000.00,0.00,0.00,2000.00,4284.86,0.0,0.0,0.00,0.0,1
121457,6215,70,1155.0,0,4.0,37,0.0,7440.0,7440.0,24.52,...,0.00,7440.00,2664.67,0.00,0.00,2.0,2500.0,986.78,0.0,0
121458,430,0,1055.0,0,1.0,37,0.0,1595.0,1485.0,64.51,...,0.00,1485.00,186.41,0.00,0.00,3.0,2425.0,2749.65,1.0,0
121459,3000,0,0.0,1,1.0,58,1.0,3000.0,3000.0,21.62,...,0.00,3000.00,1974.70,0.00,0.00,0.0,0.0,0.00,0.0,0


In [42]:
y = base_dados['Status']
X = base_dados.drop('Status', axis=1)

# Funções Auxiliares

In [43]:
def analise_metricas(y_predict, y_test):
    precision = precision_score(y_test, y_predict)
    recall = recall_score(y_test, y_predict)
    accuracy = accuracy_score(y_test, y_predict)
    f1 = f1_score(y_test, y_predict)
    

    return {
                    "Acurácia": accuracy,
                    "F1 Score" : f1,
                    "Precision": precision,
                    "Recall": recall,
    }

In [44]:
def treinar_modelo(modelo, X_train, y_train, X_test):
    modelo.fit(X_train, y_train)
    y_predict = modelo.predict(X_test)

    return y_predict

### K-Fold & Gridsearch

In [45]:
#definindo os hiperparâmetros
param_DT = {'max_depth': range(1, 11)}
param_LR = {'C': [0.1, 1, 10], 'penalty': ['l2'], 'solver': ['lbfgs', 'liblinear']}
param_RF = {'n_estimators': [100], 'max_depth': [None, 10], 'criterion': ['gini']}
param_XGB = {
    'n_estimators': [50, 100],  # Número de árvores
    'max_depth': [3, 5, 7],          # Profundidade máxima das árvores
    'learning_rate': [0.1, 0.2],  # Taxa de aprendizado
}

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

#definindo as listas para armazenar os valores das métricas de cada fold

metricas_DT = []
metricas_LR = []
metricas_RF = []
metricas_XGB = []

for train_index, test_index in kfold.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    X_train_divided, X_val, y_train_divided, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    X_train_divided = scaler.fit_transform(X_train_divided)
    X_val = scaler.transform(X_val)

    #definindo as listas para armazenar os valores das acurácias pra ser encntrado os melhores hiperparâmetros de cada algoritmo

    accs_val_DT = []
    par_DT = []

    accs_val_LR = []
    par_LR = []

    accs_val_RF = []
    par_RF = []

    accs_val_XGB =[]
    par_XGB = []

    #encontrando melhores hiperparametros

#DT
    for params in ParameterGrid(param_DT):
        dt_model = DecisionTreeClassifier(criterion="gini", max_depth=params['max_depth'], random_state=42)
        dt_model.fit(X_train_divided, y_train_divided)
        y_predict = dt_model.predict(X_val)
        acc = accuracy_score(y_val, y_predict)
        accs_val_DT.append(acc)
        par_DT.append(params)

    print(par_DT[accs_val_DT.index(max(accs_val_DT))])

    dt_best = DecisionTreeClassifier(criterion="gini", max_depth=par_DT[accs_val_DT.index(max(accs_val_DT))]['max_depth'], random_state=42)
    y_predict_DT = treinar_modelo(dt_best, X_train, y_train, X_test)

    metricas_DT.append(analise_metricas(y_predict_DT, y_test))

    #LR
    for params in ParameterGrid(param_LR):
        lr_model = LogisticRegression(C=params['C'], penalty=params['penalty'], solver=params['solver'], random_state=42)
        lr_model.fit(X_train_divided, y_train_divided)
        y_predict = lr_model.predict(X_val)
        acc = accuracy_score(y_val, y_predict)
        accs_val_LR.append(acc)
        par_LR.append(params)

    print(par_LR[accs_val_LR.index(max(accs_val_LR))])

    lr_best = LogisticRegression(C=par_LR[accs_val_LR.index(max(accs_val_LR))]['C'],
                            penalty=par_LR[accs_val_LR.index(max(accs_val_LR))]['penalty'],
                            solver=par_LR[accs_val_LR.index(max(accs_val_LR))]['solver'], 
                            random_state=42)

    y_predict_LR = treinar_modelo(lr_best, X_train, y_train, X_test)
    metricas_LR.append(analise_metricas(y_predict_LR, y_test))

    #RF
    for params in ParameterGrid(param_RF):
        rf_model = RandomForestClassifier(n_estimators=params['n_estimators'], max_depth=params['max_depth'], criterion=params['criterion'], random_state=42)
        rf_model.fit(X_train_divided, y_train_divided)
        y_predict = rf_model.predict(X_val)
        acc = accuracy_score(y_val, y_predict)
        accs_val_RF.append(acc)
        par_RF.append(params)

    print(par_RF[accs_val_RF.index(max(accs_val_RF))])
    
    rf_best = RandomForestClassifier(n_estimators=par_RF[accs_val_RF.index(max(accs_val_RF))]['n_estimators'], 
                                    max_depth=par_RF[accs_val_RF.index(max(accs_val_RF))]['max_depth'], 
                                    criterion=par_RF[accs_val_RF.index(max(accs_val_RF))]['criterion'], random_state=42)
    
    y_predict_RF = treinar_modelo(rf_best, X_train, y_train, X_test)
    metricas_RF.append(analise_metricas(y_predict_RF, y_test))

    # XGBoost
    for params in ParameterGrid(param_XGB):
        xgb_model = XGBClassifier(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            learning_rate=params['learning_rate'],
            random_state=42)
        xgb_model.fit(X_train_divided, y_train_divided)
        y_predict = xgb_model.predict(X_val)
        acc = accuracy_score(y_val, y_predict)
        accs_val_XGB.append(acc)
        par_XGB.append(params)

    best_params_XGB = par_XGB[accs_val_XGB.index(max(accs_val_XGB))]
    print("Melhores hiperparâmetros do XGBoost:", best_params_XGB)

    xgb_best = XGBClassifier(
        n_estimators=best_params_XGB['n_estimators'],
        max_depth=best_params_XGB['max_depth'],
        learning_rate=best_params_XGB['learning_rate'],
        random_state=42)

    y_predict_XGB = treinar_modelo(xgb_best, X_train, y_train, X_test)
    metricas_XGB.append(analise_metricas(y_predict_XGB, y_test))

{'max_depth': 4}


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
{'criterion': 'gini', 'max_depth': None, 'n_estimators': 100}
Melhores hiperparâmetros do XGBoost: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100}
{'max_depth': 4}
{'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
{'criterion': 'gini', 'max_depth': None, 'n_estimators': 100}
Melhores hiperparâmetros do XGBoost: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100}
{'max_depth': 5}
{'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 100}
Melhores hiperparâmetros do XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
{'max_depth': 4}
{'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
{'criterion': 'gini', 'max_depth': None, 'n_estimators': 100}
Melhores hiperparâmetros do XGBoost: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
{'max_depth': 6}
{'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 10

In [46]:
def exibir_metricas_finais(metricas, nome_modelo):

    metricas_finais = {
        "Acurácia Média": np.mean([m["Acurácia"] for m in metricas]),
        "F1 Score Médio": np.mean([m["F1 Score"] for m in metricas]),
        "Precision Média": np.mean([m["Precision"] for m in metricas]),
        "Recall Médio": np.mean([m["Recall"] for m in metricas]),
    }
    
    print(f"\nMédias das métricas para {nome_modelo}:")
    for chave, valor in metricas_finais.items():
        print(f"{chave}: {valor:.8f}")

In [47]:
exibir_metricas_finais(metricas_DT, "Decision Tree")


Médias das métricas para Decision Tree:
Acurácia Média: 0.99962951
F1 Score Médio: 0.99967189
Precision Média: 0.99965015
Recall Médio: 0.99969363


In [48]:
exibir_metricas_finais(metricas_LR, "Regressão Logística")


Médias das métricas para Regressão Logística:
Acurácia Média: 0.99158578
F1 Score Médio: 0.99249590
Precision Média: 0.99951184
Recall Médio: 0.98557804


In [49]:
exibir_metricas_finais(metricas_RF, "Random Florest")


Médias das métricas para Random Florest:
Acurácia Média: 0.99964598
F1 Score Médio: 0.99968635
Precision Média: 0.99988331
Recall Médio: 0.99948949


In [50]:
exibir_metricas_finais(metricas_XGB, "XGBoost")


Médias das métricas para XGBoost:
Acurácia Média: 0.99920139
F1 Score Médio: 0.99929193
Precision Média: 0.99962043
Recall Médio: 0.99896377
