In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm
import joblib

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
#from sklearn.ensemble import 

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

In [3]:
data = pd.read_csv("../data/processed/loan_data_prepared.csv")
data.head()

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status,person_gender_male,...,person_education_Master,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,previous_loan_defaults_on_file_Yes
0,22.0,71948.0,0,35000.0,16.02,0.49,3.0,561,1,False,...,True,False,False,True,False,False,False,True,False,False
1,21.0,12282.0,0,1000.0,11.14,0.08,2.0,504,0,False,...,False,False,True,False,True,False,False,False,False,True
2,25.0,12438.0,3,5500.0,12.87,0.44,3.0,635,1,False,...,False,False,False,False,False,False,True,False,False,False
3,23.0,79753.0,0,35000.0,15.23,0.44,2.0,675,1,False,...,False,False,False,True,False,False,True,False,False,False
4,24.0,66135.0,1,35000.0,14.27,0.53,4.0,586,1,True,...,True,False,False,True,False,False,True,False,False,False


### Selección de Variables

In [4]:
X = data.drop(columns=['loan_status'])
y = data['loan_status']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify=y)

In [6]:
print("Tamaño de X_train: ", X_train.shape)
print("Tamaño de X_test: ", X_test.shape)
print("Tamaño de y_train: ", y_train.shape)
print("Tamaño de y_test: ", y_test.shape)

Tamaño de X_train:  (36000, 22)
Tamaño de X_test:  (9000, 22)
Tamaño de y_train:  (36000,)
Tamaño de y_test:  (9000,)


In [7]:
print("Valores únicos de y_train antes de la transformación:")
print(y_train.unique())

Valores únicos de y_train antes de la transformación:
[0 1]


### Codificar Variables 

In [8]:
le = LabelEncoder()

y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

In [9]:
print("Clases codificadas: ", le.classes_)

Clases codificadas:  [0 1]


In [10]:
print("Valores únicos de y_train después de la codificación:")
print(np.unique(y_train_encoded))
print(np.unique(y_test_encoded))

Valores únicos de y_train después de la codificación:
[0 1]
[0 1]


### Ecalado y Ajuste

In [11]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# <span style="color: orange"> Modelos </span>

### Regresión Logística

In [12]:
# Hiper-parámetros
param_grid_logit = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}

In [13]:
# Regresion Logistica
logit = LogisticRegression()
grid_logit = GridSearchCV(estimator=logit, param_grid=param_grid_logit, cv=5, scoring='roc_auc')
grid_logit.fit(X_train_scaled, y_train_encoded)

In [14]:
# Predicción y Evaluación
logit_best = grid_logit.best_estimator_
#logit_predicts = logit_best.predict(X_test_scaled)
#roc_auc = roc_auc_score(y_test_encoded, logit_predicts)
logit_predicts = logit_best.predict(X_test_scaled)
roc_auc = roc_auc_score(y_test_encoded, logit_best.predict_proba(X_test_scaled)[:, 1])


print("Mejores hiperparámetros: ", grid_logit.best_params_)
print("Roc_Auc Score: ", roc_auc)

Mejores hiperparámetros:  {'C': 10, 'solver': 'liblinear'}
Roc_Auc Score:  0.9562063571428572


In [15]:
logit_results = pd.DataFrame(grid_logit.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
logit_results['algoritmo'] = 'Regresión Logística'

logit_results = logit_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
print("Resultados: ")
logit_results

Resultados: 


Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score
6,Regresión Logística,1,"{'C': 10, 'solver': 'liblinear'}",0.953383,0.002684
8,Regresión Logística,2,"{'C': 100, 'solver': 'liblinear'}",0.953376,0.002692
9,Regresión Logística,3,"{'C': 100, 'solver': 'lbfgs'}",0.953369,0.002681
7,Regresión Logística,4,"{'C': 10, 'solver': 'lbfgs'}",0.953366,0.002682
4,Regresión Logística,5,"{'C': 1, 'solver': 'liblinear'}",0.953346,0.002657
5,Regresión Logística,6,"{'C': 1, 'solver': 'lbfgs'}",0.953342,0.002637
2,Regresión Logística,7,"{'C': 0.1, 'solver': 'liblinear'}",0.952278,0.002669
3,Regresión Logística,8,"{'C': 0.1, 'solver': 'lbfgs'}",0.952183,0.002666
0,Regresión Logística,9,"{'C': 0.01, 'solver': 'liblinear'}",0.943799,0.002662
1,Regresión Logística,10,"{'C': 0.01, 'solver': 'lbfgs'}",0.942985,0.002751


In [16]:
print(classification_report(y_test_encoded, logit_predicts))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      7000
           1       0.79      0.75      0.77      2000

    accuracy                           0.90      9000
   macro avg       0.86      0.85      0.85      9000
weighted avg       0.90      0.90      0.90      9000



### Decesion Tree

In [17]:
#Hiper-parámetros
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [18]:
# Decision Tree
dt = DecisionTreeClassifier(random_state=2025)
grid_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=5, scoring='roc_auc')
grid_dt.fit(X_train_scaled, y_train_encoded)

In [19]:
# Predicción y Evaluación
dt_best = grid_dt.best_estimator_
dt_predicts = dt_best.predict(X_test_scaled)
roc_auc_dt = roc_auc_score(y_test_encoded, dt_predicts)

print("Mejores hiperparámetros: ", grid_dt.best_params_)
print("ROC AUC Score: ", roc_auc_dt)

Mejores hiperparámetros:  {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
ROC AUC Score:  0.8516785714285715


In [20]:
dt_results = pd.DataFrame(grid_dt.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
dt_results['algoritmo'] = 'Decision Tree'

dt_results = dt_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
print("Resultados: ")
dt_results

Resultados: 


Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score
60,Decision Tree,1,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.961293,0.002329
61,Decision Tree,1,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.961293,0.002329
62,Decision Tree,3,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.961272,0.002477
17,Decision Tree,4,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",0.961134,0.001960
59,Decision Tree,5,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.960883,0.001667
...,...,...,...,...,...
81,Decision Tree,86,"{'criterion': 'entropy', 'max_depth': 50, 'min...",0.858786,0.005721
45,Decision Tree,86,"{'criterion': 'entropy', 'max_depth': None, 'm...",0.858786,0.005721
27,Decision Tree,88,"{'criterion': 'gini', 'max_depth': 30, 'min_sa...",0.855014,0.002576
36,Decision Tree,89,"{'criterion': 'gini', 'max_depth': 50, 'min_sa...",0.854509,0.001105


In [21]:
print(classification_report(y_test_encoded, dt_predicts))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95      7000
           1       0.90      0.73      0.80      2000

    accuracy                           0.92      9000
   macro avg       0.91      0.85      0.88      9000
weighted avg       0.92      0.92      0.92      9000



### Random Forest

In [22]:
# Hiper-parámetros
param_grid_rf = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, 30],
    'criterion': ['gini', 'entropy']
}

In [23]:
# Random Forest
rf = RandomForestClassifier()
grid_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, scoring='roc_auc')
grid_rf.fit(X_train_scaled, y_train_encoded)

In [24]:
# Predicción y Evaluación
rf_best = grid_rf.best_estimator_
rf_predicts = rf_best.predict(X_test_scaled)
roc_auc_rf = roc_auc_score(y_test_encoded, rf_predicts)

print("Mejores hiperparámetros: ", grid_rf.best_params_)
print("Roc_Auc Score: ", roc_auc_rf)

Mejores hiperparámetros:  {'criterion': 'entropy', 'max_depth': 30, 'n_estimators': 500}
Roc_Auc Score:  0.8745714285714286


In [25]:
rf_results = pd.DataFrame(grid_rf.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
rf_results['algoritmo'] = 'Random Forest'

rf_results = rf_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
print("Resultados: ")
print(rf_results)

Resultados: 
        algoritmo  rank_test_score  \
17  Random Forest                1   
14  Random Forest                2   
16  Random Forest                3   
8   Random Forest                4   
12  Random Forest                5   
13  Random Forest                6   
5   Random Forest                7   
7   Random Forest                8   
4   Random Forest                9   
15  Random Forest               10   
6   Random Forest               11   
3   Random Forest               12   
10  Random Forest               13   
11  Random Forest               14   
9   Random Forest               15   
2   Random Forest               16   
1   Random Forest               17   
0   Random Forest               18   

                                               params  mean_test_score  \
17  {'criterion': 'entropy', 'max_depth': 30, 'n_e...         0.974131   
14  {'criterion': 'entropy', 'max_depth': 20, 'n_e...         0.974065   
16  {'criterion': 'entropy', 'max_depth': 

In [26]:
print(classification_report(y_test_encoded, rf_predicts))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96      7000
           1       0.90      0.77      0.83      2000

    accuracy                           0.93      9000
   macro avg       0.92      0.87      0.89      9000
weighted avg       0.93      0.93      0.93      9000



### AdaBoost

In [27]:
# Hiper-parámetros
param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1]
}

In [28]:
# AdaBoost
ada = AdaBoostClassifier()
grid_ada = GridSearchCV(estimator=ada, param_grid=param_grid_ada, cv=5, scoring='roc_auc')
grid_ada.fit(X_train_scaled, y_train_encoded)



In [29]:
# Predicciones y Evaluación
ada_best = grid_ada.best_estimator_
ada_predicts = ada_best.predict(X_test_scaled)
roc_auc = roc_auc_score(y_test_encoded, ada_predicts)

print("Mejores hiperparámetros: ", grid_ada.best_params_)
print("Roc_Auc Score: ", roc_auc)

Mejores hiperparámetros:  {'learning_rate': 1, 'n_estimators': 200}
Roc_Auc Score:  0.8736071428571429


In [30]:
ada_results = pd.DataFrame(grid_ada.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
ada_results['algoritmo'] = 'AdaBoost'

ada_results = ada_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
print("Resultados: ")
print(ada_results)

Resultados: 
  algoritmo  rank_test_score                                        params  \
8  AdaBoost                1     {'learning_rate': 1, 'n_estimators': 200}   
7  AdaBoost                2     {'learning_rate': 1, 'n_estimators': 100}   
5  AdaBoost                3   {'learning_rate': 0.1, 'n_estimators': 200}   
6  AdaBoost                4      {'learning_rate': 1, 'n_estimators': 50}   
4  AdaBoost                5   {'learning_rate': 0.1, 'n_estimators': 100}   
3  AdaBoost                6    {'learning_rate': 0.1, 'n_estimators': 50}   
2  AdaBoost                7  {'learning_rate': 0.01, 'n_estimators': 200}   
1  AdaBoost                8  {'learning_rate': 0.01, 'n_estimators': 100}   
0  AdaBoost                9   {'learning_rate': 0.01, 'n_estimators': 50}   

   mean_test_score  std_test_score  
8         0.965947        0.001417  
7         0.965057        0.001430  
5         0.964106        0.001465  
6         0.964028        0.001558  
4         0.961810   

In [31]:
print(classification_report(y_test_encoded, ada_predicts))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      7000
           1       0.85      0.79      0.82      2000

    accuracy                           0.92      9000
   macro avg       0.90      0.87      0.88      9000
weighted avg       0.92      0.92      0.92      9000



### XGBoost

In [32]:
# Hiper-parámetros 
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7]
}

In [33]:
# XGBoost
xgb_model = xgb.XGBClassifier(eval_metric='logloss')
grid_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=5, scoring='roc_auc')
grid_xgb.fit(X_train_scaled, y_train_encoded)

In [34]:
# Predicciones y evaluación
xgb_best = grid_xgb.best_estimator_
xgb_predicts = xgb_best.predict(X_test_scaled)
roc_auc = roc_auc_score(y_test_encoded, xgb_predicts)

print("Mejores hiperparámetros: ", grid_xgb.best_params_)
print("Roc_Auc Score: ", roc_auc)

Mejores hiperparámetros:  {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Roc_Auc Score:  0.8890714285714286


In [35]:
xgb_results = pd.DataFrame(grid_xgb.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
xgb_results['algoritmo'] = 'XGBoost'

xgb_results = xgb_results[['algoritmo', 'rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
print("Resultados: ")
print(xgb_results)

Resultados: 
   algoritmo  rank_test_score  \
17   XGBoost                1   
23   XGBoost                2   
22   XGBoost                3   
20   XGBoost                4   
14   XGBoost                5   
26   XGBoost                6   
24   XGBoost                7   
25   XGBoost                8   
16   XGBoost                9   
21   XGBoost               10   
19   XGBoost               11   
13   XGBoost               12   
15   XGBoost               13   
11   XGBoost               14   
18   XGBoost               15   
12   XGBoost               16   
10   XGBoost               17   
8    XGBoost               18   
7    XGBoost               19   
9    XGBoost               20   
6    XGBoost               21   
5    XGBoost               22   
4    XGBoost               23   
3    XGBoost               24   
2    XGBoost               25   
1    XGBoost               26   
0    XGBoost               27   

                                               params  mean_te

In [36]:
print(classification_report(y_test_encoded, xgb_predicts))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      7000
           1       0.90      0.81      0.85      2000

    accuracy                           0.94      9000
   macro avg       0.92      0.89      0.90      9000
weighted avg       0.93      0.94      0.93      9000



# <span style="color: orange"> Resultados </span>

In [37]:
resultados = pd.concat([
    logit_results,
    dt_results,
    rf_results,
    ada_results,
    xgb_results
], ignore_index=True)

In [38]:
resultados.head()

Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score
0,Regresión Logística,1,"{'C': 10, 'solver': 'liblinear'}",0.953383,0.002684
1,Regresión Logística,2,"{'C': 100, 'solver': 'liblinear'}",0.953376,0.002692
2,Regresión Logística,3,"{'C': 100, 'solver': 'lbfgs'}",0.953369,0.002681
3,Regresión Logística,4,"{'C': 10, 'solver': 'lbfgs'}",0.953366,0.002682
4,Regresión Logística,5,"{'C': 1, 'solver': 'liblinear'}",0.953346,0.002657


In [39]:
resultados_sorted = resultados.sort_values(by='mean_test_score', ascending=False)
resultados_sorted.head()

Unnamed: 0,algoritmo,rank_test_score,params,mean_test_score,std_test_score
127,XGBoost,1,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.978021,0.001075
128,XGBoost,2,"{'learning_rate': 0.3, 'max_depth': 5, 'n_esti...",0.977648,0.001005
129,XGBoost,3,"{'learning_rate': 0.3, 'max_depth': 5, 'n_esti...",0.977591,0.001205
130,XGBoost,4,"{'learning_rate': 0.3, 'max_depth': 3, 'n_esti...",0.977292,0.000908
131,XGBoost,5,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.977124,0.001333


In [40]:
mejor_modelo = resultados_sorted.iloc[0]
mejor_algoritmo = mejor_modelo['algoritmo']
mejores_parametros = mejor_modelo['params']

print(f"Mejor modelo: {mejor_algoritmo}")
print(f"Mejores parámetros: {mejores_parametros}")

Mejor modelo: XGBoost
Mejores parámetros: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}


# <span style="color: orange"> Modelo Final </span>

In [47]:
# Crear el modelo con los mejores parámetros
if mejor_algoritmo == 'Regresión Logística':
    modelo_final = LogisticRegression(**mejores_parametros)
elif mejor_algoritmo == 'Decision Tree':
    modelo_final = DecisionTreeClassifier(**mejores_parametros, random_state=2025)
elif mejor_algoritmo == 'Random Forest':
    modelo_final = RandomForestClassifier(**mejores_parametros, random_state=2025)
elif mejor_algoritmo == 'AdaBoost':
    modelo_final = AdaBoostClassifier(**mejores_parametros, random_state=2025)
elif mejor_algoritmo == 'XGBoost':
    modelo_final = xgb.XGBClassifier(**mejores_parametros, eval_metric='logloss', random_state=2025)
else:
    raise ValueError("No se reconoce el algoritmo del mejor modelo.")

# Entrenar el modelo con los mejores parámetros
modelo_final.fit(X_train_scaled, y_train_encoded)

In [48]:
# Guardar modelo en un archivo pkl
joblib.dump(modelo_final, '../models/modelo_mejor.pkl')
print(f"Modelo guardado: {mejor_algoritmo} con parámetros {mejores_parametros}")

Modelo guardado: XGBoost con parámetros {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
