## Importación de librerías

In [51]:
import kagglehub
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import xgboost as xgb

## Análisis del dataset

In [60]:
# Download the Kaggle loan-approval dataset; the returned location is joined with the
# CSV filename below to find the file to read.
path = kagglehub.dataset_download("parthpatel2130/realistic-loan-approval-dataset-us-and-canada")
csv_file = os.path.join(path, "Loan_approval_data_2025.csv")

# Remove the identifier column, then discard every row that contains a missing value.
data = (
    pd.read_csv(csv_file)
    .drop(columns=['customer_id'])
    .dropna()
)

In [61]:
# Per-column overview of `data`: name, dtype, and how many values are present / missing.
null_mask = data.isnull()
info_df = pd.DataFrame({
    'column': data.columns,
    'dtype': data.dtypes.values,
    'non_nulls': (~null_mask).sum().values,  # complement of the null mask == notnull()
    'nulls': null_mask.sum().values,
})

info_df

Unnamed: 0,column,dtype,non_nulls,nulls
0,age,int64,50000,0
1,occupation_status,object,50000,0
2,years_employed,float64,50000,0
3,annual_income,int64,50000,0
4,credit_score,int64,50000,0
5,credit_history_years,float64,50000,0
6,savings_assets,int64,50000,0
7,current_debt,int64,50000,0
8,defaults_on_file,int64,50000,0
9,delinquencies_last_2yrs,int64,50000,0


In [62]:
# One-hot encode the categorical columns, dropping the first level of each one.
# NOTE: this cell rebinds `data`, so it is meant to run exactly once after the loading
# cell; on a second run the three source columns no longer exist in `data`.
categorical_cols = ['product_type', 'occupation_status', 'loan_intent']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
data

Unnamed: 0,age,years_employed,annual_income,credit_score,credit_history_years,savings_assets,current_debt,defaults_on_file,delinquencies_last_2yrs,derogatory_marks,...,loan_status,product_type_Line of Credit,product_type_Personal Loan,occupation_status_Self-Employed,occupation_status_Student,loan_intent_Debt Consolidation,loan_intent_Education,loan_intent_Home Improvement,loan_intent_Medical,loan_intent_Personal
0,40,17.2,25579,692,5.3,895,10820,0,0,0,...,1,False,False,False,False,False,False,False,False,False
1,33,7.3,43087,627,3.5,169,16550,0,1,0,...,0,False,True,False,False,False,False,True,False,False
2,42,1.1,20840,689,8.4,17,7852,0,0,0,...,1,False,False,False,True,True,False,False,False,False
3,53,0.5,29147,692,9.8,1480,11603,0,1,0,...,1,False,False,False,True,False,False,False,False,False
4,32,12.5,63657,630,7.2,209,12424,0,0,0,...,1,False,True,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,35,4.3,39449,570,16.3,1127,7576,0,0,0,...,0,False,False,False,False,False,True,False,False,False
49996,34,4.4,20496,672,12.6,1478,6276,1,0,0,...,0,False,False,False,False,False,False,False,False,True
49997,41,4.8,18743,719,10.1,17,10331,0,0,0,...,0,False,False,True,False,False,False,False,False,True
49998,38,0.4,17250,633,1.3,5,7779,0,0,1,...,0,False,True,False,True,False,False,False,False,True


In [63]:
# Separate the target column from the predictors.
y = data['loan_status']
X = data.drop(columns='loan_status')

# Standardize only the int64/float64 predictors; columns of any other dtype are left as-is.
# NOTE: the scaler is fitted on every row of X, i.e. before any train/validation split.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
X

Unnamed: 0,age,years_employed,annual_income,credit_score,credit_history_years,savings_assets,current_debt,defaults_on_file,delinquencies_last_2yrs,derogatory_marks,...,payment_to_income_ratio,product_type_Line of Credit,product_type_Personal Loan,occupation_status_Self-Employed,occupation_status_Student,loan_intent_Debt Consolidation,loan_intent_Education,loan_intent_Home Improvement,loan_intent_Medical,loan_intent_Personal
0,0.453563,1.280229,-0.750345,0.747482,-0.397958,-0.204093,-0.262046,-0.237701,-0.656347,-0.357489,...,-1.455529,False,False,False,False,False,False,False,False,False
1,-0.176018,-0.020345,-0.213787,-0.256675,-0.647698,-0.258959,0.170615,-0.237701,0.527028,-0.357489,...,1.146448,False,True,False,False,False,False,True,False,False
2,0.633444,-0.834846,-0.895579,0.701137,0.032151,-0.270446,-0.486154,-0.237701,-0.656347,-0.357489,...,-1.288075,False,False,False,True,True,False,False,False,False
3,1.622787,-0.913669,-0.640998,0.747482,0.226393,-0.159883,-0.202923,-0.237701,0.527028,-0.357489,...,-1.294515,False,False,False,True,False,False,False,False,False
4,-0.265959,0.662785,0.416612,-0.210330,-0.134343,-0.255936,-0.140931,-0.237701,-0.656347,-0.357489,...,1.854907,False,True,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.003862,-0.414459,-0.325278,-1.137244,1.128234,-0.186561,-0.506994,-0.237701,-0.656347,-0.357489,...,0.824421,False,False,False,False,False,True,False,False,False
49996,-0.086078,-0.401322,-0.906121,0.438511,0.614879,-0.160035,-0.605155,4.206968,-0.656347,-0.357489,...,-1.107740,False,False,False,False,False,False,False,False,True
49997,0.543504,-0.348773,-0.959844,1.164594,0.268017,-0.270446,-0.298970,-0.237701,-0.656347,-0.357489,...,0.553919,False,False,True,False,False,False,False,False,True
49998,0.273683,-0.926806,-1.005600,-0.163984,-0.952937,-0.271353,-0.491666,-0.237701,-0.656347,2.063865,...,-1.333159,False,True,False,True,False,False,False,False,True


## Modelos

In [68]:
# Estimators and cross-validation splitter shared by the cells below.
# A single seed is used for every stochastic component so that a fresh
# "Restart & Run All" reproduces both the fitted models and the fold assignment.
SEED = 42

model_random_forest = RandomForestClassifier(n_estimators=100, random_state=SEED)
# Seed XGBoost explicitly as well, so its results do not depend on the library's default seed.
model_xgboost = xgb.XGBClassifier(eval_metric='logloss', random_state=SEED)
# 10 shuffled folds; the seed fixes which rows land in which fold.
k_folds = KFold(n_splits=10, shuffle=True, random_state=SEED)

In [71]:
# Fit the random forest on the full dataset, then list every feature with its
# importance score, highest first. The fit is its own statement so the side effect
# on `model_random_forest` is visible rather than hidden inside the table construction.
model_random_forest.fit(X, y)

rf_importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance_random_forest': model_random_forest.feature_importances_,
})
variables_importances_random_forest = rf_importance_table.sort_values(
    by='importance_random_forest',
    ascending=False,
)

variables_importances_random_forest

Unnamed: 0,feature,importance_random_forest
3,credit_score,0.204656
12,debt_to_income_ratio,0.169741
11,interest_rate,0.066342
4,credit_history_years,0.064787
8,delinquencies_last_2yrs,0.062069
0,age,0.046416
13,loan_to_income_ratio,0.045991
14,payment_to_income_ratio,0.044099
7,defaults_on_file,0.039897
1,years_employed,0.039123


In [72]:
# Fit XGBoost on the full dataset, then list every feature with its importance
# score, highest first. The fit is its own statement so the side effect on
# `model_xgboost` is visible rather than hidden inside the table construction.
model_xgboost.fit(X, y)

xgb_importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance_xgboost': model_xgboost.feature_importances_,
})
variables_importances_xgboost = xgb_importance_table.sort_values(
    by='importance_xgboost',
    ascending=False,
)

variables_importances_xgboost

Unnamed: 0,feature,importance_xgboost
7,defaults_on_file,0.160836
3,credit_score,0.136706
12,debt_to_income_ratio,0.079668
8,delinquencies_last_2yrs,0.078596
19,loan_intent_Debt Consolidation,0.065321
23,loan_intent_Personal,0.063546
20,loan_intent_Education,0.062345
14,payment_to_income_ratio,0.054532
22,loan_intent_Medical,0.051371
9,derogatory_marks,0.03831


In [73]:
# Accuracy of each model over the folds defined by `k_folds`.
# NOTE: X was standardized on the full dataset in an earlier cell, so the held-out
# folds were already seen by the scaler when these scores are computed.
scores_random_forest = cross_val_score(
    model_random_forest, X, y,
    cv=k_folds,
    scoring='accuracy',
)
scores_xgboost = cross_val_score(
    model_xgboost, X, y,
    cv=k_folds,
    scoring='accuracy',
)

# Report mean and standard deviation across folds, as percentages.
rf_mean_pct = scores_random_forest.mean() * 100
rf_std_pct = scores_random_forest.std() * 100
xgb_mean_pct = scores_xgboost.mean() * 100
xgb_std_pct = scores_xgboost.std() * 100

print("Random Forest Classifier Accuracy: %.2f%% (+/- %.2f%%)" % (rf_mean_pct, rf_std_pct))
print("XGBoost Classifier Accuracy: %.2f%% (+/- %.2f%%)" % (xgb_mean_pct, xgb_std_pct))


Random Forest Classifier Accuracy: 91.17% (+/- 0.42%)
XGBoost Classifier Accuracy: 92.72% (+/- 0.28%)
