In [9]:
#Librerias

import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
# Display options: show up to 500 columns and 500 rows when rendering frames.
# The full option names are used on purpose: pandas resolves option keys by
# pattern matching, so an abbreviated key such as 'display.max_column' can
# become ambiguous (and raise) if a similarly named option is ever added.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [None]:
import sklearn

# Proyecto: Sistema de Aprobación de Créditos con IA 

## Fase 1: Definición del Problema y Preparación 

### 1.1. Comprensión del Problema de Negocio

El objetivo es desarrollar un sistema de inteligencia artificial que ayude a un banco a tomar decisiones más informadas sobre la aprobación de préstamos. 

Buscamos predecir la probabilidad de que un solicitante de crédito no pueda cumplir con sus pagos (`TARGET = 1`), frente a aquellos que sí lo harán (`TARGET = 0`).

Este problema es crítico para la banca por varias razones:
-   **Reducción del Riesgo Financiero:** Al predecir con precisión el riesgo de impago, el banco puede evitar otorgar préstamos a individuos con alta probabilidad de default, lo que directamente reduce las pérdidas financieras por créditos incobrables.
-   **Optimización de Recursos:** Un sistema automatizado agiliza el proceso de aprobación, permitiendo a los analistas de crédito enfocarse en casos más complejos o de alto riesgo que requieran un análisis manual más profundo.
-   **Cumplimiento Regulatorio y Transparencia:** Cada vez más, las regulaciones financieras exigen que los bancos puedan explicar por qué se aprueba o se deniega un crédito. Un modelo interpretable es fundamental para la transparencia y la confianza.

**Métricas Clave de Negocio:**
En este contexto, las consecuencias de los errores del modelo tienen costos diferentes:
-   **Falso Positivo (Error Tipo I):** El modelo predice que el solicitante no pagará, pero en realidad sí pagaría. **Costo:** El banco pierde una oportunidad de negocio (ingresos por intereses) y el cliente potencial no obtiene un crédito que podría haber pagado.
-   **Falso Negativo (Error Tipo II):** El modelo predice que el solicitante pagará, pero en realidad no pagará. **Costo:** El banco incurre en una pérdida financiera significativa (el monto del préstamo).

Dado que el costo asociado a un **Falso Negativo (perder dinero por un préstamo impagado)** es considerablemente mayor que el costo de un Falso Positivo (perder una oportunidad de ingreso), mi modelo se centrará en maximizar el **Recall** para la clase de "no pagará" (asegurando que detectamos la mayor cantidad posible de casos de impago) y el **Área Bajo la Curva (AUC)**, que es una métrica robusta para evaluar la capacidad de discriminación del modelo en datasets desbalanceados.

## Fase 2: Análisis Exploratorio de Datos (EDA) y Carga de Datos

Esta fase es el corazón de la comprensión del dataset. Antes de construir cualquier modelo, es imprescindible sumergirse en los datos para entender su estructura, identificar patrones, detectar anomalías y, crucialmente, comprender los desafíos de calidad de datos. En un entorno bancario, esta etapa no es solo académica; es la base para asegurar la confiabilidad del sistema de IA.

### 2.1. Carga y Fusión de Datos: Un Desafío de Integración Real

A diferencia de los datasets simples, en un entorno de producción, la información de un cliente rara vez reside en una única tabla. Los datos de riesgo crediticio pueden estar dispersos en el historial de préstamos anteriores, saldos de tarjetas de crédito, burós externos, etc. El dataset "Home Credit Default Risk" simula este escenario al proporcionar múltiples archivos CSV que deben ser cargados y fusionados.

Comenzaré cargando la tabla principal de solicitudes (`application_train.csv`) y realizando una primera inspección para entender su composición y detectar problemas iniciales de datos, como valores faltantes.

In [12]:
# Folder that holds the Home Credit CSV files. It can be overridden with the
# HOME_CREDIT_DATA_DIR environment variable so the notebook can run on other
# machines; when the variable is unset, the original local folder is used.
DATA_DIR = Path(
    os.environ.get(
        "HOME_CREDIT_DATA_DIR",
        r"C:\Users\DAYRA\Desktop\PROYECTOS BRYAN\7.Sistema de Aprobación de Créditos con IA Interpretable\data",
    )
)

# Main loan-applications table.
df_train = pd.read_csv(DATA_DIR / "application_train.csv")

# Last expression of the cell: rich display of the first rows.
df_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_1
8,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.804,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.079,0.0554,0.0,0.0,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.01,reg oper account,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Overall size of the applications table.
print(f"DIMENSIONES DEL DATAFRAME {len(df_train)}  filas y {len(df_train.columns)} columnas")

# Column / dtype / memory summary (info() writes straight to stdout).
print("Informacion del dataframe")
df_train.info()

# Percentage of missing values per column, keeping only the columns that have
# at least one null and ordering them from most to least incomplete.
print("\n----Porcentaje de valores nulos (por columna mayores a 0%)")
mising_porcentage = (
    df_train.isnull()
    .sum()
    .mul(100)
    .div(len(df_train))
    .loc[lambda pct: pct > 0]
    .sort_values(ascending=False)
)

print(mising_porcentage)

# Every remaining entry is a positive (hence non-NaN) percentage, so the length
# of the series is the number of columns that contain nulls.
print(f"\n\nCantidad de columnas con valores nulos-------->{len(mising_porcentage)}")

DIMENSIONES DEL DATAFRAME 307511  filas y 122 columnas
Informacion del dataframe
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB

----Porcentaje de valores nulos (por columna mayores a 0%)
COMMONAREA_MEDI                 69.872297
COMMONAREA_AVG                  69.872297
COMMONAREA_MODE                 69.872297
NONLIVINGAPARTMENTS_MEDI        69.432963
NONLIVINGAPARTMENTS_MODE        69.432963
NONLIVINGAPARTMENTS_AVG         69.432963
FONDKAPREMONT_MODE              68.386172
LIVINGAPARTMENTS_MODE           68.354953
LIVINGAPARTMENTS_MEDI           68.354953
LIVINGAPARTMENTS_AVG            68.354953
FLOORSMIN_MODE                  67.848630
FLOORSMIN_MEDI                  67.848630
FLOORSMIN_AVG                   67.848630
YEARS_BUILD_MODE                66.497784
YEARS_BUILD_MEDI                66.497784
YEARS_BUILD_AVG      