<a href="https://colab.research.google.com/github/AnniaBenitez/Cores_mod2/blob/main/Core_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV

In [20]:
import pandas as pd

# Raw-file location of the loan-prediction training CSV hosted on GitHub
url = "https://raw.githubusercontent.com/shrikant-temburwar/Loan-Prediction-Dataset/master/train.csv"

# Download and parse the remote CSV in a single call
dataset = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows (kept as the last expression so the notebook renders it)
dataset.head(n=5)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [2]:
# Summarize the raw frame: per-column dtype and non-null count, which exposes the columns with missing values
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [3]:
# Work on a copy so the raw `dataset` stays untouched
df = dataset.copy()

# Split the column names by dtype: text (object) columns vs. numeric columns
categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

# Numeric gaps -> per-column median
column_medians = df[numerical_columns].median()
df[numerical_columns] = df[numerical_columns].fillna(column_medians)

# Categorical gaps -> per-column most frequent value (first mode when tied)
column_modes = {name: df[name].mode()[0] for name in categorical_columns}
df = df.fillna(value=column_modes)

# Confirm that no missing values remain
remaining_nulls = df.isnull().sum()
print(remaining_nulls)


Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [4]:
# Count fully duplicated rows in the cleaned frame and report the total
duplicate_row_count = df.duplicated().sum()
print("Duplicados restantes:", duplicate_row_count)

Duplicados restantes: 0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Preprocessing: one-hot encode the categorical predictors, standardize the
# numeric ones, and build the train/test split consumed by the modelling cell.
# NOTE: this cell overwrites `df` (the categorical columns are dropped), so
# re-running it requires first re-running the imputation cell that rebuilds
# `df` from `dataset`.

# 1. One-hot encode the categorical variables
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

# drop='first' removes one level per feature to avoid perfectly collinear dummies
encoder = OneHotEncoder(drop='first')
encoded_features = encoder.fit_transform(df[categorical_columns])

# OneHotEncoder returns a SciPy sparse matrix by default; pd.DataFrame cannot
# expand that into one column per dummy and raises a shape-mismatch ValueError.
# Densify first (the guard keeps this working if the output is already dense).
if hasattr(encoded_features, "toarray"):
    encoded_features = encoded_features.toarray()

# Wrap the encoded matrix in a DataFrame aligned with the original row index
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

# Replace the raw categorical columns with their encoded counterparts
df = pd.concat([df.drop(columns=categorical_columns), encoded_df], axis=1)

# 2. Standardize the numeric features (zero mean, unit variance)
numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# 3. Separate predictors and target
# Loan_ID is an identifier and Loan_Status is the target, so neither is a predictor
X = df.drop(columns=['Loan_ID', 'Loan_Status'])
y = df['Loan_Status']

# Binarize the target: Y -> 1, N -> 0
y = y.map({'Y': 1, 'N': 0})

# 80% train / 20% test; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the resulting shapes
print("Tamaño del conjunto de entrenamiento:", X_train.shape)
print("Tamaño del conjunto de prueba:", X_test.shape)

In [None]:
def _print_metrics(y_true, y_hat, y_score):
    """Print accuracy, precision, recall, F1 and ROC-AUC with four decimals.

    y_hat holds the hard class predictions; y_score holds the positive-class
    probabilities and is used only for ROC-AUC.
    """
    print(f"Exactitud: {accuracy_score(y_true, y_hat):.4f}")
    print(f"Precisión: {precision_score(y_true, y_hat):.4f}")
    print(f"Recall: {recall_score(y_true, y_hat):.4f}")
    print(f"F1-Score: {f1_score(y_true, y_hat):.4f}")
    print(f"ROC-AUC: {roc_auc_score(y_true, y_score):.4f}")


# Part 1: baseline model
# LightGBM with default hyperparameters; only the seed is fixed
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train, y_train)

# Hard predictions plus positive-class probabilities (column 1) for ROC-AUC
y_pred = lgbm.predict(X_test)
y_pred_prob = lgbm.predict_proba(X_test)[:, 1]

print("Métricas del modelo inicial:")
_print_metrics(y_test, y_pred, y_pred_prob)

# Part 2: hyperparameter optimization
# Search space: 3 values for each of 4 hyperparameters
param_grid = dict(
    num_leaves=[15, 31, 50],
    max_depth=[-1, 10, 20],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[50, 100, 150],
)

# Exhaustive search with 5-fold cross-validation, ranked by ROC-AUC, using all cores
grid_search = GridSearchCV(
    LGBMClassifier(random_state=42),
    param_grid,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Estimator with the best cross-validated score found by the search
best_lgbm = grid_search.best_estimator_

# Part 3: evaluate the tuned model on the same held-out test set
y_pred_optimized = best_lgbm.predict(X_test)
y_pred_prob_optimized = best_lgbm.predict_proba(X_test)[:, 1]

print("\nMétricas del modelo optimizado:")
_print_metrics(y_test, y_pred_optimized, y_pred_prob_optimized)

# Per-class precision / recall / F1 breakdown for the tuned model
print("\nReporte de Clasificación (Modelo Optimizado):")
print(classification_report(y_test, y_pred_optimized))

# Show the hyperparameter combination selected by the grid search
print("\nHiperparámetros óptimos:")
print(grid_search.best_params_)
