# Credit Scoring Model

Modelo predictivo de default crediticio usando el dataset "Give Me Some Credit" de Kaggle.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    roc_auc_score, roc_curve, confusion_matrix, 
    classification_report, precision_recall_curve
)

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas / scikit-learn deprecation notices — consider narrowing it.
warnings.filterwarnings('ignore')

# Plot theme, and a three-decimal format for floats shown in pandas output.
plt.style.use('seaborn-v0_8-whitegrid')
pd.options.display.float_format = '{:.3f}'.format

## 1. Carga y exploración de datos

In [None]:
# Load the training data, falling back to a synthetic demo frame when the CSV is absent.
# Download from: https://www.kaggle.com/c/GiveMeSomeCredit/data

try:
    df = pd.read_csv('../data/cs-training.csv', index_col=0)
except FileNotFoundError:
    print("Archivo no encontrado. Generando datos sintéticos para demo...")

    # Synthetic stand-in with the same column names as the Kaggle file.
    # The draws are made in a fixed order after seeding, so the frame is reproducible.
    np.random.seed(42)
    n = 10000

    synthetic = {}
    synthetic['SeriousDlqin2yrs'] = np.random.binomial(1, 0.07, n)
    synthetic['RevolvingUtilizationOfUnsecuredLines'] = np.random.exponential(0.3, n).clip(0, 1)
    synthetic['age'] = np.clip(np.random.normal(52, 15, n).astype(int), 21, 90)
    synthetic['NumberOfTime30-59DaysPastDueNotWorse'] = np.random.poisson(0.3, n)
    synthetic['DebtRatio'] = np.random.exponential(0.35, n).clip(0, 2)
    synthetic['MonthlyIncome'] = np.random.lognormal(8.5, 0.8, n)
    synthetic['NumberOfOpenCreditLinesAndLoans'] = np.random.poisson(8, n)
    synthetic['NumberOfTimes90DaysLate'] = np.random.poisson(0.1, n)
    synthetic['NumberRealEstateLoansOrLines'] = np.random.poisson(1, n)
    synthetic['NumberOfTime60-89DaysPastDueNotWorse'] = np.random.poisson(0.1, n)
    synthetic['NumberOfDependents'] = np.random.poisson(0.8, n)
    df = pd.DataFrame(synthetic)

    # Blank out 2000 incomes and 500 dependent counts at random row positions.
    income_gaps = np.random.choice(n, 2000, replace=False)
    df.loc[income_gaps, 'MonthlyIncome'] = np.nan
    dependent_gaps = np.random.choice(n, 500, replace=False)
    df.loc[dependent_gaps, 'NumberOfDependents'] = np.nan

    print(f"Datos sintéticos generados: {df.shape[0]:,} filas")
else:
    print(f"Dataset cargado: {df.shape[0]:,} filas, {df.shape[1]} columnas")

In [None]:
# Quick look at the first rows of the frame
df.head()

In [None]:
# Dataset overview: class balance of the target column
target = df['SeriousDlqin2yrs']

print("INFORMACIÓN DEL DATASET")
print("=" * 50)
print(f"\nDistribución del target:")
print(target.value_counts(normalize=True))
print(f"\nDefault rate: {df['SeriousDlqin2yrs'].mean():.2%}")

In [None]:
# Missing values per column: absolute count and share of rows (in %),
# showing only the columns that actually have gaps, largest first.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(2)
missing_df = pd.DataFrame({'Missing': missing, 'Porcentaje': missing_pct})

has_gaps = missing_df['Missing'].gt(0)
missing_df.loc[has_gaps].sort_values(by='Missing', ascending=False)

In [None]:
# Descriptive statistics, transposed so each variable is a row
df.describe().T

## 2. Análisis exploratorio

In [None]:
# Histogram of every numeric feature (the target is excluded).
numeric_cols = df.select_dtypes(include=[np.number]).columns.drop('SeriousDlqin2yrs')

# Size the grid from the number of features. A fixed 3x3 grid combined with
# `numeric_cols[:9]` silently drops every feature past the ninth, and the
# frame built above has ten.
n_plot_cols = 3
n_plot_rows = max(1, -(-len(numeric_cols) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(
    n_plot_rows, n_plot_cols,
    figsize=(14, 10 * n_plot_rows / 3),  # same per-row height as the original 3-row figure
    squeeze=False,
)
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    df[col].hist(bins=50, ax=ax, color='#2E86AB', alpha=0.7)
    ax.set_title(col[:25], fontsize=10)  # truncate long column names
    ax.set_ylabel('')

# Hide the panels left over when the feature count is not a multiple of 3.
for ax in axes[len(numeric_cols):]:
    ax.set_visible(False)

plt.suptitle('Distribución de Variables', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Default rate by age bucket.
# Only the two columns used here must be present: a blanket dropna() would also
# discard rows whose only gap is MonthlyIncome / NumberOfDependents and skew the
# per-bucket rates. .copy() makes the new column an assignment on an independent frame.
df_clean = df.dropna(subset=['age', 'SeriousDlqin2yrs']).copy()
df_clean['age_bucket'] = pd.cut(df_clean['age'], bins=[0, 30, 40, 50, 60, 70, 100])

# observed=False keeps empty buckets on the axis and pins the grouping behaviour
# explicitly instead of relying on the pandas default.
default_by_age = (
    df_clean.groupby('age_bucket', observed=False)['SeriousDlqin2yrs']
    .agg(['mean', 'count'])
)
default_by_age.columns = ['Default Rate', 'Count']

fig, ax = plt.subplots(figsize=(10, 5))
default_by_age['Default Rate'].plot(kind='bar', ax=ax, color='#A23B72')
ax.set_title('Default Rate por Rango de Edad', fontsize=14, fontweight='bold')
ax.set_ylabel('Default Rate')
ax.set_xlabel('Rango de Edad')
plt.xticks(rotation=45)

# Annotate each bar with its rate; an empty bucket has no rate (NaN) to print.
for i, v in enumerate(default_by_age['Default Rate']):
    if pd.notna(v):
        ax.text(i, v + 0.005, f'{v:.1%}', ha='center', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Correlation of every feature with the target (pandas default method), ascending
target_corr = df.corr()['SeriousDlqin2yrs']
correlations = target_corr.drop('SeriousDlqin2yrs').sort_values()

# Positive correlations in magenta, everything else in blue
colors = np.where(correlations > 0, '#A23B72', '#2E86AB').tolist()

fig, ax = plt.subplots(figsize=(10, 6))
correlations.plot.barh(ax=ax, color=colors)
ax.axvline(x=0, color='black', linewidth=0.5)
ax.set_title('Correlación con Default', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 3. Preprocesamiento

In [None]:
def preprocess_data(df):
    """Return a cleaned, feature-engineered copy of the credit dataset.

    The input frame is left untouched. Steps, in order:

    1. Fill missing ``MonthlyIncome`` and ``NumberOfDependents`` with the
       column median of the frame passed in.
    2. Clip ``RevolvingUtilizationOfUnsecuredLines`` to [0, 1], ``DebtRatio``
       to [0, 3] and ``age`` to [18, 100].
    3. Clip every column whose name contains ``Past`` or ``Late`` to [0, 10].
    4. Add ``TotalLateTimes`` (sum of the three delinquency counters) and
       ``IncomePerDependent`` (income divided by dependents + 1).

    Args:
        df: DataFrame holding the columns referenced above.

    Returns:
        A new DataFrame with the original columns (imputed and clipped) plus
        ``TotalLateTimes`` and ``IncomePerDependent``.
    """
    df_processed = df.copy()

    # Impute by assignment. `df[col].fillna(..., inplace=True)` acts on an
    # intermediate column object and is not guaranteed to write back into the
    # frame (deprecated in pandas 2.x, a no-op under Copy-on-Write), which
    # would leave NaNs in the model inputs.
    # NOTE(review): the median comes from the whole frame passed in; if this is
    # called before a train/test split, test rows influence the fill value.
    for col in ('MonthlyIncome', 'NumberOfDependents'):
        df_processed[col] = df_processed[col].fillna(df_processed[col].median())

    # Outlier treatment: bound ratios and age to fixed ranges
    df_processed['RevolvingUtilizationOfUnsecuredLines'] = df_processed['RevolvingUtilizationOfUnsecuredLines'].clip(0, 1)
    df_processed['DebtRatio'] = df_processed['DebtRatio'].clip(0, 3)
    df_processed['age'] = df_processed['age'].clip(18, 100)

    # Cap the delinquency counters (columns named *Past* / *Late*) at 10
    mora_cols = [c for c in df_processed.columns if 'Past' in c or 'Late' in c]
    for col in mora_cols:
        df_processed[col] = df_processed[col].clip(0, 10)

    # Basic feature engineering
    df_processed['TotalLateTimes'] = (
        df_processed['NumberOfTime30-59DaysPastDueNotWorse'] +
        df_processed['NumberOfTime60-89DaysPastDueNotWorse'] +
        df_processed['NumberOfTimes90DaysLate']
    )

    # +1 keeps the denominator non-zero for applicants with no dependents
    df_processed['IncomePerDependent'] = df_processed['MonthlyIncome'] / (df_processed['NumberOfDependents'] + 1)

    return df_processed

# Run the preprocessing pipeline on the loaded frame, then report the resulting
# shape and how many missing values are left across all columns.
df_processed = preprocess_data(df)
print(f"Datos preprocesados: {df_processed.shape}")
print(f"Missing values restantes: {df_processed.isnull().sum().sum()}")

In [None]:
# Model inputs and target. The individual delinquency counters are not listed;
# they enter the model only through the engineered TotalLateTimes column.
TARGET = 'SeriousDlqin2yrs'

FEATURES = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberRealEstateLoansOrLines',
    'NumberOfDependents',
    'TotalLateTimes',
    'IncomePerDependent'
]

X, y = df_processed[FEATURES], df_processed[TARGET]

# 80/20 hold-out split, stratified on the target so both parts share its default rate
split_params = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_params)

print(f"Train: {X_train.shape[0]:,} | Test: {X_test.shape[0]:,}")
print(f"Default rate train: {y_train.mean():.2%}")
print(f"Default rate test: {y_test.mean():.2%}")

In [None]:
# Standardise the features with statistics learned from the training split only,
# then apply the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

## 4. Modelado

In [None]:
# Candidate classifiers, keyed by display name and all seeded with the same random_state
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Random Forest'] = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Per-model outputs (fitted estimator, test scores, metrics) are collected here
results = {}

In [None]:
# Fit every candidate model and score it on the hold-out set
for name, model in models.items():
    print(f"\nEntrenando {name}...")

    # Logistic regression is trained on the standardised matrices;
    # the tree-based models use the unscaled features.
    if 'Logistic' in name:
        fit_X, eval_X = X_train_scaled, X_test_scaled
    else:
        fit_X, eval_X = X_train, X_test

    model.fit(fit_X, y_train)
    y_pred_proba = model.predict_proba(eval_X)[:, 1]  # probability of the positive class

    # Discrimination metrics: AUC, Gini = 2*AUC - 1, and KS = max(TPR - FPR)
    auc = roc_auc_score(y_test, y_pred_proba)
    gini = 2 * auc - 1
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    ks = np.max(tpr - fpr)

    results[name] = dict(
        model=model,
        y_pred_proba=y_pred_proba,
        AUC=auc,
        Gini=gini,
        KS=ks,
    )

    print(f"  AUC: {auc:.4f} | Gini: {gini:.4f} | KS: {ks:.4f}")

In [None]:
# Model comparison table: one row per model, one column per metric
metric_names = ['AUC', 'Gini', 'KS']
comparison = pd.DataFrame(
    [[r[metric] for metric in metric_names] for r in results.values()],
    index=list(results),
    columns=metric_names,
)

comparison.sort_values(by='AUC', ascending=False)

## 5. Evaluación detallada

In [None]:
# ROC curves of all models on a single axis, with the chance diagonal for reference
colors = ['#2E86AB', '#A23B72', '#3A7D44']

fig, ax = plt.subplots(figsize=(10, 8))

for color, name in zip(colors, results):
    r = results[name]
    fpr, tpr, _ = roc_curve(y_test, r['y_pred_proba'])
    ax.plot(fpr, tpr, color=color, linewidth=2, label=f"{name} (AUC={r['AUC']:.3f})")

# Dashed diagonal = a classifier with no discriminative power
ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)

ax.set_title('Curvas ROC - Comparativa de Modelos', fontsize=14, fontweight='bold')
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.legend(loc='lower right', fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Feature importance as reported by the fitted Random Forest, smallest first
# so the most important feature ends up at the top of the horizontal bar chart.
rf_model = results['Random Forest']['model']
importance = pd.DataFrame({'Feature': FEATURES, 'Importance': rf_model.feature_importances_})
importance = importance.sort_values(by='Importance')

fig, ax = plt.subplots(figsize=(10, 6))
importance.plot.barh(x='Feature', y='Importance', ax=ax, color='#2E86AB', legend=False)
ax.set_xlabel('Importance')
ax.set_title('Feature Importance - Random Forest', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Score distribution per class for the model with the highest test AUC
best_model_name = comparison['AUC'].idxmax()
y_pred_proba = results[best_model_name]['y_pred_proba']

fig, ax = plt.subplots(figsize=(10, 6))

# One overlaid histogram per actual class: (target value, legend label, colour)
class_styles = [
    (0, 'No Default', '#2E86AB'),
    (1, 'Default', '#A23B72'),
]
for class_value, class_label, class_color in class_styles:
    ax.hist(y_pred_proba[y_test == class_value], bins=50, alpha=0.6,
            label=class_label, color=class_color)

ax.set_title(f'Distribución de Scores - {best_model_name}', fontsize=14, fontweight='bold')
ax.set_xlabel('Probabilidad de Default', fontsize=12)
ax.set_ylabel('Frecuencia', fontsize=12)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Confusion matrix at the KS-optimal threshold.
# The cut-off is the score that maximises TPR - FPR on the ROC curve (the same
# quantity reported as KS for each model). The previous fixed 0.5 was labelled
# "optimal" but was never derived from the scores.
roc_fpr, roc_tpr, roc_thresholds = roc_curve(y_test, y_pred_proba)
# The first ROC point is the (0, 0) sentinel whose threshold lies above every
# score; skip it so the chosen cut-off is always an actual predicted score.
best_idx = int(np.argmax((roc_tpr - roc_fpr)[1:])) + 1
threshold = float(roc_thresholds[best_idx])
y_pred = (y_pred_proba >= threshold).astype(int)

cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
            xticklabels=['No Default', 'Default'],
            yticklabels=['No Default', 'Default'])
ax.set_xlabel('Predicho', fontsize=12)
ax.set_ylabel('Real', fontsize=12)
# The threshold is now data-derived, so round it for display
ax.set_title(f'Matriz de Confusión (threshold={threshold:.3f})', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print(classification_report(y_test, y_pred, target_names=['No Default', 'Default']))

## 6. Resumen

In [None]:
# Final summary: dataset size, best model metrics and the three features with
# the highest importance in the `importance` table.
rule = "=" * 60

print(rule)
print("RESUMEN - CREDIT SCORING MODEL")
print(rule)
print(f"\nDataset: {len(df):,} observaciones")
print(f"Default rate: {df['SeriousDlqin2yrs'].mean():.2%}")
print(f"\nFeatures utilizadas: {len(FEATURES)}")
print(f"\nMEJOR MODELO: {best_model_name}")
print(f"  - AUC: {results[best_model_name]['AUC']:.4f}")
print(f"  - Gini: {results[best_model_name]['Gini']:.4f}")
print(f"  - KS: {results[best_model_name]['KS']:.4f}")
print(f"\nTOP 3 VARIABLES PREDICTIVAS:")
# `importance` is sorted ascending, so the last three rows reversed are the top three
top_features = importance.tail(3).iloc[::-1]
for _, row in top_features.iterrows():
    print(f"  {row['Feature']}: {row['Importance']:.3f}")
print(rule)