In [1]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
import pandas as pd
import numpy as np

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="1c10247d-21e6-46ad-aad4-0415d628ab58",
    resource_group_name="synapse-ogvd-rg",
    workspace_name="ml-workspace-ogvd"
)

print("Cliente Azure ML configurado")
print(f"Workspace: {ml_client.workspace_name}")
print(f"Resource Group: {ml_client.resource_group_name}")

Cliente Azure ML configurado
Workspace: ml-workspace-ogvd
Resource Group: synapse-ogvd-rg


In [2]:
import urllib.request

url = "https://raw.githubusercontent.com/ETSISI-OGVD/azure-machinelearning-labs/main/data/default_of_credit_card_clients.csv"
urllib.request.urlretrieve(url, "credit_data.csv")

df = pd.read_csv("credit_data.csv")

print("Dataset de clientes de tarjetas de crédito cargado:")
print(f"Shape: {df.shape}")
print(f"Columnas: {df.columns.tolist()}")

print("\nPrimeras 5 filas:")
df.head()

Dataset de clientes de tarjetas de crédito cargado:
Shape: (30001, 25)
Columnas: ['Unnamed: 0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'Y']

Primeras 5 filas:


Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0


In [4]:
# Análisis exploratorio (CORREGIDO)
print("INFORMACIÓN DEL DATASET:")
print(df.info())

print("\nCOLUMNAS DISPONIBLES:")
print(df.columns.tolist())

# Identificar la columna objetivo correcta
target_column = 'Y'  # ← CORREGIDO
print(f"\nCOLUMNA OBJETIVO: {target_column}")

print(f"\nDISTRIBUCIÓN DE LA VARIABLE OBJETIVO:")
print(df[target_column].value_counts())

# Distribución porcentual
print(f"\nDistribución porcentual:")
print(df[target_column].value_counts(normalize=True) * 100)

# Verificar valores nulos
print(f"\nValores nulos: {df.isnull().sum().sum()}")

# Mostrar primeras filas para entender mejor los datos
print(f"\nMUESTRA DE DATOS:")
print(df.head())

INFORMACIÓN DEL DATASET:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30001 entries, 0 to 30000
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  30001 non-null  object
 1   X1          30001 non-null  object
 2   X2          30001 non-null  object
 3   X3          30001 non-null  object
 4   X4          30001 non-null  object
 5   X5          30001 non-null  object
 6   X6          30001 non-null  object
 7   X7          30001 non-null  object
 8   X8          30001 non-null  object
 9   X9          30001 non-null  object
 10  X10         30001 non-null  object
 11  X11         30001 non-null  object
 12  X12         30001 non-null  object
 13  X13         30001 non-null  object
 14  X14         30001 non-null  object
 15  X15         30001 non-null  object
 16  X16         30001 non-null  object
 17  X17         30001 non-null  object
 18  X18         30001 non-null  object
 19  X19         30001 non

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

print("LIMPIANDO Y PREPARANDO DATOS...")

if 'Unnamed: 0' in df.columns:
    df = df.drop('Unnamed: 0', axis=1)
    print("Columna índice eliminada")

feature_columns = [col for col in df.columns if col != 'Y']

for col in feature_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df['Y'] = pd.to_numeric(df['Y'], errors='coerce')

print(f"DATOS DESPUÉS DE LIMPIEZA:")
print(f"Shape: {df.shape}")
print(f"Tipos de datos:")
print(df.dtypes.value_counts())

null_counts = df.isnull().sum()
if null_counts.sum() > 0:
    print(f"\nValores nulos encontrados:")
    print(null_counts[null_counts > 0])
    df = df.dropna()
    print(f"Filas con nulos eliminadas. Nuevo shape: {df.shape}")

X = df.drop(['Y'], axis=1)
y = df['Y']

print(f"\nPREPARACIÓN FINAL:")
print(f"Features (X): {X.shape}")
print(f"Target (y): {y.shape}")

print(f"\nDISTRIBUCIÓN DEL TARGET:")
print(y.value_counts())

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"\nDIVISIÓN DE DATOS:")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nDistribución en training:")
print(y_train.value_counts())
print(f"\nDistribución en test:")
print(y_test.value_counts())

LIMPIANDO Y PREPARANDO DATOS...
Columna índice eliminada
DATOS DESPUÉS DE LIMPIEZA:
Shape: (30001, 24)
Tipos de datos:
float64    24
dtype: int64

Valores nulos encontrados:
X1     1
X2     1
X3     1
X4     1
X5     1
X6     1
X7     1
X8     1
X9     1
X10    1
X11    1
X12    1
X13    1
X14    1
X15    1
X16    1
X17    1
X18    1
X19    1
X20    1
X21    1
X22    1
X23    1
Y      1
dtype: int64
Filas con nulos eliminadas. Nuevo shape: (30000, 24)

PREPARACIÓN FINAL:
Features (X): (30000, 23)
Target (y): (30000,)

DISTRIBUCIÓN DEL TARGET:
0.0    23364
1.0     6636
Name: Y, dtype: int64

DIVISIÓN DE DATOS:
Training set: (24000, 23)
Test set: (6000, 23)

Distribución en training:
0.0    18691
1.0     5309
Name: Y, dtype: int64

Distribución en test:
0.0    4673
1.0    1327
Name: Y, dtype: int64


In [6]:
print("ENTRENANDO MODELO RANDOM FOREST...")

model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("MODELO ENTRENADO EXITOSAMENTE")
print(f"Accuracy: {accuracy:.4f}")

print(f"\nCLASSIFICATION REPORT:")
print(classification_report(y_test, y_pred))

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\nTOP 10 CARACTERÍSTICAS MÁS IMPORTANTES:")
print(feature_importance.head(10))

ENTRENANDO MODELO RANDOM FOREST...
MODELO ENTRENADO EXITOSAMENTE
Accuracy: 0.7905

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

         0.0       0.87      0.86      0.86      4673
         1.0       0.52      0.56      0.54      1327

    accuracy                           0.79      6000
   macro avg       0.70      0.71      0.70      6000
weighted avg       0.80      0.79      0.79      6000


TOP 10 CARACTERÍSTICAS MÁS IMPORTANTES:
   feature  importance
5       X6    0.248320
6       X7    0.101913
0       X1    0.052161
7       X8    0.051778
8       X9    0.049973
10     X11    0.043462
11     X12    0.040626
17     X18    0.040555
18     X19    0.038957
9      X10    0.035554


In [8]:
import joblib
from azure.ai.ml.entities import Model
from azure.ai.ml.constants import AssetTypes

print("REGISTRANDO MODELO EN AZURE ML...")

model_name = "credit_default_model.pkl"
joblib.dump(model, model_name)

file_model = Model(
    path=model_name,
    type=AssetTypes.CUSTOM_MODEL,
    name="credit-default-predictor-v1",
    description=f"Random Forest model for credit default prediction - Accuracy: {accuracy:.4f}",
    tags={
        "algorithm": "RandomForest", 
        "accuracy": f"{accuracy:.4f}",
        "framework": "scikit-learn"
    }
)

try:
    registered_model = ml_client.models.create_or_update(file_model)
    print(f"MODELO REGISTRADO EXITOSAMENTE:")
    print(f"Nombre: {registered_model.name}")
    print(f"Versión: {registered_model.version}")
    print(f"ID: {registered_model.id}")
except Exception as e:
    print(f"Error registrando modelo: {e}")

REGISTRANDO MODELO EN AZURE ML...
MODELO REGISTRADO EXITOSAMENTE:
Nombre: credit-default-predictor-v1
Versión: 2
ID: /subscriptions/1c10247d-21e6-46ad-aad4-0415d628ab58/resourceGroups/synapse-ogvd-rg/providers/Microsoft.MachineLearningServices/workspaces/ml-workspace-ogvd/models/credit-default-predictor-v1/versions/2
