# Tarea 5

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix

import optuna
import mlflow
import dagshub

# Seed TensorFlow's and NumPy's global random generators with the same fixed
# value so that repeated runs of the notebook draw the same random numbers.
tf.random.set_seed(42)
np.random.seed(42)

In [7]:
# Check which GPUs TensorFlow can see and enable on-demand memory allocation.
gpus = tf.config.list_physical_devices('GPU')
print(f"GPUs disponibles: {len(gpus)}")
try:
    # Memory growth has to be configured identically on every visible GPU,
    # so apply it to all of them instead of only the first one.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU: {gpu.name}")
except RuntimeError as error:
    # set_memory_growth raises RuntimeError when the TensorFlow runtime has
    # already been initialized (e.g. this cell is executed after other cells
    # have used the GPU). Report it instead of aborting the notebook.
    print(f"No se pudo activar el crecimiento de memoria: {error}")

GPUs disponibles: 1
GPU: /physical_device:GPU:0


In [8]:
# DagsHub: initialise the connection to the remote repository with the MLflow
# integration switched on, then show which tracking URI MLflow ended up with.
dagshub.init(
    mlflow=True,
    repo_name='Tarea_5',
    repo_owner='404brainnotfound-ai',
)
print(f"URI de seguimiento: {mlflow.get_tracking_uri()}")

URI de seguimiento: https://dagshub.com/404brainnotfound-ai/Tarea_5.mlflow


In [14]:
# Dataset: read the CSV and discard the 'property_id' column in a single step.
df = pd.read_csv('global_house_purchase_dataset.csv').drop(columns='property_id')

# Report the frame's dimensions and the number of rows per 'decision' value.
print(f"Forma: {df.shape}")
print(f"Clases: {df['decision'].value_counts().to_dict()}")

Forma: (200000, 24)
Clases: {0: 153932, 1: 46068}


In [15]:
# One-hot encode the categorical columns as integer 0/1 indicator columns;
# columns not listed here are carried over unchanged.
columnas_categoricas = [
    'country',
    'city',
    'property_type',
    'furnishing_status',
]
df_codificado = pd.get_dummies(data=df, dtype=int, columns=columnas_categoricas)

# The reported feature count leaves out one column: the 'decision' target.
print(f"Características después de codificar: {len(df_codificado.columns) - 1}")

Características después de codificar: 81


In [16]:
# Separate the target vector ('decision') from the feature matrix (all other columns).
y = df_codificado['decision'].values
X = df_codificado.drop(columns='decision').values

# Normalise the features with MinMaxScaler.
# NOTE(review): the scaler is fit here on every row of X, i.e. before any
# train/test/validation split. If X_escalado is what gets split afterwards,
# the held-out rows contribute to the scaling statistics used for training.
escalador = MinMaxScaler().fit(X)
X_escalado = escalador.transform(X)

print(f"Forma de X: {X_escalado.shape}")
print(f"Forma de y: {y.shape}")

Forma de X: (200000, 81)
Forma de y: (200000,)


In [18]:
# 70/20/10 split (train / test / validation), stratified on the target.
#
# The split is performed on the unscaled X and the scaler is then re-fit on the
# training rows only, so that min/max statistics of test and validation rows
# never influence the training features (avoids data leakage). The
# random_state and stratify arguments are unchanged, so each row lands in the
# same subset as before; only the scaling statistics differ.
X_entreno_crudo, X_temp_crudo, y_entreno, y_temp = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)

# `escalador` now holds the training-only fit; reuse it for any new data.
escalador = MinMaxScaler().fit(X_entreno_crudo)
X_entreno = escalador.transform(X_entreno_crudo)
# Held-out values outside the training min/max will map outside [0, 1].
X_temp = escalador.transform(X_temp_crudo)
del X_entreno_crudo, X_temp_crudo  # unscaled copies are no longer needed

# One third of the 30% hold-out becomes validation (10% of the data);
# the remaining two thirds become the test set (20%).
X_prueba, X_validacion, y_prueba, y_validacion = train_test_split(X_temp, y_temp, test_size=1/3, random_state=42, stratify=y_temp)

print(f"Entreno: {len(X_entreno)} ({len(X_entreno)/len(X)*100:.1f}%)")
print(f"Prueba:  {len(X_prueba)} ({len(X_prueba)/len(X)*100:.1f}%)")
print(f"Validación: {len(X_validacion)} ({len(X_validacion)/len(X)*100:.1f}%)")

Entreno: 140000 (70.0%)
Prueba:  40000 (20.0%)
Validación: 20000 (10.0%)


In [19]:
# Reshape for the 1D CNN: append a trailing axis of length 1 so each split
# goes from (samples, features) to (samples, features, 1).
num_caracteristicas = X_entreno.shape[1]

X_entreno_cnn = np.expand_dims(X_entreno, axis=-1)
X_prueba_cnn = np.expand_dims(X_prueba, axis=-1)
X_validacion_cnn = np.expand_dims(X_validacion, axis=-1)

print(f"Forma de entreno CNN: {X_entreno_cnn.shape}")

Forma de entreno CNN: (140000, 81, 1)
