In [None]:
# =======================================================
# 1. CARGA DE LIBRERÍAS
# =======================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import joblib

# =======================================================
# 2. LOAD DATA
# =======================================================
# Both CSVs are read through relative paths, so the notebook has to be run
# from a working directory that contains the "data/" folder.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# Print the (rows, columns) shape of each frame as a quick sanity check.
for label, frame in (("Datos de entrenamiento:", train), ("Datos de prueba:", test)):
    print(label, frame.shape)

# =======================================================
# 3. FEATURE SELECTION AND MISSING-VALUE HANDLING
# =======================================================
# The "Id" column is removed from both frames.
cols_to_drop = ["Id"]
train = train.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)

# Partition the training columns by dtype: numeric vs. everything else.
# This runs on the full training frame, so num_cols also picks up the
# "SalePrice" target if it is numeric.
num_cols = train.select_dtypes(include=[np.number]).columns
cat_cols = train.select_dtypes(exclude=[np.number]).columns

# Numeric gaps are filled with the per-column median.
# NOTE(review): medians and modes are computed on the whole training frame,
# i.e. before the train/validation split done later in this cell, so the
# validation rows contribute to the imputation values.
numeric_medians = train[num_cols].median()
train[num_cols] = train[num_cols].fillna(numeric_medians)

# Non-numeric gaps are filled with each column's most frequent value (mode).
most_frequent = {col: train[col].mode()[0] for col in cat_cols}
train = train.fillna(value=most_frequent)

# One-hot encode the non-numeric columns, dropping the first level of each.
# NOTE(review): `test` only had "Id" removed above; it receives no imputation
# or encoding in this cell.
train = pd.get_dummies(train, drop_first=True)


# =======================================================
# 4. TRAIN / VALIDATION SPLIT
# =======================================================
# Target vector first, then every remaining column as the feature matrix.
y = train["SalePrice"]
X = train.drop(columns="SalePrice")

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# =======================================================
# 5. SCALING
# =======================================================
# The scaler is fitted on the training split only and then applied to both
# splits, so the validation rows do not influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# =======================================================
# 6. MODEL TRAINING
# =======================================================
# Base estimator; the seed is fixed so repeated runs build the same forest.
rf = RandomForestRegressor(random_state=42)

# 2 x 3 x 2 = 12 hyper-parameter combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
)

# 3-fold cross-validated search scored by R2 (12 candidates x 3 folds = 36 fits),
# run on all available cores.
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train_scaled, y_train)

# Keep the estimator with the winning parameter combination for evaluation and saving.
best_model = grid.best_estimator_
print("Mejores parámetros:", grid.best_params_)

# =======================================================
# 7. MODEL EVALUATION
# =======================================================
# Predict on the scaled validation split, which was not used to fit the model.
y_pred = best_model.predict(X_val_scaled)

# RMSE is derived from MSE so it is expressed in the same units as the target.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
r2 = r2_score(y_true=y_val, y_pred=y_pred)

# Report the metrics rounded to two decimals.
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.2f}")

# =======================================================
# 8. SAVE THE MODEL AND THE SCALER
# =======================================================
import os

# joblib.dump does not create missing parent directories. Without this call
# the cell raises FileNotFoundError when no "model/" folder exists yet in the
# working directory. exist_ok=True makes re-running the cell safe.
os.makedirs("model", exist_ok=True)

# Persist the tuned estimator together with the scaler fitted on the training
# split: both are needed to reproduce predictions later.
joblib.dump(best_model, "model/house_price_model.pkl")
joblib.dump(scaler, "model/scaler.pkl")

print("✅ Modelo y escalador guardados correctamente.")


Datos de entrenamiento: (1460, 81)
Datos de prueba: (1459, 80)
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Mejores parámetros: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
MAE: 17980.80
RMSE: 29219.95
R2: 0.89


FileNotFoundError: [Errno 2] No such file or directory: 'model/house_price_model.pkl'

In [None]:
import os

# Make sure the output folder for the pickled artifacts exists;
# exist_ok=True means an already existing folder is not an error.
os.makedirs(name="model", exist_ok=True)

In [None]:
# =======================================================
# 8. SAVE THE MODEL AND THE SCALER
# =======================================================
import os

# Create the destination folder first so the dumps below cannot fail on a
# missing directory; an existing folder is left untouched.
os.makedirs("model", exist_ok=True)

# Write each artifact to its file inside "model/".
for artifact, destination in (
    (best_model, "model/house_price_model.pkl"),
    (scaler, "model/scaler.pkl"),
):
    joblib.dump(artifact, destination)

print("✅ Modelo y escalador guardados correctamente en la carpeta 'model/'.")

✅ Modelo y escalador guardados correctamente en la carpeta 'model/'.


In [None]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import joblib

# Self-contained retraining of a reduced model: it reloads the raw training
# CSV and rebinds `train`, `X`, `y` and `scaler`, replacing the objects built
# by the earlier cells.
train = pd.read_csv("data/train.csv")

# Only the columns that will be used by the Flask app.
features = ["GrLivArea", "OverallQual", "GarageCars", "TotalBsmtSF", "YearBuilt"]
X = train[features]
y = train["SalePrice"]

# Scaler fitted on all rows of the five selected columns.
# NOTE(review): no hold-out split is made here (train_test_split is imported
# but unused), so this cell produces no validation metric for the reduced model.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train the model on the scaled features; the seed keeps the forest reproducible.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_scaled, y)

# joblib.dump does not create missing parent directories, so make sure the
# folder exists instead of relying on an earlier cell having created it.
os.makedirs("model", exist_ok=True)

# Save the model and the scaler. These are the same paths used by the earlier
# save cell, so the previously saved full-feature artifacts are overwritten.
joblib.dump(model, "model/house_price_model.pkl")
joblib.dump(scaler, "model/scaler.pkl")

print("✅ Modelo entrenado con 5 variables guardado correctamente.")


✅ Modelo entrenado con 5 variables guardado correctamente.


In [None]:
# Re-save the five-feature model and its scaler to the same paths as before.
# The second call stays as the cell's final bare expression so the notebook
# still displays its return value.
joblib.dump(value=model, filename="model/house_price_model.pkl")
joblib.dump(value=scaler, filename="model/scaler.pkl")

['model/scaler.pkl']

In [None]:
import joblib

# New objects: each dump writes to the same path as the earlier save cells,
# so the existing file is replaced automatically.
model_path = "model/house_price_model.pkl"
scaler_path = "model/scaler.pkl"

joblib.dump(model, model_path)
# Left as the final bare expression so the notebook still displays its return value.
joblib.dump(scaler, scaler_path)


['model/scaler.pkl']