In [0]:
# =========================================
# Datos: airbnb.gold.feature_ml_price
# Objetivo: predecir target_price (regresión)
# =========================================
from pyspark.sql import functions as F

TABLE = "airbnb.gold.feature_ml_price"

# Columns pulled from the feature table: the regression target first, then
# the predictors grouped by theme (capacity, stay limits, reviews, amenities,
# categorical descriptors).
COLS = (
    ["target_price"]
    + ["accommodates", "bedrooms", "beds", "bathrooms"]
    + ["minimum_nights", "maximum_nights"]
    + ["number_of_reviews", "review_scores_rating"]
    + ["num_amenities"]
    + ["host_is_superhost", "property_type", "room_type", "country"]
)

# Rows without a target cannot be used for supervised training, so they are
# dropped on the Spark side before anything is collected.
sdf = spark.table(TABLE).select(*COLS).na.drop(subset=["target_price"])

# Materialise the selected columns as a pandas DataFrame.
df = sdf.toPandas()

# Cell output: (rows, columns) together with a three-row preview.
df.shape, df.head(3)


In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split

target = "target_price"
numeric_features = [
    "accommodates","bedrooms","beds","bathrooms",
    "minimum_nights","maximum_nights",
    "number_of_reviews","review_scores_rating","num_amenities"
]
categorical_features = ["host_is_superhost","property_type","room_type","country"]


def _string_or_nan(values: pd.Series) -> pd.Series:
    """Return `values` as an object column of str, with missing entries as float NaN.

    The pandas "string" dtype marks missing entries with pd.NA. scikit-learn's
    imputer/encoder look for np.nan in object arrays, and pd.NA inside such an
    array can raise "boolean value of NA is ambiguous". Converting to
    object + NaN keeps the categorical imputer usable.
    """
    as_text = values.astype("string")
    return as_text.astype(object).where(as_text.notna(), float("nan"))


# Build the feature matrix from a copy so that `df` is left untouched and this
# cell gives the same result every time it is re-run.
X = df[numeric_features + categorical_features].copy()

# Safe numeric coercion: anything unparseable becomes NaN (imputed later by
# the pipeline's median imputer).
for c in numeric_features:
    X[c] = pd.to_numeric(X[c], errors="coerce")

# Boolean flag -> "True"/"False" text, so the one-hot encoder sees stable
# string categories; missing flags stay missing.
X["host_is_superhost"] = _string_or_nan(X["host_is_superhost"].astype("boolean"))
for c in ["property_type","room_type","country"]:
    X[c] = _string_or_nan(X[c])

# Target as float. Coercion may turn unparseable prices into NaN, and the
# regressor cannot be fitted on NaN targets, so those rows are removed from
# both X and y.
y = pd.to_numeric(df[target], errors="coerce")
has_target = y.notna()
X = X.loc[has_target]
y = y.loc[has_target]

# Reproducible 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
X_train.shape, X_test.shape


In [0]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

# Numeric branch: fill gaps with the column median, then standardise.
# Scaling is not needed by tree ensembles; it is kept so the same
# preprocessing can be reused with linear or boosting models later.
numeric_pre = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode. Levels not seen during fit are ignored (encoded as all zeros)
# instead of raising at prediction time.
categorical_pre = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column group to its branch; any column not listed is dropped.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_pre, numeric_features),
        ("cat", categorical_pre, categorical_features),
    ],
    remainder="drop",
)

# Random forest with unrestricted depth and a fixed seed, using all cores.
forest = RandomForestRegressor(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=42,
)

# Full model: preprocessing followed by the regressor, so the fitted
# transformations travel with the estimator.
model = Pipeline([("prep", preprocessor), ("rf", forest)])


In [0]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import time

# Time only the fit. perf_counter() is a monotonic clock, so the measured
# duration cannot be distorted by system clock adjustments (time.time() can).
t0 = time.perf_counter()
model.fit(X_train, y_train)
train_time = time.perf_counter() - t0

# Predictions on the hold-out set.
y_pred = model.predict(X_test)

# Hold-out metrics: mean absolute error, root mean squared error, and R².
mae  = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2   = r2_score(y_test, y_pred)

print(f"Entrenamiento: {train_time:.1f}s")
print(f"MAE : {mae:,.2f}")
print(f"RMSE: {rmse:,.2f}")
print(f"R²  : {r2:.3f}")


In [0]:
import joblib
import os

# Destination file for the serialised pipeline.
# NOTE(review): the path lives under /Volumes, which looks like a Unity
# Catalog volume rather than DBFS — confirm.
MODEL_PATH = "/Volumes/workspace/airbnb/models/model_airbnb_price.pkl"

# Make sure the target directory exists, then write the fitted pipeline.
model_dir = os.path.dirname(MODEL_PATH)
os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, MODEL_PATH)
print("Guardado en:", MODEL_PATH)

# Round-trip check: load the file back (as another session or cell would) and
# run a tiny prediction to confirm the artefact is usable.
# joblib.load unpickles its input, so only load files from a trusted location.
loaded_model = joblib.load(MODEL_PATH)
_ = loaded_model.predict(X_test.iloc[:3])


In [0]:
import pandas as pd

# A single hypothetical listing, using the same column names the pipeline was
# trained on. Categorical values are given as text.
listing = dict(
    accommodates=4,
    bedrooms=2,
    beds=2,
    bathrooms=1.5,
    minimum_nights=2,
    maximum_nights=30,
    number_of_reviews=85,
    review_scores_rating=95,
    num_amenities=12,
    host_is_superhost="True",
    property_type="Apartment",
    room_type="Entire home/apt",
    country="Peru",
)

# One-row frame -> one prediction.
nuevo = pd.DataFrame([listing])
pred = model.predict(nuevo)[0]
print(f"Precio estimado: ${pred:,.2f}")


In [0]:
import pandas as pd

# Second hypothetical listing, scored with the fitted pipeline.
nuevo = pd.DataFrame([{
    "accommodates": 1,
    "bedrooms": 4,
    "beds": 3,
    "bathrooms": 2,
    "minimum_nights": 2,
    "maximum_nights": 30,
    "number_of_reviews": 10,
    "review_scores_rating": 95,
    "num_amenities": 12,
    # Must be spelled exactly like the training categories, which are produced
    # by bool -> string conversion ("True"/"False"). A lowercase "false" is an
    # unseen level and the encoder (handle_unknown="ignore") would silently
    # encode it as all zeros instead of "not a superhost".
    "host_is_superhost": "False",
    # NOTE(review): "Home" is only used if it matches a property_type seen in
    # training; otherwise it is ignored the same way — confirm against the data.
    "property_type": "Home",
    "room_type": "Entire home/apt",
    "country": "Brazil"
}])

pred = model.predict(nuevo)[0]
print(f"Precio estimado: ${pred:,.2f}")
