# Housing Price Prediction
Pipeline optimizado para predicción de precios de viviendas sin visualizaciones

In [1]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel
from scipy.stats import randint
from scipy import stats

## 1. Data Loading

In [2]:
# Cargar datos
csv_path = Path(r"..\datasets\housing\housing.csv")
if not csv_path.exists():
    raise FileNotFoundError("No se encontró 'housing.csv'")
df = pd.read_csv(csv_path)
print(f"Dataset shape: {df.shape}")

Dataset shape: (20640, 10)


## 2. Stratified Split

In [3]:
# Crear categorías de ingreso para split estratificado
df["income_cat"] = pd.cut(df["median_income"],
                          bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                          labels=[1, 2, 3, 4, 5])

# Split estratificado
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in splitter.split(df, df["income_cat"]):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]

# Eliminar columna temporal
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

print(f"Train: {len(strat_train_set)} | Test: {len(strat_test_set)}")

Train: 16512 | Test: 4128


## 3. Feature Engineering

In [4]:
# Separar features y labels
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

print(f"Features shape: {housing.shape}")
print(f"Labels shape: {housing_labels.shape}")

Features shape: (16512, 9)
Labels shape: (16512,)


## 4. Custom Transformers

In [5]:
# ClusterSimilarity Transformer
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(n_clusters=self.n_clusters, random_state=self.random_state, n_init=10)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)
    
    def get_feature_names_out(self, names=None):
        return [f"Cluster_{i}_Similarity" for i in range(self.n_clusters)]

## 5. Preprocessing Pipeline

In [6]:
# Pipeline logarítmico
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)

# Pipeline categórico
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

# Pipeline numérico default
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)

# ClusterSimilarity
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=0.1, random_state=42)

# ColumnTransformer completo
preprocessing = make_column_transformer(
    (log_pipeline, ["total_bedrooms", "total_rooms", "population", "households", "median_income"]),
    (cluster_simil, ["latitude", "longitude"]),
    (cat_pipeline, make_column_selector(dtype_include=object)),
    remainder=default_num_pipeline
)

## 6. Model Training & Evaluation

In [7]:
# Linear Regression
print("=== Linear Regression ===")
lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(housing, housing_labels)
predictions = lin_reg.predict(housing)
rmse = root_mean_squared_error(housing_labels, predictions)
print(f"RMSE: {rmse:.2f}")

=== Linear Regression ===
RMSE: 65272.17


In [8]:
# Decision Tree
print("\n=== Decision Tree ===")
tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))
scores = -cross_val_score(tree_reg, housing, housing_labels, 
                          scoring="neg_mean_squared_error", cv=10)
print(f"CV RMSE: {np.sqrt(scores.mean()):.2f} (+/- {np.sqrt(scores.std()):.2f})")


=== Decision Tree ===
CV RMSE: 65523.57 (+/- 16916.97)


In [9]:
# Random Forest
print("\n=== Random Forest ===")
forest_reg = make_pipeline(preprocessing, RandomForestRegressor(random_state=42))
forest_scores = -cross_val_score(forest_reg, housing, housing_labels,
                                 scoring="neg_mean_squared_error", cv=10)
print(f"CV RMSE: {np.sqrt(forest_scores.mean()):.2f} (+/- {np.sqrt(forest_scores.std()):.2f})")


=== Random Forest ===
CV RMSE: 47493.86 (+/- 8429.02)


## 7. Hyperparameter Tuning

In [10]:
# Grid Search
print("=== Grid Search ===")
full_pipeline = make_pipeline(preprocessing, RandomForestRegressor(random_state=42))

param_grid = [
    {'columntransformer__clustersimilarity__n_clusters': [5, 8, 10],
     'randomforestregressor__max_features': [4, 6, 8]},
    {'columntransformer__clustersimilarity__n_clusters': [10, 15],
     'randomforestregressor__max_features': [6, 8, 10]},
]

grid_search = GridSearchCV(full_pipeline, param_grid, cv=3,
                          scoring="neg_root_mean_squared_error", verbose=1)
grid_search.fit(housing, housing_labels)
print(f"Best params: {grid_search.best_params_}")
print(f"Best RMSE: {-grid_search.best_score_:.2f}")

=== Grid Search ===
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Best params: {'columntransformer__clustersimilarity__n_clusters': 15, 'randomforestregressor__max_features': 6}
Best RMSE: 45043.97


In [11]:
# Randomized Search
print("\n=== Randomized Search ===")
param_distribs = {
    "columntransformer__clustersimilarity__n_clusters": randint(low=3, high=50),
    "randomforestregressor__max_features": randint(low=2, high=20)
}

rnd_search = RandomizedSearchCV(full_pipeline, param_distributions=param_distribs,
                               n_iter=10, cv=3, scoring="neg_mean_squared_error",
                               random_state=42, verbose=1)
rnd_search.fit(housing, housing_labels)
print(f"Best params: {rnd_search.best_params_}")
print(f"Best RMSE: {np.sqrt(-rnd_search.best_score_):.2f}")


=== Randomized Search ===
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best params: {'columntransformer__clustersimilarity__n_clusters': 45, 'randomforestregressor__max_features': 9}
Best RMSE: 44221.87


## 8. Final Model Evaluation

In [12]:
# Feature importances
final_model = rnd_search.best_estimator_
feature_importances = final_model.named_steps["randomforestregressor"].feature_importances_
feature_names = final_model["columntransformer"].get_feature_names_out()

importances_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
}).sort_values('importance', ascending=False)

print("\nTop 10 Features:")
print(importances_df.head(10).to_string(index=False))


Top 10 Features:
                                 feature  importance
               pipeline-1__median_income    0.221006
      pipeline-2__ocean_proximity_INLAND    0.061082
 clustersimilarity__Cluster_3_Similarity    0.041665
clustersimilarity__Cluster_18_Similarity    0.034332
clustersimilarity__Cluster_35_Similarity    0.026109
clustersimilarity__Cluster_22_Similarity    0.025005
clustersimilarity__Cluster_17_Similarity    0.023166
clustersimilarity__Cluster_41_Similarity    0.020659
 clustersimilarity__Cluster_6_Similarity    0.020091
clustersimilarity__Cluster_31_Similarity    0.019491


In [13]:
# Test set evaluation
x_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

final_predictions = final_model.predict(x_test)
final_rmse = root_mean_squared_error(y_test, final_predictions)
print(f"\n=== Test Set Performance ===")
print(f"RMSE: {final_rmse:.2f}")

# Confidence interval
confidence = 0.95
square_errors = (final_predictions - y_test) ** 2
conf_interval = np.sqrt(stats.t.interval(
    confidence, len(square_errors) - 1,
    loc=square_errors.mean(),
    scale=stats.sem(square_errors)
))
print(f"95% Confidence Interval: [{conf_interval[0]:.2f}, {conf_interval[1]:.2f}]")


=== Test Set Performance ===
RMSE: 43411.76
95% Confidence Interval: [41204.57, 45512.03]


## 9. Model Serialization

In [14]:
import joblib
import os

# Guardar modelo
model_path = "final_model_optimized.pkl"
joblib.dump(final_model, model_path)
size_mb = os.path.getsize(model_path) / (1024 * 1024)
print(f"✓ Modelo guardado: {model_path} ({size_mb:.2f} MB)")

# Verificar carga
loaded_model = joblib.load(model_path)
test_predictions = loaded_model.predict(x_test.iloc[:5])
print(f"✓ Predicciones de prueba: {test_predictions.round(-2)}")
print(f"✓ Valores reales: {y_test.iloc[:5].values}")

✓ Modelo guardado: final_model_optimized.pkl (138.45 MB)
✓ Predicciones de prueba: [407700. 207300. 352600. 337300. 281300.]
✓ Valores reales: [397700. 202900. 310000. 314300. 187500.]
