In [3]:

import logging
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import joblib
from scipy.stats import randint
import warnings
warnings.filterwarnings("ignore")

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
RND = 42
BASE_DIR = Path.cwd()
DATA_PATH = BASE_DIR / "housing.csv"

# 1) Load
df = pd.read_csv(DATA_PATH)
print("Loaded rows:", len(df))

# 2) Basic cleaning + feature engineering
def clean_and_engineer(df):
    df = df.copy()
    # Avoid division by zero
    df['households'] = df['households'].replace(0, np.nan)
    df['total_rooms'] = df['total_rooms'].replace(0, np.nan)
    # Basic engineered features
    df['rooms_per_household'] = df['total_rooms'] / df['households']
    df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
    df['population_per_household'] = df['population'] / df['households']
    # Clip extreme outliers (robust clipping using IQR)
    num_cols = df.select_dtypes(include=[np.number]).columns
    for c in num_cols:
        q1 = df[c].quantile(0.25)
        q3 = df[c].quantile(0.75)
        iqr = q3 - q1
        if iqr > 0:
            low = q1 - 3 * iqr
            high = q3 + 3 * iqr
            df[c] = df[c].clip(lower=low, upper=high)
    return df

df = clean_and_engineer(df)
display(df.head())

# 3) Prepare X/y and preprocessor
X = df.drop("median_house_value", axis=1)
y = df["median_house_value"]

numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = [c for c in X.columns if X[c].dtype == "object"]

numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", cat_transformer, categorical_cols)
])

# 4) Baseline CV on RandomForest
pipe_rf = Pipeline([("preproc", preprocessor), ("model", RandomForestRegressor(random_state=RND))])
scores = cross_val_score(pipe_rf, X, y, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
rmse_cv = np.sqrt(-scores)
print(f"Baseline RF CV RMSE: mean={rmse_cv.mean():.2f}, std={rmse_cv.std():.2f}")

# 5) Hold-out split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RND)

# 6) RandomizedSearchCV for RandomForest (feintuning)
param_dist = {
    "model__n_estimators": randint(100, 801),
    "model__max_depth": [None, 10, 20, 30, 50],
    "model__min_samples_leaf": [1, 2, 4, 8],
    "model__max_features": ["sqrt", "log2", 0.5]
}
rs = RandomizedSearchCV(pipe_rf, param_distributions=param_dist, n_iter=25,
                        scoring="neg_mean_squared_error", cv=5, n_jobs=-1, random_state=RND, verbose=1)
rs.fit(X_train, y_train)
best_rf = rs.best_estimator_
y_pred = best_rf.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Best RF params:", rs.best_params_)
print("Best RF test RMSE:", round(test_rmse, 2))

# 7) Compare several models with GridSearch (small grids)
models_and_grids = {
    "RandomForest": (RandomForestRegressor(random_state=RND), {"model__n_estimators": [200], "model__max_depth": [None]}),
    "DecisionTree": (DecisionTreeRegressor(random_state=RND), {"model__max_depth": [10, None], "model__min_samples_leaf":[1,5]}),
    "Ridge": (Ridge(), {"model__alpha": [0.1, 1.0, 10.0]}),
    "LinearRegression": (LinearRegression(), {"model__fit_intercept": [True, False]}),
    "SVR": (SVR(), {"model__C": [1, 10], "model__kernel": ["linear", "rbf"]})
}
results = []
for name, (estimator, grid) in models_and_grids.items():
    pipe = Pipeline([("preproc", preprocessor), ("model", estimator)])
    gs = GridSearchCV(pipe, param_grid=grid, scoring="neg_mean_squared_error", cv=5, n_jobs=-1, verbose=0)
    gs.fit(X_train, y_train)
    best = gs.best_estimator_
    y_pred_m = best.predict(X_test)
    test_rmse_m = np.sqrt(mean_squared_error(y_test, y_pred_m))
    results.append({"model": name, "cv_rmse": np.sqrt(-gs.best_score_), "test_rmse": test_rmse_m, "best_params": gs.best_params_})
comp_df = pd.DataFrame(results).sort_values("cv_rmse")
display(comp_df)


# 9) Save best model
out_path = BASE_DIR / "best_model.joblib"
joblib.dump(best_rf, out_path)
print("Model saved to:", out_path)


Loaded rows: 20640


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


Baseline RF CV RMSE: mean=69316.60, std=8441.59
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best RF params: {'model__max_depth': 30, 'model__max_features': 0.5, 'model__min_samples_leaf': 2, 'model__n_estimators': 786}
Best RF test RMSE: 48396.74


Unnamed: 0,model,cv_rmse,test_rmse,best_params
0,RandomForest,50267.292452,49401.463635,"{'model__max_depth': None, 'model__n_estimator..."
1,DecisionTree,60003.804746,60141.848569,"{'model__max_depth': 10, 'model__min_samples_l..."
2,Ridge,64560.497959,66165.126785,{'model__alpha': 1.0}
3,LinearRegression,64561.810414,66159.241694,{'model__fit_intercept': True}
4,SVR,80308.167887,77797.941748,"{'model__C': 10, 'model__kernel': 'linear'}"


Model saved to: /Users/cedricstillecke/Documents/CloudExplain/DataScienceTutorial/HousePricing/best_model.joblib
