In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

# 1. Load the dataset
housing = pd.read_csv("housing.csv")

# 2. Bucket median_income into five categories. The column exists only to
#    stratify the train/test split and is dropped again right after.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

# 3. Stratified split into train/test
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# split() returns *positional* indices, so select rows with .iloc; .loc is
# label-based and would only agree while the frame keeps a default RangeIndex.
strat_train_set = housing.iloc[train_index].drop("income_cat", axis=1)
strat_test_set = housing.iloc[test_index].drop("income_cat", axis=1)

# From here on `housing` refers to the training portion only.
housing = strat_train_set.copy()

# 4. Separate labels and features
housing_labels = housing["median_house_value"].copy()
housing = housing.drop("median_house_value", axis=1)

# 5. Partition the feature columns: one categorical column, everything else
#    is treated as numeric.
cat_attri = ["ocean_proximity"]
num_attri = housing.columns.drop(cat_attri).tolist()

# 6. Per-group preprocessing
#    numeric: fill missing values with the column median, then standardize
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

#    categorical: one-hot encode; categories unseen during fit are ignored
cat_pipeline = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# 7. Route each column group through its own pipeline
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attri),
        ("cat", cat_pipeline, cat_attri),
    ]
)

# 8. Fit the preprocessor on the training features and transform them
housing_prepared = full_pipeline.fit_transform(housing)

# ----------------------
# Linear Regression
# ----------------------
# Fit and score on the same prepared training data, so this first RMSE is an
# in-sample (training) error.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
lin_pred = lin_reg.predict(housing_prepared)
# scikit-learn metrics expect (y_true, y_pred) in that order.
lin_rmse = root_mean_squared_error(housing_labels, lin_pred)
print(f"\nLinear Regression - Training RMSE: {lin_rmse:.2f}")

# 10-fold cross-validation. The scorer returns *negated* RMSE (higher is
# better), so the sign is flipped to obtain positive per-fold RMSE values.
# NOTE: housing_prepared comes from a preprocessor fit on the whole training
# set, so the folds share imputation/scaling statistics.
lin_scores = -cross_val_score(
    lin_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)
print("Linear Regression - Cross-Validation RMSE Stats:")
print(pd.Series(lin_scores).describe())

# ----------------------
# Decision Tree Regressor
# ----------------------
# Seed the estimator so repeated runs give the same tree and the same scores
# (matches the random_state used for the train/test split).
dec_reg = DecisionTreeRegressor(random_state=42)
dec_reg.fit(housing_prepared, housing_labels)
dec_pred = dec_reg.predict(housing_prepared)
# scikit-learn metrics expect (y_true, y_pred) in that order. This is an
# in-sample (training) error: the model is scored on the data it was fit on.
dec_rmse = root_mean_squared_error(housing_labels, dec_pred)
print(f"\nDecision Tree - Training RMSE: {dec_rmse:.2f}")

# 10-fold cross-validation. The scorer returns *negated* RMSE (higher is
# better), so the sign is flipped to obtain positive per-fold RMSE values.
# NOTE: housing_prepared comes from a preprocessor fit on the whole training
# set, so the folds share imputation/scaling statistics.
dec_scores = -cross_val_score(
    dec_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)
print("Decision Tree - Cross-Validation RMSE Stats:")
print(pd.Series(dec_scores).describe())

# ----------------------
# Random Forest Regressor
# ----------------------
# Seed the estimator so repeated runs give the same forest and the same scores
# (matches the random_state used for the train/test split).
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(housing_prepared, housing_labels)
rf_pred = rf_reg.predict(housing_prepared)
# scikit-learn metrics expect (y_true, y_pred) in that order. This is an
# in-sample (training) error: the model is scored on the data it was fit on.
rf_rmse = root_mean_squared_error(housing_labels, rf_pred)
print(f"\nRandom Forest - Training RMSE: {rf_rmse:.2f}")

# 10-fold cross-validation. The scorer returns *negated* RMSE (higher is
# better), so the sign is flipped to obtain positive per-fold RMSE values.
# NOTE: housing_prepared comes from a preprocessor fit on the whole training
# set, so the folds share imputation/scaling statistics.
rf_scores = -cross_val_score(
    rf_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)
print("Random Forest - Cross-Validation RMSE Stats:")
print(pd.Series(rf_scores).describe())



Linear Regression - Training RMSE: 69050.56
Linear Regression - Cross-Validation RMSE Stats:
count       10.000000
mean     69204.322755
std       2500.382157
min      65318.224029
25%      67124.346106
50%      69404.658178
75%      70697.800632
max      73003.752739
dtype: float64

Decision Tree - Training RMSE: 0.00
Decision Tree - Cross-Validation RMSE Stats:
count       10.000000
mean     69020.351887
std       2583.422121
min      64687.477346
25%      67717.984627
50%      68591.874868
75%      69962.584462
max      73612.859687
dtype: float64

Random Forest - Training RMSE: 18393.48
Random Forest - Cross-Validation RMSE Stats:
count       10.000000
mean     49467.780988
std       2086.225011
min      46492.253923
25%      47831.688730
50%      49269.927964
75%      50801.093442
max      53093.515874
dtype: float64
