## Import Necessary Libraries

In [3]:
import os
import tarfile
import urllib
import urllib.request

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## Define the Custom Transformer

In [4]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from positional columns of ``X``.

    Two columns are always appended: rooms per household and population per
    household. A third, bedrooms per room, is appended when
    ``add_bedrooms_per_room`` is true.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether to append the bedrooms-per-room ratio.
    rooms_ix, bedrooms_ix, population_ix, household_ix : int
        Positional column indices of the source columns in ``X``.
        The defaults (3, 4, 5, 6) keep the previously hard-coded layout.
    """

    def __init__(self, add_bedrooms_per_room=True, rooms_ix=3, bedrooms_ix=4,
                 population_ix=5, household_ix=6):
        # Store constructor arguments unchanged under the same names so that
        # BaseEstimator.get_params / set_params / clone keep working.
        self.add_bedrooms_per_room = add_bedrooms_per_room
        self.rooms_ix = rooms_ix
        self.bedrooms_ix = bedrooms_ix
        self.population_ix = population_ix
        self.household_ix = household_ix

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` is indexed positionally, so it is converted with ``np.asarray``
        first; this lets a DataFrame be passed as well as a 2-D array.
        No guard is applied to the divisions: zero denominators follow
        NumPy's usual inf/nan behaviour.
        """
        X = np.asarray(X)
        households = X[:, self.household_ix]
        rooms = X[:, self.rooms_ix]

        new_columns = [
            rooms / households,                         # rooms per household
            X[:, self.population_ix] / households,      # population per household
        ]
        if self.add_bedrooms_per_room:
            new_columns.append(X[:, self.bedrooms_ix] / rooms)  # bedrooms per room

        return np.column_stack([X, *new_columns])


## Load and Prepare Data

In [5]:
# Base URL under which the dataset archive is published.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Local directory (relative to the working directory) used for the dataset files.
HOUSING_PATH = os.path.join("datasets", "housing")
# Full URL of the compressed housing archive.
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives the downloaded ``housing.tgz`` and its
        extracted contents. It is created if it does not exist.

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` or ``tarfile`` raise on network
    or archive errors; nothing is caught here.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall trusts the member paths stored in the
        # archive. If the URL can ever point at an untrusted source, pass
        # filter="data" (available on Python versions that support tarfile
        # extraction filters).
        housing_tgz.extractall(path=housing_path)




In [6]:
# Download only when the extracted CSV is missing, so that re-running the
# notebook does not hit the network again. Delete the file to force a refresh.
if not os.path.exists(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()

In [7]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [8]:
# Load housing.csv from the default HOUSING_PATH into a DataFrame.
housing = load_housing_data()

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(housing, random_state=42, test_size=0.2)

# Training target, then the remaining columns as features
housing_labels = train_set["median_house_value"].copy()
housing_features = train_set.drop(columns=["median_house_value"])

## Build and Use the Pipeline

In [10]:
# Categorical columns are one-hot encoded; every other feature column is numeric.
cat_attribs = ["ocean_proximity"]
num_attribs = list(housing_features.drop(columns=cat_attribs))

# Numerical pipeline: fill missing values, append ratio features, then standardize.
# CombinedAttributesAdder selects columns by position, so the order of
# num_attribs must match the indices it uses.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Full pipeline using ColumnTransformer.
# handle_unknown="ignore" encodes a category that was absent from the training
# split as all zeros instead of raising when the pipeline later transforms
# other data (e.g. the test set).
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
])

# Fit the pipeline on the training features and transform them in one step
housing_prepared = full_pipeline.fit_transform(housing_features)


## Train and Evaluate the Model

In [11]:
# Split the held-out set into features and target
housing_test_features = test_set.drop(columns=["median_house_value"])
housing_test_labels = test_set["median_house_value"].copy()

# Fit a linear regression on the prepared training matrix (fit returns the estimator)
lin_reg = LinearRegression().fit(housing_prepared, housing_labels)

# Apply the already-fitted pipeline (transform only, no refit) and predict
housing_test_prepared = full_pipeline.transform(housing_test_features)
predictions = lin_reg.predict(housing_test_prepared)

# Root mean squared error on the test set
mse = mean_squared_error(housing_test_labels, predictions)
rmse = np.sqrt(mse)

print("Root Mean Squared Error:", rmse)


Root Mean Squared Error: 72701.32600762136
