In [3]:
import numpy as np
import pandas as pd

# 1. Data Normalization
def normalize(x: np.ndarray) -> np.ndarray:
    """
    Scale an array to [0,1] by (x - min) / (max - min).
    """
    x_min = x.min()
    x_max = x.max()
    return (x - x_min) / (x_max - x_min)

# 2. Data Standardization
def standardize(x: np.ndarray) -> np.ndarray:
    """
    Z‑score standardize an array: (x - mean) / std.
    """
    mu = x.mean()
    sigma = x.std(ddof=1)  # sample std
    return (x - mu) / sigma


# 3. Working with the DataFrame
if __name__ == "__main__":
    # (a) Load the CSV and report number of rows
    df = pd.read_csv("calif_housing_data.csv")
    n_rows = df.shape[0]
    print(f"(a) Number of rows in dataset: {n_rows}")

    # (b) Define the target vector (what we want to predict)
    #     According to your description, this is the 'median_house_value' column
    y = df["median_house_value"]
    print(f"(b) Target vector is 'median_house_value' with shape {y.shape}")

    # (c) Create new feature: avg bedrooms per household
    df["bedrooms_per_household"] = df["total_bedrooms"] / df["households"]
    print("(c) Added feature 'bedrooms_per_household' (avg bedrooms per household).")

    # (d) Subset to three features: median_age, median_income, bedrooms_per_household
    features = df[["housing_median_age", "median_income", "bedrooms_per_household"]].copy()
    print("(d) New feature DataFrame head:")
    print(features.head(), "\n")

    # (e) Standardize these three features
    features_std = features.apply(lambda col: standardize(col.values), axis=0)
    features_std = pd.DataFrame(features_std, columns=features.columns)
    print("(e) First five rows of standardized features:")
    print(features_std.head())


(a) Number of rows in dataset: 20640
(b) Target vector is 'median_house_value' with shape (20640,)
(c) Added feature 'bedrooms_per_household' (avg bedrooms per household).
(d) New feature DataFrame head:
   housing_median_age  median_income  bedrooms_per_household
0                  41         8.3252                1.023810
1                  21         8.3014                0.971880
2                  52         7.2574                1.073446
3                  52         5.6431                1.073059
4                  52         3.8462                1.081081 

(e) First five rows of standardized features:
   housing_median_age  median_income  bedrooms_per_household
0            0.982119       2.344709                     NaN
1           -0.607004       2.332181                     NaN
2            1.856137       1.782656                     NaN
3            1.856137       0.932945                     NaN
4            1.856137      -0.012881                     NaN
