In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error


In [4]:
# Debug aid: print the kernel's current working directory. Relative paths in
# this notebook (e.g. Path("results")) are resolved against this directory.
# NOTE(review): this import sits outside the main import cell — consider moving it there.
import os
print(os.getcwd())

C:\Users\as972\Amar\Stanford Artificial Intelligence Professional Program\CS229_Practice_Projects\1_bias-variance-california


In [10]:
# Output locations, relative to the working directory: figures go to results/plots.
Results_dir = Path("results")
Fig_dir = Results_dir.joinpath("plots")

# Create the whole directory chain up front; a no-op when it already exists.
Fig_dir.mkdir(exist_ok=True, parents=True)

In [None]:
## Load data (as pandas DataFrame) + (optional) engineered features

# as_frame=True asks scikit-learn for pandas objects; the combined table
# (features plus target column) is then read from the `.frame` attribute.
data = fetch_california_housing(as_frame=True)
# `df` is the table that the rest of this cell splits into X / y.
df = data.frame
## df.head()   # uncomment to preview the first rows

def add_engineered_features(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with three derived columns appended.

    The input frame is not modified. It must contain the columns
    ``AveRooms``, ``AveBedrms``, ``AveOccup``, ``Longitude`` and ``Latitude``.

    Added columns
    -------------
    Rooms_per_person
        ``AveRooms`` divided by ``AveOccup``.
    Bedroom_ratio
        ``AveBedrms`` divided by ``AveRooms``.
    Close_to_beach_flag
        1 where ``Longitude > -122`` and ``33 <= Latitude <= 38``, else 0.
        NOTE(review): this is a plain longitude/latitude box test; whether it
        really captures "close to the beach" is not established here — confirm.
    """
    # Denominators are floored at a tiny positive value so a zero (or negative)
    # divisor cannot produce inf / a sign flip.
    floor = 1e-6
    safe_occupancy = np.maximum(X["AveOccup"], floor)
    safe_rooms = np.maximum(X["AveRooms"], floor)

    inside_box = X["Longitude"].gt(-122) & X["Latitude"].between(33, 38)

    # assign() works on a copy, so the caller's frame is left untouched.
    return X.assign(
        Rooms_per_person=X["AveRooms"] / safe_occupancy,
        Bedroom_ratio=X["AveBedrms"] / safe_rooms,
        Close_to_beach_flag=inside_box.astype(int),
    )

# Toggle: when True, the derived columns from add_engineered_features are used too.
USE_ENG_FEATURES = False

# "MedHouseVal" is the regression target; every other column is a predictor.
X_raw = df.drop(columns="MedHouseVal")
y = df["MedHouseVal"]

if USE_ENG_FEATURES:
    X = add_engineered_features(X_raw)
else:
    X = X_raw

# 70/30 split with a fixed seed so reruns produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# Last expression of the cell: shows the shape of each split.
X_train.shape, X_test.shape


    


In [None]:
### 2 -> Model Builders + Evaluators

def linear_poly(degree: int):
    """Build an unregularized polynomial regression pipeline.

    Steps, in order: polynomial expansion up to ``degree`` (no constant
    column), scaling without mean-centering, ordinary least squares.

    Returns the unfitted pipeline.
    """
    expand = PolynomialFeatures(degree=degree, include_bias=False)
    rescale = StandardScaler(with_mean=False)
    regressor = LinearRegression()
    return make_pipeline(expand, rescale, regressor)

def poly_ridge(degree: int, alpha: float):
    """Build a polynomial regression pipeline with a Ridge estimator.

    Steps, in order: polynomial expansion up to ``degree`` (no constant
    column), scaling without mean-centering, Ridge with strength ``alpha``
    and ``random_state=42``.

    Returns the unfitted pipeline.
    """
    expand = PolynomialFeatures(degree=degree, include_bias=False)
    rescale = StandardScaler(with_mean=False)
    regressor = Ridge(alpha=alpha, random_state=42)
    return make_pipeline(expand, rescale, regressor)
        
def poly_lasso(degree: int, alpha: float):
    """Build a polynomial regression pipeline with a Lasso estimator.

    Steps, in order: polynomial expansion up to ``degree`` (no constant
    column), scaling without mean-centering, Lasso with strength ``alpha``,
    ``random_state=42`` and an iteration cap of 20000.

    Returns the unfitted pipeline.
    """
    expand = PolynomialFeatures(degree=degree, include_bias=False)
    rescale = StandardScaler(with_mean=False)
    regressor = Lasso(alpha=alpha, random_state=42, max_iter=20000)
    return make_pipeline(expand, rescale, regressor)

def eval_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    ``model`` is fitted in place. Returns a ``(train_mse, test_mse)`` tuple,
    each being the mean squared error of the model's predictions on that split.
    """
    model.fit(X_train, y_train)
    splits = ((X_train, y_train), (X_test, y_test))
    # Training split first, then test split, matching the returned order.
    return tuple(
        mean_squared_error(target, model.predict(features))
        for features, target in splits
    )
    
    

In [None]:
### 3 -> Complexity Sweep: Bias reduces but Variance increases

# Polynomial degrees evaluated with the plain (unregularized) linear pipeline.
# NOTE(review): the polynomial expansion grows very quickly with degree; the
# highest degrees here may be slow and memory-hungry — confirm they are feasible.
poly_degrees = [1, 2, 3, 5, 8, 10]
rows = []

for d in poly_degrees:
    model = linear_poly(d)
    training_error, test_error = eval_model(model, X_train, y_train, X_test, y_test)
    # One record per degree; alpha is NaN because this model takes no alpha.
    record = {
        "stage": "complexity",
        "features": "eng" if USE_ENG_FEATURES else "raw",
        "kind": "lin",
        "degree": d,
        "alpha": np.nan,
        "train_mse": training_error,
        "test_mse": test_error,
    }
    rows.append(record)

results = pd.DataFrame(rows)

# Train vs. test MSE against degree, sorted so the lines run left to right.
sub = results.sort_values("degree")
plt.figure(figsize=(7, 4))
for column, legend_label in (("train_mse", "Train MSE"), ("test_mse", "Test MSE")):
    plt.plot(sub["degree"], sub[column], marker="o", label=legend_label)
plt.xlabel("Polynomial Degree")
plt.ylabel("MSE")
plt.title("Complexity Sweep")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(Fig_dir / "01_complexity_sweep.png", dpi=160)
plt.show()

results

In [None]:
### 4 -> Feature engineering comparison (reduce bias smartly)

def run_with_features(use_eng_features: bool, degrees=(1, 2, 3, 5, 8, 10)):
    """Sweep polynomial degree on either the raw or the engineered feature set.

    Reads the notebook-level ``X_raw`` and ``y``, splits them 70/30 with
    ``random_state=42``, then fits and scores ``linear_poly(d)`` for every
    degree in ``degrees``.

    Parameters
    ----------
    use_eng_features : bool
        When True, ``add_engineered_features`` is applied to ``X_raw`` first.
    degrees : iterable of int
        Polynomial degrees to evaluate. The default is an immutable tuple so
        it cannot be altered between calls.

    Returns
    -------
    pd.DataFrame
        One row per degree with columns ``stage``, ``features``, ``kind``,
        ``degree``, ``alpha``, ``train_mse`` and ``test_mse``.
    """
    feature_tag = "eng" if use_eng_features else "raw"
    X_used = add_engineered_features(X_raw) if use_eng_features else X_raw
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_used, y, test_size=0.30, random_state=42
    )

    rows = []
    for d in degrees:
        train_error, test_error = eval_model(linear_poly(d), X_tr, y_tr, X_te, y_te)
        rows.append(dict(
            stage="features",
            features=feature_tag,
            kind="lin",
            degree=d,
            # Key must be spelled "alpha" to line up with the other result tables;
            # NaN because this model takes no alpha.
            alpha=np.nan,
            train_mse=train_error,
            test_mse=test_error,
        ))

    return pd.DataFrame(rows)

# Run the same degree sweep on the raw and on the engineered feature sets.
results_raw = run_with_features(False)
results_features = run_with_features(True)

# Stack both runs into one table; the "features" column tells them apart.
feature_results = pd.concat([results_raw, results_features], ignore_index=True)

print("Feature results shape:", feature_results.shape)
print("Unique feature sets:", feature_results["features"].unique())

# One test-MSE curve per feature set, sorted by degree so lines run left to right.
plt.figure(figsize=(7, 4))
for label, df_ in feature_results.groupby("features"):
    df_ = df_.sort_values("degree")
    plt.plot(df_["degree"], df_["test_mse"], marker="o", label=f"Test MSE ({label})")
plt.xlabel("Degree")
plt.ylabel("MSE")
plt.title("Effect of Feature Engineering")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(Fig_dir / "02_features_effect.png", dpi=160)
plt.show()

# Last expression of the cell: display the combined results table.
feature_results
        

