In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 1. Load dataset
# Replace 'your_dataset.csv' with your file path
df = pd.read_csv("california_housing.csv")

# 2. Drop missing rows
df = df.dropna()

# 3. Drop unnecessary columns if they exist
columns_to_drop = ["ocean_proximity", "latitude", "longitude"]
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns], errors="ignore")

# 4. Select required features
features = [
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income"
]
target = "median_house_value"

X = df[features]
y = df[target]

# 5. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to run experiments
def run_experiment(X_train, X_test, y_train, y_test, scaler=None, description="Without normalization", learning_rate=0.01):
    # Apply scaling if provided
    if scaler:
        scaler = scaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # Model
    model = SGDRegressor(max_iter=1000, tol=1e-3, eta0=learning_rate, learning_rate='constant', random_state=42)
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Output results
    print("\n===", description, "===")
    print("Learning rate:", learning_rate)
    print("1/2 MSE:",round(mse/2,2))
    print("R²:",round(r2,2))
    print("Converged:", model.t_ < model.max_iter)
    print("Iterations:", model.t_)

# 6. Run experiments
run_experiment(X_train, X_test, y_train, y_test, scaler=None, description="Without normalization")
run_experiment(X_train, X_test, y_train, y_test, scaler=StandardScaler, description="With Z-score normalization")
run_experiment(X_train, X_test, y_train, y_test, scaler=MinMaxScaler, description="With Min-Max scaling")
run_experiment(X_train, X_test, y_train, y_test, scaler=RobustScaler, description="With Robust scaling")



=== Without normalization ===
Learning rate: 0.01
1/2 MSE: 5.028064249092017e+32
R²: -7.353561649761306e+22
Converged: False
Iterations: 114423.0

=== With Z-score normalization ===
Learning rate: 0.01
1/2 MSE: 2958020332.11
R²: 0.57
Converged: False
Iterations: 196153.0

=== With Min-Max scaling ===
Learning rate: 0.01
1/2 MSE: 3041873981.68
R²: 0.56
Converged: False
Iterations: 997107.0

=== With Robust scaling ===
Learning rate: 0.01
1/2 MSE: 2962541488.86
R²: 0.57
Converged: False
Iterations: 196153.0
