In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [3]:
# Fetch the California Housing dataset and separate it into a feature
# DataFrame (columns named after the dataset's feature names) and the
# target array.
housing = fetch_california_housing()
X = pd.DataFrame(data=housing.data, columns=housing.feature_names)
y = housing.target

In [5]:
# Report the number of missing entries per feature column
print("Missing values in features:\n", X.isna().sum())

Missing values in features:
 MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64


In [7]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics are used when
# fitting it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# Fit a single model and report its regression metrics on the test data
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model``, score its test-set predictions, print and return the scores.

    Parameters
    ----------
    name : label printed as the heading and stored under the ``"Model"`` key.
    model : object exposing ``fit(X, y)`` and ``predict(X)``; ``fit`` is
        called on the object that is passed in.
    X_train, y_train : data handed to ``model.fit``.
    X_test, y_test : data used for ``model.predict`` and for scoring.

    Returns
    -------
    dict
        ``{"Model": name, "MSE": ..., "MAE": ..., "R2": ...}`` with the
        unrounded metric values (only the printed values are rounded to
        four decimals).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Same three metrics, computed in the same order: MSE, MAE, R^2.
    mse, mae, r2 = (
        metric(y_test, predictions)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    print(f"\n{name}:")
    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R^2 Score: {r2:.4f}")

    return {"Model": name, "MSE": mse, "MAE": mae, "R2": r2}


In [15]:
# Initialize models
# Each entry is a (display label, unfitted estimator) pair. The label is what
# gets printed and stored in the results; the estimator is fitted later by
# evaluate_model. The estimators that accept it are given random_state=42;
# LinearRegression and SVR are constructed with their defaults.
models = [
    ("Linear Regression", LinearRegression()),
    ("Decision Tree Regressor", DecisionTreeRegressor(random_state=42)),
    ("Random Forest Regressor", RandomForestRegressor(random_state=42)),
    ("Gradient Boosting Regressor", GradientBoostingRegressor(random_state=42)),
    # The evaluation cell trains this estimator on the standardized features.
    ("Support Vector Regressor (SVR)", SVR())
]

In [17]:
# Evaluate all models and collect results.
#
# SVR is trained on the standardized feature matrices (support vector machines
# are not scale-invariant); every other estimator receives the raw features.
# The choice is made from the estimator's type rather than its display label,
# so renaming a label in `models` cannot silently route SVR to unscaled data.
results = []
for name, model in models:
    if isinstance(model, SVR):
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test
    results.append(
        evaluate_model(name, model, train_features, test_features, y_train, y_test)
    )


Linear Regression:
MSE: 0.5559
MAE: 0.5332
R^2 Score: 0.5758

Decision Tree Regressor:
MSE: 0.4952
MAE: 0.4547
R^2 Score: 0.6221

Random Forest Regressor:
MSE: 0.2554
MAE: 0.3275
R^2 Score: 0.8051

Gradient Boosting Regressor:
MSE: 0.2940
MAE: 0.3716
R^2 Score: 0.7756

Support Vector Regressor (SVR):
MSE: 0.3570
MAE: 0.3986
R^2 Score: 0.7276


In [18]:
# Collect the per-model metric dicts into a table and print it ranked from
# the highest R^2 to the lowest.
results_df = pd.DataFrame(results)
print("\nModel Comparison:")
print(results_df.sort_values("R2", ascending=False))



Model Comparison:
                            Model       MSE       MAE        R2
2         Random Forest Regressor  0.255368  0.327543  0.805123
3     Gradient Boosting Regressor  0.293997  0.371643  0.775645
4  Support Vector Regressor (SVR)  0.357004  0.398599  0.727563
1         Decision Tree Regressor  0.495235  0.454679  0.622076
0               Linear Regression  0.555892  0.533200  0.575788
