In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


Loading and Preprocessing

In [4]:
# Fetch the California housing data and combine the feature matrix and the
# target into a single frame, with the target stored as 'MedHouseValue'.
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    MedHouseValue=data.target
)


Check for missing values

In [6]:
# Count null entries per column; all zeros means nothing has to be imputed.
missing_per_column = df.isnull().sum()
print("Missing values:\n", missing_per_column)

Missing values:
 MedInc           0
HouseAge         0
AveRooms         0
AveBedrms        0
Population       0
AveOccup         0
Latitude         0
Longitude        0
MedHouseValue    0
dtype: int64


Missing values can distort model training and lead to errors or misleading results. This dataset has no missing values, so no imputation is needed.

In [10]:
# Separate the regression target from the predictor columns.
y = df['MedHouseValue']
X = df.drop(columns='MedHouseValue')

Split into train and test sets

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Feature scaling

In [15]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits, so the test rows never influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Regression Algorithms

In [19]:
# Candidate regressors keyed by the display name used in the results table.
# The tree-based estimators are all given random_state=42.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Decision Tree Regressor", DecisionTreeRegressor(random_state=42)),
    ("Random Forest Regressor", RandomForestRegressor(random_state=42)),
    ("Gradient Boosting Regressor", GradientBoostingRegressor(random_state=42)),
    ("Support Vector Regressor", SVR()),
])

# Collects one metrics record (a dict) per evaluated model.
results = []


In [21]:
# Models that are fit on the standardized features; every other model in
# `models` is fit on the raw feature values.
scaled_model_names = {"Support Vector Regressor", "Linear Regression"}

# Start from an empty list so that re-running this cell replaces the earlier
# records instead of appending a second copy of every model's row.
results = []

for name, model in models.items():
    # Choose the matching train/test matrices once, so there is a single
    # fit/predict path for all models.
    if name in scaled_model_names:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)

    # Every model is scored against the same held-out targets.
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({
        "Model": name,
        "MSE": mse,
        "MAE": mae,
        "R²": r2,
    })

Convert results to DataFrame


In [24]:
# Tabulate the per-model metrics and print them ordered from highest to
# lowest R²; results_df itself keeps the original row order.
results_df = pd.DataFrame(results)
ranked_results = results_df.sort_values(by="R²", ascending=False)
print("\nModel Evaluation Results:")
print(ranked_results)


Model Evaluation Results:
                         Model       MSE       MAE        R²
2      Random Forest Regressor  0.255368  0.327543  0.805123
3  Gradient Boosting Regressor  0.293997  0.371643  0.775645
4     Support Vector Regressor  0.357004  0.398599  0.727563
1      Decision Tree Regressor  0.495235  0.454679  0.622076
0            Linear Regression  0.555892  0.533200  0.575788


Best and Worst Performing Models



In [26]:
# idxmax/idxmin give the index label of the extreme R² value; .loc then
# returns that model's full metrics row.
r2_scores = results_df['R²']
best_model = results_df.loc[r2_scores.idxmax()]
worst_model = results_df.loc[r2_scores.idxmin()]


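The R² score (coefficient of determination) measures the proportion of variance in the target variable that the model explains. Higher is better: 1.0 is a perfect fit, 0 is no better than always predicting the mean, and the score can be negative on held-out data when a model does worse than that.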

In [28]:
from IPython.display import display, HTML

In [30]:
# Render a styled heading, then print the best model's metrics as plain text.
best_heading = HTML(
    '<span style="color:blue; font-weight:bold; font-size:18px">Best Performing Model</span>'
)
display(best_heading)
print(best_model)


Model    Random Forest Regressor
MSE                     0.255368
MAE                     0.327543
R²                      0.805123
Name: 2, dtype: object


In [32]:
# Render a styled heading, then print the worst model's metrics as plain text.
worst_heading = HTML(
    '<span style="color:blue; font-weight:bold; font-size:18px">Worst Performing Model</span>'
)
display(worst_heading)
print(worst_model)

Model    Linear Regression
MSE               0.555892
MAE                 0.5332
R²                0.575788
Name: 0, dtype: object
