In [2]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name="MedHouseVal")

# Handle missing values (check if any exist)
print(X.isnull().sum())  # Display any missing values (if any)

# Feature scaling: Standardization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert scaled features into a DataFrame for better readability
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Print first few rows to inspect
print(X_scaled_df.head())



MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64
     MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  2.344766  0.982143  0.628559  -0.153758   -0.974429 -0.049597  1.052548   
1  2.332238 -0.607019  0.327041  -0.263336    0.861439 -0.092512  1.043185   
2  1.782699  1.856182  1.155620  -0.049016   -0.820777 -0.025843  1.038503   
3  0.932968  1.856182  0.156966  -0.049833   -0.766028 -0.050329  1.038503   
4 -0.012881  1.856182  0.344711  -0.032906   -0.759847 -0.085616  1.038503   

   Longitude  
0  -1.327835  
1  -1.322844  
2  -1.332827  
3  -1.337818  
4  -1.337818  


2. Regression Algorithm Implementation

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "SVR": SVR()
}

# Train and predict using each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name} - Predictions: {y_pred[:5]}")  # Displaying the first 5 predictions


Linear Regression - Predictions: [0.71912284 1.76401657 2.70965883 2.83892593 2.60465725]
Decision Tree - Predictions: [0.425   1.203   5.00001 2.225   2.257  ]
Random Forest - Predictions: [0.48217   0.71795   4.8455164 2.5594    2.27377  ]
Gradient Boosting - Predictions: [0.50518761 1.09334601 4.24570956 2.54517359 2.27910301]
SVR - Predictions: [0.52166189 1.56843583 3.58873947 2.48204847 2.58237506]


3. Model Evaluation and Comparison 
After training the models, we will evaluate each model using:

Mean Squared Error (MSE)
Mean Absolute Error (MAE)
R-squared Score (R²)

In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Evaluate models
evaluation_metrics = {}

for name, model in models.items():
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    evaluation_metrics[name] = {"MSE": mse, "MAE": mae, "R²": r2}

# Convert to DataFrame for easy comparison
metrics_df = pd.DataFrame(evaluation_metrics).T
print(metrics_df)


                        MSE       MAE        R²
Linear Regression  0.555892  0.533200  0.575788
Decision Tree      0.500997  0.456614  0.617679
Random Forest      0.253631  0.326988  0.806449
Gradient Boosting  0.294080  0.371723  0.775581
SVR                0.355198  0.397763  0.728941
