In [1]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Pull the California Housing bundle (feature matrix, target vector, column names)
housing = fetch_california_housing()

# Assemble one table: the feature columns plus the target as 'MedHouseValue'
df = pd.DataFrame(data=housing.data, columns=housing.feature_names)
df = df.assign(MedHouseValue=housing.target)

# Persist the table without the row index, then confirm on stdout
df.to_csv("california_housing_dataset.csv", index=False)
print("Dataset saved as 'california_housing_dataset.csv'")

Dataset saved as 'california_housing_dataset.csv'


In [2]:
# Tally missing entries per column; a column of zeros means nothing to impute
missing_per_column = df.isna().sum()
print(missing_per_column)

MedInc           0
HouseAge         0
AveRooms         0
AveBedrms        0
Population       0
AveOccup         0
Latitude         0
Longitude        0
MedHouseValue    0
dtype: int64


In [5]:
from sklearn.preprocessing import StandardScaler

# Pull the target column aside; everything else is a predictor
target = df['MedHouseValue']
features = df.drop(columns='MedHouseValue')

# Standardize each predictor column.
# NOTE: the scaler is fit on every row of `df`, i.e. before any train/test
# split, so its statistics include rows that later land in a test split.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Re-wrap the scaled array with the original column names and re-attach the target
scaled_df = pd.DataFrame(scaled_features, columns=features.columns).assign(
    MedHouseValue=target
)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target vector and predictor matrix, both taken from the scaled frame
y = scaled_df['MedHouseValue']
X = scaled_df.drop(columns='MedHouseValue')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Linear Regression
Linear Regression models the relationship between the target variable and one or more input features using a linear equation. It assumes that the target is a weighted sum of the input features.

In [9]:
from sklearn.linear_model import LinearRegression
# fit() hands back the fitted estimator, so construction and training chain;
# the bare name on the last line keeps the estimator as the cell's output
model_lr = LinearRegression().fit(X_train, y_train)
model_lr

# Decision Tree Regressor
A Decision Tree splits the data into subsets using feature values so as to minimize the variance of the target within each subset, producing a tree-like model of decisions.

In [11]:
from sklearn.tree import DecisionTreeRegressor
# Fixed seed keeps the fitted tree repeatable between runs
model_dt = DecisionTreeRegressor(random_state=42)
model_dt.fit(X=X_train, y=y_train)

# Random Forest Regressor
Random Forest is an ensemble of Decision Trees. It aggregates predictions from multiple trees to reduce overfitting and improve generalization.

In [13]:
from sklearn.ensemble import RandomForestRegressor
# 100 trees, seeded for repeatable results
rf_settings = {"n_estimators": 100, "random_state": 42}
model_rf = RandomForestRegressor(**rf_settings)
model_rf.fit(X_train, y_train)

# Gradient Boosting Regressor
Gradient Boosting builds models sequentially. Each new model corrects errors made by the previous ones, using a gradient descent approach to minimize the loss.

In [14]:
from sklearn.ensemble import GradientBoostingRegressor
# 100 boosting stages at learning rate 0.1, seeded for repeatable results;
# fit() returns the estimator, and the bare name keeps it as the cell's output
model_gb = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)
model_gb

# Support Vector Regressor (SVR)
SVR attempts to fit the best line (or curve) that lies within a threshold margin (epsilon) from the actual values. It uses kernels to capture non-linear relationships.

In [15]:
from sklearn.svm import SVR
# RBF-kernel SVR with C=100 and an epsilon of 0.1
svr_settings = {"kernel": 'rbf', "C": 100, "epsilon": 0.1}
model_svr = SVR(**svr_settings)
model_svr.fit(X_train, y_train)

In [19]:
# Split FIRST, then standardize. The scaler must learn its mean/std from the
# training rows only; fitting it on all of X (as was done before) lets
# test-set statistics leak into the preprocessing of the evaluation data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# X may already carry a full-data standardization from an earlier cell.
# Re-fitting on the training rows supersedes it: standardizing a column that
# was only shifted/rescaled gives the same result as standardizing the raw one.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics from training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

# Kept so code expecting `X_scaled` still finds it: the full feature matrix
# expressed with the training-set statistics.
X_scaled = scaler.transform(X)

In [20]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Candidate regressors keyed by display name; insertion order is the order
# in which they are later fitted and scored
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    (
        'Gradient Boosting',
        GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    ),
    ('SVR', SVR(kernel='rbf', C=100, epsilon=0.1)),
])

In [35]:
# Fit every candidate on the training split and score it on the held-out split
results = []
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    scores = {
        'MSE': mean_squared_error(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R²': r2_score(y_test, y_pred),
    }
    # One record per model; key order fixes the column order of the results table
    results.append({'Model': name, **scores})

# Model Evaluation and Comparison
We evaluated the performance of the regression algorithms above on the California Housing dataset using three key metrics:

- Mean Squared Error (MSE): Measures the average squared difference between predicted and actual values. Lower is better.
- Mean Absolute Error (MAE): Measures the average magnitude of errors in predictions. Lower is better.
- R-squared Score (R²): Indicates the proportion of variance explained by the model. Higher is better, with a maximum of 1.

In [38]:
# Rank the models from best to worst R² and renumber the rows 0..n-1
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='R²', ascending=False)
results_df = results_df.reset_index(drop=True)

print("Model Evaluation Results:\n")
print(results_df)

Model Evaluation Results:

               Model       MSE       MAE        R²
0      Random Forest  0.255498  0.327613  0.805024
1  Gradient Boosting  0.293999  0.371650  0.775643
2                SVR  0.315896  0.369656  0.758933
3      Decision Tree  0.494272  0.453784  0.622811
4  Linear Regression  0.555892  0.533200  0.575788


# Best-Performing Algorithm: Random Forest Regressor

1. Achieved the lowest MSE and MAE, and the highest R² score.
2. Effectively captures non-linear relationships and reduces overfitting through ensemble learning.
3. Robust to outliers and performs well without extensive parameter tuning.

# Worst-Performing Algorithm: Linear Regression

1. Assumes a linear relationship between features and target, which is often not the case in real-world housing data.
2. Produced the highest error metrics and lowest R² score, indicating underfitting.
3. Not flexible enough to model the complex, non-linear patterns present in the dataset.

In [None]:
# Libraries Used:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd