In [4]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
#1:Load the California Housing dataset
# -------------------------------------------------------------------------------
california = fetch_california_housing(as_frame=True)
#Convert the dataset into a pandas DataFrame 
df = california.frame

In [8]:
#Data Inspection:
print(df.head())

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


In [10]:
print(df.describe())

             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846154      0.333333      3.000000   
25%        2.563400     18.000000      4.440716      1.006079    787.000000   
50%        3.534800     29.000000      5.229129      1.048780   1166.000000   
75%        4.743250     37.000000      6.052381      1.099526   1725.000000   
max       15.000100     52.000000    141.909091     34.066667  35682.000000   

           AveOccup      Latitude     Longitude   MedHouseVal  
count  20640.000000  20640.000000  20640.000000  20640.000000  
mean       3.070655     35.631861   -119.569704      2.068558  
std       10.386050      2.135952      2.003532      1.153956  
min        0.692308     32.54000

In [12]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB
None


In [None]:
# 2)Preprocessing
# --------------------------------------------------------------------------

In [14]:
#checking missing values
print(df.isnull().sum())

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64


In [None]:
#there is no null values in this dataset.

In [16]:
# Separate features (X) and target (y)
x=df.drop('MedHouseVal',axis=1)
y=df['MedHouseVal']


In [18]:
#Splitting the data into training and testing sets
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [20]:
#Standardization
scaler=StandardScaler()
x_train_scaler=scaler.fit_transform(x_train)
x_test_scaler=scaler.fit_transform(x_test)

In [68]:
# Explanation
print("preprocessing:")
print("The absence of missing values simplified the preprocessing.Feature scaling (standardization) was crucial due to the wide variations in the ranges of the features.We used StandardScaler to transform the features so that they have a mean of 0 and a standard deviation of 1. This ensures that all features contribute equally to the model.")



preprocessing:
The absence of missing values simplified the preprocessing.Feature scaling (standardization) was crucial due to the wide variations in the ranges of the features.We used StandardScaler to transform the features so that they have a mean of 0 and a standard deviation of 1. This ensures that all features contribute equally to the model.


In [None]:
# 3)Regression Algorithm Implementation
# ---------------------------------------------------------------------------

In [26]:
# 1)Linear regression
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)

print("Linear Regression:")
print("Explanation: Linear Regression models the relationship between the features and the target variable as a linear equation. It's suitable when the relationship is approximately linear.")

Linear Regression:
Explanation: Linear Regression models the relationship between the features and the target variable as a linear equation. It's suitable when the relationship is approximately linear.


In [30]:
# 2)Decision Tree Regressor
dt = DecisionTreeRegressor(random_state=42)
dt.fit(x_train, y_train)
y_pred_dt = dt.predict(x_test)

print("\nDecision Tree Regressor:")
print("Explanation: Decision Tree Regressor partitions the feature space into rectangular regions and fits a simple constant model within each region. It can capture non-linear relationships but is prone to overfitting.")


Decision Tree Regressor:
Explanation: Decision Tree Regressor partitions the feature space into rectangular regions and fits a simple constant model within each region. It can capture non-linear relationships but is prone to overfitting.


In [32]:
# 3)Random Forest Regressor
rf = RandomForestRegressor(random_state=42)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)

print("\nRandom Forest Regressor:")
print("Explanation: Random Forest Regressor is an ensemble method that builds multiple decision trees and averages their predictions. It reduces overfitting and improves generalization.")


Random Forest Regressor:
Explanation: Random Forest Regressor is an ensemble method that builds multiple decision trees and averages their predictions. It reduces overfitting and improves generalization.


In [34]:
# 4)Gradient Boosting Regressor
gb = GradientBoostingRegressor(random_state=42)
gb.fit(x_train, y_train)
y_pred_gb = gb.predict(x_test)

print("\nGradient Boosting Regressor:")
print("Explanation: Gradient Boosting Regressor is another ensemble method that builds trees sequentially, with each tree correcting the errors of the previous ones. It often achieves high accuracy.")


Gradient Boosting Regressor:
Explanation: Gradient Boosting Regressor is another ensemble method that builds trees sequentially, with each tree correcting the errors of the previous ones. It often achieves high accuracy.


In [37]:
# 5)Support Vector Regressor (SVR)
svr = SVR()
svr.fit(x_train, y_train)
y_pred_svr = svr.predict(x_test)

print("\nSupport Vector Regressor (SVR):")
print("Explanation: SVR maps the features into a high-dimensional space and finds the best-fitting hyperplane. It's effective in high-dimensional spaces and can handle non-linear relationships using kernel functions.")


Support Vector Regressor (SVR):
Explanation: SVR maps the features into a high-dimensional space and finds the best-fitting hyperplane. It's effective in high-dimensional spaces and can handle non-linear relationships using kernel functions.


In [None]:
# 4) Model Evaluation and Comparison 
# -------------------------------------------------------------------------------------

In [40]:
# Calculate metrics for each model individually
# linear regression:
mse_lr= mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Dicision tree regressor
mse_dt=mean_squared_error(y_test,y_pred_dt)
mae_dt=mean_absolute_error(y_test,y_pred_dt)
r2_dt=mean_squared_error(y_test,y_pred_dt)

# Random forest regressor
mse_rf=mean_squared_error(y_test,y_pred_rf)
mae_rf=mean_absolute_error(y_test,y_pred_rf)
r2_rf=mean_squared_error(y_test,y_pred_rf)

# Gradient Boosting regressor
mse_gb=mean_squared_error(y_test,y_pred_gb)
mae_gb=mean_absolute_error(y_test,y_pred_gb)
r2_gb=mean_squared_error(y_test,y_pred_gb)

# SVR
mse_svr=mean_squared_error(y_test,y_pred_svr)
mae_svr=mean_absolute_error(y_test,y_pred_svr)
r2_svr=mean_squared_error(y_test,y_pred_svr)

In [46]:
# Create DataFrame to display the results
results = {
    "Linear Regression": [mse_lr, mae_lr, r2_lr],
    "Decision Tree": [mse_dt, mae_dt, r2_dt],
    "Random Forest": [mse_rf, mae_rf, r2_rf],
    "Gradient Boosting": [mse_gb, mae_gb, r2_gb],
    "SVR": [mse_svr, mae_svr, r2_svr]
}

results_df = pd.DataFrame(results, index=["MSE", "MAE", "R2"]).T

print("\nModel Evaluation:")
print(results_df)



Model Evaluation:
                        MSE       MAE        R2
Linear Regression  0.555892  0.533200  0.575788
Decision Tree      0.495235  0.454679  0.495235
Random Forest      0.255368  0.327543  0.255368
Gradient Boosting  0.293997  0.371643  0.293997
SVR                1.332012  0.859951  1.332012


In [48]:
# The best-performing algorithm and worst-performing algorithm
best_model = results_df['MSE'].idxmin()
worst_model = results_df['MSE'].idxmax()

print(f"\nBest Performing Model: {best_model} (Lowest MSE)")
print(f"Worst Performing Model: {worst_model} (Highest MSE)")


Best Performing Model: Random Forest (Lowest MSE)
Worst Performing Model: SVR (Highest MSE)


In [60]:
# Explanation of Model Evaluation:
print("Best Performing Model: Random Forest=It has the lowest MSE (0.255368) and the lowest MAE (0.327543) compared to all other models.While the R2 score is not the highest, the MSE and MAE are the most important factors for this evaluation.'Random Forest, being an ensemble method, effectively reduces overfitting and captures complex relationships in the data. Its ability to average predictions from multiple decision trees leads to more robust and accurate results.")

print("Worst Performing Model: SVR =It has the highest MSE (1.332012) and the highest MAE (0.859951).It also has the highest R2, which indicates a problem with the model. the R2 should be between 0 and 1. An R2 greater than one indicates that the model is fitting the noise within the data.''SVR's performance is highly dependent on the choice of kernel and hyperparameters. In this case, the default parameters or the chosen kernel might not be suitable for the California Housing dataset. Also, the high MSE indicates that the model's predictions are significantly deviating from the actual values. The extremely high R2 shows the model is not working correctly.'")

Best Performing Model: Random Forest=It has the lowest MSE (0.255368) and the lowest MAE (0.327543) compared to all other models.While the R2 score is not the highest, the MSE and MAE are the most important factors for this evaluation.'Random Forest, being an ensemble method, effectively reduces overfitting and captures complex relationships in the data. Its ability to average predictions from multiple decision trees leads to more robust and accurate results.
Worst Performing Model: SVR =It has the highest MSE (1.332012) and the highest MAE (0.859951).It also has the highest R2, which indicates a problem with the model. the R2 should be between 0 and 1. An R2 greater than one indicates that the model is fitting the noise within the data.''SVR's performance is highly dependent on the choice of kernel and hyperparameters. In this case, the default parameters or the chosen kernel might not be suitable for the California Housing dataset. Also, the high MSE indicates that the model's predic