In [31]:
            # Loading and Preprocessing
# Import necessary libraries
from sklearn.datasets import fetch_california_housing
import pandas as pd

In [33]:
# Load the California Housing dataset
housing = fetch_california_housing()

In [35]:
# Convert to pandas DataFrame
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df['MedHouseVal'] = housing.target


In [37]:
print("Dataset Info:")
print(df.info())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB
None


In [39]:
# missing values
print(df.isnull().sum())


MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64


In [41]:
print(df.describe())

             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846154      0.333333      3.000000   
25%        2.563400     18.000000      4.440716      1.006079    787.000000   
50%        3.534800     29.000000      5.229129      1.048780   1166.000000   
75%        4.743250     37.000000      6.052381      1.099526   1725.000000   
max       15.000100     52.000000    141.909091     34.066667  35682.000000   

           AveOccup      Latitude     Longitude   MedHouseVal  
count  20640.000000  20640.000000  20640.000000  20640.000000  
mean       3.070655     35.631861   -119.569704      2.068558  
std       10.386050      2.135952      2.003532      1.153956  
min        0.692308     32.54000

In [45]:
# Standardization
from sklearn.preprocessing import StandardScaler


X = df.drop(columns='MedHouseVal')
y = df['MedHouseVal']


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)


print(X_scaled_df.head())


     MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  2.344766  0.982143  0.628559  -0.153758   -0.974429 -0.049597  1.052548   
1  2.332238 -0.607019  0.327041  -0.263336    0.861439 -0.092512  1.043185   
2  1.782699  1.856182  1.155620  -0.049016   -0.820777 -0.025843  1.038503   
3  0.932968  1.856182  0.156966  -0.049833   -0.766028 -0.050329  1.038503   
4 -0.012881  1.856182  0.344711  -0.032906   -0.759847 -0.085616  1.038503   

   Longitude  
0  -1.327835  
1  -1.322844  
2  -1.332827  
3  -1.337818  
4  -1.337818  


In [None]:
#Preprocessing Steps and Justification
# Loading the Dataset
The California Housing dataset was loaded with fetch_california_housing from sklearn.
It was converted to a pandas DataFrame, feature columns (MedInc, HouseAge, AveRooms, etc.) and a target column ['MedHouseVal'].
It is easier to manipulate data using pandas functions by converting to DataFrame.
# Dealing with Missing Values
We conducted a missing value check throughout the DataFrame with df.isnull().sum().
The data set had no missing values and therefore needed no imputation or deletion.
Making sure that there are no missing values is essential because they can have a negative effect on the performance of regression models.
# Feature Scaling (Standardization)
We used standardization with StandardScaler on the feature columns (not the target variable: PRICE).
Standardization makes every feature such that its distribution has a mean of 0 and a standard deviation of 1.
This is crucial for regression models since it will stop features with higher scales from taking over the learning process, make sure that every feature 
contributes proportionally to the model, and enhance the convergence rate while optimizing.


In [None]:
            # Regression Algorithm Implementation
# 1. Linear Regression
  # How it works
Linear Regression fits a straight line (or hyperplane in higher dimensions) to the data by minimizing the Mean Squared Error (MSE) between actual and predicted values.
  # Why it's suitable:
*Good for identifying linear relationships between features and the target.
*Serves as a solid baseline model.


In [None]:
# implementation
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lr_model = LinearRegression()
lr_model.fit(X_scaled_df, y)
y_pred_lr = lr_model.predict(X_scaled_df)

print("Linear Regression R²:", r2_score(y, y_pred_lr))
print("Linear Regression RMSE:", mean_squared_error(y, y_pred_lr, squared=False))


In [None]:
# 2.Decision Tree Regressor
  # How it works:
Splits the dataset into smaller subsets based on feature thresholds. At each split, it chooses the feature and value that minimizes the error (usually MSE).
  # Why it's suitable:
*Captures non-linear relationships.
*Interpretable and flexible.


In [54]:
# implementation
from sklearn.tree import DecisionTreeRegressor

dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_scaled_df, y)
y_pred_dt = dt_model.predict(X_scaled_df)

print("Decision Tree R²:", r2_score(y, y_pred_dt))
print("Decision Tree RMSE:", mean_squared_error(y, y_pred_dt, squared=False))


Decision Tree R²: 1.0
Decision Tree RMSE: 3.1022245701521287e-16




In [None]:
# 3.Random Forest Regressor
  # How it works:
An ensemble method that builds multiple decision trees on different random subsets of the data and averages their outputs for a final prediction.
  # Why it's suitable:

*Reduces overfitting compared to a single tree.
*Handles high-dimensional, non-linear data well.

In [53]:
# implementation
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_scaled_df, y)
y_pred_rf = rf_model.predict(X_scaled_df)

print("Random Forest R²:", r2_score(y, y_pred_rf))
print("Random Forest RMSE:", mean_squared_error(y, y_pred_rf, squared=False))

Random Forest R²: 0.9739390971989981
Random Forest RMSE: 0.18628312576580003




In [None]:
# 4.Gradient Boosting Regressor
  # How it works:
Builds models sequentially, where each model tries to correct the errors of the previous one by minimizing a loss function (like MSE).
  # Why it's suitable:
*One of the most powerful and accurate models for tabular data.
*Captures complex, non-linear relationships effectively.

In [52]:
# implementataion
from sklearn.ensemble import GradientBoostingRegressor

gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_scaled_df, y)
y_pred_gb = gb_model.predict(X_scaled_df)

print("Gradient Boosting R²:", r2_score(y, y_pred_gb))
print("Gradient Boosting RMSE:", mean_squared_error(y, y_pred_gb, squared=False))


Gradient Boosting R²: 0.8033237500356992
Gradient Boosting RMSE: 0.5117463430831034




In [None]:
# 5.Support Vector Regressor (SVR)
  # How it works:
Fits the best possible hyperplane within a margin (epsilon) around the true values, using kernel tricks to model non-linear data.
  # Why it's suitable:
*Works well for moderate-sized datasets.
*Effective when the relationship between features and target is not purely linear.

In [51]:
# implementataion
from sklearn.svm import SVR
svr_model = SVR(kernel='rbf')  
svr_model.fit(X_scaled_df, y)
y_pred_svr = svr_model.predict(X_scaled_df)
print("SVR R²:", r2_score(y, y_pred_svr))
print("SVR RMSE:", mean_squared_error(y, y_pred_svr, squared=False))

SVR R²: 0.7502397399037277
SVR RMSE: 0.5766873930299687




In [66]:
            # Model Evaluation and Comparison
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, test_size=0.2, random_state=42)


In [78]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_
print(" Linear Regression:")
print("MSE:", mean_squared_error(y_test, y_pred_lr))
print("MAE:", mean_absolute_error(y_test, y_pred_lr))
print("R²:", r2_score(y_test, y_pred_lr))


 Linear Regression:
MSE: 0.555891598695244
MAE: 0.5332001304956565
R²: 0.5757877060324511


In [80]:
from sklearn.tree import DecisionTreeRegressor
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
print(" Decision Tree Regressor:")
print("MSE:", mean_squared_error(y_test, y_pred_dt))
print("MAE:", mean_absolute_error(y_test, y_pred_dt))
print("R²:", r2_score(y_test, y_pred_dt))


 Decision Tree Regressor:
MSE: 0.5052210710022044
MAE: 0.4562850557170543
R²: 0.6144554262132605


In [82]:
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print(" Random Forest Regressor:")
print("MSE:", mean_squared_error(y_test, y_pred_rf))
print("MAE:", mean_absolute_error(y_test, y_pred_rf))
print("R²:", r2_score(y_test, y_pred_rf))


 Random Forest Regressor:
MSE: 0.25621319799807024
MAE: 0.3276279949127909
R²: 0.8044784473760151


In [84]:
from sklearn.ensemble import GradientBoostingRegressor
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
print(" Gradient Boosting Regressor:")
print("MSE:", mean_squared_error(y_test, y_pred_gb))
print("MAE:", mean_absolute_error(y_test, y_pred_gb))
print("R²:", r2_score(y_test, y_pred_gb))


 Gradient Boosting Regressor:
MSE: 0.29399901242474274
MAE: 0.37165044848436773
R²: 0.7756433164710084


In [86]:
from sklearn.svm import SVR
svr_model = SVR(kernel='rbf')  
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)
print(" Support Vector Regressor:")
print("MSE:", mean_squared_error(y_test, y_pred_svr))
print("MAE:", mean_absolute_error(y_test, y_pred_svr))
print("R²:", r2_score(y_test, y_pred_svr))


 Support Vector Regressor:
MSE: 0.35519846199894217
MAE: 0.39776309634378626
R²: 0.7289407597956459


In [None]:
# comparision
Best Model: Random Forest (R² = 0.8044784473760151)
Worst Model: Linear Regression (R² = 0.5757877060324511)