In [83]:
# 1. Loading and Preprocessing
import pandas as pd
import numpy as np

In [84]:
# Load dataset
from sklearn.datasets import fetch_california_housing
california = fetch_california_housing()
data = pd.DataFrame(california.data, columns=california.feature_names)
data['MedHouseVal'] = california.target
data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [85]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [88]:
data.head(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [89]:
data.tail(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.17192,741.0,2.123209,39.43,-121.32,0.847
20639,2.3886,16.0,5.254717,1.162264,1387.0,2.616981,39.37,-121.24,0.894


In [43]:
# checking for missing values
missing_values = print(data.isnull().sum())

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64


In [91]:
from sklearn.preprocessing import StandardScaler

# Feature Scaling
scaler = StandardScaler()
X = scaler.fit_transform(data.drop('MedHouseVal', axis=1))
y = data['MedHouseVal']

In [92]:
# Train-Test Split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)

# no misssing values found
#Stamdardisation is necessory

In [93]:
# 2.Regression Algorithm Implementation and Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test,y_pred)
    r2 = r2_score(y_test, y_pred)
    return {
       'Model': name,
        'MSE': mse,
        'MAE' : mae,
        'R2' : r2}  

result = []

In [94]:
# Linear Regression
from sklearn.linear_model import LinearRegression

lr= LinearRegression()
result.append(evaluate_model(lr, 'Linear Regression'))

In [95]:
#Decision tree Regressor
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state =42)
result.append(evaluate_model(dt, 'Decision Tree Regressor'))

In [98]:
# Random forest Regressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

rf = RandomForestRegressor(random_state = 42)
result.append(evaluate_model(rf, 'Random Forest Regressor'))

In [101]:
# Gradient Boosting Regressor
gb = GradientBoostingRegressor(random_state=42)
result.append(evaluate_model(gb, 'Gradient Boosting Regressor'))

In [106]:
# Support Vector Regressor
from sklearn.svm import SVR
svr = SVR()
result.append(evaluate_model(svr, 'Support Vector Regressor'))

In [108]:
# Creating a Dataframe for model comparison
result_df = pd.DataFrame(result)
result_df_sorted = result_df.sort_values(by='R2', ascending = False)
print(result_df_sorted)

                         Model       MSE       MAE        R2
2      Random Forest Regressor  0.255498  0.327613  0.805024
3      Random Forest Regressor  0.255498  0.327613  0.805024
4  Gradient Boosting Regressor  0.293999  0.371650  0.775643
5     Support Vector Regressor  0.355198  0.397763  0.728941
1      Decision Tree Regressor  0.494272  0.453784  0.622811
0            Linear Regression  0.555892  0.533200  0.575788


In [109]:
## Best and Worst Performing Models
best_model = result_df_sorted.iloc[0]
worst_model = result_df_sorted.iloc[-1]

In [110]:
print(f"Best Performing Model: {best_model['Model']} with R2 Score: {best_model['R2']: 4f}")
print(f"Worst Performing Model : {worst_model['Model']} with R2 Score: {worst_model['R2']: 4f}")

Best Performing Model: Random Forest Regressor with R2 Score:  0.805024
Worst Performing Model : Linear Regression with R2 Score:  0.575788
