In [1]:
#Importing the required packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [2]:
#Fetching the dataset from sklearn datasets

from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

In [3]:
#Checking the feature names of the dataset

print(housing.feature_names[0:])

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [4]:
#Checking the dimension of the data

print(housing.data.shape)

(20640, 8)


In [5]:
#Checking the dimension of the target

print(housing.target.shape)

(20640,)


In [6]:
#Converting the dataset to pandas framework

data = pd.DataFrame(housing.data)

In [7]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [8]:
#Putting the names of the columns for better understanding of the data

data.columns=['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']

In [9]:
data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [10]:
target = pd.DataFrame(housing.target)

In [11]:
target.head()

Unnamed: 0,0
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [12]:
target.columns=['HousePrice']

In [13]:
target.head()

Unnamed: 0,HousePrice
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [14]:
data.dtypes

MedInc        float64
HouseAge      float64
AveRooms      float64
AveBedrms     float64
Population    float64
AveOccup      float64
Latitude      float64
Longitude     float64
dtype: object

In [15]:
print(data.isnull().sum())

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64


In [16]:
target.dtypes

HousePrice    float64
dtype: object

In [17]:
df_standardized = (data - data.mean()) / data.std()

In [18]:
df_standardized.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,2.344709,0.982119,0.628544,-0.153754,-0.974405,-0.049595,1.052523,-1.327803
1,2.332181,-0.607004,0.327033,-0.263329,0.861418,-0.09251,1.043159,-1.322812
2,1.782656,1.856137,1.155592,-0.049015,-0.820757,-0.025842,1.038478,-1.332794
3,0.932945,1.856137,0.156962,-0.049832,-0.76601,-0.050328,1.038478,-1.337785
4,-0.012881,1.856137,0.344702,-0.032905,-0.759828,-0.085614,1.038478,-1.337785


In [19]:
target_standardized = (target - target.mean()) / target.std()

In [20]:
# Splitting features (X) and target (y)

X = df_standardized
y = target_standardized

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

In [24]:
# Split the data into training and testing sets (80% train, 20% test)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Initialize models
lr_model = LinearRegression()  # Initialize Linear Regression model
gb_model = GradientBoostingRegressor()  # Initialize Gradient Boosting model

# Train the Linear Regression model
lr_model.fit(X_train, y_train)

# Train the Gradient Boosting model
gb_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor()

In [30]:
# Predict with Linear Regression
y_pred_lr = lr_model.predict(X_test)

# Predict with Gradient Boosting
y_pred_gb = gb_model.predict(X_test)

In [31]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [32]:
# Calculate evaluation metrics

def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mae, mse, r2

In [33]:
# Evaluate both models

lr_mae, lr_mse, lr_r2 = evaluate_model(y_test, y_pred_lr)
gb_mae, gb_mse, gb_r2 = evaluate_model(y_test, y_pred_gb)

In [39]:
print(f"Result for Linear Regression :- \n Mean Absolute Error : {lr_mae}, \n Mean Squared Error : {lr_mse}, \n R^2 : {lr_r2}")
print("\n")
print(f"Result for Gradient Boosting :- \n Mean Absolute Error : {gb_mae}, \n Mean Squared Error : {gb_mse}, \n R^2 : {gb_r2}")

Result for Linear Regression :- 
 Mean Absolute Error : 0.4620627278214268, 
 Mean Squared Error : 0.4174567539270494, 
 R^2 : 0.575787706032451


Result for Gradient Boosting :- 
 Mean Absolute Error : 0.3220663503272389, 
 Mean Squared Error : 0.22078382489078877, 
 R^2 : 0.7756433164710084
