In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Data
df = pd.read_csv("/home/inventor/Datasets/California House Price/Cal_house/housing.csv")

In [3]:
df.shape

(20640, 10)

In [4]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [38]:
# Missing Values 
df["total_bedrooms"].fillna(df["total_bedrooms"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["total_bedrooms"].fillna(df["total_bedrooms"].mean(), inplace=True)


In [7]:
df.duplicated().sum()

np.int64(0)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [9]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [10]:
df["housing_median_age"] = df["housing_median_age"].astype("int64")
df["total_rooms"] = df["total_rooms"].astype("int64")
df["total_bedrooms"] = df["total_bedrooms"].astype("int64")
df["population"] = df["population"].astype("int64")
df["households"] = df["households"].astype("int64")
df["median_house_value"] = df["median_house_value"].astype("int64")

In [11]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280,565,259,3.8462,342200,NEAR BAY


In [12]:
num_cols = [col for col in df.columns if df[col].dtype in ["float64", "int64"]]
cat_cols = [col for col in df.columns if df[col].dtype not in ["float64", "int64"]]

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df[num_cols] = scaler.fit_transform(df[num_cols])

df[num_cols]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-1.327835,1.052548,0.982143,-0.804819,-0.975207,-0.974429,-0.977033,2.344766,2.129631
1,-1.322844,1.043185,-0.607019,2.045890,1.355109,0.861439,1.669961,2.332238,1.314156
2,-1.332827,1.038503,1.856182,-0.535746,-0.829711,-0.820777,-0.843637,1.782699,1.258693
3,-1.337818,1.038503,1.856182,-0.624215,-0.722378,-0.766028,-0.733781,0.932968,1.165100
4,-1.337818,1.038503,1.856182,-0.462404,-0.615046,-0.759847,-0.629157,-0.012881,1.172900
...,...,...,...,...,...,...,...,...,...
20635,-0.758826,1.801647,-0.289187,-0.444985,-0.390839,-0.512592,-0.443449,-1.216128,-1.115804
20636,-0.818722,1.806329,-0.845393,-0.888704,-0.925118,-0.944405,-1.008420,-0.691593,-1.124470
20637,-0.823713,1.778237,-0.924851,-0.174995,-0.126085,-0.369537,-0.174042,-1.142593,-0.992746
20638,-0.873626,1.778237,-0.845393,-0.355600,-0.307358,-0.604429,-0.393753,-1.054583,-1.058608


In [14]:
# Encode the Categorical Columns 
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="first")

encoder = one_hot.fit_transform(df[cat_cols])

encoded_df = pd.DataFrame(encoder, columns=one_hot.get_feature_names_out(cat_cols))

encoded_df = pd.concat([df, encoded_df], axis=1)

In [15]:
encoded_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327835,1.052548,0.982143,-0.804819,-0.975207,-0.974429,-0.977033,2.344766,2.129631,NEAR BAY,0.0,0.0,1.0,0.0
1,-1.322844,1.043185,-0.607019,2.04589,1.355109,0.861439,1.669961,2.332238,1.314156,NEAR BAY,0.0,0.0,1.0,0.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.829711,-0.820777,-0.843637,1.782699,1.258693,NEAR BAY,0.0,0.0,1.0,0.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.722378,-0.766028,-0.733781,0.932968,1.1651,NEAR BAY,0.0,0.0,1.0,0.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.615046,-0.759847,-0.629157,-0.012881,1.1729,NEAR BAY,0.0,0.0,1.0,0.0


In [16]:
encoded_df = encoded_df.drop("ocean_proximity", axis=1)

In [17]:
encoded_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327835,1.052548,0.982143,-0.804819,-0.975207,-0.974429,-0.977033,2.344766,2.129631,0.0,0.0,1.0,0.0
1,-1.322844,1.043185,-0.607019,2.045890,1.355109,0.861439,1.669961,2.332238,1.314156,0.0,0.0,1.0,0.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.829711,-0.820777,-0.843637,1.782699,1.258693,0.0,0.0,1.0,0.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.722378,-0.766028,-0.733781,0.932968,1.165100,0.0,0.0,1.0,0.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.615046,-0.759847,-0.629157,-0.012881,1.172900,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758826,1.801647,-0.289187,-0.444985,-0.390839,-0.512592,-0.443449,-1.216128,-1.115804,1.0,0.0,0.0,0.0
20636,-0.818722,1.806329,-0.845393,-0.888704,-0.925118,-0.944405,-1.008420,-0.691593,-1.124470,1.0,0.0,0.0,0.0
20637,-0.823713,1.778237,-0.924851,-0.174995,-0.126085,-0.369537,-0.174042,-1.142593,-0.992746,1.0,0.0,0.0,0.0
20638,-0.873626,1.778237,-0.845393,-0.355600,-0.307358,-0.604429,-0.393753,-1.054583,-1.058608,1.0,0.0,0.0,0.0


In [18]:
encoded_df["ocean_proximity_INLAND"] = encoded_df["ocean_proximity_INLAND"].astype("int64")
encoded_df["ocean_proximity_ISLAND"] = encoded_df["ocean_proximity_ISLAND"].astype("int64")
encoded_df["ocean_proximity_NEAR BAY"] = encoded_df["ocean_proximity_NEAR BAY"].astype("int64")
encoded_df["ocean_proximity_NEAR OCEAN"] = encoded_df["ocean_proximity_NEAR OCEAN"].astype("int64")

In [19]:
encoded_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327835,1.052548,0.982143,-0.804819,-0.975207,-0.974429,-0.977033,2.344766,2.129631,0,0,1,0
1,-1.322844,1.043185,-0.607019,2.04589,1.355109,0.861439,1.669961,2.332238,1.314156,0,0,1,0
2,-1.332827,1.038503,1.856182,-0.535746,-0.829711,-0.820777,-0.843637,1.782699,1.258693,0,0,1,0
3,-1.337818,1.038503,1.856182,-0.624215,-0.722378,-0.766028,-0.733781,0.932968,1.1651,0,0,1,0
4,-1.337818,1.038503,1.856182,-0.462404,-0.615046,-0.759847,-0.629157,-0.012881,1.1729,0,0,1,0


In [20]:
# Split the train and test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(encoded_df.drop("median_house_value", axis=1), encoded_df["median_house_value"], test_size=0.2, random_state=42)

In [21]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [22]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_sk = lr.predict(X_test)

In [23]:
lr.intercept_, lr.coef_

(np.float64(0.10934835222418349),
 array([-0.46597379, -0.47141468,  0.12020919, -0.11383975,  0.37346445,
        -0.37461883,  0.15987144,  0.64988184, -0.34479317,  1.17966674,
        -0.0445144 ,  0.02973443]))

In [24]:
from sklearn.metrics import r2_score

r2_score_sk = r2_score(y_test, y_pred_sk)
r2_score_sk

0.6257361394054031

In [25]:
class Schostic_Gradient_Descent():

    def __init__(self, learning_rate, epochs):
        self.coef_ = None
        # self.intercept_ = None
        self.lr = learning_rate
        self.epochs = epochs
    
    def fit(self, X_train, y_train):
        X_train = np.insert(X_train, 0,1,axis=1)

        self.coef_ = np.random.randn(X_train.shape[1]) * 0.01

        for i in range(self.epochs):

            for j in range(X_train.shape[0]):

                idx = np.random.randint(0, X_train.shape[0])
               

                y_hat = np.dot(X_train[idx], self.coef_)
                

                error = y_train[idx] - y_hat
               

                coef_slope = np.dot(error, X_train[idx])

                self.coef_ = self.coef_ + (self.lr * coef_slope)
                
        print(f'Coef_: {self.coef_}')

    def predict(self, X_test):
        X_test = np.insert(X_test, 0,1, axis=1)
        y_pred = np.dot(X_test, self.coef_)
        return y_pred

In [26]:
sgd_c = Schostic_Gradient_Descent(learning_rate=0.0001, epochs=100)

In [27]:
sgd_c.fit(X_train, y_train)

Coef_: [ 0.10890816 -0.46589904 -0.46612849  0.12766356 -0.10195028  0.34590841
 -0.36912563  0.17257051  0.63979673 -0.35358968  0.04268675 -0.04068101
  0.0277037 ]


In [28]:
y_pred = sgd_c.predict(X_test)

In [29]:
def r2_score_custom(y_true, y_pred):

    mean_value = np.mean(y_true)

    SSE = np.sum((y_true - y_pred) ** 2)
    TSS = np.sum((y_true - mean_value) ** 2)

    r2_score = 1 - (SSE / TSS)
    return r2_score

In [30]:
r2_score_c = r2_score_custom(y_test, y_pred)

r2_score_c

np.float64(0.626059656232309)

In [31]:
r2_score_custom(y_test, y_pred_sk)

np.float64(0.6257361394054031)

In [32]:
class Schostic_Gradient_Descent_2():

    def __init__(self, learning_rate, epochs):
        self.coef_ = None
        self.intercept_ = None
        self.lr = learning_rate
        self.epochs = epochs
    
    def fit(self, X_train, y_train):
        X_train = np.insert(X_train, 0,1, axis=1)
        self.coef_ = np.random.randn(X_train.shape[1]) * 0.01
        self.intercept_ = 0

        for i in range(self.epochs):
            
            for j in range(X_train.shape[0]):

                idx = np.random.randint(X_train.shape[0])

                y_hat = self.intercept_ + np.dot(X_train[idx], self.coef_)

                intercept_slope = y_train[idx] - y_hat
                self.intercept_ = self.intercept_ + (self.lr * intercept_slope)

                coef_slope = np.dot((y_train[idx] - y_hat), X_train[idx])
                self.coef_ = self.coef_ + (self.lr * coef_slope)

        print(f"Intercept_: {self.intercept_}, Coef: {self.coef_}")

    def predict(self, X_test):
        X_test = np.insert(X_test, 0,1, axis=1)
        y_pred = self.intercept_ + np.dot(X_test, self.coef_)
        return y_pred

In [33]:
sgd = Schostic_Gradient_Descent_2(learning_rate=0.001, epochs=100)

In [34]:
sgd.fit(X_train, y_train)

Intercept_: 0.032907999803613294, Coef: [ 0.04513433 -0.46653804 -0.46855968  0.12732723 -0.10584225  0.36314909
 -0.41312625  0.17917993  0.6520099  -0.33328609  0.41674228 -0.02738624
  0.02090065]


In [36]:
y_pred_2 = sgd.predict(X_test)

In [37]:
r2_score_custom(y_test, y_pred_2)

np.float64(0.6237033198548576)