# Import Libraries 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split




In [2]:
data=pd.read_csv("housing_price_dataset.csv")
data.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


In [3]:
data.tail()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
49995,1282,5,3,Rural,1975,100080.865895
49996,2854,2,2,Suburb,1988,374507.656727
49997,2979,5,3,Suburb,1962,384110.55559
49998,2596,5,2,Rural,1984,380512.685957
49999,1572,5,3,Rural,2011,221618.583218


In [4]:
df=data.sample(frac=0.25)


In [5]:
df.shape

(12500, 6)

# Data Preprocessing

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12500 entries, 24176 to 42594
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SquareFeet    12500 non-null  int64  
 1   Bedrooms      12500 non-null  int64  
 2   Bathrooms     12500 non-null  int64  
 3   Neighborhood  12500 non-null  object 
 4   YearBuilt     12500 non-null  int64  
 5   Price         12500 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 683.6+ KB


In [7]:
df.shape

(12500, 6)

In [8]:
df.describe()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,YearBuilt,Price
count,12500.0,12500.0,12500.0,12500.0,12500.0
mean,2005.93856,3.48296,2.00104,1985.16272,224952.038026
std,574.217217,1.115585,0.81661,20.788153,76063.620917
min,1000.0,2.0,1.0,1950.0,-28774.998022
25%,1512.0,2.0,1.0,1967.0,171363.882616
50%,2013.0,3.0,2.0,1985.0,224668.830061
75%,2499.0,4.0,3.0,2003.0,278848.602738
max,2999.0,5.0,3.0,2021.0,476671.733263


In [9]:
df.corr()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,YearBuilt,Price
SquareFeet,1.0,-0.015414,-0.005962,0.003361,0.7507
Bedrooms,-0.015414,1.0,0.014905,0.000409,0.065673
Bathrooms,-0.005962,0.014905,1.0,0.00327,0.026751
YearBuilt,0.003361,0.000409,0.00327,1.0,-0.001794
Price,0.7507,0.065673,0.026751,-0.001794,1.0


In [10]:
df.isnull().sum()

SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64

In [11]:
df.isna().sum()

SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64

# Regression Model 

In [12]:
X=df.drop(["Neighborhood","Price"],axis=1).values
y=df["Price"].values

In [13]:
print(X)

[[2735    5    1 1953]
 [2564    4    1 1959]
 [1412    2    2 1966]
 ...
 [1544    2    2 1955]
 [1878    3    1 1985]
 [2499    3    3 1976]]


In [14]:
print(y)

[256587.11233009 267047.9885181  170685.99402431 ... 284363.07711943
 187819.15424556 197798.05537676]


In [15]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Adding a column of ones for the intercept term
X_train = np.hstack((np.ones_like(X_train), X_train))
X_test = np.hstack((np.ones_like(X_test), X_test))


In [17]:
print(X_train)

[[   1    1    1 ...    5    3 1995]
 [   1    1    1 ...    2    1 2020]
 [   1    1    1 ...    3    1 2011]
 ...
 [   1    1    1 ...    4    2 2021]
 [   1    1    1 ...    2    2 1978]
 [   1    1    1 ...    2    3 1969]]


In [18]:
def prediction(X, weights):
    return np.dot(X, weights)

In [19]:
def compute_cost(X, y, weights):
    m = len(y)
    error = prediction(X, weights) - y
    cost = np.dot(error.T, error)/(2*m)
    return cost

In [20]:
def gradient_descent(X, y,learning_rate, epochs):
    print(f"Shape of X = {X.shape}")
    print(f"Shape of y = {y.shape}")
    m,n=X.shape
    weights=np.zeros(n)
    costs=[]
    for i in range(epochs):
        predictions = prediction(X, weights)
        error = predictions - y
        print(error.shape)
        print(error)
        gradient = np.dot(X.T, error) / m
        weights-=learning_rate * gradient
        cost=compute_cost(X, y,weights)
        costs.append(cost)
    
    return weights,costs

In [21]:
# Train the model
learning_rate = 1e-6
epochs = 40
weights,costs=gradient_descent(X_train, y_train, learning_rate, epochs)
#print(costs)



Shape of X = (10000, 8)
Shape of y = (10000,)
(10000,)
[-413360.73515668 -270577.4789176  -219575.47064866 ... -197521.82717092
 -288741.17238119 -152558.92367444]
(10000,)
[1818198.94095342 1595855.01929583 1830740.96037699 ... 2045169.05949655
 1963866.28286131 1338096.29804742]
(10000,)
[-14111689.3283089  -11764235.01193068 -12823915.1361188  ...
 -13965852.18388234 -14112535.16706264  -9369238.624813  ]
(10000,)
[9.98787816e+07 8.38064789e+07 9.20256478e+07 ... 1.00603776e+08
 1.00929498e+08 6.71939427e+07]
(10000,)
[-7.15577346e+08 -5.99905814e+08 -6.58052130e+08 ... -7.18996669e+08
 -7.22046567e+08 -4.80564817e+08]
(10000,)
[5.11816449e+09 4.29132382e+09 4.70796442e+09 ... 5.14439245e+09
 5.16549484e+09 3.43804074e+09]
(10000,)
[-3.66160362e+10 -3.07002257e+10 -3.36801700e+10 ... -3.68019038e+10
 -3.69535825e+10 -2.45954358e+10]
(10000,)
[2.61947795e+11 2.19627091e+11 2.40946108e+11 ... 2.63279245e+11
 2.64363634e+11 1.75954254e+11]
(10000,)
[-1.87395885e+12 -1.57119861e+12 -1.7

In [22]:
# Predict on the test set
y_pred_test = np.dot(X_test, weights)

# Calculate regression metrics

mse = np.mean((y_test - y_pred_test)**2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(y_test - y_pred_test))
total_variance = np.sum((y_test - np.mean(y_test))**2)
r_squared = 1 - (mse / total_variance)


In [23]:
# Print regression metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r_squared}")

Mean Squared Error (MSE): 1.2188671802485943e+79
Root Mean Squared Error (RMSE): 3.491227835946251e+39
Mean Absolute Error (MAE): 3.452765012294533e+39
R-squared: -8.669356350189882e+65
