In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the housing dataset from a CSV file given by a relative path.
csv_path = "housing_price_dataset.csv"
df = pd.read_csv(csv_path)

# Last expression of the cell: display the first rows of the frame.
df.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5


In [4]:
# Structural overview of the loaded frame: column labels, per-column
# missing-value counts, then dtypes and non-null counts.
#
# A bare `df.isna().sum()` used to sit at the top of this cell. Because it was
# not the last expression, its result was never displayed, and it repeated the
# count printed below, so it has been removed.
print(df.columns)
print("\nMissing Values:\n", df.isnull().sum())

# info() writes its report straight to stdout and returns None.
df.info()

Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price'],
      dtype='object')

Missing Values:
 Avg. Area Income                0
Avg. Area House Age             0
Avg. Area Number of Rooms       0
Avg. Area Number of Bedrooms    0
Area Population                 0
Price                           0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
dtypes: float64(6)

In [6]:

# Predictor columns fed to the model, in the order they appear in X.
feature_columns = [
    "Avg. Area Income",
    "Avg. Area House Age",
    "Avg. Area Number of Rooms",
    "Avg. Area Number of Bedrooms",
    "Area Population",
]
# Column the model is trained to predict.
target_column = "Price"

X = df[feature_columns]
y = df[target_column]

# Sanity check: both should report the same number of rows.
for label, data in (("X shape:", X), ("y shape:", y)):
    print(label, data.shape)


X shape: (5000, 5)
y shape: (5000,)


In [7]:
# Fit a linear regression of y on all columns of X.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)

# Report the fitted intercept, then one coefficient per feature column.
print("Intercept (a):", model.intercept_)
print("\nCoefficients (b1 to b5):")
for feature_name, weight in zip(X.columns, model.coef_):
    print(f"{feature_name}: {weight}")

Intercept (a): -2637299.033331735

Coefficients (b1 to b5):
Avg. Area Income: 21.57804944849758
Avg. Area House Age: 165637.02694119443
Avg. Area Number of Rooms: 120659.94881589068
Avg. Area Number of Bedrooms: 1651.1390534472102
Area Population: 15.200743923699065


In [8]:
# Predict prices for every row of X.
# NOTE(review): X is the same data the model was fitted on, so these are
# in-sample predictions.
y_pred = model.predict(X)

# Side-by-side preview of the first rows: actual vs. predicted price.
preview_rows = 10
comparison = pd.DataFrame({
    "Actual Price": y.iloc[:preview_rows],
    "Predicted Price": y_pred[:preview_rows],
})
comparison

Unnamed: 0,Actual Price,Predicted Price
0,1059034.0,1223847.0
1,1505891.0,1494938.0
2,1058988.0,1253017.0
3,1260617.0,1121224.0
4,630943.5,845388.8
5,1068138.0,1068839.0
6,1502056.0,1670160.0
7,1573937.0,1569962.0
8,798869.5,765891.1
9,1545155.0,1468258.0


In [9]:
# Score the predictions against the actual prices.
# NOTE(review): y_pred was produced from the data the model was fitted on, so
# both numbers are in-sample scores, not estimates of performance on new data.
mse, r2 = mean_squared_error(y, y_pred), r2_score(y, y_pred)

print("Mean Squared Error:", mse)
print("R² Score:", r2)

Mean Squared Error: 10219734313.031612
R² Score: 0.9180238195119546


In [10]:
# NOTE(review): this cell recomputes the same two metrics on the same inputs
# as the preceding evaluation cell; consider deleting one of them.
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)

# Print each metric as "<label> <value>".
metric_report = {"Mean Squared Error:": mse, "R² Score:": r2}
for label, value in metric_report.items():
    print(label, value)

Mean Squared Error: 10219734313.031612
R² Score: 0.9180238195119546


In [11]:
# Example inputs for new predictions (CHANGE these as needed).
# Each inner list is one row; values follow the order of `new_columns`.
new_columns = [
    "Avg. Area Income",
    "Avg. Area House Age",
    "Avg. Area Number of Rooms",
    "Avg. Area Number of Bedrooms",
    "Area Population",
]
new_rows = [
    [50000, 5, 6, 3, 30000],
    [75000, 10, 8, 4, 45000],
]
X_new = pd.DataFrame(new_rows, columns=new_columns)

# Predict a price for each example row with the already-fitted model.
y_new_pred = model.predict(X_new)

# Show each input row (as a plain list) next to its predicted price.
pd.DataFrame({
    "Input Data": X_new.values.tolist(),
    "Predicted Price": y_new_pred,
})

Unnamed: 0,Input Data,Predicted Price
0,"[50000, 5, 6, 3, 30000]",454724.0
1,"[75000, 10, 8, 4, 45000]",2293343.0
