# Import Libraries

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import pickle


# Load Dataset

In [3]:
# Load the raw housing data. The path is relative, so the CSV must be in the
# notebook's working directory.
data = pd.read_csv("housing_price_dataset.csv")

In [4]:
# Work on a copy so the freshly loaded `data` frame is not modified by later steps.
df = data.copy()

# EDA

In [5]:
# Peek at the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


In [6]:
# Column dtypes and non-null counts; the recorded output shows no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SquareFeet    50000 non-null  int64  
 1   Bedrooms      50000 non-null  int64  
 2   Bathrooms     50000 non-null  int64  
 3   Neighborhood  50000 non-null  object 
 4   YearBuilt     50000 non-null  int64  
 5   Price         50000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 2.3+ MB


In [14]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output reports a negative minimum Price
# (about -36588). No cell in this notebook filters such rows before training;
# confirm whether they are valid data or should be removed.
df.describe()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,YearBuilt,Price
count,50000.0,50000.0,50000.0,50000.0,50000.0
mean,2006.37468,3.4987,1.99542,1985.40442,224827.325151
std,575.513241,1.116326,0.815851,20.719377,76141.842966
min,1000.0,2.0,1.0,1950.0,-36588.165397
25%,1513.0,3.0,1.0,1967.0,169955.860225
50%,2007.0,3.0,2.0,1985.0,225052.141166
75%,2506.0,4.0,3.0,2003.0,279373.630052
max,2999.0,5.0,3.0,2021.0,492195.259972


In [7]:
# List the column names available for feature/target selection.
df.columns

Index(['SquareFeet', 'Bedrooms', 'Bathrooms', 'Neighborhood', 'YearBuilt',
       'Price'],
      dtype='object')

In [8]:
# Frequency of each bedroom count; the recorded output shows four values (2-5),
# roughly evenly represented.
df['Bedrooms'].value_counts()

3    12661
5    12468
2    12436
4    12435
Name: Bedrooms, dtype: int64

In [10]:
# Frequency of each bathroom count; the recorded output shows three values (1-3),
# roughly balanced.
df['Bathrooms'].value_counts()

1    16755
2    16719
3    16526
Name: Bathrooms, dtype: int64

In [11]:
# Frequency of each category in the only object-typed column; three levels
# appear in the recorded output.
df['Neighborhood'].value_counts()

Suburb    16721
Rural     16676
Urban     16603
Name: Neighborhood, dtype: int64

In [12]:
# Number of distinct YearBuilt values present in the data.
df['YearBuilt'].nunique()

72

# Model Development

In [17]:
# Re-display the raw frame as the starting point for feature preparation.
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


In [16]:
# One-hot encoder with default settings; its transformed output is converted
# with .toarray() further down.
# NOTE(review): every category gets its own indicator column, so the indicator
# columns always sum to 1. Combined with LinearRegression's intercept this makes
# the features linearly dependent -- consider drop='first' if coefficients
# need to be interpreted.
encoder = OneHotEncoder()

In [18]:
# Re-check the category levels that are about to be one-hot encoded.
df['Neighborhood'].value_counts()

Suburb    16721
Rural     16676
Urban     16603
Name: Neighborhood, dtype: int64

In [20]:
# Learn the category levels and encode the column in one step. The double
# brackets pass a one-column DataFrame (2-D input) rather than a Series.
# Note that the encoder is fitted on the full dataset, before the train/test split.
encoded = encoder.fit_transform(df[['Neighborhood']])

In [25]:
# Wrap the one-hot matrix in a DataFrame with readable column names.
# Reusing df's index keeps the rows aligned when the two frames are later
# concatenated column-wise; a fresh 0..n-1 index would only line up while df
# still carries its default RangeIndex (e.g. not after filtering rows).
encoded_df = pd.DataFrame(
    encoded.toarray(),
    columns=encoder.get_feature_names_out(['Neighborhood']),
    index=df.index,
)

In [27]:
# Sanity-check the encoded frame: one indicator column per category.
encoded_df.head()

Unnamed: 0,Neighborhood_Rural,Neighborhood_Suburb,Neighborhood_Urban
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0


In [28]:
# Append the indicator columns to the right of the original columns;
# rows are matched on the index.
df_encoded = pd.concat(
    objs=[df, encoded_df],
    axis="columns",
)

In [29]:
# Inspect the combined frame; at this point the original 'Neighborhood' column
# is still present alongside its indicator columns.
df_encoded.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price,Neighborhood_Rural,Neighborhood_Suburb,Neighborhood_Urban
0,2126,4,1,Rural,1969,215355.283618,1.0,0.0,0.0
1,2459,3,2,Rural,1980,195014.221626,1.0,0.0,0.0
2,1860,2,1,Suburb,1970,306891.012076,0.0,1.0,0.0
3,2294,2,1,Urban,1996,206786.787153,0.0,0.0,1.0
4,2130,5,2,Suburb,2001,272436.239065,0.0,1.0,0.0


In [30]:
# Remove the original string column now that its indicator columns exist.
# Reassigning instead of using inplace=True avoids mutating the frame in place,
# and errors='ignore' lets the cell be re-run after the column is already gone
# (the in-place version raised KeyError on a second run).
df_encoded = df_encoded.drop(columns='Neighborhood', errors='ignore')

In [31]:
# Confirm the string column is gone and only numeric columns remain.
df_encoded.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,YearBuilt,Price,Neighborhood_Rural,Neighborhood_Suburb,Neighborhood_Urban
0,2126,4,1,1969,215355.283618,1.0,0.0,0.0
1,2459,3,2,1980,195014.221626,1.0,0.0,0.0
2,1860,2,1,1970,306891.012076,0.0,1.0,0.0
3,2294,2,1,1996,206786.787153,0.0,0.0,1.0
4,2130,5,2,2001,272436.239065,0.0,1.0,0.0


In [38]:
# Verify that every remaining column has a numeric dtype before modelling.
df_encoded.dtypes

SquareFeet               int64
Bedrooms                 int64
Bathrooms                int64
YearBuilt                int64
Price                  float64
Neighborhood_Rural     float64
Neighborhood_Suburb    float64
Neighborhood_Urban     float64
dtype: object

In [39]:
# Features: every column except the target.
X = df_encoded.drop(columns='Price')
# Target: the 'Price' column.
y = df_encoded['Price']

In [43]:
# Check that features and target have the same number of rows.
X.shape, y.shape

((50000, 7), (50000,))

In [45]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Linear Regression

In [47]:
# Linear-regression baseline with default settings, fitted on the training split only.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [48]:
# Predict prices for the held-out test rows.
y_pred_lr = lr.predict(X_test)

In [51]:
# Show the predictions (the array repr is abbreviated in the output).
y_pred_lr

array([218329.49032129, 135025.03293153, 255260.82445864, ...,
       310675.23692328, 200904.11512863, 244037.84596718])

In [53]:
# First few true prices from the test split, for a rough visual comparison
# with the predictions shown above.
y_test.head()

33553    170835.035713
9427     126913.469998
199      246611.883092
12447    244250.462969
39489    271127.650112
Name: Price, dtype: float64

In [58]:
# Test-set mean squared error of the linear model (in squared price units).
mse_lr = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)
print(f"mean squared error: {mse_lr}")

mean squared error: 2436249371.3072467


In [60]:
# Test-set coefficient of determination (R^2) of the linear model.
r2_lr = r2_score(y_true=y_test, y_pred=y_pred_lr)
print(f"r2 score: {r2_lr}")

r2 score: 0.5755628630306235


### XGBoost

In [72]:
# Gradient-boosted tree regressor with library-default hyperparameters.
# The seed mirrors the one used for the train/test split so results stay
# repeatable if row or column subsampling is enabled later.
xg_reg = xgb.XGBRegressor(random_state=42)

In [73]:
# Train the XGBoost model on the same training split as the linear model.
xg_reg.fit(X_train, y_train)

In [74]:
# Predict on the same held-out rows that were used to evaluate the linear model.
y_pred_xg = xg_reg.predict(X_test)

In [75]:
# Show the predictions (float32 in the recorded output; the repr is abbreviated).
y_pred_xg

array([218676.47, 145316.75, 258469.19, ..., 324257.5 , 204946.3 ,
       241604.53], dtype=float32)

In [76]:
# Same true test prices as displayed earlier, repeated for comparison with the
# XGBoost predictions above.
y_test.head()

33553    170835.035713
9427     126913.469998
199      246611.883092
12447    244250.462969
39489    271127.650112
Name: Price, dtype: float64

In [77]:
# Test-set mean squared error of the XGBoost model (in squared price units).
mse_xg = mean_squared_error(y_true=y_test, y_pred=y_pred_xg)
print(f"mean squared error: {mse_xg}")

mean squared error: 2552340690.85289


In [78]:
# Test-set coefficient of determination (R^2) of the XGBoost model.
r2_xg = r2_score(y_true=y_test, y_pred=y_pred_xg)
print(f"r2 score: {r2_xg}")

r2 score: 0.5553377301377166


# Saving the Model

In [80]:
# Persist the fitted linear model (the better of the two on the recorded test metrics).
with open('housing_price_pred_model.pkl', 'wb') as f:
    pickle.dump(lr, f)

# The model was trained on one-hot encoded 'Neighborhood' columns, so the fitted
# encoder is saved as well; without it, raw inputs cannot be transformed the
# same way at prediction time.
with open('housing_price_encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)