In [None]:
# Import the necessary Python libraries

In [64]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Location of the housing-price CSV (read directly from GitHub over HTTPS)
DATA_URL = "https://raw.githubusercontent.com/kennedyuche/linear-regression/main/housing_price_dataset.csv"

# Load the dataset into a DataFrame
df = pd.read_csv(DATA_URL)

# Preview the first rows of the dataset
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


In [3]:
# Explore the dataset: dimensions as (number of rows, number of columns)
df.shape

(50000, 6)

In [5]:
# Explore the dataset: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SquareFeet    50000 non-null  int64  
 1   Bedrooms      50000 non-null  int64  
 2   Bathrooms     50000 non-null  int64  
 3   Neighborhood  50000 non-null  object 
 4   YearBuilt     50000 non-null  int64  
 5   Price         50000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 2.3+ MB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the displayed output reports a negative minimum Price — worth
# checking for invalid rows before modelling.
df.describe()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,YearBuilt,Price
count,50000.0,50000.0,50000.0,50000.0,50000.0
mean,2006.37468,3.4987,1.99542,1985.40442,224827.325151
std,575.513241,1.116326,0.815851,20.719377,76141.842966
min,1000.0,2.0,1.0,1950.0,-36588.165397
25%,1513.0,3.0,1.0,1967.0,169955.860225
50%,2007.0,3.0,2.0,1985.0,225052.141166
75%,2506.0,4.0,3.0,2003.0,279373.630052
max,2999.0,5.0,3.0,2021.0,492195.259972


In [11]:
# Check for missing data: number of null values in each column
df.isnull().sum()

SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64

In [None]:
# Handle missing data (practice only: the null check reports no missing Bedrooms values).
# Remove every row whose Bedrooms entry is null.
required_columns = ["Bedrooms"]
df = df.dropna(axis=0, how="any", subset=required_columns)

In [19]:
# Handle missing data (practice only: the null check reports no missing Bathrooms values).
# Fill any null Bathrooms entry with the column mean, rounded to one decimal place.
bathrooms_mean = df["Bathrooms"].mean()
mean_value = round(bathrooms_mean, 1)

df["Bathrooms"] = df["Bathrooms"].fillna(value=mean_value)

In [23]:
# Encode the categorical Neighborhood column as integer codes.
# NOTE(review): integer codes put an ordering on what looks like a nominal
# category; one-hot encoding may suit a linear model better — confirm before
# relying on the coefficients.
label_encoder = LabelEncoder()
neighborhood_codes = label_encoder.fit_transform(df["Neighborhood"])

df["Neighborhood"] = neighborhood_codes
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,0,1969,215355.283618
1,2459,3,2,0,1980,195014.221626
2,1860,2,1,1,1970,306891.012076
3,2294,2,1,2,1996,206786.787153
4,2130,5,2,1,2001,272436.239065


In [27]:
# Separate the target (Price) from the features.
# YearBuilt is left out of the feature set together with the target column.
y = df["Price"]
x = df.drop(columns=["Price", "YearBuilt"])

In [29]:
# Display the feature matrix (all columns except Price and YearBuilt)
x

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood
0,2126,4,1,0
1,2459,3,2,0
2,1860,2,1,1
3,2294,2,1,2
4,2130,5,2,1
...,...,...,...,...
49995,1282,5,3,0
49996,2854,2,2,1
49997,2979,5,3,1
49998,2596,5,2,0


In [36]:
# Split features and target into train and test sets:
# 20% of the rows are held out for testing, and the fixed random_state
# makes the shuffle reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [38]:
# Create a linear regression estimator and fit it on the training split
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [None]:
# MODEL EVALUATION

In [42]:
# Display the held-out feature rows used for evaluation
x_test

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood
33553,1894,5,1,0
9427,1001,5,3,1
199,2264,4,3,1
12447,2299,5,1,1
39489,2651,2,1,1
...,...,...,...,...
28567,2005,3,3,2
25079,1725,4,2,1
18707,2885,3,2,2
15200,1674,5,2,2


In [40]:
# Use the fitted model to predict prices for the test-set features,
# then display the resulting array.
y_pred = model.predict(X=x_test)
y_pred

array([217838.91793321, 135758.35530229, 256005.99550325, ...,
       310213.01289175, 200362.96188532, 243624.01808175])

In [44]:
# Display the actual prices of the held-out rows, for comparison with the predictions
y_test

33553    170835.035713
9427     126913.469998
199      246611.883092
12447    244250.462969
39489    271127.650112
             ...      
28567    199265.817701
25079    241869.621812
18707    352184.123976
15200    244830.805238
5857     246512.284597
Name: Price, Length: 10000, dtype: float64

In [59]:
# Evaluate the trained model on the test set by comparing the actual
# values (y_test) with the predicted values (y_pred).
# MSE is in squared price units; r2 is the coefficient of determination.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics rounded to two decimal places
print(f"MSE=> {round(mse, 2)}")
print(f"r2=> {round(r2, 2)}")

MSE=> 2437198334.24
r2=> 0.58


In [None]:
# TRAINED MODEL EXPORT
# We export the trained model so that it can be loaded later to run inference.

In [None]:
# Export the trained model to disk with pickle so it can be reloaded later.
# NOTE(review): the extension is ".pk1" (digit one), which looks like a typo
# for ".pkl"; left unchanged because other code may load the file by this name.
model_filename = "./trained_model/reg_model.pk1"

# open(..., "wb") does not create missing directories, so make sure the
# target folder exists; otherwise a fresh checkout fails with FileNotFoundError.
Path(model_filename).parent.mkdir(parents=True, exist_ok=True)

with open(model_filename, "wb") as model_file:
    pickle.dump(model, model_file)