# House price prediction

## Dataset
`https://www.kaggle.com/datasets/muhammadbinimran/housing-price-prediction-data`

In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [44]:
# Read the Kaggle housing CSV into a DataFrame. The path is relative, so it is
# resolved against the kernel's working directory; adjust it if the file lives
# somewhere else.
csv_path = 'housing_price_dataset.csv'
data = pd.read_csv(csv_path)

data

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065
...,...,...,...,...,...,...
49995,1282,5,3,Rural,1975,100080.865895
49996,2854,2,2,Suburb,1988,374507.656727
49997,2979,5,3,Suburb,1962,384110.555590
49998,2596,5,2,Rural,1984,380512.685957


## Data cleaning

In [45]:
# Count missing entries per column; all zeros means there is nothing to impute.
print(data.isna().sum())

# Separate the target column (Price) from the predictor columns.
y = data['Price']
x = data.drop(columns='Price')

SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64


## Data preprocessing

In [46]:
# One-hot encode the only categorical feature, `Neighborhood`.
# Its labels in this dataset are Rural, Suburb and Urban; they are printed first
# so the generated `Neighborhood_<label>` column names are easy to anticipate.
categorical_columns = ['Neighborhood']
neighborhood_labels = x['Neighborhood'].unique()
print("Available values", neighborhood_labels)

x = pd.get_dummies(x, columns=categorical_columns)
x

Available values ['Rural' 'Suburb' 'Urban']


Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,YearBuilt,Neighborhood_Rural,Neighborhood_Suburb,Neighborhood_Urban
0,2126,4,1,1969,True,False,False
1,2459,3,2,1980,True,False,False
2,1860,2,1,1970,False,True,False
3,2294,2,1,1996,False,False,True
4,2130,5,2,2001,False,True,False
...,...,...,...,...,...,...,...
49995,1282,5,3,1975,True,False,False
49996,2854,2,2,1988,False,True,False
49997,2979,5,3,1962,False,True,False
49998,2596,5,2,1984,True,False,False


In [47]:
# Split into training and test sets (80% train / 20% test).
# A fixed random_state makes the shuffle, and therefore the fitted model and
# every metric computed from this split, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

x_train, x_test, y_train, y_test

(       SquareFeet  Bedrooms  Bathrooms  YearBuilt  Neighborhood_Rural  \
 13866        2673         5          1       2010               False   
 49351        2557         3          3       2004               False   
 37694        1477         2          3       1954               False   
 11067        1664         4          1       2019               False   
 30288        1456         5          2       1995                True   
 ...           ...       ...        ...        ...                 ...   
 29285        1933         4          3       2006               False   
 26441        2246         4          2       1990               False   
 31772        1633         2          2       1976               False   
 36293        2716         3          1       2007               False   
 41963        1878         3          1       2009               False   
 
        Neighborhood_Suburb  Neighborhood_Urban  
 13866                False                True  
 49351    

## Model building via linear regression

In [48]:
# Fit an ordinary least-squares regressor on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(x_train, y_train)

# Predict prices for the held-out rows.
predictions = model.predict(x_test)

## Model evaluation

In [49]:
# Model evaluation: compare the held-out targets with the model's predictions.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
# RMSE is derived from the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` keyword was deprecated
# in scikit-learn 1.4 and scheduled for removal in 1.6, so the square root is
# taken explicitly to keep this cell working across versions.
rmse = np.sqrt(mse)
r_squared = r2_score(y_test, predictions)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r_squared}")

Mean Absolute Error: 39392.53044495386
Mean Squared Error: 2449683664.080558
Root Mean Squared Error: 49494.27910456478
R-squared: 0.5656972895979919


In [50]:
# Smoke-test the fitted model on the first row of the dataset.
# Selecting with a list (`iloc[[0]]`) keeps a one-row DataFrame, so predict()
# receives the same column names the model was fitted with; passing a bare
# NumPy array instead makes scikit-learn warn about missing feature names.
# NOTE: this row is taken from the full feature frame and may be part of the
# training split, so this is a sanity check rather than an evaluation.
sample = x.iloc[[0]]
print(f"Sample: {sample.to_numpy()}")

# Predict price for sample.
model.predict(sample)

Sample: [[2126 4 1 1969 True False False]]




array([236638.5897624])

## Model building via support vector regression

In [51]:
# NOTE(review): disabled SVR experiment, kept for reference. Before re-enabling:
#   - x_test is passed to predict() unscaled although the model is fit on
#     X_train_scaled; it would need scaler_X.transform(x_test).
#   - the model is fit on standardized prices (y_train_scaled), so its
#     predictions would need scaler_y.inverse_transform(...) before they can be
#     compared with y_test.
#   - `model` and `predictions` would overwrite the linear-regression results.
# # Train model. Use Support Vector Regression.
# model = SVR()

# # Feature scaling - important for SVR
# scaler_X = StandardScaler()
# scaler_y = StandardScaler()
# X_train_scaled = scaler_X.fit_transform(x_train)
# y_train_scaled = scaler_y.fit_transform(np.array(y_train).reshape(-1, 1)).ravel()

# model.fit(X_train_scaled, y_train_scaled)

# # Make predictions.
# predictions  = model.predict(x_test)

## Model evaluation

In [52]:
# NOTE(review): disabled evaluation for the SVR experiment. It reads the
# `predictions` variable, so enabling it while the SVR training code is still
# commented out would only re-score the linear-regression predictions.
# The `squared=False` keyword used below was deprecated in scikit-learn 1.4;
# take the square root of the MSE instead if this is re-enabled.
# # Model evaluation
# mae = mean_absolute_error(y_test, predictions)
# mse = mean_squared_error(y_test, predictions)
# rmse = mean_squared_error(y_test, predictions, squared=False)
# r_squared = r2_score(y_test, predictions)

# print(f"Mean Absolute Error: {mae}")
# print(f"Mean Squared Error: {mse}")
# print(f"Root Mean Squared Error: {rmse}")
# print(f"R-squared: {r_squared}")