# House price prediction

## Dataste
`https://www.kaggle.com/datasets/muhammadbinimran/housing-price-prediction-data`

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
data = pd.read_csv('housing_price_dataset.csv')  # Replace with actual filename

data

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065
...,...,...,...,...,...,...
49995,1282,5,3,Rural,1975,100080.865895
49996,2854,2,2,Suburb,1988,374507.656727
49997,2979,5,3,Suburb,1962,384110.555590
49998,2596,5,2,Rural,1984,380512.685957


## Data cleaning

In [3]:
# Check for missing values
print(data.isnull().sum())

# Split into features and target.
X = data.drop('Price', axis=1)
y = data['Price']

SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64


## Data preprocessing

In [4]:
# Data preprocessing.
# Convert categorical variables to dummy variables.
# Neighborhood is a categorical variable.
# Neighborhood: Rural, Suburban, Urban
# Print available values for Neighborhood
print("Available values",X['Neighborhood'].unique())
X = pd.get_dummies(X, columns=['Neighborhood'],)
X

Available values ['Rural' 'Suburb' 'Urban']


Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,YearBuilt,Neighborhood_Rural,Neighborhood_Suburb,Neighborhood_Urban
0,2126,4,1,1969,True,False,False
1,2459,3,2,1980,True,False,False
2,1860,2,1,1970,False,True,False
3,2294,2,1,1996,False,False,True
4,2130,5,2,2001,False,True,False
...,...,...,...,...,...,...,...
49995,1282,5,3,1975,True,False,False
49996,2854,2,2,1988,False,True,False
49997,2979,5,3,1962,False,True,False
49998,2596,5,2,1984,True,False,False


In [5]:
# Normalize data.
scaler = StandardScaler()
x = scaler.fit_transform(X)

# Split into training and test sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

x_train, x_test, y_train, y_test

(array([[-1.33338783,  0.44906695,  1.23134057, ..., -0.70740379,
          1.41076284, -0.70508188],
        [-1.46718279,  0.44906695, -1.22011292, ..., -0.70740379,
         -0.70883636,  1.41827499],
        [ 0.83687064,  1.34487176,  0.00561383, ..., -0.70740379,
          1.41076284, -0.70508188],
        ...,
        [-0.29430491, -0.44673786,  0.00561383, ...,  1.4136198 ,
         -0.70883636, -0.70508188],
        [-0.18136112, -0.44673786, -1.22011292, ..., -0.70740379,
         -0.70883636,  1.41827499],
        [ 1.40680241,  1.34487176,  0.00561383, ...,  1.4136198 ,
         -0.70883636, -0.70508188]]),
 array([[ 1.17048924, -1.34254267,  0.00561383, ..., -0.70740379,
          1.41076284, -0.70508188],
        [ 0.27215166, -0.44673786,  0.00561383, ...,  1.4136198 ,
         -0.70883636, -0.70508188],
        [-0.1900491 , -0.44673786,  1.23134057, ..., -0.70740379,
          1.41076284, -0.70508188],
        ...,
        [-1.07101071,  1.34487176,  1.23134057, ..., -

## Model building via linear regression

In [6]:
# Train model.
model = LinearRegression()

model.fit(x_train, y_train)

# Make predictions.
predictions  = model.predict(x_test)

## Model evaluation

In [7]:
# Model evaluation
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = mean_squared_error(y_test, predictions, squared=False)
r_squared = r2_score(y_test, predictions)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r_squared}")

Mean Absolute Error: 40186.23247696573
Mean Squared Error: 2523323666.006507
Root Mean Squared Error: 50232.69518955266
R-squared: 0.5644461178158379


In [8]:
# Test with the first row of the dataset.
sample = X.iloc[0].values.reshape(1, -1)
print(f"Sample: {sample}")

# Predict price for sample.
model.predict(sample)

Sample: [[2126 4 1 1969 True False False]]


array([1.04037464e+17])

## Model building via random forest

In [9]:
# Build Random Forests model.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(
    random_state=42, n_estimators=100,
    max_depth=5, n_jobs=-1,
)
model.fit(x_train, y_train)

# Make predictions.
predictions = model.predict(x_test)

## Model evaluation

In [10]:
# Model evaluation
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = mean_squared_error(y_test, predictions, squared=False)
r_squared = r2_score(y_test, predictions)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r_squared}")

Mean Absolute Error: 40296.31565804884
Mean Squared Error: 2536289167.039078
Root Mean Squared Error: 50361.58423877349
R-squared: 0.5622081273490277


## Model building via support vector regression

In [11]:
# # Train model. Use Support Vector Regression.
# model = SVR()

# # Feature scaling - important for SVR
# scaler_X = StandardScaler()
# scaler_y = StandardScaler()
# X_train_scaled = scaler_X.fit_transform(x_train)
# y_train_scaled = scaler_y.fit_transform(np.array(y_train).reshape(-1, 1)).ravel()

# model.fit(X_train_scaled, y_train_scaled)

# # Make predictions.
# predictions  = model.predict(x_test)

## Model evaluation

In [12]:
# # Model evaluation
# mae = mean_absolute_error(y_test, predictions)
# mse = mean_squared_error(y_test, predictions)
# rmse = mean_squared_error(y_test, predictions, squared=False)
# r_squared = r2_score(y_test, predictions)

# print(f"Mean Absolute Error: {mae}")
# print(f"Mean Squared Error: {mse}")
# print(f"Root Mean Squared Error: {rmse}")
# print(f"R-squared: {r_squared}")