# House Price Prediction Project

In this project, we aim to build a predictive model to estimate house prices based on various features such as location, size, number of bedrooms, and other relevant attributes. Accurate house price prediction is valuable for buyers, sellers, and real estate professionals to make informed decisions. We will explore the dataset, perform data preprocessing, conduct exploratory data analysis, and apply machine learning algorithms to develop a robust prediction model. The workflow will include data cleaning, feature engineering, model selection, evaluation, and interpretation of results.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
# Read the housing CSV into a DataFrame and preview its first rows
dataset_path = '../datasets/housing_price_dataset.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


In [5]:
# Print each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SquareFeet    50000 non-null  int64  
 1   Bedrooms      50000 non-null  int64  
 2   Bathrooms     50000 non-null  int64  
 3   Neighborhood  50000 non-null  object 
 4   YearBuilt     50000 non-null  int64  
 5   Price         50000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 2.3+ MB


In [7]:
# Show the column labels of the DataFrame
df.columns

Index(['SquareFeet', 'Bedrooms', 'Bathrooms', 'Neighborhood', 'YearBuilt',
       'Price'],
      dtype='object')

In [8]:
# Count the missing values in each column
df.isna().sum()

SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64

In [9]:
# Count how many rows belong to each Neighborhood category
df['Neighborhood'].value_counts()

Neighborhood
Suburb    16721
Rural     16676
Urban     16603
Name: count, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column groups handled by the preprocessor
categorical_features = ['Neighborhood']
numerical_features = ['SquareFeet', 'Bedrooms', 'Bathrooms', 'YearBuilt']

# One-hot encode the categorical column; numeric columns pass through unchanged.
# handle_unknown='ignore' encodes a category unseen during fit as all zeros
# instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ]
)

# Separate the features from the target
x = df.drop('Price', axis=1)
y = df['Price']

# Split BEFORE fitting the preprocessor so that nothing is learned from the
# held-out rows (avoids train/test leakage).
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Fit the preprocessor on the training rows only, then apply it to the test rows
x_train = preprocessor.fit_transform(x_train_raw)
x_test = preprocessor.transform(x_test_raw)

# Encoded version of the full dataset, kept under its original name in case
# another cell still refers to it.
x_processed = preprocessor.transform(x)

In [18]:
# Training function
def train_model(model: dict, x_train: np.ndarray, y_train: pd.Series) -> dict:
    """
    Fit every estimator in ``model`` on the same training data.

    Args:
        model: Mapping of a model name to an unfitted estimator object that
            exposes ``fit(x, y)``.
        x_train: Training feature matrix, passed unchanged to each ``fit``.
        y_train: Training targets, passed unchanged to each ``fit``.

    Returns:
        The same dict object that was passed in. The estimators are fitted
        in place, so the caller's dict and the return value are identical.
    """
    for model_instance in model.values():
        model_instance.fit(x_train, y_train)
    return model

# Evaluation function
def evaluate_model(model: dict, x_test: np.ndarray, y_test: pd.Series) -> dict:
    """
    Score every fitted estimator in ``model`` on the same test data.

    Args:
        model: Mapping of a model name to a fitted estimator object that
            exposes ``predict(x)``.
        x_test: Test feature matrix, passed unchanged to each ``predict``.
        y_test: True target values the predictions are compared against.

    Returns:
        A dict keyed by model name. Each value is a dict with the keys
        ``'MAE'``, ``'MSE'``, ``'RMSE'`` and ``'R2'``, all built-in floats.
    """
    metrics = {}
    for name, model_instance in model.items():
        y_pred = model_instance.predict(x_test)
        # Compute MSE once; RMSE is derived from it below.
        mse = float(mean_squared_error(y_test, y_pred))
        # Every metric goes through float() so the result holds one
        # consistent scalar type (previously only RMSE was converted).
        metrics[name] = {
            'MAE': float(mean_absolute_error(y_test, y_pred)),
            'MSE': mse,
            'RMSE': float(np.sqrt(mse)),
            'R2': float(r2_score(y_test, y_pred)),
        }
    return metrics
        

In [13]:
# Import the models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# Define the models
# NOTE(review): the numeric features are passed through unscaled by the
# preprocessor, and SVR is sensitive to feature scale - consider adding a
# scaler in front of it before reading much into its score.
models = {
    'lr': LinearRegression(),
    # Seeded so the tree is reproducible across runs, like the seeded split
    'dt': DecisionTreeRegressor(random_state=42),
    'svr': SVR()
}

# Train the models (fitted in place; train_model returns the same dict)
trained_models = train_model(models, x_train, y_train)

In [19]:
# Evaluate the models
# Score every trained model on the held-out test split. The bare expression
# on the last line makes the notebook display the resulting metrics dict.
evaluation_metrics = evaluate_model(trained_models, x_test, y_test)
evaluation_metrics

{'lr': {'MAE': 39430.16533829791,
  'MSE': 2436249371.3072467,
  'RMSE': 49358.376911191546,
  'R2': 0.5755628630306235},
 'dt': {'MAE': 57810.949359091006,
  'MSE': 5219313230.096428,
  'RMSE': 72244.81455506982,
  'R2': 0.09070459271587494},
 'svr': {'MAE': 60220.8876894738,
  'MSE': 5475024929.261311,
  'RMSE': 73993.41139088878,
  'R2': 0.04615515423828587}}