Use a dataset to predict house prices (input: features like size, location; output: price). 
Implement and compare linear regression and decision tree regressors.

In [1]:
import sys
import shutil
import kagglehub
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()

True

In [3]:
# Locate the project root (two directory levels above the notebook's working
# directory) and put it at the front of sys.path so project modules import.
cwd = Path.cwd()
project_root = cwd.parent.parent
root_entry = str(project_root)
already_importable = root_entry in sys.path
if not already_importable:
    sys.path.insert(0, root_entry)
    print("Done")

Done


In [4]:
data_path = project_root / "data"

In [5]:
# One-time setup (left commented out): download the Kaggle dataset with kagglehub
# and copy every downloaded file into data_path. Uncomment to refresh the data.
# NOTE(review): shutil.copy2 presumably needs data_path to exist already — confirm.
# cache_path = kagglehub.dataset_download("juhibhojani/house-price")

# for file in Path(cache_path).iterdir():
#     shutil.copy2(file, data_path/file.name)

In [6]:
# Load the raw listings CSV. The file is decoded as Latin-1 and the placeholder
# strings below are converted to NaN while reading.
# NOTE(review): sep and thousands are both ","; with a comma delimiter the
# thousands setting can presumably only matter inside quoted fields — confirm.
path = data_path.joinpath("house_prices.csv")
missing_tokens = ['NA', 'N/A', 'null', 'NULL', '', ' ', 'None']
read_options = {
    "encoding": 'latin-1',
    "sep": ",",
    "thousands": ',',
    "na_values": missing_tokens,
}
data = pd.read_csv(filepath_or_buffer=str(path), **read_options)

In [7]:
data.shape

(187531, 21)

In [8]:
data.head()

Unnamed: 0,Index,Title,Description,Amount(in rupees),Price (in rupees),location,Carpet Area,Status,Floor,Transaction,...,facing,overlooking,Society,Bathroom,Balcony,Car Parking,Ownership,Super Area,Dimensions,Plot Area
0,0,1 BHK Ready to Occupy Flat for sale in Srushti...,"Bhiwandi, Thane has an attractive 1 BHK Flat f...",42 Lac,6000.0,thane,500 sqft,Ready to Move,10 out of 11,Resale,...,,,Srushti Siddhi Mangal Murti Complex,1,2.0,,,,,
1,1,2 BHK Ready to Occupy Flat for sale in Dosti V...,One can find this stunning 2 BHK flat for sale...,98 Lac,13799.0,thane,473 sqft,Ready to Move,3 out of 22,Resale,...,East,Garden/Park,Dosti Vihar,2,,1 Open,Freehold,,,
2,2,2 BHK Ready to Occupy Flat for sale in Sunrise...,Up for immediate sale is a 2 BHK apartment in ...,1.40 Cr,17500.0,thane,779 sqft,Ready to Move,10 out of 29,Resale,...,East,Garden/Park,Sunrise by Kalpataru,2,,1 Covered,Freehold,,,
3,3,1 BHK Ready to Occupy Flat for sale Kasheli,This beautiful 1 BHK Flat is available for sal...,25 Lac,,thane,530 sqft,Ready to Move,1 out of 3,Resale,...,,,,1,1.0,,,,,
4,4,2 BHK Ready to Occupy Flat for sale in TenX Ha...,"This lovely 2 BHK Flat in Pokhran Road, Thane ...",1.60 Cr,18824.0,thane,635 sqft,Ready to Move,20 out of 42,Resale,...,West,"Garden/Park, Main Road",TenX Habitat Raymond Realty,2,,1 Covered,Co-operative Society,,,


In [9]:
data.columns

Index(['Index', 'Title', 'Description', 'Amount(in rupees)',
       'Price (in rupees)', 'location', 'Carpet Area', 'Status', 'Floor',
       'Transaction', 'Furnishing', 'facing', 'overlooking', 'Society',
       'Bathroom', 'Balcony', 'Car Parking', 'Ownership', 'Super Area',
       'Dimensions', 'Plot Area'],
      dtype='object')

In [10]:
data.isna().all()

Index                False
Title                False
Description          False
Amount(in rupees)    False
Price (in rupees)    False
location             False
Carpet Area          False
Status               False
Floor                False
Transaction          False
Furnishing           False
facing               False
overlooking          False
Society              False
Bathroom             False
Balcony              False
Car Parking          False
Ownership            False
Super Area           False
Dimensions            True
Plot Area             True
dtype: bool

In [11]:
data.isna().any()

Index                False
Title                False
Description           True
Amount(in rupees)    False
Price (in rupees)     True
location             False
Carpet Area           True
Status                True
Floor                 True
Transaction           True
Furnishing            True
facing                True
overlooking           True
Society               True
Bathroom              True
Balcony               True
Car Parking           True
Ownership             True
Super Area            True
Dimensions            True
Plot Area             True
dtype: bool

In [12]:
data.isna().sum()

Index                     0
Title                     0
Description            3023
Amount(in rupees)         0
Price (in rupees)     17665
location                  0
Carpet Area           80673
Status                  615
Floor                  7077
Transaction              83
Furnishing             2897
facing                70233
overlooking           81436
Society              109678
Bathroom                828
Balcony               48935
Car Parking          103357
Ownership             65517
Super Area           107685
Dimensions           187531
Plot Area            187531
dtype: int64

In [13]:
(data.isna().sum()/data.shape[0])*100

Index                  0.000000
Title                  0.000000
Description            1.612000
Amount(in rupees)      0.000000
Price (in rupees)      9.419776
location               0.000000
Carpet Area           43.018488
Status                 0.327946
Floor                  3.773776
Transaction            0.044259
Furnishing             1.544811
facing                37.451408
overlooking           43.425354
Society               58.485264
Bathroom               0.441527
Balcony               26.094352
Car Parking           55.114621
Ownership             34.936624
Super Area            57.422506
Dimensions           100.000000
Plot Area            100.000000
dtype: float64

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187531 entries, 0 to 187530
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Index              187531 non-null  int64  
 1   Title              187531 non-null  object 
 2   Description        184508 non-null  object 
 3   Amount(in rupees)  187531 non-null  object 
 4   Price (in rupees)  169866 non-null  float64
 5   location           187531 non-null  object 
 6   Carpet Area        106858 non-null  object 
 7   Status             186916 non-null  object 
 8   Floor              180454 non-null  object 
 9   Transaction        187448 non-null  object 
 10  Furnishing         184634 non-null  object 
 11  facing             117298 non-null  object 
 12  overlooking        106095 non-null  object 
 13  Society            77853 non-null   object 
 14  Bathroom           186703 non-null  object 
 15  Balcony            138596 non-null  object 
 16  Ca

In [15]:
def _strip_if_text(value):
    """Return ``value`` without surrounding whitespace when it is a str; otherwise return it unchanged."""
    if isinstance(value, str):
        return value.strip()
    return value


# Apply the helper to every cell of the frame (element-wise).
data = data.map(_strip_if_text)

In [16]:
# Normalise the column labels: lower-case them and replace spaces with underscores.
# NOTE(review): the trailing .str.replace('[^a-z0-9_]', '') has no effect — the
# labels shown by the next cell still contain "(" and ")". In pandas 2.x
# str.replace matches the pattern literally unless regex=True is passed.
# Later cells refer to 'amount(in_rupees)' and 'price_(in_rupees)', so turning
# regex matching on would also require renaming those references.
data.columns = data.columns.str.lower().str.replace(" ", "_").str.replace('[^a-z0-9_]', '')

In [17]:
data.columns

Index(['index', 'title', 'description', 'amount(in_rupees)',
       'price_(in_rupees)', 'location', 'carpet_area', 'status', 'floor',
       'transaction', 'furnishing', 'facing', 'overlooking', 'society',
       'bathroom', 'balcony', 'car_parking', 'ownership', 'super_area',
       'dimensions', 'plot_area'],
      dtype='object')

In [18]:
data.isnull().sum()

index                     0
title                     0
description            3023
amount(in_rupees)         0
price_(in_rupees)     17665
location                  0
carpet_area           80673
status                  615
floor                  7077
transaction              83
furnishing             2897
facing                70233
overlooking           81436
society              109678
bathroom                828
balcony               48935
car_parking          103357
ownership             65517
super_area           107685
dimensions           187531
plot_area            187531
dtype: int64

In [19]:
data.columns

Index(['index', 'title', 'description', 'amount(in_rupees)',
       'price_(in_rupees)', 'location', 'carpet_area', 'status', 'floor',
       'transaction', 'furnishing', 'facing', 'overlooking', 'society',
       'bathroom', 'balcony', 'car_parking', 'ownership', 'super_area',
       'dimensions', 'plot_area'],
      dtype='object')

First, I think I should remove the columns that are not useful: `title` and `description`, plus `dimensions` and `plot_area`, because those last two are filled entirely with NaNs.

In [20]:
# Remove the title/description text columns and the two columns that hold no
# data at all (dimensions, plot_area).
# The result is reassigned instead of using inplace=True, and errors="ignore"
# is passed, so re-running this cell after the columns are already gone does
# not raise KeyError.
data = data.drop(
    columns=['title', 'description', 'dimensions', 'plot_area'],
    errors="ignore",
)

In [21]:
data.shape

(187531, 17)

Since we are predicting house prices, I will drop all the rows that have a missing price.

In [22]:
# Keep only the listings whose price_(in_rupees) is present.
# NOTE(review): a later cell drops price_(in_rupees) and treats
# amount(in_rupees) as the target. amount(in_rupees) has no missing values, so
# this filter discards 17,665 rows that still carry a target — confirm that
# this is intended.
data = data.dropna(subset=["price_(in_rupees)"])

In [23]:
data.shape

(169866, 17)

In [24]:
# Percentage of missing values per column, measured after the rows without a
# price were removed. This is a snapshot: it is not recomputed when columns are
# dropped from `data` in later cells.
missing_per = data.isna().mean().mul(100)

Checking the categorical columns

In [25]:
column_types = data.dtypes

In [26]:
column_types

index                  int64
amount(in_rupees)     object
price_(in_rupees)    float64
location              object
carpet_area           object
status                object
floor                 object
transaction           object
furnishing            object
facing                object
overlooking           object
society               object
bathroom              object
balcony               object
car_parking           object
ownership             object
super_area            object
dtype: object

In [27]:
# location: distinct values (81 in the recorded output).
u_location = pd.Series(pd.unique(data["location"]))
u_location

0             thane
1       navi-mumbai
2            nagpur
3            mumbai
4         ahmedabad
          ...      
76         varanasi
77       vijayawada
78    visakhapatnam
79        vrindavan
80         zirakpur
Length: 81, dtype: object

In [28]:
# status: distinct values (the recorded output shows only "Ready to Move" and NaN).
u_status = pd.Series(pd.unique(data["status"]))
u_status

0    Ready to Move
1              NaN
dtype: object

In [29]:
data.columns

Index(['index', 'amount(in_rupees)', 'price_(in_rupees)', 'location',
       'carpet_area', 'status', 'floor', 'transaction', 'furnishing', 'facing',
       'overlooking', 'society', 'bathroom', 'balcony', 'car_parking',
       'ownership', 'super_area'],
      dtype='object')

In [30]:
missing_per["status"].item()

0.3502760999846938

Since the `status` column takes only one value, and that value is present in over 99% of the rows, it has almost no variance, so I don't think the model will learn anything useful from it. So, I am dropping it.

In [31]:
# Remove the near-constant status column.
# Reassigning (rather than inplace=True) with errors="ignore" keeps the cell
# safe to re-run: a second execution no longer raises KeyError.
data = data.drop(columns=["status"], errors="ignore")

In [32]:
# furnishing: distinct values (three furnishing levels plus NaN in the recorded output).
u_furnishing = pd.Series(pd.unique(data['furnishing']))
u_furnishing

0       Unfurnished
1    Semi-Furnished
2         Furnished
3               NaN
dtype: object

In [33]:
missing_per["furnishing"].item()

1.2127206150730576

In [34]:
# facing: distinct values. The recorded output lists eight compass labels plus
# NaN; "South -West" is spaced differently from the other two-word labels.
u_facing = pd.Series(pd.unique(data["facing"]))
u_facing

0             NaN
1            East
2            West
3    North - East
4           North
5    North - West
6           South
7     South -West
8    South - East
dtype: object

In [35]:
missing_per["facing"].item()

36.52997068277348

In [36]:
# overlooking: distinct values. Each entry is a comma-separated combination of
# Garden/Park, Main Road and Pool in varying order; some entries also contain
# the token "Not Available".
u_overlooking = pd.Series(pd.unique(data["overlooking"]))
u_overlooking

0                                NaN
1                        Garden/Park
2             Garden/Park, Main Road
3                          Main Road
4       Pool, Garden/Park, Main Road
5       Garden/Park, Pool, Main Road
6                  Garden/Park, Pool
7             Main Road, Garden/Park
8       Main Road, Garden/Park, Pool
9                  Pool, Garden/Park
10      Garden/Park, Main Road, Pool
11                              Pool
12                   Pool, Main Road
13      Main Road, Pool, Garden/Park
14      Pool, Main Road, Garden/Park
15          Main Road, Not Available
16                   Main Road, Pool
17        Garden/Park, Not Available
18    Pool, Main Road, Not Available
dtype: object

This one looks very interesting. It seems like this feature can be broken into multiple features.

In [37]:
missing_per["overlooking"].item()

41.589252705073406

In [38]:
# ownership: distinct values (four ownership types plus NaN in the recorded output).
u_ownership = pd.Series(pd.unique(data["ownership"]))
u_ownership

0                     NaN
1                Freehold
2    Co-operative Society
3       Power Of Attorney
4               Leasehold
dtype: object

In [39]:
missing_per["ownership"].item()

34.28408274757751

In [40]:
# super_area: distinct values. Entries are "<number> <unit>" strings; both sqft
# and sqyrd units appear in the recorded output.
u_super_area = pd.Series(pd.unique(data["super_area"]))
u_super_area

0             NaN
1        680 sqft
2        575 sqft
3        600 sqft
4       1165 sqft
          ...    
2893    147 sqyrd
2894    217 sqyrd
2895    2066 sqft
2896     406 sqft
2897    2332 sqft
Length: 2898, dtype: object

In [41]:
# data[['super_area_value', 'super_area_unit']] = data['super_area'].str.extract(r'(\d+\.?\d*)\s+([a-zA-Z]+)')

In [42]:
# data[['super_area_value', 'super_area_unit']]

It seems like this column has some messy data that has to be fixed; its datatype also has to be changed.

In [43]:
missing_per["super_area"].item()

55.120506752381296

In [44]:
# car_parking: distinct values. Entries look like "<count> Open" or
# "<count> Covered"; the recorded output also shows a value with a trailing
# comma ("11 Covered,") and counts as large as 702.
u_car_parking = pd.Series(pd.unique(data["car_parking"]))
u_car_parking

0              NaN
1           1 Open
2        1 Covered
3        2 Covered
4       66 Covered
          ...     
201    205 Covered
202       300 Open
203    11 Covered,
204       123 Open
205       702 Open
Length: 206, dtype: object

I think this column can also be broken down into better features before it is fed to the model.

In [45]:
missing_per["car_parking"].item()

53.54220385480319

In [46]:
# society: distinct values (9,697 different names in the recorded output).
u_society = pd.Series(pd.unique(data['society']))
u_society

0       Srushti Siddhi Mangal Murti Complex
1                               Dosti Vihar
2                      Sunrise by Kalpataru
3               TenX Habitat Raymond Realty
4                              Virat Aangan
                       ...                 
9692             Sushma Chandigarh Infinium
9693                     Silver City Greens
9694                Sushma Crescent Phase 2
9695                  Nirmaan Royale Empire
9696                   Jaivee Radha Enclave
Length: 9697, dtype: object

In [47]:
missing_per["society"].item()

57.358741596317095

In [48]:
# bathroom: distinct values. Besides the counts 1-10 the recorded output
# contains the non-numeric label "> 10" and NaN.
u_bathroom = pd.Series(pd.unique(data["bathroom"]))
u_bathroom

0        1
1        2
2        3
3        4
4        6
5      NaN
6        5
7        9
8        8
9     > 10
10       7
11      10
dtype: object

In [49]:
missing_per["bathroom"].item()

0.4438792930898473

In [50]:
# balcony: distinct values. Besides the counts 1-10 the recorded output
# contains the non-numeric label "> 10" and NaN.
u_balcony = pd.Series(pd.unique(data["balcony"]))
u_balcony

0        2
1      NaN
2        1
3        3
4        4
5        6
6        5
7        7
8     > 10
9       10
10       8
11       9
dtype: object

In [51]:
missing_per["balcony"].item()

26.527969105059285

In [52]:
# transaction: distinct values (Resale, New Property, Other, Rent/Lease and NaN
# in the recorded output).
u_transaction = pd.Series(pd.unique(data["transaction"]))
u_transaction

0          Resale
1    New Property
2           Other
3      Rent/Lease
4             NaN
dtype: object

In [53]:
missing_per["transaction"].item()

0.0035321959662322064

In [63]:
# carpet_area: distinct values. Entries are "<number> <unit>" strings; both
# sqft and sqyrd units appear in the recorded output.
u_carpet_area = pd.Series(pd.unique(data["carpet_area"]))
u_carpet_area

0        500 sqft
1        473 sqft
2        779 sqft
3        635 sqft
4             NaN
          ...    
2586    1797 sqft
2587    1634 sqft
2588    1709 sqft
2589    164 sqyrd
2590     136 sqft
Length: 2591, dtype: object

In [64]:
missing_per["carpet_area"].item()

44.87949324761871

In [54]:
data["amount(in_rupees)"]

0          42 Lac
1          98 Lac
2         1.40 Cr
4         1.60 Cr
5          45 Lac
           ...   
187526     63 Lac
187527     55 Lac
187528     76 Lac
187529     30 Lac
187530    1.18 Cr
Name: amount(in_rupees), Length: 169866, dtype: object

In [55]:
missing_per["amount(in_rupees)"].item()

0.0

In [56]:
data["price_(in_rupees)"].unique()

array([ 6000., 13799., 17500., ...,  2873.,  2663.,  2508.],
      shape=(10958,))

In [58]:
missing_per["price_(in_rupees)"].item()

0.0

I think we should remove 'price_(in_rupees)' because we will be predicting the total amount, and using price_(in_rupees) to predict the total amount is like cheating.

In [59]:
# Remove price_(in_rupees) so that it cannot be used as a feature when
# predicting amount(in_rupees).
# Reassigning (rather than inplace=True) with errors="ignore" keeps the cell
# safe to re-run: a second execution no longer raises KeyError.
data = data.drop(columns=["price_(in_rupees)"], errors="ignore")

##### OK, so far we have done some data analysis and some data cleaning. Now we will do some more data cleaning and model building.

In [60]:
data.columns

Index(['index', 'amount(in_rupees)', 'location', 'carpet_area', 'floor',
       'transaction', 'furnishing', 'facing', 'overlooking', 'society',
       'bathroom', 'balcony', 'car_parking', 'ownership', 'super_area'],
      dtype='object')

I have a bit of confusion between carpet_area and super_area

In [61]:
u_super_area

0             NaN
1        680 sqft
2        575 sqft
3        600 sqft
4       1165 sqft
          ...    
2893    147 sqyrd
2894    217 sqyrd
2895    2066 sqft
2896     406 sqft
2897    2332 sqft
Length: 2898, dtype: object

In [65]:
u_carpet_area

0        500 sqft
1        473 sqft
2        779 sqft
3        635 sqft
4             NaN
          ...    
2586    1797 sqft
2587    1634 sqft
2588    1709 sqft
2589    164 sqyrd
2590     136 sqft
Length: 2591, dtype: object

So, carpet_area and super_area must actually be quite highly correlated, and since I want to predict the actual market value of the house I will keep super_area and drop carpet_area, but only after checking their correlation. First, let's do some more data cleaning.

Let's first fix carpet_area and super_area

In [None]:
# data[['super_area_value', 'super_area_unit']] = data['super_area'].str.extract(r'(\d+\.?\d*)\s+([a-zA-Z]+)')

In [76]:
# Index labels of the rows whose carpet_area is present.
idxs = data["carpet_area"].dropna().index

In [70]:
data["carpet_area"].head()

0    500 sqft
1    473 sqft
2    779 sqft
4    635 sqft
5         NaN
Name: carpet_area, dtype: object

In [69]:
missing_per["carpet_area"].item()

44.87949324761871