In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the raw listings (path is relative to the working directory) and preview them.
df = pd.read_csv(filepath_or_buffer='bengaluru_house_prices.csv')
df.head(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
# Size of the raw frame as (rows, columns).
df.shape

(13320, 9)

<h1 style="color:blue; font-weight:bold;"> Data Cleaning </h1>

Somewhat arbitrarily, I choose to remove the columns 'availability' and 'society', which I do not consider relevant for our prediction problem.

In [4]:
# Work on a new frame without the two columns judged irrelevant; `df` is left untouched.
df1 = df.drop(columns=['availability', 'society'])
df1.head(5)

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [5]:
# Number of missing values in each remaining column.
df1.isna().sum()

area_type       0
location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

Since the dataset is big enough, I choose to get rid of the NaN values.

In [6]:
# Discard every listing that has at least one missing field, then verify none remain.
df2 = df1.dropna(axis='index', how='any')
df2.isna().sum()

area_type     0
location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

Now, we will study some columns which may contain outliers or other wrong features.

In [7]:
# List the distinct raw 'size' labels to see how room counts are written.
df2['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '3 Bedroom', '1 BHK', '1 RK',
       '4 BHK', '1 Bedroom', '2 Bedroom', '6 Bedroom', '8 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '11 Bedroom', '43 Bedroom',
       '14 BHK', '8 BHK', '12 Bedroom', '10 Bedroom', '13 BHK'],
      dtype=object)

Here we can notice that the bedroom counting is not homogeneous. For instance, a house with 3 bedrooms can be written '3 BHK' or '3 Bedroom'. We need to fix this in order to allow the predictions.

In [8]:
def extract_nb_bedrooms(size : str) -> int :
    """Return the room count written at the start of a 'size' label.

    The dataset spells the same information in several ways ('3 BHK',
    '3 Bedroom', '1 RK'); only the leading number is kept.

    Parameters
    ----------
    size : str
        Raw label whose first space-separated token is the count.

    Returns
    -------
    int
        The leading count, as an integer.

    Raises
    ------
    ValueError
        If the first token is not an integer literal.
    """
    # Convert explicitly: the split token is a string, and returning it as-is
    # would store the feature as text instead of a number.
    return int(size.split(' ')[0])

In [9]:
# New frame = df2 plus a 'size_formated' column holding the extracted room count.
df3 = df2.assign(size_formated=df2['size'].apply(extract_nb_bedrooms))

In [10]:
# Preview only: the result is not assigned, so df3 itself still contains 'size'.
df3.drop('size', axis = 'columns')

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,size_formated
0,Super built-up Area,Electronic City Phase II,1056,2.0,1.0,39.07,2
1,Plot Area,Chikka Tirupathi,2600,5.0,3.0,120.00,4
2,Built-up Area,Uttarahalli,1440,2.0,3.0,62.00,3
3,Super built-up Area,Lingadheeranahalli,1521,3.0,1.0,95.00,3
4,Super built-up Area,Kothanur,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...
13314,Super built-up Area,Green Glen Layout,1715,3.0,3.0,112.00,3
13315,Built-up Area,Whitefield,3453,4.0,0.0,231.00,5
13317,Built-up Area,Raja Rajeshwari Nagar,1141,2.0,1.0,60.00,2
13318,Super built-up Area,Padmanabhanagar,4689,4.0,1.0,488.00,4


In [38]:
# Inspect the raw surface column; an object dtype means it is not stored as numbers.
df3['total_sqft']

0        1056
1        2600
2        1440
3        1521
4        1200
         ... 
13314    1715
13315    3453
13317    1141
13318    4689
13319     550
Name: total_sqft, Length: 12710, dtype: object

Some houses are listed with a range for the surface area instead of a single value, or with a different unit. We will replace these ranges with their mean value and convert the other units to square feet.

In [39]:
import re

In [40]:
# Cast every entry to str so that format_surf_area can apply string operations to it.
# Note: this overwrites the column of df3 in place.
df3['total_sqft'] = df3['total_sqft'].apply(str)
# Square feet per unit, for the non-sqft units handled by format_surf_area.
_SQFT_PER_UNIT = {
    'Sq. Meter': 10.764,  # 1 m² = 10.764 sqft
    'Perch': 272.3,       # 1 perch = 272.3 sqft
    'Acres': 43560,       # 1 acre = 43560 sqft
    'Guntha': 1089,       # 1 guntha = 1089 sqft
    'Sq. Yards': 9,       # 1 sq. yard = 9 sqft
    # NOTE(review): a cent is usually defined as 1/100 acre (435.6 sqft);
    # the notebook's original factor is kept here — confirm.
    'Cents': 431,
    'Grounds': 2400,      # 1 ground = 2400 sqft
}


def format_surf_area(area : str) -> float:
    """Convert a raw 'total_sqft' entry to a surface in square feet.

    Three spellings are supported:

    * a plain number, e.g. ``'1056'``;
    * a range ``'<low> - <high>'``, replaced by the mean of its bounds;
    * a number followed by one of the units listed in ``_SQFT_PER_UNIT``,
      e.g. ``'34.46Sq. Meter'``.

    Parameters
    ----------
    area : str
        Raw cell content, already cast to ``str``.

    Returns
    -------
    float
        The surface in square feet, or ``nan`` when the entry cannot be
        interpreted (malformed number or range, unknown unit), so that such
        rows show up in a later ``isna()`` check.
    """
    text = area.strip()

    # Plain numbers are by far the most common case. Trying them first also
    # keeps values such as '-5' or '1e-3' out of the range branch below.
    try:
        return float(text)
    except ValueError:
        pass

    # Range: average the two bounds.
    if '-' in text:
        low, _, high = text.partition('-')
        try:
            return (float(low) + float(high)) / 2
        except ValueError:
            return float('nan')

    # Number followed by a unit name.
    decomp = re.match(r'([\d.]+)\s*(.*)', text)
    if decomp is None:
        return float('nan')
    try:
        value = float(decomp.group(1))
    except ValueError:
        # The character class also accepts strings like '1.2.3'.
        return float('nan')

    factor = _SQFT_PER_UNIT.get(decomp.group(2).strip())
    if factor is None:
        return float('nan')
    return float(value * factor)


In [41]:
# New frame = df3 plus a numeric 'surf_formated' column derived from 'total_sqft'.
df4 = df3.assign(surf_formated=df3['total_sqft'].apply(format_surf_area))

In [43]:
# Summary statistics of the converted surface, to spot implausible values.
df4['surf_formated'].describe()

count    1.271000e+04
mean     1.791187e+03
std      1.459348e+04
min      5.000000e+00
25%      1.100000e+03
50%      1.260000e+03
75%      1.643750e+03
max      1.306800e+06
Name: surf_formated, dtype: float64

In [45]:
# The raw 'size' label is superseded by 'size_formated'.
# errors='ignore' keeps this cell re-runnable: df4 is reassigned from itself,
# so on a second execution 'size' is already gone and a plain drop would
# raise KeyError.
df4 = df4.drop(columns='size', errors='ignore')
df4.head()

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,size_formated,surf_formated
0,Super built-up Area,Electronic City Phase II,1056,2.0,1.0,39.07,2,1056.0
1,Plot Area,Chikka Tirupathi,2600,5.0,3.0,120.0,4,2600.0
2,Built-up Area,Uttarahalli,1440,2.0,3.0,62.0,3,1440.0
3,Super built-up Area,Lingadheeranahalli,1521,3.0,1.0,95.0,3,1521.0
4,Super built-up Area,Kothanur,1200,2.0,1.0,51.0,2,1200.0


In [46]:
# Check that no column, including the newly derived ones, contains missing values.
df4.isna().sum()

area_type        0
location         0
total_sqft       0
bath             0
balcony          0
price            0
size_formated    0
surf_formated    0
dtype: int64

At this point, the values in this dataframe should be well formatted. We still have to one-hot encode the categorical features.

In [47]:
# First attempt: one-hot encode both categorical columns as they are.
oh_features = pd.get_dummies(data=df4[['area_type', 'location']])

In [49]:
# The column count shows how many indicator features the naive encoding creates.
oh_features.shape

(12710, 1269)

We encounter an issue here: there are too many different locations, and one-hot encoding would create 1260+ features. We will group the locations with 50 or fewer occurrences into a single category, 'Other'.

In [110]:
# Number of listings per location (Series indexed by location name).
df_tmp = df4.groupby('location')['location'].count()

In [111]:
# Keep only locations with strictly more than 50 listings, most frequent first.
list_loc_ok = df_tmp.loc[df_tmp.gt(50)].sort_values(ascending=False)

In [112]:
def format_location(loc : str) -> str:
    """Return `loc` if it is one of the retained locations, otherwise 'Other'.

    The retained locations are read from the notebook-level `list_loc_ok`.
    That object is a pandas Series indexed by location name, so the `in`
    test below checks the index labels, not the counts.
    """
    return loc if loc in list_loc_ok else 'Other'

In [116]:
df5 = df4.copy()
df5['loc_formated'] = df4['location'].apply(format_location)
df5 = df5.drop('location', axis = 'columns')

In [117]:
# Listings per grouped location, to check the result of the regrouping.
df5.groupby('loc_formated')['loc_formated'].count()

loc_formated
7th Phase JP Nagar           147
8th Phase JP Nagar            56
Akshaya Nagar                 58
Banashankari                  74
Bannerghatta Road            144
Begur Road                    83
Bellandur                     91
Bisuvanahalli                 51
Budigere                      54
Chandapura                    98
Electronic City              300
Electronic City Phase II     130
Electronics City Phase 1      86
Haralur Road                 135
Harlur                        76
Hebbal                       173
Hennur                        51
Hennur Road                  142
Hoodi                         86
Hormavu                       71
Hosa Road                     72
JP Nagar                      64
Jakkur                        67
KR Puram                      85
Kaggadasapura                 61
Kanakpura Road               261
Kasavanhalli                  77
Kengeri                       71
Koramangala                   69
Kothanur                      

Now, we have 50 different features corresponding to the location, allowing the one hot encoding.

In [103]:
# Listings per area type (note: this reuses the name df_tmp from the location count).
df_tmp = df5.groupby('area_type')['area_type'].count()

In [104]:
# Display the per-area-type counts computed in the previous cell.
df_tmp

area_type
Built-up  Area          2310
Carpet  Area              82
Plot  Area              1837
Super built-up  Area    8481
Name: area_type, dtype: int64

No need to do the same operation with the 'area_type' feature.

In [120]:
# One-hot encode the two categorical columns; drop_first=True removes the
# first level of each so that k categories yield k-1 indicator columns.
dummies_var = pd.get_dummies(data=df5[['loc_formated','area_type']], drop_first=True)
df6 = df5.copy()

In [121]:
# Number of indicator columns produced after grouping the rare locations.
dummies_var.shape

(12710, 52)

In [123]:
# Replace BOTH categorical columns with their one-hot encoding. Dropping only
# 'area_type' would leave the raw string column 'loc_formated' next to its
# indicator columns.
# The frame is rebuilt from df5 (df6 was created as a copy of it) rather than
# from df6 itself, so the cell can be re-run without a KeyError or duplicated
# indicator columns.
df6 = pd.concat(
    [df5.drop(columns=['area_type', 'loc_formated']), dummies_var],
    axis='columns',
)

The dataset is now well formatted. We still have to remove the outliers in order to improve the model we will train later.