In [299]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [300]:
# Raw listings file, resolved relative to the notebook's working directory.
DATA_PATH = 'Bengaluru_House_Data.csv'
df = pd.read_csv(DATA_PATH)

In [301]:
# Preview the first rows to see the raw columns and how values are formatted.
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [302]:
# Row and column counts of the raw load.
df.shape

(13320, 9)

In [303]:
# Count missing values per column to decide what to drop and what to impute.
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [304]:
# Drop the columns that are not used in the rest of the analysis.
# Reassigning instead of using inplace=True keeps the step explicit and chainable.
df = df.drop(columns=['society', 'area_type', 'availability', 'balcony'])

In [305]:
# Confirm the four dropped columns are gone.
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [306]:
# Missing values that remain after the column drop.
df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [307]:
# Fill missing bathroom counts with the column mean.
# NOTE(review): the mean of a count column is fractional; confirm that the mean
# (rather than e.g. the median) is the intended fill value.
df = df.fillna({'bath': df['bath'].mean()})

In [308]:
# Verify that bath no longer has missing values.
df.isnull().sum()

location       1
size          16
total_sqft     0
bath           0
price          0
dtype: int64

In [309]:
# Replace missing size entries with a fixed default.
# NOTE(review): '2 BHK' is presumably the most common size — confirm before relying on it.
df['size'] = df['size'].mask(df['size'].isna(), '2 BHK')

In [310]:
# Remove every row that still has a missing value in any column.
df = df.dropna(axis='index', how='any')

In [311]:
# Every column should now report zero missing values.
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [312]:
# Row and column counts after dropping incomplete rows.
df.shape

(13319, 5)

In [313]:
# Spot-check the frame after handling missing values.
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [314]:
# Inspect dtypes: total_sqft is stored as object (strings), so it needs parsing before numeric use.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13319 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13319 non-null  object 
 2   total_sqft  13319 non-null  object 
 3   bath        13319 non-null  float64
 4   price       13319 non-null  float64
dtypes: float64(2), object(3)
memory usage: 624.3+ KB


In [315]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,bath,price
count,13319.0,13319.0
mean,2.692587,112.567621
std,1.337824,148.977089
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [316]:
# Number of listings per location, most frequent first.
df['location'].value_counts()

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

In [317]:
# Sanity check before feature engineering: there should still be no missing values.
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [318]:
# The leading token of size (e.g. '2' in '2 BHK', '4' in '4 Bedroom') is the room count.
df['bhk'] = df['size'].str.split(' ').str[0].astype('int64')

In [319]:
# Look at the raw total_sqft strings to see which formats need handling (plain numbers, ranges, ...).
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [320]:
def clear_highfen(str):
    """Convert a raw ``total_sqft`` entry into a single float.

    A plain number such as ``'1056'`` is parsed directly. A hyphenated range
    such as ``'1133 - 1384'`` is replaced by the midpoint of its first two
    parts. Values that cannot be parsed (text with units, an empty part,
    ``None``) yield ``None`` instead of raising, so one malformed cell cannot
    abort a whole ``Series.apply``.

    Args:
        str: The raw cell value. The name shadows the built-in ``str``; it is
            kept unchanged so existing callers keep working.

    Returns:
        The parsed area as a float, or ``None`` when the value is unusable.
    """
    raw = str
    try:
        parts = [part.strip() for part in raw.split('-')]
    except AttributeError:
        # Not a string (no .split): accept plain numbers, reject everything else.
        try:
            return float(raw)
        except (TypeError, ValueError):
            return None
    try:
        if len(parts) > 1:
            # Range: use the midpoint of the lower and upper bound.
            return (float(parts[0]) + float(parts[1])) / 2
        return float(parts[0])
    except ValueError:
        # Covers unit suffixes ('34.46Sq. Meter') and empty parts ('-5', '10 -').
        return None
    
# Parse every raw total_sqft string into a numeric sqft value (missing where unparseable).
df['sqft'] = df['total_sqft'].map(clear_highfen)

In [321]:
# Keep only the rows whose area could be parsed into a number.
df = df.dropna(subset=['sqft'])

In [322]:
# Row and column counts after removing rows whose total_sqft could not be parsed.
df.shape

(13273, 7)

In [323]:
# Confirm the derived bhk and sqft columns contain no missing values.
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
bhk           0
sqft          0
dtype: int64

In [324]:
# size is no longer needed once bhk has been derived from it.
# Reassigning instead of using inplace=True keeps the step explicit and chainable.
df = df.drop(columns=['size'])

In [325]:
# Cast price to integer (the fractional part is truncated, e.g. 39.07 -> 39).
# NOTE(review): this happens before price_per_sqft is computed, so the truncation
# carries into that value; confirm the precision loss is intended.
df = df.astype({'price': int})

In [326]:
# Price per square foot.
# NOTE(review): the 100000 factor presumably converts price from lakhs to rupees —
# confirm the unit of the price column.
df['price_per_sqft'] = df['price'].mul(100000).div(df['sqft'])

In [327]:
# Store price_per_sqft as a whole number (the fractional part is truncated).
df = df.astype({'price_per_sqft': int})

In [328]:
# Drop the intermediate columns now that price_per_sqft has been derived.
# Reassigning instead of using inplace=True keeps the step explicit and chainable.
# NOTE(review): this removes the parsed numeric sqft and keeps the raw string
# total_sqft (which can still hold range text such as '1133 - 1384'); confirm
# that the raw column is the one meant to be kept.
df = df.drop(columns=['sqft', 'price'])

In [329]:
# Preview the frame after deriving bhk and price_per_sqft.
df.head()

Unnamed: 0,location,total_sqft,bath,bhk,price_per_sqft
0,Electronic City Phase II,1056,2.0,2,3693
1,Chikka Tirupathi,2600,5.0,4,4615
2,Uttarahalli,1440,2.0,3,4305
3,Lingadheeranahalli,1521,3.0,3,6245
4,Kothanur,1200,2.0,2,4250
