# Real Estate Price Predictor

In [150]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
matplotlib.rcParams["figure.figsize"]=(20,10)

In [151]:
df1=pd.read_csv("/content/Bengaluru_House_Data.csv")
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


## Data Cleaning

In [152]:
df1.groupby('area_type')['area_type'].agg('count')

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [153]:
df2=df1.drop(['area_type','society','availability'],axis='columns')
df2.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [154]:
df2.isnull().sum()

location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

In [155]:
df3=df2.dropna()
df3.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [156]:
df3.shape

(12710, 6)

In [157]:
df3['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '3 Bedroom', '1 BHK', '1 RK',
       '4 BHK', '1 Bedroom', '2 Bedroom', '6 Bedroom', '8 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '11 Bedroom', '43 Bedroom',
       '14 BHK', '8 BHK', '12 Bedroom', '10 Bedroom', '13 BHK'],
      dtype=object)

In [158]:
df3['bhk']=df3['size'].apply(lambda x: int(x.split(' ')[0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [159]:
df3=df3.drop('size',axis=1)
df3.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,2600,5.0,3.0,120.0,4
2,Uttarahalli,1440,2.0,3.0,62.0,3
3,Lingadheeranahalli,1521,3.0,1.0,95.0,3
4,Kothanur,1200,2.0,1.0,51.0,2


In [160]:
df3['bhk'].unique()

array([ 2,  4,  3,  1,  6,  8,  7,  5, 11,  9, 27, 43, 14, 12, 10, 13])

In [161]:
df3[df3.bhk>20]

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
1718,2Electronic City Phase II,8000,27.0,0.0,230.0,27
4684,Munnekollal,2400,40.0,0.0,660.0,43


In [162]:
df3.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [163]:
def is_float(x):
  try:
    float(x)
  except:
    return False
  return True    

In [164]:
df3[~df3['total_sqft'].apply(is_float)].head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
30,Yelahanka,2100 - 2850,4.0,0.0,186.0,4
122,Hebbal,3067 - 8156,4.0,0.0,477.0,4
137,8th Phase JP Nagar,1042 - 1105,2.0,0.0,54.005,2
165,Sarjapur,1145 - 1340,2.0,0.0,43.49,2
188,KR Puram,1015 - 1540,2.0,0.0,56.8,2


In [165]:
def convert_sqft_to_num(x):
  tokens=x.split('-')
  if len(tokens) == 2:
    return (float(tokens[0])+float(tokens[1]))/2
  try:
    return float(x)
  except:
    return None     

In [166]:
df4=df3.copy()
df4['total_sqft']=df4['total_sqft'].apply(convert_sqft_to_num)
df4.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4
2,Uttarahalli,1440.0,2.0,3.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3
4,Kothanur,1200.0,2.0,1.0,51.0,2


In [167]:
df5=df4.copy()
df5['price_per_sqft']=(df5['price']*100000)/df5['total_sqft']
df5.head(10)

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,3.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,1.0,51.0,2,4250.0
5,Whitefield,1170.0,2.0,1.0,38.0,2,3247.863248
8,Marathahalli,1310.0,3.0,1.0,63.25,3,4828.244275
10,Whitefield,1800.0,2.0,2.0,70.0,3,3888.888889
11,Whitefield,2785.0,5.0,3.0,295.0,4,10592.459605
12,7th Phase JP Nagar,1000.0,2.0,1.0,38.0,2,3800.0


## Feature Engineering

In [168]:
len(df5.location.unique())

1265

In [169]:
df5.location=df5.location.apply(lambda x:x.strip())

In [170]:
locationst=df5.groupby('location')['location'].agg('count').sort_values(ascending=False)
locationst

location
Whitefield              515
Sarjapur  Road          372
Electronic City         302
Kanakpura Road          261
Thanisandra             234
                       ... 
Kanakapura  Rod           1
Kanakapura Main Road      1
Kanakapura Road           1
Kanakapura Road,          1
whitefiled                1
Name: location, Length: 1254, dtype: int64

In [171]:
len(locationst[locationst<=10])

1017

In [172]:
location_less_10=locationst[locationst<=10]
location_less_10

location
1st Block Koramangala    10
Kalkere                  10
Basapura                 10
Kodigehalli              10
Gunjur Palya             10
                         ..
Kanakapura  Rod           1
Kanakapura Main Road      1
Kanakapura Road           1
Kanakapura Road,          1
whitefiled                1
Name: location, Length: 1017, dtype: int64

In [173]:
df5.location=df5.location.apply(lambda x:'Other' if x in location_less_10 else x)
df5.head(10)

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,3.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,1.0,51.0,2,4250.0
5,Whitefield,1170.0,2.0,1.0,38.0,2,3247.863248
8,Marathahalli,1310.0,3.0,1.0,63.25,3,4828.244275
10,Whitefield,1800.0,2.0,2.0,70.0,3,3888.888889
11,Whitefield,2785.0,5.0,3.0,295.0,4,10592.459605
12,7th Phase JP Nagar,1000.0,2.0,1.0,38.0,2,3800.0


In [174]:
len(df5.location.unique())
df5.head(20)

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,3.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,1.0,51.0,2,4250.0
5,Whitefield,1170.0,2.0,1.0,38.0,2,3247.863248
8,Marathahalli,1310.0,3.0,1.0,63.25,3,4828.244275
10,Whitefield,1800.0,2.0,2.0,70.0,3,3888.888889
11,Whitefield,2785.0,5.0,3.0,295.0,4,10592.459605
12,7th Phase JP Nagar,1000.0,2.0,1.0,38.0,2,3800.0


## Outlier removal

In [175]:
df5[df5.total_sqft/df5.bhk<300].head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,price_per_sqft
58,Murugeshpalya,1407.0,4.0,1.0,150.0,6,10660.98081
68,Devarachikkanahalli,1350.0,7.0,0.0,85.0,8,6296.296296
70,Other,500.0,3.0,2.0,100.0,3,20000.0
78,Kaval Byrasandra,460.0,1.0,0.0,22.0,2,4782.608696
89,Rajaji Nagar,710.0,6.0,3.0,160.0,6,22535.211268


In [176]:
df5.shape

(12710, 7)

In [177]:
df6=df5[~((df5.total_sqft/df5.bhk)<300)]
df6.shape

(12055, 7)

In [178]:
df6.price_per_sqft.describe()

count     12013.000000
mean       6206.082347
std        3985.518807
min         267.829813
25%        4199.363057
50%        5252.525253
75%        6823.529412
max      176470.588235
Name: price_per_sqft, dtype: float64

In [179]:
#Assuming that the dataset is normally distributed, we will remove data beyond standard deviation 

def remove_outliers(df):
  df_out=pd.DataFrame()
  for key,subdf in df.groupby('location'):
    m=np.mean(subdf.price_per_sqft)
    st=np.std(subdf.price_per_sqft)
    reduced_df=subdf[(subdf.price_per_sqft>(m-st))&(subdf.price_per_sqft<(m+st))]
    df_out=pd.concat([df_out,reduced_df],ignore_index=True)
  return df_out

df7=remove_outliers(df6)
df7.shape

(9848, 7)

In [180]:
#Visualising bhk feature
def plot_scatter_chart(df,location):
  bhk2=df[(df.location==location)&(df.bhk==2)]
  bhk3=df[(df)]

In [181]:
#Now we remove those 2 BHK apartments whose price_per_sqft is less than mean price_per_sqft of 1 BHK
#we do this for n BHK apartments as well considering n-1 BHK apartments
def remove_bhk_outliers(df):
  exclude_indices=np.array([])
  for location,location_df in df.groupby('location'):
    bhk_stats={}
    for bhk,bhk_df in location_df.groupby('bhk'):
       bhk_stats[bhk]={
           'mean': np.mean(bhk_df.price_per_sqft),
           'std': np.std(bhk_df.price_per_sqft),
           'count': bhk_df.shape[0]
       }
    for bhk,bhk_df in location_df.groupby('bhk'):
      stats=bhk_stats.get(bhk-1)
      if stats and stats['count']>5:
        exclude_indices=np.append(exclude_indices,bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
  return df.drop(exclude_indices,axis='index') 

df8=remove_bhk_outliers(df7)
df8.shape

(7025, 7)

In [182]:
#Removing houses with more bathrooms than (bedrooms+1) 
df9=df8[df8.bath<=(df8.bhk+1)]

In [183]:
df10=df9[df9.balcony<=df9.bhk]
df10.drop('price_per_sqft',axis='columns')

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
0,1st Block Jayanagar,2850.0,4.0,1.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,2.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,3.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,0.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,2.0,148.0,2
...,...,...,...,...,...,...
9839,Yeshwanthpur,1195.0,2.0,2.0,100.0,2
9840,Yeshwanthpur,1692.0,3.0,3.0,108.0,3
9842,Yeshwanthpur,2500.0,5.0,2.0,185.0,6
9846,Yeshwanthpur,1855.0,3.0,3.0,135.0,3


## Model Building

In [184]:
df10.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,1st Block Jayanagar,2850.0,4.0,1.0,428.0,4,15017.54386
1,1st Block Jayanagar,1630.0,3.0,2.0,194.0,3,11901.840491
2,1st Block Jayanagar,1875.0,2.0,3.0,235.0,3,12533.333333
3,1st Block Jayanagar,1200.0,2.0,0.0,130.0,3,10833.333333
4,1st Block Jayanagar,1235.0,2.0,2.0,148.0,2,11983.805668


In [185]:
#Since location is a categorical variable we need to convert it to numerical form by one-hot encoding
dummies=pd.get_dummies(df10.location)
dummies.head()
df11=pd.concat([df10,dummies.drop('Other',axis='columns')],axis='columns') 
df11.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,price_per_sqft,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1st Block Jayanagar,2850.0,4.0,1.0,428.0,4,15017.54386,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1st Block Jayanagar,1630.0,3.0,2.0,194.0,3,11901.840491,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1st Block Jayanagar,1875.0,2.0,3.0,235.0,3,12533.333333,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1st Block Jayanagar,1200.0,2.0,0.0,130.0,3,10833.333333,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1st Block Jayanagar,1235.0,2.0,2.0,148.0,2,11983.805668,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [186]:
#Dropping location column
df12=df11.drop('location',axis='columns')
df12.head()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,price_per_sqft,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,1.0,428.0,4,15017.54386,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,2.0,194.0,3,11901.840491,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,3.0,235.0,3,12533.333333,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1200.0,2.0,0.0,130.0,3,10833.333333,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1235.0,2.0,2.0,148.0,2,11983.805668,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [187]:
x=df12.drop(['price','price_per_sqft'],axis='columns')
y=df12.price

In [188]:
#Finding best model using GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

def best_model_using_gridsearchcv(x,y):
  algos={
      'linear_regression':{
          'model':LinearRegression(),
          'params':{
              'positive':['True','False']
          }
      },
      'lasso':{
          'model':Lasso(),
          'params':{
              'alpha':[1,2],
              'selection':['random','cyclic']
          }
      },
      'decision_tree':{
          'model':DecisionTreeRegressor(),
          'params':{
             'criterion':['squared_error','friedman_mse'],
             'splitter':['best','random'] 
          }
      }
  }
  scores=[]
  cv=ShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
  for algo_name,config in algos.items():
      gs=GridSearchCV(config['model'],config['params'],cv=cv,return_train_score=False)
      gs.fit(x,y)
      scores.append({
          'model':algo_name,
          'best_score':gs.best_score_,
          'best_params':gs.best_params_ 
      })

  return pd.DataFrame(scores,columns=['model','best_score','best_params'])

best_model_using_gridsearchcv(x,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.780793,{'positive': 'True'}
1,lasso,0.658204,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.655571,"{'criterion': 'friedman_mse', 'splitter': 'best'}"


In [208]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=10)
from sklearn.linear_model import LinearRegression
lr_rgr=LinearRegression(positive='True')
lr_rgr.fit(x_train,y_train)
lr_rgr.score(x_test,y_test)

0.8182383596215725

In [209]:
x.columns

Index(['total_sqft', 'bath', 'balcony', 'bhk', '1st Block Jayanagar',
       '1st Phase JP Nagar', '2nd Phase Judicial Layout',
       '2nd Stage Nagarbhavi', '5th Phase JP Nagar', '6th Phase JP Nagar',
       ...
       'Vijayanagar', 'Vishveshwarya Layout', 'Vishwapriya Layout',
       'Vittasandra', 'Whitefield', 'Yelachenahalli', 'Yelahanka',
       'Yelahanka New Town', 'Yelenahalli', 'Yeshwanthpur'],
      dtype='object', length=241)

In [210]:
def predict_price(location,sqft,bath,balcony,bhk):
  loc_index=np.where(x.columns==location)[0][0]
  s=np.zeros(len(x.columns))
  s[0]=sqft
  s[1]=bath
  s[2]=balcony
  s[3]=bhk
  if loc_index>=0:
    s[loc_index]=1
  return lr_rgr.predict([s])[0]  

In [220]:
print(predict_price('Indira Nagar',1000,3,0,3),'Lakhs')

164.45377770151558 Lakhs


  "X does not have valid feature names, but"
