In [612]:
# importing necessary Libraries

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)
import math


In [613]:
# Lets read the CSV first

df = pd.read_csv('\Bengaluru_House_Data.csv')

In [614]:
df.head(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [615]:
df.shape

(13320, 9)

In [616]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [617]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [618]:
df.groupby('area_type')['area_type'].agg('count')

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [619]:
# to keep model simple we assume that some features donot conribute much to our Target'Price'
# so we drop columns availabilit, area_type, society, balcony

df1 = df.drop(['area_type','availability','society','balcony'],axis='columns')


In [620]:
df1.shape

(13320, 5)

In [621]:
df1.head(5)

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [622]:
df1.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [623]:
#since dataset is big enough so we can drop na values

df2 = df1.dropna()

In [624]:
# lets confirm there is no na value
df2.isna().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [625]:
# lets look for unique value of size column

df2['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [627]:
df2['bhk'] = df2['size'].apply(lambda x: int(x.split(' ')[0]) )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['bhk'] = df2['size'].apply(lambda x: int(x.split(' ')[0]) )


In [628]:
df2.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [561]:
df2 = df2.drop('size', axis='columns')

df2['bhk'].unique()

In [562]:
df2['bhk'].unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

In [563]:
# looking unusual bhk values 
df2[df2.bhk>20]

Unnamed: 0,location,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,8000,27.0,230.0,27
4684,Munnekollal,2400,40.0,660.0,43


In [564]:
df2 = df2[df2.bhk < 20]

In [565]:
df2['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [566]:
# since datatype is Object, we convert them to float

def is_float(x):
    try:
        float(x)
    except: 
        return False
    return True
        
        

In [567]:
# lets evaluate the total-sqft columns where the values are not float

df2[~df2['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,location,total_sqft,bath,price,bhk
30,Yelahanka,2100 - 2850,4.0,186.0,4
122,Hebbal,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,1042 - 1105,2.0,54.005,2
165,Sarjapur,1145 - 1340,2.0,43.49,2
188,KR Puram,1015 - 1540,2.0,56.8,2
410,Kengeri,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,1195 - 1440,2.0,63.77,2
648,Arekere,4125Perch,9.0,265.0,9
661,Yelahanka,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,3090 - 5002,4.0,445.0,4


In [568]:
df2['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [569]:
df2.shape

(13244, 5)

In [570]:
# to tackle range, we will consider the mean of two values in range

def convert_sqft_to_num(x):
    tokens = x.split('-')
    if (len(tokens) == 2):
        return ((float(tokens[0]) + float(tokens[1]))/2)
    try:
        return float(x)
    except:
        return None

In [571]:
convert_sqft_to_num('2100-2850')

2475.0

In [572]:
convert_sqft_to_num('2850')

2850.0

In [573]:
df3 = df2.copy()
df3['total_sqft'] = df3['total_sqft'].apply(convert_sqft_to_num)
df3.head(10)

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Uttarahalli,1440.0,2.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Kothanur,1200.0,2.0,51.0,2
5,Whitefield,1170.0,2.0,38.0,2
6,Old Airport Road,2732.0,4.0,204.0,4
7,Rajaji Nagar,3300.0,4.0,600.0,4
8,Marathahalli,1310.0,3.0,63.25,3
9,Gandhi Bazar,1020.0,6.0,370.0,6


In [574]:
df3.loc[30]

location      Yelahanka
total_sqft       2475.0
bath                4.0
price             186.0
bhk                   4
Name: 30, dtype: object

df3.head(5)

In [575]:
df3.head(5)

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Uttarahalli,1440.0,2.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Kothanur,1200.0,2.0,51.0,2


In [576]:
# lets get price per sqft

df3['price_per_sqft']= df3['price']*100000/ df3['total_sqft']
df3.head(5)



Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0


In [577]:
df3.shape

(13244, 6)

In [578]:
# Lets explore location column
# as its categorical features, lets get the unique count

len(df3['location'].unique())

1303

In [579]:
# this seems high dimensionality curse

df3.location = df3.location.apply(lambda x: x.strip())
location_stats = df3.groupby('location')['location'].agg('count').sort_values(ascending=False)
location_stats

location
Whitefield                     535
Sarjapur  Road                 392
Electronic City                304
Kanakpura Road                 266
Thanisandra                    236
                              ... 
1 Giri Nagar                     1
Kanakapura main  Road            1
Karnataka Shabarimala            1
Kasthuri Nagar East Of NGEF      1
whitefiled                       1
Name: location, Length: 1292, dtype: int64

In [580]:
# lets call location with less than 10 data points as other locs

len(location_stats[location_stats<=10])

1051

In [581]:
location_stats_less_than_ten = location_stats[location_stats<=10]

In [582]:
df3['location'] = df3['location'].apply(lambda x: 'other' if x in location_stats_less_than_ten else x)

In [583]:
len(df3['location'].unique())

242

In [584]:
# somehow we have managed to reduce the categories of location
len(df3)

13244

In [585]:
# lets focus on some Outlier Detection
# lets look at our dataframe again

df3.head(10)


Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0
5,Whitefield,1170.0,2.0,38.0,2,3247.863248
6,Old Airport Road,2732.0,4.0,204.0,4,7467.057101
7,Rajaji Nagar,3300.0,4.0,600.0,4,18181.818182
8,Marathahalli,1310.0,3.0,63.25,3,4828.244275
9,other,1020.0,6.0,370.0,6,36274.509804


In [586]:
# lets choose 300sqft per bedroom is threshold creiteria

df3[(df3.total_sqft/df3.bhk)<300]


Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
9,other,1020.0,6.0,370.0,6,36274.509804
45,HSR Layout,600.0,9.0,200.0,8,33333.333333
58,Murugeshpalya,1407.0,4.0,150.0,6,10660.980810
68,Devarachikkanahalli,1350.0,7.0,85.0,8,6296.296296
70,other,500.0,3.0,100.0,3,20000.000000
...,...,...,...,...,...,...
13277,other,1400.0,7.0,218.0,7,15571.428571
13279,other,1200.0,5.0,130.0,6,10833.333333
13281,Margondanahalli,1375.0,5.0,125.0,5,9090.909091
13303,Vidyaranyapura,774.0,5.0,70.0,5,9043.927649


In [587]:
#lets remove them
df3 = df3[(df3.total_sqft/ df3.bhk)>300]

In [588]:
# lets look at the bath features
# its normal to have no of bath as 2 greater than no. of bedroom

df3[df3.bath > df3.bhk+2]

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
1078,other,3300.0,14.0,500.0,9,15151.515152
2620,other,11338.0,9.0,1000.0,6,8819.897689
6838,Rajaji Nagar,7500.0,8.0,1700.0,5,22666.666667
7709,Chikkabanavar,2460.0,7.0,80.0,4,3252.03252
9974,other,2400.0,6.0,775.0,3,32291.666667
11366,Nagasandra,7000.0,8.0,450.0,4,6428.571429
12103,Thanisandra,1806.0,6.0,116.0,3,6423.03433
12443,other,4350.0,8.0,2600.0,4,59770.114943
13067,other,7150.0,13.0,3600.0,10,50349.65035


In [589]:
# it seems clesrly outlier so removing them
df3 = df3[df3.bath < (df3.bhk+2)]

In [590]:
len(df3)

12127

In [591]:
# lets come to price_per_sqft feature

df3.price_per_sqft.describe()

count     12127.000000
mean       6150.434976
std        3941.096480
min         267.829813
25%        4194.152471
50%        5241.935484
75%        6775.622160
max      176470.588235
Name: price_per_sqft, dtype: float64

In [592]:
df3['price_per_sqft'].agg('mean')

6150.434976414654

In [593]:
df3['price_per_sqft'].agg('std')

3941.0964798686214

In [594]:
# lets consider only those dataset which reside in mean+-std for more simplicity
mean = df3['price_per_sqft'].agg('mean')
std = df3['price_per_sqft'].agg('std')
df3 = df3[(df3.price_per_sqft < (mean + std)) & (df3.price_per_sqft > (mean-std))]
df3.shape

(11045, 6)

In [595]:
# lets go for feature dropping and one hot encoding
 
df3.drop('price_per_sqft', axis = 'columns', inplace = True)


In [596]:
# convert text data feature into numerical by getting dummies

dummies = pd.get_dummies(df3.location)
dummies.head(5)

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [597]:
# Lets not fall in dummy trap and drop one dummy
#and join the two dataframe

dummies.drop('other',axis = 'columns')
df3.drop('location',axis = 'columns',inplace= True)
df4 = pd.concat([df3,dummies], axis = 'columns')
df4.head(5)

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,1056.0,2.0,39.07,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2600.0,5.0,120.0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1440.0,2.0,62.0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1521.0,3.0,95.0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1200.0,2.0,51.0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [598]:
df4.shape

(11045, 245)

In [599]:
#lets get started with machine Learning 

X = df4.drop('price', axis = 'columns')
X.head()

Unnamed: 0,total_sqft,bath,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,1056.0,2.0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2600.0,5.0,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1440.0,2.0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1521.0,3.0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1200.0,2.0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [600]:
y = df4.price
y.head()

0     39.07
1    120.00
2     62.00
3     95.00
4     51.00
Name: price, dtype: float64

In [601]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 10)

In [602]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)

LinearRegression()

In [603]:
lr_clf.predict(X_test)

array([ 86.22318451,  60.48100244,  56.47588704, ...,  72.3591707 ,
        46.33542793, 145.20078246])

In [604]:
lr_clf.score(X_test,y_test)

0.8275138133632128

In [605]:
# lets try to improve its accuracy by using K-fold method

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

CV = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
cross_val_score(LinearRegression(), X,y, cv= CV)


array([ 7.89479298e-01, -7.05112546e+12,  8.00974991e-01,  8.12252412e-01,
        8.10737012e-01])

In [606]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

In [607]:
#Hyperparameter Tuning

def find_best_model_using_gridsearchcv(X,y):
    algo= {
        'linear_regression' :{
            'model'  : LinearRegression(),
            'params' : {
                'normalize' :[True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algo.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })
    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

In [608]:
# lets apply on X and y

find_best_model_using_gridsearchcv(X,y)

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_base.py", line 569, in fit
    linalg.lstsq(X, y)
  File "C:\ProgramData\Anaconda3\lib\site-packages\scipy\linalg\basic.py", line 1212, in lstsq
    raise LinAlgError("SVD did not converge in Linear Least Squares")
numpy.linalg.LinAlgError: SVD did not converge in Linear Least Squares



Unnamed: 0,model,best_score,best_params
0,linear_regression,-1410225000000.0,{'normalize': False}
1,lasso,0.7771191,"{'alpha': 1, 'selection': 'random'}"
2,decision_tree,0.7486467,"{'criterion': 'mse', 'splitter': 'best'}"


In [610]:

def predict_price(location,sqft,bath,bhk):
    loc_index = np.where(X.columns ==location)[0][0]
    # give the column
    
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1]= bath
    x[2]= bhk
    if loc_index>= 0:
        x[loc_index] = 1
        
    return lr_clf.predict([x])[0]

In [611]:
predict_price('1st Phase JP Nagar',1000, 2, 2)

87.14513402451495

In [630]:
import pickle
with open('bangaluru_home_prices_model.pickle', 'wb') as f:
    pickle.dump(lr_clf,f)

In [631]:
# lets store the columns info as Json file

import json
columns = {
    'data_columns': [col.lower() for col in X.columns]
}

with open("columns.json", "w") as f:
    f.write(json.dumps(columns))
