In [1]:

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
matplotlib.rcParams["figure.figsize"] = (20,10)

# Loading data into dataframe

In [2]:
df1 = pd.read_csv("Bengaluru_House_Data.csv")
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df1.shape

(13320, 9)

In [4]:
df1.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

# Count of Unique types of area in the data

In [5]:

df1['area_type'].value_counts()

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64

# Dropping of features that might not be very useful in estimating price

In [6]:
df2 = df1.drop(['society','availability'],axis='columns')
df2.shape

(13320, 7)

# Counting the number of NULL values

In [7]:
df2.isnull().sum()

area_type       0
location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

# Dropping all rows having some NULL values

In [8]:
df3 = df2.dropna()
df3.isnull().sum()

area_type     0
location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [9]:

df3.shape

(12710, 7)

In [10]:
df3.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,1.0,51.0


# Notice above the "size" column has some values in terms of "bedroom" and some in terms of 'BHK' ,so we can dig deep into this column

In [11]:
df3['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '3 Bedroom', '1 BHK', '1 RK',
       '4 BHK', '1 Bedroom', '2 Bedroom', '6 Bedroom', '8 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '11 Bedroom', '43 Bedroom',
       '14 BHK', '8 BHK', '12 Bedroom', '10 Bedroom', '13 BHK'],
      dtype=object)

# Add new feature(integer) for bhk (Bedrooms Hall Kitchen)

In [12]:
df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))
df3.bhk.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


array([ 2,  4,  3,  1,  6,  8,  7,  5, 11,  9, 27, 43, 14, 12, 10, 13],
      dtype=int64)

# Homes with 27 and 43 rooms does not make any sense

In [13]:

df3[df3.bhk>15]

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk
1718,Super built-up Area,2Electronic City Phase II,27 BHK,8000,27.0,0.0,230.0,27
4684,Plot Area,Munnekollal,43 Bedroom,2400,40.0,0.0,660.0,43


In [14]:
df3.drop([1718,4684],axis=0,inplace=True)
df3[df3.bhk>15]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk


# another important feature to explore is "total_sqft"

In [15]:
df3['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [16]:

def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [17]:

df3[~df3['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk
30,Super built-up Area,Yelahanka,4 BHK,2100 - 2850,4.0,0.0,186.0,4
122,Super built-up Area,Hebbal,4 BHK,3067 - 8156,4.0,0.0,477.0,4
137,Super built-up Area,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,0.0,54.005,2
165,Super built-up Area,Sarjapur,2 BHK,1145 - 1340,2.0,0.0,43.49,2
188,Super built-up Area,KR Puram,2 BHK,1015 - 1540,2.0,0.0,56.8,2
410,Super built-up Area,Kengeri,1 BHK,34.46Sq. Meter,1.0,0.0,18.5,1
549,Super built-up Area,Hennur Road,2 BHK,1195 - 1440,2.0,0.0,63.77,2
661,Super built-up Area,Yelahanka,2 BHK,1120 - 1145,2.0,0.0,48.13,2
672,Built-up Area,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,0.0,445.0,4
772,Super built-up Area,Banashankari Stage VI,2 BHK,1160 - 1195,2.0,0.0,59.935,2


Above shows that total_sqft can be a range (e.g. 2100-2850). For such case we can just take average of min and max value in the range.

In [18]:

def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None

In [19]:
df4 = df3.copy()
df4.total_sqft = df4.total_sqft.apply(convert_sqft_to_num)
df4 = df4[df4.total_sqft.notnull()]
df4.loc[30]

area_type     Super built-up  Area
location                 Yelahanka
size                         4 BHK
total_sqft                  2475.0
bath                           4.0
balcony                        0.0
price                        186.0
bhk                              4
Name: 30, dtype: object

For above
row, it shows total_sqft as 2475 which is an average of the range 2100-2850



# adding a new feature "price per sqft" which is common in India

In [20]:
df5 = df4.copy()
df5['price_per_sqft'] = df5['price']*100000/df5['total_sqft']
df5.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,3699.810606
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4,4615.384615
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3,4305.555556
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3,6245.890861
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2,4250.0


In [21]:

df5['price_per_sqft'].describe()


count    1.266600e+04
mean     6.874965e+03
std      2.263456e+04
min      2.678298e+02
25%      4.242820e+03
50%      5.376344e+03
75%      7.142857e+03
max      2.300000e+06
Name: price_per_sqft, dtype: float64

In [22]:
df5['location'].value_counts()

Whitefield            513
Sarjapur  Road        372
Electronic City       300
Kanakpura Road        259
Thanisandra           230
                     ... 
Milk Colony             1
Sundara Nagar           1
Jaladarsini Layout      1
Madanayakahalli         1
Abshot Layout           1
Name: location, Length: 1258, dtype: int64

# Dimensionality Reduction
Any location having less than 10 data points should be tagged as "other" location. This way number of categories can be reduced by huge amount. Later on when we do one hot encoding, it will help us with having fewer dummy columns


Examine locations which is a categorical variable. We need to apply dimensionality reduction technique here to reduce number of location

In [23]:
df5.location = df5.location.apply(lambda x: x.strip())
location_stats = df5['location'].value_counts(ascending=False)
location_stats

Whitefield           514
Sarjapur  Road       372
Electronic City      302
Kanakpura Road       259
Thanisandra          233
                    ... 
Subbannaiah Palya      1
whitefiled             1
Medi Agrahara          1
Sadduguntepalya        1
Abshot Layout          1
Name: location, Length: 1247, dtype: int64

In [24]:
len(location_stats[location_stats>10])

235

In [25]:

len(location_stats[location_stats<=10])

1012

In [26]:
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

1st Block Koramangala    10
Gunjur Palya             10
Kalkere                  10
Nagappa Reddy Layout     10
Dairy Circle             10
                         ..
Subbannaiah Palya         1
whitefiled                1
Medi Agrahara             1
Sadduguntepalya           1
Abshot Layout             1
Name: location, Length: 1012, dtype: int64

In [27]:
df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
len(df5.location.unique())

236

In [28]:
df5.head(10)

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,3699.810606
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4,4615.384615
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3,4305.555556
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3,6245.890861
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2,4250.0
5,Super built-up Area,Whitefield,2 BHK,1170.0,2.0,1.0,38.0,2,3247.863248
8,Super built-up Area,Marathahalli,3 BHK,1310.0,3.0,1.0,63.25,3,4828.244275
10,Super built-up Area,Whitefield,3 BHK,1800.0,2.0,2.0,70.0,3,3888.888889
11,Plot Area,Whitefield,4 Bedroom,2785.0,5.0,3.0,295.0,4,10592.459605
12,Super built-up Area,7th Phase JP Nagar,2 BHK,1000.0,2.0,1.0,38.0,2,3800.0


# Outlier Removal Using Business Logic
Normally square ft per bedroom is around 300-400 (i.e. 2 bhk apartment is minimum 600 sqf,so for example 400 sqft apartment with 2 bhk seems suspicious and can be removed as an outlier. We will remove such outliers by keeping our minimum thresold per bhk to be 350 sqft

In [29]:

df5[df5.total_sqft/df5.bhk<350].head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
26,Super built-up Area,Electronic City,2 BHK,660.0,1.0,1.0,23.1,2,3500.0
29,Super built-up Area,Electronic City,3 BHK,1025.0,2.0,1.0,47.0,3,4585.365854
58,Plot Area,Murugeshpalya,6 Bedroom,1407.0,4.0,1.0,150.0,6,10660.98081
68,Plot Area,Devarachikkanahalli,8 Bedroom,1350.0,7.0,0.0,85.0,8,6296.296296
70,Plot Area,other,3 Bedroom,500.0,3.0,2.0,100.0,3,20000.0


Check above data points. We have 6 bhk apartment with 1407 sqft. Another one is 8 bhk and total sqft is 1350. These are clear data errors that can be removed safely

In [30]:

df5.shape

(12666, 9)

In [31]:
df6 = df5[~(df5.total_sqft/df5.bhk<350)]
df6.shape

(11686, 9)

In [32]:
df6['price_per_sqft'].describe()

count     11686.000000
mean       6097.109904
std        3859.638681
min         267.829813
25%        4190.260476
50%        5219.868416
75%        6722.689076
max      176470.588235
Name: price_per_sqft, dtype: float64

#
Here we find that min price per sqft is around 267 rs/sqft whereas max is around 11686 rs/sqft, this shows a wide variation in property prices. We should remove outliers per location using mean and 3* standard deviation



In [33]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft>(m-3*st)) & (subdf.price_per_sqft<=(m+3*st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out
df7 = remove_pps_outliers(df6)
df7.shape

(11512, 9)

In [34]:
df7.rename(columns = {'price':'price(in lakhs)'}, inplace = True)
df7.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price(in lakhs),bhk,price_per_sqft
0,Super built-up Area,1st Block Jayanagar,4 BHK,2850.0,4.0,1.0,428.0,4,15017.54386
1,Super built-up Area,1st Block Jayanagar,3 BHK,1630.0,3.0,2.0,194.0,3,11901.840491
2,Super built-up Area,1st Block Jayanagar,3 BHK,1875.0,2.0,3.0,235.0,3,12533.333333
3,Plot Area,1st Block Jayanagar,4 Bedroom,2400.0,4.0,2.0,450.0,4,18750.0
4,Super built-up Area,1st Block Jayanagar,2 BHK,1000.0,3.0,2.0,60.0,2,6000.0


In [35]:
df7["bath"].describe()

count    11512.000000
mean         2.473853
std          0.960796
min          1.000000
25%          2.000000
50%          2.000000
75%          3.000000
max         13.000000
Name: bath, dtype: float64

# We see above maximum no. of bathrooms are=13 which seems to be an  outlier ,hence we take care of such data points

In [36]:
df7[df7.bath>df7.bhk+2]

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price(in lakhs),bhk,price_per_sqft
1836,Built-up Area,Chikkabanavar,4 Bedroom,2460.0,7.0,2.0,80.0,4,3252.03252
6660,Super built-up Area,Rajaji Nagar,5 BHK,7500.0,8.0,3.0,1700.0,5,22666.666667
7854,Super built-up Area,Thanisandra,3 BHK,1806.0,6.0,2.0,116.0,3,6423.03433
9685,Super built-up Area,other,6 BHK,11338.0,9.0,1.0,1000.0,6,8819.897689


also if in any home no. of bathrooms are greater than number of bedrooms by 2 ,then is its an unsual data so we remove such points

In [37]:
df7.drop([1836,6660,7854,9685],axis=0,inplace=True)

In [38]:
df7.drop('size',axis='columns',inplace=True)


In [39]:
df7.head()

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price(in lakhs),bhk,price_per_sqft
0,Super built-up Area,1st Block Jayanagar,2850.0,4.0,1.0,428.0,4,15017.54386
1,Super built-up Area,1st Block Jayanagar,1630.0,3.0,2.0,194.0,3,11901.840491
2,Super built-up Area,1st Block Jayanagar,1875.0,2.0,3.0,235.0,3,12533.333333
3,Plot Area,1st Block Jayanagar,2400.0,4.0,2.0,450.0,4,18750.0
4,Super built-up Area,1st Block Jayanagar,1000.0,3.0,2.0,60.0,2,6000.0


In [40]:
df7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11508 entries, 0 to 11511
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   area_type        11508 non-null  object 
 1   location         11508 non-null  object 
 2   total_sqft       11508 non-null  float64
 3   bath             11508 non-null  float64
 4   balcony          11508 non-null  float64
 5   price(in lakhs)  11508 non-null  float64
 6   bhk              11508 non-null  int64  
 7   price_per_sqft   11508 non-null  float64
dtypes: float64(5), int64(1), object(2)
memory usage: 809.2+ KB


Now we observe that we have two columns which have text data values in it ,hence for them we'll use one hot encoding

In [41]:
dummies1 = pd.get_dummies(df7.location)
dummies1.head(3)

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,AECS Layout,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
dummies2= pd.get_dummies(df7.area_type)
dummies2.head(3)

Unnamed: 0,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1


In [43]:
df8=pd.concat([df7,dummies1,dummies2],axis='columns')
df8.head()

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price(in lakhs),bhk,price_per_sqft,1st Block Jayanagar,1st Phase JP Nagar,...,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,Super built-up Area,1st Block Jayanagar,2850.0,4.0,1.0,428.0,4,15017.54386,1,0,...,0,0,0,0,0,0,0,0,0,1
1,Super built-up Area,1st Block Jayanagar,1630.0,3.0,2.0,194.0,3,11901.840491,1,0,...,0,0,0,0,0,0,0,0,0,1
2,Super built-up Area,1st Block Jayanagar,1875.0,2.0,3.0,235.0,3,12533.333333,1,0,...,0,0,0,0,0,0,0,0,0,1
3,Plot Area,1st Block Jayanagar,2400.0,4.0,2.0,450.0,4,18750.0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,Super built-up Area,1st Block Jayanagar,1000.0,3.0,2.0,60.0,2,6000.0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [44]:
df8.drop(['area_type','location'],axis='columns',inplace=True)
df8.head()

Unnamed: 0,total_sqft,bath,balcony,price(in lakhs),bhk,price_per_sqft,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,...,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,2850.0,4.0,1.0,428.0,4,15017.54386,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1630.0,3.0,2.0,194.0,3,11901.840491,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1875.0,2.0,3.0,235.0,3,12533.333333,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2400.0,4.0,2.0,450.0,4,18750.0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1000.0,3.0,2.0,60.0,2,6000.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [45]:
df8.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11508 entries, 0 to 11511
Columns: 246 entries, total_sqft to Super built-up  Area
dtypes: float64(5), int64(1), uint8(240)
memory usage: 3.2 MB


Building Machine learning model now

In [46]:
df8.shape

(11508, 246)

In [48]:
X = df8.drop(['price(in lakhs)','price_per_sqft'],axis='columns')
X.head(3)

Unnamed: 0,total_sqft,bath,balcony,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,...,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,2850.0,4.0,1.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1630.0,3.0,2.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1875.0,2.0,3.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [49]:
X.shape

(11508, 244)

In [50]:
y = df8['price(in lakhs)']
y.head(3)

0    428.0
1    194.0
2    235.0
Name: price(in lakhs), dtype: float64

In [51]:
len(y)

11508

In [52]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [53]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.5780233322988548

# Use K Fold cross validation to measure accuracy of our LinearRegression model

In [54]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.55071473, 0.51047348, 0.53556419, 0.56772006, 0.36263423])

We can see that in 5 iterations we get a highest score of around 56%. This is decent good but we want to test few other algorithms for regression to see if we can get even better score. We will use GridSearchCV for this purpose

# Find best model using GridSearchCV

In [55]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.506258,{'normalize': True}
1,lasso,0.417647,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.477673,"{'criterion': 'friedman_mse', 'splitter': 'ran..."



Based on above results we can say that LinearRegression gives the best score. Hence we will use that

# Test the model for few properties

In [56]:
def predict_price(location,sqft,bath,bhk):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] = 1

    return lr_clf.predict([x])[0]

In [57]:
predict_price('1st Phase JP Nagar',1000, 2, 2)

105.41556849890294

In [58]:
predict_price('1st Phase JP Nagar',1000, 3, 3)

137.86943514021465

# Export the tested model to a pickle file

In [59]:
import pickle
with open('banglore_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

# Export location and column information to a file that will be useful later on in our prediction application

In [60]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))