In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the dataset
data = pd.read_csv("Bengaluru_House_Data.csv")
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
data.shape

(13320, 9)

In [4]:
data.describe

<bound method NDFrame.describe of                  area_type   availability                  location  \
0      Super built-up Area         19-Dec  Electronic City Phase II   
1                Plot Area  Ready To Move          Chikka Tirupathi   
2            Built-up Area  Ready To Move               Uttarahalli   
3      Super built-up Area  Ready To Move        Lingadheeranahalli   
4      Super built-up Area  Ready To Move                  Kothanur   
...                    ...            ...                       ...   
13315        Built-up Area  Ready To Move                Whitefield   
13316  Super built-up Area  Ready To Move             Richards Town   
13317        Built-up Area  Ready To Move     Raja Rajeshwari Nagar   
13318  Super built-up Area         18-Jun           Padmanabhanagar   
13319  Super built-up Area  Ready To Move              Doddathoguru   

            size  society total_sqft  bath  balcony   price  
0          2 BHK  Coomee        1056   2.0      1.0

In [5]:
data.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [6]:
#Data clean: handle null values
data = data.dropna()

In [7]:
data.area_type.unique()

array(['Super built-up Area', 'Plot Area', 'Built-up Area', 'Carpet Area'],
      dtype=object)

In [8]:
#drop availability because it's not required 
data = data.drop('availability',axis=1)

In [9]:
data.location.unique()

array(['Electronic City Phase II', 'Chikka Tirupathi',
       'Lingadheeranahalli', 'Whitefield', '7th Phase JP Nagar',
       'Sarjapur', 'Mysore Road', 'Bisuvanahalli',
       'Raja Rajeshwari Nagar', 'Ramakrishnappa Layout', 'Binny Pete',
       'Thanisandra', ' Thanisandra', 'Electronic City',
       'Ramagondanahalli', 'Yelahanka', 'Hebbal', 'Kanakpura Road',
       'Kundalahalli', 'Sarjapur  Road', 'Ganga Nagar', 'Doddathoguru',
       'Adarsh Nagar', 'Bhoganhalli', 'Lakshminarayana Pura',
       'Begur Road', 'Varthur', 'Gunjur', 'Hegde Nagar', 'Haralur Road',
       'Hennur Road', 'Cholanayakanahalli', 'Kodanda Reddy Layout',
       'EPIP Zone', 'Dasanapura', 'Kasavanhalli', 'Sanjay nagar',
       'Kengeri', 'Yeshwanthpur', 'Chandapura', 'Kothanur',
       'Green View Layout', 'Shantiniketan Layout', 'Rajaji Nagar',
       'Devanahalli', 'Byatarayanapura', 'Akshaya Nagar',
       'LB Shastri Nagar', 'Hormavu', 'Peenya', 'Kudlu Gate',
       '8th Phase JP Nagar', 'Chandra Layout

In [10]:
len(data.location.unique())

634

In [11]:
location_count = data.groupby('location').size().sort_values(ascending=False)
location_count

location
Whitefield           397
Sarjapur  Road       310
Electronic City      236
Kanakpura Road       216
Thanisandra          212
                    ... 
Jeevan bima nagar      1
Jnana Ganga Nagar      1
Jogupalya              1
KPC Layout             1
 Banaswadi             1
Length: 634, dtype: int64

In [12]:
len(location_count[location_count<=10])

491

In [13]:
location_less_10 = location_count[location_count<=10]

In [14]:
#if the location have less than 10 or 10 houses than that location are known as other location
data.location = data.location.apply(lambda x: 'other' if x in location_less_10 else x)

In [15]:
len(data.location.unique())

144

In [16]:
data['bhk'] = data['size'].apply(lambda x: int(x.split(' ')[0]))
data.bhk.unique() 

array([ 2,  4,  3,  1,  5, 11,  9,  6,  7], dtype=int64)

In [17]:
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None

In [18]:
data.total_sqft = data.total_sqft.apply(convert_sqft_to_num)
data = data[data.total_sqft.notnull()]
data.head()

Unnamed: 0,area_type,location,size,society,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,Coomee,1056.0,2.0,1.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,Theanmp,2600.0,5.0,3.0,120.0,4
3,Super built-up Area,Lingadheeranahalli,3 BHK,Soiewre,1521.0,3.0,1.0,95.0,3
5,Super built-up Area,Whitefield,2 BHK,DuenaTa,1170.0,2.0,1.0,38.0,2
11,Plot Area,Whitefield,4 Bedroom,Prrry M,2785.0,5.0,3.0,295.0,4


In [19]:
#drop society column because it's not required
data = data.drop('society',axis=1)

In [20]:
data.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3
5,Super built-up Area,Whitefield,2 BHK,1170.0,2.0,1.0,38.0,2
11,Plot Area,Whitefield,4 Bedroom,2785.0,5.0,3.0,295.0,4


In [21]:
#create new column price_sqft that contain price per sqft
data['price_sqft'] = data.price*100000/data.total_sqft

In [22]:
#remove outlier using standard deviation
upper_limit = data.price_sqft.mean() + data.price_sqft.std()
lower_limit = data.price_sqft.mean() - data.price_sqft.std()
data = data[data.price_sqft<upper_limit] 
data = data[data.price_sqft > lower_limit]

In [23]:
#remove 2BHK house that price per sqft is less than the mean of 1bhk house price
def remove_bhk_outliers(data):
    exclude_indices = np.array([])
    for location, location_df in data.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_sqft),
                'std': np.std(bhk_df.price_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_sqft<(stats['mean'])].index.values)
    return data.drop(exclude_indices,axis='index')
data = remove_bhk_outliers(data)

In [24]:
data = data.drop('price_sqft',axis=1)

In [25]:
#apply one hot encoding in location column
dummies = pd.get_dummies(data.location)
location_cat = pd.get_dummies(data.location)
dummies.head()

Unnamed: 0,1st Phase JP Nagar,5th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,AECS Layout,Abbigere,Akshaya Nagar,Ambalipura,Ambedkar Nagar,...,Tumkur Road,Uttarahalli,Varthur,Vijayanagar,Vittasandra,Whitefield,Yelahanka,Yelahanka New Town,Yeshwanthpur,other
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
12,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
</head>

<body>
    <div class="page-wrapper bg-blue p-t-100 p-b-100 font-robo">
        <div class="wrapper wrapper--w680">
            <div class="card card-1">
                <div class="card-heading"></div>
                <div class="card-body">
                    <h2 class="title">Real Estate Price Prediction ML App </h2>
                    <form action="{{ url_for('predict')}}" method="POST">
                        <div class="input-group">
                            <input class="input--style-1" type="text" placeholder="Enter a location name" name="location">
                        </div>

                        <div class="input-group">
                            <input class="input--style-1" type="text" placeholder="Enter a total sqft." name="sqft">
                        </div>

                        <div class="input-group">
                            <div class="rs-select2 js-select-simple select--no-search">
                                <select name="area">
                                    <option disabled="disabled" selected="selected">Area type</option>
                                    <option value="Built-up Area">Built-up Area</option>
                                    <option value="Carpet Area">Carpet Area</option>
                                    <option value="Plot Area">Plot Area</option>
                                    <option value="Super built-up Area">Super built-up Area</option>
                                </select>
                                <div class="select-dropdown"></div>
                            </div>
                        </div>

                        <div class="input-group">
                            <div class="rs-select2 js-select-simple select--no-search">
                                <select name="size">
                                    <option disabled="disabled" selected="selected">Size</option>
                                    <option value="1">1</option>
                                    <option value="2">2</option>
                                    <option value="3">3</option>
                                    <option value="4">4</option>
                                    <option value="5">5</option>
                                </select>
                                <div class="select-dropdown"></div>
                            </div>
                        </div>

                        <div class="input-group">
                            <div class="rs-select2 js-select-simple select--no-search">
                                <select name="bath">
                                    <option disabled="disabled" selected="selected">Bath</option>
                                    <option value="1">1</option>
                                    <option value="2">2</option>
                                    <option value="3">3</option>
                                    <option value="4">4</option>
                                    <option value="5">5</option>
                                </select>
                                <div class="select-dropdown"></div>
                            </div>
                        </div>

                        <div class="input-group">
                            <div class="rs-select2 js-select-simple select--no-search">
                                <select name="balcony">
                                    <option disabled="disabled" selected="selected">Balcony</option>
                                    <option value="1">1</option>
                                    <option value="2">2</option>
                                    <option value="3">3</option>
                                    <option value="4">4</option>
                                    <option value="5">5</option>
                                </select>
                                <div class="select-dropdown"></div>
                            </div>
                        </div>
                        <div class="p-t-20">
                            <button class="btn btn--radius btn--green" type="submit">Submit</button>
                        </div>
                    </form>
                    {{Predict_score}}
                </div>
            </div>
        </div>
    </div>

In [26]:
data = pd.concat([data,dummies.drop('1st Phase JP Nagar',axis='columns')],axis='columns')
data.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk,5th Phase JP Nagar,7th Phase JP Nagar,...,Tumkur Road,Uttarahalli,Varthur,Vijayanagar,Vittasandra,Whitefield,Yelahanka,Yelahanka New Town,Yeshwanthpur,other
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
11,Plot Area,Whitefield,4 Bedroom,2785.0,5.0,3.0,295.0,4,0,0,...,0,0,0,0,0,1,0,0,0,0
12,Super built-up Area,7th Phase JP Nagar,2 BHK,1000.0,2.0,1.0,38.0,2,0,1,...,0,0,0,0,0,0,0,0,0,0


In [27]:
#apply one hot encoding in area column
dummies = pd.get_dummies(data.area_type)
dummies.head()

Unnamed: 0,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,0,0,0,1
1,0,0,1,0
3,0,0,0,1
11,0,0,1,0
12,0,0,0,1


In [28]:
data = pd.concat([data,dummies],axis='columns')
data.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,bhk,5th Phase JP Nagar,7th Phase JP Nagar,...,Vittasandra,Whitefield,Yelahanka,Yelahanka New Town,Yeshwanthpur,other,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,0,0,...,0,0,0,0,0,0,0,0,0,1
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4,0,0,...,0,0,0,0,0,0,0,0,1,0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3,0,0,...,0,0,0,0,0,0,0,0,0,1
11,Plot Area,Whitefield,4 Bedroom,2785.0,5.0,3.0,295.0,4,0,0,...,0,1,0,0,0,0,0,0,1,0
12,Super built-up Area,7th Phase JP Nagar,2 BHK,1000.0,2.0,1.0,38.0,2,0,1,...,0,0,0,0,0,0,0,0,0,1


In [29]:
data = data.drop(['location','size','area_type'] , axis=1)

In [30]:
data.head()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,5th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,AECS Layout,...,Vittasandra,Whitefield,Yelahanka,Yelahanka New Town,Yeshwanthpur,other,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,1056.0,2.0,1.0,39.07,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2600.0,5.0,3.0,120.0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1521.0,3.0,1.0,95.0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11,2785.0,5.0,3.0,295.0,4,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
12,1000.0,2.0,1.0,38.0,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [31]:
#split data into dependent feature & independent feature
X = data.drop('price',axis=1)

In [32]:
y=data['price']

In [33]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.90264951, 0.89014555, 0.86736688, 0.89276874, 0.87845573])

In [34]:
np.average(regressor)

0.8862772833440082

In [35]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(X,y,test_size=0.3)

In [36]:
model = LinearRegression()
model.fit(x_train , y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [37]:
y_pred = model.predict(x_test)

In [38]:
from sklearn import metrics
print(metrics.mean_absolute_error(y_test,y_pred))
print(metrics.mean_squared_error(y_test,y_pred))
print(np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

18.084377586936473
818.0534588407193
28.601633849147834


In [39]:
#save the model
import pickle
with open('model.pkl', 'wb') as fh:
   pickle.dump(model, fh)

In [40]:
cat = data.drop(['price'] , axis=1)
index_dict = dict(zip(cat.columns,range(cat.shape[1])))
with open('cat', 'wb') as fid:
    pickle.dump(index_dict, fid,2)  

In [41]:
#save the column name
location_list = list(zip(location_cat.columns,range(cat.shape[1])))

In [42]:
with open('location_cat', 'wb') as fid:
    pickle.dump(location_list, fid,2)  

In [43]:
def predict(location,area,size,bath,balcony,total_sqft):
    output = np.zeros(151)
    output[0] = total_sqft
    output[1] = bath
    output[2] = balcony
    output[3] = size
    
    result_location = location
    if result_location not in location_cat:
        output[146] = 1
    else:
        output[index_dict[str(location)]] = 1
    
    return model.predict([output])[0]

In [44]:
#(location , area_type , size , bath , balcony , total_sqft)
predict('Yelahanka','Carpet Area', 3, 3, 2 ,3000)

267.2057470938144

In [45]:
predict('Yeshwanthpur' , 'Carpet Area' , 3 , 3 , 2 ,3560)

350.43652391145173

In [46]:
predict('Yelahanka New Town' , 'Carpet Area' , 3 , 4 , 2 ,4000)

382.3749549092262

In [47]:
predict('Yeshwanthpur' , 'Carpet Area' , 3 , 3 , 3 ,2550)

237.86795998317092

In [48]:
predict('rajkot' , 'Carpet Area' , 3 , 3 , 3 ,2550)

231.37971364990213