In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('Bengaluru_House_Data.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [None]:
columns = df.columns
columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [None]:
df.shape

(13320, 9)

In [None]:
df['balcony'].value_counts()

balcony
2.0    5113
1.0    4897
3.0    1672
0.0    1029
Name: count, dtype: int64

In [None]:
df.drop(['society','area_type','availability','balcony'], axis=1, inplace=True)
df

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.00
2,Uttarahalli,3 BHK,1440,2.0,62.00
3,Lingadheeranahalli,3 BHK,1521,3.0,95.00
4,Kothanur,2 BHK,1200,2.0,51.00
...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453,4.0,231.00
13316,Richards Town,4 BHK,3600,5.0,400.00
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,60.00
13318,Padmanabhanagar,4 BHK,4689,4.0,488.00


In [None]:
df.isna().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [None]:
df.dropna(inplace=True)
df.isna().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [None]:
df.shape

(13246, 5)

In [None]:
df['bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0]))
df

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.00,4
2,Uttarahalli,3 BHK,1440,2.0,62.00,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.00,3
4,Kothanur,2 BHK,1200,2.0,51.00,2
...,...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453,4.0,231.00,5
13316,Richards Town,4 BHK,3600,5.0,400.00,4
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,60.00,2
13318,Padmanabhanagar,4 BHK,4689,4.0,488.00,4


In [None]:
df.drop('size', axis=1, inplace=True)
df

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056,2.0,39.07,2
1,Chikka Tirupathi,2600,5.0,120.00,4
2,Uttarahalli,1440,2.0,62.00,3
3,Lingadheeranahalli,1521,3.0,95.00,3
4,Kothanur,1200,2.0,51.00,2
...,...,...,...,...,...
13315,Whitefield,3453,4.0,231.00,5
13316,Richards Town,3600,5.0,400.00,4
13317,Raja Rajeshwari Nagar,1141,2.0,60.00,2
13318,Padmanabhanagar,4689,4.0,488.00,4


In [None]:
df['total_sqft'].value_counts()

total_sqft
1200    843
1100    221
1500    204
2400    195
600     180
       ... 
5985      1
3580      1
2461      1
1437      1
4689      1
Name: count, Length: 2067, dtype: int64

In [None]:
def is_float(x):
    """Return True if ``x`` can be converted with ``float()``, else False.

    Used to spot ``total_sqft`` entries that are not plain numbers
    (ranges such as ``"2100 - 2850"`` or values carrying a unit suffix).
    """
    try:
        float(x)
    # Only the errors float() itself raises; a bare ``except`` would also
    # swallow KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

df[~df['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,location,total_sqft,bath,price,bhk
30,Yelahanka,2100 - 2850,4.0,186.0,4
122,Hebbal,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,1042 - 1105,2.0,54.005,2
165,Sarjapur,1145 - 1340,2.0,43.49,2
188,KR Puram,1015 - 1540,2.0,56.8,2
410,Kengeri,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,1195 - 1440,2.0,63.77,2
648,Arekere,4125Perch,9.0,265.0,9
661,Yelahanka,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,3090 - 5002,4.0,445.0,4


In [None]:
def convert_to_num(x):
    """Convert a raw ``total_sqft`` entry to a float.

    * ``"a - b"`` ranges are replaced by the midpoint ``(a + b) / 2``.
    * Anything ``float()`` accepts is returned as a float.
    * Everything else (e.g. ``"34.46Sq. Meter"``, ``"4125Perch"``) yields
      ``None`` so the row can be removed later with ``dropna``.
    """
    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Not a numeric range after all (e.g. "-5" or "1e-5");
                # fall through and try the value as a single number.
                pass
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None

df['total_sqft'] = df['total_sqft'].apply(convert_to_num)
df.head(10)

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Uttarahalli,1440.0,2.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Kothanur,1200.0,2.0,51.0,2
5,Whitefield,1170.0,2.0,38.0,2
6,Old Airport Road,2732.0,4.0,204.0,4
7,Rajaji Nagar,3300.0,4.0,600.0,4
8,Marathahalli,1310.0,3.0,63.25,3
9,Gandhi Bazar,1020.0,6.0,370.0,6


In [None]:
df.isna().sum()

location       0
total_sqft    46
bath           0
price          0
bhk            0
dtype: int64

In [None]:
df.dropna(inplace=True)
df.isna().sum()

location      0
total_sqft    0
bath          0
price         0
bhk           0
dtype: int64

In [None]:
df.shape

(13200, 5)

In [None]:
df['price_per_sqft'] = df['price']*100000/df['total_sqft']
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0


In [None]:
location_count = df['location'].value_counts()
location_count

location
Whitefield                   532
Sarjapur  Road               392
Electronic City              302
Kanakpura Road               264
Thanisandra                  232
                            ... 
Indiranagar HAL 2nd Stage      1
Maruthi HBCS Layout            1
K R C kothanur                 1
1Channasandra                  1
Abshot Layout                  1
Name: count, Length: 1298, dtype: int64

In [None]:
location_count[(location_count < 10)]

location
2nd Phase JP Nagar           9
Chandra Layout               9
Yemlur                       9
Kaverappa Layout             9
Banagiri Nagar               9
                            ..
Indiranagar HAL 2nd Stage    1
Maruthi HBCS Layout          1
K R C kothanur               1
1Channasandra                1
Abshot Layout                1
Name: count, Length: 1045, dtype: int64

In [None]:
# Collapse every location with fewer than 10 listings into a single "other"
# bucket so the later one-hot encoding stays manageable. The set of rare
# locations is computed once instead of once per row.
rare_locations = location_count.index[location_count < 10]
df['location'] = df['location'].where(~df['location'].isin(rare_locations), "other")

In [None]:
location_count = df['location'].value_counts()
location_count

location
other                   2757
Whitefield               532
Sarjapur  Road           392
Electronic City          302
Kanakpura Road           264
                        ... 
Nagappa Reddy Layout      10
BTM 1st Stage             10
Basapura                  10
Kalkere                   10
Nagadevanahalli           10
Name: count, Length: 254, dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13200 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   location        13200 non-null  object 
 1   total_sqft      13200 non-null  float64
 2   bath            13200 non-null  float64
 3   price           13200 non-null  float64
 4   bhk             13200 non-null  int64  
 5   price_per_sqft  13200 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 721.9+ KB


In [None]:
df = df[~(df.total_sqft/df.bhk<300)]
df

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.00,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.00,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.00,3,6245.890861
4,Kothanur,1200.0,2.0,51.00,2,4250.000000
...,...,...,...,...,...,...
13315,Whitefield,3453.0,4.0,231.00,5,6689.834926
13316,other,3600.0,5.0,400.00,4,11111.111111
13317,Raja Rajeshwari Nagar,1141.0,2.0,60.00,2,5258.545136
13318,Padmanabhanagar,4689.0,4.0,488.00,4,10407.336319


In [None]:
def remove_pps_outliers(df):
    """Drop rows whose ``price_per_sqft`` is atypical for their location.

    For each ``location`` group, only rows with ``price_per_sqft`` in the
    half-open interval ``(mean - std, mean + std]`` are kept, where ``std``
    is the population standard deviation (``ddof=0``) of that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        Surviving rows, ordered by location group, with a fresh
        ``0..n-1`` index. An empty frame (with the input columns) is
        returned when there is nothing to keep.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sqft
        m = pps.mean()
        st = pps.std(ddof=0)
        # NOTE: the lower bound is strict, so a group whose values are all
        # identical (std == 0) is removed entirely.
        kept.append(subdf[(pps > (m - st)) & (pps <= (m + st))])
    if not kept:
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once; growing a frame inside the loop is quadratic.
    return pd.concat(kept, ignore_index=True)
df = remove_pps_outliers(df)
df.shape

(10226, 6)

In [None]:
df

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Devarachikkanahalli,1250.0,2.0,44.00,3,3520.000000
1,Devarachikkanahalli,1250.0,2.0,40.00,2,3200.000000
2,Devarachikkanahalli,1200.0,2.0,83.00,2,6916.666667
3,Devarachikkanahalli,1170.0,2.0,40.00,2,3418.803419
4,Devarachikkanahalli,1425.0,2.0,65.00,3,4561.403509
...,...,...,...,...,...,...
10221,other,1353.0,2.0,110.00,2,8130.081301
10222,other,812.0,1.0,26.00,1,3201.970443
10223,other,1440.0,2.0,63.93,3,4439.583333
10224,other,1075.0,2.0,48.00,2,4465.116279


In [None]:
def remove_bhk_outliers(df):
    """Drop listings priced below the mean of the next-smaller bhk group.

    Within each ``location``, a row with ``bhk == n`` is removed when its
    ``price_per_sqft`` is strictly below the mean ``price_per_sqft`` of the
    ``bhk == n - 1`` rows in the same location, provided that smaller group
    has more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the excluded rows; the original index labels of the
        remaining rows are preserved.
    """
    # A plain list keeps the index labels in their original dtype; a float
    # numpy accumulator would coerce them and is re-allocated on every append.
    exclude_indices = []
    for _, location_df in df.groupby('location'):
        pps_by_bhk = {
            bhk: bhk_df.price_per_sqft
            for bhk, bhk_df in location_df.groupby('bhk')
        }
        for bhk, pps in pps_by_bhk.items():
            smaller = pps_by_bhk.get(bhk - 1)
            if smaller is not None and len(smaller) > 5:
                exclude_indices.extend(pps[pps < smaller.mean()].index)
    return df.drop(exclude_indices, axis='index')
df = remove_bhk_outliers(df)
df.shape

(7380, 6)

In [None]:
df = df[df.bath<df.bhk+2]
df.shape

(7302, 6)

In [None]:
df.head(2)

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
1,Devarachikkanahalli,1250.0,2.0,40.0,2,3200.0
2,Devarachikkanahalli,1200.0,2.0,83.0,2,6916.666667


In [None]:
df = df.drop(['price_per_sqft'],axis='columns')
df.head(3)

Unnamed: 0,location,total_sqft,bath,price,bhk
1,Devarachikkanahalli,1250.0,2.0,40.0,2
2,Devarachikkanahalli,1200.0,2.0,83.0,2
3,Devarachikkanahalli,1170.0,2.0,40.0,2


In [None]:
df = pd.concat([df, pd.get_dummies(df['location'], drop_first=True)], axis=1)
df.drop('location', axis=1, inplace=True)
df.head(3)

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Block Koramangala,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
1,1250.0,2.0,40.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1200.0,2.0,83.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1170.0,2.0,40.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
x = df.drop('price', axis=1)
y = df['price']

In [None]:
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# x = sc.fit_transform(x)

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

In [None]:
from sklearn.linear_model import LinearRegression
# Fit the baseline linear model on the training split and display its
# score on the held-out split (the discarded predict() call was removed).
lr = LinearRegression()
lr.fit(x_train, y_train)
lr.score(x_test, y_test)

0.9003014918200722

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
cross_val_score(LinearRegression(), x, y, cv=cv)

array([0.84384221, 0.82909644, 0.83378618, 0.84849956, 0.81198277])

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

# Candidate estimators and the hyper-parameter grid searched for each.
# random_state is fixed on the stochastic estimators (Lasso with
# selection="random", DecisionTreeRegressor with splitter="random") so the
# comparison is reproducible across runs.
model_params = {
    "linear_regression" : {
        "model" : LinearRegression(),
        "params" : {}
    },
    "lasso" : {
        "model" : Lasso(random_state=0),
        "params" : {
            "alpha" : [1,2],
            "selection" : ["random", "cyclic"]
        }
    },
    "decision_tree" : {
        "model" : DecisionTreeRegressor(random_state=0),
        "params" : {
            # "squared_error" is the current name of the former "mse"
            # criterion; the old spelling is rejected by recent
            # scikit-learn releases and those candidates would fail to fit.
            "criterion" : ["squared_error", "friedman_mse"],
            "splitter" : ["best", "random"]
        }
    }
}

# Run one grid search per estimator using the shared `cv` splitter and
# collect the best mean cross-validation score and parameters of each.
score = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp["model"], mp["params"], cv=cv, return_train_score=False)
    clf.fit(x, y)
    score.append({
        "model" : model_name,
        "best_score" : clf.best_score_,
        "best_params" : clf.best_params_
    })

score = pd.DataFrame(score, columns=["model", "best_score", "best_params"])
score

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.833441,{}
1,lasso,0.69403,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.763436,"{'criterion': 'friedman_mse', 'splitter': 'ran..."


In [None]:
import pickle
with open('banglore_home_prices_model.pickle', 'wb') as f:
    pickle.dump(lr, f)

In [None]:
import json
# Persist the lower-cased feature names, in model-input order, so that code
# loading the pickled model can rebuild the feature vector.
lowercase_feature_names = [name.lower() for name in x.columns]
columns = {"data_columns": lowercase_feature_names}
with open("columns.json", "w") as f:
    json.dump(columns, f)

In [None]:
def predict_price(location,sqft,bath,bhk):
    """Predict the price of a home with the fitted linear model ``lr``.

    The feature vector follows the column order of ``x``: positions 0-2 are
    ``total_sqft``, ``bath`` and ``bhk``, followed by one indicator column
    per location.

    Parameters
    ----------
    location : str
        Location name, matched exactly (case-sensitive) against ``x.columns``.
        A location with no indicator column -- an unknown name, or the
        category dropped by ``drop_first=True`` -- is encoded as all zeros
        instead of raising ``IndexError``.
    sqft, bath, bhk : number
        Total area, bathroom count and bedroom count.

    Returns
    -------
    float
        Predicted price, in the same units as the ``price`` column.
    """
    arr = np.zeros(len(x.columns))
    arr[0] = sqft
    arr[1] = bath
    arr[2] = bhk

    # get_indexer returns -1 for a missing label, which makes the guard
    # below meaningful (np.where(...)[0][0] raised before reaching it).
    loc_index = x.columns.get_indexer([location])[0]
    if loc_index >= 0:
        arr[loc_index] = 1

    # Pass a frame with the training column names so scikit-learn can match
    # features by name rather than receiving an unnamed array.
    return lr.predict(pd.DataFrame([arr], columns=x.columns))[0]

In [None]:
predict_price('1st Phase JP Nagar',1000, 2, 2)

85.67511958787543