In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn import metrics
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline

In [2]:
df = pd.read_csv('bengaluru_house_prices.csv')

### DATA EXPLORATION

In [3]:
df.shape

(13320, 9)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [5]:
df.sample(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
188,Super built-up Area,Ready To Move,KR Puram,2 BHK,MCvarar,1015 - 1540,2.0,0.0,56.8
13051,Super built-up Area,Ready To Move,Kenchenahalli,3 BHK,Unateit,1720,3.0,,100.0
3340,Super built-up Area,18-Nov,Thanisandra,3 BHK,,1588,3.0,1.0,75.55
6166,Super built-up Area,18-May,Yelahanka,3 BHK,Miitat,1890,3.0,2.0,108.0
4954,Super built-up Area,22-Jun,Hoodi,2 BHK,GoAirej,1430,2.0,2.0,110.0


In [6]:
for column in df.columns:
    print(df[column].value_counts())
    print("*"*20)

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
********************
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
********************
location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64
********************
size
2 BHK    

In [7]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [8]:
df.drop(['area_type', 'balcony', 'availability', 'society'], inplace= True, axis =1)

In [9]:
df.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [11]:
df['location'].value_counts()

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

In [12]:
df['location'] = df['location'].fillna('Sarjapur Road')

In [13]:
df['size'].value_counts()

size
2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: count, dtype: int64

In [14]:
df['size'] = df['size'].fillna('2 BHK')

In [15]:
df['bath'].value_counts

<bound method IndexOpsMixin.value_counts of 0        2.0
1        5.0
2        2.0
3        3.0
4        2.0
        ... 
13315    4.0
13316    5.0
13317    2.0
13318    4.0
13319    1.0
Name: bath, Length: 13320, dtype: float64>

In [16]:
df['bath'] = df['bath'].fillna(df['bath'].median())

In [17]:
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [19]:
df['bhk'] = df['size'].str.split().str.get(0).astype(int)

In [20]:
df[df.bhk>20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [21]:
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [22]:
def convertRange(x):
    temp = x.split('-')
    if len(temp) == 2:
        return(float(temp[0]) + float(temp[1]))/2
    try:
        return float(x)
    except:
        return None

In [23]:
df['total_sqft'] = df['total_sqft'].apply(convertRange)

In [24]:
df['total_sqft'].unique()

array([1056. , 2600. , 1440. , ..., 1258.5,  774. , 4689. ])

In [25]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


#### Price Per Square feet

In [26]:
df['price_per_sqft'] = df['price']*1000 / df['total_sqft']

In [27]:
df['price_per_sqft']

0         36.998106
1         46.153846
2         43.055556
3         62.458909
4         42.500000
            ...    
13315     66.898349
13316    111.111111
13317     52.585451
13318    104.073363
13319     30.909091
Name: price_per_sqft, Length: 13320, dtype: float64

In [28]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,79.075012
std,1238.405258,1.338754,148.971674,1.294496,1064.295948
min,1.0,1.0,8.0,1.0,2.678298
25%,1100.0,2.0,50.0,2.0,42.668645
50%,1276.0,2.0,72.0,3.0,54.343063
75%,1680.0,3.0,120.0,3.0,73.117461
max,52272.0,40.0,3600.0,43.0,120000.0


In [29]:
df['location'].value_counts()

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Uvce Layout                         1
Abshot Layout                       1
Name: count, Length: 1306, dtype: int64

In [30]:
df['location'] = df['location'].apply(lambda x: x.strip())
location_count = df['location'].value_counts()

In [31]:
location_count_less_10 = location_count[location_count<10]
location_count_less_10

location
Volagerekallahalli                    9
Medahalli                             9
Vishwanatha Nagenahalli               9
2nd Phase JP Nagar                    9
Chennammana Kere                      9
                                     ..
1Channasandra                         1
Hosahalli                             1
Vijayabank bank layout                1
near Ramanashree California resort    1
Abshot Layout                         1
Name: count, Length: 1041, dtype: int64

In [32]:
df['location'] = df['location'].apply(lambda x: 'other' if x in location_count_less_10 else x)

In [33]:
df['location'].value_counts()

location
other                  2756
Whitefield              541
Sarjapur  Road          399
Electronic City         304
Kanakpura Road          273
                       ... 
BTM 1st Stage            10
Basapura                 10
Sector 1 HSR Layout      10
Kalkere                  10
Nagadevanahalli          10
Name: count, Length: 255, dtype: int64

In [34]:
def remove_outliers(df):
    df_output = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        gen_df = subdf[(subdf.price_per_sqft > (m-st)) & (subdf.price_per_sqft <= (m+st))]
        df_output = pd.concat([df_output, gen_df], ignore_index=True)

    return df_output

df = remove_outliers(df)
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,11449.0,11449.0,11449.0,11449.0,11449.0
mean,1536.635566,2.603371,104.593878,2.718491,62.85768
std,1142.361892,1.20649,134.951057,1.153099,35.896412
min,250.0,1.0,8.0,1.0,2.678298
25%,1100.0,2.0,50.0,2.0,42.857143
50%,1280.0,2.0,70.0,3.0,53.097345
75%,1660.0,3.0,110.0,3.0,68.376068
max,52272.0,27.0,3600.0,27.0,531.5


In [35]:
df.shape

(11449, 7)

In [36]:
def bhk_outliers(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
            # print(location, bhk_stats)

    for bhk, bhk_df in location_df.groupby('bhk'):
        stats = bhk_stats.get(bhk-1)
        if stats and stats['count'] > 5:
            exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)

    return df.drop(exclude_indices, axis='index')

df = bhk_outliers(df)
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10254.0,10254.0,10254.0,10254.0,10254.0
mean,1528.292979,2.589819,108.626805,2.695924,64.898978
std,947.074596,1.177953,140.498767,1.108909,37.100612
min,250.0,1.0,8.0,1.0,11.501721
25%,1101.0,2.0,50.015,2.0,44.0
50%,1288.5,2.0,72.11,3.0,55.062571
75%,1680.0,3.0,115.0,3.0,70.305804
max,30400.0,27.0,3600.0,27.0,531.5


In [37]:
df.shape

(10254, 7)

In [38]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,150.175439
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,119.018405
2,1st Block Jayanagar,6 BHK,1200.0,6.0,125.0,6,104.166667
3,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,125.333333
4,1st Block Jayanagar,7 Bedroom,930.0,4.0,85.0,7,91.397849


In [39]:
df.drop(columns = ['size', 'price_per_sqft'], inplace=True)

In [40]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1200.0,6.0,125.0,6
3,1st Block Jayanagar,1875.0,2.0,235.0,3
4,1st Block Jayanagar,930.0,4.0,85.0,7


In [41]:
df.to_csv('Cleaned_data.csv')

## Spliting Training and Testing Data

In [42]:
x = df.drop(columns=['price'])
y = df['price']

In [43]:
X_train, X_test,y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state=0)

## Model training

### Applying Linear Regression

In [44]:
column_transform = make_column_transformer((OneHotEncoder(sparse_output=False), ['location']), remainder='passthrough')

scaler = StandardScaler()

In [45]:
lr = LinearRegression()

In [46]:
pipe = make_pipeline(column_transform, scaler, lr)

In [47]:
pipe.fit(X_train, y_train)

In [48]:
y_pred = pipe.predict(X_test)

In [49]:
r2_score(y_test, y_pred) 

0.6946387072220161

### Applying Lasso

In [50]:
lasso = Lasso()

In [51]:
pipe = make_pipeline(column_transform, scaler, lasso)

In [52]:
pipe.fit(X_train, y_train)

In [53]:
y_pred_lasso = pipe.predict(X_test)

In [54]:
r2_score(y_test, y_pred_lasso) 

0.6989758128710108

### Applying Ridge

In [55]:
ridge = Ridge()

In [56]:
pipe = make_pipeline(column_transform, scaler, ridge)

In [57]:
pipe.fit(X_train, y_train)

In [58]:
y_pred_ridge = pipe.predict(X_test)

In [59]:
r2_score(y_test, y_pred_ridge) 

0.6947707770532627

In [60]:
import pickle

In [61]:
pickle.dump(pipe, open('Model.pkl', 'wb'))