In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [53]:
# Load the Bengaluru housing listings from a CSV path relative to the working
# directory, then display the last five rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer='Bengaluru_House_Data.csv')
df.tail(n=5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.0
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.0
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.0
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.0
13319,Super built-up Area,Ready To Move,Doddathoguru,1 BHK,,550,1.0,1.0,17.0


In [3]:
# (rows, columns) of the loaded frame
df.shape

(13320, 9)

In [4]:
# Per-column dtype and non-null count, to spot missing data and columns
# that were read as generic objects (e.g. total_sqft)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [5]:
# Count the missing entries in each column.
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [6]:
# Missing entries per column, expressed as a percentage of all rows.
100 * (df.isna().sum() / len(df))

area_type        0.000000
availability     0.000000
location         0.007508
size             0.120120
society         41.306306
total_sqft       0.000000
bath             0.548048
balcony          4.572072
price            0.000000
dtype: float64

In [7]:
# 'society' is missing in roughly 41% of the rows, so remove the column altogether.
df.drop(columns=['society'], inplace=True)

In [8]:
# Preview the first rows to confirm 'society' is no longer in the frame.
df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [55]:
# Show the listings that have no 'size' value.
# NOTE(review): the stored output still lists 'society' although an earlier cell
# drops it, and the execution count is out of sequence -- looks like this cell was
# run after a reload; confirm with Restart & Run All.
df[df['size'].isna()]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
579,Plot Area,Immediate Possession,Sarjapur Road,,Asiss B,1200 - 2400,,,34.185
1775,Plot Area,Immediate Possession,IVC Road,,Orana N,2000 - 5634,,,124.0
2264,Plot Area,Immediate Possession,Banashankari,,,2400,,,460.0
2809,Plot Area,Immediate Possession,Sarjapur Road,,AsdiaAr,1200 - 2400,,,28.785
2862,Plot Area,Immediate Possession,Devanahalli,,Ajleyor,1500 - 2400,,,46.8
5333,Plot Area,Immediate Possession,Devanahalli,,Emngs S,2100 - 5405,,,177.115
6423,Plot Area,Immediate Possession,Whitefield,,SRniaGa,2324,,,26.73
6636,Plot Area,Immediate Possession,Jigani,,S2enste,1500,,,25.49
6719,Plot Area,Immediate Possession,Hoskote,,SJowsn,800 - 2660,,,28.545
7680,Plot Area,Immediate Possession,Kasavanhalli,,,5000,,,400.0


In [10]:
# Listings with no size get the placeholder 0, so the room-count parse in the
# next cell turns them into bhk = 0 instead of failing on NaN.
# Assign the filled column back rather than calling fillna(inplace=True) on
# df['size']: that chained in-place form is deprecated and leaves df unchanged
# once pandas Copy-on-Write is active.
df['size'] = df['size'].fillna(0)

In [11]:
# Room count = integer text before the first space of 'size'
# ('2 BHK' -> 2, '4 Bedroom' -> 4); the 0 placeholder for missing sizes stays 0.
df['bhk'] = df['size'].apply(lambda label: int(str(label).partition(' ')[0]))

In [12]:
# Preview the frame with the new integer 'bhk' column next to the original 'size' text.
df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.0,2


In [13]:
# The text column 'size' is now represented by the numeric 'bhk', so remove it.
df.drop(columns=['size'], inplace=True)

In [14]:
# Re-count missing entries per column after replacing 'size' with 'bhk'.
df.isna().sum()

area_type         0
availability      0
location          1
total_sqft        0
bath             73
balcony         609
price             0
bhk               0
dtype: int64

In [15]:
# Show the listings that have no bathroom count.
df[df['bath'].isna()]

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,bhk
56,Built-up Area,20-Feb,Devanahalli,3010 - 3410,,,192.000,4
81,Built-up Area,18-Oct,Hennur Road,2957 - 3450,,,224.500,4
224,Super built-up Area,19-Dec,Devanahalli,1520 - 1740,,,74.820,3
344,Super built-up Area,21-Dec,Kanakpura Road,525,,,21.530,1
579,Plot Area,Immediate Possession,Sarjapur Road,1200 - 2400,,,34.185,0
...,...,...,...,...,...,...,...,...
11496,Super built-up Area,21-Dec,Kanakpura Road,525,,,27.000,1
11569,Plot Area,Immediate Possession,Hosur Road,1350,,,8.440,0
12768,Built-up Area,18-Mar,Bettahalsoor,3210,,,353.000,5
12861,Super built-up Area,21-Dec,KR Puram,2204 - 2362,,,121.000,4


In [16]:
# Wider 20-row preview of the frame now that 'size' has been replaced by 'bhk'.
df.head(20)

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,2600,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,1440,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,1200,2.0,1.0,51.0,2
5,Super built-up Area,Ready To Move,Whitefield,1170,2.0,1.0,38.0,2
6,Super built-up Area,18-May,Old Airport Road,2732,4.0,,204.0,4
7,Super built-up Area,Ready To Move,Rajaji Nagar,3300,4.0,,600.0,4
8,Super built-up Area,Ready To Move,Marathahalli,1310,3.0,1.0,63.25,3
9,Plot Area,Ready To Move,Gandhi Bazar,1020,6.0,,370.0,6


In [17]:
# Listings whose bathroom count equals their room count.
df[df['bhk'] == df['bath']]

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,1056,2.0,1.0,39.07,2
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521,3.0,1.0,95.00,3
4,Super built-up Area,Ready To Move,Kothanur,1200,2.0,1.0,51.00,2
5,Super built-up Area,Ready To Move,Whitefield,1170,2.0,1.0,38.00,2
6,Super built-up Area,18-May,Old Airport Road,2732,4.0,,204.00,4
...,...,...,...,...,...,...,...,...
13312,Super built-up Area,Ready To Move,Bellandur,1262,2.0,2.0,47.00,2
13314,Super built-up Area,Ready To Move,Green Glen Layout,1715,3.0,3.0,112.00,3
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,1141,2.0,1.0,60.00,2
13318,Super built-up Area,18-Jun,Padmanabhanagar,4689,4.0,1.0,488.00,4


In [18]:
# Remove the bathroom count from the frame.
# NOTE(review): presumably dropped because 'bath' so often matches 'bhk'
# (see the comparison in the previous cell) -- confirm this is intended.
df.drop(columns=['bath'], inplace=True)

In [19]:
# Show the listings that have no balcony count.
df[df['balcony'].isna()]

Unnamed: 0,area_type,availability,location,total_sqft,balcony,price,bhk
6,Super built-up Area,18-May,Old Airport Road,2732,,204.00,4
7,Super built-up Area,Ready To Move,Rajaji Nagar,3300,,600.00,4
9,Plot Area,Ready To Move,Gandhi Bazar,1020,,370.00,6
34,Built-up Area,Ready To Move,Kasturi Nagar,1925,,125.00,3
40,Built-up Area,Ready To Move,Murugeshpalya,1296,,81.00,2
...,...,...,...,...,...,...,...
13277,Plot Area,Ready To Move,Kundalahalli Colony,1400,,218.00,7
13279,Plot Area,Ready To Move,Vishwanatha Nagenahalli,1200,,130.00,6
13306,Plot Area,Ready To Move,Rajarajeshwari Nagara,1200,,325.00,4
13309,Super built-up Area,Ready To Move,Yeshwanthpur,1675,,92.13,3


In [20]:
# How the listings with no balcony count are distributed across availability values.
# A single .loc[rows, column] lookup replaces the chained df.loc[...][...] form.
df.loc[df['balcony'].isna(), 'availability'].value_counts()

Ready To Move           503
Immediate Possession     16
18-Dec                   16
18-May                   13
18-Mar                   13
18-Apr                   10
21-Dec                    5
18-Feb                    5
18-Jul                    3
19-Dec                    3
18-Jun                    3
15-Oct                    2
19-Mar                    2
15-Nov                    2
19-Jul                    2
22-May                    2
19-May                    1
19-Jun                    1
18-Oct                    1
17-Feb                    1
18-Sep                    1
20-Feb                    1
17-Aug                    1
17-Sep                    1
19-Oct                    1
Name: availability, dtype: int64

In [21]:
# Inspect the listings marked 'Ready To Move'.
df[df['availability'] == 'Ready To Move']

Unnamed: 0,area_type,availability,location,total_sqft,balcony,price,bhk
1,Plot Area,Ready To Move,Chikka Tirupathi,2600,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,1440,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,1200,1.0,51.0,2
5,Super built-up Area,Ready To Move,Whitefield,1170,1.0,38.0,2
...,...,...,...,...,...,...,...
13314,Super built-up Area,Ready To Move,Green Glen Layout,1715,3.0,112.0,3
13315,Built-up Area,Ready To Move,Whitefield,3453,0.0,231.0,5
13316,Super built-up Area,Ready To Move,Richards Town,3600,,400.0,4
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,1141,1.0,60.0,2


In [22]:
# Treat a missing balcony count as zero balconies.
# Assign the filled column back rather than calling fillna(inplace=True) on
# df['balcony']: that chained in-place form is deprecated and leaves df unchanged
# once pandas Copy-on-Write is active.
df['balcony'] = df['balcony'].fillna(0)

In [23]:
# Discard every row that still has a missing value in any column
# (at this point only 'location' has one).
df.dropna(axis='index', how='any', inplace=True)

In [24]:
# Verify that no column has missing entries left.
df.isna().sum()

area_type       0
availability    0
location        0
total_sqft      0
balcony         0
price           0
bhk             0
dtype: int64

In [25]:
# Preview the frame once all missing values have been filled or dropped.
df.head()

Unnamed: 0,area_type,availability,location,total_sqft,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,1056,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,2600,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,1440,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,1200,1.0,51.0,2


In [26]:
# Example
# use of isinstance
#x = 2
#isinstance('2',int)

In [27]:
# Print the built-in documentation for isinstance().
# NOTE(review): looks like a scratch lookup for the commented-out cleansqft helper
# below -- consider removing it from the finished notebook.
help(isinstance)

Help on built-in function isinstance in module builtins:

isinstance(obj, class_or_tuple, /)
    Return whether an object is an instance of a class or of a subclass thereof.
    
    A tuple, as in ``isinstance(x, (A, B, ...))``, may be given as the target to
    check against. This is equivalent to ``isinstance(x, A) or isinstance(x, B)
    or ...`` etc.



In [28]:
#df['total_sqft'] = df['total_sqft'].str.split()

In [29]:
#def cleansqft(x):
#    if isinstance(x, int):
#        return x
#    elif len(x.split('-')) == 2:
#        a, b = x.split('-')
#        s = (float(a) + float(b)) / 2
#        return s
#    else:
#        return None

In [30]:
#cleansqft('1000-1100')

In [31]:
#cleansqft(1200)

In [32]:
#df[['total_sqft']].applymap(cleansqft)

In [33]:
# Distinct area_type categories present in the data.
df['area_type'].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [34]:
# First five area_type values (five is head()'s default row count).
df['area_type'].head()

0    Super built-up  Area
1              Plot  Area
2          Built-up  Area
3    Super built-up  Area
4    Super built-up  Area
Name: area_type, dtype: object

In [35]:
# Preview the one-hot encoding of area_type; the result is only displayed, not stored.
pd.get_dummies(df['area_type'])

Unnamed: 0,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,0,0,0,1
1,0,0,1,0
2,1,0,0,0
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
13315,1,0,0,0
13316,0,0,0,1
13317,1,0,0,0
13318,0,0,0,1


In [36]:
# Distinct availability values: a mix of 'dd-Mon' style strings and status labels.
df['availability'].unique()

array(['19-Dec', 'Ready To Move', '18-May', '18-Feb', '18-Nov', '20-Dec',
       '17-Oct', '21-Dec', '19-Sep', '20-Sep', '18-Mar', '20-Feb',
       '18-Apr', '20-Aug', '18-Oct', '19-Mar', '17-Sep', '18-Dec',
       '17-Aug', '19-Apr', '18-Jun', '22-Dec', '22-Jan', '18-Aug',
       '19-Jan', '17-Jul', '18-Jul', '21-Jun', '20-May', '19-Aug',
       '18-Sep', '17-May', '17-Jun', '21-May', '18-Jan', '20-Mar',
       '17-Dec', '16-Mar', '19-Jun', '22-Jun', '19-Jul', '21-Feb',
       'Immediate Possession', '19-May', '17-Nov', '20-Oct', '20-Jun',
       '19-Feb', '21-Oct', '21-Jan', '17-Mar', '17-Apr', '22-May',
       '19-Oct', '21-Jul', '21-Nov', '21-Mar', '16-Dec', '22-Mar',
       '20-Jan', '21-Sep', '21-Aug', '14-Nov', '19-Nov', '15-Nov',
       '16-Jul', '15-Jun', '17-Feb', '20-Nov', '20-Jul', '16-Sep',
       '15-Oct', '15-Dec', '16-Oct', '22-Nov', '15-Aug', '17-Jan',
       '16-Nov', '20-Apr', '16-Jan', '14-Jul'], dtype=object)

In [37]:
# Number of distinct location values.
df['location'].nunique()

1305

In [38]:
# Listings per location, most frequent first.
df['location'].value_counts()

Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64

In [39]:
# Columns still in the frame before feature selection.
df.columns


Index(['area_type', 'availability', 'location', 'total_sqft', 'balcony',
       'price', 'bhk'],
      dtype='object')

In [40]:
# Keep only area_type, balcony, price and bhk for modelling.
# NOTE(review): total_sqft is discarded here without ever being cleaned (the
# cleansqft helper above is commented out) -- presumably a strong price signal
# is lost; confirm this is intended.
df.drop(columns=['availability', 'location', 'total_sqft'], inplace=True)

In [41]:
# Preview the reduced frame: only area_type, balcony, price and bhk remain.
df.head()

Unnamed: 0,area_type,balcony,price,bhk
0,Super built-up Area,1.0,39.07,2
1,Plot Area,3.0,120.0,4
2,Built-up Area,3.0,62.0,3
3,Super built-up Area,1.0,95.0,3
4,Super built-up Area,1.0,51.0,2


In [42]:
# One-hot encode the remaining text column(s); numeric columns pass through unchanged.
new_df = pd.get_dummies(data=df)
new_df.head(n=5)

Unnamed: 0,balcony,price,bhk,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area
0,1.0,39.07,2,0,0,0,1
1,3.0,120.0,4,0,0,1,0
2,3.0,62.0,3,1,0,0,0
3,1.0,95.0,3,0,0,0,1
4,1.0,51.0,2,0,0,0,1


In [43]:
# dependent & independent variable
x = new_df.drop(['price'],axis=1)
y = new_df['price']

In [44]:
# Preview the predictor matrix.
x.head()

Unnamed: 0,balcony,bhk,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area
0,1.0,2,0,0,0,1
1,3.0,4,0,0,1,0
2,3.0,3,1,0,0,0
3,1.0,3,0,0,0,1
4,1.0,2,0,0,0,1


In [45]:
# Preview the target series (price).
y.head()

0     39.07
1    120.00
2     62.00
3     95.00
4     51.00
Name: price, dtype: float64

In [46]:
# (rows, predictor columns) of the feature matrix
x.shape

(13319, 6)

In [47]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=42,
)


In [48]:
# Shapes of the training features and training target (row counts must match).
x_train.shape,y_train.shape

((9323, 6), (9323,))

In [49]:
# Shapes of the held-out features and held-out target (row counts must match).
x_test.shape,y_test.shape

((3996, 6), (3996,))

In [50]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training rows; fit() returns the
# estimator itself, so it can be bound directly and echoed as the cell output.
lr_model = LinearRegression().fit(x_train, y_train)
lr_model

LinearRegression()

In [51]:
# Predict prices for both partitions, then print each prediction array's shape
# on its own line.
tr_pred, ts_pred = lr_model.predict(x_train), lr_model.predict(x_test)
print(tr_pred.shape, ts_pred.shape, sep='\n')

(9323,)
(3996,)


In [52]:
from sklearn.metrics import r2_score,mean_squared_error
# Mean squared error and R^2 on the rows the model was fitted on.
print(f'traning mse - {mean_squared_error(y_train,tr_pred)}')
print(f'traning r2-score - {r2_score(y_train,tr_pred)}')

# The same two metrics on the held-out rows, for comparison with the training figures.
print(f'testing mse - {mean_squared_error(y_test,ts_pred)}')
print(f'testing r2-score - {r2_score(y_test,ts_pred)}')

traning mse - 19210.88379178513
traning r2-score - 0.16775260785970803
testing mse - 16341.222273768411
testing r2-score - 0.1874785295451119
