In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
df = pd.read_csv('../datasets/df_data_cleaned.csv')
df.head()

Unnamed: 0,location,total_sqft,bath,price,BHK
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Uttarahalli,1440.0,2.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Kothanur,1200.0,2.0,51.0,2


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13201 entries, 0 to 13200
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13201 non-null  object 
 1   total_sqft  13201 non-null  float64
 2   bath        13201 non-null  float64
 3   price       13201 non-null  float64
 4   BHK         13201 non-null  int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 515.8+ KB


In [5]:
df['location'].nunique()

1298

In [55]:
location_data = df['location'].value_counts()

In [62]:
location_threshold_for_other = location_data.mean()

**Areas with value count of greater than mean of the value counts are considered as main areas of bangalore.**

**Areas with value count of lesser or equal to the mean of the value counts are considered as other.**

In [65]:
location_data[location_data > location_threshold_for_other] ## Key Areas Of Bangalore

location
Whitefield          533
Sarjapur  Road      392
Electronic City     302
Kanakpura Road      264
Thanisandra         232
                   ... 
HAL 2nd Stage        11
LB Shastri Nagar     11
Tindlu               11
Marsur               11
Bommenahalli         11
Name: count, Length: 240, dtype: int64

In [68]:
other_locations = location_data[location_data <= location_threshold_for_other] #other

In [72]:
df['location'] = df['location'].apply(lambda x:'other' if x in other_locations else x)

In [73]:
df['location'].value_counts()

location
other                        2887
Whitefield                    533
Sarjapur  Road                392
Electronic City               302
Kanakpura Road                264
                             ... 
Pattandur Agrahara             11
2nd Phase Judicial Layout      11
Bommenahalli                   11
Marsur                         11
LB Shastri Nagar               11
Name: count, Length: 241, dtype: int64

In [74]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,BHK
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Uttarahalli,1440.0,2.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Kothanur,1200.0,2.0,51.0,2


**Now we will find out price per sqft.**

**The Price is given in lacs.**

In [80]:
df['price_per_sqft'] = ((df['price'] * 100000) / df['total_sqft'])

In [81]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,BHK,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0


In [83]:
df.to_csv('../datasets/df_before_one_hot_encoding.csv',index=False)

In [3]:
df = pd.read_csv('../datasets/df_before_one_hot_encoding.csv')
df.head()

Unnamed: 0,location,total_sqft,bath,price,BHK,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0


In [5]:
df['location'].value_counts()

location
other                        2887
Whitefield                    533
Sarjapur  Road                392
Electronic City               302
Kanakpura Road                264
                             ... 
Pattandur Agrahara             11
2nd Phase Judicial Layout      11
Bommenahalli                   11
Marsur                         11
LB Shastri Nagar               11
Name: count, Length: 241, dtype: int64

In [6]:
df_ohe_copy1 = df.copy()

In [7]:
df_ohe_copy1

Unnamed: 0,location,total_sqft,bath,price,BHK,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.00,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.00,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.00,3,6245.890861
4,Kothanur,1200.0,2.0,51.00,2,4250.000000
...,...,...,...,...,...,...
13196,Whitefield,3453.0,4.0,231.00,5,6689.834926
13197,other,3600.0,5.0,400.00,4,11111.111111
13198,Raja Rajeshwari Nagar,1141.0,2.0,60.00,2,5258.545136
13199,Padmanabhanagar,4689.0,4.0,488.00,4,10407.336319


**Applying One Hot Encoding on Location Column.**

In [9]:
pd.get_dummies(df_ohe_copy1,columns=['location'])

Unnamed: 0,total_sqft,bath,price,BHK,price_per_sqft,location_ Devarachikkanahalli,location_1st Block Jayanagar,location_1st Phase JP Nagar,location_2nd Phase Judicial Layout,location_2nd Stage Nagarbhavi,...,location_Vishveshwarya Layout,location_Vishwapriya Layout,location_Vittasandra,location_Whitefield,location_Yelachenahalli,location_Yelahanka,location_Yelahanka New Town,location_Yelenahalli,location_Yeshwanthpur,location_other
0,1056.0,2.0,39.07,2,3699.810606,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2600.0,5.0,120.00,4,4615.384615,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1440.0,2.0,62.00,3,4305.555556,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1521.0,3.0,95.00,3,6245.890861,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1200.0,2.0,51.00,2,4250.000000,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13196,3453.0,4.0,231.00,5,6689.834926,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
13197,3600.0,5.0,400.00,4,11111.111111,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
13198,1141.0,2.0,60.00,2,5258.545136,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13199,4689.0,4.0,488.00,4,10407.336319,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [34]:
pd.get_dummies(df_ohe_copy1.location,dtype='int')

Unnamed: 0,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
13197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
13198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13199,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
ohe = pd.get_dummies(df_ohe_copy1.location,dtype='int')
ohe

Unnamed: 0,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
13197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
13198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13199,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
ohe.drop(columns=['other'],inplace=True)

In [37]:
df_ohe_copy1.shape

(13201, 6)

In [52]:
concat_dummy = pd.concat([df_ohe_copy1, ohe], axis=1)
loc_series = concat_dummy.drop(columns=['location']).iloc[13197,5:] != 1
loc_series.sum()

240

In [49]:
concat_dummy.location == 'other'

0        False
1        False
2        False
3        False
4        False
         ...  
13196    False
13197     True
13198    False
13199    False
13200    False
Name: location, Length: 13201, dtype: bool

In [50]:
concat_dummy

Unnamed: 0,location,total_sqft,bath,price,BHK,price_per_sqft,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Chikka Tirupathi,2600.0,5.0,120.00,4,4615.384615,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Uttarahalli,1440.0,2.0,62.00,3,4305.555556,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Lingadheeranahalli,1521.0,3.0,95.00,3,6245.890861,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Kothanur,1200.0,2.0,51.00,2,4250.000000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13196,Whitefield,3453.0,4.0,231.00,5,6689.834926,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
13197,other,3600.0,5.0,400.00,4,11111.111111,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13198,Raja Rajeshwari Nagar,1141.0,2.0,60.00,2,5258.545136,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13199,Padmanabhanagar,4689.0,4.0,488.00,4,10407.336319,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
ohe = pd.get_dummies(df.location,dtype='int')
ohe

Unnamed: 0,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
13197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
13198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13199,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
ohe.drop(columns=['other'],inplace=True)

In [57]:
df = pd.concat([df, ohe], axis=1)

In [58]:
df.drop(columns=['location'],inplace=True)

In [59]:
df

Unnamed: 0,total_sqft,bath,price,BHK,price_per_sqft,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1056.0,2.0,39.07,2,3699.810606,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2600.0,5.0,120.00,4,4615.384615,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1440.0,2.0,62.00,3,4305.555556,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1521.0,3.0,95.00,3,6245.890861,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1200.0,2.0,51.00,2,4250.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13196,3453.0,4.0,231.00,5,6689.834926,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
13197,3600.0,5.0,400.00,4,11111.111111,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13198,1141.0,2.0,60.00,2,5258.545136,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13199,4689.0,4.0,488.00,4,10407.336319,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
df.to_csv('../datasets/df_after_one_hot_encoding.csv',index=False)