In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
# Read the cleaned dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../datasets/df_data_cleaned.csv')
df.head(n=5)

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2.0
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4.0
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3.0
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3.0
4,Kothanur,2 BHK,1200.0,2.0,51.0,2.0


In [3]:
# Inspect column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13201 entries, 0 to 13200
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13201 non-null  object 
 1   size        13201 non-null  object 
 2   total_sqft  13201 non-null  float64
 3   bath        13201 non-null  float64
 4   price       13201 non-null  float64
 5   BHK         13201 non-null  float64
dtypes: float64(4), object(2)
memory usage: 618.9+ KB


In [4]:
# Count distinct location labels before surrounding whitespace is stripped.
df['location'].nunique()

1298

In [5]:
# Trim leading/trailing whitespace so labels that differ only by padding
# collapse into a single category. The vectorised `.str` accessor replaces the
# per-row lambda; unlike `x.strip()` it propagates missing values instead of
# raising AttributeError on them.
# NOTE(review): internal runs of spaces (e.g. 'Sarjapur  Road') are left
# untouched here — confirm whether they should be normalised as well.
df['location'] = df['location'].str.strip()

In [6]:
# Preview the frame after trimming whitespace in the location column.
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2.0
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4.0
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3.0
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3.0
4,Kothanur,2 BHK,1200.0,2.0,51.0,2.0


In [7]:
# Rows per location, most frequent first (descending order is the default).
location_data = df['location'].value_counts()
location_data

location
Whitefield                   534
Sarjapur  Road               392
Electronic City              304
Kanakpura Road               264
Thanisandra                  235
                            ... 
Rajanna Layout                 1
Subramanyanagar                1
Lakshmipura Vidyaanyapura      1
Malur Hosur Road               1
Abshot Layout                  1
Name: count, Length: 1287, dtype: int64

In [8]:
# Use the mean number of rows per location as a candidate cut-off for
# deciding which locations are too rare to keep as their own category.
location_threshold_for_other = location_data.mean()
location_threshold_for_other

10.257187257187256

**Areas whose value count is greater than the mean of the value counts are treated as the main areas of Bangalore.**

**Areas whose value count is less than or equal to the mean of the value counts are grouped together as "other".**

**The mean is about 10.26 and the counts are whole numbers, so we set the threshold to 10.**

In [9]:
# Total number of distinct locations in the value-counts table.
location_data.size

1287

In [14]:
# How many locations have at most 10 rows.
location_data.le(10).sum()

1047

In [15]:
# How many locations have more than 10 rows.
location_data.gt(10).sum()

240

In [17]:
# Fix the rarity cut-off at 10 rows and keep the value-count entries of every
# location at or below it; these are the ones later relabelled as 'other'.
location_threshold_for_other = 10
other_locations = location_data.loc[location_data.le(location_threshold_for_other)]  # other

In [18]:
# Collapse every rare location into a single 'other' bucket.
# `other_locations` is a value-counts Series, so the location names live in its
# index. Membership is tested against `.index` explicitly instead of relying on
# `x in Series` (which checks index labels, not values), and the per-row lambda
# is replaced by a vectorised mask.
df['location'] = df['location'].mask(
    df['location'].isin(other_locations.index),
    'other',
)

In [19]:
# Rows per location after rare locations were relabelled as 'other'.
df['location'].value_counts()

location
other              2872
Whitefield          534
Sarjapur  Road      392
Electronic City     304
Kanakpura Road      264
                   ... 
Doddaballapur        11
Tindlu               11
Marsur               11
HAL 2nd Stage        11
Kodigehalli          11
Name: count, Length: 241, dtype: int64

In [21]:
# Number of distinct location labels now present (missing values, if any,
# are counted as one label, matching len(unique())).
df['location'].nunique(dropna=False)

241

In [22]:
# List the distinct location labels that remain after grouping.
df['location'].unique()

array(['Electronic City Phase II', 'Chikka Tirupathi', 'Uttarahalli',
       'Lingadheeranahalli', 'Kothanur', 'Whitefield', 'Old Airport Road',
       'Rajaji Nagar', 'Marathahalli', 'other', '7th Phase JP Nagar',
       'Gottigere', 'Sarjapur', 'Mysore Road', 'Bisuvanahalli',
       'Raja Rajeshwari Nagar', 'Kengeri', 'Binny Pete', 'Thanisandra',
       'Bellandur', 'Electronic City', 'Ramagondanahalli', 'Yelahanka',
       'Hebbal', 'Kasturi Nagar', 'Kanakpura Road',
       'Electronics City Phase 1', 'Kundalahalli', 'Chikkalasandra',
       'Murugeshpalya', 'Sarjapur  Road', 'HSR Layout', 'Doddathoguru',
       'KR Puram', 'Bhoganhalli', 'Lakshminarayana Pura', 'Begur Road',
       'Varthur', 'Bommanahalli', 'Gunjur', 'Devarachikkanahalli',
       'Hegde Nagar', 'Haralur Road', 'Hennur Road', 'Kothannur',
       'Kalena Agrahara', 'Kaval Byrasandra', 'ISRO Layout',
       'Garudachar Palya', 'EPIP Zone', 'Dasanapura', 'Kasavanhalli',
       'Sanjay nagar', 'Domlur', 'Sarjapura - At

In [23]:
# Preview the frame after relabelling rare locations.
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2.0
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4.0
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3.0
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3.0
4,Kothanur,2 BHK,1200.0,2.0,51.0,2.0


In [24]:
# Confirm the row/column count is unchanged by the relabelling.
df.shape

(13201, 6)

**Now we will find out price per sqft.**

**The price is given in lacs (lakhs; 1 lac = 100,000), so it is multiplied by 100,000 before dividing by the area.**

In [25]:
# `price` is expressed in lacs, so scale it by 100000 before dividing by area.
# NOTE(review): a row with total_sqft == 0 would yield inf here — confirm the
# cleaned data contains none.
df['price_per_sqft'] = df['price'].mul(100000).div(df['total_sqft'])

In [26]:
# Preview the frame with the new price_per_sqft column.
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,BHK,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2.0,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4.0,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3.0,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3.0,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2.0,4250.0


In [27]:
# Persist the frame (without the index column) for the next processing step.
df.to_csv(path_or_buf='../datasets/df_before_business_logic.csv', index=False)