# Data Preparation and Pre-processing:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('dark_background')

### Extracting the data:

In [2]:
df=pd.read_csv('Bengaluru_House_Data.csv')
df.sample(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
11068,Built-up Area,Ready To Move,Byrathi Village,2 BHK,,800,1.0,3.0,95.0
11244,Plot Area,Ready To Move,BTM 1st Stage,6 Bedroom,,600,6.0,2.0,80.0
9910,Super built-up Area,Ready To Move,Electronic City,3 BHK,GMown E,1360,2.0,1.0,70.0
6439,Super built-up Area,Ready To Move,Hosur Road,2 BHK,Saageat,1345,2.0,2.0,106.0
11498,Super built-up Area,19-Dec,Yelahanka,1 BHK,Goues A,629 - 1026,1.0,0.0,42.535


## Data cleaning:

### Removing unnecessary columns:

In [3]:
df=df.drop(['availability','society'],axis=1)
df.sample(5)

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price
5418,Super built-up Area,Marathahalli,2 BHK,1060,2.0,1.0,48.0
2472,Super built-up Area,Kothanur,2 BHK,1187,2.0,2.0,58.0
517,Super built-up Area,Begur Road,3 BHK,1584,3.0,,65.0
9893,Super built-up Area,Nagavara,2 BHK,1110,2.0,2.0,44.95
10675,Built-up Area,Electronic City Phase II,2 BHK,1155,2.0,1.0,57.0


### Removing Null values:

In [4]:
df.isnull().sum()

area_type       0
location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

In [5]:
df=df.dropna()

In [6]:
df.isnull().sum()

area_type     0
location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

## adjusting the values in size column:

In [7]:
df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '3 Bedroom', '1 BHK', '1 RK',
       '4 BHK', '1 Bedroom', '2 Bedroom', '6 Bedroom', '8 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '11 Bedroom', '43 Bedroom',
       '14 BHK', '8 BHK', '12 Bedroom', '10 Bedroom', '13 BHK'],
      dtype=object)

In [8]:
def size_clear(string):
    string=string.split(' ')[0]
    string=int(string)
    return string

In [9]:
df['size']=df['size'].apply(size_clear)
df

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,Electronic City Phase II,2,1056,2.0,1.0,39.07
1,Plot Area,Chikka Tirupathi,4,2600,5.0,3.0,120.00
2,Built-up Area,Uttarahalli,3,1440,2.0,3.0,62.00
3,Super built-up Area,Lingadheeranahalli,3,1521,3.0,1.0,95.00
4,Super built-up Area,Kothanur,2,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...
13314,Super built-up Area,Green Glen Layout,3,1715,3.0,3.0,112.00
13315,Built-up Area,Whitefield,5,3453,4.0,0.0,231.00
13317,Built-up Area,Raja Rajeshwari Nagar,2,1141,2.0,1.0,60.00
13318,Super built-up Area,Padmanabhanagar,4,4689,4.0,1.0,488.00


### Verifying other discrete numerical values:

In [10]:
df['bath'].unique()

array([ 2.,  5.,  3.,  4.,  1.,  8.,  7.,  6.,  9., 27., 11., 12., 10.,
       40., 15., 13.])

In [11]:
df['balcony'].unique()

array([1., 3., 2., 0.])

## Verifying total_sqft values:

In [12]:
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

### correcting total_sqft values:

In [13]:
def correcting_total_sqft(val):
    x=None
    try:
        x=float(val)
    except:
        try:
            x_list=list(val.strip(' - '))
            
            x=(float(x_list[0])+float(x_list[1]))/2
        except:
            x=None
    return x

In [14]:
df['total_sqft']=df['total_sqft'].apply(correcting_total_sqft)

### Eliminating NaN values:

In [15]:
df['total_sqft'].isnull().sum()

186

In [16]:
df=df.dropna()

In [17]:
df['total_sqft'].isnull().sum()

0

In [18]:
df.shape

(12524, 7)

## Verifying the location values:

In [19]:
df['location'].nunique()

1259

In [23]:
locations=df.groupby('location')['location'].count().sort_values(ascending=False)
locations

location
Whitefield              498
Sarjapur  Road          366
Electronic City         300
Kanakpura Road          255
Thanisandra             220
                       ... 
Kalasipalya               1
Kalhalli                  1
Kalkere Channasandra      1
 Banaswadi                1
whitefiled                1
Name: location, Length: 1259, dtype: int64

In [27]:
len(locations[locations<=10])

1025

In [28]:
rare_locs=locations[locations<=10]

In [29]:
def loc_assign(loc):
    if loc in rare_locs:
        return 'other'
    else:
        return loc

In [30]:
df['location']=df['location'].apply(loc_assign)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['location']=df['location'].apply(loc_assign)


In [31]:
df['location'].nunique()

235

## Setting up a new feature: Price per sqft:

In [32]:
df.sample(5)

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price
11269,Super built-up Area,Hebbal,3,1645.0,3.0,2.0,117.0
6859,Super built-up Area,Kanakpura Road,2,1339.0,2.0,2.0,85.0
5230,Super built-up Area,Anjanapura,2,950.0,2.0,1.0,40.0
7525,Super built-up Area,Begur,2,1100.0,2.0,2.0,55.0
12697,Super built-up Area,Bellandur,3,1830.0,3.0,3.0,89.89


In [33]:
df['price_per_sqft']=(df['price']*100000)/df['total_sqft']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_per_sqft']=(df['price']*100000)/df['total_sqft']


In [34]:
df.sample(5)

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,price_per_sqft
11910,Plot Area,Kadugodi,3,1875.0,2.0,1.0,110.0,5866.666667
10159,Super built-up Area,other,3,1254.0,2.0,1.0,70.0,5582.137161
5133,Plot Area,Yeshwanthpur,2,1200.0,2.0,0.0,63.0,5250.0
1812,Built-up Area,Sarjapur,4,1550.0,2.0,2.0,65.0,4193.548387
5589,Built-up Area,Electronic City,1,630.0,1.0,1.0,60.0,9523.809524


In [37]:
df[df['location']=='other']['location'].count()

2760

In [38]:
df.shape

(12524, 8)