# Bangalore House Price Prediction - Supervised Regression Problem

## Data Preprocessing

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#  Load dataset 

In [2]:
# Load the raw Bengaluru house-price data from CSV and show its (rows, columns) shape.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# fails on any other machine - consider a path relative to the notebook.
df_raw = pd.read_csv(r"C:\Users\india\Desktop\Projects\ML Projects\Bengaluru_House_Data\Bengaluru_House_Data.csv")
df_raw.shape

(13320, 9)

In [3]:
df_raw.head(30)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0
5,Super built-up Area,Ready To Move,Whitefield,2 BHK,DuenaTa,1170,2.0,1.0,38.0
6,Super built-up Area,18-May,Old Airport Road,4 BHK,Jaades,2732,4.0,,204.0
7,Super built-up Area,Ready To Move,Rajaji Nagar,4 BHK,Brway G,3300,4.0,,600.0
8,Super built-up Area,Ready To Move,Marathahalli,3 BHK,,1310,3.0,1.0,63.25
9,Plot Area,Ready To Move,Gandhi Bazar,6 Bedroom,,1020,6.0,,370.0


In [4]:
df_raw.tail(20)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
13300,Plot Area,Ready To Move,Hosakerehalli,5 Bedroom,,1500,6.0,2.0,145.0
13301,Super built-up Area,Ready To Move,Kothanur,3 BHK,,1454,3.0,3.0,71.5
13302,Super built-up Area,Ready To Move,Annaiah Reddy Layout,2 BHK,,1075,2.0,2.0,48.0
13303,Plot Area,Ready To Move,Vidyaranyapura,5 Bedroom,,774,5.0,3.0,70.0
13304,Super built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,GrrvaGr,1187,2.0,2.0,40.14
13305,Carpet Area,Ready To Move,Hulimavu,1 BHK,,500,1.0,3.0,220.0
13306,Plot Area,Ready To Move,Rajarajeshwari Nagara,4 Bedroom,,1200,5.0,,325.0
13307,Built-up Area,Ready To Move,Billekahalli,3 BHK,,1805,3.0,3.0,134.0
13308,Built-up Area,Ready To Move,Bannerghatta Road,3 BHK,Baanise,1527,3.0,1.0,142.0
13309,Super built-up Area,Ready To Move,Yeshwanthpur,3 BHK,IBityin,1675,3.0,,92.13


## 3. Exploratory Data Analysis

In [5]:
df = df_raw.copy() # work on a copy so that changes to df do not affect df_raw

In [6]:
# Summarise the columns: names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 624.4+ KB


In [7]:
# We have only 3 numerical features - bath, balcony and price
# 6 categorical (object) features - area_type, availability, location, size, society, and total_sqft
# Target feature =======>>>>>> price >>>>>>
# Price is in lakh

In [8]:
df.describe()
# Note the large gap between the 75% quantile and the max value
# (bath: 3 vs 40, price: 120 vs 3600) - this points to outliers.

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [None]:
# Pairwise plots of the columns of df
sns.pairplot(df)

# bath and price show a slight linear correlation, with some outliers

<seaborn.axisgrid.PairGrid at 0xf1414c0>

In [None]:
df.head(10)

In [None]:
df.columns

In [None]:
df['availability'].value_counts()

In [None]:
# Print the value counts of every feature, one feature at a time
def value_count(df):
  """Print ``value_counts()`` for each column of ``df``.

  Each column's counts are followed by a dashed separator line.
  Nothing is returned; the output goes to stdout.
  """
  for column_name in df.columns:
    column_counts = df[column_name].value_counts()
    print(column_counts)
    print("--------------------------------")

In [None]:
value_count(df)

In [None]:
# Correlation heatmap of the numerical features
num_vars = ["bath", "balcony", "price"]
sns.heatmap(df[num_vars].corr(),cmap="coolwarm", annot=True)

# bath is more strongly correlated with price than balcony is

# 4. Prepare Data for Machine Learning Model

## Data cleaning

In [None]:
df.isnull().sum() # number of missing values in each column

In [None]:
df.isnull().mean()*100 # % of missing values in each column

# society has 41.3% missing values (needs to be dropped)

In [None]:
# Visualize the missing values as a heatmap to see where in the data they occur

plt.figure(figsize=(16,9))
sns.heatmap(df.isnull())

In [None]:
# 'society' is missing in about 41.3% of the rows, so the whole column is
# dropped instead of being imputed.
df2 = df.drop(columns=['society'])
df2.shape

In [None]:
# 'balcony' has only about 4.5% missing values, so fill them with the
# column mean, then re-check the remaining missing counts per column.
balcony_mean = df2['balcony'].mean()
df2['balcony'] = df2['balcony'].fillna(balcony_mean)
df2.isna().sum()

In [None]:
# Drop the rows of df2 that still contain missing values,
# because only a very small percentage of values is missing
df3 = df2.dropna()
df3.shape

In [None]:
df3.isnull().sum()

In [None]:
df3.head()

## Feature Engineering

In [None]:
# Disable display truncation so DataFrames are shown with all columns and all rows
pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Converting the categorical 'total_sqft' feature to numeric

In [None]:
df3['total_sqft'].value_counts()

# Observation: 'total_sqft' contains string values in several different formats:
#   - plain numbers (float or int), e.g. 1689.28, 817
#   - ranges, e.g. 540 - 740
#   - a number followed by a unit, e.g. 142.84Sq. Meter, 117Sq. Yards, 1Grounds

# Best strategy: convert each string into a single number, splitting ranges on '-'

In [None]:
def _sqft_to_float(raw_value):
  """Convert one raw 'total_sqft' entry to a float.

  Parameters
  ----------
  raw_value : value taken from df3['total_sqft'] (normally a string).

  Returns
  -------
  float
    * the number itself when the entry is a plain number such as '123.4';
    * the mean of the first and last parts when the entry is a range such
      as '123 - 534';
    * np.nan when the entry matches neither format (e.g. '117Sq. Yards').
  """
  try:
    return float(raw_value)  # plain number such as '123.4'
  except (TypeError, ValueError):
    pass  # not a plain number - try the range format below

  try:
    bounds = raw_value.split('-')
    return (float(bounds[0]) + float(bounds[-1])) / 2  # '123 - 534' -> mean of both ends
  except (AttributeError, TypeError, ValueError):
    # Only parsing failures are mapped to NaN. Unlike a bare `except:`, this
    # lets KeyboardInterrupt / SystemExit and unrelated bugs propagate.
    return np.nan


# One converted value per row of df3, in the same order as df3['total_sqft'].
total_sqft_int = [_sqft_to_float(str_val) for str_val in df3['total_sqft']]

In [None]:
# Reset the index of the dataframe so it runs from 0 again after rows were dropped
df4 = df3.reset_index(drop=True) # drop=True - don't add the old index as a column in df

In [None]:
# Join df4 with the total_sqft_int list (wrapped in a one-column DataFrame);
# the join aligns the two on their index
df5 = df4.join(pd.DataFrame({'total_sqft_int':total_sqft_int}))
df5.head()

In [None]:
df5.tail()