#   "Bangalore Real Estate Valuation Model"

The dataset was downloaded from Kaggle: https://www.kaggle.com/amitabhajoy/bengaluru-house-price-data

In [20]:
import warnings

import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline

# Load the dataset

In [21]:
# Read the raw house-price CSV (path is relative to the notebook's working
# directory) and preview the first five rows.
df = pd.read_csv('bengaluru_house_prices.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [22]:
# Report the size of the raw frame: number of columns, then number of rows.
# Only the last expression of a cell is displayed, so a bare `df.shape` placed
# before the print would be evaluated and thrown away; the shape is printed
# explicitly instead.
print(f"Columns: {df.shape[1]}, Rows: {df.shape[0]}")


Columns: 9, Rows: 13320


In [23]:
# Display the column labels of the raw frame
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [24]:
# --- area_type: number of distinct categories, then rows per category ---
print(f"Unique area types: {df['area_type'].nunique()}")

print("\nArea Type Counts:")
print(df['area_type'].value_counts())

# --- number of distinct values in 'location' and in 'area_type' ---
print(f"\nUnique location total number: {df['location'].nunique()}")
print(f"Unique area types total number: {df['area_type'].nunique()}")


Unique area types: 4

Area Type Counts:
area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64

Unique location total number: 1305
Unique area types total number: 4


In [28]:
# Leave out the four columns treated as unnecessary for this analysis.
# `drop` returns a new frame, so `df` keeps all of its columns and the
# reduced frame is bound to `df2`.
df2 = df.drop(columns=['area_type', 'society', 'balcony', 'availability'])

# Confirm the new shape: 9 columns minus the 4 dropped leaves 5.
print(df2.shape)
print(f"Columns: {df2.shape[1]}, Rows: {df2.shape[0]}")

(13320, 5)
Columns: 5, Rows: 13320


# Data cleaning 

In [29]:
# Count the missing values in each column of df2
df2.isna().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [30]:
# Keep only the rows that have no missing value in any column, then confirm
# that every per-column null count is now zero.
df3 = df2.dropna(axis=0, how='any')
df3.isna().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

# Feature Engineering 
Feature engineering is the process of transforming raw data into meaningful features that can improve the performance of machine learning models. The goal is to create new input features, or modify existing ones, so that they better represent the underlying patterns in the data.

Add a new integer feature, `bhk` (Bedrooms, Hall, Kitchen), extracted from the `size` column.

In [38]:
# Add an integer 'bhk' column holding the leading number of 'size',
# e.g. "2 BHK" -> 2 and "4 Bedroom" -> 4.
#
# `assign` returns a new frame with the extra column instead of writing into
# the frame produced by `dropna()`, so pandas has no SettingWithCopyWarning to
# raise here and nothing needs to be silenced. A notebook-wide
# `warnings.filterwarnings("ignore")` would also hide every warning emitted by
# later cells, so it is deliberately not used.
df3 = df3.assign(bhk=df3['size'].apply(lambda x: int(x.split(' ')[0])))

# Print the distinct bedroom counts that were extracted
print(df3.bhk.unique())

df3.head()

[ 2  4  3  6  1  8  7  5 11  9 27 10 19 16 43 14 12 13 18]


Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


Square footage

In [39]:
def is_float(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Parameters
    ----------
    x : object
        Value to test, e.g. a raw ``total_sqft`` entry such as ``"1056"`` or
        ``"2100 - 2850"``.

    Returns
    -------
    bool
        True when ``x`` can be converted with ``float``; False when the
        conversion raises ``TypeError`` (e.g. ``None``), ``ValueError``
        (e.g. a range or a value carrying a unit) or ``OverflowError``
        (an integer too large for a float).
    """
    try:
        float(x)
    # Catch only conversion failures. A bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit, so an interrupted cell would silently
    # report False instead of stopping.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Record, on the raw frame `df`, whether each 'total_sqft' entry converts to a float.
# NOTE(review): this writes to `df`, while the following cell filters `df3` —
# confirm that the column on `df` is actually needed.
df['is_valid_sqft'] = df['total_sqft'].map(is_float)

In [41]:
# Preview the first ten rows whose 'total_sqft' does not convert to a float
# (is_float returns False), such as ranges or values that carry a unit.
df3.loc[~df3['total_sqft'].map(is_float)].head(10)


Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2
648,Arekere,9 Bedroom,4125Perch,9.0,265.0,9
661,Yelahanka,2 BHK,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,445.0,4


Real Observations:

    Location: Different areas (e.g., Yelahanka, Hebbal, Sarjapur).
    Size: Various property types (e.g., 1 BHK, 2 BHK, 4 BHK). Some inconsistencies like "9 Bedroom" instead of "9 BHK".
    total_sqft: Mixed formats (ranges such as "2100 - 2850", and values in other units such as "34.46Sq. Meter" and "4125Perch").
    Price: Reasonable prices based on size but need consistency check.

What Needs to be Done:

    Clean the size column: Standardize to "BHK" format (e.g., "4 Bedroom" → "4 BHK").
    Standardize total_sqft: Convert all values to square feet, handle ranges (e.g., "2100 - 2850").
    Check for missing or inconsistent values: Handle any missing data and any values given in units other than square feet.