In [2]:
import pandas as pd

In [4]:
real_estate_data = pd.read_csv("/content/Bengaluru_House_Data.csv")

# Display the first few rows and summary of the dataset
real_estate_data.head(), real_estate_data.info()
real_estate_data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [5]:
# Clean column names by removing spaces and converting to lowercase
real_estate_data.columns = real_estate_data.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('[^a-zA-Z0-9_]', '', regex=True)

# Display the cleaned column names
real_estate_data.columns


Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [6]:
# Check for missing values in each column
missing_values = real_estate_data.isnull().sum()
missing_values


Unnamed: 0,0
area_type,0
availability,0
location,1
size,16
society,5502
total_sqft,0
bath,73
balcony,609
price,0


In [7]:
# Handle missing values
real_estate_data['society'].fillna("Unknown", inplace=True)
real_estate_data.dropna(subset=['location'], inplace=True)  # Drop row with missing 'location'
real_estate_data['size'].fillna(real_estate_data['size'].mode()[0], inplace=True)  # Fill 'size' with mode
real_estate_data['bath'].fillna(real_estate_data['bath'].median(), inplace=True)  # Fill 'bath' with median
real_estate_data['balcony'].fillna(real_estate_data['balcony'].median(), inplace=True)  # Fill 'balcony' with median

# Verify if missing values are handled
missing_values_after = real_estate_data.isnull().sum()
missing_values_after


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  real_estate_data['society'].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  real_estate_data['size'].fillna(real_estate_data['size'].mode()[0], inplace=True)  # Fill 'size' with mode
The behavior will change in pandas 3.0. This inplace method will 

Unnamed: 0,0
area_type,0
availability,0
location,0
size,0
society,0
total_sqft,0
bath,0
balcony,0
price,0


In [8]:
filtered_data = real_estate_data[
    (real_estate_data['availability'] == 'Ready To Move') &
    (real_estate_data['location'] == 'Electronic City Phase II')
]


In [9]:
real_estate_data

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,Unknown,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,Unknown,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,Unknown,3600,5.0,2.0,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [10]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
real_estate_data['size'] = label_encoder.fit_transform(real_estate_data['size'])


In [11]:
neighborhood_avg_price = real_estate_data.groupby('location')['price'].mean()


In [12]:
Q1 = real_estate_data['price'].quantile(0.25)
Q3 = real_estate_data['price'].quantile(0.75)
IQR = Q3 - Q1
filtered_data = real_estate_data[
    (real_estate_data['price'] >= (Q1 - 1.5 * IQR)) &
    (real_estate_data['price'] <= (Q3 + 1.5 * IQR))
]
