In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [5]:
import pandas as pd

df = pd.read_csv("Bengaluru_House_Data.csv")
print(df.head())
print(df.info())
print(df.describe())
print(df.isnull().sum())


              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up  Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up  Area  Ready To Move                  Kothanur      2 BHK   

   society total_sqft  bath  balcony   price  
0  Coomee        1056   2.0      1.0   39.07  
1  Theanmp       2600   5.0      3.0  120.00  
2      NaN       1440   2.0      3.0   62.00  
3  Soiewre       1521   3.0      1.0   95.00  
4      NaN       1200   2.0      1.0   51.00  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   a

In [None]:
# Bengaluru Housing Dataset Complete Preprocessing
import pandas as pd
import numpy as np

# 1️⃣ Load dataset
df = pd.read_csv("Bengaluru_House_Data.csv")
print("Initial shape:", df.shape)

# 2️⃣ Drop irrelevant columns
df = df.drop(['area_type','availability','society','size'], axis=1)

# 3️⃣ Handle missing values
df = df.dropna(subset=['location','bath','balcony','total_sqft','price'])

# 4️⃣ Convert 'total_sqft' to numeric (handle ranges)
def convert_sqft(x):
    try:
        if '-' in x:
            tokens = x.split('-')
            return (float(tokens[0]) + float(tokens[1]))/2
        else:
            return float(x)
    except:
        return None

df['total_sqft'] = df['total_sqft'].apply(convert_sqft)
df = df.dropna(subset=['total_sqft'])

# 5️⃣ Convert 'size' to 'bhk' (if you have size column, otherwise skip)
# df['bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0]))
# Since we dropped 'size', we'll create 'bhk' from total_sqft vs price later (or skip if not needed)

# 6️⃣ Outlier removal
# Price per sqft
df['price_per_sqft'] = df['price']*100000 / df['total_sqft']
df = df[(df['price_per_sqft'] > 1000) & (df['price_per_sqft'] < 20000)]

# BHK vs Total_sqft outliers
# Assuming 300 sqft per bedroom minimum
if 'bhk' not in df.columns:
    # Approximate bhk from price_per_sqft? Or you can add manually later
    df['bhk'] = (df['total_sqft']/300).apply(np.floor)
df = df[df['total_sqft']/df['bhk'] >= 300]

# 7️⃣ Clean location
df['location'] = df['location'].apply(lambda x: x.strip())
location_counts = df['location'].value_counts()
rare_locations = location_counts[location_counts <= 10]
df['location'] = df['location'].apply(lambda x: 'other' if x in rare_locations else x)

# One-hot encode locations
dummies = pd.get_dummies(df['location'])
df = pd.concat([df, dummies.drop('other', axis=1)], axis=1)

# 8️⃣ Optional: log-transform price if needed
df['price'] = np.log(df['price'])

# 9️⃣ Final dataset ready
X = df.drop(['price','price_per_sqft','location'], axis=1)
y = df['price']

print("Preprocessed shape:", X.shape)
print("Features:", X.columns)
print("Target sample:\n", y.head())


Initial shape: (13320, 9)
Preprocessed shape: (12434, 235)
Features: Index(['total_sqft', 'bath', 'balcony', 'bhk', '1st Block Jayanagar',
       '1st Phase JP Nagar', '2nd Phase Judicial Layout',
       '2nd Stage Nagarbhavi', '5th Phase JP Nagar', '6th Phase JP Nagar',
       ...
       'Vijayanagar', 'Vishveshwarya Layout', 'Vishwapriya Layout',
       'Vittasandra', 'Whitefield', 'Yelachenahalli', 'Yelahanka',
       'Yelahanka New Town', 'Yelenahalli', 'Yeshwanthpur'],
      dtype='object', length=235)
Target sample:
 0    3.665355
1    4.787492
2    4.127134
3    4.553877
4    3.931826
Name: price, dtype: float64


In [11]:

print("description")
df.describe()

description


Unnamed: 0,total_sqft,bath,balcony,price,price_per_sqft,bhk
count,12434.0,12434.0,12434.0,12434.0,12434.0,12434.0
mean,1485.78119,2.583642,1.582677,4.329914,6209.481264,4.52083
std,894.019481,1.14737,0.814072,0.657993,3092.392023,3.006937
min,250.0,1.0,0.0,2.079442,1166.666667,0.0
25%,1100.0,2.0,1.0,3.89182,4225.352113,3.0
50%,1260.0,2.0,2.0,4.234107,5319.148936,4.0
75%,1630.0,3.0,2.0,4.70048,7011.070111,5.0
max,36000.0,27.0,3.0,7.696213,19965.277778,120.0
