# 🏠 Bengaluru House Price Prediction (SmartEst)
This notebook walks through data cleaning, feature engineering, and training a machine learning model to predict real estate prices using the Bengaluru dataset.

In [3]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import warnings
# Suppresses every warning category for the rest of the session, so any
# deprecation or validation warnings raised by later cells are not shown.
warnings.filterwarnings('ignore')

In [4]:
# Step 2: Read the raw listings CSV into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer="Bengaluru_House_Data.csv")
df.head(n=5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


## Step 3: Basic data exploration

In [5]:
# Column dtypes and non-null counts; info() prints its report directly.
df.info()
# A notebook cell only renders its final expression, so a result in the
# middle of the cell must be displayed explicitly or it is silently dropped.
# `display` is provided as a builtin by the IPython/Jupyter kernel.
display(df.describe())
# Missing values per column, shown as the cell's output.
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

## Step 4: Data cleaning and preprocessing

In [6]:
# Remove the columns the model does not use, then discard incomplete rows
df = df.drop(columns=['area_type', 'society', 'balcony', 'availability']).dropna()

# The leading token of `size` (e.g. "2 BHK", "4 Bedroom") is the bedroom count
df['BHK'] = df['size'].map(lambda size_text: int(size_text.split(' ')[0]))
df = df.drop(columns='size')

# Convert sqft to float (handle ranges and other formats)
def convert_sqft(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Accepts a plain number ("1056" or 1056) or a two-sided range written as
    "low - high", which is replaced by the midpoint of the two bounds.

    Parameters
    ----------
    x : str or number
        Raw cell value.

    Returns
    -------
    float or None
        The parsed value, or None when the entry cannot be interpreted
        (for example text with a unit suffix, or a malformed range).
    """
    # Numeric cells cannot be searched for '-'; convert them directly instead
    # of letting the substring test raise and discard a valid value.
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    try:
        if '-' in x:
            tokens = x.split('-')
            # Only a two-sided range has a meaningful midpoint.
            if len(tokens) != 2:
                return None
            return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except ValueError:
        # Catch only parse failures so unrelated errors and interrupts
        # are not swallowed.
        return None

df['total_sqft'] = df['total_sqft'].apply(convert_sqft)
# Entries that could not be converted are missing now; discard those rows
df = df.dropna()

# Feature engineering: price scaled by 100000, divided by the area
df['price_per_sqft'] = df['price'].mul(100000).div(df['total_sqft'])

# Normalise location names by trimming surrounding whitespace
df['location'] = df['location'].map(str.strip)
location_stats = df['location'].value_counts()
# Locations that occur 10 times or fewer are folded into a single 'other' label
locations_less_than_10 = location_stats[location_stats.le(10)].index
df['location'] = df['location'].mask(df['location'].isin(locations_less_than_10), 'other')

## Step 5: Prepare data for training

In [7]:
# price_per_sqft is computed from the target `price`, so remove it before
# building the feature matrix
df = df.drop(columns='price_per_sqft')

# One-hot encode the location and swap the text column for the indicator columns
dummies = pd.get_dummies(df['location'])
df = pd.concat([df.drop(columns='location'), dummies], axis='columns')

# Target first, then every remaining column as a feature
y = df['price']
X = df.drop(columns='price')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Step 6: Train the model

In [8]:
# Fit a linear regression on the training split (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report each metric on the held-out split
for label, metric in (("R2 Score:", r2_score), ("MSE:", mean_squared_error)):
    print(label, metric(y_test, y_pred))

R2 Score: 0.5120722457220808
MSE: 8661.650692829626


## Step 7: Prediction example

In [10]:
# Pick a test sample and predict
sample = X_test.iloc[0]
# Select the same row with a list indexer so it stays a one-row DataFrame.
# Wrapping the Series in a plain list would drop the column names, and the
# model then receives input without the feature names it was fitted with.
sample_features = X_test.iloc[[0]]
print("Predicted Price:", model.predict(sample_features)[0])
print("Actual Price:", y_test.iloc[0])

Predicted Price: 79.29468190091512
Actual Price: 80.0


## Step 8: Save the trained model

 

In [12]:
import pickle

# Serialize the fitted model to disk. The context manager closes the file
# handle (flushing the written bytes) even if pickling raises.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
print("save")


save
