In [53]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

In [54]:
# Load the raw listings CSV (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv('../data/Bengaluru_House_Data.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


## Basic info and summary

In [55]:
# Dtypes and non-null counts per column. Note that total_sqft is read as object
# (strings), so it has to be parsed before it can be used as a number.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [56]:
# Summary statistics for the numeric columns (bath, balcony, price).
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [57]:
# Number of missing entries in each column.
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

### Check unique values and data overview

In [58]:
# Cardinality of every column, printed one column per line.
for col, n_unique in df.nunique().items():
    print(f"{col}: {n_unique} unique values")

area_type: 4 unique values
availability: 81 unique values
location: 1305 unique values
size: 31 unique values
society: 2688 unique values
total_sqft: 2117 unique values
bath: 19 unique values
balcony: 4 unique values
price: 1994 unique values


## Drop unnecessary columns

Some columns are not useful for prediction, so we drop them.

In [59]:
# Remove the columns that are not used as features, then preview the result.
unused_columns = ['area_type', 'availability', 'society']
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0


## Handle missing values

In [60]:
# Discard every row that has at least one missing value, then confirm none remain.
df = df.dropna(how='any')
df.isna().sum()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

## Feature cleanup

Let's convert the “size” column (e.g., “2 BHK”) into a numeric `bhk` column.

In [61]:
# The first space-separated token of "size" (e.g. "2 BHK", "4 Bedroom") is the
# room count; store it as an integer column.
df['bhk'] = df['size'].str.split(' ').str[0].map(int)
df.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,1.0,51.0,2


Also convert “total_sqft” to a numeric column: some rows hold a range of two values, which is replaced by its midpoint, and rows that cannot be parsed are dropped.

In [62]:
def convert_sqft_to_num(x):
    """Convert one raw ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number ("1056"), a two-part range written with a
        hyphen ("1000 - 2000"), or a value that is already numeric.

    Returns
    -------
    float or None
        The parsed number, the midpoint of the range, or ``None`` when the
        entry cannot be interpreted (free text, ``None``, and so on).
    """
    import numbers

    # Already-numeric values pass straight through. Without this, re-running the
    # conversion cell on an already-converted column would map every value to
    # None and the following dropna would remove every row.
    if isinstance(x, numbers.Real) and not isinstance(x, bool):
        return float(x)

    try:
        tokens = x.split('-')
        if len(tokens) == 2:
            # Range such as "low - high": use the midpoint.
            return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except (AttributeError, TypeError, ValueError):
        # AttributeError: x has no .split (e.g. None);
        # ValueError/TypeError: a token is not a parsable number.
        return None

# Parse every total_sqft entry element-wise, then discard the rows whose value
# could not be parsed (the converter returns None for those).
df['total_sqft'] = df['total_sqft'].map(convert_sqft_to_num)
df = df.dropna(subset=['total_sqft'])

Adding a new feature: price per sqft

In [63]:
# Scale price by 100000 and divide by the area.
# NOTE(review): presumably price is quoted in lakhs, making this rupees per sqft — confirm.
df['price_per_sqft'] = df['price'].mul(100000).div(df['total_sqft'])
df.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2,4250.0


## Data Cleaning & Feature Engineering

Let’s clean and simplify the location names to reduce noise.

In [64]:
# Trim surrounding whitespace so otherwise-identical names are counted together,
# then count listings per location (most frequent first).
df['location'] = df['location'].str.strip()
location_stats = df['location'].value_counts()
location_stats

location
Whitefield                                         514
Sarjapur  Road                                     372
Electronic City                                    302
Kanakpura Road                                     259
Thanisandra                                        233
                                                  ... 
12th cross srinivas nagar banshankari 3rd stage      1
Tilak Nagar                                          1
Pattegarhpalya                                       1
Sarvobhogam Nagar                                    1
Prasanna layout Herohalli                            1
Name: count, Length: 1248, dtype: int64

There are a lot of unique locations, so let's group all rare locations (those with 10 or fewer listings) into one category called "other".

In [65]:
# Locations with 10 or fewer listings (despite the variable name, the bound is inclusive).
location_stats_less_than_10 = location_stats[location_stats.le(10)]

# Relabel every listing in one of those locations as 'other'.
is_rare = df['location'].isin(location_stats_less_than_10.index)
df['location'] = df['location'].mask(is_rare, 'other')
len(df['location'].unique())

236

This reduces complexity and prevents overfitting.

Some listings show unrealistic area values (less than 300 sqft per BHK). Let's remove these outliers based on sqft per BHK.

In [66]:
# Drop listings whose area per bedroom is below 300 sqft.
sqft_per_bhk = df['total_sqft'].div(df['bhk'])
df = df[~sqft_per_bhk.lt(300)]

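Remove outliers using price_per_sqft: within each location, keep only the listings whose price per sqft lies within one standard deviation of that location's mean.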

In [67]:
def remove_pps_outliers(df):
    """Drop price-per-sqft outliers separately for each location.

    For every ``location`` group, only rows with ``price_per_sqft`` in the
    half-open band ``(mean - std, mean + std]`` are kept, where ``std`` is
    ``np.std`` of the group (NumPy's default ``ddof=0``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        The kept rows, ordered by location, with a fresh 0..n-1 index.
        An empty input yields an empty frame with the same columns.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        in_band = (subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))
        kept_groups.append(subdf[in_band])

    if not kept_groups:
        # pd.concat raises on an empty list; return an empty frame that keeps the schema.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)

# Apply the per-location filter; the returned frame has a fresh 0..n-1 index.
df = remove_pps_outliers(df)

Fix “bath” anomalies: keep only listings with fewer than bhk + 2 bathrooms.

In [68]:
# Keep listings whose bathroom count is below (bedrooms + 2).
df = df[df['bath'].lt(df['bhk'].add(2))]

Prepare final data for model training

In [69]:
# Columns used for modelling: four predictors plus the target (price).
df_model = df.loc[:, ['location', 'total_sqft', 'bath', 'bhk', 'price']]

# One-hot encode location. The 'other' indicator is dropped, so a row with all
# location indicators at zero represents 'other'.
dummies = pd.get_dummies(df_model['location'])
df_model = pd.concat(
    [df_model.drop(columns='location'), dummies.drop(columns='other')],
    axis=1,
)

# Separate the target from the feature matrix.
y = df_model['price']
X = df_model.drop(columns='price')

In [70]:
# Persist the ordered list of feature column names.
# NOTE(review): presumably read by inference code to rebuild rows in the training layout — confirm.
joblib.dump(X.columns.tolist(), "../src/model_columns.pkl")

['../src/model_columns.pkl']

In [72]:
# Save location list
locations = df['location'].unique().tolist()
joblib.dump(locations, "../src/location_list.pkl")

print("Locations saved:", len(locations))

Locations saved: 236


Train-test split

In [73]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [74]:
# Sanity check: (rows, features) of the training and test feature matrices.
print(X_train.shape, X_test.shape)

(7816, 238) (1955, 238)


## Model Training & Evaluation

In [75]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb
import numpy as np

Linear Regression model

In [76]:
# Baseline: ordinary least squares on the full feature matrix.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Held-out metrics: RMSE (same units as price) and R².
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)

print(f"Linear Regression R² Score: {r2_lr:.3f}")
print(f"Linear Regression RMSE: {rmse_lr:.3f}")

Linear Regression R² Score: 0.831
Linear Regression RMSE: 29.603


XGBoost model

In [77]:
# Hand-picked (untuned) hyperparameters for the first XGBoost model.
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Evaluate on the held-out split with the same metrics as the linear baseline.
y_pred_xgb = xgb_model.predict(X_test)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost R² Score: {r2_xgb:.3f}")
print(f"XGBoost RMSE: {rmse_xgb:.3f}")

XGBoost R² Score: 0.827
XGBoost RMSE: 29.903


Compare the models

In [78]:
# Side-by-side comparison of the two models on the held-out split, one row per model.
results = pd.DataFrame(
    [
        ('Linear Regression', r2_lr, rmse_lr),
        ('XGBoost', r2_xgb, rmse_xgb),
    ],
    columns=['Model', 'R2 Score', 'RMSE'],
)
results

Unnamed: 0,Model,R2 Score,RMSE
0,Linear Regression,0.830767,29.603207
1,XGBoost,0.82732,29.90319


For now, persist the trained XGBoost model for future predictions.

In [79]:
import joblib
# Serialize the untuned XGBoost model fitted above to disk.
joblib.dump(xgb_model, '../src/house_price_model.pkl')

['../src/house_price_model.pkl']

Test Prediction

In [80]:
def predict_price(location, sqft, bath, bhk, model=None, columns=None):
    """Predict the price of one listing.

    Parameters
    ----------
    location : str
        Location name. If it matches no feature column, all location
        indicators stay at zero, which is the encoding of the 'other'
        category (its indicator column was dropped from the features).
    sqft, bath, bhk : number
        Total area, bathroom count and bedroom count; written to feature
        positions 0, 1 and 2 respectively.
    model : object with ``predict``, optional
        Defaults to the notebook-level ``xgb_model``.
    columns : sequence of str, optional
        Ordered feature names. Defaults to ``X.columns``.

    Returns
    -------
    The first element of ``model.predict`` for the single constructed row.
    """
    if model is None:
        model = xgb_model
    if columns is None:
        columns = X.columns
    columns = np.asarray(columns)

    x = np.zeros(len(columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk

    # np.where yields an empty array for an unknown location; indexing [0]
    # unconditionally would raise IndexError, so check before using it.
    matches = np.where(columns == location)[0]
    if matches.size > 0:
        x[matches[0]] = 1

    return model.predict([x])[0]

# Smoke test: 2000 sqft, 3 bath, 3 BHK listing in Indira Nagar.
print(predict_price('Indira Nagar', 2000, 3, 3))

156.57716


Hyperparameter Tuning to improve model performance.

In [81]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 * 4 * 4 * 2 * 2 = 192 parameter combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [2, 4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_reg = xgb.XGBRegressor(random_state=42)

# 3-fold cross-validation over the grid (192 candidates * 3 folds = 576 fits),
# run on all available cores. The scorer is negated RMSE, so higher is better.
grid_search = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=2,
    n_jobs=-1
)

# The search only sees the training split; the test split stays untouched.
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
# best_score_ is the negated RMSE, so flip the sign for reporting.
print("Best RMSE:", -grid_search.best_score_)

Fitting 3 folds for each of 192 candidates, totalling 576 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 300, 'subsample': 1.0}
Best RMSE: 38.33738784935156


In [82]:
# Evaluate the estimator selected by the grid search on the held-out test split.
best_xgb = grid_search.best_estimator_
y_pred_best = best_xgb.predict(X_test)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
print("Tuned XGBoost RMSE:", rmse_best)

Tuned XGBoost RMSE: 29.842672238691602


Save the best model

In [83]:
import joblib

# Persist the tuned estimator chosen by the grid search. The previous version
# dumped xgb_model (the untuned baseline) again, so the tuned model was never saved.
joblib.dump(best_xgb, '../src/house_price_model.pkl')

['../src/house_price_model.pkl']

In [84]:
from sklearn.preprocessing import LabelEncoder
# Replace the location names in df with integer codes.
# NOTE(review): this overwrites df['location'] in place; re-running any earlier
# cell that reads df['location'] (e.g. the location-list export) after this
# point would see integers instead of names.
le = LabelEncoder()
df['location'] = le.fit_transform(df['location'])

In [85]:
# Show the name -> integer code mapping learned by the encoder.
dict(zip(le.classes_, le.transform(le.classes_)))

{'1st Block Jayanagar': np.int64(0),
 '1st Phase JP Nagar': np.int64(1),
 '2nd Phase Judicial Layout': np.int64(2),
 '2nd Stage Nagarbhavi': np.int64(3),
 '5th Phase JP Nagar': np.int64(4),
 '6th Phase JP Nagar': np.int64(5),
 '7th Phase JP Nagar': np.int64(6),
 '8th Phase JP Nagar': np.int64(7),
 '9th Phase JP Nagar': np.int64(8),
 'AECS Layout': np.int64(9),
 'Abbigere': np.int64(10),
 'Akshaya Nagar': np.int64(11),
 'Ambalipura': np.int64(12),
 'Ambedkar Nagar': np.int64(13),
 'Amruthahalli': np.int64(14),
 'Anandapura': np.int64(15),
 'Ananth Nagar': np.int64(16),
 'Anekal': np.int64(17),
 'Anjanapura': np.int64(18),
 'Ardendale': np.int64(19),
 'Arekere': np.int64(20),
 'Attibele': np.int64(21),
 'BEML Layout': np.int64(22),
 'BTM 2nd Stage': np.int64(23),
 'BTM Layout': np.int64(24),
 'Babusapalaya': np.int64(25),
 'Badavala Nagar': np.int64(26),
 'Balagere': np.int64(27),
 'Banashankari': np.int64(28),
 'Banashankari Stage II': np.int64(29),
 'Banashankari Stage III': np.int64(3