In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt


In [50]:
data=pd.read_csv('Bengaluru_House_Data.csv')


In [51]:
print(data.head())

              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up  Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up  Area  Ready To Move                  Kothanur      2 BHK   

   society total_sqft  bath  balcony   price  
0  Coomee        1056   2.0      1.0   39.07  
1  Theanmp       2600   5.0      3.0  120.00  
2      NaN       1440   2.0      3.0   62.00  
3  Soiewre       1521   3.0      1.0   95.00  
4      NaN       1200   2.0      1.0   51.00  


In [52]:
#data=pd.DataFrame(data)
#print(data.head())

In [53]:
data.shape

(13320, 9)

In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [55]:
for column in data.columns:
    print(data[column].value_counts())
    print("*"*30)   

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
******************************
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
16-Oct               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
******************************
location
Whitefield                         540
Sarjapur  Road                     399
Electronic City                    302
Kanakpura Road                     273
Thanisandra                        234
                                  ... 
3rd Stage Raja Rajeshwari Nagar      1
Chuchangatta Colony                  1
Electronic City Phase 1,             1
Chikbasavanapura                     1
Abshot Layout                        1
Name: count, Length: 1305, dtype: int64
****

In [56]:
data.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [57]:
data.drop(columns=['area_type','availability','society','balcony'],inplace=True)

In [58]:
data.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [59]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [60]:
data['location']=data['location'].fillna('Sarjapur road')

data['size']=data['size'].fillna('2 BHK')
data['bath']=data['bath'].fillna(data['bath'].median())


data.info()
# so there all NULL value are replaced 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [61]:
data['bhk']=data['size'].str.split().str.get(0).astype(int)

In [62]:
data[data.bhk>20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [63]:
data['total_sqft'].value_counts()

total_sqft
1200           843
1100           221
1500           205
2400           196
600            180
              ... 
2920             1
5665.84          1
1369             1
7150             1
1200 - 1470      1
Name: count, Length: 2117, dtype: int64

In [64]:
def convert(x):
    var=x.split('-')
    if(len(var)==2):
        return (float(var[0])+ float(var[1]))/2
    try:
        return float(x)
    except:
        return None

In [65]:
data['total_sqft']=data['total_sqft'].apply(convert)

In [66]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [67]:
data['price_per_sqft']= (data['price']*100000)/data['total_sqft']
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [68]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [69]:
data['location'].value_counts()

location
Whitefield                                      540
Sarjapur  Road                                  399
Electronic City                                 302
Kanakpura Road                                  273
Thanisandra                                     234
                                               ... 
Maragondana Halli, kr puram, old madras road      1
Chikkajala                                        1
Udayagiri                                         1
pavitra paradise                                  1
Chikbasavanapura                                  1
Name: count, Length: 1306, dtype: int64

In [70]:
data['location']=data['location'].apply(lambda x: x.strip())
location =data['location'].value_counts()


In [71]:
less10location = location[location<=10]
less10location.value_counts()

count
1     474
2     172
3     131
4      76
5      57
7      39
6      37
8      34
9      21
10     13
Name: count, dtype: int64

In [72]:
data['location']=data['location'].apply(lambda x : 'other ' if x in less10location else  x)

In [73]:
data['location'].value_counts()

location
other                        2886
Whitefield                    541
Sarjapur  Road                399
Electronic City               304
Kanakpura Road                273
                             ... 
Tindlu                         11
Marsur                         11
2nd Phase Judicial Layout      11
Thyagaraja Nagar               11
HAL 2nd Stage                  11
Name: count, Length: 242, dtype: int64

## outlier detection 

In [74]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [75]:
(data['total_sqft']/data['bhk']).describe()

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [76]:
data= data[(data['total_sqft']/data['bhk'])>=300]

In [77]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [78]:
data['price_per_sqft'].value_counts()


price_per_sqft
5000.000000    146
4000.000000    135
6666.666667     68
3500.000000     65
6250.000000     57
              ... 
4439.583333      1
8041.237113      1
6187.845304      1
4917.469051      1
4698.847835      1
Name: count, Length: 7405, dtype: int64

In [79]:

data['location'] = data['location'].apply(lambda x: x.strip())
location_counts = data['location'].value_counts()
less_than_10 = location_counts[location_counts <= 10]
data['location'] = data['location'].apply(lambda x: 'other' if x in less_than_10 else x)




data['sqft_per_bhk'] = data['total_sqft'] / data['bhk']
data['bath_per_bhk'] = data['bath'] / data['bhk']


# removing the outlier.
upper_limit = data['price_per_sqft'].quantile(0.99)
lower_limit = data['price_per_sqft'].quantile(0.01)
data = data[(data['price_per_sqft'] >= lower_limit) & (data['price_per_sqft'] <= upper_limit)]



data = pd.get_dummies(data, columns=['location'], drop_first=True)



In [81]:
#using linear regression

features = ['total_sqft', 'bath', 'bhk', 'price_per_sqft', 'sqft_per_bhk', 'bath_per_bhk']
X = data[features]
y = data['price']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)



lr_mse = mean_squared_error(y_test, lr_pred)
lr_r2 = r2_score(y_test, lr_pred)

print("Linear Regression Results:")
print(f"Mean Squared Error is  {lr_mse}")
print(f"R-squared : {lr_r2}")



Linear Regression Results:
Mean Squared Error is  1062.132452420326
R-squared : 0.9028707901528024


In [83]:
# random forest
rf_model = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
}
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
rf_best_model = grid_search.best_estimator_



rf_pred = rf_best_model.predict(X_test_scaled)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)
rf_mae = mean_absolute_error(y_test, rf_pred)


print("Random Forest Results:")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Mean Squared Error: {rf_mse}")
print(f"Mean Absolute Error: {rf_mae}")
print(f"R-squared: {rf_r2}")



rf_cv_scores = cross_val_score(rf_best_model, X_train_scaled, y_train, cv=5, scoring='r2')
print(f"Average R² from Cross-Validation: {rf_cv_scores.mean()}")

rf_cv_scores = cross_val_score(rf_best_model, X_train_scaled, y_train, cv=5, scoring='r2')
print(f"Average R² from Cross-Validation: {rf_cv_scores.mean()}")

Random Forest Results:
Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Mean Squared Error: 101.19942033138011
Mean Absolute Error: 1.2509125652843134
R-squared: 0.9907455800720686
Average R² from Cross-Validation: 0.9859965524714754
Average R² from Cross-Validation: 0.9859965524714754


In [84]:
total_sqft = float(input("Enter the total square feet: "))
bhk = int(input("Enter the number of bedrooms (BHK): "))
bath = int(input("Enter the number of bathrooms: "))


sqft_per_bhk = total_sqft / bhk
bath_per_bhk = bath / bhk
price_per_sqft = total_sqft * 0.01 


new_data = pd.DataFrame({
    'total_sqft': [total_sqft],
    'bath': [bath],
    'bhk': [bhk],
    'price_per_sqft': [price_per_sqft],
    'sqft_per_bhk': [sqft_per_bhk],
    'bath_per_bhk': [bath_per_bhk],
})


new_data_scaled = scaler.transform(new_data)
predicted_price = rf_best_model.predict(new_data_scaled)
print(f"Predicted Price for new data in lakhs : {predicted_price[0]}")


Enter the total square feet:  3000
Enter the number of bedrooms (BHK):  3
Enter the number of bathrooms:  3


Predicted Price for new data in lakhs : 84.43


In [85]:
mape = np.mean(np.abs((y_test -lr_pred) / y_test)) * 100
accuracy = 100 - mape
print(f"Mean Absolute Percentage Error of linear regression is : {mape:.2f}%")
print(f"Accuracy: {accuracy:.2f}%")


Mean Absolute Percentage Error of linear regression is : 21.15%
Accuracy: 78.85%


In [86]:
mape = np.mean(np.abs((y_test -rf_pred) / y_test)) * 100
accuracy = 100 - mape
print(f"Mean Absolute Percentage Error of random forest is : {mape:.2f}%")
print(f"Accuracy: {accuracy:.2f}%")


Mean Absolute Percentage Error of random forest is : 0.66%
Accuracy: 99.34%
