In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Load the Dataset

In [2]:
data = pd.read_csv("Bengaluru_House_Data.csv")

In [3]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
data.shape

(13320, 9)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [6]:
# Frequency table of every column, each followed by a 20-asterisk rule.
for column in data.columns:
    print(data[column].value_counts(), "*" * 20, sep="\n")

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
********************
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64
********************
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64
********************
2 BHK         5199
3 BHK        

# Preprocessing Dataset

In [7]:
# Discard the `society` column (rebinding instead of mutating in place).
data = data.drop(columns=['society'])

In [8]:
# Keep only the first occurrence of each fully identical row.
data = data.drop_duplicates()

In [9]:
# Fill missing `bath` values with the column mean.
# Assign the result back rather than calling fillna(inplace=True) on the
# selected column: that chained in-place form is not guaranteed to update `data`.
data['bath'] = data['bath'].fillna(data['bath'].mean())

In [10]:
# Fill missing `balcony` values with the column mean.
# Assign the result back rather than calling fillna(inplace=True) on the
# selected column: that chained in-place form is not guaranteed to update `data`.
data['balcony'] = data['balcony'].fillna(data['balcony'].mean())

In [11]:
# Fill missing `size` values with the most frequent category.
# The result is assigned back instead of using a chained inplace fillna,
# which is not guaranteed to update `data`.
most_frequent_value = data['size'].mode()[0]
data['size'] = data['size'].fillna(most_frequent_value)

In [12]:
# Fill missing `location` values with the most frequent location.
# The result is assigned back instead of using a chained inplace fillna,
# which is not guaranteed to update `data`.
most_frequent_value = data['location'].mode()[0]
data['location'] = data['location'].fillna(most_frequent_value)

In [13]:
data.isna().sum()

area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [14]:
data.shape

(12752, 8)

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12752 entries, 0 to 13318
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     12752 non-null  object 
 1   availability  12752 non-null  object 
 2   location      12752 non-null  object 
 3   size          12752 non-null  object 
 4   total_sqft    12752 non-null  object 
 5   bath          12752 non-null  float64
 6   balcony       12752 non-null  float64
 7   price         12752 non-null  float64
dtypes: float64(3), object(5)
memory usage: 896.6+ KB


In [16]:
# Same per-column frequency overview, now on the de-duplicated, imputed frame.
for column in data.columns:
    print(data[column].value_counts(), "*" * 20, sep="\n")

Super built-up  Area    8279
Built-up  Area          2397
Plot  Area              1989
Carpet  Area              87
Name: area_type, dtype: int64
********************
Ready To Move    10140
18-May             290
18-Dec             283
18-Apr             269
18-Aug             187
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64
********************
Whitefield                        524
Sarjapur  Road                    379
Electronic City                   286
Kanakpura Road                    242
Thanisandra                       229
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64
********************
2 BHK         4920
3 BHK        

In [17]:
# `area_type` and `availability` are not used as features; remove them.
data = data.drop(columns=['area_type', 'availability'])

In [18]:
data.shape

(12752, 6)

In [19]:
# The first whitespace-separated token of `size` (e.g. "2" in "2 BHK") is the room count.
data['BHK'] = data['size'].str.split().str[0].astype(int)

In [20]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,BHK
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,1.0,51.0,2


In [21]:
# Keep listings with fewer than 15 rooms.
# .copy() makes the result an independent frame, so the column assignments in
# later cells (total_sqft, Price_Per_sqft, location) do not write into a slice
# of the previous frame.
data = data[data.BHK < 15].copy()
data

Unnamed: 0,location,size,total_sqft,bath,balcony,price,BHK
0,Electronic City Phase II,2 BHK,1056,2.0,1.000000,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.000000,120.00,4
2,Uttarahalli,3 BHK,1440,2.0,3.000000,62.00,3
3,Lingadheeranahalli,3 BHK,1521,3.0,1.000000,95.00,3
4,Kothanur,2 BHK,1200,2.0,1.000000,51.00,2
...,...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715,3.0,3.000000,112.00,3
13315,Whitefield,5 Bedroom,3453,4.0,0.000000,231.00,5
13316,Richards Town,4 BHK,3600,5.0,1.582531,400.00,4
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,1.000000,60.00,2


In [22]:
data['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [23]:
def convert(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number ("1056"), a range written as "low - high"
        ("1133 - 1384"), or any other text.

    Returns
    -------
    float or None
        The numeric value, the midpoint of the range, or ``None`` when the
        entry cannot be interpreted as either (e.g. "34.46Sq. Meter").
    """
    # Non-string input (already numeric, None, ...) has no .split(); convert directly.
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    sqft = x.split('-')
    if len(sqft) == 2:
        try:
            return (float(sqft[0]) + float(sqft[1])) / 2
        except ValueError:
            # Not a numeric range (e.g. "1e-3" or "abc - def"); fall through
            # and try the whole string as a single number.
            pass
    try:
        return float(x)
    except ValueError:
        return None

In [24]:
# Turn every raw `total_sqft` entry into a number using convert().
data['total_sqft'] = data['total_sqft'].map(convert)

In [25]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,BHK
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2


In [26]:
# Price per square foot. The 100000 factor presumably converts `price` from
# lakhs to rupees; TODO confirm the unit against the data source.
data['Price_Per_sqft'] = data['price'].mul(100000).div(data['total_sqft'])

In [27]:
data['Price_Per_sqft']

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13314     6530.612245
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
Name: Price_Per_sqft, Length: 12747, dtype: float64

In [28]:
data.describe()

Unnamed: 0,total_sqft,bath,balcony,price,BHK,Price_Per_sqft
count,12701.0,12747.0,12747.0,12747.0,12747.0,12701.0
mean,1569.399567,2.702272,1.582779,114.365413,2.809838,8030.424
std,1258.243553,1.279238,0.803501,151.533552,1.224426,108801.0
min,1.0,1.0,0.0,8.0,1.0,267.8298
25%,1100.0,2.0,1.0,50.0,2.0,4299.065
50%,1282.0,2.0,2.0,73.0,3.0,5487.805
75%,1691.0,3.0,2.0,121.5,3.0,7404.795
max,52272.0,15.0,3.0,3600.0,14.0,12000000.0


In [29]:
# Trim surrounding whitespace so the same location is not counted under two spellings,
# then tally how many listings each location has.
data['location'] = data['location'].str.strip()
location_count = data['location'].value_counts()

In [30]:
location_count

Whitefield                525
Sarjapur  Road            379
Electronic City           288
Kanakpura Road            242
Thanisandra               232
                         ... 
K R C kothanur              1
1Channasandra               1
Hosahalli                   1
Vijayabank bank layout      1
Abshot Layout               1
Name: location, Length: 1291, dtype: int64

In [31]:
# Locations with 10 or fewer listings (despite the name, exactly 10 is included).
location_count_less_10 = location_count.loc[location_count.le(10)]

In [32]:
location_count_less_10

Dodsworth Layout          10
BTM 1st Stage             10
Sector 1 HSR Layout       10
Gunjur Palya              10
Marsur                    10
                          ..
K R C kothanur             1
1Channasandra              1
Hosahalli                  1
Vijayabank bank layout     1
Abshot Layout              1
Name: location, Length: 1058, dtype: int64

In [33]:
# Collapse every rare location (an index label of location_count_less_10) into 'other'.
data['location'] = data['location'].mask(
    data['location'].isin(location_count_less_10.index), 'other'
)

In [34]:
data['location'].value_counts()

other               2926
Whitefield           525
Sarjapur  Road       379
Electronic City      288
Kanakpura Road       242
                    ... 
Kodigehalli           11
LB Shastri Nagar      11
Thyagaraja Nagar      11
Tindlu                11
HAL 2nd Stage         11
Name: location, Length: 234, dtype: int64

In [35]:
data.describe()

Unnamed: 0,total_sqft,bath,balcony,price,BHK,Price_Per_sqft
count,12701.0,12747.0,12747.0,12747.0,12747.0,12701.0
mean,1569.399567,2.702272,1.582779,114.365413,2.809838,8030.424
std,1258.243553,1.279238,0.803501,151.533552,1.224426,108801.0
min,1.0,1.0,0.0,8.0,1.0,267.8298
25%,1100.0,2.0,1.0,50.0,2.0,4299.065
50%,1282.0,2.0,2.0,73.0,3.0,5487.805
75%,1691.0,3.0,2.0,121.5,3.0,7404.795
max,52272.0,15.0,3.0,3600.0,14.0,12000000.0


In [36]:
# Keep listings with at least 300 sqft per room; rows whose total_sqft is
# missing compare as False and are dropped as well.
data = data.loc[data['total_sqft'].div(data['BHK']).ge(300)]

In [37]:
data.describe()

Unnamed: 0,total_sqft,bath,balcony,price,BHK,Price_Per_sqft
count,11971.0,11971.0,11971.0,11971.0,11971.0,11971.0
mean,1606.741284,2.576298,1.586306,113.393472,2.66068,6369.133181
std,1283.457258,1.083639,0.797621,154.981816,0.981605,4231.939036
min,300.0,1.0,0.0,8.44,1.0,267.829813
25%,1118.0,2.0,1.0,50.0,2.0,4250.0
50%,1307.0,2.0,2.0,70.0,3.0,5333.333333
75%,1717.0,3.0,2.0,120.0,3.0,6973.880673
max,52272.0,14.0,3.0,3600.0,13.0,176470.588235


In [38]:
data.shape

(11971, 8)

# Removing Outliers

In [39]:
def remove_outliers_sqft(df):
    """Drop rows whose price per sqft is far from their location's mean.

    For each ``location`` group, a row is kept only if its ``Price_Per_sqft``
    lies in the interval ``(mean - std, mean + std]`` computed for that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``Price_Per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The kept rows, ordered by location group, with a fresh 0..n-1 index.
        An empty input yields an empty frame with the input's columns.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        m = np.mean(subdf.Price_Per_sqft)
        st = np.std(subdf.Price_Per_sqft)
        # NOTE: the lower bound is strict, so a group whose prices are all
        # identical (st == 0) contributes no rows.
        within_band = (subdf.Price_Per_sqft > (m - st)) & (subdf.Price_Per_sqft <= (m + st))
        kept.append(subdf[within_band])

    if not kept:
        # No groups at all: pd.concat([]) would raise, so return an empty frame.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept, ignore_index=True)

In [40]:
data=remove_outliers_sqft(data)

In [41]:
data.describe()

Unnamed: 0,total_sqft,bath,balcony,price,BHK,Price_Per_sqft
count,9863.0,9863.0,9863.0,9863.0,9863.0,9863.0
mean,1519.542114,2.489079,1.591296,92.839202,2.583291,5712.854088
std,893.835774,0.985653,0.787875,87.911911,0.899386,2289.105099
min,300.0,1.0,0.0,10.0,1.0,1250.0
25%,1108.0,2.0,1.0,49.865,2.0,4284.323272
50%,1300.0,2.0,2.0,68.0,2.0,5210.526316
75%,1664.0,3.0,2.0,101.0,3.0,6498.980899
max,30400.0,13.0,3.0,2200.0,13.0,24509.803922


In [42]:
def BHK_outlier_remover(df):
    """Drop listings priced per sqft below the mean of the next-smaller BHK.

    Within each ``location``, a row with ``BHK = n`` (n > 1) is removed when
    its ``Price_Per_sqft`` is lower than the mean ``Price_Per_sqft`` of the
    ``BHK = n - 1`` rows in the same location, provided that smaller group
    has more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``BHK`` and ``Price_Per_sqft``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the excluded rows; the original index labels are kept.
    """
    # Collect labels in a plain list so they keep their original type
    # (appending to a float ndarray would coerce integer labels to float64).
    exclude_indices = []
    for _, location_df in df.groupby('location'):
        # Group once and reuse for both the statistics and the filtering pass.
        bhk_groups = list(location_df.groupby('BHK'))
        BHK_stats = {
            BHK: {
                'mean': np.mean(BHK_df.Price_Per_sqft),
                'count': BHK_df.shape[0],
            }
            for BHK, BHK_df in bhk_groups
        }
        for BHK, BHK_df in bhk_groups:
            if BHK > 1:
                stats = BHK_stats.get(BHK - 1)
                if stats and stats['count'] > 5:
                    cheaper = BHK_df.Price_Per_sqft < stats['mean']
                    exclude_indices.extend(BHK_df.loc[cheaper].index.tolist())
    return df.drop(exclude_indices, axis="index")

In [43]:
data=BHK_outlier_remover(data)

In [44]:
data.shape

(7025, 8)

In [45]:
data

Unnamed: 0,location,size,total_sqft,bath,balcony,price,BHK,Price_Per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,1.000000,428.0,4,15017.543860
1,1st Block Jayanagar,3 BHK,1630.0,3.0,2.000000,194.0,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,3.000000,235.0,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,0.000000,130.0,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,2.000000,148.0,2,11983.805668
...,...,...,...,...,...,...,...,...
9853,other,2 BHK,1155.0,2.0,1.000000,64.0,2,5541.125541
9855,other,2 BHK,1200.0,2.0,3.000000,70.0,2,5833.333333
9856,other,1 BHK,1800.0,1.0,1.000000,200.0,1,11111.111111
9859,other,1 Bedroom,812.0,1.0,0.000000,26.0,1,3201.970443


In [46]:
# `size` is superseded by BHK, `Price_Per_sqft` was only needed for outlier
# removal, and `balcony` is not used as a feature.
data = data.drop(columns=['size', 'Price_Per_sqft', 'balcony'])

In [47]:
data

Unnamed: 0,location,total_sqft,bath,price,BHK
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2
...,...,...,...,...,...
9853,other,1155.0,2.0,64.0,2
9855,other,1200.0,2.0,70.0,2
9856,other,1800.0,1.0,200.0,1
9859,other,812.0,1.0,26.0,1


In [48]:
# Persist the cleaned data. index=False keeps the row index out of the file,
# so reading it back does not produce a spurious "Unnamed: 0" column.
data.to_csv('Clean_dataset.csv', index=False)

In [49]:
# Target is `price`; every remaining column is a feature.
y = data['price']
X = data.drop(columns='price')

# Model Training and Testing

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import r2_score

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=0)

In [52]:
X_train.shape

(5620, 4)

In [53]:
X_test.shape

(1405, 4)

# Applying Linear Regression

In [54]:
# One-hot encode `location`; pass the numeric columns through unchanged.
# handle_unknown='ignore' encodes a location that was not seen during fit as
# all zeros instead of raising at predict time.
# NOTE(review): `sparse` was renamed to `sparse_output` in newer scikit-learn
# releases; switch the keyword when upgrading.
column_trans = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), ['location']),
    remainder='passthrough',
)

In [55]:
scaler = StandardScaler()

In [56]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

categorical_features = ['location']

numerical_features = ['bath','BHK','total_sqft']

# One-hot encode the categorical column and standardise the numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), categorical_features),
        ('scaler', StandardScaler(), numerical_features)
    ]
)

# Fit on the training split only, so that no statistics from the held-out
# test rows leak into the encoder, the scaler or the regression.
X_preprocessed = preprocessor.fit_transform(X_train)

# `lr` is re-fitted when the pipeline containing it is fitted in a later cell.
lr = LinearRegression()
lr.fit(X_preprocessed, y_train)

In [57]:
pipe= make_pipeline(column_trans,scaler,lr) 

In [58]:
pipe.fit(X_train,y_train)

In [59]:
y_pred_lr = pipe.predict(X_test)

In [60]:
r2_score(y_test,y_pred_lr)

0.8493653302345087

## Lasso

In [61]:
lasso = Lasso()

In [62]:
pipe = make_pipeline(column_trans, scaler, lasso )

In [63]:
pipe.fit(X_train,y_train)

In [64]:
y_pred_lasso = pipe.predict(X_test)

In [65]:
r2_score(y_test,y_pred_lasso)

0.8396983628182132

## Ridge

In [66]:
ridge =Ridge()

In [67]:
pipe = make_pipeline(column_trans, scaler, ridge )

In [68]:
pipe.fit(X_train,y_train)

In [69]:
y_pred_ridge = pipe.predict(X_test)

In [70]:
r2_score(y_test,y_pred_ridge)

0.8537823452260311

In [71]:
# R^2 on the test split for each of the three models, one per line.
for label, predictions in (
    ("Linear Regression:", y_pred_lr),
    ("Lasso:", y_pred_lasso),
    ("Ridge:", y_pred_ridge),
):
    print(label, r2_score(y_test, predictions))

Linear Regression: 0.8493653302345087
Lasso: 0.8396983628182132
Ridge: 0.8537823452260311


In [72]:
import pickle

In [73]:
# Serialise the last fitted pipeline (the Ridge one). The context manager
# guarantees the file is flushed and closed even if pickling fails.
# NOTE(review): the extension is "pk1" (digit one), possibly meant as "pkl";
# it is left unchanged because other code may load the file by this name.
with open("RidgeModel.pk1", "wb") as model_file:
    pickle.dump(pipe, model_file)

In [74]:
import os

# Show the kernel's current working directory; the relative file names used in
# this notebook (input CSV, 'Clean_dataset.csv', "RidgeModel.pk1") resolve against it.
# NOTE(review): the saved output exposes an absolute local path; consider
# clearing it before sharing the notebook.
notebook_path = os.getcwd()
print(notebook_path)

C:\Users\sushanta
