In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_excel('data_ready_for_model_building.xlsx')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,price,size_sqft,price_per_sqft,city,BHK
0,0,118.0,2037.0,5793.0,Bangalore,3
1,1,57.56,1240.0,4642.0,Bangalore,2
2,3,133.0,1641.0,8133.0,Bangalore,3
3,4,55.32,957.0,5787.0,Bangalore,2
4,5,83.47,1575.0,5300.0,Bangalore,3


In [4]:
data.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
data.shape

(46175, 5)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46175 entries, 0 to 46174
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   price           46175 non-null  float64
 1   size_sqft       46175 non-null  float64
 2   price_per_sqft  46175 non-null  float64
 3   city            46175 non-null  object 
 4   BHK             46175 non-null  int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 1.8+ MB


In [7]:
for column in data.columns:
    print(data[column].value_counts())
    print("*"*20)

65.00      494
75.00      478
70.00      454
85.00      454
120.00     428
          ... 
85.92        1
1549.00      1
61.27        1
75.66        1
55.14        1
Name: price, Length: 6422, dtype: int64
********************
1200.000000    813
1834.322174    543
1250.000000    507
1500.000000    463
1100.000000    449
              ... 
4068.000000      1
3129.000000      1
4976.000000      1
4969.000000      1
4334.000000      1
Name: size_sqft, Length: 3503, dtype: int64
********************
5000.000000     438
4500.000000     275
4000.000000     241
5500.000000     232
5200.000000     183
               ... 
5267.118134       1
7494.145199       1
11371.841155      1
6179.196704       1
3217.821782       1
Name: price_per_sqft, Length: 23304, dtype: int64
********************
Bangalore        30604
Hyderabad         5383
Greater Noida     3273
Lucknow           2793
Faridabad         2170
Ghaziabad         1056
Gachibowli         632
Mysore             264
Name: city, dtype: int64


In [8]:
data.describe()

Unnamed: 0,price,size_sqft,price_per_sqft,BHK
count,46175.0,46175.0,46175.0,46175.0
mean,196.681114,1831.907,11405.208568,2.751446
std,317.967073,9579.613,11972.960154,0.950095
min,5.99,300.0,2.257423,1.0
25%,62.5,1135.0,4956.0,2.0
50%,96.0,1400.0,6500.0,3.0
75%,222.0,1852.0,10153.5,3.0
max,32260.0,1306800.0,50000.0,9.0


 - min value of price_per_sqft is 2.25 which is surely an outlier

In [9]:
(data['size_sqft']/data['BHK']).describe()

count     46175.000000
mean        654.890895
std        3620.913386
min         300.000000
25%         483.000000
50%         565.500000
75%         652.000000
max      653400.000000
dtype: float64

- max of size_sqft/BHK is 653400 which looks to be like an outlier

In [13]:
data = data[~(data['size_sqft'] > 100000)]
data.head()

Unnamed: 0,price,size_sqft,price_per_sqft,city,BHK
0,118.0,2037.0,5793.0,Bangalore,3
1,57.56,1240.0,4642.0,Bangalore,2
2,133.0,1641.0,8133.0,Bangalore,3
3,55.32,957.0,5787.0,Bangalore,2
4,83.47,1575.0,5300.0,Bangalore,3


In [11]:
data = data[((data['size_sqft']/data['BHK']) <= 100000)]
data.describe()

Unnamed: 0,price,size_sqft,price_per_sqft,BHK
count,46165.0,46165.0,46165.0,46165.0
mean,195.351108,1723.409362,11406.646494,2.751088
std,246.707591,1531.985356,11973.054469,0.949153
min,5.99,300.0,143.480257,1.0
25%,62.5,1135.0,4956.521739,2.0
50%,96.0,1400.0,6500.0,3.0
75%,222.0,1852.0,10154.0,3.0
max,6600.0,100000.0,50000.0,9.0


In [17]:
X = data.drop(columns=['price'])
y = data['price']

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state=0)

In [19]:
X_train.shape

(36932, 4)

In [21]:
X_test

Unnamed: 0,size_sqft,price_per_sqft,city,BHK
44879,4800.0,4895.833333,Lucknow,5
45810,1800.0,8055.555556,Lucknow,3
41523,2050.0,32780.487805,Greater Noida,3
19664,717.0,7334.000000,Bangalore,2
21759,1350.0,11111.111111,Bangalore,3
...,...,...,...,...
5858,3170.0,8500.000000,Bangalore,3
40784,1110.0,39720.720721,Ghaziabad,3
14085,1704.0,6888.000000,Bangalore,3
19811,1100.0,3643.000000,Bangalore,3


## Applying Linear Regression

In [22]:
column_trans = make_column_transformer((OneHotEncoder(sparse=False), ['city']), remainder='passthrough')

In [23]:
scaler = StandardScaler()

In [25]:
lr = LinearRegression()

In [26]:
pipe = make_pipeline(column_trans, scaler, lr)

In [27]:
pipe.fit(X_train, y_train)



In [31]:
y_train

19540    120.0
893       70.0
19605    301.0
24863    325.0
9527     166.0
         ...  
21243     95.0
45901     57.0
42620    950.0
43577     54.9
2732      95.1
Name: price, Length: 36932, dtype: float64

In [32]:
y_test

44879    235.0
45810    145.0
41523    672.0
19664     74.0
21759    150.0
         ...  
5858     269.0
40784    440.9
14085    117.0
19811     47.5
33115     85.0
Name: price, Length: 9233, dtype: float64

In [30]:
y_pred_lr = pipe.predict(X_test)
y_pred_lr

array([324.78824167, 163.17593699, 456.67886667, ..., 144.71792917,
        49.28506785,  68.17105518])

In [29]:
r2_score(y_test, y_pred_lr)

0.772073429425526