In [69]:
# We are working on the project --> Bengaluru_House_Data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")

In [70]:
df1 = pd.read_csv("Bengaluru_House_Data.csv")
df1.head(3)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0


In [71]:
df1.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [72]:
df1.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [73]:
df1.shape

(13320, 9)

In [74]:
# Drop the columns that are not used in the rest of this analysis.
unused_columns = ['area_type', 'society', 'balcony', 'availability']
df2 = df1.drop(columns=unused_columns)
df2.shape

(13320, 5)

# Data Cleaning

In [75]:
df2.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [76]:
df2.shape

(13320, 5)

In [77]:
df3 = df2.dropna()
df3.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [78]:
df3.shape

(13246, 5)

In [79]:
s1 = "3 bhk"
int(s1.split()[0])

3

In [80]:
# Feature Engineering


In [81]:
# Derive the bedroom count from the leading integer of `size` ("2 BHK" -> 2, "4 Bedroom" -> 4).
# `assign` builds a new frame instead of writing a column into the result of `dropna()`,
# which pandas may flag as a copy of df2 (SettingWithCopyWarning, hidden here by
# warnings.filterwarnings("ignore")).
df3 = df3.assign(bhk=df3['size'].apply(lambda x: int(x.split()[0])))
df3.bhk.unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

In [82]:
df3.head(3)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3


In [83]:
df3['total_sqft'].value_counts()

total_sqft
1200    843
1100    221
1500    204
2400    195
600     180
       ... 
5985      1
3580      1
2461      1
1437      1
4689      1
Name: count, Length: 2067, dtype: int64

In [84]:
df3['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [85]:
# Explore Total SQFT Feature

In [86]:
# Handle Dashes

In [87]:
def is_float(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only the errors that ``float()`` raises for unparseable input are caught
    (``ValueError`` for bad strings, ``TypeError`` for unsupported types such
    as ``None``), so unrelated exceptions like KeyboardInterrupt still propagate.
    """
    try:
        float(x)
    except (TypeError, ValueError):
        return False
    return True

In [88]:
# Show the first rows whose total_sqft cannot be parsed as a float ("~" negates the boolean mask).
not_numeric = ~df3['total_sqft'].apply(is_float)
df3[not_numeric].head(10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2
648,Arekere,9 Bedroom,4125Perch,9.0,265.0,9
661,Yelahanka,2 BHK,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,445.0,4


In [89]:
# It is a demo --> How we can use split method in function.
s1 = "3090 - 5002"
x = s1.split("-")
(int(x[0])+int(x[1]))/2

4046.0

In [90]:
# The rows above show that total_sqft can be a range (e.g. 2100 - 2850). For such cases we can just take the average of the min and max values of the range.
# There are other cases where the value is given in another unit (e.g. Sq. Meter), which could be converted to sqft using unit conversion; in this notebook those rows are simply dropped.


In [91]:
def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` value to a float.

    Handles two formats:
      * a plain number, e.g. ``"1056"`` -> ``1056.0``
      * a range ``"low - high"``, e.g. ``"2100 - 2850"`` -> the midpoint ``2475.0``

    Anything else (values carrying a unit such as ``"34.46Sq. Meter"``, malformed
    ranges, ``None``) returns ``None`` so the caller can filter it out as a null.
    """
    # Try the plain-number form first so that strings like "-5" or "1e-5", which
    # contain a dash but are valid numbers, are not mistaken for a range.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    if not isinstance(x, str):
        return None

    tokens = x.split('-')
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2  # midpoint: (a + b) / 2
        except ValueError:
            # Two dash-separated parts that are not both numbers.
            return None
    return None

In [92]:
# Parse total_sqft into floats on a fresh frame, then drop the rows that could not be parsed.
df4 = df3.assign(total_sqft=df3['total_sqft'].apply(convert_sqft_to_num))
df4 = df4.loc[df4['total_sqft'].notna()]
df4.shape

(13200, 6)

In [93]:
df3.shape

(13246, 6)

In [94]:
df4.head(3)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3


In [95]:
# Add New Feature Called Price Per Square Feet

In [96]:
# price_per_sqft = price * 100000 / total_sqft
# (the 100000 factor presumably converts a price quoted in lakhs to rupees -- TODO confirm the unit).
df5 = df4.assign(price_per_sqft=df4['price'] * 100000 / df4['total_sqft'])
df5.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [97]:
# `location` is a categorical variable with many distinct values, so its cardinality
# needs to be reduced. Start by trimming whitespace and counting the rows per location.

df5['location'] = df5['location'].str.strip()
location_stats = df5['location'].value_counts()
location_stats

location
Whitefield                   533
Sarjapur  Road               392
Electronic City              304
Kanakpura Road               264
Thanisandra                  235
                            ... 
Rajanna Layout                 1
Subramanyanagar                1
Lakshmipura Vidyaanyapura      1
Malur Hosur Road               1
Abshot Layout                  1
Name: count, Length: 1287, dtype: int64

In [98]:
df5.head(3)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556


In [99]:
len(location_stats[location_stats > 10]) # checking how many values is greater than 10.

240

In [100]:
len(location_stats)

1287

In [101]:
len(location_stats[location_stats.le(10)])  # number of locations with 10 or fewer rows

1047

In [102]:
# Dimensionality Reductions --> # We can say that we are optimising this data.

In [103]:
# Any location with 10 or fewer data points is tagged as "unknown". This reduces the number of categories by a huge amount.
# Later on, when we do one-hot encoding, it will leave us with fewer dummy columns.

In [104]:
location_stats_less_than_10 = location_stats[location_stats <= 10]
location_stats_less_than_10

location
BTM 1st Stage                10
Gunjur Palya                 10
Nagappa Reddy Layout         10
Sector 1 HSR Layout          10
Thyagaraja Nagar             10
                             ..
Rajanna Layout                1
Subramanyanagar               1
Lakshmipura Vidyaanyapura     1
Malur Hosur Road              1
Abshot Layout                 1
Name: count, Length: 1047, dtype: int64

In [105]:
len(df5.location.unique())

1287

In [106]:
# Replace every rare location (the index labels of location_stats_less_than_10) with "unknown".
rare_locations = location_stats_less_than_10.index
df5['location'] = df5['location'].where(~df5['location'].isin(rare_locations), "unknown")
df5['location'].nunique()

241

In [107]:
df5.location.value_counts()

location
unknown            2872
Whitefield          533
Sarjapur  Road      392
Electronic City     304
Kanakpura Road      264
                   ... 
Doddaballapur        11
Tindlu               11
Marsur               11
HAL 2nd Stage        11
Kodigehalli          11
Name: count, Length: 241, dtype: int64

In [108]:
df5['bhk'].unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

In [109]:
# Blank out (set to NaN) every bhk value above 11; the column becomes float as a result.
df5['bhk'] = df5['bhk'].where(df5['bhk'] <= 11)
df5['bhk'].unique()

array([ 2.,  4.,  3.,  6.,  1.,  8.,  7.,  5., 11.,  9., nan, 10.])

In [110]:
df5.isnull().sum()

location          0
size              0
total_sqft        0
bath              0
price             0
bhk               8
price_per_sqft    0
dtype: int64

In [111]:
# Fill the blanked-out bhk values with the most frequent bhk.
# The result is assigned back instead of calling fillna(inplace=True) on `df5['bhk']`:
# the in-place call acts on an intermediate object, which pandas warns about and which
# does not update df5 when Copy-on-Write is enabled.
# NOTE(review): this gives rows such as "27 BHK" the modal bhk value; dropping those rows
# may be more appropriate -- confirm.
df5['bhk'] = df5['bhk'].fillna(df5['bhk'].mode()[0])

In [112]:
df5.isnull().sum()

location          0
size              0
total_sqft        0
bath              0
price             0
bhk               0
price_per_sqft    0
dtype: int64

In [113]:
df5.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2.0,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4.0,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3.0,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3.0,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2.0,4250.0


In [114]:
# Note --> In industry, we can't do anything without meeting. (Domain Knowledge) 
# -------> As a fresher our first priority is "boxplot" for "detect and remove outliers", then use "Domain knowledge"

In [115]:
# Outlier Removal Using Business Logic

In [116]:
# As a data scientist, when you have a conversation with your business manager (who has expertise in real estate),
# they will tell you that the normal square footage per bedroom is 300 (i.e. a 2 BHK apartment is at least 600 sqft).
# If you have, for example, a 400 sqft apartment with 2 BHK, that seems suspicious and can be removed as an outlier.
# We will remove such outliers by keeping our minimum threshold per BHK at 300 sqft.

In [117]:
df5[df5.total_sqft/df5.bhk<300].head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
9,unknown,6 Bedroom,1020.0,6.0,370.0,6.0,36274.509804
45,HSR Layout,8 Bedroom,600.0,9.0,200.0,8.0,33333.333333
58,Murugeshpalya,6 Bedroom,1407.0,4.0,150.0,6.0,10660.98081
68,Devarachikkanahalli,8 Bedroom,1350.0,7.0,85.0,8.0,6296.296296
70,unknown,3 Bedroom,500.0,3.0,100.0,3.0,20000.0


In [118]:
df5.head(3)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2.0,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4.0,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3.0,4305.555556


In [119]:
# Check the above Data Points. We have 6 BHK Apartments with 1020 SQFT. Another one is 8 BHK And the total SQFT is 600. 
 # These are clear data errors that can be removed safely

In [120]:
# Drop the rows that have less than 300 sqft per bedroom.
too_small = df5['total_sqft'].div(df5['bhk']).lt(300)
df6 = df5.loc[~too_small]
df6.shape

(12462, 7)

In [121]:
df6.bath.unique()

array([ 2.,  5.,  3.,  4.,  1.,  8.,  6.,  7.,  9., 14., 27., 12., 16.,
       40., 15., 10., 13., 18.])

In [122]:
df6[df6.bath>10]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
1078,unknown,9 Bedroom,3300.0,14.0,500.0,9.0,15151.515152
1718,unknown,27 BHK,8000.0,27.0,230.0,2.0,2875.0
3096,unknown,10 BHK,12000.0,12.0,525.0,10.0,4375.0
3379,unknown,19 BHK,2000.0,16.0,490.0,2.0,24500.0
3609,unknown,16 BHK,10000.0,16.0,550.0,2.0,5500.0
4684,Munnekollal,43 Bedroom,2400.0,40.0,660.0,2.0,27500.0
4916,unknown,14 BHK,1250.0,15.0,125.0,2.0,10000.0
7979,unknown,11 BHK,6000.0,12.0,150.0,11.0,2500.0
8636,Neeladri Nagar,10 BHK,4000.0,12.0,160.0,10.0,4000.0
9935,unknown,13 BHK,5425.0,13.0,275.0,2.0,5069.124424


In [123]:
# It is unusual to have 2 more bathrooms than number of bedrooms in a home

In [124]:
df6[df6.bath>df6.bhk+2]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
1078,unknown,9 Bedroom,3300.0,14.0,500.0,9.0,15151.515152
1718,unknown,27 BHK,8000.0,27.0,230.0,2.0,2875.0
2620,unknown,6 BHK,11338.0,9.0,1000.0,6.0,8819.897689
3379,unknown,19 BHK,2000.0,16.0,490.0,2.0,24500.0
3609,unknown,16 BHK,10000.0,16.0,550.0,2.0,5500.0
4684,Munnekollal,43 Bedroom,2400.0,40.0,660.0,2.0,27500.0
4916,unknown,14 BHK,1250.0,15.0,125.0,2.0,10000.0
6533,Mysore Road,12 Bedroom,2232.0,6.0,300.0,2.0,13440.860215
6838,Rajaji Nagar,5 BHK,7500.0,8.0,1700.0,5.0,22666.666667
7709,Chikkabanavar,4 Bedroom,2460.0,7.0,80.0,4.0,3252.03252


In [125]:
# Again the business manager has a conversation with you (i.e A data Scientist) That if you have a 4 bedroom home and even 
# if you have a bathroom in all 4 rooms plus one guest bathroom, you will have a total bath = total bed + 1 max. 
# Anything above that is an outlier or a data error and can be removed.

In [126]:
# Keep only the homes with fewer than (bhk + 2) bathrooms.
bath_ok = df6['bath'].lt(df6['bhk'].add(2))
df7 = df6.loc[bath_ok]
df7.shape

(12301, 7)

In [127]:
# `size` is superseded by `bhk`, and `price_per_sqft` was only a helper column; drop both.
df8 = df7.drop(columns=["size", "price_per_sqft"])
df8.head(3)

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056.0,2.0,39.07,2.0
1,Chikka Tirupathi,2600.0,5.0,120.0,4.0
2,Uttarahalli,1440.0,2.0,62.0,3.0


In [128]:
# using Ml from here -------->

In [129]:
# Replace each location name with the integer code that LabelEncoder assigns to it.
# NOTE(review): the codes are arbitrary integers, yet the regression fitted later in this
# notebook treats the column as a numeric feature; the notes earlier in this notebook mention
# one-hot encoding as the plan. scikit-learn documents LabelEncoder for target labels --
# consider pd.get_dummies / OneHotEncoder for this feature; confirm before changing.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df8['location'] = le.fit_transform(df8['location'])

In [130]:
df8.head(3)

Unnamed: 0,location,total_sqft,bath,price,bhk
0,79,1056.0,2.0,39.07,2.0
1,60,2600.0,5.0,120.0,4.0
2,225,1440.0,2.0,62.0,3.0


In [131]:
X = df8.drop('price', axis = 1)
y = df8['price']

In [132]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_sc = sc.fit_transform(X)

In [133]:
from sklearn.model_selection import train_test_split

# Split the unscaled features first, then fit the scaler on the training rows only.
# Scaling with statistics computed from all rows (as X_sc was) lets information about
# the test rows leak into training, which makes the test evaluation optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Rebind `sc` to the train-fitted scaler so later cells that transform new data use it.
sc = StandardScaler().fit(X_train_raw)
X_train = sc.transform(X_train_raw)
X_test = sc.transform(X_test_raw)

In [135]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

In [136]:
y_pred = lr.predict(X_test)

In [137]:
# Put the actual and predicted prices of the test rows side by side for comparison.
calc = pd.DataFrame({
    'Original Price': np.asarray(y_test),
    'Predicted Price': y_pred,
})

In [138]:
calc

Unnamed: 0,Original Price,Predicted Price
0,18.0,2.986041
1,23.5,-5.324339
2,128.0,59.940366
3,45.0,70.948938
4,75.0,57.978599
...,...,...
2456,95.0,70.461644
2457,155.0,158.848079
2458,72.0,54.604003
2459,115.0,92.452253
