In [629]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [630]:
df = pd.read_csv('bengaluru_house_prices.csv')

In [631]:
train_df = df.copy()

In [632]:
from sklearn.model_selection import train_test_split


train_df ,test_df =train_test_split(df,test_size=.2,random_state=69)
train_df ,val_df =train_test_split(df,test_size=.25,random_state=69)

In [633]:
train_df.describe()

Unnamed: 0,bath,balcony,price
count,9945.0,9533.0,9990.0
mean,2.693313,1.573272,111.359511
std,1.363846,0.81169,142.934786
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [634]:
INR_per_lakh = 100000
USD_per_INR = 1/83 

In [635]:
def usd_price(df):
    df['price_usd'] = df['price'] *INR_per_lakh * USD_per_INR

In [636]:
usd_price(train_df)
usd_price(val_df)
usd_price(test_df)

In [637]:
def extract_bedrooms(df):
    df['bedroom'] = df['size'].str.extract(r'(\d+)').astype(float)

In [638]:
extract_bedrooms(train_df)
extract_bedrooms(val_df)
extract_bedrooms(test_df)

In [639]:
train_df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,price_usd,bedroom
9496,Super built-up Area,18-Apr,Banashankari,2 BHK,Bregae,1430,2.0,2.0,87.80,105783.132530,2.0
12149,Super built-up Area,Ready To Move,Mallasandra,3 BHK,ShnyeSy,1524,2.0,3.0,72.00,86746.987952,3.0
9162,Super built-up Area,Ready To Move,Dairy Circle,2 BHK,Soeurri,1541,2.0,1.0,181.00,218072.289157,2.0
8871,Super built-up Area,Ready To Move,Jalahalli East,1 BHK,ShshaSa,750 - 800,1.0,0.0,34.10,41084.337349,1.0
8290,Super built-up Area,Ready To Move,Hennur,2 BHK,,1100,2.0,1.0,44.55,53674.698795,2.0
...,...,...,...,...,...,...,...,...,...,...,...
8631,Super built-up Area,18-Jun,Abbigere,3 BHK,DSestma,1326,2.0,1.0,35.00,42168.674699,3.0
9818,Super built-up Area,Ready To Move,Seegehalli,3 BHK,Niowshi,1683,3.0,3.0,80.00,96385.542169,3.0
10859,Super built-up Area,Ready To Move,Ambedkar Nagar,3 BHK,SoechHa,1935,4.0,2.0,125.00,150602.409639,3.0
4041,Super built-up Area,Ready To Move,Poorna Pragna Layout,3 BHK,,1270,2.0,2.0,50.79,61192.771084,3.0


In [640]:
train_df.total_sqft.value_counts()

total_sqft
1200            637
1100            175
1500            147
1000            137
2400            137
               ... 
1462              1
1932.47           1
7514              1
456               1
315Sq. Yards      1
Name: count, Length: 1875, dtype: int64

In [641]:
def convert_sqft(row):
    try:
        if '-' in row:
            num = row.split('-')
            if len(num) == 2:
                return (float(num[0]) +float(num[1])) / 2
        return float(row)
    except:
        return np.nan
    

In [642]:
train_df['sqft'] = train_df['total_sqft'].apply(convert_sqft)
test_df['sqft']= test_df['total_sqft'].apply(convert_sqft)
val_df['sqft']=val_df['total_sqft'].apply(convert_sqft)

In [643]:
train_df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,price_usd,bedroom,sqft
9496,Super built-up Area,18-Apr,Banashankari,2 BHK,Bregae,1430,2.0,2.0,87.80,105783.132530,2.0,1430.0
12149,Super built-up Area,Ready To Move,Mallasandra,3 BHK,ShnyeSy,1524,2.0,3.0,72.00,86746.987952,3.0,1524.0
9162,Super built-up Area,Ready To Move,Dairy Circle,2 BHK,Soeurri,1541,2.0,1.0,181.00,218072.289157,2.0,1541.0
8871,Super built-up Area,Ready To Move,Jalahalli East,1 BHK,ShshaSa,750 - 800,1.0,0.0,34.10,41084.337349,1.0,775.0
8290,Super built-up Area,Ready To Move,Hennur,2 BHK,,1100,2.0,1.0,44.55,53674.698795,2.0,1100.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8631,Super built-up Area,18-Jun,Abbigere,3 BHK,DSestma,1326,2.0,1.0,35.00,42168.674699,3.0,1326.0
9818,Super built-up Area,Ready To Move,Seegehalli,3 BHK,Niowshi,1683,3.0,3.0,80.00,96385.542169,3.0,1683.0
10859,Super built-up Area,Ready To Move,Ambedkar Nagar,3 BHK,SoechHa,1935,4.0,2.0,125.00,150602.409639,3.0,1935.0
4041,Super built-up Area,Ready To Move,Poorna Pragna Layout,3 BHK,,1270,2.0,2.0,50.79,61192.771084,3.0,1270.0


In [644]:
train_df=train_df.dropna(subset=['sqft','bedroom','location'])
val_df = val_df.dropna(subset=['sqft','bedroom','location'])
test_df = test_df.dropna(subset=['sqft','bedroom','location'])

In [645]:
train_df.isna().sum()

area_type          0
availability       0
location           0
size               0
society         4120
total_sqft         0
bath              38
balcony          447
price              0
price_usd          0
bedroom            0
sqft               0
dtype: int64

In [646]:
from sklearn.impute import SimpleImputer


In [647]:
bath_imputer = SimpleImputer(strategy='median')

In [648]:
bath_imputer.fit(train_df[['bath']])

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False


In [649]:
train_df['bath']=bath_imputer.transform(train_df[['bath']])
test_df['bath']=bath_imputer.transform(test_df[['bath']])
val_df['bath']=bath_imputer.transform(val_df[['bath']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['bath']=bath_imputer.transform(train_df[['bath']])


In [650]:
train_df['balcony']=train_df['balcony'].fillna(0)
val_df['balcony']=val_df['balcony'].fillna(0)
test_df['balcony']=test_df['balcony'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['balcony']=train_df['balcony'].fillna(0)


In [651]:
def sqft_per_bedroom(df):
    df = df[df["sqft"] / df["bedroom"] >= 200]


In [652]:
sqft_per_bedroom(train_df)
sqft_per_bedroom(val_df)
sqft_per_bedroom(test_df)

In [653]:
def listing_frequecny(df):
    location_count = df['location'].value_counts()
    df['location_listing_frequency'] = df['location'].map(location_count)

In [654]:
listing_frequecny(train_df)
listing_frequecny(val_df)
listing_frequecny(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['location_listing_frequency'] = df['location'].map(location_count)


In [655]:
def price_per_sqft(df):
    df['price_per_sqft'] = df.groupby('location')['price'].transform('mean')

In [656]:
price_per_sqft(train_df)
price_per_sqft(val_df)
price_per_sqft(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_per_sqft'] = df.groupby('location')['price'].transform('mean')


In [657]:
train_df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,price_usd,bedroom,sqft,location_listing_frequency,price_per_sqft
9496,Super built-up Area,18-Apr,Banashankari,2 BHK,Bregae,1430,2.0,2.0,87.80,105783.132530,2.0,1430.0,47,102.011064
12149,Super built-up Area,Ready To Move,Mallasandra,3 BHK,ShnyeSy,1524,2.0,3.0,72.00,86746.987952,3.0,1524.0,12,67.608333
9162,Super built-up Area,Ready To Move,Dairy Circle,2 BHK,Soeurri,1541,2.0,1.0,181.00,218072.289157,2.0,1541.0,7,314.571429
8871,Super built-up Area,Ready To Move,Jalahalli East,1 BHK,ShshaSa,750 - 800,1.0,0.0,34.10,41084.337349,1.0,775.0,11,56.007273
8290,Super built-up Area,Ready To Move,Hennur,2 BHK,,1100,2.0,1.0,44.55,53674.698795,2.0,1100.0,47,76.455851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8631,Super built-up Area,18-Jun,Abbigere,3 BHK,DSestma,1326,2.0,1.0,35.00,42168.674699,3.0,1326.0,16,58.525625
9818,Super built-up Area,Ready To Move,Seegehalli,3 BHK,Niowshi,1683,3.0,3.0,80.00,96385.542169,3.0,1683.0,22,130.842727
10859,Super built-up Area,Ready To Move,Ambedkar Nagar,3 BHK,SoechHa,1935,4.0,2.0,125.00,150602.409639,3.0,1935.0,25,188.280000
4041,Super built-up Area,Ready To Move,Poorna Pragna Layout,3 BHK,,1270,2.0,2.0,50.79,61192.771084,3.0,1270.0,16,76.735625


In [658]:
train_df.columns


Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price', 'price_usd', 'bedroom',
       'sqft', 'location_listing_frequency', 'price_per_sqft'],
      dtype='object')

In [659]:
input_cols = [ 'bath', 'balcony', 'bedroom','sqft','location_listing_frequency','price_per_sqft']
target_col = 'price_usd'

In [660]:
top_locations = train_df.location.value_counts().nlargest(25).index
def loc_end(df):
    df['location_encoded'] = df['location'].where(df['location'].isin(top_locations),'Other')


In [661]:
loc_end(train_df)
loc_end(val_df)
loc_end(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['location_encoded'] = df['location'].where(df['location'].isin(top_locations),'Other')


In [662]:
train_df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,price_usd,bedroom,sqft,location_listing_frequency,price_per_sqft,location_encoded
9496,Super built-up Area,18-Apr,Banashankari,2 BHK,Bregae,1430,2.0,2.0,87.80,105783.132530,2.0,1430.0,47,102.011064,Other
12149,Super built-up Area,Ready To Move,Mallasandra,3 BHK,ShnyeSy,1524,2.0,3.0,72.00,86746.987952,3.0,1524.0,12,67.608333,Other
9162,Super built-up Area,Ready To Move,Dairy Circle,2 BHK,Soeurri,1541,2.0,1.0,181.00,218072.289157,2.0,1541.0,7,314.571429,Other
8871,Super built-up Area,Ready To Move,Jalahalli East,1 BHK,ShshaSa,750 - 800,1.0,0.0,34.10,41084.337349,1.0,775.0,11,56.007273,Other
8290,Super built-up Area,Ready To Move,Hennur,2 BHK,,1100,2.0,1.0,44.55,53674.698795,2.0,1100.0,47,76.455851,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8631,Super built-up Area,18-Jun,Abbigere,3 BHK,DSestma,1326,2.0,1.0,35.00,42168.674699,3.0,1326.0,16,58.525625,Other
9818,Super built-up Area,Ready To Move,Seegehalli,3 BHK,Niowshi,1683,3.0,3.0,80.00,96385.542169,3.0,1683.0,22,130.842727,Other
10859,Super built-up Area,Ready To Move,Ambedkar Nagar,3 BHK,SoechHa,1935,4.0,2.0,125.00,150602.409639,3.0,1935.0,25,188.280000,Other
4041,Super built-up Area,Ready To Move,Poorna Pragna Layout,3 BHK,,1270,2.0,2.0,50.79,61192.771084,3.0,1270.0,16,76.735625,Other


In [663]:
cat_cols = ['location_encoded','area_type']

In [664]:
from sklearn.preprocessing import OneHotEncoder


In [665]:
encoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False)

In [666]:
encoder.fit(train_df[cat_cols])

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [667]:
encoded_cols = list(encoder.get_feature_names_out(cat_cols))

In [668]:
train_df[encoded_cols] = encoder.transform(train_df[cat_cols])
val_df[encoded_cols] = encoder.transform(val_df[cat_cols])
test_df[encoded_cols] = encoder.transform(test_df[cat_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[encoded_cols] = encoder.transform(train_df[cat_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[encoded_cols] = encoder.transform(train_df[cat_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[encoded_cols] = encoder.transform(train_df[cat_cols])
A value is tryin

In [669]:
train_input = train_df[input_cols + encoded_cols]
val_input = val_df[input_cols + encoded_cols]
test_input = test_df[input_cols + encoded_cols]

train_target = train_df[target_col]
val_target = val_df[target_col]
test_target = test_df[target_col]

In [670]:
train_input

Unnamed: 0,bath,balcony,bedroom,sqft,location_listing_frequency,price_per_sqft,location_encoded_7th Phase JP Nagar,location_encoded_Bannerghatta Road,location_encoded_Begur Road,location_encoded_Bellandur,...,location_encoded_Sarjapur Road,location_encoded_Thanisandra,location_encoded_Uttarahalli,location_encoded_Whitefield,location_encoded_Yelahanka,location_encoded_Yeshwanthpur,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area
9496,2.0,2.0,2.0,1430.0,47,102.011064,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12149,2.0,3.0,3.0,1524.0,12,67.608333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9162,2.0,1.0,2.0,1541.0,7,314.571429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8871,1.0,0.0,1.0,775.0,11,56.007273,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8290,2.0,1.0,2.0,1100.0,47,76.455851,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8631,2.0,1.0,3.0,1326.0,16,58.525625,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9818,3.0,3.0,3.0,1683.0,22,130.842727,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10859,4.0,2.0,3.0,1935.0,25,188.280000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4041,2.0,2.0,3.0,1270.0,16,76.735625,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [671]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [672]:
scaler = StandardScaler()
scaler.fit(train_input[input_cols])


0,1,2
,copy,True
,with_mean,True
,with_std,True


In [673]:
x_train = scaler.transform(train_input[input_cols])

In [674]:
cat_data =train_input[encoded_cols].values

In [675]:
inputs = np.concatenate((x_train,cat_data),axis=1)

In [676]:
model =LinearRegression(n_jobs=-1)

In [677]:
model.fit(inputs,train_target)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,-1
,positive,False


In [678]:
model.score(inputs,train_target)

0.6071806505012081

In [679]:
model.score(val_input,val_target)



-398417.91021248605

In [680]:
from sklearn.ensemble import RandomForestRegressor

In [761]:
model_1 = RandomForestRegressor(n_jobs=-1,n_estimators=133,random_state=69,min_samples_leaf=4,max_depth=11,max_features=19)

In [762]:
model_1.fit(train_input,train_target)

0,1,2
,n_estimators,133
,criterion,'squared_error'
,max_depth,11
,min_samples_split,2
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,19
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [763]:
model_1.score(train_input,train_target)

0.8717477873411451

In [764]:
model_1.score(val_input,val_target)

0.8095140979543299

In [765]:
train_input

Unnamed: 0,bath,balcony,bedroom,sqft,location_listing_frequency,price_per_sqft,location_encoded_7th Phase JP Nagar,location_encoded_Bannerghatta Road,location_encoded_Begur Road,location_encoded_Bellandur,...,location_encoded_Sarjapur Road,location_encoded_Thanisandra,location_encoded_Uttarahalli,location_encoded_Whitefield,location_encoded_Yelahanka,location_encoded_Yeshwanthpur,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area
9496,2.0,2.0,2.0,1430.0,47,102.011064,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12149,2.0,3.0,3.0,1524.0,12,67.608333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9162,2.0,1.0,2.0,1541.0,7,314.571429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8871,1.0,0.0,1.0,775.0,11,56.007273,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8290,2.0,1.0,2.0,1100.0,47,76.455851,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8631,2.0,1.0,3.0,1326.0,16,58.525625,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9818,3.0,3.0,3.0,1683.0,22,130.842727,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10859,4.0,2.0,3.0,1935.0,25,188.280000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4041,2.0,2.0,3.0,1270.0,16,76.735625,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [766]:
train_input.columns

Index(['bath', 'balcony', 'bedroom', 'sqft', 'location_listing_frequency',
       'price_per_sqft', 'location_encoded_7th Phase JP Nagar',
       'location_encoded_Bannerghatta Road', 'location_encoded_Begur Road',
       'location_encoded_Bellandur', 'location_encoded_Chandapura',
       'location_encoded_Electronic City',
       'location_encoded_Electronic City Phase II',
       'location_encoded_Electronics City Phase 1',
       'location_encoded_Haralur Road', 'location_encoded_Harlur',
       'location_encoded_Hebbal', 'location_encoded_Hennur Road',
       'location_encoded_Hoodi', 'location_encoded_KR Puram',
       'location_encoded_Kanakpura Road', 'location_encoded_Marathahalli',
       'location_encoded_Other', 'location_encoded_Raja Rajeshwari Nagar',
       'location_encoded_Rajaji Nagar', 'location_encoded_Sarjapur',
       'location_encoded_Sarjapur  Road', 'location_encoded_Thanisandra',
       'location_encoded_Uttarahalli', 'location_encoded_Whitefield',
       'loca