In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score,KFold, cross_validate
from sklearn.compose import ColumnTransformer
from catboost import CatBoostRegressor

In [31]:
import pickle

In [2]:
pd.options.display.max_columns = None

In [3]:
main_df = pd.read_csv("data/predictor-data-final.csv")
df = main_df.copy()
df.head(1)

Unnamed: 0,property_type,Sector,City,Area,bedRoom,bathroom,balcony,Pooja Room,Servant Room,Store Room,Study Room,Other Room,facing,FloorNo,FloorRise,Main Road,Park/Garden,Pool,Club,Overlook Others,agePossession,Flooring,GatedCommunity,Furnishing,CoveredParking,OpenParking,24*7 Water,MuniCorp Water,Borewell/Tank,PowerBackup,WheelChairFriendly,PetFriendly,Facilities Categories,price
0,House/Villa,Sector 15 Chandigarh,Chandigarh,18000.0,7,7,4,0,1,0,0,0,North-West,0,Low-Rise,0,0,0,0,1,Old,Marble,No,Semifurnished,3,3,1,1,0,No,Yes,Yes,Standard,28.0


Dropping Unnecessary columns.

In [4]:
cols_drop = ['Pooja Room','Servant Room','Study Room','Store Room','Other Room','Main Road','Park/Garden','Club','Overlook Others','Pool','PetFriendly','WheelChairFriendly']

In [5]:
df.drop(columns = cols_drop,inplace = True)

In [6]:
df.head(1)

Unnamed: 0,property_type,Sector,City,Area,bedRoom,bathroom,balcony,facing,FloorNo,FloorRise,agePossession,Flooring,GatedCommunity,Furnishing,CoveredParking,OpenParking,24*7 Water,MuniCorp Water,Borewell/Tank,PowerBackup,Facilities Categories,price
0,House/Villa,Sector 15 Chandigarh,Chandigarh,18000.0,7,7,4,North-West,0,Low-Rise,Old,Marble,No,Semifurnished,3,3,1,1,0,No,Standard,28.0


### Transforming Columns

In [7]:
df['price'] = np.log1p(df['price'])
df['Area'] = np.log(df['Area'])

### Encoding Columns

In [8]:
all_cat_cols = ['property_type','Sector','City','facing','FloorRise','agePossession','Flooring','GatedCommunity','Furnishing','PowerBackup','Facilities Categories']
cols_order = ['agePossession','Furnishing','PowerBackup','Facilities Categories']
cols_no_order = ['property_type','Sector','City','facing','FloorRise','Flooring','GatedCommunity']

In [9]:
cols_order_cats = [['Old','ModOld','RelNew','New','UndConst'],['Unfurnished','Semifurnished','Furnished'],
                   ['No','Partial','Full'],['Basic','Standard','Premium','Luxurious']]

In [10]:
transformer = ColumnTransformer(
    [
        ('order_cols',OrdinalEncoder(categories=cols_order_cats),cols_order),
        ('no_order_cols',OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),cols_no_order)
    ],
    remainder='passthrough'
)

### Making Model

In [11]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [12]:
model = Pipeline(
    [('encoding',transformer),
    ('scaler',StandardScaler()),
    ('cat_boost',CatBoostRegressor(verbose = False))]    
)

In [13]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['r2', 'neg_mean_absolute_error']

In [14]:
df.head(1)

Unnamed: 0,property_type,Sector,City,Area,bedRoom,bathroom,balcony,facing,FloorNo,FloorRise,agePossession,Flooring,GatedCommunity,Furnishing,CoveredParking,OpenParking,24*7 Water,MuniCorp Water,Borewell/Tank,PowerBackup,Facilities Categories,price
0,House/Villa,Sector 15 Chandigarh,Chandigarh,9.798127,7,7,4,North-West,0,Low-Rise,Old,Marble,No,Semifurnished,3,3,1,1,0,No,Standard,3.367296


In [15]:
## CatBoostRegressor
cat_scores = cross_validate(model, X, y, cv=kf, scoring=scoring,return_train_score = True)
print(f"R2: {np.mean(cat_scores['test_r2']):.4f} ± {np.std(cat_scores['test_r2']):.4f}")
print(f"MAE: {np.expm1(-np.mean(cat_scores['test_neg_mean_absolute_error'])):.4f} ± {np.expm1(np.std(cat_scores['test_neg_mean_absolute_error'])):.4f}")

R2: 0.9607 ± 0.0021
MAE: 0.0961 ± 0.0013


In [16]:
model.fit(X,y)

In [17]:
cols_list = X.columns

In [18]:
X.head(1)

Unnamed: 0,property_type,Sector,City,Area,bedRoom,bathroom,balcony,facing,FloorNo,FloorRise,agePossession,Flooring,GatedCommunity,Furnishing,CoveredParking,OpenParking,24*7 Water,MuniCorp Water,Borewell/Tank,PowerBackup,Facilities Categories
0,House/Villa,Sector 15 Chandigarh,Chandigarh,9.798127,7,7,4,North-West,0,Low-Rise,Old,Marble,No,Semifurnished,3,3,1,1,0,No,Standard


In [19]:
#data = [['House/Villa','Dhakoli','Mohali',6.802394763324311,2,1,1,'South',0,'Low-Rise','Old','Vitrified','Yes','Semifurnished',1,2,0,1,1,'Partial','Standard']]

In [20]:
#data = [['Flat/Apartment','Sector 20 Panchkula','Panchkula',7.355641102974253,3,2,4,'North',5,'Mid-Rise','Old','Vitrified','Yes','Semifurnished',1,1,1,1,1,'Full','Standard']]

In [21]:
#data = [['Flat/Apartment','Zirakpur','Mohali',7.155396301896734,3,3,3,'East',7,'Mid-Rise','New','Vitrified','Yes','Semifurnished',0,1,1,0,0,'Full','Standard']]

In [22]:
data = [['Flat/Apartment','New Chandigarh','Mohali',7.400620577371135,3,2,3,'North-East',1,'Mid-Rise','UndConst','Ceramic','Yes','Semifurnished',2,1,1,0,0,'Full','Premium']]

In [23]:
one_df = pd.DataFrame(data, columns = cols_list)

In [24]:
one_df

Unnamed: 0,property_type,Sector,City,Area,bedRoom,bathroom,balcony,facing,FloorNo,FloorRise,agePossession,Flooring,GatedCommunity,Furnishing,CoveredParking,OpenParking,24*7 Water,MuniCorp Water,Borewell/Tank,PowerBackup,Facilities Categories
0,Flat/Apartment,New Chandigarh,Mohali,7.400621,3,2,3,North-East,1,Mid-Rise,UndConst,Ceramic,Yes,Semifurnished,2,1,1,0,0,Full,Premium


In [25]:
pred = np.expm1(model.predict(one_df))[0]
print(f"House Price is in Between {round((pred - 0.1),2)} Cr to {round((pred + 0.1),2)} Cr, With Average Price of {round(pred,2)} Cr")

House Price is in Between 1.12 Cr to 1.32 Cr, With Average Price of 1.22 Cr


### HyperParameter Tuning

In [26]:
model = Pipeline(
    [('encoding',transformer),
    ('scaler',StandardScaler()),
    ('cat_boost',CatBoostRegressor(verbose = False,depth = 7,iterations = 1100,learning_rate = 0.0625))]    
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['r2', 'neg_mean_absolute_error']

In [27]:
## CatBoostRegressor
cat_scores = cross_validate(model, X, y, cv=kf, scoring=scoring,return_train_score = True)
print(f"R2: {np.mean(cat_scores['test_r2']):.4f} ± {np.std(cat_scores['test_r2']):.4f}")
print(f"MAE: {np.expm1(-np.mean(cat_scores['test_neg_mean_absolute_error'])):.4f} ± {np.expm1(np.std(cat_scores['test_neg_mean_absolute_error'])):.4f}")

R2: 0.9624 ± 0.0020
MAE: 0.0927 ± 0.0011


In [28]:
model.fit(X,y)

In [29]:
pred = np.expm1(model.predict(one_df))[0]
print(f"House Price is in Between {round((pred - 0.1),2)} Cr to {round((pred + 0.1),2)} Cr, With Average Price of {round(pred,2)} Cr")

House Price is in Between 1.14 Cr to 1.34 Cr, With Average Price of 1.24 Cr


### Pickle Data and model

In [32]:
with open('model_pipeline.pkl','wb') as file:
    pickle.dump(model,file)

In [35]:
with open('df.pkl','wb') as file:
    pickle.dump(X,file)

In [36]:
with open('transformer.pkl','wb') as file:
    pickle.dump(transformer,file)

### Requirements

In [39]:
np.__version__

'1.26.3'

In [40]:
pd.__version__

'2.1.4'

In [43]:
df['property_type'].value_counts()

property_type
Flat/Apartment    4901
House/Villa       3226
Builder Floor      286
Name: count, dtype: int64

In [44]:
df['City'].value_counts()

City
Mohali        5781
Chandigarh    1382
Panchkula     1250
Name: count, dtype: int64

In [48]:
df[df['City'] == 'Mohali']['Sector'].value_counts().index

Index(['Zirakpur', 'Kharar', 'Sector 66 Mohali', 'Mullanpur', 'Aerocity',
       'Sector 88 Mohali', 'Sector 127 Mohali', 'Sector 115 Mohali',
       'New Chandigarh', 'Sector 91 Mohali', 'Sector 125 Mohali',
       'Sector 82 Mohali', 'Sector 126 Mohali', 'Dera Bassi',
       'Sector 117 Mohali', 'Phase 3 Mohali', 'Sector 85 Mohali',
       'Sector 99 Mohali', 'Sector 71 Mohali', 'Sector 70 Mohali',
       'Sector 79 Mohali', 'Sector 105 Mohali', 'Phase 7 Mohali',
       'Sector 116 Mohali', 'Dhakoli', 'Sector 80 Mohali', 'Phase 9 Mohali',
       'Sector 89 Mohali', 'Sector 69 Mohali', 'Sector 110 Mohali',
       'Phase 10 Mohali', 'Sector 65 Mohali', 'Sector 74 Mohali',
       'Sector 77 Mohali', 'Phase 4 Mohali', 'Sector 124 Mohali',
       'Sector 68 Mohali', 'Phase 11 Mohali', 'Sector 78 Mohali',
       'Sector 114 Mohali', 'Sector 67 Mohali', 'Sector 109 Mohali',
       'Sector 113 Mohali', 'Sector 121 Mohali', 'Phase 5 Mohali',
       'Sector 90 Mohali', 'Mohali', 'Sector 108 Mo

In [56]:
type(sorted(df['bedRoom'].unique())[0])

numpy.int64

In [51]:
sorted(df['bathroom'].unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [57]:
sorted(df['facing'].unique())

['East',
 'North',
 'North-East',
 'North-West',
 'South',
 'South-East',
 'South-West',
 'West']

In [63]:
df['Flooring'].value_counts().index

Index(['Vitrified', 'Marble', 'Ceramic', 'Granite', 'Wood', 'Mosaic', 'Others',
       'Concrete', 'Stone', 'Cement', 'Polished concrete', 'Vinyl', 'Spartex',
       'IPSFinish'],
      dtype='object', name='Flooring')