In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('bproperty_bashundhara_RA.csv')

In [3]:
df.head()

Unnamed: 0,dimensions,num_beds,num_baths,house_type,city,location,area_block,description,url,price
0,"2,055 sqft",4,4,Apartment,Dhaka,Bashundhara R-A,Block D,2055 Sq Ft Cozy Flat For Rent In Bashundhara R...,/en/property/details-5237946.html,35000.0
1,"1,911 sqft",3,3,Apartment,Dhaka,Bashundhara R-A,Block A,Emanate Your Knack For Gardening By Renting Th...,/en/property/details-3875341.html,45000.0
2,"1,200 sqft",3,3,Apartment,Dhaka,Bashundhara R-A,Block K,"To Secure Your Better State Of Living, Conside...",/en/property/details-5182996.html,20000.0
3,"1,910 sqft",3,3,Apartment,Dhaka,Bashundhara R-A,Block A,For Rental Purpose 1910 Sq Ft Commendable Desi...,/en/property/details-3875358.html,45000.0
4,800 sqft,2,2,Apartment,Dhaka,Bashundhara R-A,Block G,Bright And Cozy Apartment Featuring 800 Sq Ft ...,/en/property/details-3651253.html,18000.0


# Data Preprocessing

In [4]:
def fix_dimensions(dim):
    dim = float(str(dim).replace(' sqft', '').replace(',', ''))
    return dim

df['dimensions'] = df['dimensions'].apply(fix_dimensions)

In [5]:
df['num_beds'] = df['num_beds'].astype('int64')
df['num_baths'] = df['num_baths'].astype('int64')

In [6]:
def fix_blocks(block):
    block = str(block).replace('Block ', '').replace('Bashundhara ', '').replace(' Extension', '')
    return block

df['area_block'] = df['area_block'].apply(fix_blocks)

In [7]:
def description_length(desc):
    return len(str(desc))

df['desc_length'] = df['description'].apply(description_length)

# Feature Selection

In [8]:
df = df[ ['dimensions', 'num_beds', 'num_baths', 'area_block', 'desc_length', 'price'] ].copy()
df = df.drop_duplicates()

In [9]:
df.shape

(1500, 6)

In [10]:
feature_col = ['dimensions', 'num_beds', 'num_baths', 'area_block', 'desc_length']
target_col = ['price']

X = df[feature_col].copy()
y = df[target_col].copy()

# One Hot Encoding

In [11]:
X = pd.get_dummies(X, prefix='area_block')

In [12]:
X.head()

Unnamed: 0,dimensions,num_beds,num_baths,desc_length,area_block_A,area_block_B,area_block_C,area_block_D,area_block_E,area_block_F,area_block_G,area_block_H,area_block_I,area_block_J,area_block_K,area_block_L,area_block_Road
0,2055.0,4,4,85,0,0,0,1,0,0,0,0,0,0,0,0,0
1,1911.0,3,3,116,1,0,0,0,0,0,0,0,0,0,0,0,0
2,1200.0,3,3,105,0,0,0,0,0,0,0,0,0,0,1,0,0
3,1910.0,3,3,97,1,0,0,0,0,0,0,0,0,0,0,0,0
4,800.0,2,2,90,0,0,0,0,0,0,1,0,0,0,0,0,0


# Feature Scaling

In [13]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# XGBoost Regressor (Base)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
from xgboost import XGBRegressor

regressor = XGBRegressor()

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [17]:
regressor.fit(X_train, y_train, early_stopping_rounds=10, eval_metric='mae', eval_set=[(X_test, y_test)])

[0]	validation_0-mae:21832.83789
[1]	validation_0-mae:15544.86816
[2]	validation_0-mae:11196.09863
[3]	validation_0-mae:8295.00781
[4]	validation_0-mae:6605.25732
[5]	validation_0-mae:5638.15869
[6]	validation_0-mae:5105.84033
[7]	validation_0-mae:4841.57764
[8]	validation_0-mae:4701.65527
[9]	validation_0-mae:4635.15478
[10]	validation_0-mae:4646.57178
[11]	validation_0-mae:4648.88477
[12]	validation_0-mae:4620.51367
[13]	validation_0-mae:4636.74707
[14]	validation_0-mae:4628.56836
[15]	validation_0-mae:4626.22168
[16]	validation_0-mae:4637.77832
[17]	validation_0-mae:4644.45459
[18]	validation_0-mae:4710.54492
[19]	validation_0-mae:4727.09473
[20]	validation_0-mae:4753.04932
[21]	validation_0-mae:4774.00244


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [18]:
y_pred = regressor.predict(X_test)

# Evaluation Metrics (Base)

In [19]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np 

print("Mean Absolute Error: ",mean_absolute_error(y_test,y_pred))
print("Mean Squared Error",mean_squared_error(y_test,y_pred))
print("Root MSE",np.sqrt(mean_squared_error(y_test,y_pred)))
print("RMS Log Error",np.log(np.sqrt(mean_squared_error(y_test,y_pred))))

r2 = r2_score(y_test,y_pred)
print("R Squared Error: ", r2)

scores = {}
scores['Base'] = {'Mean Absolute Error': mean_absolute_error(y_test,y_pred),
#                   'Mean Squared Error':mean_squared_error(y_test,y_pred),
                  'Root MSE':np.sqrt(mean_squared_error(y_test,y_pred)),
                  'RMS Log Error':np.log(np.sqrt(mean_squared_error(y_test,y_pred))),
                  'R Squared Error':r2_score(y_test,y_pred),
                 }

Mean Absolute Error:  4620.514033854167
Mean Squared Error 74103960.10903656
Root MSE 8608.365704884787
RMS Log Error 9.060489765802663
R Squared Error:  0.5613636258528887


# Hyper Parameter Tuning

In [20]:
from sklearn.model_selection import RandomizedSearchCV

In [21]:
params = {
    'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
    'max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'colsample_bytree': [0.3, 0.4, 0.5, 0.7]
}

In [22]:
regressor = XGBRegressor(verbosity=0)

random_search = RandomizedSearchCV(regressor, param_distributions=params, n_iter=5, scoring='neg_mean_squared_error', n_jobs=-1, cv=5, verbose=3)

In [23]:
random_search.fit(X, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n...
                                          reg_lambda=None,
                                          scale_pos_weight=None, subsample=None,
                                          tree_met

In [24]:
random_search.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0.4, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=5, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=0)

In [25]:
regressor = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=7, monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=0)

In [26]:
regressor.fit(X_train, y_train, early_stopping_rounds=10, eval_metric='mae', eval_set=[(X_test, y_test)])

[0]	validation_0-mae:29345.26172
[1]	validation_0-mae:27915.53906
[2]	validation_0-mae:26559.25000
[3]	validation_0-mae:25268.57812
[4]	validation_0-mae:24056.70312
[5]	validation_0-mae:22886.36523
[6]	validation_0-mae:21778.79688
[7]	validation_0-mae:20721.70508
[8]	validation_0-mae:19724.50195
[9]	validation_0-mae:18801.73047
[10]	validation_0-mae:17905.96484
[11]	validation_0-mae:17058.99805
[12]	validation_0-mae:16250.06543
[13]	validation_0-mae:15480.04492
[14]	validation_0-mae:14745.05859
[15]	validation_0-mae:14054.27441
[16]	validation_0-mae:13396.65039
[17]	validation_0-mae:12790.46289
[18]	validation_0-mae:12202.08398
[19]	validation_0-mae:11639.87988
[20]	validation_0-mae:11106.43457
[21]	validation_0-mae:10620.06250
[22]	validation_0-mae:10146.48828
[23]	validation_0-mae:9716.25098
[24]	validation_0-mae:9293.81934
[25]	validation_0-mae:8904.72559
[26]	validation_0-mae:8547.46094
[27]	validation_0-mae:8217.17578
[28]	validation_0-mae:7911.29736
[29]	validation_0-mae:7621.974

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=7, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=0)

In [27]:
y_pred = regressor.predict(X_test)

# Evaluation Metrics (Tuned)

In [28]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np 

print("Mean Absolute Error: ",mean_absolute_error(y_test,y_pred))
print("Mean Squared Error",mean_squared_error(y_test,y_pred))
print("Root MSE",np.sqrt(mean_squared_error(y_test,y_pred)))
print("RMS Log Error",np.log(np.sqrt(mean_squared_error(y_test,y_pred))))

r2 = r2_score(y_test,y_pred)
print("R Squared Error: ", r2)

scores['Tuned'] = {'Mean Absolute Error': round(mean_absolute_error(y_test,y_pred), 3),
#                   'Mean Squared Error':round(mean_squared_error(y_test,y_pred), 3),
                  'Root MSE':round(np.sqrt(mean_squared_error(y_test,y_pred)), 3),
                  'RMS Log Error':round(np.log(np.sqrt(mean_squared_error(y_test,y_pred))), 3),
                  'R Squared Error':round(r2_score(y_test,y_pred), 3),
                 }

Mean Absolute Error:  4502.5226041666665
Mean Squared Error 63475067.5835081
Root MSE 7967.124172718039
RMS Log Error 8.983078875140423
R Squared Error:  0.6242781971084215


In [29]:
eval_df = pd.DataFrame(scores)
eval_df

Unnamed: 0,Base,Tuned
Mean Absolute Error,4620.514034,4502.523
Root MSE,8608.365705,7967.124
RMS Log Error,9.06049,8.983
R Squared Error,0.561364,0.624
