In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import model_selection
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data=pd.read_csv('new1.csv')

In [3]:
data.columns

Index(['facing', 'furnishingDesc', 'gym', 'isMaintenance', 'lift',
       'maintenanceAmount', 'parking', 'property_size', 'rent_amount',
       'swimmingPool', 'type_bhk', 'waterSupply', 'property_age', 'Balcony',
       'Bathroom', 'floor/totalFloor', 'locality'],
      dtype='object')

In [4]:
data.drop(['property_age','swimmingPool','gym','facing'],axis=1,inplace=True)

In [5]:
rr=data.columns

In [6]:
len(rr)

13

In [7]:
data.tail()

Unnamed: 0,furnishingDesc,isMaintenance,lift,maintenanceAmount,parking,property_size,rent_amount,type_bhk,waterSupply,Balcony,Bathroom,floor/totalFloor,locality
13495,1,False,False,8,2,3,8000,1,1,1,1,9,10
13496,1,True,True,5,3,5,8500,2,1,1,1,14,25
13497,1,True,False,5,3,9,8500,3,3,4,2,15,10
13498,1,False,False,8,3,11,15000,3,1,1,3,6,10
13499,1,False,False,8,2,5,7000,2,1,1,1,6,16


In [8]:
#X=data.drop(drop_columns)
#X=X.drop('rent_amount_Boxcox')
X=data.drop(['rent_amount'],axis=1)
y=data['rent_amount']

In [9]:
# train-test 80-20 split
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,
                                     y,                             
                                     test_size = 0.2, 
                                     random_state = 100)


# Linear Regression

In [10]:
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import mean_absolute_error

In [11]:
linearRegression = linear_model.LinearRegression()

In [12]:
linearRegression.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [13]:
ypred = linearRegression.predict(X_test)

In [14]:
print('MAE:', metrics.mean_absolute_error(y_test, ypred))
print('MSE:', metrics.mean_squared_error(y_test, ypred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, ypred)))
print('r2_score:', metrics.r2_score(y_test, ypred))

MAE: 2468.749366491327
MSE: 11381105.635970024
RMSE: 3373.58942907551
r2_score: 0.6870472906221571


# Ransac Regressor

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor
ransac = RANSACRegressor(LinearRegression(),
                        max_trials=20,
                        min_samples=50,
                        residual_threshold=5.0,
                        random_state=0)
ransac.fit(X_train, y_train)

RANSACRegressor(base_estimator=LinearRegression(copy_X=True, fit_intercept=True,
                                                n_jobs=None, normalize=False),
                is_data_valid=None, is_model_valid=None, loss='absolute_loss',
                max_skips=inf, max_trials=20, min_samples=50, random_state=0,
                residual_threshold=5.0, stop_n_inliers=inf,
                stop_probability=0.99, stop_score=inf)

In [16]:
ransacpred=ransac.predict(X_test)


In [17]:
print('MAE:', metrics.mean_absolute_error(y_test, ransacpred))
print('MSE:', metrics.mean_squared_error(y_test, ransacpred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, ransacpred)))
print('r2_score:', metrics.r2_score(y_test, ransacpred))

MAE: 3036.455749880519
MSE: 17961836.608369667
RMSE: 4238.14070181367
r2_score: 0.5060932029111853


# Lasso Regression

In [18]:
from sklearn.linear_model import Lasso

In [19]:
la=Lasso(alpha=0.01)

In [20]:
la.fit(X_train, y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [21]:
lapred=la.predict(X_test)

In [22]:
print('MAE:', metrics.mean_absolute_error(y_test, lapred))
print('MSE:', metrics.mean_squared_error(y_test, lapred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, lapred)))
print('r2_score:', metrics.r2_score(y_test, lapred))

MAE: 2468.745616478789
MSE: 11381095.26047523
RMSE: 3373.5878913221204
r2_score: 0.6870475759230155


# Ridge Regression

In [23]:
from sklearn.linear_model import Ridge

In [24]:
rr = Ridge(alpha=0.01) 

In [25]:
rr.fit(X_train, y_train)

Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [26]:
rrpred=rr.predict(X_test)

In [27]:
print('MAE:', metrics.mean_absolute_error(y_test, rrpred))
print('MSE:', metrics.mean_squared_error(y_test, rrpred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, rrpred)))
print('r2_score:', metrics.r2_score(y_test, rrpred))

MAE: 2468.7489218151577
MSE: 11381103.861171925
RMSE: 3373.58916603251
r2_score: 0.6870473394247858


# Random Forest

In [28]:
from sklearn import ensemble

In [29]:
randomForest = ensemble.RandomForestRegressor()

In [30]:
randomForest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [31]:
rfpred = randomForest.predict(X_test)

In [32]:
print('MAE:', metrics.mean_absolute_error(y_test, rfpred))
print('MSE:', metrics.mean_squared_error(y_test, rfpred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, rfpred)))
print('r2_score:', metrics.r2_score(y_test, rfpred))

MAE: 2223.985602497955
MSE: 10179221.007046873
RMSE: 3190.4891485549474
r2_score: 0.7200961931639553


# XGBoost

In [33]:
import xgboost as xgb

In [34]:
# Hyper paramaeters for xgb
# https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f

In [35]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [36]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:squarederror',
    'eval_metric':'mae'
}

In [37]:
num_boost_round = 999


In [38]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:9048.34
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:6351.76
[2]	Test-mae:4535.65
[3]	Test-mae:3409.01
[4]	Test-mae:2794.8
[5]	Test-mae:2491.37
[6]	Test-mae:2348.25
[7]	Test-mae:2272.75
[8]	Test-mae:2240.17
[9]	Test-mae:2218.81
[10]	Test-mae:2210.29
[11]	Test-mae:2199.89
[12]	Test-mae:2193.78
[13]	Test-mae:2195.2
[14]	Test-mae:2195.83
[15]	Test-mae:2191.29
[16]	Test-mae:2185.7
[17]	Test-mae:2186.76
[18]	Test-mae:2185.3
[19]	Test-mae:2181.95
[20]	Test-mae:2174.87
[21]	Test-mae:2172.8
[22]	Test-mae:2170.45
[23]	Test-mae:2170.97
[24]	Test-mae:2169.98
[25]	Test-mae:2168.36
[26]	Test-mae:2167.32
[27]	Test-mae:2166.63
[28]	Test-mae:2164.71
[29]	Test-mae:2163.09
[30]	Test-mae:2159.68
[31]	Test-mae:2161.42
[32]	Test-mae:2160.78
[33]	Test-mae:2162.38
[34]	Test-mae:2162.61
[35]	Test-mae:2162.39
[36]	Test-mae:2160.62
[37]	Test-mae:2159.86
[38]	Test-mae:2160.41
[39]	Test-mae:2162.17
[40]	Test-mae:2163.43
Stopping. Best iteration:
[30]	Test-mae:2159.68



In [39]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)

In [40]:
cv_results

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,9078.736328,20.097668,9086.429297,111.084027
1,6384.625293,14.85492,6402.288281,109.970504
2,4562.336328,12.30531,4591.78125,102.757985
3,3421.286035,12.642377,3477.415674,83.488214
4,2771.463721,14.740804,2855.713965,73.199768
5,2430.409814,14.058801,2539.810645,64.718861
6,2251.609912,14.789621,2384.57832,58.99106
7,2163.00166,15.85478,2318.221875,53.08016
8,2115.824023,15.714314,2285.558935,52.043146
9,2088.302588,19.760146,2273.921045,51.007632


In [41]:
cv_results['test-mae-mean'].min()

2234.3309082

In [42]:
# You can try wider intervals with a larger step between
# each value and then narrow it down. Here after several
# iteration I found that the optimal value was in the
# following ranges.
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

In [43]:
# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=9, min_child_weight=5
	MAE 2234.1213376 for 13 rounds
CV with max_depth=9, min_child_weight=6
	MAE 2236.3591796 for 11 rounds
CV with max_depth=9, min_child_weight=7
	MAE 2233.5447752 for 17 rounds
CV with max_depth=10, min_child_weight=5
	MAE 2237.7655762 for 11 rounds
CV with max_depth=10, min_child_weight=6
	MAE 2245.3039062 for 11 rounds
CV with max_depth=10, min_child_weight=7
	MAE 2240.7346192 for 11 rounds
CV with max_depth=11, min_child_weight=5
	MAE 2250.264795 for 10 rounds
CV with max_depth=11, min_child_weight=6
	MAE 2245.2053712 for 11 rounds
CV with max_depth=11, min_child_weight=7
	MAE 2251.2786619999997 for 11 rounds
Best params: 9, 7, MAE: 2233.5447752


In [44]:
# can change
params['max_depth']=9
params['min_child_weight']=7

In [45]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [46]:
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))


CV with subsample=1.0, colsample=1.0
	MAE 2233.5447752 for 17 rounds
CV with subsample=1.0, colsample=0.9
	MAE 2229.9153321999997 for 11 rounds
CV with subsample=1.0, colsample=0.8
	MAE 2242.9685546 for 20 rounds
CV with subsample=1.0, colsample=0.7
	MAE 2230.0000976 for 13 rounds
CV with subsample=0.9, colsample=1.0
	MAE 2244.135889 for 10 rounds
CV with subsample=0.9, colsample=0.9
	MAE 2240.3543458 for 10 rounds
CV with subsample=0.9, colsample=0.8
	MAE 2233.4299806 for 12 rounds
CV with subsample=0.9, colsample=0.7
	MAE 2233.0796876 for 16 rounds
CV with subsample=0.8, colsample=1.0
	MAE 2253.9109374 for 11 rounds
CV with subsample=0.8, colsample=0.9
	MAE 2247.8079588 for 11 rounds
CV with subsample=0.8, colsample=0.8
	MAE 2232.3724608 for 11 rounds
CV with subsample=0.8, colsample=0.7
	MAE 2237.916748 for 12 rounds
CV with subsample=0.7, colsample=1.0
	MAE 2265.7612792 for 10 rounds
CV with subsample=0.7, colsample=0.9
	MAE 2240.3746091999997 for 10 rounds
CV with subsample=0.7, c

In [47]:
params['subsample']=1.0
params['colsample_bytree']=0.7

In [48]:
%time
# This can take some time…
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time cv_results = xgb.cv(params, dtrain,num_boost_round=num_boost_round,seed=42,nfold=5,metrics=['mae'],early_stopping_rounds=10)
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

Wall time: 0 ns
CV with eta=0.3
Wall time: 1.63 s
	MAE 2230.0001463999997 for 13 rounds

CV with eta=0.2
Wall time: 1.66 s
	MAE 2204.7876954 for 19 rounds

CV with eta=0.1
Wall time: 3.04 s
	MAE 2187.6327147999996 for 36 rounds

CV with eta=0.05
Wall time: 5.63 s
	MAE 2179.5657714 for 84 rounds

CV with eta=0.01
Wall time: 30.3 s
	MAE 2177.1006836 for 415 rounds

CV with eta=0.005
Wall time: 51 s
	MAE 2175.7582032 for 813 rounds

Best params: 0.005, MAE: 2175.7582032


In [49]:
params['eta']=0.05

In [50]:
params

{'max_depth': 9,
 'min_child_weight': 7,
 'eta': 0.05,
 'subsample': 1.0,
 'colsample_bytree': 0.7,
 'objective': 'reg:squarederror',
 'eval_metric': 'mae'}

In [51]:
tunedModel=xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest,"Test")],
    early_stopping_rounds=10)

[0]	Test-mae:12272.6
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:11662.9
[2]	Test-mae:11082.7
[3]	Test-mae:10528.4
[4]	Test-mae:10006.8
[5]	Test-mae:9510.95
[6]	Test-mae:9037.6
[7]	Test-mae:8584.04
[8]	Test-mae:8157.82
[9]	Test-mae:7755.13
[10]	Test-mae:7369.45
[11]	Test-mae:7006.34
[12]	Test-mae:6662.15
[13]	Test-mae:6334.93
[14]	Test-mae:6027.85
[15]	Test-mae:5734.2
[16]	Test-mae:5456.18
[17]	Test-mae:5198.7
[18]	Test-mae:4954.09
[19]	Test-mae:4725.31
[20]	Test-mae:4512.67
[21]	Test-mae:4312.72
[22]	Test-mae:4127.51
[23]	Test-mae:3953.68
[24]	Test-mae:3791.05
[25]	Test-mae:3640.62
[26]	Test-mae:3503.2
[27]	Test-mae:3376.72
[28]	Test-mae:3260.44
[29]	Test-mae:3152.2
[30]	Test-mae:3054.31
[31]	Test-mae:2965.88
[32]	Test-mae:2882.47
[33]	Test-mae:2808.93
[34]	Test-mae:2744.06
[35]	Test-mae:2685.94
[36]	Test-mae:2631.65
[37]	Test-mae:2583.05
[38]	Test-mae:2537.25
[39]	Test-mae:2498.13
[40]	Test-mae:2462.64
[41]	Test-mae:2431.83
[42]	Test-mae:2402.36
[43]	Test-mae

In [52]:
print("Best MAE: {:.2f} in {} rounds".format(tunedModel.best_score, tunedModel.best_iteration+1))


Best MAE: 2141.62 in 103 rounds


In [53]:
num_boost_round = tunedModel.best_iteration + 1

In [54]:
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)


[0]	Test-mae:12272.6
[1]	Test-mae:11662.9
[2]	Test-mae:11082.7
[3]	Test-mae:10528.4
[4]	Test-mae:10006.8
[5]	Test-mae:9510.95
[6]	Test-mae:9037.6
[7]	Test-mae:8584.04
[8]	Test-mae:8157.82
[9]	Test-mae:7755.13
[10]	Test-mae:7369.45
[11]	Test-mae:7006.34
[12]	Test-mae:6662.15
[13]	Test-mae:6334.93
[14]	Test-mae:6027.85
[15]	Test-mae:5734.2
[16]	Test-mae:5456.18
[17]	Test-mae:5198.7
[18]	Test-mae:4954.09
[19]	Test-mae:4725.31
[20]	Test-mae:4512.67
[21]	Test-mae:4312.72
[22]	Test-mae:4127.51
[23]	Test-mae:3953.68
[24]	Test-mae:3791.05
[25]	Test-mae:3640.62
[26]	Test-mae:3503.2
[27]	Test-mae:3376.72
[28]	Test-mae:3260.44
[29]	Test-mae:3152.2
[30]	Test-mae:3054.31
[31]	Test-mae:2965.88
[32]	Test-mae:2882.47
[33]	Test-mae:2808.93
[34]	Test-mae:2744.06
[35]	Test-mae:2685.94
[36]	Test-mae:2631.65
[37]	Test-mae:2583.05
[38]	Test-mae:2537.25
[39]	Test-mae:2498.13
[40]	Test-mae:2462.64
[41]	Test-mae:2431.83
[42]	Test-mae:2402.36
[43]	Test-mae:2377.78
[44]	Test-mae:2353.49
[45]	Test-mae:2333.86
[46

In [55]:
mean_absolute_error(best_model.predict(dtest), y_test)

2141.622148618345

In [56]:
best_model.save_model("XGB.model")


* to use model later:

loaded_model = xgb.Booster()

loaded_model.load_model("my_model.model")





* And use it for predictions.

loaded_model.predict(dtest)


In [57]:
data.shape

(13500, 13)

In [58]:
data.columns

Index(['furnishingDesc', 'isMaintenance', 'lift', 'maintenanceAmount',
       'parking', 'property_size', 'rent_amount', 'type_bhk', 'waterSupply',
       'Balcony', 'Bathroom', 'floor/totalFloor', 'locality'],
      dtype='object')

In [59]:
import scipy


In [60]:
data_list = list((np.array([1,False,False,17,4,12,3,2,2,2,16,27])).reshape(-1))

# load bst model
bst_model = xgb.Booster({'bthread': 2})
bst_model.load_model('XGB.model')
df = pd.DataFrame()
col_names = ['f%d' % i for i in range(12)]
df = df.append(pd.Series(data=data_list, index=col_names), ignore_index=True)

result1 = bst_model.predict(xgb.DMatrix(df))
#lambda_ = 0.02336110709655923
#res = scipy.special.inv_boxcox(result1[0], lambda_)
#print( np.around(res, decimals=2))
print(result1)

[19486.383]
