In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
import pickle
from sklearn.cross_validation import ShuffleSplit

# Pretty display for notebooks
%matplotlib inline



In [2]:
data = pd.read_csv('DataSet_pricing_model.csv')

In [3]:
features_corelated = ['longitude', 'latitude', 'gym', 'lift', 'swimming_pool', 'property_size', 'bathroom', 'floor', 'total_floor', 'balconies', 'building_type', 'parking', 'type']

In [4]:
y = data['rent']

In [5]:
one_hot_encoded_features = pd.get_dummies(data[features_corelated])

In [6]:
one_hot_encoded_features.describe()

Unnamed: 0,longitude,latitude,gym,lift,swimming_pool,property_size,bathroom,floor,total_floor,balconies,...,parking_BOTH,parking_FOUR_WHEELER,parking_NONE,parking_TWO_WHEELER,type_BHK1,type_BHK2,type_BHK3,type_BHK4,type_BHK4PLUS,type_RK1
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,...,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,77.637447,12.9459,0.22608,0.36244,0.1746,1063.76416,1.86112,1.8816,3.81048,1.15212,...,0.52868,0.0694,0.05116,0.35076,0.17636,0.56812,0.21572,0.01196,0.00152,0.02632
std,0.111732,0.029613,0.4183,0.480715,0.379632,591.174731,0.759589,2.187879,3.201751,1.00891,...,0.499187,0.254138,0.220328,0.477217,0.381134,0.495348,0.411329,0.108708,0.038958,0.160088
min,77.500072,12.900004,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,77.571956,12.918511,0.0,0.0,0.0,700.0,1.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,77.633363,12.943927,0.0,0.0,0.0,1050.0,2.0,1.0,3.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,77.695972,12.971114,0.0,1.0,0.0,1290.0,2.0,3.0,4.0,2.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
max,80.266346,12.999999,1.0,1.0,1.0,50000.0,34.0,25.0,50.0,22.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Split into validation and training data
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(one_hot_encoded_features, y, test_size=0.2)



In [8]:
from xgboost import XGBRegressor

xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
xgb_model.fit(train_X, train_y, early_stopping_rounds=5, 
             eval_set=[(val_X, val_y)], verbose=False)
val_predictions = xgb_model.predict(val_X)



from sklearn.metrics import mean_absolute_error

train_accuracy = xgb_model.score(train_X, train_y)
val_accuracy = xgb_model.score(val_X, val_y)
print('train_accuracy: ',train_accuracy)
print('val_accuracy: ',val_accuracy)

train_accuracy:  0.8095777207736138
val_accuracy:  0.8059673632938698


In [9]:
# save model
# save the classifier
with open('model.pkl', 'wb') as fid:
    pickle.dump(xgb_model, fid)    

# load it again
with open('model.pkl', 'rb') as fid:
    xgb_model_saved = pickle.load(fid)

In [9]:
data[features_corelated].head()

Unnamed: 0,longitude,latitude,gym,lift,swimming_pool,property_size,bathroom,floor,total_floor,balconies,building_type,parking,type
0,77.576914,12.936601,0,0,0,1000,2,2,2.0,1.0,IF,TWO_WHEELER,BHK2
1,77.545219,12.997989,0,0,0,1218,3,0,1.0,0.0,IH,BOTH,BHK2
2,77.587642,12.935696,0,1,0,1820,3,4,9.0,2.0,AP,BOTH,BHK3
3,77.656118,12.953507,0,1,0,1100,2,4,4.0,1.0,AP,BOTH,BHK2
4,77.742436,12.96852,1,1,1,1475,2,1,9.0,2.0,AP,BOTH,BHK2


In [10]:
data.iloc[0]

id                                  ff8081815917971401591af8895032d0
type                                                            BHK2
locality                                                  Jayanagara
activation_date                                  2018-07-05 17:09:49
latitude                                                     12.9366
longitude                                                    77.5769
lease_type                                                    FAMILY
gym                                                                0
lift                                                               0
swimming_pool                                                      0
negotiable                                                         0
furnishing                                            SEMI_FURNISHED
parking                                                  TWO_WHEELER
property_size                                                   1000
property_age                      

In [11]:
one_hot_encoded_features.columns

Index(['longitude', 'latitude', 'gym', 'lift', 'swimming_pool',
       'property_size', 'bathroom', 'floor', 'total_floor', 'balconies',
       'building_type_AP', 'building_type_GC', 'building_type_IF',
       'building_type_IH', 'parking_BOTH', 'parking_FOUR_WHEELER',
       'parking_NONE', 'parking_TWO_WHEELER', 'type_BHK1', 'type_BHK2',
       'type_BHK3', 'type_BHK4', 'type_BHK4PLUS', 'type_RK1'],
      dtype='object')

In [12]:
one_hot_encoded_features.describe(include='all')

Unnamed: 0,longitude,latitude,gym,lift,swimming_pool,property_size,bathroom,floor,total_floor,balconies,...,parking_BOTH,parking_FOUR_WHEELER,parking_NONE,parking_TWO_WHEELER,type_BHK1,type_BHK2,type_BHK3,type_BHK4,type_BHK4PLUS,type_RK1
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,...,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,77.637447,12.9459,0.22608,0.36244,0.1746,1063.76416,1.86112,1.8816,3.81048,1.15212,...,0.52868,0.0694,0.05116,0.35076,0.17636,0.56812,0.21572,0.01196,0.00152,0.02632
std,0.111732,0.029613,0.4183,0.480715,0.379632,591.174731,0.759589,2.187879,3.201751,1.00891,...,0.499187,0.254138,0.220328,0.477217,0.381134,0.495348,0.411329,0.108708,0.038958,0.160088
min,77.500072,12.900004,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,77.571956,12.918511,0.0,0.0,0.0,700.0,1.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,77.633363,12.943927,0.0,0.0,0.0,1050.0,2.0,1.0,3.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,77.695972,12.971114,0.0,1.0,0.0,1290.0,2.0,3.0,4.0,2.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
max,80.266346,12.999999,1.0,1.0,1.0,50000.0,34.0,25.0,50.0,22.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
val_X.iloc[0], val_y.iloc[0]

(longitude                77.504896
 latitude                 12.902789
 gym                       0.000000
 lift                      0.000000
 swimming_pool             0.000000
 property_size           850.000000
 bathroom                  1.000000
 floor                     1.000000
 total_floor               1.000000
 balconies                 2.000000
 building_type_AP          0.000000
 building_type_GC          0.000000
 building_type_IF          1.000000
 building_type_IH          0.000000
 parking_BOTH              0.000000
 parking_FOUR_WHEELER      0.000000
 parking_NONE              0.000000
 parking_TWO_WHEELER       1.000000
 type_BHK1                 1.000000
 type_BHK2                 0.000000
 type_BHK3                 0.000000
 type_BHK4                 0.000000
 type_BHK4PLUS             0.000000
 type_RK1                  0.000000
 Name: 1194, dtype: float64, 10000)

In [14]:
xgb_model.predict(val_X.iloc[0:1])

array([8456.397], dtype=float32)

In [15]:
val_X.iloc[0:1].shape

(1, 24)

In [16]:
import numpy as np

In [17]:
qw = np.array([1,2])
qw.shape

(2,)

In [18]:
c = pd.DataFrame([77,12,0,0,0,500,1,1,2,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0])

In [19]:
c=c.transpose()

In [20]:
c.shape

(1, 24)

In [21]:
c.columns = one_hot_encoded_features.columns

In [22]:
xgb_model.predict(c)

array([6762.5913], dtype=float32)