In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
import copy
import pickle

In [3]:
training_data = pd.read_csv('data/training_set_VU_DM.csv')
training_data.shape

(4958347, 54)

In [4]:
training_data['target'] = (5*training_data['booking_bool']) + training_data['click_bool']

In [5]:
z = training_data['prop_id'].value_counts() 
dict_z = z.to_dict() # converts to dictionary
training_data['counts'] = training_data['prop_id'].map(dict_z) 

In [6]:
# test_data = pd.read_csv('data/test_set_VU_DM.csv')
# test_data.shape

In [7]:
# clicked_data = training_data[training_data['click_bool'] == 1].reset_index(drop=True)
# clicked_data.shape

In [8]:
# booked_data = training_data[training_data['booking_bool'] == 1].reset_index(drop=True)
# booked_data.shape

In [9]:
# model_traindata = booked_data[['srch_id','visitor_location_country_id','prop_country_id','prop_id','prop_location_score1','prop_location_score2','srch_length_of_stay','srch_booking_window']]
# model_testdata = test_data[['srch_id','visitor_location_country_id','prop_country_id','prop_id','prop_location_score1','prop_location_score2','srch_length_of_stay','srch_booking_window']]

In [73]:
model_traindata = training_data.copy(deep=True)

In [74]:
model_traindata = model_traindata[['srch_id',
                                   'site_id',
                                   'visitor_location_country_id',
                                   'prop_country_id',
                                   'prop_id',
                                   'prop_starrating',
                                   'prop_review_score',
                                   'prop_brand_bool',
                                   'prop_location_score1',
                                   'prop_location_score2',
                                   'prop_log_historical_price',
                                   'price_usd',
                                   'promotion_flag',
                                   'srch_destination_id',
                                   'srch_length_of_stay',
                                   'srch_booking_window',
                                   'srch_adults_count',
                                   'srch_children_count',
                                   'srch_room_count',
                                   'srch_saturday_night_bool',
                                   'orig_destination_distance',
                                   'random_bool',
                                   'target',
                                   'counts'
                                  ]]

In [75]:
model_traindata.isna().sum()

srch_id                              0
site_id                              0
visitor_location_country_id          0
prop_country_id                      0
prop_id                              0
prop_starrating                      0
prop_review_score                 7364
prop_brand_bool                      0
prop_location_score1                 0
prop_location_score2           1090348
prop_log_historical_price            0
price_usd                            0
promotion_flag                       0
srch_destination_id                  0
srch_length_of_stay                  0
srch_booking_window                  0
srch_adults_count                    0
srch_children_count                  0
srch_room_count                      0
srch_saturday_night_bool             0
orig_destination_distance      1607782
random_bool                          0
target                               0
counts                               0
dtype: int64

In [76]:
model_traindata = model_traindata.dropna().reset_index(drop=True)

In [77]:
model_traindata.columns

Index(['srch_id', 'site_id', 'visitor_location_country_id', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'orig_destination_distance', 'random_bool',
       'target', 'counts'],
      dtype='object')

In [78]:
features = model_traindata.drop('target', axis=1)
targets  = model_traindata['target']

In [79]:
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=1)

In [None]:
# del z
# del dict_z
# del model_traindata
# del features
# del targets

In [80]:
model = GradientBoostingClassifier().fit(X_train, y_train)

In [81]:
filename = 'GBC_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [None]:
# filename = 'GBC_model.sav'
# loaded_model = pickle.load(open(filename, 'rb'))

In [83]:
y_pred = model.predict(X_test)

In [84]:
indices = list(X_test.index)

In [85]:
print(f"--> Mean accuracy : {model.score(X_test,y_test)}")

--> Mean accuracy : 0.9498181107656588


In [86]:
# get maximum probability
y_predproba = model.predict_proba(X_test)
maximums = list(map(max, y_predproba))

In [87]:
# to DataFrame
y_pred = pd.DataFrame({'y_pred': y_pred},index=indices)
y_test = y_test.to_frame()

In [88]:
result = X_test['prop_id'].copy(deep=True)
result['true'] = y_test
result['predicted'] = y_pred
result['probability'] = maximums

In [91]:
print(f"NDCG : {ndcg_score([y_test['target']],[y_pred['y_pred']])}")

NDCG : 0.7227589732743979
