In [61]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import ndcg_score, accuracy_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.preprocessing import LabelEncoder

import pickle

Import training data

In [2]:
training_data = pd.read_csv('data/training_set_VU_DM.csv')

Create target (derived from clicked - 1, booked - 5)

In [3]:
training_data['target'] = (5*training_data['booking_bool']) + training_data['click_bool']

Create property count feature

In [4]:
z = training_data['prop_id'].value_counts() 
dict_z = z.to_dict() # converts to dictionary
training_data['prop_counts'] = training_data['prop_id'].map(dict_z)

Create price difference feature

In [5]:
training_data['diff_price'] = ((training_data['price_usd']) - (np.exp(training_data['prop_log_historical_price'])))

Create distance between home location and destination location as feature

In [6]:
training_data['dist_home_away'] = (training_data['visitor_location_country_id']/training_data['prop_country_id'])
training_data.loc[training_data['dist_home_away']!=1, ['dist_home_away']] = 0

Create combined location score feature

In [7]:
training_data['prop_loc_score1_norm'] = training_data['prop_location_score1'] / training_data['prop_location_score1'].abs().max()
training_data['prop_location_score2'].fillna(training_data['prop_loc_score1_norm'], inplace=True)
training_data['location_score'] = (training_data['prop_loc_score1_norm'] + training_data['prop_location_score2'])/2
training_data = training_data.drop('prop_loc_score1_norm', axis=1)

In [8]:
training_data

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,target,prop_counts,diff_price,dist_home_away,location_score
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,0.0,,0,,0,0,612,-36.404964,0.0,0.224622
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,0.0,,0,,0,0,583,17.806987,0.0,0.165043
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,0.0,,0,,0,0,551,42.797387,0.0,0.169843
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,0.0,5.0,0,,0,0,460,522.129581,0.0,0.208972
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,0.0,,0,,0,0,665,5.200488,0.0,0.251162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,2013-06-30 19:55:18,5,219,,,219,77700,3,4.0,...,,,0,,0,0,47,117.000000,1.0,0.138880
4958343,332785,2013-06-30 19:55:18,5,219,,,219,88083,3,4.0,...,,,0,,0,0,38,88.000000,1.0,0.215685
4958344,332785,2013-06-30 19:55:18,5,219,,,219,94508,3,3.5,...,,,0,,0,0,43,98.000000,1.0,0.086997
4958345,332785,2013-06-30 19:55:18,5,219,,,219,128360,3,5.0,...,,,1,157.84,1,6,37,138.000000,1.0,0.172785


In [17]:
model_traindata = training_data[['srch_id',
                                 'site_id',
                                 'visitor_location_country_id',
                                 'prop_country_id',
                                 'prop_id',
                                 'prop_starrating',
                                 'prop_review_score',
                                 'prop_brand_bool',
                                 'prop_log_historical_price',
                                 'price_usd',
                                 'promotion_flag',
                                 'srch_destination_id',
                                 'srch_length_of_stay',
                                 'srch_booking_window',
                                 'srch_adults_count',
                                 'srch_children_count',
                                 'srch_room_count',
                                 'srch_saturday_night_bool',
                                 'orig_destination_distance',
                                 'random_bool',
                                 'target',
                                 'prop_counts',
                                 'diff_price',
                                 'dist_home_away',
                                 'location_score'
                                ]]

In [18]:
# model_traindata = model_traindata.dropna().reset_index(drop=True)
model_traindata = model_traindata.fillna(0)

In [19]:
model_traindata

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_log_historical_price,price_usd,...,srch_children_count,srch_room_count,srch_saturday_night_bool,orig_destination_distance,random_bool,target,prop_counts,diff_price,dist_home_away,location_score
0,1,12,187,219,893,3,3.5,1,4.95,104.77,...,0,1,1,0.00,1,0,612,-36.404964,0.0,0.224622
1,1,12,187,219,10404,4,4.0,1,5.03,170.74,...,0,1,1,0.00,1,0,583,17.806987,0.0,0.165043
2,1,12,187,219,21315,3,4.5,1,4.92,179.80,...,0,1,1,0.00,1,0,551,42.797387,0.0,0.169843
3,1,12,187,219,27348,2,4.0,1,4.39,602.77,...,0,1,1,0.00,1,0,460,522.129581,0.0,0.208972
4,1,12,187,219,29604,4,3.5,1,4.93,143.58,...,0,1,1,0.00,1,0,665,5.200488,0.0,0.251162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,5,219,219,77700,3,4.0,1,0.00,118.00,...,0,1,0,550.92,0,0,47,117.000000,1.0,0.138880
4958343,332785,5,219,219,88083,3,4.0,1,0.00,89.00,...,0,1,0,553.14,0,0,38,88.000000,1.0,0.215685
4958344,332785,5,219,219,94508,3,3.5,1,0.00,99.00,...,0,1,0,544.43,0,0,43,98.000000,1.0,0.086997
4958345,332785,5,219,219,128360,3,5.0,1,0.00,139.00,...,0,1,0,550.38,0,6,37,138.000000,1.0,0.172785


In [81]:
gss = GroupShuffleSplit(test_size=.40, n_splits=1, random_state = 7).split(model_traindata, groups=model_traindata['srch_id'])

X_train_inds, X_test_inds = next(gss)

train_data= model_traindata.iloc[X_train_inds]
X_train = train_data.loc[:, ~train_data.columns.isin(['srch_id','target'])]
y_train = train_data.loc[:, train_data.columns.isin(['target'])]

groups = train_data.groupby('srch_id').size().to_frame('size')['size'].to_numpy()

test_data= model_traindata.iloc[X_test_inds]

# We need to keep the id for later predictions
X_test = test_data.loc[:, ~test_data.columns.isin(['target'])]
# X_test = test_data.loc[:, ~test_data.columns.isin(['srch_id','target'])]
y_test = test_data.loc[:, test_data.columns.isin(['target'])]

In [55]:
X_train.columns

Index(['site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_id',
       'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'orig_destination_distance', 'random_bool',
       'prop_counts', 'diff_price', 'dist_home_away', 'location_score'],
      dtype='object')

In [56]:
X_test.columns

Index(['site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_id',
       'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'orig_destination_distance', 'random_bool',
       'prop_counts', 'diff_price', 'dist_home_away', 'location_score'],
      dtype='object')

HYPERPARAMETER OPTIMIZATION

In [73]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'learning_rate' : hp.uniform('learning_rate', 0.05, 0.2),
        'eta' : hp.uniform('eta', 0.05, 0.2),
        'n_estimators': hp.quniform('n_estimators', 90,180, 5),
        'seed': 0
    }

In [76]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)

  y = column_or_1d(y, warn=True)


In [77]:
def objective(space):
    model=xgb.XGBClassifier(tree_method='hist',booster='gbtree',objective='rank:ndcg',
                    n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']),learning_rate = space['learning_rate'],eta = space['eta'],verbosity=1)
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    model.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=True)
    

    pred = model.predict(X_test)
    
    accuracy = ndcg_score(y_test, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [78]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 25,
                        trials = trials)

  0%|                                    | 0/25 [00:00<?, ?trial/s, best loss=?]





[0]	validation_0-auc:0.50000	validation_1-auc:nan                               
[1]	validation_0-auc:0.50000	validation_1-auc:nan                               
[2]	validation_0-auc:0.50000	validation_1-auc:nan                               
[3]	validation_0-auc:0.50000	validation_1-auc:nan                               
[4]	validation_0-auc:0.50000	validation_1-auc:nan                               
[5]	validation_0-auc:0.50000	validation_1-auc:nan                               
[6]	validation_0-auc:0.50000	validation_1-auc:nan                               
[7]	validation_0-auc:0.50000	validation_1-auc:nan                               
[8]	validation_0-auc:0.50000	validation_1-auc:nan                               
[9]	validation_0-auc:0.51355	validation_1-auc:nan                               
[10]	validation_0-auc:0.51355	validation_1-auc:nan                              
  0%|                                    | 0/25 [00:37<?, ?trial/s, best loss=?]

job exception: Only ('multilabel-indicator', 'continuous-multioutput', 'multiclass-multioutput') formats are supported. Got multiclass instead



  0%|                                    | 0/25 [00:39<?, ?trial/s, best loss=?]


ValueError: Only ('multilabel-indicator', 'continuous-multioutput', 'multiclass-multioutput') formats are supported. Got multiclass instead

In [67]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.9956497639399737, 'eta': 0.10615342984333002, 'gamma': 4.7115967519155895, 'learning_rate': 0.180656005953109, 'max_depth': 3.0, 'min_child_weight': 1.0, 'n_estimators': 145.0, 'reg_alpha': 128.0, 'reg_lambda': 0.15214262459317351}


def objective(space):
    model=xgb.XGBClassifier(tree_method='hist',booster='gbtree',objective='rank:ndcg',
                    n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = 
                    space['gamma'],reg_alpha =
                    int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']),learning_rate = space['learning_rate'],eta =
                    space['eta'],verbosity=1)
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    model.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=True)
    

    pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }


trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 25,
                        trials = trials)


The best hyperparameters are :  

{'colsample_bytree': 0.9956497639399737, 'eta': 0.10615342984333002, 'gamma': 4.7115967519155895, 'learning_rate': 0.180656005953109, 'max_depth': 3.0, 'min_child_weight': 1.0, 'n_estimators': 145.0, 'reg_alpha': 128.0, 'reg_lambda': 0.15214262459317351}

In [82]:
model = xgb.XGBRanker(
    verbosity=2,
    tree_method='hist',
    booster='gbtree',
    objective='rank:ndcg',
    random_state=42, 
    reg_alpha=128,
    reg_lambda=0.15214262459317351,
    learning_rate=0.180656005953109,
    colsample_bytree=0.9956497639399737, 
    gamma=4.7115967519155895,
    eta=0.10615342984333002, 
    max_depth=3, 
    min_child_weight=1,
    n_estimators=145
#     subsample=0.75 
    )

model.fit(X_train, y_train, group=groups, verbose=True)

[12:38:00] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:00] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:00] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:00] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:01] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.ma

[12:38:13] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:13] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:14] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:14] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:15] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.ma

[12:38:28] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:28] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:29] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:29] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:29] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.ma

[12:38:41] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:41] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:41] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:42] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:42] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.ma

[12:38:52] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[12:38:52] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-cpython-38/xgboost/src/gbm/gbtree.cc:179: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.


XGBRanker(base_score=None, booster='gbtree', callbacks=None,
          colsample_bylevel=None, colsample_bynode=None,
          colsample_bytree=0.9956497639399737, early_stopping_rounds=None,
          enable_categorical=False, eta=0.10615342984333002, eval_metric=None,
          feature_types=None, gamma=4.7115967519155895, gpu_id=None,
          grow_policy=None, importance_type=None, interaction_constraints=None,
          learning_rate=0.180656005953109, max_bin=None, max_cat_threshold=None,
          max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
          max_leaves=None, min_child_weight=1, missing=nan,
          monotone_constraints=None, n_estimators=145, n_jobs=None,
          num_parallel_tree=None, objective='rank:ndcg', ...)

In [83]:
filename = 'XGB_model_rankndcg_gbtree_FeatEng_zeroes_opt.sav'
pickle.dump(model, open(filename, 'wb'))

In [84]:
def predict(model, df):
    return model.predict(df.loc[:, ~df.columns.isin(['srch_id'])])
  
predictions = (X_test.groupby('srch_id').apply(lambda x: predict(model, x)))

In [85]:
y_pred = []

for p in predictions:
    p = list(p)
    y_pred.extend(p)

In [86]:
X_test["predicted_ranking"] = y_pred

In [87]:
X_test

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_log_historical_price,price_usd,...,srch_children_count,srch_room_count,srch_saturday_night_bool,orig_destination_distance,random_bool,prop_counts,diff_price,dist_home_away,location_score,predicted_ranking
65,8,5,219,219,10250,3,3.5,1,4.64,82.00,...,0,1,0,5.40,0,186,-21.544348,1.0,0.115620,0.001522
66,8,5,219,219,13252,4,4.5,1,5.49,160.00,...,0,1,0,3.98,0,251,-82.257207,1.0,0.000000,0.820537
67,8,5,219,219,22756,4,4.0,1,5.11,259.00,...,0,1,0,4.41,0,184,93.329645,1.0,0.315272,-0.753038
68,8,5,219,219,27669,3,3.5,1,4.62,85.00,...,0,1,0,4.17,0,195,-16.494032,1.0,0.320250,0.615163
69,8,5,219,219,30630,3,4.0,1,4.81,109.00,...,0,1,0,3.69,0,218,-13.731618,1.0,0.000000,-0.603043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958336,332784,24,219,100,114177,4,4.5,1,0.00,300.74,...,0,1,0,328.90,0,526,299.740000,0.0,0.385708,-0.911139
4958337,332784,24,219,100,115562,4,4.0,1,0.00,213.15,...,0,1,0,328.83,0,281,212.150000,0.0,0.374092,-0.139627
4958338,332784,24,219,100,121275,4,4.0,0,0.00,193.87,...,0,1,0,329.68,0,406,192.870000,0.0,0.375984,0.533894
4958339,332784,24,219,100,128677,3,3.5,1,0.00,107.05,...,0,1,0,328.72,0,379,106.050000,0.0,0.400398,0.207208


In [88]:
print("--> NDCG : ", ndcg_score([y_test['target']],[X_test['predicted_ranking']]))

--> NDCG :  0.7634965578579737


**No further feature engineering, including these features**

model_traindata = training_data[['srch_id',
                                   'site_id',
                                   'visitor_location_country_id',
                                   'prop_country_id',
                                   'prop_id',
                                   'prop_starrating',
                                   'prop_review_score',
                                   'prop_brand_bool',
                                   'prop_location_score1',
                                   'prop_location_score2',
                                   'prop_log_historical_price',
                                   'price_usd',
                                   'promotion_flag',
                                   'srch_destination_id',
                                   'srch_length_of_stay',
                                   'srch_booking_window',
                                   'srch_adults_count',
                                   'srch_children_count',
                                   'srch_room_count',
                                   'srch_saturday_night_bool',
                                   'orig_destination_distance',
                                   'random_bool',
                                   'target',
                                   'counts',
                                  ]]

NDCG score on training data XGBoost with objective `rank:ndcg`, booster `dart`, dropped NaNs, no further feature engineering = **0.7798114072823272**

*model = xgb.XGBRanker(  
    tree_method='hist',
    booster='dart',
    objective='rank:ndcg',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )*


NDCG score on training data XGBoost with objective `rank:ndcg`, booster `gbtree`, dropped NaNs, no further feature engineering = **0.7793220497613718**

*model = xgb.XGBRanker(  
    tree_method='hist',
    booster='gbtree',
    objective='rank:ndcg',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )*
   
    
NDCG score on training data XGBoost with objective `rank:ndcg`, booster `gbtree`, including NaNs, merge loc_scores = **0.7687506555406398**

*model = xgb.XGBRanker(  
    tree_method='hist',
    booster='gbtree',
    objective='rank:ndcg',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )*
    

NDCG score on training data XGBoost with objective `rank:ndcg`, booster `dart`, replacing NaNs with zeroes, merge loc_scores = **0.768692374404902**

*model = xgb.XGBRanker(  
    tree_method='hist',
    booster='dart',
    objective='rank:ndcg',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )*
    
    
NDCG score on training data XGBoost with objective `rank:ndcg`, booster `gbtree`, replacing NaNs with zeroes, merge loc_scores = **0.7686738092959273**

*model = xgb.XGBRanker(  
    tree_method='hist',
    booster='gbtree',
    objective='rank:ndcg',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )*
    
    
NDCG score on training data XGBoost with objective `rank:ndcg`, booster `gbtree`, replacing NaNs with zeroes, 4 additional features = **0.767058746838947**
    
*model = xgb.XGBRanker(
    verbosity=3,
    tree_method='hist',
    booster='gbtree',
    objective='rank:ndcg',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )*
    
NDCG score on training data XGBoost with objective `rank:ndcg`, booster `gbtree`, replacing NaNs with zeroes, 4 additional features, optimized hyperparameters = **0.7634965578579737**

*model = xgb.XGBRanker(
    verbosity=2,
    tree_method='hist',
    booster='gbtree',
    objective='rank:ndcg',
    random_state=42, 
    reg_alpha=128,
    reg_lambda=0.15214262459317351,
    learning_rate=0.180656005953109,
    colsample_bytree=0.9956497639399737, 
    gamma=4.7115967519155895,
    eta=0.10615342984333002, 
    max_depth=3, 
    min_child_weight=1,
    n_estimators=145
    )*


NDCG score on training data XGBoost with objective `rank:ndcg`, booster `dart`, including NaNs, merge loc_scores = **?**

*model = xgb.XGBRanker(  
    tree_method='hist',
    booster='dart',
    objective='rank:ndcg',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )*
    

NDCG score on training data XGBoost with objective `rank:ndcg`, booster `gbtree`, dropping NaNs, 4 additional features = **0.760910778134321**

*model = xgb.XGBRanker(
    verbosity=2,
    tree_method='hist',
    booster='gbtree',
    objective='rank:ndcg',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )*
    

In [66]:
test_data = pd.read_csv('data/test_set_VU_DM.csv')

In [67]:
z = test_data['prop_id'].value_counts() 
dict_z = z.to_dict() # converts to dictionary
test_data['counts'] = test_data['prop_id'].map(dict_z)

In [99]:
model_testdata = test_data[['srch_id',
                            'site_id',
                            'visitor_location_country_id',
                            'prop_country_id',
                            'prop_id',
                            'prop_starrating',
                            'prop_review_score',
                            'prop_brand_bool',
                            'prop_location_score1',
                            'prop_location_score2',
                            'prop_log_historical_price',
                            'price_usd',
                            'promotion_flag',
                            'srch_destination_id',
                            'srch_length_of_stay',
                            'srch_booking_window',
                            'srch_adults_count',
                            'srch_children_count',
                            'srch_room_count',
                            'srch_saturday_night_bool',
                            'orig_destination_distance',
                            'random_bool',
                            'counts'
                           ]]

In [100]:
def predict(model, df):
    return model.predict(df.loc[:, ~df.columns.isin(['srch_id'])])
  
test_predictions = (model_testdata.groupby('srch_id').apply(lambda x: predict(model, x)))

In [101]:
test_predictions

srch_id
1         [0.1625433, 0.41950408, -0.13107207, -0.340134...
3         [1.3109548, 1.5259281, 0.08382395, 0.7322764, ...
6         [0.12061024, 0.30068672, 0.3082502, 0.02565842...
7         [-1.4230359, -0.85277504, -0.3971758, -0.90931...
10        [1.468401, 1.2862945, -0.93367916, 1.6723679, ...
                                ...                        
332781    [-0.49719226, -1.1120768, -0.60463244, -1.2823...
332783    [-1.3466609, 1.7371362, 0.5355207, -0.01810777...
332785    [0.70702094, -0.8647833, 1.3359388, 0.29851568...
332786    [-0.3384254, -0.44028246, -0.36365402, 0.77885...
332787    [1.7464826, 0.9119617, 0.38222784, 1.41004, 1....
Length: 199549, dtype: object

In [102]:
y_testpred = []

for p in test_predictions:
    p = list(p)
    y_testpred.extend(p)

In [103]:
model_testdata["predicted_ranking"] = y_testpred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_testdata["predicted_ranking"] = y_testpred


In [104]:
model_testdata = model_testdata.sort_values(["srch_id","predicted_ranking"], ascending=[True,False])

In [105]:
model_testdata.drop(model_testdata.columns.difference(['srch_id','prop_id']), 1, inplace=True)

  model_testdata.drop(model_testdata.columns.difference(['srch_id','prop_id']), 1, inplace=True)


In [106]:
model_testdata.to_csv("output-XGB-dart.csv", index=False) 