In [35]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import ndcg_score

import pickle

In [2]:
training_data = pd.read_csv('data/training_set_VU_DM.csv')

In [3]:
training_data['target'] = (5*training_data['booking_bool']) + training_data['click_bool']

In [4]:
z = training_data['prop_id'].value_counts() 
dict_z = z.to_dict() # converts to dictionary
training_data['counts'] = training_data['prop_id'].map(dict_z)

In [5]:
model_traindata = training_data.copy(deep=True)

In [6]:
model_traindata = model_traindata[['srch_id',
                                   'site_id',
                                   'visitor_location_country_id',
                                   'prop_country_id',
                                   'prop_id',
                                   'prop_starrating',
                                   'prop_review_score',
                                   'prop_brand_bool',
                                   'prop_location_score1',
                                   'prop_location_score2',
                                   'prop_log_historical_price',
                                   'price_usd',
                                   'promotion_flag',
                                   'srch_destination_id',
                                   'srch_length_of_stay',
                                   'srch_booking_window',
                                   'srch_adults_count',
                                   'srch_children_count',
                                   'srch_room_count',
                                   'srch_saturday_night_bool',
                                   'orig_destination_distance',
                                   'random_bool',
                                   'target',
                                   'counts'
                                  ]]

In [7]:
model_traindata = model_traindata.dropna().reset_index(drop=True)

In [8]:
model_traindata

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,...,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,orig_destination_distance,random_bool,target,counts
0,4,5,219,219,3625,4,4.0,0,3.22,0.2544,...,2,46,1,0,1,1,238.35,1,0,431
1,4,5,219,219,11826,5,4.5,1,3.22,0.1924,...,2,46,1,0,1,1,238.33,1,0,505
2,4,5,219,219,22824,3,4.0,0,3.26,0.3729,...,2,46,1,0,1,1,238.29,1,0,360
3,4,5,219,219,37581,5,4.5,0,3.09,0.2508,...,2,46,1,0,1,1,238.35,1,0,306
4,4,5,219,219,39993,4,4.0,1,3.09,0.1692,...,2,46,1,0,1,1,237.94,1,0,525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610098,332784,24,219,100,140304,3,4.0,0,4.55,0.1606,...,2,6,1,0,1,0,328.93,0,0,455
2610099,332785,5,219,219,77700,3,4.0,1,1.61,0.0471,...,1,21,3,0,1,0,550.92,0,0,47
2610100,332785,5,219,219,88083,3,4.0,1,1.95,0.1520,...,1,21,3,0,1,0,553.14,0,0,38
2610101,332785,5,219,219,94508,3,3.5,1,1.10,0.0164,...,1,21,3,0,1,0,544.43,0,0,43


In [27]:
gss = GroupShuffleSplit(test_size=.40, n_splits=1, random_state = 7).split(model_traindata, groups=model_traindata['srch_id'])

X_train_inds, X_test_inds = next(gss)

train_data= model_traindata.iloc[X_train_inds]
X_train = train_data.loc[:, ~train_data.columns.isin(['srch_id','target'])]
y_train = train_data.loc[:, train_data.columns.isin(['target'])]

groups = train_data.groupby('srch_id').size().to_frame('size')['size'].to_numpy()

test_data= model_traindata.iloc[X_test_inds]

# We need to keep the id for later predictions
X_test = test_data.loc[:, ~test_data.columns.isin(['target'])]
y_test = test_data.loc[:, test_data.columns.isin(['target'])]

In [28]:
groups.shape

(78088,)

In [29]:
y_train.shape

(1566318, 1)

In [30]:
X_train.shape

(1566318, 22)

In [31]:
model = xgb.XGBRanker(  
    tree_method='hist',
    booster='gbtree',
    objective='rank:ndcg',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )

model.fit(X_train, y_train, group=groups, verbose=True)

XGBRanker(base_score=None, booster='gbtree', callbacks=None,
          colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.9,
          early_stopping_rounds=None, enable_categorical=False, eta=0.05,
          eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
          grow_policy=None, importance_type=None, interaction_constraints=None,
          learning_rate=0.1, max_bin=None, max_cat_threshold=None,
          max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
          max_leaves=None, min_child_weight=None, missing=nan,
          monotone_constraints=None, n_estimators=110, n_jobs=None,
          num_parallel_tree=None, objective='rank:ndcg', ...)

In [32]:
def predict(model, df):
    return model.predict(df.loc[:, ~df.columns.isin(['srch_id'])])
  
predictions = (X_test.groupby('srch_id').apply(lambda x: predict(model, x)))

In [58]:
y_pred = []

for p in predictions:
    p = list(p)
    y_pred.extend(p)

In [59]:
X_test["predicted_ranking"] = y_pred

In [60]:
X_test

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,...,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,orig_destination_distance,random_bool,counts,predicted_ranking
25,8,5,219,219,10250,3,3.5,1,1.39,0.0321,...,1,0,1,0,1,0,5.40,0,186,-0.021785
26,8,5,219,219,22756,4,4.0,1,2.83,0.2251,...,1,0,1,0,1,0,4.41,0,184,-0.182306
27,8,5,219,219,27669,3,3.5,1,3.00,0.2107,...,1,0,1,0,1,0,4.17,0,195,0.998485
28,8,5,219,219,32491,3,4.5,1,2.30,0.0516,...,1,0,1,0,1,0,4.45,0,192,-0.803597
29,8,5,219,219,33805,3,3.5,1,3.14,0.1393,...,1,0,1,0,1,0,3.83,0,245,0.994150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610098,332784,24,219,100,140304,3,4.0,0,4.55,0.1606,...,2,6,1,0,1,0,328.93,0,455,1.464539
2610099,332785,5,219,219,77700,3,4.0,1,1.61,0.0471,...,1,21,3,0,1,0,550.92,0,47,-0.276554
2610100,332785,5,219,219,88083,3,4.0,1,1.95,0.1520,...,1,21,3,0,1,0,553.14,0,38,1.080996
2610101,332785,5,219,219,94508,3,3.5,1,1.10,0.0164,...,1,21,3,0,1,0,544.43,0,43,-0.710166


In [65]:
print("--> NDCG : ", ndcg_score([y_test['target']],[X_test['predicted_ranking']]))

--> NDCG :  0.7793220497613718


In [69]:
filename = 'XGB_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [66]:
test_data = pd.read_csv('data/test_set_VU_DM.csv')

In [67]:
z = test_data['prop_id'].value_counts() 
dict_z = z.to_dict() # converts to dictionary
test_data['counts'] = test_data['prop_id'].map(dict_z)

In [68]:
model_testdata = test_data[['srch_id',
                            'site_id',
                            'visitor_location_country_id',
                            'prop_country_id',
                            'prop_id',
                            'prop_starrating',
                            'prop_review_score',
                            'prop_brand_bool',
                            'prop_location_score1',
                            'prop_location_score2',
                            'prop_log_historical_price',
                            'price_usd',
                            'promotion_flag',
                            'srch_destination_id',
                            'srch_length_of_stay',
                            'srch_booking_window',
                            'srch_adults_count',
                            'srch_children_count',
                            'srch_room_count',
                            'srch_saturday_night_bool',
                            'orig_destination_distance',
                            'random_bool',
                            'counts'
                           ]]

In [70]:
def predict(model, df):
    return model.predict(df.loc[:, ~df.columns.isin(['srch_id'])])
  
test_predictions = (model_testdata.groupby('srch_id').apply(lambda x: predict(model, x)))

In [72]:
test_predictions

srch_id
1         [0.13459542, 0.24798356, -0.08780122, -0.23286...
3         [1.2978307, 1.5023919, 0.20259908, 0.76145405,...
6         [0.29701442, 0.25115457, 0.367561, 0.20159033,...
7         [-1.5300428, -0.6941029, -0.32571137, -0.90333...
10        [1.4686387, 1.2202603, -0.93771625, 1.7017969,...
                                ...                        
332781    [-0.59075314, -1.1011741, -0.61554146, -1.2365...
332783    [-1.3845369, 1.771332, 0.6901308, -0.08596708,...
332785    [0.81280977, -0.8631485, 1.273337, 0.3378254, ...
332786    [-0.38508812, -0.28706598, -0.4287139, 0.85284...
332787    [1.7026184, 1.029943, 0.5417605, 1.3695235, 1....
Length: 199549, dtype: object

In [73]:
y_testpred = []

for p in test_predictions:
    p = list(p)
    y_testpred.extend(p)

In [75]:
model_testdata["predicted_ranking"] = y_testpred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_testdata["predicted_ranking"] = y_testpred


In [77]:
model_testdata = model_testdata.sort_values(["srch_id","predicted_ranking"], ascending=[True,False])

In [79]:
model_testdata.drop(model_testdata.columns.difference(['srch_id','prop_id']), 1, inplace=True)

  model_testdata.drop(model_testdata.columns.difference(['srch_id','prop_id']), 1, inplace=True)


In [81]:
model_testdata.to_csv("output-XGB.csv", index=False) 