In [None]:
# TODO: Try optuna to optimize the hyperparameters, as GridSearch is not that good at this task

In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GroupShuffleSplit, GridSearchCV
from sklearn.metrics import ndcg_score
import xgboost as xgb


In [2]:
# File path
file_path = 'df_OptionB.csv'
# Read the dataset using pandas
df = pd.read_csv(file_path)

In [3]:
# Create a balanced dataset:
def balanced_data(x, id_col):
    """
    The function creates a balanced sample DF.
    It resamples the majority class to have the same number of instances as the minority class, resulting in a balanced dataset.

    Input:
    x: DataFrame
    id_col: column which would be used for ranking on
    
    Output:
    Balanced DataFrame sample
    """
    
    # Create a sample DataFrame with instances where id_col is 1 (clicked hotels):
    click_indices = x[x[id_col] == 1].index
    random_indices = np.random.choice(click_indices, len(x.loc[x[id_col] == 1]), replace=False)
    click_sample = x.loc[random_indices]

    # Create a sample DataFrame with instances where id_col is 0 (non-clicked hotels):
    not_click_indices = x[x[id_col] == 0].index
    random_indices = np.random.choice(not_click_indices, sum(x[id_col]), replace=False)
    not_click_sample = x.loc[random_indices]

    # Concatenate the two sampled DataFrames and create a balanced DataFrame
    df_balanced = pd.concat([not_click_sample, click_sample], axis=0)

    return df_balanced

In [4]:
balanced_df = balanced_data(df, 'click_bool')

In [5]:
# Test if set is balanced:
print("Non-click impressions: ", len(balanced_df[balanced_df['click_bool'] == 0]))
print("Click impression: ", len(balanced_df[balanced_df['click_bool'] == 1]))
print("Total number of records in resampled data: ", len(balanced_df))

Non-click impressions:  221879
Click impression:  221879
Total number of records in resampled data:  443758


In [67]:
balanced_df.columns

Index(['srch_id', 'prop_id', 'prop_starrating', 'prop_review_score',
       'prop_brand_bool', 'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
       'srch_room_count', 'srch_saturday_night_bool',
       'orig_destination_distance', 'click_bool', 'booking_bool', 'score',
       'avg_comp_rate', 'avg_comp_inv', 'abroad_bool', 'children_bool',
       'srch_query_affinity_score_low', 'srch_query_affinity_score_high',
       'starrating_diff_low', 'starrating_diff_high', 'usd_diff_low',
       'usd_diff_high'],
      dtype='object')

In [68]:
# Drop target variable and save it in separate variable
y = balanced_df['score']
X = balanced_df.drop(['score'], axis=1)

# Perform group based split
gss = GroupShuffleSplit(test_size=0.3, n_splits=1, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=X['srch_id']))

# Create train and test sets
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

# Create group data
group_train = list(X_train['srch_id'].value_counts().sort_index())
group_test = list(X_test['srch_id'].value_counts().sort_index())

# Drop unwanted columns
X_train = X_train.drop(['click_bool', 'booking_bool', 'srch_id', 'prop_id'], axis=1)
X_test = X_test.drop(['click_bool', 'booking_bool', 'srch_id', 'prop_id'], axis=1)


# Applying XGBRanker

##### Hyperparameter tuning for XGRanker


### NEXT TWO CELLS HAVE TO BE CHANGED

In [73]:
# Use the pre-built xgb.cv to tune parameters 

# First we initialise a specfiic DF for xgb.cv:
dtrain = xgb.DMatrix(data = X_train, label=y_train)
dtrain.set_group(group_train)

params = {
    'objective' : ['rank:ndcg'],
    'booster' : ['gbtree'],
    'tree_method' : ['auto', 'exact', 'approx', 'hist', 'gpu_hist'],
    'random_state' : [42],
    'learning_rate' : [0.001, 0.01, 0.05, 0.1],
    'colsample_bytree' : [0.3,0.5,0.7,0.9],
    'max_depth' : range(5,15),
    'n_estimators' : [100, 200, 500, 1000],
    'subsample' : [0.4, 0.5, 0.6, 0.7, 0.8],
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
    'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100]
}

# Instantiate model:
gbm = xgb.XGBRanker()

In [74]:
# Create Grid object
tuned_cv_results = GridSearchCV(estimator= gbm, param_grid=params, scoring=ndcg_score, cv = 5, verbose=1)

# fit model:
tuned_cv_results.fit(X_train, y_train, group=group_train, eval_set=[(X_test, y_test)], eval_group=[group_test], eval_metric='ndcg', verbose=True)

Fitting 5 folds for each of 400000 candidates, totalling 2000000 fits


Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x000001F5192C15B0>>
Traceback (most recent call last):
  File "c:\Users\Asus\anaconda3\lib\site-packages\xgboost\core.py", line 500, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument
KeyboardInterrupt: 


: 

: 

# Train XGBRanker only for the selected optimized parameters

In [13]:
# model = xgb.XGBRanker(
#     # tree_method = 'gpu_hist',
#     # booster = 'gbtree',
#     # objective = 'rank:pairwise',
#     random_state = 42,
#     learning_rate = 0.1,
#     colsample_bytree = 0.9,
#     eta = 0.05,
#     max_depth = 6,
#     n_estimators = 110,
#     subsample = 0.75
# )

In [15]:
model.fit(X_train, y_train, group=group_train, eval_set=[(X_test, y_test)], eval_group=[group_test], eval_metric='ndcg', verbose=True)



[0]	validation_0-ndcg:0.87155
[1]	validation_0-ndcg:0.87489
[2]	validation_0-ndcg:0.88993
[3]	validation_0-ndcg:0.89299
[4]	validation_0-ndcg:0.89335
[5]	validation_0-ndcg:0.89370
[6]	validation_0-ndcg:0.89424
[7]	validation_0-ndcg:0.89417
[8]	validation_0-ndcg:0.89426
[9]	validation_0-ndcg:0.89451
[10]	validation_0-ndcg:0.89467
[11]	validation_0-ndcg:0.89504
[12]	validation_0-ndcg:0.89506
[13]	validation_0-ndcg:0.89502
[14]	validation_0-ndcg:0.89552
[15]	validation_0-ndcg:0.89546
[16]	validation_0-ndcg:0.89553
[17]	validation_0-ndcg:0.89559
[18]	validation_0-ndcg:0.89557
[19]	validation_0-ndcg:0.89567
[20]	validation_0-ndcg:0.89571
[21]	validation_0-ndcg:0.89596
[22]	validation_0-ndcg:0.89602
[23]	validation_0-ndcg:0.89624
[24]	validation_0-ndcg:0.89639
[25]	validation_0-ndcg:0.89667
[26]	validation_0-ndcg:0.89668
[27]	validation_0-ndcg:0.89687
[28]	validation_0-ndcg:0.89668
[29]	validation_0-ndcg:0.89668
[30]	validation_0-ndcg:0.89685
[31]	validation_0-ndcg:0.89703
[32]	validation_0-

# Now we will predict on the actual train test:

In [53]:
test_set = pd.read_csv('df_test_OptionB.csv')

In [54]:
test_set.columns

Index(['srch_id', 'prop_id', 'prop_starrating', 'prop_review_score',
       'prop_brand_bool', 'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
       'srch_room_count', 'srch_saturday_night_bool',
       'orig_destination_distance', 'avg_comp_rate', 'avg_comp_inv',
       'abroad_bool', 'children_bool', 'srch_query_affinity_score_low',
       'srch_query_affinity_score_high', 'starrating_diff_low',
       'starrating_diff_high', 'usd_diff_low', 'usd_diff_high'],
      dtype='object')

In [55]:
# Sort test set 
test_set = test_set.sort_values(by='srch_id')

# Storing the removed columns in a separate DataFrame
df_test_columns_removed = test_set[['srch_id', 'prop_id']].copy()

# # Create a new DataFrame by dropping the desired columns
test_set = test_set.drop(columns=['prop_id'], axis=1)

In [56]:
test_set.shape

(4959183, 25)

Since predict for XGBosst does not take an argument 'group', we need to apply the predict for each group separately:

In [37]:
def predict(model, df):
    """
    The function runs the prediction XGBRanker to each group of the variable
    of interest and returns a rank for this particular group

    Input:
    model: the model that we want to run on each separate category
    df: DataFrame

    Output:
    Prediction of the rank for each category
    """
    return model.predict(df.loc[:, ~df.columns.isin(['srch_id'])])

In [38]:
predictions = (test_set.groupby('srch_id').apply(lambda x: predict(model, x)))

In [57]:
predictions

srch_id
1         [0.7237759, 0.18657094, -0.5148052, 0.12954172...
3         [-0.55040884, -0.88232255, -0.1006564, -0.9766...
6         [-0.9152836, -0.57920295, -1.3692656, -1.25394...
7         [0.088054456, 0.2735728, 0.25442874, 0.3738486...
10        [-0.38448307, -0.4979509, -0.6311435, 0.736151...
                                ...                        
332781    [-1.3643686, 0.105161406, 0.04398398, -0.23980...
332783    [-0.8326318, -0.06108358, 0.31370464, 0.146781...
332785    [0.6792511, -0.31961018, -1.1608726, 0.6026232...
332786    [-0.37618676, 1.4334834, -0.07109529, -0.27077...
332787    [0.7697229, 1.9830147, 1.349007, 1.006025, 2.0...
Length: 199549, dtype: object

In [58]:
# Adding the columns back
test_set['srch_id'] = df_test_columns_removed['srch_id']
test_set['prop_id'] = df_test_columns_removed['prop_id']

In [59]:
test_set.shape

(4959183, 26)

In [60]:
test_df_sorted = test_set.sort_values(['srch_id', 'prop_id'])
# Add predictions to test set
test_df_sorted['predicted_ranking'] = np.concatenate(predictions.values)

In [61]:
test_df_sorted.sort_values(['srch_id', 'predicted_ranking'], ascending=[True, False], inplace=True)

In [63]:
# Select the columns `srch_id` and `prop_id`
result = test_df_sorted[['srch_id', 'prop_id']]
result

Unnamed: 0,srch_id,prop_id
6,1,34263
24,1,123675
27,1,134992
20,1,90385
19,1,89466
...,...,...
4959178,332787,32019
4959182,332787,99509
4959179,332787,33959
4959176,332787,22854


In [65]:
# Export submission file:
result.to_csv('XGRanker_submission_file.csv', index=False)