In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import copy
import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.impute import SimpleImputer
import math

In [2]:
def replace_nan_with_mean(df,cols):
    """
    Completing missing numerical values (NaN) from columns. NaN values are replaced by the column's mean
    Args:
        @string_cols:  (list of column indexes) string columns to complete
    """
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

    imp_mean = imp_mean.fit(df[cols])

    df[cols] = imp_mean.transform(df[cols])

    return df

In [3]:
def replace_nan_with_median(df,cols):
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')

    imp_mean = imp_mean.fit(df[cols])

    df[cols] = imp_mean.transform(df[cols])

    return df

In [4]:
def replace_nan_with_value(df,col,value):
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=value)

    imp_mean = imp_mean.fit(df[[col]])

    df[col] = imp_mean.transform(df[[col]])

    return df

In [5]:
!kaggle competitions download -c vu-dmt-assigment-2-2023
training_data = pd.read_csv('vu-dmt-assigment-2-2023/training_set_VU_DM.csv')
test_data = pd.read_csv('vu-dmt-assigment-2-2023/test_set_VU_DM.csv')

vu-dmt-assigment-2-2023.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
#create new column for NDCG evaluation
training_data['target_col'] = (5* training_data['booking_bool']) + training_data['click_bool']

In [7]:
#Create one location score column
training_data['location_score'] = ((training_data['prop_location_score1'] + training_data['prop_location_score2']) / 2)

In [8]:
#Count how often a prop_id shows up
z = training_data['prop_id'].value_counts() 
z1 = z.to_dict() #converts to dictionary
training_data['Count_Column'] = training_data['prop_id'].map(z1) 

In [9]:
# Get data for whether the hotel is more expensive than usual
training_data['diff_price'] = ((training_data['price_usd']) - (np.exp(training_data['prop_log_historical_price'])))

In [10]:
training_data.to_csv("training_set_TRANS.csv", index=False)

In [11]:
training_data = pd.read_csv("training_set_TRANS.csv")

In [12]:
gc.collect()

20

In [13]:
aux_training = copy.copy(training_data)

In [14]:
aux_training

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,target_col,location_score,Count_Column,diff_price
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,0.0,0.0,,0,,0,0,1.43690,612,-36.404964
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,0.0,0.0,,0,,0,0,1.10745,583,17.806987
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,0.0,0.0,,0,,0,0,1.11225,551,42.797387
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,-1.0,0.0,5.0,0,,0,0,1.42125,460,522.129581
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,0.0,0.0,,0,,0,0,1.38205,665,5.200488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,2013-06-30 19:55:18,5,219,,,219,77700,3,4.0,...,,,,0,,0,0,0.82855,47,117.000000
4958343,332785,2013-06-30 19:55:18,5,219,,,219,88083,3,4.0,...,,,,0,,0,0,1.05100,38,88.000000
4958344,332785,2013-06-30 19:55:18,5,219,,,219,94508,3,3.5,...,,,,0,,0,0,0.55820,43,98.000000
4958345,332785,2013-06-30 19:55:18,5,219,,,219,128360,3,5.0,...,,,,1,157.84,1,6,1.00810,37,138.000000


In [16]:
aux_training = replace_nan_with_mean(aux_training,["visitor_hist_starrating","visitor_hist_adr_usd"])
aux_training = replace_nan_with_mean(aux_training,["diff_price"])
aux_training = replace_nan_with_mean(aux_training,['comp1_rate_percent_diff','comp1_rate','comp2_rate_percent_diff','comp2_rate',
                                 'comp3_rate_percent_diff','comp3_rate','comp4_rate_percent_diff','comp4_rate',
                                 'comp5_rate_percent_diff','comp5_rate','comp6_rate_percent_diff','comp6_rate',
                                 'comp7_rate_percent_diff','comp7_rate','comp8_rate_percent_diff','comp8_rate'])
aux_training = replace_nan_with_median(aux_training,["orig_destination_distance"])

col_min = aux_training["srch_query_affinity_score"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'srch_query_affinity_score',col_min)

col_min = aux_training["prop_starrating"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'prop_starrating',col_min)

col_min = aux_training["prop_review_score"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'prop_review_score',col_min)

col_min = aux_training["location_score"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'location_score',col_min)

In [17]:
aux_training['comp_rate_percent_diff'] = (((aux_training['comp1_rate_percent_diff'] * aux_training['comp1_rate']) +
                                            (aux_training['comp2_rate_percent_diff'] * aux_training['comp2_rate']) +
                                            (aux_training['comp3_rate_percent_diff'] * aux_training['comp3_rate']) +
                                            (aux_training['comp4_rate_percent_diff'] * aux_training['comp4_rate']) + 
                                            (aux_training['comp5_rate_percent_diff'] * aux_training['comp5_rate']) +
                                            (aux_training['comp6_rate_percent_diff'] * aux_training['comp6_rate']) +
                                            (aux_training['comp7_rate_percent_diff'] * aux_training['comp7_rate']) +
                                            (aux_training['comp8_rate_percent_diff'] * aux_training['comp8_rate'])) / 8)

In [18]:
aux_training

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,target_col,location_score,Count_Column,diff_price
0,1,2013-04-04 08:32:15,12,187,3.374334,176.022659,219,893,3,3.5,...,0.000000,0.0,22.430384,0,,0,0,1.43690,612,-36.404964
1,1,2013-04-04 08:32:15,12,187,3.374334,176.022659,219,10404,4,4.0,...,0.000000,0.0,22.430384,0,,0,0,1.10745,583,17.806987
2,1,2013-04-04 08:32:15,12,187,3.374334,176.022659,219,21315,3,4.5,...,0.000000,0.0,22.430384,0,,0,0,1.11225,551,42.797387
3,1,2013-04-04 08:32:15,12,187,3.374334,176.022659,219,27348,2,4.0,...,-1.000000,0.0,5.000000,0,,0,0,1.42125,460,522.129581
4,1,2013-04-04 08:32:15,12,187,3.374334,176.022659,219,29604,4,3.5,...,0.000000,0.0,22.430384,0,,0,0,1.38205,665,5.200488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,2013-06-30 19:55:18,5,219,3.374334,176.022659,219,77700,3,4.0,...,-0.060899,,22.430384,0,,0,0,0.82855,47,117.000000
4958343,332785,2013-06-30 19:55:18,5,219,3.374334,176.022659,219,88083,3,4.0,...,-0.060899,,22.430384,0,,0,0,1.05100,38,88.000000
4958344,332785,2013-06-30 19:55:18,5,219,3.374334,176.022659,219,94508,3,3.5,...,-0.060899,,22.430384,0,,0,0,0.55820,43,98.000000
4958345,332785,2013-06-30 19:55:18,5,219,3.374334,176.022659,219,128360,3,5.0,...,-0.060899,,22.430384,1,157.84,1,6,1.00810,37,138.000000


In [19]:
aux_training = aux_training.drop('click_bool', axis=1)   #replaced by target_col and not in test set 
aux_training = aux_training.drop('booking_bool', axis=1) #replaced by target_col and not in test set 
aux_training = aux_training.drop('position', axis=1)     # not in test set 
aux_training = aux_training.drop('gross_bookings_usd', axis=1)
aux_training = aux_training.drop('prop_location_score1', axis=1)
aux_training = aux_training.drop('prop_location_score2', axis=1)
aux_training = aux_training.drop(['date_time'], axis=1)
aux_training = aux_training.drop(['comp1_rate_percent_diff','comp1_rate','comp2_rate_percent_diff','comp2_rate',
                                 'comp3_rate_percent_diff','comp3_rate','comp4_rate_percent_diff','comp4_rate',
                                 'comp5_rate_percent_diff','comp5_rate','comp6_rate_percent_diff','comp6_rate',
                                 'comp7_rate_percent_diff','comp7_rate','comp8_rate_percent_diff','comp8_rate'], axis=1)

In [20]:
print("Empty values per column: \n",aux_training.isnull().sum())

Empty values per column: 
 srch_id                              0
site_id                              0
visitor_location_country_id          0
visitor_hist_starrating              0
visitor_hist_adr_usd                 0
prop_country_id                      0
prop_id                              0
prop_starrating                      0
prop_review_score                    0
prop_brand_bool                      0
prop_log_historical_price            0
price_usd                            0
promotion_flag                       0
srch_destination_id                  0
srch_length_of_stay                  0
srch_booking_window                  0
srch_adults_count                    0
srch_children_count                  0
srch_room_count                      0
srch_saturday_night_bool             0
srch_query_affinity_score            0
orig_destination_distance            0
random_bool                          0
comp1_inv                      4828788
comp2_inv                      282807

In [21]:
#Separate features from target
feature = aux_training.drop('target_col', axis=1)
target  =  aux_training['target_col']

In [22]:
X_train, X_val, y_train, y_val = train_test_split(feature, target, test_size=0.2, random_state=1)

In [23]:
print("X_train.shape = ",X_train.shape)
print("X_val.shape = ",X_val.shape)
print("\ny_train.shape = ",y_train.shape)
print("y_val.shape = ",y_val.shape)

X_train.shape =  (3966677, 34)
X_val.shape =  (991670, 34)

y_train.shape =  (3966677,)
y_val.shape =  (991670,)


In [24]:
X_train = X_train.sort_index()
y_train = y_train.sort_index()

X_val = X_val.sort_index()
y_val = y_val.sort_index()

In [25]:
# Get the number of rows of each query (srch_id) needed to train the model.
qids_train = X_train.groupby("srch_id")["srch_id"].count().to_numpy() 
qids_val   = X_val.groupby("srch_id")["srch_id"].count().to_numpy()

In [26]:
print("query train ids mean: ",qids_train.mean()) 
print("query val   ids mean: ",qids_val.mean()) 

query train ids mean:  19.853933821505258
query val   ids mean:  5.134355713871516


In [27]:
FIXED_PARAMS={
              'objective'             : 'lambdarank',
              'metric'                : 'ndcg',
              'boosting_type'         : 'dart',
              'is_unbalance'          : True,
              'num_iterations'        : 100,
              'early_stopping_rounds' : 10}

SEARCH_PARAMS = {'learning_rate': 0.15,
                 'max_depth': 15,
                 'num_leaves': 25,
                 'feature_fraction': 0.8,
                 'subsample': 0.2,
                 'n_estimators' : 100}

gbm = lgb.LGBMRanker( 
        objective               = FIXED_PARAMS["objective"],
        metric                  = FIXED_PARAMS["metric"],
        boosting_type           = FIXED_PARAMS["boosting_type"],
        is_unbalance            = FIXED_PARAMS["is_unbalance"],
        num_iterations          = FIXED_PARAMS["num_iterations"],
        early_stopping_rounds   = FIXED_PARAMS["early_stopping_rounds"],
        learning_rate           = SEARCH_PARAMS["learning_rate"],
        max_depth               = SEARCH_PARAMS["max_depth"],
        num_leaves              = SEARCH_PARAMS["num_leaves"],
        feature_fraction        = SEARCH_PARAMS["feature_fraction"],
        subsample               = SEARCH_PARAMS["subsample"],
        n_estimators            = SEARCH_PARAMS["n_estimators"],
)

In [28]:
gbm.fit(X_train, 
        y_train, 
        group=qids_train,
        eval_set=[(X_val, y_val)], 
        eval_group=[qids_val],
        eval_at=[1,2,3,4,5]
)



[1]	valid_0's ndcg@1: 0.857178	valid_0's ndcg@2: 0.887724	valid_0's ndcg@3: 0.904242	valid_0's ndcg@4: 0.913895	valid_0's ndcg@5: 0.919946




[2]	valid_0's ndcg@1: 0.866343	valid_0's ndcg@2: 0.896401	valid_0's ndcg@3: 0.911948	valid_0's ndcg@4: 0.920875	valid_0's ndcg@5: 0.926025
[3]	valid_0's ndcg@1: 0.869409	valid_0's ndcg@2: 0.899821	valid_0's ndcg@3: 0.914759	valid_0's ndcg@4: 0.923435	valid_0's ndcg@5: 0.928468
[4]	valid_0's ndcg@1: 0.871412	valid_0's ndcg@2: 0.901644	valid_0's ndcg@3: 0.916488	valid_0's ndcg@4: 0.924816	valid_0's ndcg@5: 0.929728
[5]	valid_0's ndcg@1: 0.872882	valid_0's ndcg@2: 0.90308	valid_0's ndcg@3: 0.917698	valid_0's ndcg@4: 0.925752	valid_0's ndcg@5: 0.930628
[6]	valid_0's ndcg@1: 0.873146	valid_0's ndcg@2: 0.903277	valid_0's ndcg@3: 0.917945	valid_0's ndcg@4: 0.92604	valid_0's ndcg@5: 0.930878
[7]	valid_0's ndcg@1: 0.873483	valid_0's ndcg@2: 0.903837	valid_0's ndcg@3: 0.918455	valid_0's ndcg@4: 0.926395	valid_0's ndcg@5: 0.931195
[8]	valid_0's ndcg@1: 0.873364	valid_0's ndcg@2: 0.903809	valid_0's ndcg@3: 0.918414	valid_0's ndcg@4: 0.926436	valid_0's ndcg@5: 0.93121
[9]	valid_0's ndcg@1: 0.873996

In [29]:
test_pred = gbm.predict(X_val)

print(test_pred)

[ 0.01045066 -0.81944046 -0.27232607 ...  0.33359258  0.25951212
 -0.06063993]


In [30]:
X_val["predicted_ranking"] = test_pred

In [31]:
print("--> NDCG : ", ndcg_score([y_val],[X_val["predicted_ranking"]]))

--> NDCG :  0.7562154503973506


In [32]:
del training_data, aux_training
gc.collect()

543

In [33]:
#Feature eng for test data
z = test_data['prop_id'].value_counts() 
z1 = z.to_dict() #converts to dictionary
test_data['Count_Column'] = test_data['prop_id'].map(z1)

#Create one location score column
test_data['location_score'] = ((test_data['prop_location_score1'] + test_data['prop_location_score2']) / 2)

In [34]:
test_data['diff_price'] = ((test_data['price_usd']) - (np.exp(test_data['prop_log_historical_price'])))

In [37]:
test_data = replace_nan_with_mean(test_data,["visitor_hist_starrating","visitor_hist_adr_usd","diff_price"])
test_data = replace_nan_with_mean(test_data,['comp1_rate_percent_diff','comp1_rate','comp2_rate_percent_diff','comp2_rate',
                                 'comp3_rate_percent_diff','comp3_rate','comp4_rate_percent_diff','comp4_rate',
                                 'comp5_rate_percent_diff','comp5_rate','comp6_rate_percent_diff','comp6_rate',
                                 'comp7_rate_percent_diff','comp7_rate','comp8_rate_percent_diff','comp8_rate'])
test_data = replace_nan_with_median(test_data,["orig_destination_distance"])

col_min = test_data["srch_query_affinity_score"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'srch_query_affinity_score',col_min)

col_min = test_data["prop_starrating"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'prop_starrating',col_min)

col_min = test_data["prop_review_score"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'prop_review_score',col_min)

col_min = test_data["location_score"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'location_score',col_min)

In [38]:
#Average competition data
#Get whether diff with competition was pos/neg, then get average diff 
test_data['comp_rate_percent_diff'] = (((test_data['comp1_rate_percent_diff'] * test_data['comp1_rate']) +
                                            (test_data['comp2_rate_percent_diff'] * test_data['comp2_rate']) +
                                            (test_data['comp3_rate_percent_diff'] * test_data['comp3_rate']) +
                                            (test_data['comp4_rate_percent_diff'] * test_data['comp4_rate']) + 
                                            (test_data['comp5_rate_percent_diff'] * test_data['comp5_rate']) +
                                            (test_data['comp6_rate_percent_diff'] * test_data['comp6_rate']) +
                                            (test_data['comp7_rate_percent_diff'] * test_data['comp7_rate']) +
                                            (test_data['comp8_rate_percent_diff'] * test_data['comp8_rate'])) / 8)

In [39]:
test_data = test_data.drop(['date_time'], axis=1)
test_data = test_data.drop(['prop_location_score1'], axis=1)
test_data = test_data.drop(['prop_location_score2'], axis=1)
test_data = test_data.drop(['comp1_rate_percent_diff','comp1_rate','comp2_rate_percent_diff','comp2_rate',
                                 'comp3_rate_percent_diff','comp3_rate','comp4_rate_percent_diff','comp4_rate',
                                 'comp5_rate_percent_diff','comp5_rate','comp6_rate_percent_diff','comp6_rate',
                                 'comp7_rate_percent_diff','comp7_rate','comp8_rate_percent_diff','comp8_rate'], axis=1)

In [40]:
gc.collect()

2350

In [42]:
test_data

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,comp3_inv,comp4_inv,comp5_inv,comp6_inv,comp7_inv,comp8_inv,Count_Column,location_score,diff_price,comp_rate_percent_diff
0,1,24,216,3.374933,177.15073,219,3180,3,4.5,1,...,,,,,,,185,1.50455,-33.933013,15.310530
1,1,24,216,3.374933,177.15073,219,5543,3,4.5,1,...,,,,,,,252,1.36215,-20.379512,15.188376
2,1,24,216,3.374933,177.15073,219,14142,2,3.5,1,...,,,0.0,,,,150,1.38280,-15.071523,16.361767
3,1,24,216,3.374933,177.15073,219,22393,3,4.5,1,...,,,0.0,,,,147,1.22805,-9.933013,15.111767
4,1,24,216,3.374933,177.15073,219,24194,3,4.5,1,...,,,0.0,,,,214,1.57450,-33.168253,15.111767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4959178,332787,24,216,3.374933,177.15073,117,32019,4,3.5,0,...,0.0,0.0,0.0,,,,13,1.26755,-26.688561,33.342988
4959179,332787,24,216,3.374933,177.15073,117,33959,4,3.0,1,...,0.0,0.0,0.0,,,,13,1.26720,-13.540419,18.092988
4959180,332787,24,216,3.374933,177.15073,117,35240,4,0.0,0,...,0.0,0.0,0.0,,,,2,0.00000,-29.634348,22.217988
4959181,332787,24,216,3.374933,177.15073,117,94437,4,0.0,0,...,0.0,0.0,0.0,,,,7,1.51640,-37.474348,23.092988


In [41]:
test_pred = gbm.predict(test_data)

ValueError: Number of features of the model must match the input. Model n_features_ is 34 and input n_features is 35

In [None]:
test_pred

array([ 0.01054672, -0.0207831 , -0.12533251, ..., -0.05866496,
       -0.11736475,  0.23884425])

In [None]:
test_data["predicted_ranking"] = test_pred
test_data.head(15)

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,Count_Column,location_score,predicted_ranking
0,1,24,216,3.374933,177.15073,219,3180,3,4.5,1,...,,,,,,,,185,1.50455,0.010547
1,1,24,216,3.374933,177.15073,219,5543,3,4.5,1,...,,,,,,,,252,1.36215,-0.020783
2,1,24,216,3.374933,177.15073,219,14142,2,3.5,1,...,,,,,,,,150,1.3828,-0.125333
3,1,24,216,3.374933,177.15073,219,22393,3,4.5,1,...,,,,,,,,147,1.22805,-0.140031
4,1,24,216,3.374933,177.15073,219,24194,3,4.5,1,...,,,,,,,,214,1.5745,0.061799
5,1,24,216,3.374933,177.15073,219,28181,3,4.5,1,...,,,,,,,,381,1.2413,0.047098
6,1,24,216,3.374933,177.15073,219,34263,3,4.5,1,...,,,,,,,,202,1.61,0.039116
7,1,24,216,3.374933,177.15073,219,37567,2,4.5,0,...,,,,,,,,175,1.4219,0.218873
8,1,24,216,3.374933,177.15073,219,50162,2,3.5,1,...,,,,,,,,293,1.1404,0.073533
9,1,24,216,3.374933,177.15073,219,54937,3,4.0,1,...,,,,,,,,403,1.12245,0.231468


In [None]:
test_data = test_data.sort_values(["srch_id","predicted_ranking"], ascending=[True,False])

In [None]:
test_data.drop(test_data.columns.difference(['srch_id','prop_id']), 1, inplace=True)
test_data.head()

  test_data.drop(test_data.columns.difference(['srch_id','prop_id']), 1, inplace=True)


Unnamed: 0,srch_id,prop_id
12,1,61934
9,1,54937
23,1,99484
7,1,37567
8,1,50162


In [None]:
# Create output file for competition
test_data.to_csv("output.csv", index=False) 