In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
import gc
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.impute import SimpleImputer
import datetime

In [2]:
def replace_nan_with_mean(df,cols):
    """
    Completing missing numerical values (NaN) from columns. NaN values are replaced by the column's mean
    Args:
        @string_cols:  (list of column indexes) string columns to complete
    """
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

    imp_mean = imp_mean.fit(df[cols])

    df[cols] = imp_mean.transform(df[cols])

    return df

In [3]:
def replace_nan_with_median(df,cols):
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')

    imp_mean = imp_mean.fit(df[cols])

    df[cols] = imp_mean.transform(df[cols])

    return df

In [4]:
def replace_nan_with_value(df,col,value):
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=value)

    imp_mean = imp_mean.fit(df[[col]])

    df[col] = imp_mean.transform(df[[col]])

    return df

In [5]:
!kaggle competitions download -c vu-dmt-assigment-2-2023
training_data = pd.read_csv('vu-dmt-assigment-2-2023/training_set_VU_DM.csv')
test_data = pd.read_csv('vu-dmt-assigment-2-2023/test_set_VU_DM.csv')

vu-dmt-assigment-2-2023.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
#create new column for NDCG evaluation
training_data['target_col'] = (5* training_data['booking_bool']) + training_data['click_bool']

In [None]:
#Create one location score column
training_data['location_score'] = ((training_data['prop_location_score1'] + training_data['prop_location_score2']) / 2)

In [7]:
#Count how often a prop_id shows up
z = training_data['prop_id'].value_counts() 
z1 = z.to_dict() #converts to dictionary
training_data['Count_Column'] = training_data['prop_id'].map(z1) 

In [8]:
training_data.to_csv("training_set_TRANS.csv", index=False)

In [9]:
training_data = pd.read_csv("training_set_TRANS.csv")

In [10]:
gc.collect()

0

In [11]:
aux_training = copy.copy(training_data)

In [12]:
aux_training

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,target_col,Count_Column
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,0.0,0.0,,0,,0,0,612
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,0.0,0.0,,0,,0,0,583
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,0.0,0.0,,0,,0,0,551
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,-1.0,0.0,5.0,0,,0,0,460
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,0.0,0.0,,0,,0,0,665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,2013-06-30 19:55:18,5,219,,,219,77700,3,4.0,...,,,,,,0,,0,0,47
4958343,332785,2013-06-30 19:55:18,5,219,,,219,88083,3,4.0,...,,,,,,0,,0,0,38
4958344,332785,2013-06-30 19:55:18,5,219,,,219,94508,3,3.5,...,,,,,,0,,0,0,43
4958345,332785,2013-06-30 19:55:18,5,219,,,219,128360,3,5.0,...,,,,,,1,157.84,1,6,37


In [None]:
aux_training = replace_nan_with_mean(aux_training,["visitor_hist_starrating","visitor_hist_adr_usd"])
aux_training = replace_nan_with_median(aux_training,["orig_destination_distance"])

col_min = aux_training["srch_query_affinity_score"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'srch_query_affinity_score',col_min)

col_min = aux_training["prop_starrating"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'prop_starrating',col_min)

col_min = aux_training["prop_review_score"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'prop_review_score',col_min)

col_min = aux_training["location_score"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'location_score',col_min)

In [None]:
aux_training = aux_training.drop('click_bool', axis=1)   #replaced by target_col and not in test set 
aux_training = aux_training.drop('booking_bool', axis=1) #replaced by target_col and not in test set 
aux_training = aux_training.drop('position', axis=1)     # not in test set 
aux_training = aux_training.drop('gross_bookings_usd', axis=1)
aux_training = aux_training.drop('prop_location_score1', axis=1)
aux_training = aux_training.drop('prop_location_score2', axis=1)
aux_training = aux_training.drop(['date_time'], axis=1)

In [None]:
print("Empty values per column: \n",aux_training.isnull().sum())

Empty values per column: 
 srch_id                              0
site_id                              0
visitor_location_country_id          0
visitor_hist_starrating              0
visitor_hist_adr_usd                 0
prop_country_id                      0
prop_id                              0
prop_starrating                      0
prop_review_score                    0
prop_brand_bool                      0
prop_location_score1                 0
prop_location_score2                 0
prop_log_historical_price            0
price_usd                            0
promotion_flag                       0
srch_destination_id                  0
srch_length_of_stay                  0
srch_booking_window                  0
srch_adults_count                    0
srch_children_count                  0
srch_room_count                      0
srch_saturday_night_bool             0
srch_query_affinity_score            0
orig_destination_distance            0
random_bool                          

In [None]:
#Separate features from target
feature = aux_training.drop('target_col', axis=1)
target  =  aux_training['target_col']

In [None]:
X_train, X_val, y_train, y_val = train_test_split(feature, target, test_size=0.2, random_state=1)

In [None]:
print("X_train.shape = ",X_train.shape)
print("X_val.shape = ",X_val.shape)
print("\ny_train.shape = ",y_train.shape)
print("y_val.shape = ",y_val.shape)

X_train.shape =  (3966677, 50)
X_val.shape =  (991670, 50)

y_train.shape =  (3966677,)
y_val.shape =  (991670,)


In [None]:
X_train = X_train.sort_index()
y_train = y_train.sort_index()

X_val = X_val.sort_index()
y_val = y_val.sort_index()

In [None]:
# Get the number of rows of each query (srch_id) needed to train the model.
qids_train = X_train.groupby("srch_id")["srch_id"].count().to_numpy() 
qids_val   = X_val.groupby("srch_id")["srch_id"].count().to_numpy()

In [None]:
print("query train ids mean: ",qids_train.mean()) 
print("query val   ids mean: ",qids_val.mean()) 

query train ids mean:  19.853933821505258
query val   ids mean:  5.134355713871516


In [None]:
FIXED_PARAMS={
              'objective'             : 'lambdarank',
              'metric'                : 'ndcg',
              'boosting_type'         : 'dart',
              'is_unbalance'          : True,
              'num_iterations'        : 100,
              'early_stopping_rounds' : 10}

SEARCH_PARAMS = {'learning_rate': 0.15,
                 'max_depth': 15,
                 'num_leaves': 25,
                 'feature_fraction': 0.8,
                 'subsample': 0.2,
                 'n_estimators' : 100}

gbm = lgb.LGBMRanker( 
        objective               = FIXED_PARAMS["objective"],
        metric                  = FIXED_PARAMS["metric"],
        boosting_type           = FIXED_PARAMS["boosting_type"],
        is_unbalance            = FIXED_PARAMS["is_unbalance"],
        num_iterations          = FIXED_PARAMS["num_iterations"],
        early_stopping_rounds   = FIXED_PARAMS["early_stopping_rounds"],
        learning_rate           = SEARCH_PARAMS["learning_rate"],
        max_depth               = SEARCH_PARAMS["max_depth"],
        num_leaves              = SEARCH_PARAMS["num_leaves"],
        feature_fraction        = SEARCH_PARAMS["feature_fraction"],
        subsample               = SEARCH_PARAMS["subsample"],
        n_estimators            = SEARCH_PARAMS["n_estimators"],
)

In [None]:
gbm.fit(X_train, 
        y_train, 
        group=qids_train,
        eval_set=[(X_val, y_val)], 
        eval_group=[qids_val],
        eval_at=[1,2,3,4,5]
)



[1]	valid_0's ndcg@1: 0.851214	valid_0's ndcg@2: 0.880653	valid_0's ndcg@3: 0.896805	valid_0's ndcg@4: 0.907191	valid_0's ndcg@5: 0.913637




[2]	valid_0's ndcg@1: 0.868394	valid_0's ndcg@2: 0.898568	valid_0's ndcg@3: 0.913973	valid_0's ndcg@4: 0.92241	valid_0's ndcg@5: 0.927528
[3]	valid_0's ndcg@1: 0.871946	valid_0's ndcg@2: 0.902075	valid_0's ndcg@3: 0.917142	valid_0's ndcg@4: 0.925221	valid_0's ndcg@5: 0.930005
[4]	valid_0's ndcg@1: 0.873416	valid_0's ndcg@2: 0.90344	valid_0's ndcg@3: 0.918451	valid_0's ndcg@4: 0.926326	valid_0's ndcg@5: 0.931054
[5]	valid_0's ndcg@1: 0.875037	valid_0's ndcg@2: 0.90505	valid_0's ndcg@3: 0.919635	valid_0's ndcg@4: 0.927492	valid_0's ndcg@5: 0.932135
[6]	valid_0's ndcg@1: 0.876083	valid_0's ndcg@2: 0.905701	valid_0's ndcg@3: 0.920414	valid_0's ndcg@4: 0.928192	valid_0's ndcg@5: 0.932791
[7]	valid_0's ndcg@1: 0.876751	valid_0's ndcg@2: 0.906436	valid_0's ndcg@3: 0.920877	valid_0's ndcg@4: 0.928636	valid_0's ndcg@5: 0.933206
[8]	valid_0's ndcg@1: 0.877134	valid_0's ndcg@2: 0.906629	valid_0's ndcg@3: 0.921306	valid_0's ndcg@4: 0.929072	valid_0's ndcg@5: 0.933493
[9]	valid_0's ndcg@1: 0.877962

In [None]:
test_pred = gbm.predict(X_val)

print(test_pred)

[-0.05524941 -0.7664129  -0.32036877 ...  0.20299522  0.17254942
 -0.09111858]


In [None]:
X_val["predicted_ranking"] = test_pred

In [None]:
print("--> NDCG : ", ndcg_score([y_val],[X_val["predicted_ranking"]]))

--> NDCG :  0.7861882083830752


In [None]:
del training_data, aux_training
gc.collect()

1095

In [None]:
#Feature eng for test data
z = test_data['prop_id'].value_counts() 
z1 = z.to_dict() #converts to dictionary
test_data['Count_Column'] = test_data['prop_id'].map(z1)

#Create one location score column
test_data['location_score'] = ((test_data['prop_location_score1'] + test__data['prop_location_score2']) / 2)

In [None]:
test_data = test_data.drop(['date_time'], axis=1)
test_data = test_data.drop(['prop_location_score1', axis=1])
test_data = test_data.drop(['prop_location_score2', axis=1])

In [None]:
test_data = replace_nan_with_mean(test_data,["visitor_hist_starrating","visitor_hist_adr_usd"])
test_data = replace_nan_with_median(test_data,["orig_destination_distance"])

col_min = test_data["srch_query_affinity_score"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'srch_query_affinity_score',col_min)

col_min = test_data["prop_starrating"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'prop_starrating',col_min)

col_min = test_data["prop_review_score"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'prop_review_score',col_min)

col_min = test_data["location_score"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'location_score',col_min)

In [None]:
gc.collect()

5605

In [None]:
test_pred = gbm.predict(test_data)

MemoryError: Unable to allocate 1.18 GiB for an array with shape (32, 4959183) and data type float64