In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

In [2]:
# Read the pre-cleaned training split; the path is relative to the notebook's working directory.
# Only the training split is loaded here; the cleaned test split is not read in this cell.
train_csv_path = 'processed_data/train_clean.csv'
train_orig = pd.read_csv(train_csv_path)

In [3]:
# Derive a calendar-month feature from each row's timestamp.
month = pd.DatetimeIndex(train_orig['date_time']).month

# Columns removed before training. `date_time` is replaced by `month`; the rest are
# presumably outcome/leakage columns that are unavailable at prediction time — TODO confirm.
columns_to_drop = [
    'booking_bool',
    'click_bool',
    'gross_bookings_usd',
    'date_time',
    'position',
    'prop_desirability',
]
train = train_orig.drop(columns_to_drop, axis=1)

# Insert `month` positionally *before* sorting: `month` is a plain Index whose values
# line up with the original row order of `train_orig`.
train.insert(1, 'month', month)

# Keep all rows of one search contiguous.
train.sort_values(by=['srch_id'], inplace=True)

# NOTE(review): no label column is built in this cell; the `booked_clicked_combined`
# column is presumably already present in train_clean.csv — confirm.

# Replace missing values with an out-of-range sentinel.
# The original cell also called `valid.fillna(...)`, but no `valid` frame exists at this
# point, so the cell raised NameError on a fresh kernel; that call has been removed.
train.fillna(-999, inplace=True)


In [124]:
def get_fold_ids(unique_srch_ids, num_folds=5, seed=None):
    """Randomly partition search ids into `num_folds` disjoint, near-equal folds.

    Every id is assigned to exactly one fold, and fold sizes differ by at most one.
    (The previous implementation recomputed the sample size from the shrinking
    remainder on each iteration, so folds got progressively smaller and a large
    share of the ids never appeared in any fold.)

    Parameters
    ----------
    unique_srch_ids : array-like
        Search ids to split. Duplicates, if any, are collapsed first.
    num_folds : int, default 5
        Number of folds to create; must be >= 1.
    seed : int or None, default None
        Seed for a private random generator. When None, numpy's global random
        state is used, so an earlier `np.random.seed(...)` call is still honoured.

    Returns
    -------
    dict
        Maps 'id_set0' ... 'id_set{num_folds-1}' to a numpy array of ids.

    Raises
    ------
    ValueError
        If `num_folds` is smaller than 1.
    """
    if num_folds < 1:
        raise ValueError('num_folds must be a positive integer')

    # np.unique also sorts, so the result for a given seed does not depend on input order.
    ids = np.unique(np.asarray(unique_srch_ids))
    rng = np.random if seed is None else np.random.RandomState(seed)
    shuffled_ids = rng.permutation(ids)

    # array_split tolerates lengths that are not a multiple of num_folds.
    folds = np.array_split(shuffled_ids, num_folds)
    return {'id_set' + str(i): fold for i, fold in enumerate(folds)}

In [125]:
# Split the distinct search ids into `num_folds` id sets used as validation folds.
num_folds = 5
distinct_srch_ids = train['srch_id'].unique()
fold_ids = get_fold_ids(distinct_srch_ids, num_folds)
# Fold names in creation order, so a fold can be selected by position.
all_keys = [fold_key for fold_key in fold_ids]

In [132]:
# LambdaRank settings shared by every fold; built once because nothing in them is fold-specific.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'lambdarank',
    'metric': 'ndcg',
    # Gain per relevance label, indexed by the label value. It must be an ordered
    # sequence; the original used a set literal, whose iteration order is not guaranteed.
    'label_gain': [0, 1, 5],
    'ndcg_at': 38,
    'num_leaves': 10,
    'min_data_in_leaf': 70,
    'num_iterations': 100,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'verbose': 0
}
label_column = 'booked_clicked_combined'

# One trained booster per cross-validation fold.
models = []
for i in range(num_folds):

    # Fold i supplies the validation searches; every other search is used for training.
    valid_set_ids = fold_ids[all_keys[i]]
    in_valid_fold = train.srch_id.isin(valid_set_ids)

    # Rows of one search must be contiguous and in the same order as the group sizes below.
    valid_set = train[in_valid_fold].sort_values(by=['srch_id'])
    train_set = train[~in_valid_fold].sort_values(by=['srch_id'])

    # Separate the relevance label from the features.
    train_label = train_set[label_column].values
    valid_label = valid_set[label_column].values
    train_set_nolabel = train_set.drop([label_column], axis=1)
    valid_set_nolabel = valid_set.drop([label_column], axis=1)
    # NOTE(review): `srch_id` (and `prop_id`) remain in the feature matrix, as in the
    # original; consider excluding the query id from the features — confirm intent.

    # Group sizes: number of rows per search, ordered by srch_id like the sorted frames.
    train_group = train_set_nolabel.groupby(['srch_id']).size().values
    valid_group = valid_set_nolabel.groupby(['srch_id']).size().values
    lgb_train = lgb.Dataset(train_set_nolabel, label=train_label, group=train_group)
    lgb_val = lgb.Dataset(valid_set_nolabel, label=valid_label, group=valid_group)

    # Train the lambdarank model; a fresh dict per call keeps the shared params untouched.
    # NOTE(review): `verbose_eval` is kept from the original; newer LightGBM releases
    # replace it with the `lgb.log_evaluation` callback — confirm the installed version.
    lgbm_model = lgb.train(dict(params), train_set=lgb_train, valid_sets=[lgb_val], verbose_eval=100)

    # Release the Datasets referenced by the booster before storing it; otherwise every
    # fold's data stays in memory for as long as `models` lives.
    lgbm_model.free_dataset()
    models.append(lgbm_model)

    # Drop this fold's intermediates so the next fold does not allocate on top of them.
    del lgb_train, lgb_val, train_set_nolabel, valid_set_nolabel
    del train_set, valid_set, train_label, valid_label, in_valid_fold



[100]	valid_0's ndcg@38: 0.457541


MemoryError: 

In [58]:
# saving the model into txt file
#lgbm_model1.save_model('model1.txt') 

In [127]:
# Summarise cross-validation quality: mean and standard deviation of the per-fold
# validation ndcg@38 recorded on each booster.
scores = [fold_model.best_score['valid_0']['ndcg@38'] for fold_model in models]
mean_score = np.mean(scores)
std_score = np.std(scores)
summary = 'Average ndcg@38 score: {0:.8f}, std: {1:.8f}'.format(mean_score, std_score)
print(summary)

Average ndcg@38 score: 0.45716333, std: 0.00144847


In [128]:
# Pair each feature name of the first fold's model with its importance value,
# then list the pairs from most to least important.
importance_by_feature = {
    name: value
    for name, value in zip(models[0].feature_name(), models[0].feature_importance())
}
feature_importances = sorted(
    importance_by_feature.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
feature_importances

[('price_usd', 249),
 ('prop_location_score2', 188),
 ('prop_starrating', 172),
 ('prop_review_score', 79),
 ('promotion_flag', 63),
 ('star_diff_filled', 47),
 ('price_diff_filled', 40),
 ('prop_log_historical_price', 36),
 ('prop_location_score1', 18),
 ('prop_country_id', 5),
 ('random_bool', 3),
 ('srch_id', 0),
 ('month', 0),
 ('site_id', 0),
 ('visitor_location_country_id', 0),
 ('visitor_hist_starrating', 0),
 ('visitor_hist_adr_usd', 0),
 ('prop_id', 0),
 ('prop_brand_bool', 0),
 ('srch_destination_id', 0),
 ('srch_length_of_stay', 0),
 ('srch_booking_window', 0),
 ('srch_adults_count', 0),
 ('srch_children_count', 0),
 ('srch_room_count', 0),
 ('srch_saturday_night_bool', 0),
 ('srch_query_affinity_score', 0),
 ('orig_destination_distance', 0),
 ('star_diff', 0),
 ('star_diff_bool', 0),
 ('price_diff', 0),
 ('price_diff_bool', 0),
 ('visitor_hist_starrating_filled', 0),
 ('visitor_hist_adr_usd_filled', 0),
 ('willingness_to_pay', 0)]

In [130]:
# Same importance ranking as for the cross-validation model, but for `lgbm_model1`.
# NOTE(review): `lgbm_model1` is not assigned in any cell visible in this notebook;
# it presumably survives from an earlier kernel session — define or load it before
# this cell, otherwise a fresh "Restart & Run All" fails here.
importance_by_feature = {
    name: value
    for name, value in zip(lgbm_model1.feature_name(), lgbm_model1.feature_importance())
}
feature_importances = sorted(
    importance_by_feature.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
feature_importances

[('price_usd', 1583),
 ('prop_location_score2', 1433),
 ('prop_log_historical_price', 974),
 ('prop_location_score1', 780),
 ('srch_query_affinity_score', 561),
 ('prop_starrating', 537),
 ('prop_review_score', 406),
 ('price_diff_filled', 392),
 ('prop_id', 341),
 ('star_diff_filled', 261),
 ('orig_destination_distance', 210),
 ('price_diff', 206),
 ('srch_booking_window', 172),
 ('random_bool', 128),
 ('promotion_flag', 125),
 ('prop_country_id', 112),
 ('star_diff', 111),
 ('srch_destination_id', 96),
 ('prop_brand_bool', 90),
 ('visitor_location_country_id', 64),
 ('visitor_hist_adr_usd_filled', 61),
 ('visitor_hist_adr_usd', 57),
 ('srch_id', 53),
 ('srch_length_of_stay', 52),
 ('visitor_hist_starrating_filled', 49),
 ('visitor_hist_starrating', 34),
 ('month', 27),
 ('site_id', 24),
 ('srch_children_count', 19),
 ('srch_adults_count', 12),
 ('srch_room_count', 12),
 ('srch_saturday_night_bool', 8),
 ('willingness_to_pay', 8),
 ('star_diff_bool', 1),
 ('price_diff_bool', 1)]