In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

In [2]:
# Import training data
train = pd.read_csv('Data Mining VU data/training_set_VU_DM_2014.csv')

## Minimal data preparation for lightGBM

In [11]:
# take half of the train data set to make things faster
half_train = train.iloc[:len(train)//2-1,:]
half_train.head()

# infer missing prop_location_score2 from scaled prop_location_score1
half_train.prop_location_score2.fillna(value=half_train.prop_location_score1/25, inplace=True)

#half_train.fillna(-999)
half_train.prop_location_score2.isna().sum()
half_train.fillna(-999, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [12]:
booked = half_train.booking_bool
clicked = half_train.click_bool
gross_usd = half_train.gross_bookings_usd
# drop some columns
half_train = half_train.drop(['booking_bool','click_bool','gross_bookings_usd','date_time'], axis=1)
half_train.insert(0, 'label',(clicked+booked))

In [13]:
# drop more columns
a = half_train.columns.get_loc("comp1_rate")
b = half_train.columns.get_loc("comp8_rate_percent_diff")
half_train.drop(half_train.iloc[:,a:b], axis=1, inplace=True)

In [14]:
# uncomment the following line if you want to drop the large dataframe
# train = None

In [15]:
# choose 50% of srch_id's randomly
unique_srch_ids = half_train.srch_id.unique()
rand_ind = np.random.randint(0,len(unique_srch_ids), len(unique_srch_ids)//2)
train_ind = unique_srch_ids[rand_ind]

# split into train and test 
train_set = half_train[half_train.srch_id.isin(train_ind)]
test_set = half_train[~half_train.srch_id.isin(train_ind)]

In [16]:
# sort by the srch_id
train_set = train_set.sort_values(by=['srch_id'])
test_set = test_set.sort_values(by=['srch_id'])

In [17]:
#separate the labels
train_label = train_set.label.values
test_label = test_set.label.values
train_set.drop(['label'], axis=1, inplace=True)
test_set.drop(['label'], axis=1, inplace=True)

In [18]:
train_set.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp8_rate_percent_diff
28,4,5,219,-999.0,-999.0,219,3625,4,4.0,0,...,2,46,1,0,1,1,-999.0,238.35,1,-999.0
58,4,5,219,-999.0,-999.0,219,137826,2,3.0,0,...,2,46,1,0,1,1,-999.0,238.21,1,-999.0
57,4,5,219,-999.0,-999.0,219,134162,5,4.5,0,...,2,46,1,0,1,1,-999.0,238.34,1,-999.0
56,4,5,219,-999.0,-999.0,219,129278,3,3.5,0,...,2,46,1,0,1,1,-999.0,238.38,1,-999.0
55,4,5,219,-999.0,-999.0,219,127808,2,2.5,0,...,2,46,1,0,1,1,-999.0,237.73,1,-999.0


In [19]:
test_set.head().T

Unnamed: 0,0,27,26,25,24
srch_id,1.0,1.0,1.0,1.0,1.0
site_id,12.0,12.0,12.0,12.0,12.0
visitor_location_country_id,187.0,187.0,187.0,187.0,187.0
visitor_hist_starrating,-999.0,-999.0,-999.0,-999.0,-999.0
visitor_hist_adr_usd,-999.0,-999.0,-999.0,-999.0,-999.0
prop_country_id,219.0,219.0,219.0,219.0,219.0
prop_id,893.0,122844.0,114766.0,111106.0,111000.0
prop_starrating,3.0,3.0,2.0,3.0,3.0
prop_review_score,3.5,4.5,3.5,2.5,4.5
prop_brand_bool,1.0,1.0,1.0,1.0,1.0


In [20]:
# get the frequencies for srch_id to set 'group' parameter in lightGBM Dataset
train_group = train_set.groupby(['srch_id']).size().values
test_group = test_set.groupby(['srch_id']).size().values

In [21]:
# make datasets for lightGBM
lgb_train = lgb.Dataset(train_set.as_matrix(),label=train_label, group=train_group)
lgb_val = lgb.Dataset(test_set.as_matrix(), label=test_label, group=test_group)

In [24]:
# set parameters for lightGBM

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'label_gain' : {0,1,5},
    'num_leaves': 10,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'verbose': 0
}
lgbm_model = lgb.train(params, train_set = lgb_train, valid_sets = lgb_val, verbose_eval=5)

[50]	valid_0's ndcg@1: 0.193858	valid_0's ndcg@2: 0.272023	valid_0's ndcg@3: 0.322372	valid_0's ndcg@4: 0.353073	valid_0's ndcg@5: 0.378942
[100]	valid_0's ndcg@1: 0.204103	valid_0's ndcg@2: 0.282562	valid_0's ndcg@3: 0.330409	valid_0's ndcg@4: 0.362676	valid_0's ndcg@5: 0.388313


In [281]:
# save model
lgbm_model.save_model('model.txt') 

In [27]:
# load model 
bst = lgb.Booster(model_file='model.txt') 
ypred = bst.predict(test_set)