In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import warnings
from ndcg_calc import ndcg_calc
warnings.filterwarnings('ignore')

In [None]:
# Read the cleaned training data into memory.
train_csv_path = 'processed_data_final/train_clean.csv'
train_orig = pd.read_csv(train_csv_path)

In [None]:
# Split the searches (by srch_id) into subsamples for: feature engineering
# (position modeling), cross-validation, single training, single validation.

# Fixed seed so every np.random.choice call below is reproducible.
np.random.seed(0)

all_srch_ids = train_orig['srch_id'].unique()
num_samples_pos = int(0.25 * len(all_srch_ids))    # 25%
num_samples_train = int(0.5 * len(all_srch_ids))   # 50%

# ids reserved for the position model; everything else is available for cross-validation
position_model_ids = np.random.choice(all_srch_ids, num_samples_pos, replace=False)
xval_ids = list(set(all_srch_ids)-set(position_model_ids))

# single train / validation split drawn from the cross-validation ids
training_ids = np.random.choice(xval_ids, num_samples_train, replace=False)
validation_ids = list(set(xval_ids)-set(training_ids))

# only half of the cross-validation ids are actually used, to keep the set small
xval_ids_train = np.random.choice(xval_ids, int(len(xval_ids) * 0.5), replace=False)

# Rows outside the reduced cross-validation set are kept aside for extra
# validation (and because the full set does not fit in memory).
# NOTE(review): this complement also contains the position_model_ids rows - confirm that is intended.
in_xval_rows = train_orig['srch_id'].isin(xval_ids_train)
xval_sample = train_orig[in_xval_rows]
untouched_sample = train_orig[~in_xval_rows]
#train_sample = train_orig[train_orig.srch_id.isin(training_ids)]
#validation_sample = train_orig[train_orig.srch_id.isin(validation_ids)]

In [None]:
# The full frame is no longer referenced below; drop the name so it can be garbage-collected.
del train_orig

In [None]:

# columns removed from the cross-validation frame in every case
cols_to_drop1 = ['booking_bool', 'click_bool', 'prop_desirability', 'position']

# low-importance features, removed to avoid memory errors
cols_to_drop2 = [
    'willingness_to_pay',
    'avg_prop_starrating',
    'price_diff_bins',
    'srch_query_affinity_bins',
    'star_diff_bins',
    'avg_prop_review_score',
    'price_diff_bool',
]

# Drop both lists from the cross-validation frame in a single pass; the
# untouched frame only loses 'prop_desirability' and 'position'.
xval_sample.drop(columns=cols_to_drop1 + cols_to_drop2, inplace=True)
untouched_sample.drop(columns=['prop_desirability', 'position'], inplace=True)
#train_sample.drop(cols_to_drop, axis=1, inplace = True)
#validation_sample.drop(cols_to_drop, axis=1, inplace = True)

In [None]:
# Order rows by search id, then split the target column off the feature frame.
xval_sample.sort_values('srch_id', inplace=True)
# pop() returns the column and removes it from the frame in one step
labels = xval_sample.pop('booked_clicked_combined')

In [None]:
# (rows, columns) of the cross-validation frame after the drops and label extraction above
xval_sample.shape

In [None]:
# function to break srch_id's into folds
def get_fold_ids(unique_srch_ids, num_folds=5):
    """Randomly partition search ids into ``num_folds`` disjoint folds.

    The ids are shuffled once with numpy's global random state and then cut
    into ``num_folds`` consecutive chunks. Every id ends up in exactly one
    fold and fold sizes differ by at most one element.

    The previous implementation recomputed the fold size from the shrinking
    remainder on each iteration, so later folds were progressively smaller
    and some ids were never assigned to any fold.

    Parameters
    ----------
    unique_srch_ids : array-like
        Distinct ids to distribute over the folds.
    num_folds : int, default 5
        Number of folds to create; must be at least 1.

    Returns
    -------
    dict
        Maps 'id_set0', 'id_set1', ... to a numpy array holding the ids of
        that fold.

    Raises
    ------
    ValueError
        If ``num_folds`` is smaller than 1.
    """
    if num_folds < 1:
        raise ValueError('num_folds must be a positive integer')

    # One shuffle followed by an even split guarantees disjoint folds that
    # together cover every id.
    shuffled_ids = np.random.permutation(np.asarray(unique_srch_ids))
    id_chunks = np.array_split(shuffled_ids, num_folds)
    return {'id_set' + str(i): chunk for i, chunk in enumerate(id_chunks)}

In [None]:
# split the cross-validation srch_ids into folds
num_folds = 3  # number of folds for cross-validation
fold_ids = get_fold_ids(xval_sample['srch_id'].unique(), num_folds=num_folds)
keys = list(fold_ids)  # fold names, in creation order

In [None]:


# values for the parameter sweep
num_iter_range = [2000, 1500, 1000, 500]
learning_rate_range = [0.3, 0.2, 0.1, 0.05]

# Settings shared by every run; num_iterations and learning_rate are filled
# in per grid point inside the loop.
base_params = {'task': 'train',
               'boosting_type': 'gbdt',
               'objective': 'lambdarank',
               'metric': 'ndcg',
               # Must be an ordered sequence (it was a set, whose order is not
               # guaranteed): entry i is the gain of label value i.
               # NOTE(review): assumes the labels are integers below len(label_gain) - confirm.
               'label_gain': [0, 1, 5],
               'ndcg_at': 38,
               'num_leaves': 10,
               'feature_fraction': 0.9,
               'bagging_fraction': 0.8,
               'bagging_freq': 1,
               'verbose': 0}

# validation nDCG@38 for every (num_iterations, learning_rate, fold) combination
scores = np.zeros((len(num_iter_range), len(learning_rate_range), num_folds))
# Trained boosters as a flat list in (i, j, k) loop order. A float numpy array
# cannot store Booster objects, and the later cells iterate over / index
# `models` as a flat sequence.
models = []
# number of training rows used by each run (was appended to without ever being created)
training_set_size = []

for i, num_iter in enumerate(num_iter_range):
    for j, rate in enumerate(learning_rate_range):
        for k in range(num_folds):
            params = dict(base_params, num_iterations=num_iter, learning_rate=rate)

            # fold k is the validation set, every other row is used for training
            valid_set_ids = fold_ids[keys[k]]
            is_valid_row = xval_sample.srch_id.isin(valid_set_ids)
            valid_set = xval_sample[is_valid_row]
            train_set = xval_sample[~is_valid_row]

            training_set_size.append(len(train_set))

            # labels are aligned with the feature rows through the shared index
            train_label = labels.loc[train_set.index]
            valid_label = labels.loc[valid_set.index]

            # Query group sizes, one entry per srch_id. groupby returns them in
            # sorted srch_id order, which matches the row order because
            # xval_sample was sorted by srch_id when the labels were extracted.
            train_group = train_set.groupby('srch_id').size().values
            valid_group = valid_set.groupby('srch_id').size().values

            # put into lightGBM format
            lgb_train = lgb.Dataset(train_set, label=train_label, group=train_group)
            lgb_val = lgb.Dataset(valid_set, label=valid_label, group=valid_group)

            # free memory
            del train_set, valid_set

            # train the lambdarank model with the current parameters
            # NOTE(review): `verbose_eval` is not accepted by LightGBM >= 4.0;
            # there, pass callbacks=[lgb.log_evaluation(200)] instead.
            lgbm_model = lgb.train(params, train_set=lgb_train, valid_sets=lgb_val,
                                   verbose_eval=200)
            models.append(lgbm_model)
            scores[i, j, k] = lgbm_model.best_score['valid_0']['ndcg@38']

            # free memory
            del lgb_train, lgb_val



In [None]:
print(scores)
# collapse the fold axis (last axis): mean and spread per parameter combination
mean_scores = scores.mean(axis=2)
std_scores = scores.std(axis=2)

In [None]:
# per-combination mean, then standard deviation, each printed on its own line(s)
print(mean_scores, std_scores, sep='\n')

In [None]:
# Heat map of the mean cross-validation score: rows are num_iterations,
# columns are learning_rate.
fig, ax = plt.subplots()
heatmap = ax.imshow(mean_scores, aspect='auto', cmap=plt.cm.coolwarm)
ax.set_xticks(range(len(learning_rate_range)))
ax.set_xticklabels(learning_rate_range)
ax.set_yticks(range(len(num_iter_range)))
ax.set_yticklabels(num_iter_range)
ax.set_xlabel('learning_rate')
ax.set_ylabel('num_iterations')
ax.set_title('hyperparameter grid search')
cb = fig.colorbar(heatmap, ax=ax)
cb.set_label('nDCG')

In [None]:
# Disabled line plot for a single-parameter sweep, kept as a string literal so
# it does not execute. It refers to `parameter_range`, which is not defined in
# the visible cells, so it would need updating before being re-enabled.
'''
x_str = 'Num_iterations'
plt.figure()
plt.plot(parameter_range, mean_scores,marker='*',linestyle='--')
plt.xlabel(x_str)
plt.ylabel('nDCG')
plt.title('hyperparameter tuning for lambdarank')
plt.grid(color='gray',alpha=0.3, linestyle='--')
#plt.xticks(np.arange(min(parameter_range), max(parameter_range)+100))
#plt.savefig('output/num_iter_vs_ndcg_train.pdf',bbox_inches='tight')
'''

## Some extra validation on the untouched portion

In [None]:
# extra check on the portion of the set that was not used in cross-validation at all
# columns ndcg_calc receives together with the predictions
ndcg_inputs = untouched_sample[['srch_id','click_bool','booking_bool']]
test_scores = np.zeros(len(models))
for i, model in enumerate(models):
    # untouched_sample still holds the label columns and the features that were
    # dropped from the training frame, so hand the model exactly the columns it
    # was trained on, in training order.
    prediction = model.predict(untouched_sample[model.feature_name()])
    test_score = ndcg_calc(ndcg_inputs, prediction)
    test_scores[i] = test_score
    print('Model {0:d} score is {1:.8f}'.format(i, test_score))

In [None]:
# arrange the flat per-model scores as (num_iterations, learning_rate, fold)
grid_shape = (len(num_iter_range), len(learning_rate_range), num_folds)
test_scores = np.reshape(test_scores, grid_shape)
test_scores

In [None]:
# average over the last axis (axis 2) of test_scores
mean_test_scores = test_scores.mean(axis=2)
mean_test_scores

In [None]:
# Heat map of the mean score on the untouched sample: rows are num_iterations,
# columns are learning_rate.
plt.figure()
# Plot the untouched-sample means computed just above; this cell previously
# re-plotted the cross-validation `mean_scores`.
plt.imshow(mean_test_scores, aspect='auto', cmap=plt.cm.viridis_r)
plt.xticks(range(len(learning_rate_range)),learning_rate_range)
plt.yticks(range(len(num_iter_range)), num_iter_range)
plt.xlabel('learning_rate')
plt.ylabel('num_iterations')
plt.title('hyperparameter grid search')
cb = plt.colorbar()
cb.set_label('nDCG')

In [None]:
# Disabled line plot of the untouched-sample scores for a single-parameter
# sweep, kept as a string literal so it does not execute. It refers to
# `parameter_range` and `x_str`, which are not defined in the visible live code.
#mean_test_scores = np.mean(test_scores, axis=1)
'''
plt.figure()
plt.plot(parameter_range, mean_test_scores, marker='*', linestyle='--')
plt.xlabel(x_str)
plt.ylabel('nDCG')
plt.title('Num_iterations vs. validation set scores')
plt.grid(color='gray',alpha=0.3, linestyle='--')
#plt.xticks(np.arange(min(parameter_range), max(parameter_range)+100,))
plt.savefig("output/num_iterations_vs_ndcg_test.pdf", bbox_inches='tight')
'''

In [None]:
# If results are good save the models into file
# Each model is written to its own file, numbered by its position in `models`.
# The loop used to save `lgbm_model` (the last model trained) on every pass,
# under a name built from the stale grid-search counters `i` and `k`, so one
# file was overwritten again and again.
for model_idx, model in enumerate(models):
    name_str = 'lgmb_models/model' + str(model_idx) + '.txt'
    model.save_model(name_str)

In [None]:
# which model to inspect; -1 selects the last entry of `models`
model_ind = -1
lgb.plot_importance(booster=models[model_ind], figsize=(9, 12), height=1)

In [None]:

# Importance of each feature for the selected model, scaled by the number of
# boosting iterations and then normalized so the values sum to 1.
selected_model = models[model_ind]
feature_importances = selected_model.feature_importance() / selected_model.current_iteration()
feature_importances_normalized = feature_importances / sum(feature_importances)

# (feature name, normalized importance) pairs, most important first
feature_names_importances = sorted(
    dict(zip(selected_model.feature_name(), feature_importances_normalized)).items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
feature_names_importances