In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import copy
import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.impute import SimpleImputer
import math

In [2]:
def replace_nan_with_mean(df,cols):
    """
    Completing missing numerical values (NaN) from columns. NaN values are replaced by the column's mean
    Args:
        @string_cols:  (list of column indexes) string columns to complete
    """
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

    imp_mean = imp_mean.fit(df[cols])

    df[cols] = imp_mean.transform(df[cols])

    return df

In [3]:
def replace_nan_with_median(df,cols):
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')

    imp_mean = imp_mean.fit(df[cols])

    df[cols] = imp_mean.transform(df[cols])

    return df

In [4]:
def replace_nan_with_value(df,col,value):
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=value)

    imp_mean = imp_mean.fit(df[[col]])

    df[col] = imp_mean.transform(df[[col]])

    return df

In [6]:
# !kaggle competitions download -c vu-dmt-assigment-2-2023
training_data = pd.read_csv('vu-dmt-assigment-2-2023/training_set_VU_DM.csv')
test_data = pd.read_csv('vu-dmt-assigment-2-2023/test_set_VU_DM.csv')

In [None]:
#create new column for NDCG evaluation
training_data['target_col'] = (5* training_data['booking_bool']) + training_data['click_bool']

In [None]:
#Create one location score column
training_data['prop_loc_score1_norm'] = training_data['prop_location_score1'] / training_data['prop_location_score1'].abs().max()
training_data['prop_location_score2'].fillna(training_data['prop_loc_score1_norm'], inplace=True)
training_data['location_score'] = (training_data['prop_loc_score1_norm'] + training_data['prop_location_score2'])/2
training_data = training_data.drop('prop_loc_score1_norm', axis=1)
# training_data['location_score'] = ((training_data['prop_location_score1'] + training_data['prop_location_score2']) / 2)

In [None]:
#Count how often a prop_id shows up
z = training_data['prop_id'].value_counts() 
z1 = z.to_dict() #converts to dictionary
training_data['Count_Column'] = training_data['prop_id'].map(z1) 

In [None]:
# Get data for whether the hotel is more expensive than usual
training_data['diff_price'] = ((training_data['price_usd']) - (np.exp(training_data['prop_log_historical_price'])))

In [None]:
training_data.to_csv("training_set_TRANS.csv", index=False)

Run from here for data

In [None]:
test_data = pd.read_csv('vu-dmt-assigment-2-2023/test_set_VU_DM.csv')

In [7]:
training_data = pd.read_csv("training_set_TRANS.csv")

In [8]:
gc.collect()

34

In [9]:
aux_training = copy.copy(training_data)

In [10]:
aux_training

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,target_col,location_score,Count_Column,diff_price
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,0.0,0.0,,0,,0,0,0.224622,612,-36.404964
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,0.0,0.0,,0,,0,0,0.165043,583,17.806987
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,0.0,0.0,,0,,0,0,0.169843,551,42.797387
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,-1.0,0.0,5.0,0,,0,0,0.208972,460,522.129581
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,0.0,0.0,,0,,0,0,0.251162,665,5.200488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,2013-06-30 19:55:18,5,219,,,219,77700,3,4.0,...,,,,0,,0,0,0.138880,47,117.000000
4958343,332785,2013-06-30 19:55:18,5,219,,,219,88083,3,4.0,...,,,,0,,0,0,0.215685,38,88.000000
4958344,332785,2013-06-30 19:55:18,5,219,,,219,94508,3,3.5,...,,,,0,,0,0,0.086997,43,98.000000
4958345,332785,2013-06-30 19:55:18,5,219,,,219,128360,3,5.0,...,,,,1,157.84,1,6,0.172785,37,138.000000


In [11]:
aux_training = replace_nan_with_mean(aux_training,["visitor_hist_starrating","visitor_hist_adr_usd"])
aux_training = replace_nan_with_mean(aux_training,["diff_price"])
aux_training = replace_nan_with_mean(aux_training,['comp1_rate_percent_diff','comp1_rate','comp2_rate_percent_diff','comp2_rate',
                                 'comp3_rate_percent_diff','comp3_rate','comp4_rate_percent_diff','comp4_rate',
                                 'comp5_rate_percent_diff','comp5_rate','comp6_rate_percent_diff','comp6_rate',
                                 'comp7_rate_percent_diff','comp7_rate','comp8_rate_percent_diff','comp8_rate'])
aux_training = replace_nan_with_median(aux_training,["orig_destination_distance"])

col_min = aux_training["srch_query_affinity_score"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'srch_query_affinity_score',col_min)

col_min = aux_training["prop_starrating"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'prop_starrating',col_min)

col_min = aux_training["prop_review_score"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'prop_review_score',col_min)

col_min = aux_training["location_score"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'location_score',col_min)

In [12]:
aux_training['comp_rate_percent_diff'] = (((aux_training['comp1_rate_percent_diff'] * aux_training['comp1_rate']) +
                                            (aux_training['comp2_rate_percent_diff'] * aux_training['comp2_rate']) +
                                            (aux_training['comp3_rate_percent_diff'] * aux_training['comp3_rate']) +
                                            (aux_training['comp4_rate_percent_diff'] * aux_training['comp4_rate']) + 
                                            (aux_training['comp5_rate_percent_diff'] * aux_training['comp5_rate']) +
                                            (aux_training['comp6_rate_percent_diff'] * aux_training['comp6_rate']) +
                                            (aux_training['comp7_rate_percent_diff'] * aux_training['comp7_rate']) +
                                            (aux_training['comp8_rate_percent_diff'] * aux_training['comp8_rate'])) / 8)

In [13]:
aux_training

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,target_col,location_score,Count_Column,diff_price,comp_rate_percent_diff
0,1,2013-04-04 08:32:15,12,187,3.374334,176.022659,219,893,3,3.5,...,0.0,22.430384,0,,0,0,0.224622,612,-36.404964,14.916091
1,1,2013-04-04 08:32:15,12,187,3.374334,176.022659,219,10404,4,4.0,...,0.0,22.430384,0,,0,0,0.165043,583,17.806987,15.018229
2,1,2013-04-04 08:32:15,12,187,3.374334,176.022659,219,21315,3,4.5,...,0.0,22.430384,0,,0,0,0.169843,551,42.797387,14.916091
3,1,2013-04-04 08:32:15,12,187,3.374334,176.022659,219,27348,2,4.0,...,0.0,5.000000,0,,0,0,0.208972,460,522.129581,13.041091
4,1,2013-04-04 08:32:15,12,187,3.374334,176.022659,219,29604,4,3.5,...,0.0,22.430384,0,,0,0,0.251162,665,5.200488,14.916091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,2013-06-30 19:55:18,5,219,3.374334,176.022659,219,77700,3,4.0,...,,22.430384,0,,0,0,0.138880,47,117.000000,14.990604
4958343,332785,2013-06-30 19:55:18,5,219,3.374334,176.022659,219,88083,3,4.0,...,,22.430384,0,,0,0,0.215685,38,88.000000,14.990604
4958344,332785,2013-06-30 19:55:18,5,219,3.374334,176.022659,219,94508,3,3.5,...,,22.430384,0,,0,0,0.086997,43,98.000000,14.990604
4958345,332785,2013-06-30 19:55:18,5,219,3.374334,176.022659,219,128360,3,5.0,...,,22.430384,1,157.84,1,6,0.172785,37,138.000000,14.990604


In [14]:
aux_training = aux_training.drop('click_bool', axis=1)   #replaced by target_col and not in test set 
aux_training = aux_training.drop('booking_bool', axis=1) #replaced by target_col and not in test set 
aux_training = aux_training.drop('position', axis=1)     # not in test set 
aux_training = aux_training.drop('gross_bookings_usd', axis=1)
aux_training = aux_training.drop('prop_location_score1', axis=1)
aux_training = aux_training.drop('prop_location_score2', axis=1)
aux_training = aux_training.drop(['date_time'], axis=1)
aux_training = aux_training.drop(['comp1_rate_percent_diff','comp1_rate','comp2_rate_percent_diff','comp2_rate',
                                 'comp3_rate_percent_diff','comp3_rate','comp4_rate_percent_diff','comp4_rate',
                                 'comp5_rate_percent_diff','comp5_rate','comp6_rate_percent_diff','comp6_rate',
                                 'comp7_rate_percent_diff','comp7_rate','comp8_rate_percent_diff','comp8_rate'], axis=1)

In [15]:
print("Empty values per column: \n",aux_training.isnull().sum())

Empty values per column: 
 srch_id                              0
site_id                              0
visitor_location_country_id          0
visitor_hist_starrating              0
visitor_hist_adr_usd                 0
prop_country_id                      0
prop_id                              0
prop_starrating                      0
prop_review_score                    0
prop_brand_bool                      0
prop_log_historical_price            0
price_usd                            0
promotion_flag                       0
srch_destination_id                  0
srch_length_of_stay                  0
srch_booking_window                  0
srch_adults_count                    0
srch_children_count                  0
srch_room_count                      0
srch_saturday_night_bool             0
srch_query_affinity_score            0
orig_destination_distance            0
random_bool                          0
comp1_inv                      4828788
comp2_inv                      282807

In [16]:
#Separate features from target
feature = aux_training.drop('target_col', axis=1)
target  = aux_training['target_col']

In [17]:
X_train, X_val, y_train, y_val = train_test_split(feature, target, test_size=0.3, random_state=1)

In [18]:
print("X_train.shape = ",X_train.shape)
print("X_val.shape = ",X_val.shape)
print("\ny_train.shape = ",y_train.shape)
print("y_val.shape = ",y_val.shape)

X_train.shape =  (3470842, 35)
X_val.shape =  (1487505, 35)

y_train.shape =  (3470842,)
y_val.shape =  (1487505,)


In [19]:
X_train = X_train.sort_index()
y_train = y_train.sort_index()

X_val = X_val.sort_index()
y_val = y_val.sort_index()

In [20]:
# Get the number of rows of each query (srch_id) needed to train the model.
qids_train = X_train.groupby("srch_id")["srch_id"].count().to_numpy() 
qids_val   = X_val.groupby("srch_id")["srch_id"].count().to_numpy()

In [21]:
print("query train ids mean: ",qids_train.mean()) 
print("query val   ids mean: ",qids_val.mean()) 

query train ids mean:  17.37427729027026
query val   ids mean:  7.532433664168523


In [22]:
FIXED_PARAMS={
              'objective'             : 'rank_xendcg', #supposed to be faster than 'lambdarank'
              'metric'                : 'ndcg',
              'boosting_type'         : 'dart',
              'is_unbalance'          : True,
              'num_iterations'        : 100,
              'early_stopping_rounds' : 10}

SEARCH_PARAMS = {'learning_rate': 0.15,
                 'max_depth': 15,
                 'num_leaves': 25,
                 'feature_fraction': 0.8,
                 'subsample': 0.2,
                 'n_estimators' : 100}

gbm = lgb.LGBMRanker( 
        objective               = FIXED_PARAMS["objective"],
        metric                  = FIXED_PARAMS["metric"],
        boosting_type           = FIXED_PARAMS["boosting_type"],
        is_unbalance            = FIXED_PARAMS["is_unbalance"],
        num_iterations          = FIXED_PARAMS["num_iterations"],
        early_stopping_rounds   = FIXED_PARAMS["early_stopping_rounds"],
        learning_rate           = SEARCH_PARAMS["learning_rate"],
        max_depth               = SEARCH_PARAMS["max_depth"],
        num_leaves              = SEARCH_PARAMS["num_leaves"],
        feature_fraction        = SEARCH_PARAMS["feature_fraction"],
        subsample               = SEARCH_PARAMS["subsample"],
        n_estimators            = SEARCH_PARAMS["n_estimators"],
)

In [23]:
gbm.fit(X_train, 
        y_train, 
        group=qids_train,
        eval_set=[(X_val, y_val)], 
        eval_group=[qids_val],
        eval_at=[1,2,3,4,5]
)



[1]	valid_0's ndcg@1: 0.763295	valid_0's ndcg@2: 0.800467	valid_0's ndcg@3: 0.822309	valid_0's ndcg@4: 0.836886	valid_0's ndcg@5: 0.846911




[2]	valid_0's ndcg@1: 0.766891	valid_0's ndcg@2: 0.804176	valid_0's ndcg@3: 0.826099	valid_0's ndcg@4: 0.840357	valid_0's ndcg@5: 0.850221
[3]	valid_0's ndcg@1: 0.769353	valid_0's ndcg@2: 0.807342	valid_0's ndcg@3: 0.828853	valid_0's ndcg@4: 0.843122	valid_0's ndcg@5: 0.853112
[4]	valid_0's ndcg@1: 0.771364	valid_0's ndcg@2: 0.80903	valid_0's ndcg@3: 0.830847	valid_0's ndcg@4: 0.845107	valid_0's ndcg@5: 0.854792
[5]	valid_0's ndcg@1: 0.77235	valid_0's ndcg@2: 0.810248	valid_0's ndcg@3: 0.832102	valid_0's ndcg@4: 0.846331	valid_0's ndcg@5: 0.856051
[6]	valid_0's ndcg@1: 0.773257	valid_0's ndcg@2: 0.811399	valid_0's ndcg@3: 0.833042	valid_0's ndcg@4: 0.847085	valid_0's ndcg@5: 0.856892
[7]	valid_0's ndcg@1: 0.774072	valid_0's ndcg@2: 0.812273	valid_0's ndcg@3: 0.833804	valid_0's ndcg@4: 0.847781	valid_0's ndcg@5: 0.857548
[8]	valid_0's ndcg@1: 0.774082	valid_0's ndcg@2: 0.812364	valid_0's ndcg@3: 0.833811	valid_0's ndcg@4: 0.847932	valid_0's ndcg@5: 0.857619
[9]	valid_0's ndcg@1: 0.77493

[61]	valid_0's ndcg@1: 0.78347	valid_0's ndcg@2: 0.822693	valid_0's ndcg@3: 0.843917	valid_0's ndcg@4: 0.857221	valid_0's ndcg@5: 0.866335
[62]	valid_0's ndcg@1: 0.783536	valid_0's ndcg@2: 0.822737	valid_0's ndcg@3: 0.843936	valid_0's ndcg@4: 0.857266	valid_0's ndcg@5: 0.8664
[63]	valid_0's ndcg@1: 0.78384	valid_0's ndcg@2: 0.822922	valid_0's ndcg@3: 0.844135	valid_0's ndcg@4: 0.857387	valid_0's ndcg@5: 0.866551
[64]	valid_0's ndcg@1: 0.783855	valid_0's ndcg@2: 0.822894	valid_0's ndcg@3: 0.844063	valid_0's ndcg@4: 0.857389	valid_0's ndcg@5: 0.86654
[65]	valid_0's ndcg@1: 0.783845	valid_0's ndcg@2: 0.822846	valid_0's ndcg@3: 0.84401	valid_0's ndcg@4: 0.857335	valid_0's ndcg@5: 0.866506
[66]	valid_0's ndcg@1: 0.783982	valid_0's ndcg@2: 0.822853	valid_0's ndcg@3: 0.844051	valid_0's ndcg@4: 0.857354	valid_0's ndcg@5: 0.866532
[67]	valid_0's ndcg@1: 0.783876	valid_0's ndcg@2: 0.822786	valid_0's ndcg@3: 0.844048	valid_0's ndcg@4: 0.857427	valid_0's ndcg@5: 0.866489
[68]	valid_0's ndcg@1: 0.7

LGBMRanker(boosting_type='dart', early_stopping_rounds=10, feature_fraction=0.8,
           is_unbalance=True, learning_rate=0.15, max_depth=15, metric='ndcg',
           num_iterations=100, num_leaves=25, objective='rank_xendcg',
           subsample=0.2)

In [24]:
test_pred = gbm.predict(X_val)

print(test_pred)

[-0.04850601 -0.24622072  0.21122961 ...  0.04796483  0.07546289
 -0.23179927]


In [25]:
X_val["predicted_ranking"] = test_pred

In [26]:
print("--> NDCG : ", ndcg_score([y_val],[X_val["predicted_ranking"]]))

--> NDCG :  0.7681402432767066


In [27]:
del training_data, aux_training
gc.collect()

2

In [28]:
#Feature eng for test data
z = test_data['prop_id'].value_counts() 
z1 = z.to_dict() #converts to dictionary
test_data['Count_Column'] = test_data['prop_id'].map(z1)

#Create one location score column
test_data['prop_loc_score1_norm'] = test_data['prop_location_score1'] / test_data['prop_location_score1'].abs().max()
test_data['prop_location_score2'].fillna(test_data['prop_loc_score1_norm'], inplace=True)
test_data['location_score'] = (test_data['prop_loc_score1_norm'] + test_data['prop_location_score2'])/2
test_data = test_data.drop('prop_loc_score1_norm', axis=1)
# test_data['location_score'] = ((test_data['prop_location_score1'] + test_data['prop_location_score2']) / 2)

In [29]:
test_data['diff_price'] = ((test_data['price_usd']) - (np.exp(test_data['prop_log_historical_price'])))

In [30]:
test_data = replace_nan_with_mean(test_data,["visitor_hist_starrating","visitor_hist_adr_usd","diff_price"])
test_data = replace_nan_with_mean(test_data,['comp1_rate_percent_diff','comp1_rate','comp2_rate_percent_diff','comp2_rate',
                                 'comp3_rate_percent_diff','comp3_rate','comp4_rate_percent_diff','comp4_rate',
                                 'comp5_rate_percent_diff','comp5_rate','comp6_rate_percent_diff','comp6_rate',
                                 'comp7_rate_percent_diff','comp7_rate','comp8_rate_percent_diff','comp8_rate'])
test_data = replace_nan_with_median(test_data,["orig_destination_distance"])

col_min = test_data["srch_query_affinity_score"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'srch_query_affinity_score',col_min)

col_min = test_data["prop_starrating"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'prop_starrating',col_min)

col_min = test_data["prop_review_score"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'prop_review_score',col_min)

col_min = test_data["location_score"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'location_score',col_min)

In [31]:
#Average competition data
#Get whether diff with competition was pos/neg, then get average diff 
test_data['comp_rate_percent_diff'] = (((test_data['comp1_rate_percent_diff'] * test_data['comp1_rate']) +
                                            (test_data['comp2_rate_percent_diff'] * test_data['comp2_rate']) +
                                            (test_data['comp3_rate_percent_diff'] * test_data['comp3_rate']) +
                                            (test_data['comp4_rate_percent_diff'] * test_data['comp4_rate']) + 
                                            (test_data['comp5_rate_percent_diff'] * test_data['comp5_rate']) +
                                            (test_data['comp6_rate_percent_diff'] * test_data['comp6_rate']) +
                                            (test_data['comp7_rate_percent_diff'] * test_data['comp7_rate']) +
                                            (test_data['comp8_rate_percent_diff'] * test_data['comp8_rate'])) / 8)

In [32]:
test_data = test_data.drop(['date_time'], axis=1)
test_data = test_data.drop(['prop_location_score1'], axis=1)
test_data = test_data.drop(['prop_location_score2'], axis=1)
test_data = test_data.drop(['comp1_rate_percent_diff','comp1_rate','comp2_rate_percent_diff','comp2_rate',
                                 'comp3_rate_percent_diff','comp3_rate','comp4_rate_percent_diff','comp4_rate',
                                 'comp5_rate_percent_diff','comp5_rate','comp6_rate_percent_diff','comp6_rate',
                                 'comp7_rate_percent_diff','comp7_rate','comp8_rate_percent_diff','comp8_rate'], axis=1)

In [33]:
gc.collect()

63

In [34]:
test_data

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,comp3_inv,comp4_inv,comp5_inv,comp6_inv,comp7_inv,comp8_inv,Count_Column,location_score,diff_price,comp_rate_percent_diff
0,1,24,216,3.374933,177.15073,219,3180,3,4.5,1,...,,,,,,,185,0.245152,-33.933013,15.310530
1,1,24,216,3.374933,177.15073,219,5543,3,4.5,1,...,,,,,,,252,0.231262,-20.379512,15.188376
2,1,24,216,3.374933,177.15073,219,14142,2,3.5,1,...,,,0.0,,,,150,0.221926,-15.071523,16.361767
3,1,24,216,3.374933,177.15073,219,22393,3,4.5,1,...,,,0.0,,,,147,0.199970,-9.933013,15.111767
4,1,24,216,3.374933,177.15073,219,24194,3,4.5,1,...,,,0.0,,,,214,0.315102,-33.168253,15.111767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4959178,332787,24,216,3.374933,177.15073,117,32019,4,3.5,0,...,0.0,0.0,0.0,,,,13,0.205200,-26.688561,33.342988
4959179,332787,24,216,3.374933,177.15073,117,33959,4,3.0,1,...,0.0,0.0,0.0,,,,13,0.324793,-13.540419,18.092988
4959180,332787,24,216,3.374933,177.15073,117,35240,4,0.0,0,...,0.0,0.0,0.0,,,,2,0.256447,-29.634348,22.217988
4959181,332787,24,216,3.374933,177.15073,117,94437,4,0.0,0,...,0.0,0.0,0.0,,,,7,0.257002,-37.474348,23.092988


In [35]:
test_pred = gbm.predict(test_data)

In [36]:
test_pred

array([ 0.19057166,  0.18946773,  0.18960132, ...,  0.03564552,
       -0.01975265,  0.55385819])

In [37]:
test_data["predicted_ranking"] = test_pred
test_data.head(15)

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,comp4_inv,comp5_inv,comp6_inv,comp7_inv,comp8_inv,Count_Column,location_score,diff_price,comp_rate_percent_diff,predicted_ranking
0,1,24,216,3.374933,177.15073,219,3180,3,4.5,1,...,,,,,,185,0.245152,-33.933013,15.31053,0.190572
1,1,24,216,3.374933,177.15073,219,5543,3,4.5,1,...,,,,,,252,0.231262,-20.379512,15.188376,0.189468
2,1,24,216,3.374933,177.15073,219,14142,2,3.5,1,...,,0.0,,,,150,0.221926,-15.071523,16.361767,0.189601
3,1,24,216,3.374933,177.15073,219,22393,3,4.5,1,...,,0.0,,,,147,0.19997,-9.933013,15.111767,0.041115
4,1,24,216,3.374933,177.15073,219,24194,3,4.5,1,...,,0.0,,,,214,0.315102,-33.168253,15.111767,0.463463
5,1,24,216,3.374933,177.15073,219,28181,3,4.5,1,...,,,,,,381,0.256056,-23.620073,15.31053,0.396112
6,1,24,216,3.374933,177.15073,219,34263,3,4.5,1,...,,0.0,,,,202,0.286347,-23.514064,15.111767,0.450557
7,1,24,216,3.374933,177.15073,219,37567,2,4.5,0,...,,0.0,,,,175,0.209622,-14.95654,19.861767,0.520495
8,1,24,216,3.374933,177.15073,219,50162,2,3.5,1,...,,0.0,,,,293,0.197993,-28.267134,13.861767,0.440089
9,1,24,216,3.374933,177.15073,219,54937,3,4.0,1,...,,0.0,,,,403,0.231447,-32.284285,15.233922,0.765054


In [38]:
test_data = test_data.sort_values(["srch_id","predicted_ranking"], ascending=[True,False])

In [39]:
test_data.drop(test_data.columns.difference(['srch_id','prop_id']), 1, inplace=True)
test_data.head()

  test_data.drop(test_data.columns.difference(['srch_id','prop_id']), 1, inplace=True)


Unnamed: 0,srch_id,prop_id
23,1,99484
9,1,54937
12,1,61934
20,1,90385
7,1,37567


In [40]:
# Create output file for competition
test_data.to_csv("output.csv", index=False) 