In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

In [3]:
# Load the training data from 'train_clean.csv' in the working directory.
train_csv_path = 'train_clean.csv'
train = pd.read_csv(train_csv_path)

In [4]:
# Remove all target columns except for 'booking_bool'.
# Rebinding `train` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises
# KeyError because the columns were already removed the first time.
train = train.drop(columns=['click_bool', 'booked_clicked_combined'], errors='ignore')

In [4]:
# Create samples that always contain all rows of a srch_id: whole searches are
# drawn, never single rows, and the train and test samples share no srch_id.
SAMPLE_SEED = 42        # fixed seed so the same searches are drawn on every run
SAMPLE_FRACTION = 0.05  # share of all srch_ids that goes into each sample
sample_rng = np.random.RandomState(SAMPLE_SEED)

# Not needed for the sampling below; retained in case other cells use it.
srch_id_groups = train.groupby('srch_id')
all_srch_ids = train.srch_id.unique()
num_samples = int(SAMPLE_FRACTION * len(all_srch_ids))
train_sample_ids = sample_rng.choice(all_srch_ids, num_samples, replace=False)
# setdiff1d returns the leftover ids sorted, so the pool the test ids are
# drawn from has a fixed order (iterating a set gives no such guarantee).
remaining_ids = list(np.setdiff1d(all_srch_ids, train_sample_ids))
test_sample_ids = sample_rng.choice(remaining_ids, num_samples, replace=False)

# One boolean mask per sample replaces a get_group() call per srch_id plus a
# concat; the selected rows keep the order they have in `train`.
train_sample = train[train.srch_id.isin(train_sample_ids)]
test_sample = train[train.srch_id.isin(test_sample_ids)]

In [5]:
# `train` is deliberately NOT deleted here: the correlation cells further down
# still read it, so removing the name makes a top-to-bottom run fail with
# NameError. Deleting it would not release memory anyway while the
# `srch_id_groups` groupby object still references the frame.

In [None]:
# Mutual information between every feature and 'position'.
# The estimator adds a small amount of random noise to continuous features,
# so random_state is fixed to obtain the same scores on every run.
# Missing values are encoded as -1 before the features are passed in.
mi_features = train_sample.drop(columns=['position', 'booking_bool']).fillna(-1)
mi_position = mutual_info_regression(mi_features, train_sample.position, random_state=42)

In [11]:
# Correlation of every remaining column with 'position'
# ('booking_bool' is removed before the correlation matrix is computed).
position_corr_matrix = train.drop(columns=['booking_bool']).corr()
corr_position = position_corr_matrix['position']

In [6]:
# Correlation of every column of `train` with the 'booking_bool' target.
booking_corr_matrix = train.corr()
corr_booking = booking_corr_matrix['booking_bool']

In [9]:
# One-column frame of the booking correlations, indexed by the columns of `train`.
booking_corr_data = {'corr_booking': corr_booking}
df_booking_corr = pd.DataFrame(data=booking_corr_data, index=train.columns)

In [10]:
# Rank the columns from highest to lowest correlation with 'booking_bool'.
df_booking_corr.sort_values('corr_booking', ascending=False)

Unnamed: 0,corr_booking
booking_bool,1.000000
prop_desirability,0.225748
prop_desirability_incomplete,0.218809
prop_location_score2_norm_srch_destination_id,0.074029
prop_location_score2_norm_srch_id,0.073785
prop_location_score2_norm_site_id,0.037198
prop_location_score2,0.036707
promotion_flag,0.036047
prop_review_score_norm_srch_destination_id,0.033439
prop_review_score_norm_srch_id,0.031681


In [15]:
# Side-by-side table of mutual information and correlation with 'position'.
# The correlation column is named 'correlation' because that is the name the
# filter cell below reads; under the previous key 'corr_position' that
# lookup failed with AttributeError.
# `mi_position` carries no labels, so it is matched to the correlation index
# by position -- this assumes both were computed over the same columns in the
# same order.
position_corr = corr_position.drop('position', axis=0)
df = pd.DataFrame({'mutual_info': mi_position, 'correlation': position_corr})

In [27]:
# Keep only the features whose absolute correlation exceeds the threshold.
correlation_threshold = 0.02
df.loc[df['correlation'].abs() > correlation_threshold]

['site_id','prop_country_id','prop_starrating','prop_review_score','prop_brand_bool','prop_location_score2','price_usd','promotion_flag','srch_children_count','srch_room_count','visitor_hist_starrating_filled','star_diff_filled','prop_review_score_norm_srch_destination_id','price_usd_norm_srch_id','price_diff_bins','price_usd_norm_srch_destination_id','prop_location_score1_norm_srch_destination_id','prop_location_score2_norm_srch_destination_id','prop_review_score_norm_srch_id','prop_location_score1_norm_srch_id','prop_location_score2_norm_srch_id','prop_location_score_combined','price_usd_norm_site_id','prop_location_score2_norm_site_id','prop_review_score_norm_site_id','avg_prop_price_usd','avg_prop_location_score2','avg_prop_location_score_combined','avg_prop_review_score','avg_prop_starrating']

Unnamed: 0,correlation,mutual_info
site_id,-0.031194,0.005889
prop_country_id,0.02985,0.001811
prop_starrating,-0.107717,0.006678
prop_review_score,-0.053878,0.003331
prop_brand_bool,0.023419,0.001835
prop_location_score2,-0.100845,0.020509
price_usd,-0.037268,0.003949
promotion_flag,-0.103621,0.006605
srch_children_count,-0.022136,0.001897
srch_room_count,-0.024657,0.0


In [7]:
# Pair each feature name with its mutual-information score for 'position'.
mi_feature_names = train_sample.columns.drop(['position', 'booking_bool'])
mi_df = pd.DataFrame({'feature': mi_feature_names, 'mutual_info': mi_position})

In [8]:
# Display the per-feature mutual information table built in the previous cell.
mi_df

Unnamed: 0,feature,mutual_info
0,srch_id,0.000000
1,site_id,0.005889
2,visitor_location_country_id,0.000000
3,visitor_hist_starrating,0.000000
4,visitor_hist_adr_usd,0.000000
5,prop_country_id,0.001811
6,prop_id,0.045743
7,prop_starrating,0.006678
8,prop_review_score,0.003331
9,prop_brand_bool,0.001835


In [24]:
# Compare correlations of original and normalized features with 'booking_bool'.
def _corr_with_booking(columns):
    """Correlate each of `columns` with 'booking_bool' and return the values as an array."""
    corr_matrix = train[columns + ['booking_bool']].corr()
    return corr_matrix['booking_bool'].drop('booking_bool', axis=0).values


features = ['price_usd', 'prop_location_score1', 'prop_location_score2', 'prop_review_score']
norm_features1 = [name + '_norm_srch_id' for name in features]
norm_features2 = [name + '_norm_site_id' for name in features]
norm_features3 = [name + '_norm_srch_destination_id' for name in features]

corr = _corr_with_booking(features)
corr_norm1 = _corr_with_booking(norm_features1)
corr_norm2 = _corr_with_booking(norm_features2)
corr_norm3 = _corr_with_booking(norm_features3)

# The helper returns bare arrays, so every column lines up positionally with
# the original feature names that are used as the row index.
comparison_columns = {
    'corr_original': corr,
    'corr_norm_srch_id': corr_norm1,
    'corr_norm_srch_site_id': corr_norm2,
    'corr_norm_srch_dest_id': corr_norm3,
}
pd.DataFrame(comparison_columns, index=features)

Unnamed: 0,corr_norm_srch_dest_id,corr_norm_srch_id,corr_norm_srch_site_id,corr_original
price_usd,-0.025503,-0.034496,-0.032187,-0.03231
prop_location_score1,0.020557,0.02217,-0.002779,-0.003273
prop_location_score2,0.074029,0.073785,0.037198,0.036707
prop_review_score,0.033439,0.031681,0.025356,0.025936
