In [1]:
import numpy as np
import pandas as pd
import dateparser
import sklearn
import seaborn as sns
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score

In [None]:
# Load the raw training set from the competition data folder.
train = pd.read_csv(filepath_or_buffer='Data Mining VU data/training_set_VU_DM_2014.csv')

In [None]:
# Load the raw test set from the competition data folder.
test = pd.read_csv(filepath_or_buffer='Data Mining VU data/test_set_VU_DM_2014.csv')

In [4]:
# Reproducible 50/50 split of the training rows: a random half goes to train_sample,
# the remaining rows (in their original order) form a hold-out called test_sample.
train_sample = train.sample(frac=0.5, random_state=0)
held_out_rows = ~train.index.isin(train_sample.index)
test_sample = train.loc[held_out_rows]

In [None]:
# replace missing review score by the worst value 0
# The result is assigned back to the frame: calling fillna(inplace=True) on an
# attribute-accessed column is chained assignment, which does not update the frame
# under pandas Copy-on-Write.
train['prop_review_score'] = train['prop_review_score'].fillna(0)
test['prop_review_score'] = test['prop_review_score'].fillna(0)

In [None]:
# mean location_score2 per srch_destination
# Select the column and use the built-in mean aggregation instead of apply + lambda;
# the result is still a Series indexed by srch_destination_id.
destination_groups_train = train.groupby(['srch_destination_id'])
destination_means_train = destination_groups_train['prop_location_score2'].mean()
destination_groups_test = test.groupby(['srch_destination_id'])
destination_means_test = destination_groups_test['prop_location_score2'].mean()

In [None]:
# mean location_score2 per prop_country
# Select the column and use the built-in mean aggregation instead of apply + lambda;
# the result is still a Series indexed by prop_country_id.
country_groups_train = train.groupby(['prop_country_id'])
country_means_train = country_groups_train['prop_location_score2'].mean()
country_groups_test = test.groupby(['prop_country_id'])
country_means_test = country_groups_test['prop_location_score2'].mean()

In [None]:
# replace missing location_score2 by average within the destination_id cluster
# Series.map looks up each row's destination mean in one vectorised step (row-aligned
# with the frame), replacing the per-row Python lookup and the index->value dict.
# The filled column is assigned back rather than filled in place on a chained accessor.
destination_mean_loc_score2_train = train['srch_destination_id'].map(destination_means_train)
train['prop_location_score2'] = train['prop_location_score2'].fillna(destination_mean_loc_score2_train)
destination_mean_loc_score2_test = test['srch_destination_id'].map(destination_means_test)
test['prop_location_score2'] = test['prop_location_score2'].fillna(destination_mean_loc_score2_test)

In [None]:
# replace missing location_score2 by average within the prop_country_id cluster
# Series.map looks up each row's country mean in one vectorised step (row-aligned
# with the frame), replacing the per-row Python lookup and the index->value dict.
# The filled column is assigned back rather than filled in place on a chained accessor.
country_mean_loc_score2_train = train['prop_country_id'].map(country_means_train)
train['prop_location_score2'] = train['prop_location_score2'].fillna(country_mean_loc_score2_train)
country_mean_loc_score2_test = test['prop_country_id'].map(country_means_test)
test['prop_location_score2'] = test['prop_location_score2'].fillna(country_mean_loc_score2_test)

In [None]:
# replace remaining missing location_score2 by overall average
# Each frame uses its own column mean. The filled column is assigned back because
# fillna(inplace=True) on an attribute-accessed column does not update the frame
# under pandas Copy-on-Write.
loc_score2_mean_train = train['prop_location_score2'].mean()
train['prop_location_score2'] = train['prop_location_score2'].fillna(loc_score2_mean_train)
loc_score2_mean_test = test['prop_location_score2'].mean()
test['prop_location_score2'] = test['prop_location_score2'].fillna(loc_score2_mean_test)

In [None]:
# replace missing srch_query_affinity_score by minimum value (hotels that did not register in any internet searches are punished)
# Zeros are treated as missing as well: they are turned into NaN first, so the minimum
# is taken over the remaining values and the zeros receive it too.
# Results are assigned back to the frame instead of using inplace=True on a chained
# accessor, which does not update the frame under pandas Copy-on-Write.
train['srch_query_affinity_score'] = train['srch_query_affinity_score'].replace(0, np.nan)
test['srch_query_affinity_score'] = test['srch_query_affinity_score'].replace(0, np.nan)
# One shared minimum across both sets, so train and test get the same fill value.
sqas_min = min(train['srch_query_affinity_score'].min(), test['srch_query_affinity_score'].min())
train['srch_query_affinity_score'] = train['srch_query_affinity_score'].fillna(sqas_min)
test['srch_query_affinity_score'] = test['srch_query_affinity_score'].fillna(sqas_min)

In [None]:
# replace missing orig_destination_distance, most specific estimate first:
#   1. mean within the (srch_destination_id, visitor_location_country_id) cluster
#   2. mean within the (srch_destination_id, site_id) cluster
#   3. overall mean of the column
# The overall mean has to come last: filling with it first leaves no NaN for the
# cluster-level estimates to replace. All means are taken over observed values only,
# and each frame (train / test) uses its own statistics.
for frame in (train, test):
    observed_distance = frame['orig_destination_distance']
    filled_distance = observed_distance
    for cluster_keys in (['srch_destination_id', 'visitor_location_country_id'],
                         ['srch_destination_id', 'site_id']):
        # transform('mean') returns the cluster mean aligned to every row of the frame.
        cluster_mean = observed_distance.groupby([frame[key] for key in cluster_keys]).transform('mean')
        filled_distance = filled_distance.fillna(cluster_mean)
    frame['orig_destination_distance'] = filled_distance.fillna(observed_distance.mean())

In [None]:
# mean orig_destination_distance per (srch_destination_id, visitor_location_country_id) cluster
# Built-in mean aggregation on the selected column instead of apply + lambda; the result
# is a Series with a (srch_destination_id, visitor_location_country_id) MultiIndex.
# NOTE(review): an earlier cell has already imputed the missing distances, so these
# means include imputed values, not only observed ones.
orig_destination_groups_train = train.groupby(['srch_destination_id', 'visitor_location_country_id'])
distance_means_train = orig_destination_groups_train['orig_destination_distance'].mean()
orig_destination_groups_test = test.groupby(['srch_destination_id', 'visitor_location_country_id'])
distance_means_test = orig_destination_groups_test['orig_destination_distance'].mean()

In [None]:
# mean orig_destination_distance per (srch_destination_id, site_id) cluster
# Built-in mean aggregation on the selected column instead of apply + lambda; the result
# is a Series with a (srch_destination_id, site_id) MultiIndex.
# NOTE(review): an earlier cell has already imputed the missing distances, so these
# means include imputed values, not only observed ones.
site_destination_groups_train = train.groupby(['srch_destination_id', 'site_id'])
distance_means2_train = site_destination_groups_train['orig_destination_distance'].mean()
site_destination_groups_test = test.groupby(['srch_destination_id', 'site_id'])
distance_means2_test = site_destination_groups_test['orig_destination_distance'].mean()

In [None]:
# replace missing orig_destination_distance by average within the (destination_id, visitor_location_country_id) cluster
# The per-row estimate is obtained with a left join of the key columns against the
# MultiIndexed cluster means (row order and index of the frame are preserved), instead
# of a Python-level lookup per row and an index->value dict.
# NOTE(review): an earlier cell already fills all missing distances, so as the notebook
# stands this fill has nothing left to replace.
visitor_cluster_keys = ['srch_destination_id', 'visitor_location_country_id']
orig_destination_distance_estimate_train = train[visitor_cluster_keys].join(
    distance_means_train.rename('cluster_mean'), on=visitor_cluster_keys)['cluster_mean']
train['orig_destination_distance'] = train['orig_destination_distance'].fillna(orig_destination_distance_estimate_train)
orig_destination_distance_estimate_test = test[visitor_cluster_keys].join(
    distance_means_test.rename('cluster_mean'), on=visitor_cluster_keys)['cluster_mean']
test['orig_destination_distance'] = test['orig_destination_distance'].fillna(orig_destination_distance_estimate_test)

In [None]:
# replace missing orig_destination_distance by average within the (destination_id, site_id) cluster
# The per-row estimate is obtained with a left join of the key columns against the
# MultiIndexed cluster means (row order and index of the frame are preserved), instead
# of a Python-level lookup per row and an index->value dict.
# NOTE(review): an earlier cell already fills all missing distances, so as the notebook
# stands this fill has nothing left to replace.
site_cluster_keys = ['srch_destination_id', 'site_id']
orig_destination_distance_estimate2_train = train[site_cluster_keys].join(
    distance_means2_train.rename('cluster_mean'), on=site_cluster_keys)['cluster_mean']
train['orig_destination_distance'] = train['orig_destination_distance'].fillna(orig_destination_distance_estimate2_train)
orig_destination_distance_estimate2_test = test[site_cluster_keys].join(
    distance_means2_test.rename('cluster_mean'), on=site_cluster_keys)['cluster_mean']
test['orig_destination_distance'] = test['orig_destination_distance'].fillna(orig_destination_distance_estimate2_test)

In [None]:
# Absolute gap between the visitor's historical star rating and the property's rating,
# plus a match flag: 1 when the gap is at most one star, 0 otherwise.
# A missing gap (no visitor history) compares as False and therefore yields 0.
for frame in (train, test):
    frame['star_diff'] = (frame.visitor_hist_starrating - frame.prop_starrating).abs()
    frame['star_diff_bool'] = (frame['star_diff'] <= 1).astype('int64')

In [None]:
# Absolute gap between the visitor's historical nightly price and the listed price,
# plus a match flag: 1 when the gap is at most 27, 0 otherwise.
# A missing gap (no visitor history) compares as False and therefore yields 0.
# NOTE(review): the origin of the 27 threshold is not shown in this notebook.
for frame in (train, test):
    frame['price_diff'] = (frame.visitor_hist_adr_usd - frame.price_usd).abs()
    frame['price_diff_bool'] = (frame['price_diff'] <= 27).astype('int64')

In [None]:
# mean hist_starrating per site_id
# Built-in mean aggregation on the selected column instead of apply + lambda; the
# result is still a Series indexed by site_id.
hist_starrating_groups_train = train.groupby(['site_id'])
starrating_means_train = hist_starrating_groups_train['visitor_hist_starrating'].mean()
hist_starrating_groups_test = test.groupby(['site_id'])
starrating_means_test = hist_starrating_groups_test['visitor_hist_starrating'].mean()

In [None]:
# replace missing hist_starrating by average within the site_id cluster
# The original column is left untouched; the imputed values go into a new *_filled column.
# Fill order: per-site mean first, then the overall column mean for anything still missing.
# Series.map gives the per-row site mean in one vectorised step, and the result is
# assigned directly instead of being filled in place through a chained accessor.
site_mean_hist_starrating_train = train['site_id'].map(starrating_means_train)
train['visitor_hist_starrating_filled'] = (
    train['visitor_hist_starrating']
    .fillna(site_mean_hist_starrating_train)
    .fillna(train['visitor_hist_starrating'].mean())
)
site_mean_hist_starrating_test = test['site_id'].map(starrating_means_test)
test['visitor_hist_starrating_filled'] = (
    test['visitor_hist_starrating']
    .fillna(site_mean_hist_starrating_test)
    .fillna(test['visitor_hist_starrating'].mean())
)

In [None]:
# Absolute star-rating gap, computed from the imputed visitor history column.
for frame in (train, test):
    frame['star_diff_filled'] = frame.visitor_hist_starrating_filled.sub(frame.prop_starrating).abs()

In [None]:
# Bin the imputed star-rating gap into 4 equal-width bins on the training set and reuse
# the same bin edges for the test set.
# Fixes a copy-paste slip: the bins were built from 'price_diff_filled' (which does not
# exist yet at this point of the notebook) and the test set used price_diff_bins.
train['star_diff_bins'], star_diff_bins = pd.cut(train['star_diff_filled'], 4, retbins=True)
test['star_diff_bins'] = pd.cut(test['star_diff_filled'], bins=star_diff_bins)

In [None]:
# mean hist_adr_usd per site_id
# Built-in mean aggregation on the selected column instead of apply + lambda; the
# result is still a Series indexed by site_id.
hist_price_groups_train = train.groupby(['site_id'])
hist_price_means_train = hist_price_groups_train['visitor_hist_adr_usd'].mean()
hist_price_groups_test = test.groupby(['site_id'])
hist_price_means_test = hist_price_groups_test['visitor_hist_adr_usd'].mean()

In [None]:
# replace missing hist_adr_usd by average within the site_id cluster
# The original column is left untouched; the imputed values go into a new *_filled column.
# Fill order: per-site mean first, then the overall column mean for anything still missing.
# Series.map gives the per-row site mean in one vectorised step, and the result is
# assigned directly instead of being filled in place through a chained accessor.
site_mean_hist_price_train = train['site_id'].map(hist_price_means_train)
train['visitor_hist_adr_usd_filled'] = (
    train['visitor_hist_adr_usd']
    .fillna(site_mean_hist_price_train)
    .fillna(train['visitor_hist_adr_usd'].mean())
)
site_mean_hist_price_test = test['site_id'].map(hist_price_means_test)
test['visitor_hist_adr_usd_filled'] = (
    test['visitor_hist_adr_usd']
    .fillna(site_mean_hist_price_test)
    .fillna(test['visitor_hist_adr_usd'].mean())
)

In [None]:
# Absolute price gap, computed from the imputed visitor price history column.
for frame in (train, test):
    frame['price_diff_filled'] = frame.visitor_hist_adr_usd_filled.sub(frame.price_usd).abs()

In [None]:
# Bin the imputed price gap into deciles on the training set and reuse the decile edges
# for the test set.
# include_lowest=True makes the first test interval closed on the left, as qcut does for
# train; without it a test value equal to the lowest edge would end up as NaN.
# Test values outside the training range still fall outside every bin and become NaN.
train['price_diff_bins'], price_diff_bins = pd.qcut(train['price_diff_filled'], 10, retbins=True)
test['price_diff_bins'] = pd.cut(test['price_diff_filled'], bins=price_diff_bins, include_lowest=True)

In [4]:
# Month of the search, taken as the second '-'-separated field of the date_time string.
# The value is kept as the raw substring (a string, not an integer).
train['month'] = [stamp.split('-')[1] for stamp in train.date_time]
test['month'] = [stamp.split('-')[1] for stamp in test.date_time]

In [None]:
# difference between mean prices of booked hotels and mean prices of not-booked hotels per site_id
def _booked_minus_unbooked_price(site_rows):
    """Mean price of booked rows minus mean price of non-booked rows for one site."""
    booked_prices = site_rows[site_rows.booking_bool == 1].price_usd
    unbooked_prices = site_rows[site_rows.booking_bool == 0].price_usd
    return booked_prices.mean() - unbooked_prices.mean()

site_groups = train.groupby(['site_id'])
price_diff = site_groups.apply(_booked_minus_unbooked_price)

In [None]:
# Flag sites where booked hotels are on average more expensive than non-booked ones
# (1 when the per-site price difference is positive, 0 otherwise).
# Series.map replaces the per-row lookup. A site_id without an entry in price_diff
# (e.g. one that only occurs in the test set) maps to NaN and gets 0 instead of
# raising a KeyError; a NaN price difference also yields 0, as before.
train['willingness_to_pay'] = train['site_id'].map(price_diff).gt(0).astype('int64')
test['willingness_to_pay'] = test['site_id'].map(price_diff).gt(0).astype('int64')

In [None]:
# Booking rate per property in the training set: number of booked rows divided by the
# number of rows in which the property appears.
hotel_groups = train.groupby(['prop_id'])
bookings_per_hotel = hotel_groups.booking_bool.sum().astype(float)
listings_per_hotel = hotel_groups.size()
hotel_quality = bookings_per_hotel / listings_per_hotel

In [None]:
# Quick look at how the booking rates spread over 4 equal-width bins.
hotel_quality.pipe(pd.cut, 4).value_counts()

In [None]:
# Attach each property's training-set booking rate to the training rows; any missing
# value falls back to the mean booking rate over all properties.
# Series.map replaces the per-row lookup, and the filled result is assigned directly
# instead of being filled in place through a chained accessor.
# NOTE(review): the rate is derived from booking_bool of these same rows, so this
# feature carries target information on the training set - confirm this is intended.
train['prop_desirability'] = train['prop_id'].map(hotel_quality).fillna(hotel_quality.mean())

In [None]:
def get_hotel_quality(prop_id):
    """Return the booking rate stored in hotel_quality for prop_id.

    Falls back to the mean booking rate over all properties when prop_id has no
    entry in hotel_quality (e.g. a property that never occurs in the training set).
    """
    try:
        return hotel_quality[prop_id]
    except KeyError:
        # Only a missing key triggers the fallback; a bare except would also hide
        # unrelated errors (typos, interrupts, ...).
        return hotel_quality.mean()
    
# Attach the training-set booking rate to every test row, using the mean-rate
# fallback of get_hotel_quality for properties without an entry.
test['prop_desirability'] = test['prop_id'].map(get_hotel_quality)

In [6]:
def normalize_feature(df, feature_name, normalize_wrt_feature):
    """Add a column with feature_name divided by its group mean.

    Rows are grouped by normalize_wrt_feature; each value of feature_name is divided
    by the mean of feature_name within its group. Non-finite results (NaN input,
    NaN group mean, division by a zero mean) are replaced by 0.

    The result is written in place to df as a new column named
    '<feature_name>_norm_<normalize_wrt_feature>'; nothing is returned.
    """
    # transform('mean') broadcasts each group's mean back onto the rows of df, so the
    # division is vectorised and does not depend on itertuples/getattr (which is slow
    # and breaks for column names that are not valid Python identifiers).
    group_means = df.groupby(normalize_wrt_feature)[feature_name].transform('mean')
    normed_vals = df[feature_name] / group_means
    df[feature_name + '_norm_' + normalize_wrt_feature] = normed_vals.where(np.isfinite(normed_vals), 0)

In [6]:
# Review score relative to the average review score of the same search destination.
for frame in (train, test):
    normalize_feature(frame, 'prop_review_score', 'srch_destination_id')

  


In [7]:
# Price relative to the average price within the same search (srch_id).
for frame in (train, test):
    normalize_feature(frame, 'price_usd', 'srch_id')

In [None]:
# Location scores and query affinity relative to their average within the same search.
for feature in ('prop_location_score1', 'prop_location_score2', 'srch_query_affinity_score'):
    for frame in (train, test):
        normalize_feature(frame, feature, 'srch_id')

In [None]:
# Remove the competitor rate / inventory / percent-diff columns (comp1..comp8) from the
# training frame, in place. The test frame is not touched here.
train.drop(
    labels=[
        'comp1_rate', 'comp1_inv', 'comp1_rate_percent_diff',
        'comp2_rate', 'comp2_inv', 'comp2_rate_percent_diff',
        'comp3_rate', 'comp3_inv', 'comp3_rate_percent_diff',
        'comp4_rate', 'comp4_inv', 'comp4_rate_percent_diff',
        'comp5_rate', 'comp5_inv', 'comp5_rate_percent_diff',
        'comp6_rate', 'comp6_inv', 'comp6_rate_percent_diff',
        'comp7_rate', 'comp7_inv', 'comp7_rate_percent_diff',
        'comp8_rate', 'comp8_inv', 'comp8_rate_percent_diff',
    ],
    axis=1,
    inplace=True,
)

In [None]:
# Sum of the booking and click indicators for each row.
train['booked_clicked_combined'] = train['booking_bool'].add(train['click_bool'])

In [20]:
# True when the row was booked, clicked, or both.
booked_flags = train['booking_bool'].values
clicked_flags = train['click_bool'].values
train['booking_or_click_bool'] = np.logical_or(booked_flags, clicked_flags)

In [21]:
# Persist the engineered training frame (without the index column).
train.to_csv(path_or_buf='train_clean.csv', index=False)

In [13]:
# Persist the engineered test frame (without the index column).
test.to_csv(path_or_buf='test_clean.csv', index=False)

In [27]:
# Predictors used to model the display position of a property.
features = ['site_id','prop_brand_bool','prop_location_score1','prop_location_score2','srch_room_count','promotion_flag','srch_destination_id', 'month', 'prop_id']
# shuffle=True is needed for random_state to have any effect; recent scikit-learn
# versions raise a ValueError when random_state is set while shuffle is False.
# cv is only used by the commented-out cross-validation below.
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)

#svr_scores = cross_val_score(RandomForestRegressor(), train[features], train['position'], cv=cv)
#lr_scores = cross_val_score(LinearRegression(), binary_features, classification_data[target], cv=cv)
# Seed the forest so repeated runs build the same trees.
# NOTE(review): the recorded run of this cell ended in a MemoryError; if that persists,
# consider fitting on a row subsample or bounding tree size (max_depth / min_samples_leaf).
clf = RandomForestRegressor(random_state=0)
clf.fit(train[features], train['position'])
    
#print(np.mean(svr_scores))

MemoryError: could not allocate 117440512 bytes

In [2]:
# Reload the engineered test frame written earlier.
test = pd.read_csv(filepath_or_buffer='test_clean.csv')

In [9]:
# Reload the engineered training frame written earlier.
train = pd.read_csv(filepath_or_buffer='train_clean.csv')

In [8]:
# Show the first five rows of the reloaded training frame.
train.head(n=5)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,willingness_to_pay,booked_clicked_combined,prop_desirability,star_diff_filled,price_diff_filled,month,prop_review_score_norm_srch_destination_id,price_usd_norm_srch_id,booking_or_click_bool,prop_location_score1_norm_srch_id
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,0,0,0.01634,0.472818,97.302673,4,0.93078,0.639938,False,1.230626
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,0,0,0.015437,0.527182,31.332673,4,1.063749,1.042885,False,0.95667
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,0,0,0.00363,0.472818,22.272673,4,1.196718,1.098224,False,0.95667
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,0,0,0.013043,1.472818,400.697327,4,1.063749,3.681737,False,1.230626
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,0,0,0.03609,0.527182,58.492673,4,0.93078,0.876991,False,1.148004


In [None]:
# Show the first five rows of the reloaded test frame.
test.head(n=5)