In [2]:
import numpy as np
import pandas as pd
import dateparser
import sklearn
import seaborn as sns
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Load the training set from the 'Data Mining VU data' directory (relative path).
train_csv_path = 'Data Mining VU data/training_set_VU_DM_2014.csv'
train = pd.read_csv(train_csv_path)

In [None]:
# Load the test set from the 'Data Mining VU data' directory (relative path).
test_csv_path = 'Data Mining VU data/test_set_VU_DM_2014.csv'
test = pd.read_csv(test_csv_path)

In [None]:
# Reproducible 30% random sample of the training rows (seeded with random_state=0).
train_sample = train.sample(frac=0.3, random_state=0)
# The remaining 70% of the *training* rows; despite its name this is a hold-out
# taken from `train`, not a subset of the `test` frame.
not_sampled = ~train.index.isin(train_sample.index)
test_sample = train[not_sampled]

In [None]:
# replace missing review score by the worst value 0
# The filled column is assigned back explicitly. Calling fillna(inplace=True)
# on `frame.column` is a chained in-place operation that is not guaranteed to
# write through to the parent frame (it does not under pandas Copy-on-Write).
train['prop_review_score'] = train['prop_review_score'].fillna(0)
test['prop_review_score'] = test['prop_review_score'].fillna(0)

In [None]:
# mean location_score2 per srch_destination
# Uses the built-in grouped mean (missing values are skipped) instead of a
# Python lambda per group. Each result is a Series indexed by srch_destination_id.
destination_groups_train = train.groupby(['srch_destination_id'])
destination_means_train = destination_groups_train['prop_location_score2'].mean()
destination_groups_test = test.groupby(['srch_destination_id'])
destination_means_test = destination_groups_test['prop_location_score2'].mean()

In [None]:
# mean location_score2 per prop_country
# Uses the built-in grouped mean (missing values are skipped) instead of a
# Python lambda per group. Each result is a Series indexed by prop_country_id.
country_groups_train = train.groupby(['prop_country_id'])
country_means_train = country_groups_train['prop_location_score2'].mean()
country_groups_test = test.groupby(['prop_country_id'])
country_means_test = country_groups_test['prop_location_score2'].mean()

In [None]:
# replace missing location_score2 by average within the destination_id cluster
# `map` looks up each row's destination mean in one vectorised pass and keeps
# the frame's index, so fillna aligns row by row. Rows whose destination has no
# observed score stay NaN here. The result is assigned back rather than filled
# with a chained inplace call, which is not guaranteed to update the frame.
destination_mean_loc_score2_train = train['srch_destination_id'].map(destination_means_train)
train['prop_location_score2'] = train['prop_location_score2'].fillna(destination_mean_loc_score2_train)
destination_mean_loc_score2_test = test['srch_destination_id'].map(destination_means_test)
test['prop_location_score2'] = test['prop_location_score2'].fillna(destination_mean_loc_score2_test)

In [None]:
# replace missing location_score2 by average within the prop_country_id cluster
# `map` looks up each row's country mean in one vectorised pass and keeps the
# frame's index, so fillna aligns row by row. Rows whose country has no observed
# score stay NaN here. The result is assigned back rather than filled with a
# chained inplace call, which is not guaranteed to update the frame.
country_mean_loc_score2_train = train['prop_country_id'].map(country_means_train)
train['prop_location_score2'] = train['prop_location_score2'].fillna(country_mean_loc_score2_train)
country_mean_loc_score2_test = test['prop_country_id'].map(country_means_test)
test['prop_location_score2'] = test['prop_location_score2'].fillna(country_mean_loc_score2_test)

In [None]:
# replace remaining missing location_score by overall average
# Each frame uses its own column mean, taken over whatever values are present
# at this point. The result is assigned back instead of relying on a chained
# fillna(inplace=True), which is not guaranteed to update the frame.
loc_score2_mean_train = train['prop_location_score2'].mean()
train['prop_location_score2'] = train['prop_location_score2'].fillna(loc_score2_mean_train)
loc_score2_mean_test = test['prop_location_score2'].mean()
test['prop_location_score2'] = test['prop_location_score2'].fillna(loc_score2_mean_test)

In [None]:
# replace missing srch_query_affinity_score by minimum value (hotels that did not register in any internet searches are punished)
# Zeros are treated as missing as well: they are turned into NaN first.
train['srch_query_affinity_score'] = train['srch_query_affinity_score'].replace(0, np.nan)
test['srch_query_affinity_score'] = test['srch_query_affinity_score'].replace(0, np.nan)
# Smallest observed score across both frames. np.nanmin ignores a NaN from a
# frame with no observed scores, whereas the builtin min(a, b) returns NaN or
# not depending on argument order.
sqas_min = np.nanmin([train['srch_query_affinity_score'].min(), test['srch_query_affinity_score'].min()])
# Assign back rather than using a chained fillna(inplace=True).
train['srch_query_affinity_score'] = train['srch_query_affinity_score'].fillna(sqas_min)
test['srch_query_affinity_score'] = test['srch_query_affinity_score'].fillna(sqas_min)

In [None]:
# replace missing orig_destination_distance by average
train.orig_destination_distance.fillna(train.orig_destination_distance.mean(), inplace=True)
test.orig_destination_distance.fillna(test.orig_destination_distance.mean(), inplace=True)

In [None]:
# mean orig_destination_distance per (srch_destination_id, visitor_location_country_id) cluster
orig_destination_groups_train = train.groupby(['srch_destination_id', 'visitor_location_country_id'])
distance_means_train = orig_destination_groups_train.apply(lambda x: x.orig_destination_distance.mean())
orig_destination_groups_test = test.groupby(['srch_destination_id', 'visitor_location_country_id'])
distance_means_test = orig_destination_groups_test.apply(lambda x: x.orig_destination_distance.mean())

In [None]:
# mean orig_destination_distance per (srch_destination_id, site_id) cluster
site_destination_groups_train = train.groupby(['srch_destination_id', 'site_id'])
distance_means2_train = site_destination_groups_train.apply(lambda x: x.orig_destination_distance.mean())
site_destination_groups_test = test.groupby(['srch_destination_id', 'site_id'])
distance_means2_test = site_destination_groups_test.apply(lambda x: x.orig_destination_distance.mean())

In [None]:
# replace missing orig_destination_distance by average within the (destination_id, visitor_location_country_id) cluster
orig_destination_distance_estimate_train = [distance_means_train[(dest, orig)] for dest, orig in zip(train.srch_destination_id, train.visitor_location_country_id)]
train.orig_destination_distance.fillna(dict(zip(train.index.values, orig_destination_distance_estimate_train)), inplace=True)
orig_destination_distance_estimate_test = [distance_means_test[(dest, orig)] for dest, orig in zip(test.srch_destination_id, test.visitor_location_country_id)]
test.orig_destination_distance.fillna(dict(zip(test.index.values, orig_destination_distance_estimate_test)), inplace=True)

In [None]:
# replace missing orig_destination_distance by average within the (destination_id, site_id) cluster
orig_destination_distance_estimate2_train = [distance_means2_train[(dest, site)] for dest, site in zip(train.srch_destination_id, train.site_id)]
train.orig_destination_distance.fillna(dict(zip(train.index.values, orig_destination_distance_estimate2_train)), inplace=True)
orig_destination_distance_estimate2_test = [distance_means2_test[(dest, site)] for dest, site in zip(test.srch_destination_id, test.site_id)]
test.orig_destination_distance.fillna(dict(zip(test.index.values, orig_destination_distance_estimate2_test)), inplace=True)

In [None]:
# Absolute gap between the visitor's historical star rating and the property's
# star rating; NaN when the visitor has no history.
train['star_diff'] = (train['visitor_hist_starrating'] - train['prop_starrating']).abs()
test['star_diff'] = (test['visitor_hist_starrating'] - test['prop_starrating']).abs()
# 1 means match (gap of at most one star), 0 mismatch. A NaN gap compares as
# False and therefore becomes 0.
train['star_diff_bool'] = train['star_diff'].le(1).astype('int64')
test['star_diff_bool'] = test['star_diff'].le(1).astype('int64')

In [None]:
# Absolute gap between the visitor's historical average nightly price and the
# listed price; NaN when the visitor has no history.
train['price_diff'] = (train['visitor_hist_adr_usd'] - train['price_usd']).abs()
test['price_diff'] = (test['visitor_hist_adr_usd'] - test['price_usd']).abs()
# 1 means match (gap of at most 27), 0 mismatch. A NaN gap compares as False
# and therefore becomes 0.
train['price_diff_bool'] = train['price_diff'].le(27).astype('int64')
test['price_diff_bool'] = test['price_diff'].le(27).astype('int64')

In [None]:
# mean hist_starrating per site_id
# Uses the built-in grouped mean (missing values are skipped) instead of a
# Python lambda per group. Each result is a Series indexed by site_id.
hist_starrating_groups_train = train.groupby(['site_id'])
starrating_means_train = hist_starrating_groups_train['visitor_hist_starrating'].mean()
hist_starrating_groups_test = test.groupby(['site_id'])
starrating_means_test = hist_starrating_groups_test['visitor_hist_starrating'].mean()

In [None]:
# replace missing hist_starrating by average within the site_id cluster
# The original column is left untouched; the imputed values go into a new
# `*_filled` column. It is built and assigned in one expression, not filled
# through a chained fillna(inplace=True) on the attribute accessor, which is
# not guaranteed to write through to the frame.
site_mean_hist_starrating_train = train['site_id'].map(starrating_means_train)
train['visitor_hist_starrating_filled'] = (
    train['visitor_hist_starrating']
    .fillna(site_mean_hist_starrating_train)            # first: mean of the row's site
    .fillna(train['visitor_hist_starrating'].mean())    # then: overall observed mean
)
site_mean_hist_starrating_test = test['site_id'].map(starrating_means_test)
test['visitor_hist_starrating_filled'] = (
    test['visitor_hist_starrating']
    .fillna(site_mean_hist_starrating_test)
    .fillna(test['visitor_hist_starrating'].mean())
)

In [4]:
# mean hist_adr_usd per site_id
# Uses the built-in grouped mean (missing values are skipped) instead of a
# Python lambda per group. Each result is a Series indexed by site_id.
hist_price_groups_train = train.groupby(['site_id'])
hist_price_means_train = hist_price_groups_train['visitor_hist_adr_usd'].mean()
hist_price_groups_test = test.groupby(['site_id'])
hist_price_means_test = hist_price_groups_test['visitor_hist_adr_usd'].mean()

In [5]:
# replace missing hist_adr_usd by average within the site_id cluster
# The original column is left untouched; the imputed values go into a new
# `*_filled` column. It is built and assigned in one expression, not filled
# through a chained fillna(inplace=True) on the attribute accessor, which is
# not guaranteed to write through to the frame.
site_mean_hist_price_train = train['site_id'].map(hist_price_means_train)
train['visitor_hist_adr_usd_filled'] = (
    train['visitor_hist_adr_usd']
    .fillna(site_mean_hist_price_train)              # first: mean of the row's site
    .fillna(train['visitor_hist_adr_usd'].mean())    # then: overall observed mean
)
site_mean_hist_price_test = test['site_id'].map(hist_price_means_test)
test['visitor_hist_adr_usd_filled'] = (
    test['visitor_hist_adr_usd']
    .fillna(site_mean_hist_price_test)
    .fillna(test['visitor_hist_adr_usd'].mean())
)

In [None]:
#train['month'] = train.date_time.apply(lambda x: dateparser.date.DateDataParser().get_date_data(str(x)).get('date_obj').month)

In [6]:
# difference between mean prices of booked hotels and mean prices of not-booked hotels per site_id
def _booked_minus_unbooked_price(site_rows):
    """Return mean price_usd of booked rows minus mean price_usd of not-booked rows.

    The result is NaN when the site has no rows in one of the two categories.
    """
    booked_prices = site_rows[site_rows.booking_bool == 1].price_usd
    unbooked_prices = site_rows[site_rows.booking_bool == 0].price_usd
    return booked_prices.mean() - unbooked_prices.mean()


site_groups = train.groupby(['site_id'])
# Series indexed by site_id; positive values mean booked hotels were pricier on average.
price_diff = site_groups.apply(_booked_minus_unbooked_price)

In [7]:
# Flag sites whose booked hotels are, on average, more expensive than the
# not-booked ones (price_diff > 0) with 1, all others with 0.
# `map` does the per-site lookup in one vectorised pass. A site_id with no entry
# in price_diff (e.g. a site that appears only in the test set) maps to NaN,
# which compares as False and yields 0 instead of raising KeyError.
train['willingness_to_pay'] = train['site_id'].map(price_diff).gt(0).astype('int64')
test['willingness_to_pay'] = test['site_id'].map(price_diff).gt(0).astype('int64')

In [8]:
def _booking_rate(hotel_rows):
    """Return the fraction of a hotel's rows that were booked (sum of booking_bool / row count)."""
    bookings = hotel_rows.booking_bool
    return float(bookings.sum()) / bookings.values.size


# Per-hotel booking rate in the training data, as a Series indexed by prop_id.
hotel_groups = train.groupby(['prop_id'])
hotel_quality = hotel_groups.apply(_booking_rate)

In [11]:
# Number of hotels per booking-rate bin (four equal-width bins over the observed range).
hotel_quality_bins = pd.cut(hotel_quality, bins=4)
hotel_quality_bins.value_counts()

(-0.001, 0.25]    126665
(0.25, 0.5]         1874
(0.75, 1.0]          469
(0.5, 0.75]          105
dtype: int64

In [13]:
# Attach each hotel's training-set booking rate to its rows. `map` performs the
# lookup in one vectorised pass; any NaN rate falls back to the mean rate over
# hotels. Assigned in a single expression instead of a chained
# fillna(inplace=True), which is not guaranteed to update the frame.
# NOTE(review): hotel_quality is derived from booking_bool of these same rows,
# so this feature encodes the target for the training set; confirm that is
# acceptable for the downstream model.
train['prop_desirability'] = train['prop_id'].map(hotel_quality).fillna(hotel_quality.mean())

In [None]:
# Attach the training-set booking rate of each hotel to the test rows. `map`
# yields NaN for hotels that never occur in the training data; those rows (and
# any NaN rate) fall back to the mean rate over hotels. This replaces a per-row
# lambda that recomputed and linearly scanned train.prop_id.unique() for every
# test row.
test['prop_desirability'] = test['prop_id'].map(hotel_quality).fillna(hotel_quality.mean())

In [None]:
# Drop the 24 competitor columns comp1..comp8 x (rate, inv, rate_percent_diff)
# from the training frame. The names are generated instead of spelled out, and
# errors='ignore' makes the cell safe to re-run after the columns are gone.
# NOTE(review): only `train` is reduced here; `test` keeps its comp* columns --
# confirm whether the test frame should be trimmed the same way.
competitor_columns = [
    'comp{}_{}'.format(competitor, field)
    for competitor in range(1, 9)
    for field in ('rate', 'inv', 'rate_percent_diff')
]
train = train.drop(competitor_columns, axis=1, errors='ignore')

In [8]:
# Persist the cleaned training frame; index=False keeps the row index out of the file.
train.to_csv(path_or_buf='train_clean.csv', index=False)

In [3]:
# Persist the cleaned test frame; index=False keeps the row index out of the file.
test.to_csv(path_or_buf='test_clean.csv', index=False)

NameError: name 'test' is not defined

In [None]:
# Baseline: 10-fold cross-validated SVR predicting `position` from a handful of
# raw columns of the 30% training sample.
features = ['site_id','prop_country_id','prop_starrating','prop_brand_bool','prop_location_score1','promotion_flag','srch_destination_id']
# random_state is omitted on purpose: without shuffle=True it has no effect, and
# current scikit-learn raises ValueError when it is passed that way. The folds
# are therefore the same contiguous, stratified splits as before.
cv = StratifiedKFold(n_splits=10)

# With no `scoring` argument, cross_val_score uses the estimator's own score method.
svr_scores = cross_val_score(SVR(), train_sample[features], train_sample['position'], cv=cv)

print(np.mean(svr_scores))

In [4]:
# Reload the cleaned test frame written earlier in the notebook.
test_clean_path = 'test_clean.csv'
test = pd.read_csv(test_clean_path)

In [6]:
# Reload the cleaned training frame written earlier in the notebook.
train_clean_path = 'train_clean.csv'
train = pd.read_csv(train_clean_path)

In [3]:
# Preview the first five rows of the reloaded training frame.
train.head(5)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,random_bool,click_bool,gross_bookings_usd,booking_bool,star_diff,star_diff_bool,price_diff,price_diff_bool,visitor_hist_starrating_filled,visitor_hist_adr_usd_filled
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,1,0,,0,,0,,0,3.472818,
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,1,0,,0,,0,,0,3.472818,
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,1,0,,0,,0,,0,3.472818,
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,1,0,,0,,0,,0,3.472818,
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,1,0,,0,,0,,0,3.472818,
