In [4]:
import numpy as np
import pandas as pd
import dateparser
import sklearn
import seaborn as sns
from sklearn.model_selection import cross_val_score, StratifiedKFold #RepeatedStratifiedKFold, 
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
import pickle

In [5]:
train = pd.read_csv('Data Mining VU data/training_set_VU_DM_2014.csv')

In [6]:
test = pd.read_csv('Data Mining VU data/test_set_VU_DM_2014.csv')

In [None]:
# load the clean train set
#train = pd.read_csv('train_clean.csv')
# load the clean test set
#test = pd.read_csv('test_clean.csv')

In [9]:
# trim the upper 2.5% of price_usd and visitor_hist_adr_usd to remove extreme outliers
train.price_usd = train.price_usd.apply(lambda x: x if x < 550 else 550)
test.price_usd = test.price_usd.apply(lambda x: x if x < 550 else 550)
train.visitor_hist_adr_usd = train.visitor_hist_adr_usd.apply(lambda x: x if x < 550 else 550)
test.visitor_hist_adr_usd = test.visitor_hist_adr_usd.apply(lambda x: x if x < 550 else 550)

###  Deal with missing data

In [None]:
# replace missing review score by the worst value 0
train.prop_review_score.fillna(0, inplace=True)
test.prop_review_score.fillna(0, inplace=True)

In [None]:
# mean location_score2 per srch_destination
destination_groups_train = train.groupby(['srch_destination_id'])
destination_means_train = destination_groups_train.apply(lambda x: x.prop_location_score2.mean())
destination_groups_test = test.groupby(['srch_destination_id'])
destination_means_test = destination_groups_test.apply(lambda x: x.prop_location_score2.mean())

In [None]:
# mean location_score2 per prop_country
country_groups_train = train.groupby(['prop_country_id'])
country_means_train = country_groups_train.apply(lambda x: x.prop_location_score2.mean())
country_groups_test = test.groupby(['prop_country_id'])
country_means_test = country_groups_test.apply(lambda x: x.prop_location_score2.mean())

In [None]:
# replace missing location_score2 by average within the destination_id cluster
destination_mean_loc_score2_train = [destination_means_train[dest] for dest in train.srch_destination_id]
train.prop_location_score2.fillna(dict(zip(train.index.values, destination_mean_loc_score2_train)), inplace=True)
destination_mean_loc_score2_test = [destination_means_test[dest] for dest in test.srch_destination_id]
test.prop_location_score2.fillna(dict(zip(test.index.values, destination_mean_loc_score2_test)), inplace=True)

In [None]:
# replace missing location_score2 by average within the prop_country_id cluster
country_mean_loc_score2_train = [country_means_train[country] for country in train.prop_country_id]
train.prop_location_score2.fillna(dict(zip(train.index.values, country_mean_loc_score2_train)), inplace=True)
country_mean_loc_score2_test = [country_means_test[country] for country in test.prop_country_id]
test.prop_location_score2.fillna(dict(zip(test.index.values, country_mean_loc_score2_test)), inplace=True)

In [None]:
# replace remaining missing location_score by overall average
loc_score2_mean_train = train.prop_location_score2.mean()
train.prop_location_score2.fillna(loc_score2_mean_train, inplace=True)
loc_score2_mean_test = test.prop_location_score2.mean()
test.prop_location_score2.fillna(loc_score2_mean_test, inplace=True)

In [None]:
# replace missing srch_query_affinity_score by minimum value (hotels that did not register in any internet searches are punished)
train.srch_query_affinity_score.replace(0,np.nan, inplace=True)
test.srch_query_affinity_score.replace(0,np.nan, inplace=True)
sqas_min = min(train.srch_query_affinity_score.min(),test.srch_query_affinity_score.min())
train.srch_query_affinity_score.fillna(sqas_min, inplace=True)
test.srch_query_affinity_score.fillna(sqas_min, inplace=True)

In [None]:
# replace missing orig_destination_distance by average
train.orig_destination_distance.fillna(train.orig_destination_distance.mean(), inplace=True)
test.orig_destination_distance.fillna(test.orig_destination_distance.mean(), inplace=True)

In [None]:
# mean orig_destination_distance per (srch_destination_id, visitor_location_country_id) cluster
orig_destination_groups_train = train.groupby(['srch_destination_id', 'visitor_location_country_id'])
distance_means_train = orig_destination_groups_train.apply(lambda x: x.orig_destination_distance.mean())
orig_destination_groups_test = test.groupby(['srch_destination_id', 'visitor_location_country_id'])
distance_means_test = orig_destination_groups_test.apply(lambda x: x.orig_destination_distance.mean())

In [None]:
# mean orig_destination_distance per (srch_destination_id, site_id) cluster
site_destination_groups_train = train.groupby(['srch_destination_id', 'site_id'])
distance_means2_train = site_destination_groups_train.apply(lambda x: x.orig_destination_distance.mean())
site_destination_groups_test = test.groupby(['srch_destination_id', 'site_id'])
distance_means2_test = site_destination_groups_test.apply(lambda x: x.orig_destination_distance.mean())

In [None]:
# replace missing orig_destination_distance by average within the (destination_id, visitor_location_country_id) cluster
orig_destination_distance_estimate_train = [distance_means_train[(dest, orig)] for dest, orig in zip(train.srch_destination_id, train.visitor_location_country_id)]
train.orig_destination_distance.fillna(dict(zip(train.index.values, orig_destination_distance_estimate_train)), inplace=True)
orig_destination_distance_estimate_test = [distance_means_test[(dest, orig)] for dest, orig in zip(test.srch_destination_id, test.visitor_location_country_id)]
test.orig_destination_distance.fillna(dict(zip(test.index.values, orig_destination_distance_estimate_test)), inplace=True)

In [None]:
# replace missing orig_destination_distance by average within the (destination_id, site_id) cluster
orig_destination_distance_estimate2_train = [distance_means2_train[(dest, site)] for dest, site in zip(train.srch_destination_id, train.site_id)]
train.orig_destination_distance.fillna(dict(zip(train.index.values, orig_destination_distance_estimate2_train)), inplace=True)
orig_destination_distance_estimate2_test = [distance_means2_test[(dest, site)] for dest, site in zip(test.srch_destination_id, test.site_id)]
test.orig_destination_distance.fillna(dict(zip(test.index.values, orig_destination_distance_estimate2_test)), inplace=True)

In [16]:
# mean hist_starrating per site_id
hist_starrating_groups_train = train.groupby(['site_id'])
starrating_means_train = hist_starrating_groups_train.apply(lambda x: x.visitor_hist_starrating.mean())
hist_starrating_groups_test = test.groupby(['site_id'])
starrating_means_test = hist_starrating_groups_test.apply(lambda x: x.visitor_hist_starrating.mean())
train.shape

(4958347, 60)

In [17]:
# replace missing hist_starrating by average within the site_id cluster
site_mean_hist_starrating_train = [starrating_means_train[site] for site in train.site_id]
train['visitor_hist_starrating_filled'] = train.visitor_hist_starrating.copy()
train.visitor_hist_starrating_filled.fillna(dict(zip(train.index.values, site_mean_hist_starrating_train)), inplace=True)
train.visitor_hist_starrating_filled.fillna(train.visitor_hist_starrating.mean(), inplace=True)
site_mean_hist_starrating_test = [starrating_means_test[site] for site in test.site_id]
test['visitor_hist_starrating_filled'] = test.visitor_hist_starrating.copy()
test.visitor_hist_starrating_filled.fillna(dict(zip(test.index.values, site_mean_hist_starrating_test)), inplace=True)
test.visitor_hist_starrating_filled.fillna(test.visitor_hist_starrating.mean(), inplace=True)
train.shape

(4958347, 61)

In [21]:
# mean hist_adr_usd per site_id
hist_price_groups_train = train.groupby(['site_id'])
hist_price_means_train = hist_price_groups_train.apply(lambda x: x.visitor_hist_adr_usd.mean())
hist_price_groups_test = test.groupby(['site_id'])
hist_price_means_test = hist_price_groups_test.apply(lambda x: x.visitor_hist_adr_usd.mean())
train.shape

(4958347, 63)

In [22]:
# replace missing hist_adr_usd by average within the site_id cluster
site_mean_hist_price_train = [hist_price_means_train[site] for site in train.site_id]
train['visitor_hist_adr_usd_filled'] = train.visitor_hist_adr_usd.copy()
train.visitor_hist_adr_usd_filled.fillna(dict(zip(train.index.values, site_mean_hist_price_train)), inplace=True)
train.visitor_hist_adr_usd_filled.fillna(train.visitor_hist_adr_usd.mean(), inplace=True)
site_mean_hist_price_test = [hist_price_means_test[site] for site in test.site_id]
test['visitor_hist_adr_usd_filled'] = test.visitor_hist_adr_usd.copy()
test.visitor_hist_adr_usd_filled.fillna(dict(zip(test.index.values, site_mean_hist_price_test)), inplace=True)
test.visitor_hist_adr_usd_filled.fillna(test.visitor_hist_adr_usd.mean(), inplace=True)
train.shape

(4958347, 64)

### Create new features 

In [11]:
train.shape
train['srch_query_affinity_bins'], affinity_bins = pd.cut(train.srch_query_affinity_score.apply(lambda x: x if x >= -100 else -100), 5, labels=np.arange(5), retbins=True)
#test['srch_query_affinity_bins'] = pd.cut(test.srch_query_affinity_score.apply(lambda x: x if x >= -100 else -100), bins=affinity_bins, labels=np.arange(5))
train.shape

(4958347, 56)

In [12]:
train.shape
train['star_diff'] = np.abs(train.visitor_hist_starrating - train.prop_starrating)
train['star_diff_bool'] = train.star_diff.apply(lambda x: 1 if x <= 1 else 0) # 1 means match, 0 mismatch
test['star_diff'] = np.abs(test.visitor_hist_starrating - test.prop_starrating)
test['star_diff_bool'] = test.star_diff.apply(lambda x: 1 if x <= 1 else 0) # 1 means match, 0 mismatch
train.shape

(4958347, 58)

In [13]:
train.shape
train['price_diff'] = np.abs(train.visitor_hist_adr_usd - train.price_usd)
train['price_diff_bool'] = train.price_diff.apply(lambda x: 1 if x <= 20 else 0) # 1 means match, 0 mismatch (use 25% quantil as threshold)
test['price_diff'] = np.abs(test.visitor_hist_adr_usd - test.price_usd)
test['price_diff_bool'] = test.price_diff.apply(lambda x: 1 if x <= 20 else 0) # 1 means match, 0 mismatch (use 25% quantil as threshold)
train.shape

(4958347, 60)

In [18]:
train['star_diff_filled'] = np.abs(train.visitor_hist_starrating_filled - train.prop_starrating)
test['star_diff_filled'] = np.abs(test.visitor_hist_starrating_filled - test.prop_starrating)
train.shape

(4958347, 62)

In [19]:
train['star_diff_bins'], star_diff_bins = pd.cut(train['star_diff_filled'], 4, labels=np.arange(4), retbins=True)
test['star_diff_bins'] = pd.cut(test['star_diff_filled'], bins=star_diff_bins, labels=np.arange(4))
train.shape

(4958347, 63)

In [23]:
train['price_diff_filled'] = np.abs(train.visitor_hist_adr_usd_filled - train.price_usd)
test['price_diff_filled'] = np.abs(test.visitor_hist_adr_usd_filled - test.price_usd)
train.shape

(4958347, 65)

In [24]:
train['price_diff_bins'] = pd.qcut(train['price_diff_filled'], 10, labels=np.arange(10))
test['price_diff_bins'] = pd.qcut(test['price_diff_filled'], 10 , labels=np.arange(10))
train.shape

(4958347, 66)

In [25]:
train['month'] = train.date_time.apply(lambda x: x.split('-')[1])
test['month'] = test.date_time.apply(lambda x: x.split('-')[1])
train.shape

(4958347, 67)

In [26]:
# difference between mean prices of booked hotels and mean prices of not-booked hotels per site_id
site_groups = train.groupby(['site_id'])
price_diff = site_groups.apply(lambda x: x[x.booking_bool == 1].price_usd.mean() - x[x.booking_bool == 0].price_usd.mean())
train.shape

(4958347, 67)

In [27]:
train['willingness_to_pay'] = train.site_id.apply(lambda x: 1 if price_diff[x] > 0 else 0)
test['willingness_to_pay'] = test.site_id.apply(lambda x: 1 if price_diff[x] > 0 else 0)
train.shape

(4958347, 68)

In [28]:
hotel_groups = train.groupby(['prop_id'])
hotel_quality = hotel_groups.apply(lambda x: float(x.booking_bool.sum())/x.booking_bool.values.size)
train.shape

(4958347, 68)

In [29]:
train['prop_desirability'] = train.prop_id.apply(lambda x: hotel_quality[x])
train.shape

(4958347, 69)

In [30]:
def get_hotel_quality(prop_id):
    try:
        return hotel_quality[prop_id]
    except:
        return hotel_quality.mean()
    
test['prop_desirability'] = test.prop_id.apply(get_hotel_quality)

In [32]:
all_prop_ids = train.prop_id.unique()
leave_out = np.random.choice(all_prop_ids, int(0.06*len(all_prop_ids)), replace=False)
hotel_quality.loc[leave_out] = hotel_quality.mean()
train['prop_desirability_incomplete'] = train.prop_id.apply(lambda x: hotel_quality[x])
train.shape
#test.shape

(4958347, 70)

In [33]:
# combine the two location scores into one joint feature
train['prop_location_score_combined'] = (train.prop_location_score1 / train.prop_location_score1.mean()) - np.log(train.prop_location_score2 + 0.00001)/np.mean(np.log(train.prop_location_score2 + 0.000001))
test['prop_location_score_combined'] = (test.prop_location_score1 / test.prop_location_score1.mean()) - np.log(test.prop_location_score2 + 0.00001)/np.mean(np.log(test.prop_location_score2 + 0.000001))
train.shape

(4958347, 71)

In [34]:
train['booked_clicked_combined'] = train.booking_bool + train.click_bool
train.shape

(4958347, 72)

### Feature Normalization 

In [8]:
def normalize_feature(df, feature_name, normalize_wrt_feature):
    groups = df.groupby(normalize_wrt_feature)
    avg_feature_vals = groups.apply(lambda x: x[feature_name].mean())
    new_col = []
    for row in df.itertuples():
        normed_val = getattr(row, feature_name) / avg_feature_vals[getattr(row, normalize_wrt_feature)]
        new_col.append(normed_val if np.isfinite(normed_val) else 0)
    df[feature_name + '_norm_' + normalize_wrt_feature] = new_col


(4958347, 54)

In [None]:
# normalize numerical features with regard to 'srch_destination_id'
normalize_feature(train, 'price_usd','srch_destination_id')
normalize_feature(test, 'price_usd','srch_destination_id')
normalize_feature(train, 'prop_location_score1','srch_destination_id')
normalize_feature(test, 'prop_location_score1','srch_destination_id')
normalize_feature(train, 'prop_location_score2','srch_destination_id')
normalize_feature(test, 'prop_location_score2','srch_destination_id')
normalize_feature(train, 'prop_review_score','srch_destination_id')
normalize_feature(test, 'prop_review_score','srch_destination_id')
train.shape

  


In [None]:
# normalize numerical features with regard to 'srch_id'
normalize_feature(train, 'price_usd','srch_id')
#normalize_feature(test, 'price_usd','srch_id')
normalize_feature(train, 'prop_location_score1','srch_id')
#normalize_feature(test, 'prop_location_score1','srch_id')
normalize_feature(train, 'prop_location_score2','srch_id')
#normalize_feature(test, 'prop_location_score2','srch_id')
normalize_feature(train, 'prop_review_score','srch_id')
#normalize_feature(test, 'prop_review_score','srch_id')
train.shape

In [None]:
# normalize numerical features with regard to 'site_id'
normalize_feature(train, 'price_usd','site_id')
#normalize_feature(test, 'price_usd','site_id')
normalize_feature(train, 'prop_location_score1','site_id')
#normalize_feature(test, 'prop_location_score1','site_id')
normalize_feature(train, 'prop_location_score2','site_id')
#normalize_feature(test, 'prop_location_score2','site_id')
normalize_feature(train, 'prop_review_score','site_id')
#normalize_feature(test, 'prop_review_score','site_id')
train.shape

### Average hotel features 

In [8]:
# compute average characteristics per prop_id
properties_train = train[['price_usd','prop_location_score1','prop_location_score2','prop_location_score_combined','prop_review_score','prop_starrating','prop_id']].groupby(['prop_id'])
avg_price_usd_per_prop_train = properties_train.apply(lambda x: x.price_usd.mean())
avg_loc_score1_per_prop_train = properties_train.apply(lambda x: x.prop_location_score1.mean())
avg_loc_score2_per_prop_train = properties_train.apply(lambda x: x.prop_location_score2.mean())
avg_loc_score_combined_per_prop_train = properties_train.apply(lambda x: x.prop_location_score_combined.mean())
avg_review_score_per_prop_train = properties_train.apply(lambda x: x.prop_review_score.mean())
avg_starrating_per_prop_train = properties_train.apply(lambda x: x.prop_starrating.mean())

# create features
train['avg_prop_price_usd'] = train.prop_id.apply(lambda x: avg_price_usd_per_prop_train[x])
train['avg_prop_location_score1'] = train.prop_id.apply(lambda x: avg_loc_score1_per_prop_train[x])
train['avg_prop_location_score2'] = train.prop_id.apply(lambda x: avg_loc_score2_per_prop_train[x])
train['avg_prop_location_score_combined'] = train.prop_id.apply(lambda x: avg_loc_score_combined_per_prop_train[x])
train['avg_prop_review_score'] = train.prop_id.apply(lambda x: avg_review_score_per_prop_train[x])
train['avg_prop_starrating'] = train.prop_id.apply(lambda x: avg_starrating_per_prop_train[x])

In [9]:
# compute average characteristics per prop_id
properties_test = test[['price_usd','prop_location_score1','prop_location_score2','prop_location_score_combined','prop_review_score','prop_starrating','prop_id']].groupby(['prop_id'])
avg_price_usd_per_prop_test = properties_test.apply(lambda x: x.price_usd.mean())
avg_loc_score1_per_prop_test = properties_test.apply(lambda x: x.prop_location_score1.mean())
avg_loc_score2_per_prop_test = properties_test.apply(lambda x: x.prop_location_score2.mean())
avg_loc_score_combined_per_prop_test = properties_test.apply(lambda x: x.prop_location_score_combined.mean())
avg_review_score_per_prop_test = properties_test.apply(lambda x: x.prop_review_score.mean())
avg_starrating_per_prop_test = properties_test.apply(lambda x: x.prop_starrating.mean())

# create features
test['avg_prop_price_usd'] = test.prop_id.apply(lambda x: avg_price_usd_per_prop_test[x])
test['avg_prop_location_score1'] = test.prop_id.apply(lambda x: avg_loc_score1_per_prop_test[x])
test['avg_prop_location_score2'] = test.prop_id.apply(lambda x: avg_loc_score2_per_prop_test[x])
test['avg_prop_location_score_combined'] = test.prop_id.apply(lambda x: avg_loc_score_combined_per_prop_test[x])
test['avg_prop_review_score'] = test.prop_id.apply(lambda x: avg_review_score_per_prop_test[x])
test['avg_prop_starrating'] = test.prop_id.apply(lambda x: avg_starrating_per_prop_test[x])

### Delete features that we decided not to use

In [35]:
train.drop(['comp1_rate','comp1_inv','comp1_rate_percent_diff','comp2_rate','comp2_inv','comp2_rate_percent_diff','comp3_rate','comp3_inv','comp3_rate_percent_diff','comp4_rate','comp4_inv','comp4_rate_percent_diff','comp5_rate','comp5_inv','comp5_rate_percent_diff','comp6_rate','comp6_inv','comp6_rate_percent_diff','comp7_rate','comp7_inv','comp7_rate_percent_diff','comp8_rate','comp8_inv','comp8_rate_percent_diff'], axis=1, inplace=True)
train.shape

(4958347, 48)

### Establish a model for position estimation 

In [36]:
# create samples that always contain all queries of a srch_id
srch_id_groups = train.groupby('srch_id')
all_srch_ids = train.srch_id.unique()
num_samples = int(0.25*len(all_srch_ids)) # 25% of the entire train set
np.random.seed(0)
position_model_ids = np.random.choice(all_srch_ids, num_samples, replace=False)
position_model_sample = pd.concat([srch_id_groups.get_group(group) for group in position_model_ids])
train.shape

(4958347, 48)

In [37]:
pos_features = ['site_id','prop_country_id','prop_starrating','prop_review_score','prop_brand_bool','prop_location_score2','price_usd','promotion_flag','srch_children_count','srch_room_count','visitor_hist_starrating_filled','star_diff_filled','prop_review_score_norm_srch_destination_id','price_usd_norm_srch_id','price_diff_bins','price_usd_norm_srch_destination_id','prop_location_score1_norm_srch_destination_id','prop_location_score2_norm_srch_destination_id','prop_review_score_norm_srch_id','prop_location_score1_norm_srch_id','prop_location_score2_norm_srch_id','prop_location_score_combined','price_usd_norm_site_id','prop_location_score2_norm_site_id','prop_review_score_norm_site_id','avg_prop_price_usd','avg_prop_location_score2','avg_prop_location_score_combined','avg_prop_review_score','avg_prop_starrating']
train.shape

(4958347, 48)

In [38]:
# evaluate performance of position model with cross-validation on the entire training set
cv = StratifiedKFold(n_splits=4, random_state=0)
scores = cross_val_score(RandomForestRegressor(n_estimators=50), train[pos_features], train['position'], cv=cv)
print(np.mean(scores))
train.shape

KeyError: "['prop_review_score_norm_srch_destination_id' 'price_usd_norm_srch_id'\n 'prop_location_score1_norm_srch_destination_id'\n 'prop_location_score2_norm_srch_destination_id'\n 'prop_review_score_norm_srch_id' 'prop_location_score1_norm_srch_id'\n 'prop_location_score2_norm_srch_id' 'price_usd_norm_site_id'\n 'prop_location_score2_norm_site_id' 'prop_review_score_norm_site_id'\n 'avg_prop_price_usd' 'avg_prop_location_score2'\n 'avg_prop_location_score_combined' 'avg_prop_review_score'\n 'avg_prop_starrating'] not in index"

In [4]:
# train the position model
clf = RandomForestRegressor(n_estimators=50, random_state=0)
clf.fit(train_sample[pos_features], position_model_sample['position'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [None]:
# save the trained model to file
pickle.dump(clf, open('position_model.sav', 'wb'))

In [None]:
# load the position model from file
#clf = pickle.load(open('position_model.sav', 'rb'))

In [10]:
train['position_estimate'] = clf.predict(train[pos_features]).astype(int)
test['position_estimate'] = clf.predict(test[pos_features]).astype(int)

In [13]:
pd.DataFrame({'feature':pos_features, 'feature_importance':clf.feature_importances_}).sort_values(by='feature_importance', ascending=False)

Unnamed: 0,feature,feature_importance
20,prop_location_score2_norm_srch_id,0.108617
18,prop_review_score_norm_srch_id,0.091706
19,prop_location_score1_norm_srch_id,0.090787
6,prop_location_score2,0.086432
17,prop_location_score2_norm_srch_destination_id,0.068569
15,price_usd_norm_srch_destination_id,0.067228
7,price_usd,0.060701
5,prop_location_score1,0.054462
24,avg_prop_location_score_combined,0.051149
12,month,0.040025


### Delete columns that cannot be used in the model 

In [17]:
# delete unnecessary columns
train.drop(columns=['date_time', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'gross_bookings_usd', 'star_diff', 'price_diff'], inplace=True)
test.drop(columns=['date_time', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'star_diff', 'price_diff'], inplace=True)

### Save the cleaned datasets 

In [21]:
train.to_csv('train_clean.csv', index=False)

In [22]:
test.to_csv('test_clean.csv', index=False)

### Use th following code to divide the training set into samples. Apply all cross-validations on xval_sample or subsets of it!

In [None]:
# divide the training set into subsamples for: feature engineering (position modeling), cross-validation, single training, single validation
srch_id_groups = train.groupby('srch_id')
all_srch_ids = train.srch_id.unique()
num_samples_pos = int(0.25*len(all_srch_ids)) # 25%
num_samples_train = int(0.5*len(all_srch_ids)) # 50%
np.random.seed(0)
position_model_ids = np.random.choice(all_srch_ids, num_samples_pos, replace=False)
xval_ids = list(set(all_srch_ids)-set(position_model_ids))
training_ids = np.random.choice(xval_ids, num_samples_train, replace=False)
validation_ids = list(set(xval_ids)-set(training_ids))

xval_sample = train[train.srch_id.isin(xval_ids)]
train_sample = train[train.srch_id.isin(training_ids)]
validation_sample = train[train.srch_id.isin(validation_ids)]

# Execute this before making the final predictions for the test set !!!!

In [None]:
test['price_diff_bins'] = pd.qcut(test['price_diff_filled'], 10 , labels=np.arange(10))