In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score

In [2]:
train = pd.read_csv('Data Mining VU data/training_set_VU_DM_2014.csv')
test = pd.read_csv('Data Mining VU data/test_set_VU_DM_2014.csv')

In [3]:
train_clean = pd.read_csv('processed_data/train_clean.csv')
test_clean = pd.read_csv('processed_data/test_clean.csv')

In [4]:
# smaller samples for faster computing
tr_cl_sample = train_clean.sample(frac=0.5, random_state=0)
te_cl_sample = train_clean.drop(tr_cl_sample.index)

In [5]:
tr_cl_sample.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,star_diff_bool,price_diff,price_diff_bool,visitor_hist_starrating_filled,visitor_hist_adr_usd_filled,willingness_to_pay,booked_clicked_combined,prop_desirability,star_diff_filled,price_diff_filled
930452,62094,2013-01-26 22:10:19,18,31,,,32,47040,4,4.5,...,0,,0,3.604085,167.55862,0,0,0.0,0.395915,81.50138
1701430,114235,2012-11-13 12:42:21,1,13,,,15,115700,5,4.5,...,0,,0,3.600963,223.067754,0,0,0.022989,1.399037,291.762246
1595996,107254,2013-04-25 10:22:32,16,31,,,31,31574,4,0.0,...,0,,0,3.673218,194.181384,0,0,0.0,0.326782,12.688616
3367696,226202,2013-02-21 19:02:17,5,219,,,219,33423,3,3.5,...,0,,0,3.171522,166.362238,0,0,0.0,0.171522,31.362238
3880444,260461,2013-05-02 23:11:06,14,100,,,219,24194,3,4.5,...,0,,0,3.285429,184.321422,0,0,0.014634,0.285429,94.271422


In [6]:
tr_cl_sample.isnull().sum()

srch_id                                 0
date_time                               0
site_id                                 0
visitor_location_country_id             0
visitor_hist_starrating           2353405
visitor_hist_adr_usd              2352835
prop_country_id                         0
prop_id                                 0
prop_starrating                         0
prop_review_score                       0
prop_brand_bool                         0
prop_location_score1                    0
prop_location_score2                    0
prop_log_historical_price               0
position                                0
price_usd                               0
promotion_flag                          0
srch_destination_id                     0
srch_length_of_stay                     0
srch_booking_window                     0
srch_adults_count                       0
srch_children_count                     0
srch_room_count                         0
srch_saturday_night_bool          

# predict booked_clicked_combined

In [7]:
#col = [c for c in tr_cl_sample.columns if c not in ['booking_bool', 'date_time', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'gross_bookings_usd', 'star_diff', 'price_diff']]
col = [c for c in tr_cl_sample.columns if c not in ['position', 'date_time', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'gross_bookings_usd', 'star_diff', 'price_diff', 'booking_bool', 'click_bool', 'booked_clicked_combined']]

In [10]:
target = pd.DataFrame(tr_cl_sample['booked_clicked_combined'])
data = pd.DataFrame(tr_cl_sample[col])

clf = RandomForestClassifier()
clf.fit(data, target)

  """


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [11]:
data_test = pd.DataFrame(te_cl_sample[col])
target_test = pd.DataFrame(te_cl_sample['booked_clicked_combined'])

predict = clf.predict(data_test)

In [12]:
acc = accuracy_score(target_test, predict)
print('accuracy: %f' %acc)
f1_ = f1_score(target_test, predict, average='micro')
print('f1-score (micro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='macro')
print('f1-score (macro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='weighted')
print('f1-score (weighted): %f' %f1_)
recall = recall_score(target_test, predict, average='micro')
print('recall-score (micro): %f' %recall)
recall = recall_score(target_test, predict, average='macro')
print('recall-score (macro): %f' %recall)
recall = recall_score(target_test, predict, average='weighted')
print('recall-score (weighted): %f' %recall)

accuracy: 0.954543
f1-score (micro): 0.954543
f1-score (macro): 0.339044
f1-score (weighted): 0.933938
recall-score (micro): 0.954543
recall-score (macro): 0.339993
recall-score (weighted): 0.954543


In [13]:
dictionary = dict(zip(col, clf.feature_importances_))
pd.Series(dictionary).to_frame()
# how do I rank these in descending order??

Unnamed: 0,0
orig_destination_distance,0.069142
price_diff_bool,0.000946
price_diff_filled,0.069701
price_usd,0.071804
promotion_flag,0.004133
prop_brand_bool,0.008022
prop_country_id,0.018755
prop_desirability,0.089587
prop_id,0.063176
prop_location_score1,0.050037


In [14]:
col = ['prop_desirability', 'price_usd', 'prop_location_score2', 'prop_location_score1']
target = pd.DataFrame(tr_cl_sample['booked_clicked_combined'])
data = pd.DataFrame(tr_cl_sample[col])

clf2 = RandomForestClassifier()
clf2.fit(data, target)

data_test = pd.DataFrame(te_cl_sample[col])
target_test = pd.DataFrame(te_cl_sample['booked_clicked_combined'])

predict = clf2.predict(data_test)

  


In [15]:
acc = accuracy_score(target_test, predict)
print('accuracy: %f' %acc)
f1_ = f1_score(target_test, predict, average='micro')
print('f1-score (micro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='macro')
print('f1-score (macro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='weighted')
print('f1-score (weighted): %f' %f1_)
recall = recall_score(target_test, predict, average='micro')
print('recall-score (micro): %f' %recall)
recall = recall_score(target_test, predict, average='macro')
print('recall-score (macro): %f' %recall)
recall = recall_score(target_test, predict, average='weighted')
print('recall-score (weighted): %f' %recall)

accuracy: 0.944112
f1-score (micro): 0.944112
f1-score (macro): 0.350798
f1-score (weighted): 0.929948
recall-score (micro): 0.944112
recall-score (macro): 0.347001
recall-score (weighted): 0.944112


In [16]:
col = ['price_diff_filled', 'srch_booking_window', 'orig_destination_distance']
target = pd.DataFrame(tr_cl_sample['booked_clicked_combined'])
data = pd.DataFrame(tr_cl_sample[col])

clf3 = RandomForestClassifier()
clf3.fit(data, target)

data_test = pd.DataFrame(te_cl_sample[col])
target_test = pd.DataFrame(te_cl_sample['booked_clicked_combined'])

predict = clf3.predict(data_test)

  


In [17]:
acc = accuracy_score(target_test, predict)
print('accuracy: %f' %acc)
f1_ = f1_score(target_test, predict, average='micro')
print('f1-score (micro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='macro')
print('f1-score (macro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='weighted')
print('f1-score (weighted): %f' %f1_)
recall = recall_score(target_test, predict, average='micro')
print('recall-score (micro): %f' %recall)
recall = recall_score(target_test, predict, average='macro')
print('recall-score (macro): %f' %recall)
recall = recall_score(target_test, predict, average='weighted')
print('recall-score (weighted): %f' %recall)

accuracy: 0.944357
f1-score (micro): 0.944357
f1-score (macro): 0.332522
f1-score (weighted): 0.928413
recall-score (micro): 0.944357
recall-score (macro): 0.334937
recall-score (weighted): 0.944357


In [18]:
col = ['prop_desirability', 'srch_id']
target = pd.DataFrame(tr_cl_sample['booked_clicked_combined'])
data = pd.DataFrame(tr_cl_sample[col])

clf4 = RandomForestClassifier()
clf4.fit(data, target)

data_test = pd.DataFrame(te_cl_sample[col])
target_test = pd.DataFrame(te_cl_sample['booked_clicked_combined'])

predict = clf4.predict(data_test)

  


In [19]:
acc = accuracy_score(target_test, predict)
print('accuracy: %f' %acc)
f1_ = f1_score(target_test, predict, average='micro')
print('f1-score (micro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='macro')
print('f1-score (macro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='weighted')
print('f1-score (weighted): %f' %f1_)
recall = recall_score(target_test, predict, average='micro')
print('recall-score (micro): %f' %recall)
recall = recall_score(target_test, predict, average='macro')
print('recall-score (macro): %f' %recall)
recall = recall_score(target_test, predict, average='weighted')
print('recall-score (weighted): %f' %recall)

accuracy: 0.935638
f1-score (micro): 0.935638
f1-score (macro): 0.348683
f1-score (weighted): 0.925644
recall-score (micro): 0.935638
recall-score (macro): 0.346250
recall-score (weighted): 0.935638


In [20]:
col = ['prop_desirability', 'orig_destination_distance', 'price_usd']
target = pd.DataFrame(tr_cl_sample['booked_clicked_combined'])
data = pd.DataFrame(tr_cl_sample[col])

clf5 = RandomForestClassifier()
clf5.fit(data, target)

data_test = pd.DataFrame(te_cl_sample[col])
target_test = pd.DataFrame(te_cl_sample['booked_clicked_combined'])

predict = clf5.predict(data_test)

  


In [21]:
acc = accuracy_score(target_test, predict)
print('accuracy: %f' %acc)
f1_ = f1_score(target_test, predict, average='micro')
print('f1-score (micro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='macro')
print('f1-score (macro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='weighted')
print('f1-score (weighted): %f' %f1_)
recall = recall_score(target_test, predict, average='micro')
print('recall-score (micro): %f' %recall)
recall = recall_score(target_test, predict, average='macro')
print('recall-score (macro): %f' %recall)
recall = recall_score(target_test, predict, average='weighted')
print('recall-score (weighted): %f' %recall)

accuracy: 0.948443
f1-score (micro): 0.948443
f1-score (macro): 0.345566
f1-score (weighted): 0.931579
recall-score (micro): 0.948443
recall-score (macro): 0.343423
recall-score (weighted): 0.948443


In [22]:
col = ['srch_id', 'orig_destination_distance', 'price_usd', 'willingness_to_pay']
target = pd.DataFrame(tr_cl_sample['booked_clicked_combined'])
data = pd.DataFrame(tr_cl_sample[col])

clf6 = RandomForestClassifier()
clf6.fit(data, target)

data_test = pd.DataFrame(te_cl_sample[col])
target_test = pd.DataFrame(te_cl_sample['booked_clicked_combined'])

predict = clf6.predict(data_test)

  


In [23]:
acc = accuracy_score(target_test, predict)
print('accuracy: %f' %acc)
f1_ = f1_score(target_test, predict, average='micro')
print('f1-score (micro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='macro')
print('f1-score (macro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='weighted')
print('f1-score (weighted): %f' %f1_)
recall = recall_score(target_test, predict, average='micro')
print('recall-score (micro): %f' %recall)
recall = recall_score(target_test, predict, average='macro')
print('recall-score (macro): %f' %recall)
recall = recall_score(target_test, predict, average='weighted')
print('recall-score (weighted): %f' %recall)

accuracy: 0.950454
f1-score (micro): 0.950454
f1-score (macro): 0.329628
f1-score (weighted): 0.931132
recall-score (micro): 0.950454
recall-score (macro): 0.334300
recall-score (weighted): 0.950454


In [24]:
col = ['srch_booking_window', 'orig_destination_distance', 'prop_desirability']
target = pd.DataFrame(tr_cl_sample['booked_clicked_combined'])
data = pd.DataFrame(tr_cl_sample[col])

clf7 = RandomForestClassifier()
clf7.fit(data, target)

data_test = pd.DataFrame(te_cl_sample[col])
target_test = pd.DataFrame(te_cl_sample['booked_clicked_combined'])

predict = clf7.predict(data_test)

  


In [25]:
acc = accuracy_score(target_test, predict)
print('accuracy: %f' %acc)
f1_ = f1_score(target_test, predict, average='micro')
print('f1-score (micro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='macro')
print('f1-score (macro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='weighted')
print('f1-score (weighted): %f' %f1_)
recall = recall_score(target_test, predict, average='micro')
print('recall-score (micro): %f' %recall)
recall = recall_score(target_test, predict, average='macro')
print('recall-score (macro): %f' %recall)
recall = recall_score(target_test, predict, average='weighted')
print('recall-score (weighted): %f' %recall)

accuracy: 0.947974
f1-score (micro): 0.947974
f1-score (macro): 0.343486
f1-score (weighted): 0.931150
recall-score (micro): 0.947974
recall-score (macro): 0.342035
recall-score (weighted): 0.947974


In [26]:
col = ['srch_booking_window', 'orig_destination_distance', 'price_usd', 'prop_location_score2', 'prop_desirability']
target = pd.DataFrame(tr_cl_sample['booked_clicked_combined'])
data = pd.DataFrame(tr_cl_sample[col])

clf8 = RandomForestClassifier()
clf8.fit(data, target)

data_test = pd.DataFrame(te_cl_sample[col])
target_test = pd.DataFrame(te_cl_sample['booked_clicked_combined'])

predict = clf8.predict(data_test)

  


In [27]:
acc = accuracy_score(target_test, predict)
print('accuracy: %f' %acc)
f1_ = f1_score(target_test, predict, average='micro')
print('f1-score (micro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='macro')
print('f1-score (macro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='weighted')
print('f1-score (weighted): %f' %f1_)
recall = recall_score(target_test, predict, average='micro')
print('recall-score (micro): %f' %recall)
recall = recall_score(target_test, predict, average='macro')
print('recall-score (macro): %f' %recall)
recall = recall_score(target_test, predict, average='weighted')
print('recall-score (weighted): %f' %recall)

accuracy: 0.953787
f1-score (micro): 0.953787
f1-score (macro): 0.340575
f1-score (weighted): 0.933727
recall-score (micro): 0.953787
recall-score (macro): 0.340760
recall-score (weighted): 0.953787


In [28]:
col = ['srch_booking_window', 'orig_destination_distance', 'price_usd', 'prop_location_score2', 'prop_desirability', 'star_diff_filled']
target = pd.DataFrame(tr_cl_sample['booked_clicked_combined'])
data = pd.DataFrame(tr_cl_sample[col])

clf9 = RandomForestClassifier()
clf9.fit(data, target)

data_test = pd.DataFrame(te_cl_sample[col])
target_test = pd.DataFrame(te_cl_sample['booked_clicked_combined'])

predict = clf9.predict(data_test)

  


In [29]:
acc = accuracy_score(target_test, predict)
print('accuracy: %f' %acc)
f1_ = f1_score(target_test, predict, average='micro')
print('f1-score (micro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='macro')
print('f1-score (macro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='weighted')
print('f1-score (weighted): %f' %f1_)
recall = recall_score(target_test, predict, average='micro')
print('recall-score (micro): %f' %recall)
recall = recall_score(target_test, predict, average='macro')
print('recall-score (macro): %f' %recall)
recall = recall_score(target_test, predict, average='weighted')
print('recall-score (weighted): %f' %recall)

accuracy: 0.953981
f1-score (micro): 0.953981
f1-score (macro): 0.340128
f1-score (weighted): 0.933778
recall-score (micro): 0.953981
recall-score (macro): 0.340524
recall-score (weighted): 0.953981


In [30]:
col = ['srch_booking_window', 'orig_destination_distance', 'price_usd', 'prop_location_score2', 'prop_desirability', 'star_diff_filled', 'srch_id']
target = pd.DataFrame(tr_cl_sample['booked_clicked_combined'])
data = pd.DataFrame(tr_cl_sample[col])

clf10 = RandomForestClassifier()
clf10.fit(data, target)

data_test = pd.DataFrame(te_cl_sample[col])
target_test = pd.DataFrame(te_cl_sample['booked_clicked_combined'])

predict = clf10.predict(data_test)

  


In [31]:
acc = accuracy_score(target_test, predict)
print('accuracy: %f' %acc)
f1_ = f1_score(target_test, predict, average='micro')
print('f1-score (micro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='macro')
print('f1-score (macro): %f' %f1_)
f1_ = f1_score(target_test, predict, average='weighted')
print('f1-score (weighted): %f' %f1_)
recall = recall_score(target_test, predict, average='micro')
print('recall-score (micro): %f' %recall)
recall = recall_score(target_test, predict, average='macro')
print('recall-score (macro): %f' %recall)
recall = recall_score(target_test, predict, average='weighted')
print('recall-score (weighted): %f' %recall)

accuracy: 0.954514
f1-score (micro): 0.954514
f1-score (macro): 0.336598
f1-score (weighted): 0.933766
recall-score (micro): 0.954514
recall-score (macro): 0.338733
recall-score (weighted): 0.954514


# predict booking_bool

In [33]:
diff_cols = [[c for c in tr_cl_sample.columns if c not in ['position', 'date_time', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'gross_bookings_usd', 'star_diff', 'price_diff', 'booking_bool', 'click_bool', 'booked_clicked_combined']], ['prop_desirability', 'price_usd', 'prop_location_score2', 'prop_location_score1'], ['price_diff_filled', 'srch_booking_window', 'orig_destination_distance'], ['prop_desirability', 'srch_id'], ['prop_desirability', 'orig_destination_distance', 'price_usd'], ['srch_id', 'orig_destination_distance', 'price_usd', 'willingness_to_pay'], ['srch_booking_window', 'orig_destination_distance', 'prop_desirability'], ['srch_booking_window', 'orig_destination_distance', 'price_usd', 'prop_location_score2', 'prop_desirability'], ['srch_booking_window', 'orig_destination_distance', 'price_usd', 'prop_location_score2', 'prop_desirability', 'star_diff_filled'], ['srch_booking_window', 'orig_destination_distance', 'price_usd', 'prop_location_score2', 'prop_desirability', 'star_diff_filled', 'srch_id']]

for i in range(len(diff_cols)):
    col = diff_cols[i]
    target = pd.DataFrame(tr_cl_sample['booking_bool'])
    data = pd.DataFrame(tr_cl_sample[col])

    clf = RandomForestClassifier()
    clf.fit(data, target)
    
    data_test = pd.DataFrame(te_cl_sample[col])
    target_test = pd.DataFrame(te_cl_sample['booking_bool'])

    predict = clf.predict(data_test)
    
    print(i)
    acc = accuracy_score(target_test, predict)
    print('accuracy: %f' %acc)
    f1_ = f1_score(target_test, predict, average='micro')
    print('f1-score (micro): %f' %f1_)
    f1_ = f1_score(target_test, predict, average='macro')
    print('f1-score (macro): %f' %f1_)
    f1_ = f1_score(target_test, predict, average='weighted')
    print('f1-score (weighted): %f' %f1_)
    recall = recall_score(target_test, predict, average='micro')
    print('recall-score (micro): %f' %recall)
    recall = recall_score(target_test, predict, average='macro')
    print('recall-score (macro): %f' %recall)
    recall = recall_score(target_test, predict, average='weighted')
    print('recall-score (weighted): %f' %recall)

  if __name__ == '__main__':


0
accuracy: 0.971666
f1-score (micro): 0.971666
f1-score (macro): 0.508182
f1-score (weighted): 0.958863
recall-score (micro): 0.971666
recall-score (macro): 0.507626
recall-score (weighted): 0.971666
1
accuracy: 0.965263
f1-score (micro): 0.965263
f1-score (macro): 0.522681
f1-score (weighted): 0.956543
recall-score (micro): 0.965263
recall-score (macro): 0.516801
recall-score (weighted): 0.965263
2
accuracy: 0.965583
f1-score (micro): 0.965583
f1-score (macro): 0.496600
f1-score (weighted): 0.955254
recall-score (micro): 0.965583
recall-score (macro): 0.499940
recall-score (weighted): 0.965583
3
accuracy: 0.960629
f1-score (micro): 0.960629
f1-score (macro): 0.520081
f1-score (weighted): 0.954122
recall-score (micro): 0.960629
recall-score (macro): 0.516039
recall-score (weighted): 0.960629
4
accuracy: 0.968110
f1-score (micro): 0.968110
f1-score (macro): 0.515787
f1-score (weighted): 0.957553
recall-score (micro): 0.968110
recall-score (macro): 0.511878
recall-score (weighted): 0.96