## Imports, loads and initialization

In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import datetime as dt

In [2]:
BD = "bookings_data"
B = "bookings"
CD = "customer_data"
HD = "hotels_data"
PD = "payments_data"
SS = "sample_submission_5"
TD = "train_data"

base_path = r"../data/"

def load_from_csv(filename):
    file_path = base_path + filename + ".csv"
    return pd.read_csv(file_path)

In [3]:
bookings_data = load_from_csv(BD)
bookings = load_from_csv(B)
customer_data = load_from_csv(CD)
hotels_data = load_from_csv(HD)
payments_data = load_from_csv(PD)
sample_submission = load_from_csv(SS)
train_data = load_from_csv(TD)

bookings['booking_create_timestamp'] = pd.to_datetime(bookings['booking_create_timestamp'])
bookings['booking_approved_at'] = pd.to_datetime(bookings['booking_approved_at'])
bookings['booking_checkin_customer_date'] = pd.to_datetime(bookings['booking_checkin_customer_date'])
bookings_data['booking_expiry_date'] = pd.to_datetime(bookings_data['booking_expiry_date'])

In [39]:
# bookings_data.describe()
# bookings.describe()
# customer_data.describe()
# hotels_data.describe()
payments_data.describe()
# sample_submission.describe()
# train_data.describe()

Unnamed: 0,payment_sequential,payment_installments,payment_value
count,103886.0,103886.0,103886.0
mean,1.092679,2.853349,154.10038
std,0.706584,2.687051,217.494064
min,1.0,0.0,0.0
25%,1.0,1.0,56.79
50%,1.0,1.0,100.0
75%,1.0,4.0,171.8375
max,29.0,24.0,13664.08


## Data modification to handle non-numerical data

In [4]:
def get_hash(val_list):
    hashmap = {}
    for i,val in enumerate(val_list):
        hashmap[val] = i
    return hashmap

In [5]:
unique_ids = payments_data['booking_id'].unique()
hash_val = get_hash(unique_ids)
counts = np.zeros(unique_ids.shape)
total_installments = np.zeros(unique_ids.shape)
value_sum = np.zeros(unique_ids.shape)
for i in range(payments_data.shape[0]):
    booking_id = payments_data.at[i, 'booking_id']
    installments = payments_data.at[i, 'payment_installments']
    payment_value = payments_data.at[i, 'payment_value']
    counts[hash_val[booking_id]] += 1
    total_installments[hash_val[booking_id]] += installments
    value_sum[hash_val[booking_id]] += payment_value
payments_data_modified = []
for booking_id in unique_ids:
    payments_made =  counts[hash_val[booking_id]]
    payments_installments = total_installments[hash_val[booking_id]]
    payments_value = value_sum[hash_val[booking_id]]
    payments_data_modified.append([booking_id, payments_made, payments_installments, payments_value])
payments_data_modified = pd.DataFrame(data=payments_data_modified, columns=['booking_id', 'payments_made', 'payments_installments', 'payments_value'])
payments_data_modified.describe(include='all')

Unnamed: 0,booking_id,payments_made,payments_installments,payments_value
count,99440,99440.0,99440.0,99440.0
unique,99440,,,
top,6f3fe1789b1e8b2acac839d17b81ef22,,,
freq,1,,,
mean,,1.04471,2.980923,160.990267
std,,0.381166,2.74181,221.951257
min,,1.0,0.0,0.0
25%,,1.0,1.0,62.01
50%,,1.0,2.0,105.29
75%,,1.0,4.0,176.97


In [12]:
# Has NaNs!!!
unique_ids = hotels_data['hotel_id'].unique()
hotel_id_hash = get_hash(unique_ids)
hotels_data_modified = []
for i in range(hotels_data.shape[0]):
    hotel_id = hotel_id_hash[hotels_data.at[i, 'hotel_id']]
    category = hotels_data.at[i, 'hotel_category']
    name_length = hotels_data.at[i, 'hotel_name_length']
    description_length = hotels_data.at[i, 'hotel_description_length']
    photos_qty = hotels_data.at[i, 'hotel_photos_qty']
    hotels_data_modified.append([hotel_id, category, name_length, description_length, photos_qty])
hotels_data_modified = pd.DataFrame(data=hotels_data_modified, columns=[
    'hotel_id', 'hotel_category', 'hotel_name_length', 'hotel_description_length', 'hotel_photos_qty'])
hotels_data_modified.fillna(0, inplace=True)
hotels_data_modified.describe(include='all')

Unnamed: 0,hotel_id,hotel_category,hotel_name_length,hotel_description_length,hotel_photos_qty
count,32951.0,32951.0,32951.0,32951.0,32951.0
mean,16475.0,30.234136,48.523656,768.227945,2.166975
std,9512.278697,23.520923,10.156155,629.658469,1.728063
min,0.0,1.0,5.0,4.0,1.0
25%,8237.5,10.0,42.0,344.0,1.0
50%,16475.0,28.0,51.0,595.0,1.0
75%,24712.5,41.0,57.0,961.0,3.0
max,32950.0,73.0,76.0,3992.0,20.0


In [7]:
unique_unique_ids = customer_data['customer_unique_id'].unique()
unique_id_hash = get_hash(unique_unique_ids)
unique_countries = customer_data['country'].unique()
country_hash = get_hash(unique_countries)
customer_data_modified = []
for i in range(customer_data.shape[0]):
    customer_id = customer_data.at[i, 'customer_id']
    unique_id_num = unique_id_hash[customer_data.at[i, 'customer_unique_id']]
    country_num = country_hash[customer_data.at[i, 'country']]
    customer_data_modified.append([customer_id, unique_id_num, country_num])
customer_data_modified = pd.DataFrame(data=customer_data_modified, columns=['customer_id', 'unique_id_num', 'country_num'])
customer_data_modified.describe(include='all')


Unnamed: 0,customer_id,unique_id_num,country_num
count,99441,99441.0,99441.0
unique,99441,,
top,1fbc88172c00ba8bc706b8999e2fba1a,,
freq,1,,
mean,,47470.47591,4.001428
std,,27757.61303,2.578318
min,,0.0,0.0
25%,,23353.0,2.0
50%,,47201.0,4.0
75%,,71453.0,6.0


In [13]:
# Missing: dates besides created
unique_booking_status = bookings['booking_status'].unique()
status_hash = get_hash(unique_booking_status)
bookings_modified = []
for i in range(bookings.shape[0]):
    booking_id = bookings.at[i, 'booking_id']
    customer_id = bookings.at[i, 'customer_id']
    booking_status_num = status_hash[bookings.at[i, 'booking_status']]
    create_date = bookings.at[i, 'booking_create_timestamp']
    approved_date = bookings.at[i, 'booking_approved_at']
    checkin_date = bookings.at[i, 'booking_checkin_customer_date']
    approval_time = (approved_date-create_date).total_seconds()/60
    checkin_time = (checkin_date-create_date).total_seconds()/1440
    bookings_modified.append([booking_id, customer_id, booking_status_num, create_date, approval_time, checkin_time])
bookings_modified = pd.DataFrame(data=bookings_modified, columns=[
    'booking_id', 'customer_id', 'booking_status_num', 'booking_create_timestamp', 'booking_approval_time', 'booking_checkin_time'])
bookings_modified.fillna(-1, inplace=True)
bookings_modified.describe(include='all')

  bookings_modified.fillna(bookings_modified.median(), inplace=True)
  bookings_modified.fillna(bookings_modified.median(), inplace=True)
  bookings_modified.describe(include='all')


Unnamed: 0,booking_id,customer_id,booking_status_num,booking_create_timestamp,booking_approval_time,booking_checkin_time
count,99441,99441,99441.0,99441,99441.0,99441.0
unique,99441,99441,,98875,,
top,c54678b7cc49136f2d6af7e481f51cbd,51297304e76186b10a928d9ef432eb62,,2008-04-13 10:31:14,,
freq,1,1,,3,,
first,,,,2006-09-07 20:58:19,,
last,,,,2008-10-19 17:13:18,,
mean,,,0.090868,,624.172948,749.334182
std,,,0.569725,,1561.210891,564.69325
min,,,0.0,,0.0,32.004861
25%,,,0.0,,12.916667,412.427778


In [9]:
unique_ids = bookings_data['booking_id'].unique()
hash_val = get_hash(unique_ids)
unique_agents = bookings_data['seller_agent_id'].unique()
agent_hash = get_hash(unique_agents)
counts = np.zeros(unique_ids.shape)
hotel_ids = np.zeros(unique_ids.shape).tolist()
seller_agent_ids = np.zeros(unique_ids.shape).tolist()
booking_expiry_dates = np.zeros(unique_ids.shape).tolist()
prices = np.zeros(unique_ids.shape).tolist()
agent_feess = np.zeros(unique_ids.shape).tolist()
for i in range(bookings_data.shape[0]):
    booking_id = bookings_data.at[i, 'booking_id']
    counts[hash_val[booking_id]] += 1
    hotel_ids[hash_val[booking_id]] = hotel_id_hash[bookings_data.at[i, 'hotel_id']]
    seller_agent_ids[hash_val[booking_id]] = agent_hash[bookings_data.at[i, 'seller_agent_id']]
    booking_expiry_dates[hash_val[booking_id]] = bookings_data.at[i, 'booking_expiry_date']
    prices[hash_val[booking_id]] = bookings_data.at[i, 'price']
    agent_feess[hash_val[booking_id]] = bookings_data.at[i, 'agent_fees']
bookings_data_modified = []
for booking_id in unique_ids:
    sub_requests = counts[hash_val[booking_id]]
    hotel_id = hotel_ids[hash_val[booking_id]]
    seller_agent_id = seller_agent_ids[hash_val[booking_id]]
    booking_expiry_date = booking_expiry_dates[hash_val[booking_id]]
    price = prices[hash_val[booking_id]]
    agent_fees = agent_feess[hash_val[booking_id]]
    bookings_data_modified.append([booking_id, sub_requests, hotel_id, seller_agent_id, booking_expiry_date, price, agent_fees])
bookings_data_modified =  pd.DataFrame(data=bookings_data_modified, columns=[
    'booking_id', 'sub_requests', 'hotel_id', 'seller_agent_id_num', 'booking_expiry_date', 'price', 'agent_fees'
])
bookings_data_modified.describe(include='all')

  bookings_data_modified.describe(include='all')


Unnamed: 0,booking_id,sub_requests,hotel_id,seller_agent_id_num,booking_expiry_date,price,agent_fees
count,98666,98666.0,98666.0,98666.0,98666,98666.0,98666.0
unique,98666,,,,93009,,
top,242fe8c5a6d1ba2dd792cb1621400010,,,,2018-06-11 03:31:04,,
freq,1,,,,6,,
first,,,,,2016-09-19 00:15:34,,
last,,,,,2020-04-09 22:35:08,,
mean,,1.141731,16536.404293,471.238157,,125.883144,20.179411
std,,0.538452,9578.771521,555.651909,,191.166764,15.855944
min,,1.0,0.0,0.0,,0.85,0.0
25%,,1.0,8290.0,88.0,,41.4925,13.28


## Obtaining total train and test data

In [18]:
train_data_full = pd.merge(left=train_data, right=bookings_modified, how='left', on='booking_id')
train_data_full = pd.merge(left=train_data_full, right=bookings_data_modified, how='left', on='booking_id')
train_data_full = pd.merge(left=train_data_full, right=customer_data_modified, how='left', on='customer_id')
train_data_full = pd.merge(left=train_data_full, right=hotels_data_modified, how='left', on='hotel_id')
train_data_full = pd.merge(left=train_data_full, right=payments_data_modified, how='left', on='booking_id')
# train_data_full.describe(include='all')
print(train_data_full.shape[0])
booking_expiry = []
unique_ids = train_data_full['booking_id'].unique()
id_hash = get_hash(unique_ids)
expiry_times = np.zeros(unique_ids.shape)
for i in range(train_data_full.shape[0]):
    booking_id = train_data_full.at[i, 'booking_id']
    expiry_date = train_data_full.at[i, 'booking_expiry_date']
    create_date = train_data_full.at[i, 'booking_create_timestamp']
    expiry_time = (expiry_date-create_date).total_seconds()/1440
    expiry_times[id_hash[booking_id]] = expiry_time
for booking_id in unique_ids:
    expiry_time = expiry_times[id_hash[booking_id]]
    booking_expiry.append([booking_id, expiry_time])
booking_expiry = pd.DataFrame(data=booking_expiry, columns=['booking_id', 'booking_expiry_time'])
print(booking_expiry.shape[0])
train_data_full = pd.merge(left=train_data_full, right=booking_expiry, how='left', on='booking_id')
train_data_full.drop(labels=['booking_create_timestamp', 'booking_expiry_date'], axis=1, inplace=True)
# means = {}
# medians = {}
# for column in train_data_full.columns:
#     means[column] = train_data_full[column].mean()
#     medians[column] = train_data_full[column].median()
# train_data_full.fillna(-1, inplace=True)
# train_data_full.fillna(train_data_full.median(), inplace=True)
# train_data_full.describe(include='all')
# train_data_full.columns.shape

50000
49868


In [20]:
test_data = sample_submission['booking_id']
test_data = pd.merge(left=test_data, right=bookings_modified, how='left', on='booking_id')
test_data = pd.merge(left=test_data, right=bookings_data_modified, how='left', on='booking_id')
test_data = pd.merge(left=test_data, right=customer_data_modified, how='left', on='customer_id')
test_data = pd.merge(left=test_data, right=hotels_data_modified, how='left', on='hotel_id')
test_data = pd.merge(left=test_data, right=payments_data_modified, how='left', on='booking_id')
# test_data.describe(include='all')
print(test_data.shape[0])
booking_expiry = []
unique_ids = test_data['booking_id'].unique()
id_hash = get_hash(unique_ids)
expiry_times = np.zeros(unique_ids.shape)
for i in range(test_data.shape[0]):
    booking_id = test_data.at[i, 'booking_id']
    expiry_date = test_data.at[i, 'booking_expiry_date']
    create_date = test_data.at[i, 'booking_create_timestamp']
    expiry_time = (expiry_date-create_date).total_seconds()/1440
    expiry_times[id_hash[booking_id]] = expiry_time
for booking_id in unique_ids:
    expiry_time = expiry_times[id_hash[booking_id]]
    booking_expiry.append([booking_id, expiry_time])
booking_expiry = pd.DataFrame(data=booking_expiry, columns=['booking_id', 'booking_expiry_time'])
print(booking_expiry.shape[0])
test_data = pd.merge(left=test_data, right=booking_expiry, how='left', on='booking_id')
test_data.drop(labels=['booking_create_timestamp', 'booking_expiry_date'], axis=1, inplace=True)
# test_data.fillna(-1, inplace=True)
# test_data.describe(include='all')
# test_data.columns.shape

49079
49079


In [44]:
train_data_full.columns

Index(['booking_id', 'rating_score', 'customer_id', 'booking_status_num',
       'booking_approval_time', 'booking_checkin_time', 'sub_requests',
       'hotel_id', 'seller_agent_id_num', 'price', 'agent_fees',
       'unique_id_num', 'country_num', 'hotel_category', 'hotel_name_length',
       'hotel_description_length', 'hotel_photos_qty', 'payments_made',
       'payments_installments', 'payments_value', 'booking_expiry_time'],
      dtype='object')

In [19]:
# from sklearn.ensemble import RandomForestClassifier as rf
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
train_labels = train_data_full['rating_score']
train_data_full.drop(labels=['rating_score', 'booking_id', 'customer_id'], axis=1, inplace=True)
# train_data_full.fillna(train_data_full.median(), inplace=True)
train_data_full.fillna(-1, inplace=True)
train_data_full.describe(include='all')
# X_train, X_val, Y_train, Y_val = train_test_split(train_data_full, train_labels, test_size=0.2)
# rfclf = rf()
# rfclf.fit(X_train, Y_train)
# train_pred = rfclf.predict(X_val)
# print(np.mean(np.subtract(Y_val, train_pred)**2))

Unnamed: 0,booking_status_num,booking_approval_time,booking_checkin_time,sub_requests,hotel_id,seller_agent_id_num,price,agent_fees,unique_id_num,country_num,hotel_category,hotel_name_length,hotel_description_length,hotel_photos_qty,payments_made,payments_installments,payments_value,booking_expiry_time
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.08924,624.397828,747.579535,1.1408,16547.89706,470.12778,124.119417,20.144149,47485.09326,4.00174,28.78766,48.91292,788.22678,2.22074,1.04638,2.97438,159.889098,219398.433048
std,0.565157,1280.894277,568.594422,0.526935,9544.357954,554.635858,186.792437,15.660079,27774.566666,2.576368,22.382366,9.897297,644.071701,1.735348,0.411767,2.738539,223.782603,444.901175
min,0.0,0.0,32.004861,1.0,2.0,0.0,0.85,0.0,1.0,0.0,1.0,5.0,4.0,1.0,1.0,1.0,0.0,219120.986806
25%,0.0,12.9,410.047222,1.0,8304.5,88.0,41.9375,13.33,23406.5,2.0,10.0,43.0,354.75,1.0,1.0,1.0,61.8,219301.111806
50%,0.0,20.65,613.065278,1.0,16414.0,259.0,79.0,16.33,47167.0,4.0,28.0,52.0,600.0,1.0,1.0,2.0,104.995,219361.416667
75%,0.0,868.795833,922.593403,1.0,24884.0,644.0,139.0,21.19,71466.25,6.0,38.0,57.0,982.0,3.0,1.0,4.0,176.0,219428.191667
max,7.0,44486.616667,12501.105556,15.0,32950.0,3094.0,6735.0,375.28,96093.0,8.0,73.0,72.0,3976.0,20.0,29.0,29.0,13664.08,282363.208333


In [21]:
test_ids = test_data['booking_id']
test_data.drop(labels=['booking_id', 'customer_id'], axis=1, inplace=True)
# test_data.fillna(train_data_full.median(), inplace=True)
test_data.fillna(-1, inplace=True)
test_data.describe(include='all')

Unnamed: 0,booking_status_num,booking_approval_time,booking_checkin_time,sub_requests,hotel_id,seller_agent_id_num,price,agent_fees,unique_id_num,country_num,hotel_category,hotel_name_length,hotel_description_length,hotel_photos_qty,payments_made,payments_installments,payments_value,booking_expiry_time
count,49079.0,49079.0,49079.0,49079.0,49079.0,49079.0,49079.0,49079.0,49079.0,49079.0,49079.0,49079.0,49079.0,49079.0,49079.0,49079.0,49079.0,49079.0
mean,0.087043,623.39635,747.058592,1.139306,16521.040751,468.049573,126.347751,20.135018,47351.441757,4.001019,28.92398,48.889708,789.454186,2.219809,1.04344,2.989018,161.359245,219396.313264
std,0.558931,1792.40094,550.56796,0.538742,9541.81065,552.339162,191.030525,15.900428,27737.081265,2.580542,22.484222,9.888597,650.174212,1.732737,0.353456,2.746318,216.917182,345.266394
min,0.0,0.0,51.31875,1.0,0.0,0.0,0.85,0.0,0.0,0.0,1.0,5.0,4.0,1.0,1.0,0.0,0.0,219120.9625
25%,0.0,12.916667,413.456597,1.0,8290.0,89.0,42.0,13.37,23169.5,2.0,10.0,43.0,354.0,1.0,1.0,1.0,62.07,219301.116667
50%,0.0,20.533333,613.065278,1.0,16414.0,259.0,79.0,16.33,47091.0,4.0,28.0,52.0,600.0,1.0,1.0,2.0,105.38,219361.416667
75%,0.0,876.383333,918.825,1.0,24851.5,642.0,139.9,21.15,71316.5,6.0,38.0,57.0,982.0,3.0,1.0,4.0,177.27,219429.476389
max,7.0,270550.833333,11391.789583,21.0,32949.0,3092.0,6499.0,409.68,96095.0,8.0,73.0,76.0,3992.0,19.0,22.0,25.0,6726.66,282363.208333


## Trying different models, validation

In [22]:
from sklearn.model_selection import train_test_split
# train_data_full['booking_status_num'] = train_data_full['booking_status_num'].apply(func=int)
# train_data_full['hotel_id'] = train_data_full['hotel_id'].apply(func=int)
# train_data_full['seller_agent_id_num'] = train_data_full['seller_agent_id_num'].apply(func=int)
# train_data_full['unique_id_num'] = train_data_full['unique_id_num'].apply(func=int)
# train_data_full['country_num'] = train_data_full['country_num'].apply(func=int)
# train_data_full['hotel_category'] = train_data_full['hotel_category'].apply(func=int)

# train_data_full['booking_status_num'] = int(train_data_full['booking_status_num'])
# train_data_full['hotel_id'] = int(train_data_full['hotel_id'])
# train_data_full['seller_agent_id_num'] = int(train_data_full['seller_agent_id_num'])
# train_data_full['unique_id_num'] = int(train_data_full['unique_id_num'])
# train_data_full['country_num'] = int(train_data_full['country_num'])
# train_data_full['hotel_category'] = int(train_data_full['hotel_category'])

X_train, X_val, Y_train, Y_val = train_test_split(train_data_full, train_labels, test_size=0.2)

In [15]:
X_train.columns

Index(['booking_status_num', 'booking_approval_time', 'booking_checkin_time',
       'sub_requests', 'hotel_id', 'seller_agent_id_num', 'price',
       'agent_fees', 'unique_id_num', 'country_num', 'hotel_category',
       'hotel_name_length', 'hotel_description_length', 'hotel_photos_qty',
       'payments_made', 'payments_installments', 'payments_value',
       'booking_expiry_time'],
      dtype='object')

In [23]:
def trim_pred(preds):
    for i in range(preds.shape[0]):
        if preds[i] > 5:
            preds[i] = 5
        elif preds[i] < 1:
            preds[i] = 1
    return preds

In [24]:
from sklearn.linear_model import LinearRegression
lrrgr = LinearRegression()
lrrgr.fit(X_train, Y_train)
train_pred = trim_pred(lrrgr.predict(X_val))
print(np.mean(np.subtract(Y_val, train_pred)**2))


1.4552857296762456


In [25]:
from sklearn.linear_model import HuberRegressor
hrrgr = HuberRegressor()
hrrgr.fit(X_train, Y_train)
train_pred = trim_pred(hrrgr.predict(X_val))
print(np.mean(np.subtract(Y_val, train_pred)**2))


1.699210605198617


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [62]:
from sklearn.linear_model import LogisticRegression
lrclf = LogisticRegression()
lrclf.fit(X_train, Y_train)
train_pred = lrclf.predict(X_val)
print(np.mean(np.subtract(Y_val, train_pred)**2))

2.3915


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
from sklearn.svm import SVC
svclf = SVC()
svclf.fit(X_train, Y_train)
train_pred = svclf.predict(X_val)
print(np.mean(np.subtract(Y_val, train_pred)**2))

2.5264


In [16]:
from sklearn.svm import SVR
svrgr = SVR()
svrgr.fit(X_train, Y_train)
train_pred = trim_pred(svrgr.predict(X_val))
print(np.mean(np.subtract(Y_val, train_pred)**2))

2.326805822488311


In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
svrgr = SVR()
parameters = {
    'C' : [1, 10, 100, 1000],
    'epsilon': [0.1, 0.05, 0.01]
}
svrgr_cv = GridSearchCV(svrgr, parameters, refit=True)
svrgr_cv.fit(X_train, Y_train)
print(svrgr_cv.cv_results_)
print(svrgr_cv.best_params_)
train_pred = trim_pred(svrgr.predict(X_val))
print(np.mean(np.subtract(Y_val, train_pred)**2))

{'mean_fit_time': array([29.51872482, 29.16342959, 29.10666089, 29.08525972, 29.7821322 ,
       34.78960099, 30.80005827, 34.75263138, 37.06007504, 37.83238902,
       42.67817793, 44.98870625]), 'std_fit_time': array([0.28897672, 0.18118178, 0.24985328, 0.33344969, 0.37083466,
       0.25488461, 0.13154977, 0.40110112, 0.49408033, 0.91660543,
       0.4800088 , 1.36815849]), 'mean_score_time': array([7.59722457, 7.66976643, 7.58209453, 7.6695725 , 7.80282583,
       8.69114075, 7.64699235, 8.20143471, 8.9790576 , 7.57624631,
       8.50588799, 9.15509696]), 'std_score_time': array([0.0606567 , 0.07164986, 0.0405797 , 0.21394562, 0.22981605,
       0.09731472, 0.09997627, 0.03347039, 0.16123477, 0.1356467 ,
       0.06929247, 0.0734896 ]), 'param_C': masked_array(data=[1, 1, 1, 10, 10, 10, 100, 100, 100, 1000, 1000, 1000],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False],
       fill_value='?',
            dtype=

NotFittedError: This SVR instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [35]:
train_pred = trim_pred(svrgr_cv.predict(X_val))
print(np.mean(np.subtract(Y_val, train_pred)**2))

1.9011831161540296


In [29]:
from sklearn.tree import DecisionTreeClassifier
dtclf = DecisionTreeClassifier()
dtclf.fit(X_train, Y_train)
train_pred = dtclf.predict(X_val)
print(np.mean(np.subtract(Y_val, train_pred)**2))

2.756


In [26]:
from sklearn.tree import DecisionTreeRegressor
dtrgr = DecisionTreeRegressor()
dtrgr.fit(X_train, Y_train)
train_pred = trim_pred(dtrgr.predict(X_val))
print(np.mean(np.subtract(Y_val, train_pred)**2))

2.8037


In [28]:
from sklearn.ensemble import RandomForestRegressor
rfrgr = RandomForestRegressor(n_estimators=1000, max_features=0.5)
rfrgr.fit(X_train, Y_train)
train_pred = trim_pred(rfrgr.predict(X_val))
print(np.mean(np.subtract(Y_val, train_pred)**2))

1.390563600199648


In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
rfrgr = RandomForestRegressor()
parameters = {
    'n_estimators': [100, 200, 500, 1000],
    'max_features': [1, 0.5]
}
rfrgr_cv = GridSearchCV(rfrgr, parameters, refit=True)
rfrgr_cv.fit(X_train, Y_train)
print(rfrgr_cv.cv_results_)
train_pred = trim_pred(rfrgr_cv.predict(X_val))
print(np.mean(np.subtract(Y_val, train_pred)**2))

{'mean_fit_time': array([  2.46282182,   4.82474227,  12.02732015,  23.92772484,
        14.55907216,  29.01279259,  73.19618931, 144.69253893]), 'std_fit_time': array([0.03483817, 0.02964677, 0.13887645, 0.10087827, 0.32569052,
       0.17298032, 1.28276771, 3.34066407]), 'mean_score_time': array([0.14094181, 0.28704357, 0.71274824, 1.41548948, 0.1305234 ,
       0.25578818, 0.64181437, 1.27131915]), 'std_score_time': array([0.00067554, 0.00547857, 0.00499534, 0.00516494, 0.00190755,
       0.00166465, 0.00246348, 0.00998213]), 'param_max_features': masked_array(data=[1, 1, 1, 1, 0.5, 0.5, 0.5, 0.5],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[100, 200, 500, 1000, 100, 200, 500, 1000],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_features': 1, 'n_estimators': 100}, {'

In [28]:
rfrgr_cv.best_params_

{'max_features': 0.5, 'n_estimators': 1000}

In [29]:
from sklearn.ensemble import AdaBoostRegressor
abrgr = AdaBoostRegressor()
abrgr.fit(X_train, Y_train)
train_pred = trim_pred(abrgr.predict(X_val))
print(np.mean(np.subtract(Y_val, train_pred)**2))

1.4483172266155893


In [40]:
import xgboost as xgb
xgbclf = xgb.XGBClassifier()
Y_train -= 1
xgbclf.fit(X_train, Y_train)
train_pred = xgbclf.predict(X_val)
train_pred += 1
Y_train += 1
print(np.mean(np.subtract(Y_val, train_pred)**2))

2.0488


In [30]:
import xgboost as xgb
xgbrgr = xgb.XGBRegressor()
# Y_train -= 1
xgbrgr.fit(X_train, Y_train)
train_pred = trim_pred(xgbrgr.predict(X_val))
# train_pred += 1
print(np.mean(np.subtract(Y_val, train_pred)**2))



1.4190665818790424


In [42]:
import catboost as cb
cbclf = cb.CatBoostClassifier()
# cbclf.fit(X_train, Y_train, cat_features=[
    # 'booking_status_num', 'hotel_id', 'seller_agent_id_num', 'unique_id_num', 'country_num', 'hotel_category'], verbose=False)
cbclf.fit(X_train, Y_train, verbose=False)
train_pred = np.reshape(cbclf.predict(X_val), Y_val.shape)
print(np.mean(np.subtract(Y_val, train_pred)**2))

2.0634


In [31]:
import catboost as cb
cbrgr = cb.CatBoostRegressor()
# cbrgr.fit(X_train, Y_train, cat_features=[
    # 'booking_status_num', 'hotel_id', 'seller_agent_id_num', 'unique_id_num', 'country_num', 'hotel_category'], verbose=False)
cbrgr.fit(X_train, Y_train, verbose=False)
train_pred = trim_pred(np.reshape(cbrgr.predict(X_val), Y_val.shape))
print(np.mean(np.subtract(Y_val, train_pred)**2))


1.3734928317505422


In [18]:
import lightgbm as lgb
lgbclf = lgb.LGBMClassifier()
lgbclf.fit(X_train, Y_train)
train_pred = np.reshape(lgbclf.predict(X_val), Y_val.shape)
print(np.mean(np.subtract(Y_val, train_pred)**2))

2.024


In [32]:
import lightgbm as lgb
lgbrgr = lgb.LGBMRegressor()
lgbrgr.fit(X_train, Y_train)
train_pred = np.reshape(lgbrgr.predict(X_val), Y_val.shape)
for pred in train_pred:
    if pred > 5:
        pred = 5
    elif pred < 1:
        pred = 1
print(np.mean(np.subtract(Y_val, train_pred)**2))

1.369119088924107


## Running best model on test data

In [29]:
from sklearn.ensemble import RandomForestRegressor
rfrgr = RandomForestRegressor(n_estimators=1000, max_features=0.5)
rfrgr.fit(train_data_full, train_labels)
test_pred = trim_pred(np.reshape(rfrgr.predict(test_data), test_ids.shape))
test_pred = pd.DataFrame(data=test_pred, columns=['rating_score'])
test_sub = pd.concat([test_ids, test_pred], axis=1)
test_sub.describe(include='all')

Unnamed: 0,booking_id,rating_score
count,49079,49079.0
unique,49079,
top,796b98fdf73dbeba33a548910a1c6147,
freq,1,
mean,,4.031517
std,,0.683355
min,,1.043
25%,,3.918
50%,,4.2568
75%,,4.44


In [30]:
test_sub.to_csv("../output/RegressionRF.csv", index=False)

In [23]:
test_sub['rating_score'].value_counts()

1.535251    12
2.078501    10
1.556321     8
1.582636     8
1.555425     6
            ..
3.653213     1
3.714539     1
4.230107     1
4.460738     1
4.396121     1
Name: rating_score, Length: 48697, dtype: int64