In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Base names (without the ".csv" extension) of the input tables; each one is passed
# to load_from_csv in the loading cell.
BD = "bookings_data"
B = "bookings"
CD = "customer_data"
HD = "hotels_data"
PD = "payments_data"
SS = "sample_submission_5"
TD = "train_data"

# Directory prefix for the CSV files, relative to the working directory.
# The trailing slash matters: load_from_csv builds paths by plain string concatenation.
base_path = r"../data/"

def load_from_csv(filename):
    """Read one of the project's CSV tables into a DataFrame.

    Parameters
    ----------
    filename : str
        Table name without the ``.csv`` extension; it is appended to the
        module-level ``base_path`` prefix.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` returns for that path with default options.
    """
    csv_path = "".join((base_path, filename, ".csv"))
    return pd.read_csv(csv_path)

In [3]:
# Load every raw table from disk, one DataFrame per CSV.
bookings_data = load_from_csv(BD)
bookings = load_from_csv(B)
customer_data = load_from_csv(CD)
hotels_data = load_from_csv(HD)
payments_data = load_from_csv(PD)
sample_submission = load_from_csv(SS)
train_data = load_from_csv(TD)

# read_csv leaves the date columns as plain values; convert them with pd.to_datetime so the
# later cells can subtract them from one another.
for date_column in ('booking_create_timestamp', 'booking_approved_at', 'booking_checkin_customer_date'):
    bookings[date_column] = pd.to_datetime(bookings[date_column])
bookings_data['booking_expiry_date'] = pd.to_datetime(bookings_data['booking_expiry_date'])

In [4]:
def get_hash(val_list):
    """Map each value of an iterable to its position.

    Parameters
    ----------
    val_list : iterable of hashable
        Values to index.

    Returns
    -------
    dict
        ``{value: index}``. If a value occurs more than once, the index of
        its last occurrence is kept.
    """
    return {value: position for position, value in enumerate(val_list)}

In [5]:
# Collapse payments_data to one row per booking_id:
#   payments_made         - number of payment rows recorded for the booking
#   payments_installments - sum of payment_installments over those rows
#   payments_value        - sum of payment_value over those rows
#
# pd.factorize gives every booking a dense integer code in order of first appearance, so the
# output rows come out in the same order as payments_data['booking_id'].unique().
# np.bincount then does the per-code counting/summing in one vectorized pass instead of a
# Python-level loop over every payment row.
booking_codes, booking_ids = pd.factorize(payments_data['booking_id'])
n_bookings = len(booking_ids)

payments_data_modified = pd.DataFrame({
    'booking_id': np.asarray(booking_ids),
    # Cast the integer counts to float so the column dtype does not depend on whether a later
    # left-merge introduces NaNs (numeric features are selected by float64 dtype further down).
    'payments_made': np.bincount(booking_codes, minlength=n_bookings).astype(float),
    # bincount adds the weights with ordinary float addition, so a NaN in any row of a
    # booking makes that booking's total NaN.
    'payments_installments': np.bincount(
        booking_codes,
        weights=payments_data['payment_installments'].to_numpy(dtype=float),
        minlength=n_bookings,
    ),
    'payments_value': np.bincount(
        booking_codes,
        weights=payments_data['payment_value'].to_numpy(dtype=float),
        minlength=n_bookings,
    ),
})

In [6]:
# Derive per-booking timing features from the raw `bookings` table.
# Of the date columns only the creation timestamp is carried forward; the approval and
# check-in dates are reduced to elapsed-time features relative to creation.
unique_booking_status = bookings['booking_status'].unique()  # exploratory; not used by the cells below

SECONDS_PER_MINUTE = 60
SECONDS_PER_DAY = 24 * 60 * 60

created_at = bookings['booking_create_timestamp']
bookings_modified = pd.DataFrame({
    'booking_id': bookings['booking_id'],
    'customer_id': bookings['customer_id'],
    'booking_status': bookings['booking_status'],
    'booking_create_timestamp': created_at,
    # Minutes from creation to approval; NaN when either timestamp is missing.
    'booking_approval_time':
        (bookings['booking_approved_at'] - created_at).dt.total_seconds() / SECONDS_PER_MINUTE,
    # Days from creation to customer check-in; NaN when either timestamp is missing.
    # Seconds must be divided by 86400 to get days: 1440 is the number of *minutes* in a day,
    # and dividing seconds by it yields values 60x too large.
    'booking_checkin_time':
        (bookings['booking_checkin_customer_date'] - created_at).dt.total_seconds() / SECONDS_PER_DAY,
})

In [7]:
# Collapse bookings_data (possibly several rows per booking) to one row per booking_id:
#   sub_requests - number of rows the booking has in bookings_data
#   hotel_id, seller_agent_id, booking_expiry_date, price, agent_fees
#                - values taken from the booking's last row in file order
unique_agents = bookings_data['seller_agent_id'].unique()  # exploratory; not used by the cells below

detail_columns = ['hotel_id', 'seller_agent_id', 'booking_expiry_date', 'price', 'agent_fees']

# sort=False keeps the groups in order of first appearance. The count is cast to float so the
# column dtype does not depend on whether a later left-merge introduces NaNs.
sub_requests = (
    bookings_data.groupby('booking_id', sort=False)
    .size()
    .astype(float)
    .rename('sub_requests')
)

# keep='last' selects each booking's final row as-is, including any missing values in it
# (groupby(...).last() would instead skip NaNs and reach back to an earlier row).
last_rows = (
    bookings_data
    .drop_duplicates(subset='booking_id', keep='last')
    .set_index('booking_id')[detail_columns]
)

# Index-aligned join keeps the first-appearance row order of `sub_requests`.
bookings_data_modified = sub_requests.to_frame().join(last_rows).reset_index()

In [71]:
# Assemble the training feature table: start from the labelled rows in train_data and
# left-join the per-booking, per-customer, per-hotel and per-payment tables onto it.
train_data_full = (
    train_data
    .merge(bookings_modified, how='left', on='booking_id')
    .merge(bookings_data_modified, how='left', on='booking_id')
    .merge(customer_data, how='left', on='customer_id')
    .merge(hotels_data, how='left', on='hotel_id')
    .merge(payments_data_modified, how='left', on='booking_id')
)

# Sanity output: total rows, then number of distinct booking ids. A smaller second number
# means some booking_id values occur on more than one row.
print(train_data_full.shape[0])
print(train_data_full['booking_id'].unique().shape[0])

# Days between booking creation and booking expiry, computed directly per row; NaN when
# either date is missing. Seconds are divided by 86400 (seconds per day); 1440 is minutes
# per day and would give values 60x too large.
# NOTE(review): this assumes rows that share a booking_id carry the same two dates, which
# holds when the joined per-booking tables have one row per booking_id.
train_data_full['booking_expiry_time'] = (
    train_data_full['booking_expiry_date'] - train_data_full['booking_create_timestamp']
).dt.total_seconds() / 86400

# The raw timestamps are no longer needed once the elapsed-time feature exists.
train_data_full = train_data_full.drop(columns=['booking_create_timestamp', 'booking_expiry_date'])

50000
49868


In [72]:
# Assemble the test feature table with the same joins as the training table, starting from
# the booking ids listed in the sample submission.
test_data = (
    sample_submission[['booking_id']]
    .merge(bookings_modified, how='left', on='booking_id')
    .merge(bookings_data_modified, how='left', on='booking_id')
    .merge(customer_data, how='left', on='customer_id')
    .merge(hotels_data, how='left', on='hotel_id')
    .merge(payments_data_modified, how='left', on='booking_id')
)

# Sanity output: total rows, then number of distinct booking ids.
print(test_data.shape[0])
print(test_data['booking_id'].unique().shape[0])

# Days between booking creation and booking expiry, computed directly per row; NaN when
# either date is missing. Seconds are divided by 86400 (seconds per day); 1440 is minutes
# per day and would give values 60x too large.
# NOTE(review): this assumes rows that share a booking_id carry the same two dates, which
# holds when the joined per-booking tables have one row per booking_id.
test_data['booking_expiry_time'] = (
    test_data['booking_expiry_date'] - test_data['booking_create_timestamp']
).dt.total_seconds() / 86400

# The raw timestamps are no longer needed once the elapsed-time feature exists.
test_data = test_data.drop(columns=['booking_create_timestamp', 'booking_expiry_date'])

49079
49079


In [73]:
from sklearn.model_selection import train_test_split

# Identifier columns removed from both feature tables before modelling.
id_columns = ['booking_id', 'customer_id', 'customer_unique_id', 'hotel_id', 'seller_agent_id']

# Separate the target, then strip the target and identifier columns from the features.
# train_data_full is rebound to the stripped frame because later cells fit on it directly;
# as a consequence this cell cannot be re-run without first rebuilding train_data_full.
train_labels = train_data_full['rating_score']
train_data_full = train_data_full.drop(columns=['rating_score'] + id_columns)

# 80/20 train/validation split with a fixed seed.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_data_full, train_labels, test_size=0.2, random_state=14
)

# Keep the test booking ids for the submission file before dropping them from the features.
test_ids = test_data['booking_id']
test_data = test_data.drop(columns=id_columns)

In [11]:
# Show the dtype of every training feature column.
# NOTE(review): the recorded output below lists seller_agent_id, which the split cell now
# drops; the output appears to come from an earlier run and should be regenerated.
X_train.dtypes

booking_status               object
booking_approval_time       float64
booking_checkin_time        float64
sub_requests                float64
seller_agent_id              object
price                       float64
agent_fees                  float64
country                      object
hotel_category              float64
hotel_name_length           float64
hotel_description_length    float64
hotel_photos_qty            float64
payments_made               float64
payments_installments       float64
payments_value              float64
booking_expiry_time         float64
dtype: object

In [96]:
# Column groups consumed by the preprocessing transformers.
# The commented-out lines are earlier groupings kept for reference.
# onehotcols = ['booking_status', 'country']
# ordinalcols = ['hotel_id', 'seller_agent_id', 'customer_unique_id']
# cardinalcols = ['booking_status', 'country', 'hotel_id', 'seller_agent_id', 'customer_unique_id']
# cardinalcols = ['booking_status', 'country', 'seller_agent_id']
# NOTE(review): seller_agent_id is dropped from the feature tables in the split cell, and
# cardinalcols is not passed to the ColumnTransformer defined below.
cardinalcols = ['seller_agent_id']
# Categorical columns that get encoded (currently through the ordinal-encoding pipeline).
targetcols = ['country', 'booking_status']
# Every float64 column of X_train is treated as a numeric feature.
numcols = X_train.columns[X_train.dtypes == 'float64'].tolist()

In [66]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, Normalizer, RobustScaler
from category_encoders import MEstimateEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

In [101]:
# Numeric features: fill missing values with the sentinel -1, then scale to unit variance.
# with_mean=False means the values are scaled but not centred.
num_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value=-1)),
    ('scale', StandardScaler(with_mean=False))
])

# One-hot alternative for the categorical columns, kept for reference.
# oh_transformer = Pipeline(steps=[
#     ('impute', SimpleImputer(strategy='constant', fill_value='Missing')),
#     ('ohenc', OneHotEncoder(handle_unknown='infrequent_if_exist')),
#     ('scale', StandardScaler(with_mean=False))
# ])

# Categorical features: missing values become the literal category 'Missing', categories are
# mapped to integer codes (categories unseen during fit are encoded as -1), then scaled.
ord_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('ordenc', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ('scale', StandardScaler(with_mean=False))
])

# Target-encoding alternative for the categorical columns (not wired into `preprocess` below).
# handle_unknown must be one of the encoder's string options; 'value' encodes categories
# unseen during fit with the global target mean. The previous value -1 is not a documented
# option for this encoder.
target_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('targetenc', MEstimateEncoder(handle_unknown='value')),
    ('scale', StandardScaler(with_mean=False))
])

# Full preprocessing step: ordinal-encode the categorical columns and scale the numeric ones.
# Columns not listed in either group are dropped (ColumnTransformer's default remainder).
preprocess = ColumnTransformer(
    transformers=[
        ('ord', ord_transformer, targetcols),
        # ('tar', target_transformer, targetcols),
        ('num', num_transformer, numcols)
    ]
)

In [125]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, BaggingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor

# Candidate regressors tried so far; exactly one assignment to `model` is active.
# model = RandomForestRegressor(random_state=14)
# model = GradientBoostingRegressor(n_estimators=1000)
# model = HistGradientBoostingRegressor(max_iter=1000, max_depth=4)
# model = MLPRegressor(hidden_layer_sizes=(100, 100), learning_rate='constant', activation='logistic', learning_rate_init=0.001, max_iter=1000, early_stopping=True)
# model = KNeighborsRegressor(n_neighbors=100, weights='distance')

# Bagging ensemble of 500 shallow random forests, each fitted on bootstrapped samples and
# bootstrapped feature subsets, with out-of-bag scoring enabled.
# The inner estimator is passed positionally: the keyword for it was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the first positional
# slot means the same thing in both.
model = BaggingRegressor(
    RandomForestRegressor(max_features=0.5, random_state=14, max_samples=0.2, max_depth=10),
    n_estimators=500,
    random_state=14,
    bootstrap_features=True,
    oob_score=True,
)


In [18]:
def trim_pred(preds, lower=1, upper=5):
    """Clamp predictions to the closed interval ``[lower, upper]``.

    Parameters
    ----------
    preds : array-like
        Predicted values.
    lower, upper : scalar, optional
        Bounds of the valid range. The defaults (1 and 5) match the bounds
        this function has always enforced.

    Returns
    -------
    Clipped copy of ``preds`` (``numpy.ndarray`` for array or list input).
    Values already inside the range, and NaNs, are returned unchanged.
    The input itself is not modified.
    """
    # np.clip is vectorized and returns a new array, so the caller's array is left intact.
    return np.clip(preds, lower, upper)

In [126]:
# Optional PCA step (disabled); it would keep the components explaining 95% of the variance.
# eliminate = PCA(n_components=0.95)

# End-to-end estimator: preprocessing followed by whatever `model` is bound to when this cell
# runs. The pipeline keeps a reference to that object, so rebinding `model` in a later cell
# does not change what this pipeline fits.
model_pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    # ('eliminate', eliminate),
    ('model', model)
])

In [103]:
# For GridSearchCV
#
# The active grid below tunes KNeighborsRegressor hyper-parameters, so the searched pipeline
# is built here with its own KNeighborsRegressor. Searching `model_pipeline` instead only
# works while its 'model' step happens to be a KNN regressor; with any other estimator the
# `model__n_neighbors` / `model__weights` keys are rejected as invalid parameters.

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

# Result of an earlier search (grid kept for reference):
# For HistGradientBoostingRegressor, best params: max_depth = 4, max_iter = 200, learning_rate: 0.05, max_leaf_nodes: None,
# l2_regularization: 0, max_bins: 100, random_state = 14

# parameters = {
#     'model__max_iter': [100, 200, 500, 1000],
#     'model__max_depth': [5, 4, 3, 2, 1],
#     'model__learning_rate': [0.1, 0.2, 0.05],
#     'model__max_leaf_nodes': [None, 31],
#     'model__l2_regularization': [0, 0.1],
#     'model__max_bins': [255, 200, 100],
#     'model__random_state': [14]
# }

# Result of an earlier search (grid kept for reference):
# For MLPRegressor, best params: activation = 'logistic', early_stopping = True, hidden_layer_sizes = (100, 100), learning_rate = 'constant',
# learning_rate_init = 0.001, max_iter = 1000, n_iter_no_change = 10

# parameters = {
#     'model__hidden_layer_sizes': [(100, 100, 100), (100, 100, 50), (100, 50, 50), (100, 100)],
#     'model__activation': ['logistic', 'relu'],
#     'model__learning_rate': ['constant', 'adaptive'],
#     'model__learning_rate_init': [0.001, 0.005, 0.01, 0.05, 0.1],
#     'model__max_iter': [1000],
#     'model__early_stopping': [True],
#     'model__n_iter_no_change': [10]
# }

# For KNeighborsRegressor, best params: n_neighbors = 100, weights = 'distance'

parameters = {
    'model__n_neighbors': [5, 10, 20, 100],
    'model__weights': ['uniform', 'distance']
}

knn_pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', KNeighborsRegressor())
])

# refit=True retrains the best configuration on all of X_train, so pipeline_cv can be used
# directly for prediction afterwards.
pipeline_cv = GridSearchCV(knn_pipeline, parameters, refit=True)
pipeline_cv.fit(X_train, Y_train)
print(pipeline_cv.best_params_)

{'model__n_neighbors': 100, 'model__weights': 'distance'}


In [89]:
from sklearn.ensemble import VotingRegressor

# Averaging ensemble over a grid of HistGradientBoostingRegressor settings:
# one member per (max_iter, max_depth) pair, max_iter varying slowest.
max_iters = [100, 200, 500, 1000, 5000]
max_depths = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
models = [
    HistGradientBoostingRegressor(max_iter=max_iter, max_depth=max_depth, random_state=14, max_bins=200)
    for max_iter in max_iters
    for max_depth in max_depths
]
# Names are generated from the actual number of members, so editing either grid above cannot
# silently drop estimators (zip stops at the shorter of its inputs).
model_names = [f"m{i}" for i in range(len(models))]
model = VotingRegressor(estimators=list(zip(model_names, models)))

# Fit preprocessing + ensemble on the training split and predict both splits for evaluation.
pipeline = Pipeline(steps=[('preprocess', preprocess), ('model', model)])
pipeline.fit(X_train, Y_train)
preds_val = pipeline.predict(X_val)
preds_train = pipeline.predict(X_train)

In [19]:
from sklearn.ensemble import VotingRegressor

# Averaging ensemble over a grid of HistGradientBoostingRegressor settings:
# one member per (max_iter, max_depth) pair, max_iter varying slowest.
# This cell only builds the (unfitted) ensemble and binds it to `model`.
max_iters = [100, 200, 500, 1000, 5000]
max_depths = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
models = [
    HistGradientBoostingRegressor(max_iter=max_iter, max_depth=max_depth, random_state=14, max_bins=200)
    for max_iter in max_iters
    for max_depth in max_depths
]
# Names are generated from the actual number of members, so editing either grid above cannot
# silently drop estimators (zip stops at the shorter of its inputs).
model_names = [f"m{i}" for i in range(len(models))]
model = VotingRegressor(estimators=list(zip(model_names, models)))

In [26]:
from sklearn.neighbors import LocalOutlierFactor
# Fit the preprocessor on the training split and transform it in one step.
X_train_processed = preprocess.fit_transform(X_train)
# Local Outlier Factor with 20 neighbours; fit_predict labels each row 1 (inlier) or -1 (outlier).
lof = LocalOutlierFactor(n_neighbors=20)
is_inlier = lof.fit_predict(X_train_processed)
# Keep only the inlier rows; the same boolean mask is applied to features and labels.
X_train_inliers = X_train_processed[is_inlier == 1]
Y_train_inliers = Y_train[is_inlier == 1]
# Fit whichever estimator `model` is currently bound to on the filtered, preprocessed data
# (no Pipeline here: preprocessing was applied manually above).
model.fit(X_train_inliers, Y_train_inliers)

In [27]:
# Transform the validation split with the already-fitted preprocessor (transform only, no refit).
X_val_processed = preprocess.transform(X_val)
# Training predictions cover every row of X_train_processed, including rows that were
# filtered out as outliers before the model was fitted.
preds_train = model.predict(X_train_processed)
preds_val = model.predict(X_val_processed)

In [90]:
# Clamp both prediction sets with trim_pred, then display (train MSE, validation MSE).
# preds_train / preds_val hold whatever the most recently executed prediction cell produced;
# both the VotingRegressor pipeline cell and the manual outlier-filtered cell assign them.
preds_train = trim_pred(preds_train)
preds_val = trim_pred(preds_val)
mean_squared_error(Y_train, preds_train), mean_squared_error(Y_val, preds_val)

(1.2846113562306756, 1.3681142369705603)

In [29]:
# Final fit on all labelled rows (train + validation) using the manual outlier-filtered path.
# Relies on the split cell having already removed rating_score and the id columns from
# train_data_full.
# Refit the preprocessor on the full feature table.
X_total_processed = preprocess.fit_transform(train_data_full)
# Re-run LOF on the full processed matrix; 1 marks inliers, -1 outliers.
is_inlier = lof.fit_predict(X_total_processed)
X_total_inliers = X_total_processed[is_inlier == 1]
Y_total_inliers = train_labels[is_inlier == 1]
# Refit `model` on the inlier rows only.
model.fit(X_total_inliers, Y_total_inliers)

In [30]:
# Transform the test features with the fitted preprocessor, predict, and clamp the
# predictions with trim_pred.
X_test_processed = preprocess.transform(test_data)
sub_preds = trim_pred(model.predict(X_test_processed))

In [93]:
# Alternative final fit: refit the VotingRegressor pipeline (preprocessing + ensemble) on all
# labelled rows, without any outlier filtering.
pipeline.fit(train_data_full, train_labels)

In [94]:
# Predict the test set with the fitted pipeline and clamp the result with trim_pred.
# NOTE(review): this rebinds sub_preds, which the manual outlier-filtered cell also assigns;
# the submission built below uses whichever of the two cells ran last.
sub_preds = pipeline.predict(test_data)
sub_preds = trim_pred(sub_preds)

In [31]:
# Build the submission table: booking_id next to its predicted rating_score.
test_preds = pd.DataFrame(data=sub_preds, columns=['rating_score'])
# concat with axis=1 aligns the two objects on their index.
# assumes test_ids carries a default 0..n-1 index like test_preds - TODO confirm
test_sub = pd.concat([test_ids, test_preds], axis=1)
# Summary statistics of both columns as a quick sanity check before writing the file.
test_sub.describe(include='all')

Unnamed: 0,booking_id,rating_score
count,49079,49079.0
unique,49079,
top,796b98fdf73dbeba33a548910a1c6147,
freq,1,
mean,,4.090139
std,,0.655751
min,,1.206388
25%,,4.086961
50%,,4.348495
75%,,4.435792


In [32]:
# Write the submission without the index column.
# NOTE(review): the file name says "inliers"; confirm that sub_preds came from the
# outlier-filtered path and not from the plain pipeline cell before submitting.
test_sub.to_csv("../output/HGB_vote_pipeline_fulltrain_nohotelid_inliers.csv", index=False)

In [104]:
# Prediction after GridSearchCV

preds = trim_pred(pipeline_cv.predict(X_val))
score = mean_squared_error(Y_val, preds)
score

1.448744166405618

In [127]:
# Prediction without GridSearchCV

model_pipeline.fit(X_train, Y_train)
val_preds = trim_pred(model_pipeline.predict(X_val))
train_preds = trim_pred(model_pipeline.predict(X_train))
mean_squared_error(Y_train, train_preds), mean_squared_error(Y_val, val_preds)

(1.3340715478044816, 1.4138475451645518)

In [None]:
# Build a submission table from model_pipeline's test predictions (clamped with trim_pred).
# model_pipeline was fitted on X_train only in the cell above, not on the full labelled data.
# This rebinds test_preds and test_sub, replacing the values set by the earlier submission cell.
test_preds = trim_pred(model_pipeline.predict(test_data))
test_preds = pd.DataFrame(data=test_preds, columns=['rating_score'])
# concat with axis=1 aligns on the index.
# assumes test_ids carries a default 0..n-1 index like test_preds - TODO confirm
test_sub = pd.concat([test_ids, test_preds], axis=1)
test_sub.describe(include='all')

In [None]:
# test_sub.to_csv("../output/RegressionHGB_Scale.csv", index=False)