In [1]:
pip install xgboost lightgbm catboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting catboost
  Downloading catboost-1.1.1-cp310-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.1.1 graphviz-0.20.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.submission import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, RFECV
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/PS-S3/Ep7/train.csv'
file_key_2 = 'Tabular-Playground-Series/PS-S3/Ep7/test.csv'
file_key_3 = 'Tabular-Playground-Series/PS-S3/Ep7/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

## Fixing dates (https://www.kaggle.com/competitions/playground-series-s3e7/discussion/386655)
train['arrival_year_month'] = pd.to_datetime(train['arrival_year'].astype(str) + train['arrival_month'].astype(str), format = '%Y%m')
test['arrival_year_month'] = pd.to_datetime(test['arrival_year'].astype(str) + test['arrival_month'].astype(str), format = '%Y%m')

train.loc[train.arrival_date > train.arrival_year_month.dt.days_in_month, 'arrival_date'] = train.arrival_year_month.dt.days_in_month
test.loc[test.arrival_date > test.arrival_year_month.dt.days_in_month, 'arrival_date'] = test.arrival_year_month.dt.days_in_month

train.drop(columns = 'arrival_year_month', inplace = True)
test.drop(columns = 'arrival_year_month', inplace = True)

train['low_price_flag'] = np.where(train['avg_price_per_room'] < 30, 1, 0)
train['segment_0'] = np.where(train['market_segment_type'] == 0, 1, 0)
train['segment_1'] = np.where(train['market_segment_type'] == 1, 1, 0)
train['total_guests'] = train['no_of_adults'] + train['no_of_children']
train['stay_length'] = train['no_of_weekend_nights'] + train['no_of_week_nights']
train['stay_during_weekend'] = np.where(train['no_of_weekend_nights'] > 0, 1, 0)
train['quarter_1'] = np.where(train['arrival_month'] <= 3, 1, 0)
train['quarter_2'] = np.where(((train['arrival_month'] >= 4) & (train['arrival_month'] <= 6)), 1, 0)
train['quarter_3'] = np.where(((train['arrival_month'] >= 7) & (train['arrival_month'] <= 9)), 1, 0)
train['quarter_4'] = np.where(train['arrival_month'] >= 10, 1, 0)
train['segment_0_feature_1'] = np.where(((train['market_segment_type'] == 0) & (train['lead_time'] <= 90)), 1, 0)
train['segment_0_feature_2'] = np.where(((train['market_segment_type'] == 0) & (train['avg_price_per_room'] > 98)), 1, 0)
train['segment_1_feature_1'] = np.where(((train['market_segment_type'] == 1) & (train['no_of_special_requests'] == 0)), 1, 0)
train['segment_1_feature_2'] = np.where(((train['market_segment_type'] == 1) & (train['no_of_special_requests'] > 0) & (train['lead_time'] <= 150)), 1, 0)
train['segment_0_year_flag'] = np.where(((train['market_segment_type'] == 0) & (train['arrival_year'] == 2018)), 1, 0)
train['segment_1_year_flag'] = np.where(((train['market_segment_type'] == 1) & (train['arrival_year'] == 2018)), 1, 0)
train['price_lead_time_flag'] = np.where(((train['avg_price_per_room'] > 100) & (train['lead_time'] > 150)), 1, 0)

test['low_price_flag'] = np.where(test['avg_price_per_room'] < 30, 1, 0)
test['segment_0'] = np.where(test['market_segment_type'] == 0, 1, 0)
test['segment_1'] = np.where(test['market_segment_type'] == 1, 1, 0)
test['total_guests'] = test['no_of_adults'] + test['no_of_children']
test['stay_length'] = test['no_of_weekend_nights'] + test['no_of_week_nights']
test['stay_during_weekend'] = np.where(test['no_of_weekend_nights'] > 0, 1, 0)
test['quarter_1'] = np.where(test['arrival_month'] <= 3, 1, 0)
test['quarter_2'] = np.where(((test['arrival_month'] >= 4) & (test['arrival_month'] <= 6)), 1, 0)
test['quarter_3'] = np.where(((test['arrival_month'] >= 7) & (test['arrival_month'] <= 9)), 1, 0)
test['quarter_4'] = np.where(test['arrival_month'] >= 10, 1, 0)
test['segment_0_feature_1'] = np.where(((test['market_segment_type'] == 0) & (test['lead_time'] <= 90)), 1, 0)
test['segment_0_feature_2'] = np.where(((test['market_segment_type'] == 0) & (test['avg_price_per_room'] > 98)), 1, 0)
test['segment_1_feature_1'] = np.where(((test['market_segment_type'] == 1) & (test['no_of_special_requests'] == 0)), 1, 0)
test['segment_1_feature_2'] = np.where(((test['market_segment_type'] == 1) & (test['no_of_special_requests'] > 0) & (test['lead_time'] <= 150)), 1, 0)
test['segment_0_year_flag'] = np.where(((test['market_segment_type'] == 0) & (test['arrival_year'] == 2018)), 1, 0)
test['segment_1_year_flag'] = np.where(((test['market_segment_type'] == 1) & (test['arrival_year'] == 2018)), 1, 0)
test['price_lead_time_flag'] = np.where(((test['avg_price_per_room'] > 100) & (test['lead_time'] > 150)), 1, 0)


##########################
## Splitting Duplicates ##
##########################

train_dup = train.copy()
test_dup = test.copy()

duplicates = pd.merge(train, test, on = train_dup.columns.tolist()[1:18])
train_dup_ids = duplicates['id_x'].tolist()
test_dup_ids = duplicates['id_y'].tolist()

train_clean = train[~np.isin(train['id'], train_dup_ids)].reset_index(drop = True)
train_dup = train[np.isin(train['id'], train_dup_ids)].reset_index(drop = True)

test_clean = test[~np.isin(test['id'], test_dup_ids)].reset_index(drop = True)
test_dup = test[np.isin(test['id'], test_dup_ids)].reset_index(drop = True)

# XGBoost

In [17]:
X_xgb = train_clean[['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved', 'lead_time', 
                     'arrival_year', 'arrival_month', 'market_segment_type', 'repeated_guest', 'avg_price_per_room', 
                     'no_of_special_requests', 'segment_1', 'total_guests', 'stay_during_weekend', 'quarter_2', 
                     'quarter_3', 'segment_0_feature_1', 'segment_0_feature_2', 'segment_1_feature_1', 
                     'segment_1_feature_2', 'segment_1_year_flag', 'price_lead_time_flag']]
Y = train_clean['booking_status']

test_xgb = test_clean[['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved', 'lead_time', 
                     'arrival_year', 'arrival_month', 'market_segment_type', 'repeated_guest', 'avg_price_per_room', 
                     'no_of_special_requests', 'segment_1', 'total_guests', 'stay_during_weekend', 'quarter_2', 
                     'quarter_3', 'segment_0_feature_1', 'segment_0_feature_2', 'segment_1_feature_1', 
                     'segment_1_feature_2', 'segment_1_year_flag', 'price_lead_time_flag']]

XGB_md = XGBClassifier(tree_method = 'hist', 
                       max_depth = 5,
                       learning_rate = 0.012569604344366379,
                       n_estimators = 5050,
                       gamma = 1.1534413128130536,
                       min_child_weight = 10,
                       colsample_bytree = 0.4542341159557226,
                       subsample = 0.8576987566989286).fit(X_xgb, Y)

xgb_pred_test = XGB_md.predict_proba(test_xgb)[:, 1]

clean_pred = pd.DataFrame({'id': test_clean['id']})
clean_pred['booking_status_clean'] = xgb_pred_test

dup_pred = duplicates[['id_y', 'booking_status']]
dup_pred.columns = ['id', 'booking_status_dup']
dup_pred['booking_status_dup'] = 1 - dup_pred['booking_status_dup']

submission = pd.merge(submission.drop(columns = 'booking_status', axis = 1), clean_pred, on = 'id', how = 'left')
submission = pd.merge(submission, dup_pred, on = 'id', how = 'left')
submission['booking_status'] = np.where(np.isnan(submission['booking_status_clean']), submission['booking_status_dup'], submission['booking_status_clean'])
submission.drop(columns = ['booking_status_clean', 'booking_status_dup'], axis = 1, inplace = True)

xgb_sub = submission.copy()
xgb_sub.columns = ['id', 'xgb']
xgb_sub.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup_pred['booking_status_dup'] = 1 - dup_pred['booking_status_dup']


Unnamed: 0,id,xgb
0,42100,0.111617
1,42101,0.100939
2,42102,0.36402
3,42103,0.030002
4,42104,0.492941


# LightGBM

In [19]:
X_lgb = train_clean[['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved',
                     'lead_time', 'arrival_year', 'arrival_month', 'arrival_date', 'market_segment_type',
                     'repeated_guest', 'no_of_previous_bookings_not_canceled', 'avg_price_per_room', 
                     'no_of_special_requests', 'segment_1', 'total_guests', 'stay_length', 'stay_during_weekend',
                     'quarter_2', 'quarter_3', 'segment_1_feature_1', 'price_lead_time_flag']]
Y = train_clean['booking_status']

test_lgb = test_clean[['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved',
                     'lead_time', 'arrival_year', 'arrival_month', 'arrival_date', 'market_segment_type',
                     'repeated_guest', 'no_of_previous_bookings_not_canceled', 'avg_price_per_room', 
                     'no_of_special_requests', 'segment_1', 'total_guests', 'stay_length', 'stay_during_weekend',
                     'quarter_2', 'quarter_3', 'segment_1_feature_1', 'price_lead_time_flag']]

lgb_md = LGBMClassifier(boosting_type = 'gbdt', 
                        n_estimators = 8743,
                        learning_rate = 0.005503197021026835,
                        max_depth = 11,
                        lambda_l1 = 1.815327684394496,
                        lambda_l2 = 0.2012642220501286,
                        num_leaves = 27,
                        bagging_fraction = 0.21774814214525,
                        feature_fraction = 0.4307774047555758).fit(X_lgb, Y)

lgb_pred_test = lgb_md.predict_proba(test_lgb)[:, 1]

clean_pred = pd.DataFrame({'id': test_clean['id']})
clean_pred['booking_status_clean'] = lgb_pred_test

dup_pred = duplicates[['id_y', 'booking_status']]
dup_pred.columns = ['id', 'booking_status_dup']
dup_pred['booking_status_dup'] = 1 - dup_pred['booking_status_dup']

submission = pd.merge(submission.drop(columns = 'booking_status', axis = 1), clean_pred, on = 'id', how = 'left')
submission = pd.merge(submission, dup_pred, on = 'id', how = 'left')
submission['booking_status'] = np.where(np.isnan(submission['booking_status_clean']), submission['booking_status_dup'], submission['booking_status_clean'])
submission.drop(columns = ['booking_status_clean', 'booking_status_dup'], axis = 1, inplace = True)

lgb_sub = submission.copy()
lgb_sub.columns = ['id', 'lgb']
lgb_sub.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup_pred['booking_status_dup'] = 1 - dup_pred['booking_status_dup']


Unnamed: 0,id,lgb
0,42100,0.063296
1,42101,0.083414
2,42102,0.326009
3,42103,0.030986
4,42104,0.576908


# CatBoost

In [21]:
X_cat = train_clean[['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved',
                     'lead_time', 'arrival_year', 'arrival_month', 'arrival_date', 'market_segment_type',
                     'repeated_guest', 'avg_price_per_room', 
                     'no_of_special_requests', 'total_guests', 'stay_length', 'stay_during_weekend',
                     'segment_0_feature_1', 'segment_1_feature_1', 'segment_1_year_flag', 'price_lead_time_flag']]
Y = train_clean['booking_status']

test_cat = test_clean[['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved',
                     'lead_time', 'arrival_year', 'arrival_month', 'arrival_date', 'market_segment_type',
                     'repeated_guest', 'avg_price_per_room', 
                     'no_of_special_requests', 'total_guests', 'stay_length', 'stay_during_weekend',
                     'segment_0_feature_1', 'segment_1_feature_1', 'segment_1_year_flag', 'price_lead_time_flag']]

cat_md = CatBoostClassifier(loss_function = 'Logloss',
                            eval_metric = 'AUC',
                            iterations = 3287,
                            learning_rate = 0.06487165843182341,
                            depth = 5,
                            random_strength = 0.34199641155092914,
                            bagging_temperature = 0.81496844524381,
                            border_count = 255,
                            l2_leaf_reg = 25, 
                            verbose = False).fit(X_cat, Y)

cat_pred_test = cat_md.predict_proba(test_cat)[:, 1]

clean_pred = pd.DataFrame({'id': test_clean['id']})
clean_pred['booking_status_clean'] = cat_pred_test

dup_pred = duplicates[['id_y', 'booking_status']]
dup_pred.columns = ['id', 'booking_status_dup']
dup_pred['booking_status_dup'] = 1 - dup_pred['booking_status_dup']

submission = pd.merge(submission.drop(columns = 'booking_status', axis = 1), clean_pred, on = 'id', how = 'left')
submission = pd.merge(submission, dup_pred, on = 'id', how = 'left')
submission['booking_status'] = np.where(np.isnan(submission['booking_status_clean']), submission['booking_status_dup'], submission['booking_status_clean'])
submission.drop(columns = ['booking_status_clean', 'booking_status_dup'], axis = 1, inplace = True)

cat_sub = submission.copy()
cat_sub.columns = ['id', 'cat']
cat_sub.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup_pred['booking_status_dup'] = 1 - dup_pred['booking_status_dup']


Unnamed: 0,id,cat
0,42100,0.067354
1,42101,0.097399
2,42102,0.274797
3,42103,0.0288
4,42104,0.651255


# Consolidating 

In [22]:
submission = pd.merge(xgb_sub, lgb_sub, on = 'id')
submission = pd.merge(submission, cat_sub, on = 'id')
submission.head()

Unnamed: 0,id,xgb,lgb,cat
0,42100,0.111617,0.063296,0.067354
1,42101,0.100939,0.083414,0.097399
2,42102,0.36402,0.326009,0.274797
3,42103,0.030002,0.030986,0.0288
4,42104,0.492941,0.576908,0.651255


In [23]:
submission['booking_status'] = (submission['xgb'] + submission['lgb'] + submission['cat']) / 3
submission.drop(columns = ['xgb', 'lgb', 'cat'], axis = 1, inplace = True)
submission.head()

Unnamed: 0,id,booking_status
0,42100,0.080756
1,42101,0.093917
2,42102,0.321609
3,42103,0.029929
4,42104,0.573701


In [24]:
submission.to_csv('ensemble_full_data.csv', index = False)

# Looping 

In [25]:
X_xgb = train_clean[['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved', 'lead_time', 
                     'arrival_year', 'arrival_month', 'market_segment_type', 'repeated_guest', 'avg_price_per_room', 
                     'no_of_special_requests', 'segment_1', 'total_guests', 'stay_during_weekend', 'quarter_2', 
                     'quarter_3', 'segment_0_feature_1', 'segment_0_feature_2', 'segment_1_feature_1', 
                     'segment_1_feature_2', 'segment_1_year_flag', 'price_lead_time_flag']]
Y = train_clean['booking_status']

test_xgb = test_clean[['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved', 'lead_time', 
                     'arrival_year', 'arrival_month', 'market_segment_type', 'repeated_guest', 'avg_price_per_room', 
                     'no_of_special_requests', 'segment_1', 'total_guests', 'stay_during_weekend', 'quarter_2', 
                     'quarter_3', 'segment_0_feature_1', 'segment_0_feature_2', 'segment_1_feature_1', 
                     'segment_1_feature_2', 'segment_1_year_flag', 'price_lead_time_flag']]
preds = list()

for i in range(0, 10):
    print(i)
    XGB_md = XGBClassifier(tree_method = 'hist', 
                           max_depth = 5,
                           learning_rate = 0.012569604344366379,
                           n_estimators = 5050,
                           gamma = 1.1534413128130536,
                           min_child_weight = 10,
                           colsample_bytree = 0.4542341159557226,
                           subsample = 0.8576987566989286).fit(X_xgb, Y)
    preds.append(XGB_md.predict_proba(test_xgb)[:, 1])

xgb_pred_test = pd.DataFrame(preds).apply(np.mean, axis = 0)
clean_pred = pd.DataFrame({'id': test_clean['id']})
clean_pred['booking_status_clean'] = xgb_pred_test

dup_pred = duplicates[['id_y', 'booking_status']]
dup_pred.columns = ['id', 'booking_status_dup']
dup_pred['booking_status_dup'] = 1 - dup_pred['booking_status_dup']

submission = pd.merge(submission.drop(columns = 'booking_status', axis = 1), clean_pred, on = 'id', how = 'left')
submission = pd.merge(submission, dup_pred, on = 'id', how = 'left')
submission['booking_status'] = np.where(np.isnan(submission['booking_status_clean']), submission['booking_status_dup'], submission['booking_status_clean'])
submission.drop(columns = ['booking_status_clean', 'booking_status_dup'], axis = 1, inplace = True)

xgb_sub = submission.copy()
xgb_sub.columns = ['id', 'xgb']
xgb_sub.head()

0
1
2
3
4
5
6
7
8
9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup_pred['booking_status_dup'] = 1 - dup_pred['booking_status_dup']


Unnamed: 0,id,xgb
0,42100,0.111617
1,42101,0.100939
2,42102,0.36402
3,42103,0.030002
4,42104,0.492941
