In [1]:
pip install xgboost lightgbm catboost 

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.4-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting catboost
  Downloading catboost-1.1.1-cp310-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Installing collected package

In [4]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, RFECV
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# import optuna 

# Bucket holding the competition files
s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/PS-S3/Ep7/train.csv'
file_key_2 = 'Tabular-Playground-Series/PS-S3/Ep7/test.csv'
file_key_3 = 'Tabular-Playground-Series/PS-S3/Ep7/sample_submission.csv'

# One object handle per key, then .get() on each handle, then the 'Body'
# entry of each response (keys are processed in the order 1, 2, 3).
bucket_object_1, bucket_object_2, bucket_object_3 = [
    bucket.Object(key) for key in (file_key_1, file_key_2, file_key_3)
]
file_object_1, file_object_2, file_object_3 = [
    handle.get() for handle in (bucket_object_1, bucket_object_2, bucket_object_3)
]
file_content_stream_1, file_content_stream_2, file_content_stream_3 = [
    response.get('Body') for response in (file_object_1, file_object_2, file_object_3)
]

## Reading data files
train, test, submission = [
    pd.read_csv(stream)
    for stream in (file_content_stream_1, file_content_stream_2, file_content_stream_3)
]

## Fixing dates (https://www.kaggle.com/competitions/playground-series-s3e7/discussion/386655)
def cap_arrival_date(frame):
    """Clamp `arrival_date` to the length of the row's arrival month, in place.

    The month is parsed from `arrival_year` + `arrival_month`; any
    `arrival_date` larger than that month's day count is replaced by the
    day count. No helper column is left behind on `frame`.
    """
    month_start = pd.to_datetime(
        frame['arrival_year'].astype(str) + frame['arrival_month'].astype(str),
        format='%Y%m',
    )
    month_length = month_start.dt.days_in_month
    past_month_end = frame['arrival_date'] > month_length
    frame.loc[past_month_end, 'arrival_date'] = month_length


cap_arrival_date(train)
cap_arrival_date(test)

def add_engineered_features(frame):
    """Append the engineered 0/1 flags and count features to `frame` in place.

    Columns are added in a fixed order, so train and test end up with the
    same layout. Every flag is produced with np.where(condition, 1, 0).
    """
    def as_flag(condition):
        return np.where(condition, 1, 0)

    # Inputs that several features share
    segment = frame['market_segment_type']
    price = frame['avg_price_per_room']
    lead = frame['lead_time']
    month = frame['arrival_month']
    requests = frame['no_of_special_requests']
    weekend_nights = frame['no_of_weekend_nights']

    is_segment_0 = segment == 0
    is_segment_1 = segment == 1
    arrives_2018 = frame['arrival_year'] == 2018

    frame['low_price_flag'] = as_flag(price < 30)
    frame['segment_0'] = as_flag(is_segment_0)
    frame['segment_1'] = as_flag(is_segment_1)
    frame['total_guests'] = frame['no_of_adults'] + frame['no_of_children']
    frame['stay_length'] = weekend_nights + frame['no_of_week_nights']
    frame['stay_during_weekend'] = as_flag(weekend_nights > 0)

    # Calendar quarter of the arrival month
    frame['quarter_1'] = as_flag(month <= 3)
    frame['quarter_2'] = as_flag((month >= 4) & (month <= 6))
    frame['quarter_3'] = as_flag((month >= 7) & (month <= 9))
    frame['quarter_4'] = as_flag(month >= 10)

    # Segment-specific interactions
    frame['segment_0_feature_1'] = as_flag(is_segment_0 & (lead <= 90))
    frame['segment_0_feature_2'] = as_flag(is_segment_0 & (price > 98))
    frame['segment_1_feature_1'] = as_flag(is_segment_1 & (requests == 0))
    frame['segment_1_feature_2'] = as_flag(is_segment_1 & (requests > 0) & (lead <= 150))
    frame['segment_0_year_flag'] = as_flag(is_segment_0 & arrives_2018)
    frame['segment_1_year_flag'] = as_flag(is_segment_1 & arrives_2018)
    frame['price_lead_time_flag'] = as_flag((price > 100) & (lead > 150))


add_engineered_features(train)
add_engineered_features(test)


##########################
## Splitting Duplicates ##
##########################

# Train and test rows count as duplicates of each other when they agree on
# the columns at positions 1 through 17 of `train`.
match_columns = train.columns.tolist()[1:18]

duplicates = train.merge(test, on=match_columns)
train_dup_ids = duplicates['id_x'].tolist()
test_dup_ids = duplicates['id_y'].tolist()

# Partition each frame into rows with / without a counterpart in the other
train_has_match = train['id'].isin(train_dup_ids)
train_clean = train[~train_has_match].reset_index(drop=True)
train_dup = train[train_has_match].reset_index(drop=True)

test_has_match = test['id'].isin(test_dup_ids)
test_clean = test[~test_has_match].reset_index(drop=True)
test_dup = test[test_has_match].reset_index(drop=True)

# XGBoost

In [17]:
# Columns fed to XGBoost. Declared once so the train and test matrices are
# always built from the same list.
xgb_features = ['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved', 'lead_time',
                'arrival_year', 'arrival_month', 'market_segment_type', 'repeated_guest', 'avg_price_per_room',
                'no_of_special_requests', 'segment_1', 'total_guests', 'stay_during_weekend', 'quarter_2',
                'quarter_3', 'segment_0_feature_1', 'segment_0_feature_2', 'segment_1_feature_1',
                'segment_1_feature_2', 'segment_1_year_flag', 'price_lead_time_flag']

X_xgb = train_clean[xgb_features]
Y = train_clean['booking_status']
test_xgb = test_clean[xgb_features]

XGB_md = XGBClassifier(tree_method='hist',
                       max_depth=5,
                       learning_rate=0.012569604344366379,
                       n_estimators=5050,
                       gamma=1.1534413128130536,
                       min_child_weight=10,
                       colsample_bytree=0.4542341159557226,
                       subsample=0.8576987566989286).fit(X_xgb, Y)

# Positive-class probability for the test rows that have no train duplicate
xgb_pred_test = XGB_md.predict_proba(test_xgb)[:, 1]
clean_pred = pd.DataFrame({'id': test_clean['id'],
                           'booking_status_clean': xgb_pred_test})

# Test rows that duplicate a train row receive 1 - (train label).
# Built as a fresh frame: assigning into a column slice of `duplicates`
# is what raised SettingWithCopyWarning.
# NOTE(review): the reason for inverting the label is not shown in this notebook - confirm.
# NOTE(review): if one test id matches several train rows, the left merge below repeats that id - confirm ids are unique.
dup_pred = pd.DataFrame({'id': duplicates['id_y'],
                         'booking_status_dup': 1 - duplicates['booking_status']})

# Attach both prediction sources to the submission ids; where the model
# prediction is missing, fall back to the duplicate-based value.
submission = (submission.drop(columns='booking_status')
                        .merge(clean_pred, on='id', how='left')
                        .merge(dup_pred, on='id', how='left'))
submission['booking_status'] = np.where(np.isnan(submission['booking_status_clean']),
                                        submission['booking_status_dup'],
                                        submission['booking_status_clean'])
submission = submission.drop(columns=['booking_status_clean', 'booking_status_dup'])

xgb_sub = submission.copy()
xgb_sub.columns = ['id', 'xgb']
xgb_sub.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup_pred['booking_status_dup'] = 1 - dup_pred['booking_status_dup']


Unnamed: 0,id,xgb
0,42100,0.111617
1,42101,0.100939
2,42102,0.36402
3,42103,0.030002
4,42104,0.492941


# LightGBM

In [19]:
# Columns fed to LightGBM. Declared once so the train and test matrices are
# always built from the same list.
lgb_features = ['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved',
                'lead_time', 'arrival_year', 'arrival_month', 'arrival_date', 'market_segment_type',
                'repeated_guest', 'no_of_previous_bookings_not_canceled', 'avg_price_per_room',
                'no_of_special_requests', 'segment_1', 'total_guests', 'stay_length', 'stay_during_weekend',
                'quarter_2', 'quarter_3', 'segment_1_feature_1', 'price_lead_time_flag']

X_lgb = train_clean[lgb_features]
Y = train_clean['booking_status']
test_lgb = test_clean[lgb_features]

# NOTE(review): LightGBM documents that bagging_fraction only takes effect when
# bagging_freq is non-zero; no bagging_freq is passed here, so the tuned
# bagging_fraction is presumably inert - confirm against the tuning setup before changing.
lgb_md = LGBMClassifier(boosting_type='gbdt',
                        n_estimators=8743,
                        learning_rate=0.005503197021026835,
                        max_depth=11,
                        lambda_l1=1.815327684394496,
                        lambda_l2=0.2012642220501286,
                        num_leaves=27,
                        bagging_fraction=0.21774814214525,
                        feature_fraction=0.4307774047555758).fit(X_lgb, Y)

# Positive-class probability for the test rows that have no train duplicate
lgb_pred_test = lgb_md.predict_proba(test_lgb)[:, 1]
clean_pred = pd.DataFrame({'id': test_clean['id'],
                           'booking_status_clean': lgb_pred_test})

# Test rows that duplicate a train row receive 1 - (train label).
# Built as a fresh frame: assigning into a column slice of `duplicates`
# is what raised SettingWithCopyWarning.
# NOTE(review): the reason for inverting the label is not shown in this notebook - confirm.
# NOTE(review): if one test id matches several train rows, the left merge below repeats that id - confirm ids are unique.
dup_pred = pd.DataFrame({'id': duplicates['id_y'],
                         'booking_status_dup': 1 - duplicates['booking_status']})

# Attach both prediction sources to the submission ids; where the model
# prediction is missing, fall back to the duplicate-based value.
submission = (submission.drop(columns='booking_status')
                        .merge(clean_pred, on='id', how='left')
                        .merge(dup_pred, on='id', how='left'))
submission['booking_status'] = np.where(np.isnan(submission['booking_status_clean']),
                                        submission['booking_status_dup'],
                                        submission['booking_status_clean'])
submission = submission.drop(columns=['booking_status_clean', 'booking_status_dup'])

lgb_sub = submission.copy()
lgb_sub.columns = ['id', 'lgb']
lgb_sub.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup_pred['booking_status_dup'] = 1 - dup_pred['booking_status_dup']


Unnamed: 0,id,lgb
0,42100,0.063296
1,42101,0.083414
2,42102,0.326009
3,42103,0.030986
4,42104,0.576908


# CatBoost

In [21]:
# Columns fed to CatBoost. Declared once so the train and test matrices are
# always built from the same list.
cat_features_used = ['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved',
                     'lead_time', 'arrival_year', 'arrival_month', 'arrival_date', 'market_segment_type',
                     'repeated_guest', 'avg_price_per_room',
                     'no_of_special_requests', 'total_guests', 'stay_length', 'stay_during_weekend',
                     'segment_0_feature_1', 'segment_1_feature_1', 'segment_1_year_flag', 'price_lead_time_flag']

X_cat = train_clean[cat_features_used]
Y = train_clean['booking_status']
test_cat = test_clean[cat_features_used]

cat_md = CatBoostClassifier(loss_function='Logloss',
                            eval_metric='AUC',
                            iterations=3287,
                            learning_rate=0.06487165843182341,
                            depth=5,
                            random_strength=0.34199641155092914,
                            bagging_temperature=0.81496844524381,
                            border_count=255,
                            l2_leaf_reg=25,
                            verbose=False).fit(X_cat, Y)

# Positive-class probability for the test rows that have no train duplicate
cat_pred_test = cat_md.predict_proba(test_cat)[:, 1]
clean_pred = pd.DataFrame({'id': test_clean['id'],
                           'booking_status_clean': cat_pred_test})

# Test rows that duplicate a train row receive 1 - (train label).
# Built as a fresh frame: assigning into a column slice of `duplicates`
# is what raised SettingWithCopyWarning.
# NOTE(review): the reason for inverting the label is not shown in this notebook - confirm.
# NOTE(review): if one test id matches several train rows, the left merge below repeats that id - confirm ids are unique.
dup_pred = pd.DataFrame({'id': duplicates['id_y'],
                         'booking_status_dup': 1 - duplicates['booking_status']})

# Attach both prediction sources to the submission ids; where the model
# prediction is missing, fall back to the duplicate-based value.
submission = (submission.drop(columns='booking_status')
                        .merge(clean_pred, on='id', how='left')
                        .merge(dup_pred, on='id', how='left'))
submission['booking_status'] = np.where(np.isnan(submission['booking_status_clean']),
                                        submission['booking_status_dup'],
                                        submission['booking_status_clean'])
submission = submission.drop(columns=['booking_status_clean', 'booking_status_dup'])

cat_sub = submission.copy()
cat_sub.columns = ['id', 'cat']
cat_sub.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup_pred['booking_status_dup'] = 1 - dup_pred['booking_status_dup']


Unnamed: 0,id,cat
0,42100,0.067354
1,42101,0.097399
2,42102,0.274797
3,42103,0.0288
4,42104,0.651255


# Consolidating 

In [22]:
# Place the three per-model prediction columns side by side, keyed on `id`
submission = xgb_sub.merge(lgb_sub, on='id').merge(cat_sub, on='id')
submission.head()

Unnamed: 0,id,xgb,lgb,cat
0,42100,0.111617,0.063296,0.067354
1,42101,0.100939,0.083414,0.097399
2,42102,0.36402,0.326009,0.274797
3,42103,0.030002,0.030986,0.0288
4,42104,0.492941,0.576908,0.651255


In [23]:
# Equal-weight blend of the three models; the per-model columns are then discarded
model_columns = ['xgb', 'lgb', 'cat']
blended = (submission['xgb'] + submission['lgb'] + submission['cat']) / 3
submission = submission.assign(booking_status=blended).drop(columns=model_columns)
submission.head()

Unnamed: 0,id,booking_status
0,42100,0.080756
1,42101,0.093917
2,42102,0.321609
3,42103,0.029929
4,42104,0.573701


In [24]:
# Write the current `submission` frame to CSV, omitting the DataFrame index.
submission.to_csv('ensemble_full_data.csv', index = False)

# Looping 

In [6]:
# Columns fed to XGBoost. Declared once so the train and test matrices are
# always built from the same list.
xgb_features = ['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved', 'lead_time',
                'arrival_year', 'arrival_month', 'market_segment_type', 'repeated_guest', 'avg_price_per_room',
                'no_of_special_requests', 'segment_1', 'total_guests', 'stay_during_weekend', 'quarter_2',
                'quarter_3', 'segment_0_feature_1', 'segment_0_feature_2', 'segment_1_feature_1',
                'segment_1_feature_2', 'segment_1_year_flag', 'price_lead_time_flag']

X_xgb = train_clean[xgb_features]
Y = train_clean['booking_status']
test_xgb = test_clean[xgb_features]

# Hyper-parameters are identical for every run; only the seed varies.
xgb_params = dict(tree_method='hist',
                  max_depth=5,
                  learning_rate=0.012569604344366379,
                  n_estimators=5050,
                  gamma=1.1534413128130536,
                  min_child_weight=10,
                  colsample_bytree=0.4542341159557226,
                  subsample=0.8576987566989286)

preds = list()

# Refit with seeds 0..99 and keep each run's positive-class probabilities
for seed in tqdm(range(0, 100)):
    XGB_md = XGBClassifier(**xgb_params, random_state=seed).fit(X_xgb, Y)
    preds.append(XGB_md.predict_proba(test_xgb)[:, 1])

# Average across runs in one vectorised call (one value per test row)
xgb_pred_test = np.mean(preds, axis=0)
clean_pred = pd.DataFrame({'id': test_clean['id'],
                           'booking_status_clean': xgb_pred_test})

# Test rows that duplicate a train row receive 1 - (train label).
# Built as a fresh frame: assigning into a column slice of `duplicates`
# is what raised SettingWithCopyWarning.
# NOTE(review): the reason for inverting the label is not shown in this notebook - confirm.
# NOTE(review): if one test id matches several train rows, the left merge below repeats that id - confirm ids are unique.
dup_pred = pd.DataFrame({'id': duplicates['id_y'],
                         'booking_status_dup': 1 - duplicates['booking_status']})

# Attach both prediction sources to the submission ids; where the model
# prediction is missing, fall back to the duplicate-based value.
submission = (submission.drop(columns='booking_status')
                        .merge(clean_pred, on='id', how='left')
                        .merge(dup_pred, on='id', how='left'))
submission['booking_status'] = np.where(np.isnan(submission['booking_status_clean']),
                                        submission['booking_status_dup'],
                                        submission['booking_status_clean'])
submission = submission.drop(columns=['booking_status_clean', 'booking_status_dup'])

xgb_sub = submission.copy()
xgb_sub.columns = ['id', 'xgb']
xgb_sub.head()

100%|██████████| 100/100 [10:07<00:00,  6.08s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup_pred['booking_status_dup'] = 1 - dup_pred['booking_status_dup']


Unnamed: 0,id,xgb
0,42100,0.109704
1,42101,0.095307
2,42102,0.348856
3,42103,0.030804
4,42104,0.48027


In [7]:
# Columns fed to LightGBM. Declared once so the train and test matrices are
# always built from the same list.
lgb_features = ['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved',
                'lead_time', 'arrival_year', 'arrival_month', 'arrival_date', 'market_segment_type',
                'repeated_guest', 'no_of_previous_bookings_not_canceled', 'avg_price_per_room',
                'no_of_special_requests', 'segment_1', 'total_guests', 'stay_length', 'stay_during_weekend',
                'quarter_2', 'quarter_3', 'segment_1_feature_1', 'price_lead_time_flag']

X_lgb = train_clean[lgb_features]
Y = train_clean['booking_status']
test_lgb = test_clean[lgb_features]

# Hyper-parameters are identical for every run; only the seed varies.
# NOTE(review): LightGBM documents that bagging_fraction only takes effect when
# bagging_freq is non-zero; no bagging_freq is passed here, so the tuned
# bagging_fraction is presumably inert - confirm against the tuning setup before changing.
lgb_params = dict(boosting_type='gbdt',
                  n_estimators=8743,
                  learning_rate=0.005503197021026835,
                  max_depth=11,
                  lambda_l1=1.815327684394496,
                  lambda_l2=0.2012642220501286,
                  num_leaves=27,
                  bagging_fraction=0.21774814214525,
                  feature_fraction=0.4307774047555758)

preds = list()

# Refit with seeds 0..99 and keep each run's positive-class probabilities
for seed in tqdm(range(0, 100)):
    lgb_md = LGBMClassifier(**lgb_params, random_state=seed).fit(X_lgb, Y)
    preds.append(lgb_md.predict_proba(test_lgb)[:, 1])

# Average across runs in one vectorised call (one value per test row)
lgb_pred_test = np.mean(preds, axis=0)
clean_pred = pd.DataFrame({'id': test_clean['id'],
                           'booking_status_clean': lgb_pred_test})

# Test rows that duplicate a train row receive 1 - (train label).
# Built as a fresh frame: assigning into a column slice of `duplicates`
# is what raised SettingWithCopyWarning.
# NOTE(review): the reason for inverting the label is not shown in this notebook - confirm.
# NOTE(review): if one test id matches several train rows, the left merge below repeats that id - confirm ids are unique.
dup_pred = pd.DataFrame({'id': duplicates['id_y'],
                         'booking_status_dup': 1 - duplicates['booking_status']})

# Attach both prediction sources to the submission ids; where the model
# prediction is missing, fall back to the duplicate-based value.
submission = (submission.drop(columns='booking_status')
                        .merge(clean_pred, on='id', how='left')
                        .merge(dup_pred, on='id', how='left'))
submission['booking_status'] = np.where(np.isnan(submission['booking_status_clean']),
                                        submission['booking_status_dup'],
                                        submission['booking_status_clean'])
submission = submission.drop(columns=['booking_status_clean', 'booking_status_dup'])

lgb_sub = submission.copy()
lgb_sub.columns = ['id', 'lgb']
lgb_sub.head()

  0%|          | 0/100 [00:00<?, ?it/s]



  1%|          | 1/100 [00:23<39:08, 23.73s/it]



  2%|▏         | 2/100 [00:46<37:37, 23.04s/it]



  3%|▎         | 3/100 [01:09<37:32, 23.23s/it]



  4%|▍         | 4/100 [01:29<34:50, 21.77s/it]



  5%|▌         | 5/100 [01:49<33:18, 21.04s/it]



  6%|▌         | 6/100 [02:13<34:34, 22.07s/it]



  7%|▋         | 7/100 [02:32<32:59, 21.29s/it]



  8%|▊         | 8/100 [02:52<32:06, 20.94s/it]



  9%|▉         | 9/100 [03:13<31:27, 20.74s/it]



 10%|█         | 10/100 [03:33<30:51, 20.58s/it]



 11%|█         | 11/100 [03:56<31:33, 21.27s/it]



 12%|█▏        | 12/100 [04:19<31:50, 21.71s/it]



 13%|█▎        | 13/100 [04:38<30:37, 21.13s/it]



 14%|█▍        | 14/100 [04:58<29:50, 20.82s/it]



 15%|█▌        | 15/100 [05:22<30:48, 21.75s/it]



 16%|█▌        | 16/100 [05:42<29:31, 21.09s/it]



 17%|█▋        | 17/100 [06:02<28:42, 20.75s/it]



 18%|█▊        | 18/100 [06:22<28:03, 20.54s/it]



 19%|█▉        | 19/100 [06:45<28:57, 21.46s/it]



 20%|██        | 20/100 [07:10<29:44, 22.31s/it]



 21%|██        | 21/100 [07:33<29:40, 22.53s/it]



 22%|██▏       | 22/100 [07:54<28:35, 21.99s/it]



 23%|██▎       | 23/100 [08:18<29:13, 22.77s/it]



 24%|██▍       | 24/100 [08:38<27:49, 21.96s/it]



 25%|██▌       | 25/100 [09:02<28:09, 22.53s/it]



 26%|██▌       | 26/100 [09:25<27:50, 22.57s/it]



 27%|██▋       | 27/100 [09:48<27:36, 22.69s/it]



 28%|██▊       | 28/100 [10:11<27:20, 22.79s/it]



 29%|██▉       | 29/100 [10:34<26:58, 22.80s/it]



 30%|███       | 30/100 [10:57<26:48, 22.98s/it]



 31%|███       | 31/100 [11:24<27:52, 24.24s/it]



 32%|███▏      | 32/100 [11:47<26:58, 23.80s/it]



 33%|███▎      | 33/100 [12:08<25:34, 22.91s/it]



 34%|███▍      | 34/100 [12:28<24:12, 22.00s/it]



 35%|███▌      | 35/100 [12:47<23:02, 21.26s/it]



 36%|███▌      | 36/100 [13:11<23:22, 21.91s/it]



 37%|███▋      | 37/100 [13:34<23:21, 22.25s/it]



 38%|███▊      | 38/100 [13:56<23:08, 22.39s/it]



 39%|███▉      | 39/100 [14:22<23:52, 23.49s/it]



 40%|████      | 40/100 [14:42<22:23, 22.38s/it]



 41%|████      | 41/100 [15:03<21:26, 21.80s/it]



 42%|████▏     | 42/100 [15:23<20:42, 21.42s/it]



 43%|████▎     | 43/100 [15:46<20:50, 21.93s/it]



 44%|████▍     | 44/100 [16:09<20:41, 22.18s/it]



 45%|████▌     | 45/100 [16:32<20:34, 22.45s/it]



 46%|████▌     | 46/100 [16:56<20:39, 22.96s/it]



 47%|████▋     | 47/100 [17:22<21:06, 23.89s/it]



 48%|████▊     | 48/100 [17:45<20:24, 23.54s/it]



 49%|████▉     | 49/100 [18:08<19:47, 23.27s/it]



 50%|█████     | 50/100 [18:31<19:17, 23.14s/it]



 51%|█████     | 51/100 [18:54<18:51, 23.09s/it]



 52%|█████▏    | 52/100 [19:16<18:22, 22.97s/it]



 53%|█████▎    | 53/100 [19:39<17:58, 22.96s/it]



 54%|█████▍    | 54/100 [20:02<17:33, 22.89s/it]



 55%|█████▌    | 55/100 [20:27<17:40, 23.56s/it]



 56%|█████▌    | 56/100 [20:50<17:14, 23.50s/it]



 57%|█████▋    | 57/100 [21:14<16:48, 23.44s/it]



 58%|█████▊    | 58/100 [21:37<16:18, 23.31s/it]



 59%|█████▉    | 59/100 [22:00<15:56, 23.32s/it]



 60%|██████    | 60/100 [22:23<15:28, 23.22s/it]



 61%|██████    | 61/100 [22:46<15:06, 23.24s/it]



 62%|██████▏   | 62/100 [23:15<15:40, 24.75s/it]



 63%|██████▎   | 63/100 [23:38<15:01, 24.36s/it]



 64%|██████▍   | 64/100 [24:01<14:23, 24.00s/it]



 65%|██████▌   | 65/100 [24:25<13:56, 23.89s/it]



 66%|██████▌   | 66/100 [24:48<13:27, 23.76s/it]



 67%|██████▋   | 67/100 [25:12<13:06, 23.82s/it]



 68%|██████▊   | 68/100 [25:35<12:34, 23.56s/it]



 69%|██████▉   | 69/100 [25:58<12:06, 23.44s/it]



 70%|███████   | 70/100 [26:26<12:16, 24.56s/it]



 71%|███████   | 71/100 [26:48<11:35, 23.99s/it]



 72%|███████▏  | 72/100 [27:12<11:12, 24.00s/it]



 73%|███████▎  | 73/100 [27:35<10:37, 23.62s/it]



 74%|███████▍  | 74/100 [27:58<10:05, 23.31s/it]



 75%|███████▌  | 75/100 [28:20<09:38, 23.13s/it]



 76%|███████▌  | 76/100 [28:41<09:00, 22.51s/it]



 77%|███████▋  | 77/100 [29:08<09:05, 23.71s/it]



 78%|███████▊  | 78/100 [29:31<08:35, 23.45s/it]



 79%|███████▉  | 79/100 [29:55<08:16, 23.63s/it]



 80%|████████  | 80/100 [30:18<07:48, 23.41s/it]



 81%|████████  | 81/100 [30:41<07:22, 23.29s/it]



 82%|████████▏ | 82/100 [31:03<06:56, 23.14s/it]



 83%|████████▎ | 83/100 [31:24<06:22, 22.51s/it]



 84%|████████▍ | 84/100 [31:48<06:06, 22.90s/it]



 85%|████████▌ | 85/100 [32:16<06:03, 24.27s/it]



 86%|████████▌ | 86/100 [32:38<05:29, 23.57s/it]



 87%|████████▋ | 87/100 [32:58<04:54, 22.63s/it]



 88%|████████▊ | 88/100 [33:20<04:29, 22.44s/it]



 89%|████████▉ | 89/100 [33:43<04:07, 22.53s/it]



 90%|█████████ | 90/100 [34:06<03:47, 22.77s/it]



 91%|█████████ | 91/100 [34:29<03:24, 22.73s/it]



 92%|█████████▏| 92/100 [34:52<03:02, 22.81s/it]



 93%|█████████▎| 93/100 [35:20<02:50, 24.34s/it]



 94%|█████████▍| 94/100 [35:43<02:23, 23.99s/it]



 95%|█████████▌| 95/100 [36:06<01:58, 23.62s/it]



 96%|█████████▌| 96/100 [36:26<01:30, 22.68s/it]



 97%|█████████▋| 97/100 [36:46<01:05, 21.86s/it]



 98%|█████████▊| 98/100 [37:07<00:42, 21.48s/it]



 99%|█████████▉| 99/100 [37:30<00:21, 21.93s/it]



100%|██████████| 100/100 [37:52<00:00, 22.73s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup_pred['booking_status_dup'] = 1 - dup_pred['booking_status_dup']


Unnamed: 0,id,lgb
0,42100,0.060219
1,42101,0.081466
2,42102,0.326648
3,42103,0.029949
4,42104,0.589078


In [8]:
# Columns fed to CatBoost. Declared once so the train and test matrices are
# always built from the same list.
cat_features_used = ['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved',
                     'lead_time', 'arrival_year', 'arrival_month', 'arrival_date', 'market_segment_type',
                     'repeated_guest', 'avg_price_per_room',
                     'no_of_special_requests', 'total_guests', 'stay_length', 'stay_during_weekend',
                     'segment_0_feature_1', 'segment_1_feature_1', 'segment_1_year_flag', 'price_lead_time_flag']

X_cat = train_clean[cat_features_used]
Y = train_clean['booking_status']
test_cat = test_clean[cat_features_used]

# Hyper-parameters are identical for every run; only the seed varies.
cat_params = dict(loss_function='Logloss',
                  eval_metric='AUC',
                  iterations=3287,
                  learning_rate=0.06487165843182341,
                  depth=5,
                  random_strength=0.34199641155092914,
                  bagging_temperature=0.81496844524381,
                  border_count=255,
                  l2_leaf_reg=25,
                  verbose=False)

preds = list()

# Refit with seeds 0..9 and keep each run's positive-class probabilities
for seed in tqdm(range(0, 10)):
    cat_md = CatBoostClassifier(**cat_params, random_seed=seed).fit(X_cat, Y)
    preds.append(cat_md.predict_proba(test_cat)[:, 1])

# Average across runs in one vectorised call (one value per test row)
cat_pred_test = np.mean(preds, axis=0)
clean_pred = pd.DataFrame({'id': test_clean['id'],
                           'booking_status_clean': cat_pred_test})

# Test rows that duplicate a train row receive 1 - (train label).
# Built as a fresh frame: assigning into a column slice of `duplicates`
# is what raised SettingWithCopyWarning.
# NOTE(review): the reason for inverting the label is not shown in this notebook - confirm.
# NOTE(review): if one test id matches several train rows, the left merge below repeats that id - confirm ids are unique.
dup_pred = pd.DataFrame({'id': duplicates['id_y'],
                         'booking_status_dup': 1 - duplicates['booking_status']})

# Attach both prediction sources to the submission ids; where the model
# prediction is missing, fall back to the duplicate-based value.
submission = (submission.drop(columns='booking_status')
                        .merge(clean_pred, on='id', how='left')
                        .merge(dup_pred, on='id', how='left'))
submission['booking_status'] = np.where(np.isnan(submission['booking_status_clean']),
                                        submission['booking_status_dup'],
                                        submission['booking_status_clean'])
submission = submission.drop(columns=['booking_status_clean', 'booking_status_dup'])

cat_sub = submission.copy()
cat_sub.columns = ['id', 'cat']
cat_sub.head()

100%|██████████| 10/10 [02:39<00:00, 15.95s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup_pred['booking_status_dup'] = 1 - dup_pred['booking_status_dup']


Unnamed: 0,id,cat
0,42100,0.069672
1,42101,0.115494
2,42102,0.259006
3,42103,0.02725
4,42104,0.65464


In [9]:
# Place the three seed-averaged prediction columns side by side, keyed on `id`
submission = xgb_sub.merge(lgb_sub, on='id').merge(cat_sub, on='id')
submission.head()

Unnamed: 0,id,xgb,lgb,cat
0,42100,0.109704,0.060219,0.069672
1,42101,0.095307,0.081466,0.115494
2,42102,0.348856,0.326648,0.259006
3,42103,0.030804,0.029949,0.02725
4,42104,0.48027,0.589078,0.65464


In [10]:
# Equal-weight blend of the three models; the per-model columns are then discarded
model_columns = ['xgb', 'lgb', 'cat']
blended = (submission['xgb'] + submission['lgb'] + submission['cat']) / 3
submission = submission.assign(booking_status=blended).drop(columns=model_columns)
submission.head()

Unnamed: 0,id,booking_status
0,42100,0.079865
1,42101,0.097422
2,42102,0.311503
3,42103,0.029335
4,42104,0.574663


In [11]:
# Write the current `submission` frame to CSV, omitting the DataFrame index.
submission.to_csv('ensemble_full_data_100.csv', index = False)