In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import chime
from sklearn.model_selection import GridSearchCV
from tqdm.notebook import tqdm_notebook

In [6]:
# Load the chime IPython extension; it provides the %%chime cell magic used on the long-running fit cells below.
%load_ext chime

# Model selection
Kaggle Playground Series - Season 3, Episode 7

https://www.kaggle.com/competitions/playground-series-s3e7/overview

## Individual classifiers

### Note
Throughout this notebook I use the `mean_test_score` from cross-validation to compare model performance across hyperparameter settings and to judge whether the differences are large enough to justify further optimization.

# Data preparation

In [7]:
# Read the training data, using the `id` column as the index, then preview it.
train = pd.read_csv('./data/train.csv', index_col='id')
train

Unnamed: 0_level_0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,2,0,0,2,1,0,0,9,2018,1,14,1,1,11,0,67.50,0,0
1,2,0,1,2,0,0,0,117,2018,7,29,0,0,0,0,72.25,0,0
2,2,0,0,1,0,0,0,315,2018,12,2,0,0,0,0,52.00,0,0
3,1,0,0,2,1,0,0,32,2018,12,1,1,0,0,0,56.00,0,0
4,2,0,1,0,0,0,0,258,2018,10,16,0,0,0,0,100.00,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42095,3,0,0,4,0,0,1,160,2018,12,30,1,0,0,0,140.00,2,1
42096,2,0,0,3,0,0,0,34,2017,9,23,0,0,0,0,224.67,0,0
42097,2,0,0,2,2,0,0,292,2018,7,21,0,0,0,0,96.00,0,0
42098,1,0,0,3,0,0,0,5,2018,11,9,0,0,0,0,120.00,0,0


I will not split off a separate validation set; instead, hyperparameter selection is based on the cross-validation results.

In [8]:
# Separate the target column from the predictor columns.
y_train = train['booking_status']
X_train = train.drop(columns=['booking_status'])

In [9]:
# Standardise every feature column with a scaler fitted on the training data.
#
# The scaler is fitted on the raw feature columns of `train`, not on `X_train`:
# this cell overwrites `X_train`, so fitting on it would make a re-run refit the
# scaler on already-scaled data and the later `scaler.transform(test)` calls
# would no longer scale the raw test set correctly.
raw_features = train.drop(columns='booking_status')
scaler = StandardScaler().fit(raw_features)

# Keep the original `id` index so rows stay aligned with `y_train`, matching how
# X_test is rebuilt from `test` (index=test.index) in the prediction section.
X_train = pd.DataFrame(scaler.transform(raw_features),
                       columns=raw_features.columns,
                       index=raw_features.index)

# NOTE(review): the scaler sees all training rows before GridSearchCV splits them,
# so each fold's validation rows contribute to the scaling statistics. Wrapping
# scaler + estimator in a Pipeline would avoid this — confirm whether it matters here.
X_train

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests
0,0.151040,-0.313454,-0.998814,-0.278849,1.294624,-0.160945,-0.515173,-1.170469,0.409661,-2.330398,-0.214091,0.428551,5.766758,33.699120,-0.101479,-0.998052,-0.737691
1,0.151040,-0.313454,0.130259,-0.278849,-0.407020,-0.160945,-0.515173,0.161740,0.409661,-0.209778,1.473487,-1.149928,-0.173408,-0.060506,-0.101479,-0.870153,-0.737691
2,0.151040,-0.313454,-0.998814,-0.979466,-0.407020,-0.160945,-0.515173,2.604122,0.409661,1.557404,-1.564154,-1.149928,-0.173408,-0.060506,-0.101479,-1.415406,-0.737691
3,-1.753925,-0.313454,-0.998814,-0.278849,1.294624,-0.160945,-0.515173,-0.886758,0.409661,1.557404,-1.676659,0.428551,-0.173408,-0.060506,-0.101479,-1.307702,-0.737691
4,0.151040,-0.313454,0.130259,-1.680084,-0.407020,-0.160945,-0.515173,1.901012,0.409661,0.850531,0.010919,-1.149928,-0.173408,-0.060506,-0.101479,-0.122955,-0.737691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42095,2.056005,-0.313454,-0.998814,1.122386,-0.407020,-0.160945,0.685890,0.692156,0.409661,1.557404,1.585992,0.428551,-0.173408,-0.060506,-0.101479,0.954088,1.842849
42096,0.151040,-0.313454,-0.998814,0.421768,-0.407020,-0.160945,-0.515173,-0.862087,-2.441040,0.497095,0.798456,-1.149928,-0.173408,-0.060506,-0.101479,3.233919,-0.737691
42097,0.151040,-0.313454,-0.998814,-0.278849,2.996268,-0.160945,-0.515173,2.320411,0.409661,-0.209778,0.573445,-1.149928,-0.173408,-0.060506,-0.101479,-0.230659,-0.737691
42098,-1.753925,-0.313454,-0.998814,0.421768,-0.407020,-0.160945,-0.515173,-1.219810,0.409661,1.203968,-0.776617,-1.149928,-0.173408,-0.060506,-0.101479,0.415567,-0.737691


# Model selection

## Random forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Random forest with bootstrap sampling; the seed is fixed so repeated searches are comparable.
estimator = RandomForestClassifier(
    bootstrap=True,
    random_state=8,
    n_jobs=-1,
)

# Earlier random-forest search grids, oldest first. They are kept as an explicit
# history instead of successive `params = ...` assignments, where every grid but
# the last was a dead store that was overwritten before anything read it.
rf_search_rounds = [
    # Round 1: coarse sweep, including max_features=None and small bootstrap samples.
    {
        'n_estimators': [1500, 2000, 2500],
        'max_depth': [10, 25, 50],
        'min_samples_split': [5, 10, 15],
        'max_features': ['sqrt', None],
        'max_samples': [0.1, 0.3, 0.5]
    },
    # Round 2: fixed forest size, larger min_samples_split and bootstrap fractions.
    {
        'n_estimators': [2250],
        'max_depth': [30, 50, 100],
        'min_samples_split': [20, 30, 50],
        'max_features': ['sqrt'],
        'max_samples': [0.7, 1.0]
    },
]

# Same value `params` had after the original cell: the most recent grid.
params = rf_search_rounds[-1]

In [29]:
# Random-forest grid used by the search below (3 x 4 x 2 = 24 candidates).
params = dict(
    n_estimators=[2250],
    max_depth=[40, 50, 70],
    min_samples_split=[40, 50, 70, 100],
    max_features=['sqrt'],
    max_samples=[0.8, 1.0],
)

In [30]:
# Exhaustive 5-fold search over `params`, scored by ROC AUC; verbose=3 logs every fold.
model = GridSearchCV(
    estimator,
    params,
    scoring='roc_auc',
    cv=5,
    verbose=3,
)

In [31]:
%%time
%%chime
# Run the grid search on the scaled training data. %%time reports how long the cell took;
# %%chime presumably plays a sound when it finishes — confirm against the chime docs.
model.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END max_depth=40, max_features=sqrt, max_samples=0.8, min_samples_split=40, n_estimators=2250;, score=0.885 total time=  15.3s
[CV 2/5] END max_depth=40, max_features=sqrt, max_samples=0.8, min_samples_split=40, n_estimators=2250;, score=0.896 total time=  15.0s
[CV 3/5] END max_depth=40, max_features=sqrt, max_samples=0.8, min_samples_split=40, n_estimators=2250;, score=0.890 total time=  14.9s
[CV 4/5] END max_depth=40, max_features=sqrt, max_samples=0.8, min_samples_split=40, n_estimators=2250;, score=0.896 total time=  14.9s
[CV 5/5] END max_depth=40, max_features=sqrt, max_samples=0.8, min_samples_split=40, n_estimators=2250;, score=0.887 total time=  14.9s
[CV 1/5] END max_depth=40, max_features=sqrt, max_samples=0.8, min_samples_split=50, n_estimators=2250;, score=0.885 total time=  14.6s
[CV 2/5] END max_depth=40, max_features=sqrt, max_samples=0.8, min_samples_split=50, n_estimators=2250;, score=0.896 total

[CV 1/5] END max_depth=50, max_features=sqrt, max_samples=1.0, min_samples_split=40, n_estimators=2250;, score=0.886 total time=  16.8s
[CV 2/5] END max_depth=50, max_features=sqrt, max_samples=1.0, min_samples_split=40, n_estimators=2250;, score=0.896 total time=  16.8s
[CV 3/5] END max_depth=50, max_features=sqrt, max_samples=1.0, min_samples_split=40, n_estimators=2250;, score=0.890 total time=  16.7s
[CV 4/5] END max_depth=50, max_features=sqrt, max_samples=1.0, min_samples_split=40, n_estimators=2250;, score=0.897 total time=  16.7s
[CV 5/5] END max_depth=50, max_features=sqrt, max_samples=1.0, min_samples_split=40, n_estimators=2250;, score=0.887 total time=  16.8s
[CV 1/5] END max_depth=50, max_features=sqrt, max_samples=1.0, min_samples_split=50, n_estimators=2250;, score=0.885 total time=  16.6s
[CV 2/5] END max_depth=50, max_features=sqrt, max_samples=1.0, min_samples_split=50, n_estimators=2250;, score=0.896 total time=  16.3s
[CV 3/5] END max_depth=50, max_features=sqrt, ma

In [32]:
# Hyperparameter combination with the highest mean cross-validated score.
model.best_params_

{'max_depth': 40,
 'max_features': 'sqrt',
 'max_samples': 1.0,
 'min_samples_split': 40,
 'n_estimators': 2250}

In [33]:
# Mean cross-validated ROC AUC of the best combination.
model.best_score_

0.8911929904636795

In [34]:
# Distribution of the mean CV score over all candidates; a small spread suggests
# further tuning of this grid is unlikely to change much.
pd.Series(model.cv_results_['mean_test_score']).describe()

count    24.000000
mean      0.890123
std       0.000943
min       0.888429
25%       0.889552
50%       0.890375
75%       0.890890
max       0.891193
dtype: float64

## XGBoost

In [6]:
from xgboost import XGBClassifier

In [7]:
# XGBoost base estimator for the grid search; the seed is fixed for repeatable scores.
estimator = XGBClassifier(
    random_state=8,
    max_leaves=0,
    n_jobs=-1,
)

# Earlier XGBoost search grids, oldest first. They are kept as an explicit history
# instead of successive `params = ...` assignments, where every grid but the last
# was a dead store that was overwritten before anything read it.
xgb_search_rounds = [
    # Round 1: coarse sweep over tree count, depth, learning rate and regularisation.
    {
        'n_estimators': [100, 150, 200],
        'max_depth': [2, 4, 6],
        'learning_rate': [0.1, 0.2, 0.3],
        'reg_alpha': [0.001, 0.01, 0.05],
        'reg_lambda': [0.2, 0.3, 0.5]
    },
    # Round 2: more trees, higher learning rates, weaker regularisation.
    {
        'n_estimators': [170, 200, 300, 500],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.3, 0.5, 1],
        'reg_alpha': [5e-4, 0.001, 0.005],
        'reg_lambda': [0.1, 0.2, 0.25]
    },
]

# Same value `params` had after the original cell: the most recent grid.
params = xgb_search_rounds[-1]

In [20]:
# XGBoost grid used by the search below (3 x 1 x 2 x 4 x 1 = 24 candidates).
params = dict(
    n_estimators=[500, 700, 1000],
    max_depth=[3],
    learning_rate=[0.3, 0.4],
    reg_alpha=[0, 1e-4, 5e-4, 7e-4],
    reg_lambda=[0.2],
)

In [21]:
# Exhaustive 5-fold search over `params`, scored by ROC AUC; verbose=3 logs every fold.
model = GridSearchCV(
    estimator,
    params,
    scoring='roc_auc',
    cv=5,
    verbose=3,
)

In [22]:
%%time
%%chime
# Run the grid search on the scaled training data. %%time reports how long the cell took;
# %%chime presumably plays a sound when it finishes — confirm against the chime docs.
model.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END learning_rate=0.3, max_depth=3, n_estimators=500, reg_alpha=0, reg_lambda=0.2;, score=0.893 total time=   2.7s
[CV 2/5] END learning_rate=0.3, max_depth=3, n_estimators=500, reg_alpha=0, reg_lambda=0.2;, score=0.902 total time=   2.6s
[CV 3/5] END learning_rate=0.3, max_depth=3, n_estimators=500, reg_alpha=0, reg_lambda=0.2;, score=0.898 total time=   2.5s
[CV 4/5] END learning_rate=0.3, max_depth=3, n_estimators=500, reg_alpha=0, reg_lambda=0.2;, score=0.902 total time=   2.5s
[CV 5/5] END learning_rate=0.3, max_depth=3, n_estimators=500, reg_alpha=0, reg_lambda=0.2;, score=0.896 total time=   2.5s
[CV 1/5] END learning_rate=0.3, max_depth=3, n_estimators=500, reg_alpha=0.0001, reg_lambda=0.2;, score=0.893 total time=   2.6s
[CV 2/5] END learning_rate=0.3, max_depth=3, n_estimators=500, reg_alpha=0.0001, reg_lambda=0.2;, score=0.902 total time=   2.6s
[CV 3/5] END learning_rate=0.3, max_depth=3, n_estimators=50

[CV 5/5] END learning_rate=0.4, max_depth=3, n_estimators=500, reg_alpha=0, reg_lambda=0.2;, score=0.895 total time=   2.7s
[CV 1/5] END learning_rate=0.4, max_depth=3, n_estimators=500, reg_alpha=0.0001, reg_lambda=0.2;, score=0.892 total time=   2.6s
[CV 2/5] END learning_rate=0.4, max_depth=3, n_estimators=500, reg_alpha=0.0001, reg_lambda=0.2;, score=0.902 total time=   2.5s
[CV 3/5] END learning_rate=0.4, max_depth=3, n_estimators=500, reg_alpha=0.0001, reg_lambda=0.2;, score=0.898 total time=   2.6s
[CV 4/5] END learning_rate=0.4, max_depth=3, n_estimators=500, reg_alpha=0.0001, reg_lambda=0.2;, score=0.901 total time=   2.5s
[CV 5/5] END learning_rate=0.4, max_depth=3, n_estimators=500, reg_alpha=0.0001, reg_lambda=0.2;, score=0.895 total time=   2.5s
[CV 1/5] END learning_rate=0.4, max_depth=3, n_estimators=500, reg_alpha=0.0005, reg_lambda=0.2;, score=0.891 total time=   2.5s
[CV 2/5] END learning_rate=0.4, max_depth=3, n_estimators=500, reg_alpha=0.0005, reg_lambda=0.2;, scor

In [23]:
# Hyperparameter combination with the highest mean cross-validated score.
model.best_params_

{'learning_rate': 0.3,
 'max_depth': 3,
 'n_estimators': 700,
 'reg_alpha': 0.0005,
 'reg_lambda': 0.2}

In [24]:
# Mean cross-validated ROC AUC of the best combination.
model.best_score_

0.8983333718679651

In [25]:
# Distribution of the mean CV score over all candidates; a small spread suggests
# further tuning of this grid is unlikely to change much.
pd.Series(model.cv_results_['mean_test_score']).describe()

count    24.000000
mean      0.897284
std       0.000964
min       0.895196
25%       0.896817
50%       0.897530
75%       0.898086
max       0.898333
dtype: float64

## KNN

In [26]:
from sklearn.neighbors import KNeighborsClassifier

In [27]:
# k-nearest-neighbours base estimator; the Minkowski order `p` is tuned by the grid search.
estimator = KNeighborsClassifier(
    metric='minkowski',
    algorithm='auto',
    n_jobs=-1,
)

# First KNN grid tried; the following cell replaces it with a different grid before the search is run.
params = dict(
    n_neighbors=[60, 70, 80],
    weights=['distance', 'uniform'],
    p=[1, 2, 3],
)

In [37]:
# KNN grid used by the search below (4 x 2 x 2 = 16 candidates).
params = dict(
    n_neighbors=[30, 40, 60, 65],
    weights=['distance', 'uniform'],
    p=[1, 2],
)

In [38]:
# Exhaustive 5-fold search over `params`, scored by ROC AUC; verbose=3 logs every fold.
model = GridSearchCV(
    estimator,
    params,
    scoring='roc_auc',
    cv=5,
    verbose=3,
)

In [39]:
%%time
%%chime
# Run the grid search on the scaled training data. %%time reports how long the cell took;
# %%chime presumably plays a sound when it finishes — confirm against the chime docs.
model.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END n_neighbors=30, p=1, weights=distance;, score=0.830 total time=   5.4s
[CV 2/5] END n_neighbors=30, p=1, weights=distance;, score=0.839 total time=   5.6s
[CV 3/5] END n_neighbors=30, p=1, weights=distance;, score=0.833 total time=   5.6s
[CV 4/5] END n_neighbors=30, p=1, weights=distance;, score=0.842 total time=   5.5s
[CV 5/5] END n_neighbors=30, p=1, weights=distance;, score=0.826 total time=   5.2s
[CV 1/5] END n_neighbors=30, p=1, weights=uniform;, score=0.858 total time=   5.4s
[CV 2/5] END n_neighbors=30, p=1, weights=uniform;, score=0.866 total time=   5.6s
[CV 3/5] END n_neighbors=30, p=1, weights=uniform;, score=0.864 total time=   5.6s
[CV 4/5] END n_neighbors=30, p=1, weights=uniform;, score=0.867 total time=   5.4s
[CV 5/5] END n_neighbors=30, p=1, weights=uniform;, score=0.858 total time=   5.2s
[CV 1/5] END n_neighbors=30, p=2, weights=distance;, score=0.823 total time=   5.7s
[CV 2/5] END n_neigh

In [40]:
# Hyperparameter combination with the highest mean cross-validated score.
model.best_params_

{'n_neighbors': 60, 'p': 1, 'weights': 'uniform'}

In [41]:
# Mean cross-validated ROC AUC of the best combination.
model.best_score_

0.8636631176440875

In [42]:
# Distribution of the mean CV score over all candidates; a large spread means
# the hyperparameters still have a noticeable effect on performance.
pd.Series(model.cv_results_['mean_test_score']).describe()

count    16.000000
mean      0.845103
std       0.015050
min       0.826958
25%       0.832321
50%       0.844764
75%       0.857355
max       0.863663
dtype: float64

## AdaBoost

In [10]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Earlier AdaBoost search grids, oldest first. They are kept as an explicit history
# instead of successive `params = ...` assignments, where every grid but the last
# was a dead store that was overwritten before anything read it.
# Keys prefixed with `base_estimator__` target the decision tree wrapped by AdaBoost.
ada_search_rounds = [
    # Round 1: coarse sweep over booster size, learning rate and tree shape.
    {
        'n_estimators': [500, 700, 1000],
        'learning_rate': [1e-4, 5e-4, 0.001],
        'base_estimator__max_depth': [10, 20, 30],
        'base_estimator__min_samples_split': [5, 10, 20],
        'base_estimator__max_features': ['sqrt', None]
    },
    # Round 2: learning rate and max_features fixed; refine size, depth and split threshold.
    {
        'n_estimators': [600, 700, 750],
        'learning_rate': [5e-4],
        'base_estimator__max_depth': [25, 30, 40],
        'base_estimator__min_samples_split': [15, 20, 30],
        'base_estimator__max_features': ['sqrt']
    },
    # Round 3: depth fixed at 25; larger ensembles and larger min_samples_split values.
    {
        'n_estimators': [750, 900],
        'learning_rate': [5e-4],
        'base_estimator__max_depth': [25],
        'base_estimator__min_samples_split': [25, 30, 40, 50, 100],
        'base_estimator__max_features': ['sqrt']
    },
]

# Same value `params` had after the original cell: the most recent grid.
params = ada_search_rounds[-1]

In [23]:
# AdaBoost grid used by the search below: only the tree's min_samples_split varies (4 candidates).
params = dict(
    n_estimators=[900],
    learning_rate=[5e-4],
    base_estimator__max_depth=[25],
    base_estimator__min_samples_split=[80, 100, 150, 200],
    base_estimator__max_features=['sqrt'],
)

In [24]:
# 5-fold grid search over AdaBoost with a decision-tree base learner, scored by ROC AUC.
# Both the booster and the tree are seeded. Unlike the other searches in this notebook,
# parallelism is requested on the search itself (n_jobs=-1) rather than on the estimator.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator`;
# confirm the installed version before upgrading (the `base_estimator__*` grid keys
# would need renaming as well).
model = GridSearchCV(
    AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(random_state=8),
        random_state=8,
    ),
    params,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=3,
)

In [25]:
%%time
%%chime
# Run the grid search on the scaled training data. %%time reports how long the cell took;
# %%chime presumably plays a sound when it finishes — confirm against the chime docs.
model.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
CPU times: user 40.8 s, sys: 80 ms, total: 40.9 s
Wall time: 4min 26s
[CV 4/5] END base_estimator__max_depth=25, base_estimator__max_features=sqrt, base_estimator__min_samples_split=80, learning_rate=0.0005, n_estimators=900;, score=0.896 total time=  46.9s
[CV 5/5] END base_estimator__max_depth=25, base_estimator__max_features=sqrt, base_estimator__min_samples_split=80, learning_rate=0.0005, n_estimators=900;, score=0.886 total time=  47.1s
[CV 2/5] END base_estimator__max_depth=25, base_estimator__max_features=sqrt, base_estimator__min_samples_split=150, learning_rate=0.0005, n_estimators=900;, score=0.894 total time=  43.2s
[CV 4/5] END base_estimator__max_depth=25, base_estimator__max_features=sqrt, base_estimator__min_samples_split=150, learning_rate=0.0005, n_estimators=900;, score=0.896 total time=  43.5s
[CV 2/5] END base_estimator__max_depth=25, base_estimator__max_features=sqrt, base_estimator__min_samples_split=200,

In [26]:
# Hyperparameter combination with the highest mean cross-validated score.
model.best_params_

{'base_estimator__max_depth': 25,
 'base_estimator__max_features': 'sqrt',
 'base_estimator__min_samples_split': 150,
 'learning_rate': 0.0005,
 'n_estimators': 900}

In [27]:
# Mean cross-validated ROC AUC of the best combination.
model.best_score_

0.8900577986404123

In [28]:
# Distribution of the mean CV score over all candidates; a small spread suggests
# further tuning of this grid is unlikely to change much.
pd.Series(model.cv_results_['mean_test_score']).describe()

count    4.000000
mean     0.889818
std      0.000207
min      0.889575
25%      0.889698
50%      0.889818
75%      0.889938
max      0.890058
dtype: float64

# Prediction on the test set
## XGBoost
Best performer in cross-validation (best mean ROC AUC ≈ 0.898).

In [29]:
from xgboost import XGBClassifier

In [30]:
# Best XGBoost hyperparameters, copied by hand from the grid-search result above.
best_params_xgb = dict(
    learning_rate=0.3,
    max_depth=3,
    n_estimators=700,
    reg_alpha=0.0005,
    reg_lambda=0.2,
)

In [31]:
# Final XGBoost model: same fixed settings as the searched estimator plus the selected hyperparameters.
best_xgb = XGBClassifier(
    random_state=8,
    max_leaves=0,
    n_jobs=-1,
    **best_params_xgb,
)

In [32]:
# Fit the selected XGBoost configuration on the full scaled training set.
best_xgb.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.3, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=0,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=700, n_jobs=-1, num_parallel_tree=None,
              predictor=None, random_state=8, ...)

In [36]:
# Read the test data, using the `id` column as the index, then preview it.
test = pd.read_csv('./data/test.csv', index_col='id')
test

Unnamed: 0_level_0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
42100,3,0,1,4,0,0,1,111,2018,12,5,1,0,0,0,115.60,2
42101,2,0,0,3,0,0,0,22,2017,10,21,0,0,0,0,85.00,0
42102,2,2,0,1,0,0,3,18,2018,8,10,1,0,0,0,240.00,1
42103,2,0,0,3,0,0,0,88,2018,5,30,0,0,0,0,80.75,0
42104,1,0,0,2,2,0,0,7,2018,9,21,1,0,0,0,144.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70163,2,0,1,2,0,0,0,315,2018,9,30,1,0,0,0,160.00,0
70164,2,0,2,2,0,0,0,81,2018,3,25,0,0,0,0,65.00,1
70165,2,0,1,2,0,0,0,40,2018,10,22,0,0,0,0,85.00,0
70166,3,0,0,4,0,0,1,4,2018,9,6,1,0,0,0,162.75,0


In [37]:
# Scale the test features with the scaler fitted on the training data, keeping ids and column names.
X_test = pd.DataFrame(
    scaler.transform(test),
    index=test.index,
    columns=test.columns,
)
X_test

Unnamed: 0_level_0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
42100,2.056005,-0.313454,0.130259,1.122386,-0.407020,-0.160945,0.685890,0.087728,0.409661,1.557404,-1.226638,0.428551,-0.173408,-0.060506,-0.101479,0.297092,1.842849
42101,0.151040,-0.313454,-0.998814,0.421768,-0.407020,-0.160945,-0.515173,-1.010111,-2.441040,0.850531,0.573445,-1.149928,-0.173408,-0.060506,-0.101479,-0.526846,-0.737691
42102,0.151040,4.129777,-0.998814,-0.979466,-0.407020,-0.160945,3.088017,-1.059452,0.409661,0.143658,-0.664112,0.428551,-0.173408,-0.060506,-0.101479,3.646696,0.552579
42103,0.151040,-0.313454,-0.998814,0.421768,-0.407020,-0.160945,-0.515173,-0.195983,0.409661,-0.916652,1.585992,-1.149928,-0.173408,-0.060506,-0.101479,-0.641282,-0.737691
42104,-1.753925,-0.313454,-0.998814,-0.278849,2.996268,-0.160945,-0.515173,-1.195140,0.409661,0.497095,0.573445,0.428551,-0.173408,-0.060506,-0.101479,1.061793,-0.737691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70163,0.151040,-0.313454,0.130259,-0.278849,-0.407020,-0.160945,-0.515173,2.604122,0.409661,0.497095,1.585992,0.428551,-0.173408,-0.060506,-0.101479,1.492610,-0.737691
70164,0.151040,-0.313454,1.259332,-0.278849,-0.407020,-0.160945,-0.515173,-0.282330,0.409661,-1.623525,1.023466,-1.149928,-0.173408,-0.060506,-0.101479,-1.065367,0.552579
70165,0.151040,-0.313454,0.130259,-0.278849,-0.407020,-0.160945,-0.515173,-0.788076,0.409661,0.850531,0.685950,-1.149928,-0.173408,-0.060506,-0.101479,-0.526846,-0.737691
70166,2.056005,-0.313454,-0.998814,1.122386,-0.407020,-0.160945,0.685890,-1.232145,0.409661,0.497095,-1.114133,0.428551,-0.173408,-0.060506,-0.101479,1.566656,-0.737691


In [38]:
# Predicted class probabilities for the test set, one column per class;
# the submission built below uses column 1.
y_pred = best_xgb.predict_proba(X_test)
y_pred

array([[0.88840604, 0.11159399],
       [0.9344811 , 0.06551894],
       [0.70458484, 0.29541513],
       ...,
       [0.93591636, 0.06408364],
       [0.36941445, 0.63058555],
       [0.05540758, 0.9445924 ]], dtype=float32)

In [39]:
# Submission frame: test ids paired with the probability from column 1 of `y_pred`.
result = pd.DataFrame(dict(id=X_test.index, booking_status=y_pred[:, 1]))
result

Unnamed: 0,id,booking_status
0,42100,0.111594
1,42101,0.065519
2,42102,0.295415
3,42103,0.043288
4,42104,0.417605
...,...,...
28063,70163,0.943081
28064,70164,0.051289
28065,70165,0.064084
28066,70166,0.630586


In [40]:
# Write the XGBoost submission; index=False keeps the row index out of the file,
# so only the id and booking_status columns are saved.
result.to_csv('submit_xgboost.csv', index=False)

## AdaBoost

In [42]:
from sklearn.ensemble import AdaBoostClassifier

In [46]:
# Best decision-tree (base learner) settings, copied by hand from the AdaBoost grid-search result.
best_params_base = dict(
    max_depth=25,
    max_features='sqrt',
    min_samples_split=150,
)

In [47]:
# Best booster-level settings, copied by hand from the AdaBoost grid-search result.
best_params_ada = dict(
    learning_rate=0.0005,
    n_estimators=900,
)

In [48]:
# Final AdaBoost model: a seeded decision tree with the selected settings, boosted with the selected booster settings.
best_ada = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(
        random_state=8,
        **best_params_base,
    ),
    random_state=8,
    **best_params_ada,
)

In [49]:
# Fit the selected AdaBoost configuration on the full scaled training set.
best_ada.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=25,
                                                         max_features='sqrt',
                                                         min_samples_split=150,
                                                         random_state=8),
                   learning_rate=0.0005, n_estimators=900, random_state=8)

In [50]:
# Read the test data again (same file as in the XGBoost section), indexed by `id`, and preview it.
test = pd.read_csv('./data/test.csv', index_col='id')
test

Unnamed: 0_level_0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
42100,3,0,1,4,0,0,1,111,2018,12,5,1,0,0,0,115.60,2
42101,2,0,0,3,0,0,0,22,2017,10,21,0,0,0,0,85.00,0
42102,2,2,0,1,0,0,3,18,2018,8,10,1,0,0,0,240.00,1
42103,2,0,0,3,0,0,0,88,2018,5,30,0,0,0,0,80.75,0
42104,1,0,0,2,2,0,0,7,2018,9,21,1,0,0,0,144.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70163,2,0,1,2,0,0,0,315,2018,9,30,1,0,0,0,160.00,0
70164,2,0,2,2,0,0,0,81,2018,3,25,0,0,0,0,65.00,1
70165,2,0,1,2,0,0,0,40,2018,10,22,0,0,0,0,85.00,0
70166,3,0,0,4,0,0,1,4,2018,9,6,1,0,0,0,162.75,0


In [51]:
# Scale the test features with the scaler fitted on the training data, keeping ids and column names.
X_test = pd.DataFrame(
    scaler.transform(test),
    index=test.index,
    columns=test.columns,
)
X_test

Unnamed: 0_level_0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
42100,2.056005,-0.313454,0.130259,1.122386,-0.407020,-0.160945,0.685890,0.087728,0.409661,1.557404,-1.226638,0.428551,-0.173408,-0.060506,-0.101479,0.297092,1.842849
42101,0.151040,-0.313454,-0.998814,0.421768,-0.407020,-0.160945,-0.515173,-1.010111,-2.441040,0.850531,0.573445,-1.149928,-0.173408,-0.060506,-0.101479,-0.526846,-0.737691
42102,0.151040,4.129777,-0.998814,-0.979466,-0.407020,-0.160945,3.088017,-1.059452,0.409661,0.143658,-0.664112,0.428551,-0.173408,-0.060506,-0.101479,3.646696,0.552579
42103,0.151040,-0.313454,-0.998814,0.421768,-0.407020,-0.160945,-0.515173,-0.195983,0.409661,-0.916652,1.585992,-1.149928,-0.173408,-0.060506,-0.101479,-0.641282,-0.737691
42104,-1.753925,-0.313454,-0.998814,-0.278849,2.996268,-0.160945,-0.515173,-1.195140,0.409661,0.497095,0.573445,0.428551,-0.173408,-0.060506,-0.101479,1.061793,-0.737691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70163,0.151040,-0.313454,0.130259,-0.278849,-0.407020,-0.160945,-0.515173,2.604122,0.409661,0.497095,1.585992,0.428551,-0.173408,-0.060506,-0.101479,1.492610,-0.737691
70164,0.151040,-0.313454,1.259332,-0.278849,-0.407020,-0.160945,-0.515173,-0.282330,0.409661,-1.623525,1.023466,-1.149928,-0.173408,-0.060506,-0.101479,-1.065367,0.552579
70165,0.151040,-0.313454,0.130259,-0.278849,-0.407020,-0.160945,-0.515173,-0.788076,0.409661,0.850531,0.685950,-1.149928,-0.173408,-0.060506,-0.101479,-0.526846,-0.737691
70166,2.056005,-0.313454,-0.998814,1.122386,-0.407020,-0.160945,0.685890,-1.232145,0.409661,0.497095,-1.114133,0.428551,-0.173408,-0.060506,-0.101479,1.566656,-0.737691


In [52]:
# Predicted class probabilities for the test set, one column per class;
# the submission built below uses column 1.
y_pred = best_ada.predict_proba(X_test)
y_pred

array([[9.67210107e-01, 3.27898931e-02],
       [9.99828799e-01, 1.71201283e-04],
       [6.66383889e-01, 3.33616111e-01],
       ...,
       [8.77066156e-01, 1.22933844e-01],
       [4.16303283e-01, 5.83696717e-01],
       [1.86712336e-02, 9.81328766e-01]])

In [53]:
# Submission frame: test ids paired with the probability from column 1 of `y_pred`.
result = pd.DataFrame(dict(id=X_test.index, booking_status=y_pred[:, 1]))
result

Unnamed: 0,id,booking_status
0,42100,0.032790
1,42101,0.000171
2,42102,0.333616
3,42103,0.001428
4,42104,0.603244
...,...,...
28063,70163,0.999541
28064,70164,0.000415
28065,70165,0.122934
28066,70166,0.583697


In [54]:
# Write the AdaBoost submission; index=False keeps the row index out of the file,
# so only the id and booking_status columns are saved.
result.to_csv('submit_ada.csv', index=False)