In [13]:
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
import pandas as pd
from time import time
import pprint
import joblib
import warnings
warnings.filterwarnings("ignore")

# Classifiers
from catboost import CatBoostClassifier

# Model selection
from sklearn.model_selection import StratifiedKFold, PredefinedSplit

# Metrics
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import make_scorer

# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer
from time import time

In [14]:
# Load the training feature table; the path is relative to this notebook's directory.
tr_df = pd.read_csv(filepath_or_buffer='../../../data/feature/cbasic_train.csv')

In [15]:
# Peek at the first five training rows.
tr_df.head(n=5)

Unnamed: 0,Age,Ethnicity,Educational_Level,Income,Meal_Type,Visted_Previously,Previous_Cancellations,Deposit_type,Required_Car_Parking,Use_Promotion,Room_Rate,month_Expected_checkin,dayofweek_Expected_checkin,stay,total_participants,Reservation_Status
0,-0.259916,0.010376,-0.001419,0.007453,0,0,0,0.022095,0,0,0.976918,7,2,1,4,1
1,0.328209,0.010376,0.006746,-0.012088,0,0,0,-0.069188,0,1,0.224803,7,2,1,6,1
2,-0.129221,-0.017315,-0.001419,0.007453,0,0,0,0.022095,0,1,-1.279426,7,3,4,6,1
3,-1.240124,0.014371,-0.003171,-0.01297,0,0,0,-0.069188,0,0,-0.709642,7,3,1,7,1
4,1.177723,0.010376,0.002616,0.012407,0,0,0,0.022095,1,0,1.52391,7,4,1,2,1


In [25]:
# Print the column names, non-null counts and dtypes of the training frame.
tr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63720 entries, 0 to 63719
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         63720 non-null  float64
 1   Ethnicity                   63720 non-null  float64
 2   Educational_Level           63720 non-null  float64
 3   Income                      63720 non-null  float64
 4   Meal_Type                   63720 non-null  int64  
 5   Visted_Previously           63720 non-null  int64  
 6   Previous_Cancellations      63720 non-null  int64  
 7   Deposit_type                63720 non-null  float64
 8   Required_Car_Parking        63720 non-null  int64  
 9   Use_Promotion               63720 non-null  int64  
 10  Room_Rate                   63720 non-null  float64
 11  month_Expected_checkin      63720 non-null  int64  
 12  dayofweek_Expected_checkin  63720 non-null  int64  
 13  stay                        637

In [36]:
# Positional indices (within the feature columns) of the integer-coded columns
# that are handed to CatBoost as categorical features:
#   4 Meal_Type, 5 Visted_Previously, 6 Previous_Cancellations,
#   8 Required_Car_Parking, 9 Use_Promotion
cat_inds = [4, 5, 6, 8, 9]


In [37]:
def report_perf(optimizer, X, y, title, callbacks=None):
    """
    Fit a hyper-parameter search, print a one-line summary and return the winner.

    Parameters
    ----------
    optimizer : a sklearn or a skopt search object; it must expose ``fit``,
        ``cv_results_``, ``best_score_``, ``best_index_`` and ``best_params_``
    X : the training set
    y : our target
    title : a string label for the experiment, used as prefix of the summary line
    callbacks : optional; when truthy it is forwarded to ``fit`` as ``callback=``

    Returns
    -------
    The ``best_params_`` of the fitted optimizer.
    """
    started_at = time()

    # Only pass the keyword when there is something to pass, so optimizers whose
    # ``fit`` does not understand ``callback`` still work when none are given.
    fit_kwargs = {'callback': callbacks} if callbacks else {}
    optimizer.fit(X, y, **fit_kwargs)

    results = optimizer.cv_results_
    # Spread of the CV score for the winning candidate.
    winner_std = pd.DataFrame(results)['std_test_score'].iloc[optimizer.best_index_]
    winner_score = optimizer.best_score_
    winner_params = optimizer.best_params_

    template = (title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
                + u"\u00B1" + " %.3f")
    elapsed = time() - started_at
    print(template % (elapsed, len(results['params']), winner_score, winner_std))
    print('Best parameters:')
    pprint.pprint(winner_params)
    print()
    return winner_params

In [38]:
# Read the validation table and shuffle its rows reproducibly.
val_df = pd.read_csv('../../../data/feature/cbasic_validation.csv')
val_df = val_df.sample(frac=1, random_state=43)

# The first half of the shuffled validation rows joins the training rows for
# the search; the remaining rows are sliced off later as a hold-out set.
cv_size = len(val_df) // 2
full_stack = [tr_df, val_df.iloc[:cv_size]]
full_df = pd.concat(full_stack)

# PredefinedSplit convention: -1 = never placed in a test fold, 0 = test fold 0.
# The order must mirror the row order of `full_df` (train rows, then validation rows).
ind_list = [-1] * len(tr_df) + [0] * cv_size
prd = PredefinedSplit(test_fold=ind_list)



In [39]:
# Model-selection metric: macro-averaged F1.
# (make_scorer(accuracy_score) is the alternative that was tried here.)
score = make_scorer(score_func=f1_score, average='macro')

# Shuffled, seeded 5-fold stratified splitter.
skf = StratifiedKFold(random_state=443, shuffle=True, n_splits=5)

In [40]:
# Base CatBoost estimator; the remaining hyper-parameters are supplied by the search.
clf = CatBoostClassifier(
    loss_function='MultiClass',
    task_type="GPU",
    thread_count=2,
    cat_features=cat_inds,  # positional indices of the categorical columns
    od_type='Iter',
    verbose=False,
)

In [41]:
# Hyper-parameter ranges explored by the Bayesian search.
# learning_rate and random_strength are sampled on a log scale.
search_spaces = {
    'iterations': Integer(low=10, high=1000),
    'depth': Integer(low=1, high=8),
    'learning_rate': Real(low=0.01, high=1.0, prior='log-uniform'),
    'random_strength': Real(low=1e-9, high=10, prior='log-uniform'),
    'bagging_temperature': Real(low=0.0, high=1.0),
    'border_count': Integer(low=1, high=255),
    'l2_leaf_reg': Integer(low=2, high=30),
}

In [49]:
# Bayesian hyper-parameter search over `search_spaces`, scored on the single
# predefined train/validation split.
# `optimizer_kwargs` and `refit` are not passed, so their defaults apply.
opt = BayesSearchCV(
    estimator=clf,
    search_spaces=search_spaces,
    n_iter=150,
    scoring=score,
    cv=prd,
    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
    return_train_score=False,
    random_state=443,
)

In [50]:

# Run the search on the stacked frame: every column but the last is a feature,
# the last column is the target.
#
# VerboseCallback's first argument is the total number of expected calls. It is
# taken from the search's own `n_iter`, so the progress log stays consistent for
# every iteration instead of being capped at a hard-coded count that disagrees
# with the search configuration.
# DeadlineStopper aborts the search once the time budget (in seconds) is used up.
best_params = report_perf(
    opt,
    full_df.iloc[:, :-1],
    full_df.iloc[:, -1],
    'CatBoost',
    callbacks=[
        VerboseCallback(opt.n_iter),
        DeadlineStopper(60 * 10),  # 10-minute budget
    ],
)

Iteration No: 1 started. Searching for the next optimal point.
Iteration No: 1 ended. Search finished for the next optimal point.
Time taken: 1.8030
Function value obtained: -0.2504
Current minimum: -0.2504
Iteration No: 2 started. Searching for the next optimal point.
Iteration No: 2 ended. Search finished for the next optimal point.
Time taken: 6.8201
Function value obtained: -0.2474
Current minimum: -0.2504
Iteration No: 3 started. Searching for the next optimal point.
Iteration No: 3 ended. Search finished for the next optimal point.
Time taken: 11.7003
Function value obtained: -0.2474
Current minimum: -0.2504
Iteration No: 4 started. Searching for the next optimal point.
Iteration No: 4 ended. Search finished for the next optimal point.
Time taken: 9.9100
Function value obtained: -0.2580
Current minimum: -0.2580
Iteration No: 5 started. Searching for the next optimal point.
Iteration No: 5 ended. Search finished for the next optimal point.
Time taken: 6.2002
Function value obtaine

In [52]:
# Display the search's `best_params_` (the winning hyper-parameter combination).
opt.best_params_

OrderedDict([('bagging_temperature', 0.6838302028968318),
             ('border_count', 1),
             ('depth', 3),
             ('iterations', 812),
             ('l2_leaf_reg', 25),
             ('learning_rate', 0.2576354989666216),
             ('random_strength', 1.8903478754276727e-05)])

In [53]:
# Keep a handle on the estimator exposed by the search as `best_estimator_`;
# it is the model used for the hold-out and test predictions further down.
b_model = opt.best_estimator_

In [54]:
# Hold-out set: the validation rows from `cv_size` onwards, i.e. the rows that
# were not stacked into `full_df` for the search. Last column is the target.
X_val, y_val = val_df.iloc[cv_size:, :-1], val_df.iloc[cv_size:, -1]

In [55]:
# Predict labels for the hold-out validation rows with the selected model.
y_val_pred = b_model.predict(X_val)

In [56]:
# Macro-averaged F1 on the hold-out rows (same metric as the search scorer).
print(f1_score(y_true=y_val, y_pred=y_val_pred, average='macro'))

0.32835575615013707


In [18]:
# Side-by-side table of true labels and predictions for the hold-out rows.
pred_df = pd.DataFrame().assign(true=y_val, prediction=y_val_pred)


In [26]:
# Peek at the first five true/predicted pairs.
pred_df.head(n=5)

Unnamed: 0,true,prediction
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [20]:
# Load the test feature table and peek at its first rows.
# NOTE(review): this file is named 'cbasi_test.csv' while the train/validation
# files use the 'cbasic_' prefix — confirm this is the intended file.
ts_df = pd.read_csv(filepath_or_buffer='../../../data/feature/cbasi_test.csv')
ts_df.head(n=5)


Unnamed: 0,Reservation-id,Age,Ethnicity,Educational_Level,Income,Meal_Type,Visted_Previously,Previous_Cancellations,Deposit_type,Required_Car_Parking,Use_Promotion,Room_Rate,month_Expected_checkin,dayofweek_Expected_checkin,stay,total_participants
0,62931593,0.524251,0.010376,-0.001419,0.012407,0.145374,0.010791,0.003794,0.022095,0.005952,0.004285,-0.50452,11,4,1,6
1,70586099,0.197515,0.010376,-0.001419,0.012407,0.622649,0.010791,0.003794,0.022095,-0.014993,-0.0131,0.794587,11,4,1,3
2,4230648,-1.044082,-0.00934,-0.001419,0.007453,-0.444474,0.010791,0.003794,0.022095,-0.014993,0.004285,-1.325009,4,4,3,4
3,25192322,1.373765,-0.017315,0.002616,0.012407,0.622649,0.010791,0.003794,0.022095,0.005952,0.004285,-1.552923,11,4,2,6
4,80931528,0.066821,0.014371,-0.003171,0.012407,-0.444474,0.010791,0.003794,-0.069188,-0.014993,-0.0131,-1.279426,11,4,2,4


In [21]:
# Predict on the test rows; the first column is skipped because it holds the
# reservation identifier (it is written to the submission separately).
preds = b_model.predict(ts_df.iloc[:, 1:])

In [22]:
# Submission table: the identifier column of the test frame plus the predicted status.
sub_df = pd.DataFrame().assign(**{
    'Reservation-id': ts_df.iloc[:, 0],
    'Reservation_status': preds,
})

In [23]:
# Write the submission file next to the notebook, without the index column.
sub_df.to_csv(path_or_buf='submission3.4.csv', index=False)

In [23]:
# Load the 'cb_' feature tables.
# NOTE: this rebinds `tr_df` and `val_df`, replacing the 'cbasic_' frames that
# were loaded under the same names earlier in the notebook.
tr_df = pd.read_csv(filepath_or_buffer='../../../data/feature/cb_train.csv')
val_df = pd.read_csv(filepath_or_buffer='../../../data/feature/cb_validation.csv')
print(f'train size : {tr_df.shape[0]}, validation size : {val_df.shape[0]}')


train size : 27499, validation size : 2749


In [12]:
# Stack train rows on top of the validation rows (row order matters for the
# predefined split built from the two frame lengths).
full_stack = [tr_df, val_df]
full_df = pd.concat(objs=full_stack, axis=0)

In [15]:
# Peek at five rows of an unseeded shuffle (rows differ between runs), then
# report the shape of the stacked frame.
full_df.sample(frac=1).head(n=5)
full_df.shape

Unnamed: 0,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Meal_Type,Visted_Previously,Previous_Cancellations,...,Booking_channel,Required_Car_Parking,Use_Promotion,Room_Rate,month_Expected_checkin,dayofweek_Expected_checkin,stay,booking_to_checkin,total_participants,Reservation_Status
24781,1,32,0,2,3,3,0,2,0,0,...,2,0,0,173,6,4,1,150,5,0
8244,0,54,1,2,1,2,2,0,0,0,...,0,0,0,134,6,3,1,214,2,0
13519,0,18,3,0,0,3,2,2,1,0,...,0,0,0,162,5,4,3,100,4,0
21455,0,49,3,0,3,2,2,0,0,0,...,0,0,0,213,2,0,1,33,4,2
13450,1,52,2,0,0,1,2,1,0,0,...,0,0,0,239,5,0,2,60,5,0


(30248, 21)

In [16]:
# Fold labels in stacking order: -1 for every train row, 0 for every validation row.
ind_list = [-1] * len(tr_df) + [0] * len(val_df)

In [17]:
# Single predefined split built from the fold labels in `ind_list`.
prd = PredefinedSplit(test_fold=ind_list)

In [28]:
# Half of the validation row count (true division, so the result may be fractional).
len(val_df) / 2

1374.5