In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [83]:
pd.set_option('display.max_rows', 100)

In [84]:
df = pd.read_csv('hotel_bookings.csv')

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [85]:
def value_counts_csv(df):
    assert isinstance(df, pd.DataFrame)
    
    if not os.path.exists('value_counts_csv'):
        os.makedirs('value_counts_csv')
    
    for col in df.columns:
        value_counts = pd.DataFrame(df[col].value_counts(dropna = False, ascending = False))
        value_counts['Percentage'] = value_counts.iloc[:,0]/sum(value_counts.iloc[:,0])
        
        value_counts.reset_index(drop = False, inplace = True)
        value_counts.rename(columns = {'index': col, col: 'Count'}, inplace = True)
        
        value_counts.to_csv('./value_counts_csv/'+col+'.csv', index = False)

In [86]:
def count_nulls(df):
    assert isinstance(df, pd.DataFrame)
    
    if not os.path.exists('./value_counts_csv/Nulls_folder'):
        os.makedirs('./value_counts_csv/Nulls_folder')
    
    nulls = pd.DataFrame(df.isna().sum())
    nulls.reset_index(drop = False, inplace = True)
    nulls.rename(columns = {'index': 'Column_Name', nulls.columns[1]:'Count'}, inplace = True)
    
    nulls['Percentage'] = nulls['Count']/df.shape[0]
        
    nulls.to_csv('./value_counts_csv/Nulls_folder/Nulls.csv', index = False)

In [87]:
def hist_boxplot(df):
    assert isinstance(df, pd.DataFrame)
    
    if not os.path.exists('plots/distribution'):
        os.makedirs('plots/distribution')
        
    for col in df.columns:
        if df[col].dtype == np.object:
            plt.figure(col)
            plt.hist(df[col].dropna())
            plt.savefig('plots/distribution/' + col)
            plt.close(col)
        else:
            fig, ax = plt.subplots(1,2, figsize=(20,10))
            plt.sca(ax[0])
            plt.hist(df[col].dropna())
                
            plt.sca(ax[1])
            df.boxplot(column=col)
                
            fig.savefig('plots/distribution/' + col)
            plt.close(fig)
            

In [None]:
value_counts_csv(df)
count_nulls(df)
hist_boxplot(df)

In [None]:
pip3 install xgboost

In [None]:
import xgboost

In [None]:
import xgboost

In [None]:
python setup.py install--user

In [None]:
conda install -c conda-forge xgboost

In [88]:
df_model = df.copy()
df_model = df_model.select_dtypes(exclude=['object'])
df_model = pd.get_dummies(df_model)

In [33]:
df_model.shape
#df.shape

(119390, 20)

In [89]:
X = df_model.drop('is_canceled', axis = 1)
y = df_model['is_canceled']

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)

In [37]:
y.value_counts()

0    75166
1    44224
Name: is_canceled, dtype: int64

In [101]:
#Import libraries:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

In [50]:
D_train = xgb.DMatrix(X_train, label=Y_train)
D_test = xgb.DMatrix(X_test, label=Y_test)

param = {
    'eta': 0.3, 
    'max_depth': 3,  
    'objective': 'multi:softprob',  
    'num_class': 2} 

steps = 20  # The number of training iterations

model = xgb.train(param, D_train, steps)

preds = model.predict(D_test)
print(preds)

best_preds = np.asarray([np.argmax(line) for line in preds])
print(best_preds)

print("Precision = {}".format(precision_score(Y_test, best_preds, average='macro')))
print("Recall = {}".format(recall_score(Y_test, best_preds, average='macro')))
print("Accuracy = {}".format(accuracy_score(Y_test, best_preds)))

[[0.30378234 0.69621766]
 [0.8485303  0.15146971]
 [0.63967097 0.360329  ]
 ...
 [0.8928962  0.10710382]
 [0.6548712  0.34512874]
 [0.7434501  0.25654984]]
[1 0 0 ... 0 0 0]
Precision = 0.7651682925149316
Recall = 0.7372718628401349
Accuracy = 0.7740179244492839


In [91]:
xgb_clf = xgb.XGBClassifier()

parameters = {
     "eta"    : [0.20, 0.30] ,
     "max_depth"        : [ 3, 6],
     "gamma"            : [ 0.1, 0.2]
     }

grid = GridSearchCV(xgb_clf,
                    parameters,
                    scoring="neg_log_loss",
                    cv=3)

model = grid.fit(X_train, Y_train)
#predictions = model.predict(X_test)

#model.dump_model('dump.raw.txt')

In [94]:
preds = model.predict(X_test)
print(preds)

[1 0 0 ... 0 0 0]


In [93]:
model.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.2, gamma=0.1,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [96]:
model.best_params_
model.best_estimator_
cvres = model.cv_results_

for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

-0.448651451896503 {'eta': 0.2, 'gamma': 0.1, 'max_depth': 3}
-0.4030927727304741 {'eta': 0.2, 'gamma': 0.1, 'max_depth': 6}
-0.44865160945180954 {'eta': 0.2, 'gamma': 0.2, 'max_depth': 3}
-0.4037234858139123 {'eta': 0.2, 'gamma': 0.2, 'max_depth': 6}
-0.448651451896503 {'eta': 0.3, 'gamma': 0.1, 'max_depth': 3}
-0.4030927727304741 {'eta': 0.3, 'gamma': 0.1, 'max_depth': 6}
-0.44865160945180954 {'eta': 0.3, 'gamma': 0.2, 'max_depth': 3}
-0.4037234858139123 {'eta': 0.3, 'gamma': 0.2, 'max_depth': 6}


In [None]:
xgb_model = xgb.XGBClassifier()

#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [1337]}


clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
                   cv=StratifiedKFold(train['QuoteConversion_Flag'], n_folds=5, shuffle=True), 
                   scoring='roc_auc',
                   verbose=2, refit=True)

clf.fit(train[features], train["QuoteConversion_Flag"])

#trust your CV!
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

test_probs = clf.predict_proba(test[features])[:,1]

sample = pd.read_csv('../input/sample_submission.csv')
sample.QuoteConversion_Flag = test_probs
sample.to_csv("xgboost_best_parameter_submission.csv", index=False)

In [142]:
def classifier_gridCV(X_train, y_train, clf, 
                      X_test = None, y_test = None, cv = 3, scoring = 'accuracy', params = {}, model_name = "model"):
    
    grid = GridSearchCV(clf, params, cv = cv, scoring = scoring, refit = True)
    
    model = grid.fit(X_train, y_train)
    print("The best parameters of grid are: ", model.best_params_, 
          "\nThe best estimator is: ", model.best_estimator_)
    
    if not os.path.exists('Models/CV_results'):
        os.makedirs('Models/CV_results')
    
    cvres = model.cv_results_
    
    if params != {}:    
        dataframe = pd.DataFrame(cvres["params"])
        dataframe.insert(0, "mean_test_score", cvres["mean_test_score"])
                        
    else: 
        dataframe = pd.DataFrame({"mean_test_score":cvres["mean_test_score"]})
            
    dataframe.to_csv("./Models/CV_results/CV_results_"+model_name+".csv", index = False) 
    
    if X_test is not None:
        results = model.predict(X_test)
    
        if y_test is not None:
            print("Precision = {}".format(precision_score(y_test, results, average='macro')))
            print("Recall = {}".format(recall_score(y_test, results, average='macro')))
            print("Accuracy = {}".format(accuracy_score(y_test, results)))
            
    return(model)
    

In [143]:
classifier_gridCV(X_train, Y_train, xgb.XGBClassifier(), X_test, Y_test, 
                  model_name = "xg_boost_2")

The best parameters of grid are:  {} 
The best estimator is:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
Precision = 0.7863874380590321
Recall = 0.7546231566520929
Accuracy = 0.7921098919507497


GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None, param_grid={},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)