In [1]:
import os
import numpy as np
import pandas as pd 
from pathlib import Path
from datetime import datetime


from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split, GridSearchCV 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

Read the processed data in the previous notebook

In [2]:
train_df = pd.read_csv('processed data\processed_train_Agebin.csv')
test_df = pd.read_csv('processed data\processed_test_Agebin.csv')
TARGET = 'Transported'

train_df.sample(5)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Num,group_id,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Side_P,Side_S
2470,0,1,0,490.0,313.0,0.0,6.0,0.0,False,421.0,1,1,0,0,0,0,1,0,1
2560,0,2,0,0.0,0.0,0.0,0.0,0.0,True,570.0,1,0,0,1,0,0,1,1,0
4387,0,0,0,0.0,0.0,0.0,0.0,0.0,False,757.0,7,1,0,0,0,0,1,1,0
8300,0,1,1,0.0,2191.0,863.0,207.0,6262.0,False,329.0,2,0,1,0,0,1,0,0,1
8129,0,2,0,51.0,6.0,87.0,21.0,404.0,False,1679.0,1,1,0,0,0,1,0,0,1


In [3]:
X = train_df.drop(TARGET, axis=1)
y = train_df[TARGET]
y

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [4]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train.shape

(6954, 18)

Now we train the base model with default parameter.

In [5]:
xgb_model = XGBClassifier()
model = xgb_model.fit(X_train, y_train, eval_metric='logloss')

print("Performance on train data:", model.score(X_train, y_train))



Performance on train data: 0.912568306010929


In [6]:
yp = model.predict_proba(X_valid)
yp2 = (yp[:,1] > 0.5) 
print("Performance on validation data:", f1_score(y_valid, yp2, average='micro'))

Performance on validation data: 0.787809085681426


In [7]:
y_pred_v = model.predict(X_valid)
print("Performance on validation data:", f1_score(y_valid, y_pred_v, average='micro'))

Performance on validation data: 0.787809085681426


This is the baseline, then we can strat fine tune the parameter

In [8]:
# A parameter grid for XGBoost
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic', silent=True, nthread=1)

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        
folds = 5
param_comb = 50

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X,y), verbose=2, random_state=42 )

# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X, y)
timer(start_time) # timing ends here for "start_time" variable

print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   16.3s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:  2.2min finished


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.



 Time taken: 0 hours 2 minutes and 13.31 seconds.

 All results:
{'mean_fit_time': array([1.79340229, 1.42359099, 1.90930243, 1.50796542, 1.88495479,
       1.80557404, 1.38729324, 1.64380817, 2.53921852, 2.59207125,
       2.11434894, 2.25954566, 2.44007998, 2.71354475, 2.52763338,
       2.41653543, 2.26833453, 1.93680758, 1.9306056 , 1.79499121,
       1.40683684, 3.0909265 , 2.14187679, 2.10695758, 2.686235  ,
       1.62945042, 1.52831116, 2.2507782 , 2.11794567, 1.89552374,
       2.34353638, 1.80751371, 1.56327925, 1.72019801, 1.62525468,
       2.17538099, 1.23070731, 2.84379215, 1.46468139, 2.1656065 ,
       2.26373405, 2.7340919 , 1.37292728, 1.47046113, 1.85643349,


In [9]:
# optimized_xgb = XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
#               colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0,
#               early_stopping_rounds=None, enable_categorical=False,
#               eval_metric=None, gamma=5, gpu_id=-1, grow_policy='depthwise',
#               importance_type=None, interaction_constraints='',
#               learning_rate=0.02, max_bin=256, max_cat_to_onehot=4,
#               max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=5, monotone_constraints='()', n_estimators=600,
#               n_jobs=1, nthread=1, num_parallel_tree=1, predictor='auto',
#               random_state=0, reg_alpha=0)
optimized_xgb = random_search.best_estimator_
optimized_model = optimized_xgb.fit(X_train, y_train, eval_metric='logloss')
print("Performance on train data:", optimized_model.score(X_train, y_train))

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Performance on train data: 0.8239861949956859


In [10]:
optimized_model = optimized_xgb.fit(X_train, y_train, eval_metric='logloss')

print("Performance on train data:", optimized_model.score(X_train, y_train))

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Performance on train data: 0.8239861949956859


In [11]:
import pickle
# open a file, where you ant to store the data
with open('xgboost_tuned.pkl', 'wb') as file:
    pickle.dump(optimized_model, file)

with open('xgboost_tuned.pkl', 'rb') as model:
    xgboost_loaded = pickle.load(model)

In [12]:
y_pred = xgboost_loaded.predict(test_df)

In [13]:
y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [14]:
#Create a  DataFrame with the passengers ids and our prediction
submission_df = pd.read_csv('sample_submission.csv')
submission_df["Transported"] = y_pred
submission_df["Transported"] = submission_df["Transported"].astype(bool)

In [15]:
submission_df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [16]:
submission_df.to_csv('submission.csv', index=False)