In [22]:
import os
import numpy as np
import pandas as pd 
from pathlib import Path
from datetime import datetime


from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

Read the processed data in the previous notebook

In [23]:
train_df = pd.read_csv('processed data\processed_train.csv')
test_df = pd.read_csv('processed data\processed_test.csv')
TARGET = 'Transported'

train_df.sample(5)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Num,group_id,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Side_P,Side_S
2263,1,1,0,0.0,0.0,0.0,0.0,0.0,True,94.0,1,0,1,0,0,0,1,0,1
3727,0,2,0,0.0,0.0,970.0,0.0,0.0,True,758.0,1,1,0,0,0,0,1,0,1
1781,0,1,0,4.0,841.0,0.0,363.0,1107.0,True,600.367671,2,0,1,0,0,0,1,0,1
1210,0,0,0,0.0,0.0,0.0,0.0,304.854791,False,195.0,1,1,0,0,0,0,1,0,1
5561,0,1,0,0.0,109.0,0.0,4810.0,25.0,False,201.0,2,0,1,0,0,0,1,1,0


In [24]:
X = train_df.drop(TARGET, axis=1)
y = train_df[TARGET]
y

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [25]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train.shape

(6954, 18)

Now we train the base model with default parameter.

In [26]:
xgb_model = XGBClassifier()
model = xgb_model.fit(X_train, y_train, eval_metric='logloss')

print("Performance on train data:", model.score(X_train, y_train))

Performance on train data: 0.912568306010929




In [27]:
yp = model.predict_proba(X_valid)
yp2 = (yp[:,1] > 0.5) 
print("Performance on validation data:", f1_score(y_valid, yp2, average='micro'))

Performance on validation data: 0.787809085681426


In [28]:
y_pred_v = model.predict(X_valid)
print("Performance on validation data:", f1_score(y_valid, y_pred_v, average='micro'))

Performance on validation data: 0.787809085681426


In [29]:
print(f1_score(y_valid, yp[:,1] > 0.5, average='micro'))

0.787809085681426


This is the baseline, then we can strat fine tune the parameter

In [30]:
optimized_xgb = XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=5, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.02, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
               monotone_constraints='()', n_estimators=600,
              n_jobs=1, nthread=1, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0)

optimized_model = optimized_xgb.fit(X_train, y_train, eval_metric='logloss')
print("Performance on train data:", optimized_model.score(X_train, y_train))

Performance on train data: 0.822260569456428


In [31]:
y_pred_v = optimized_model.predict(X_valid)

print("Performance on validation data:", f1_score(y_valid, y_pred_v, average='micro'))

Performance on validation data: 0.7935595169637721


In [32]:
from scipy.optimize import minimize

def f1_opt_valid(x):
    return -f1_score(y_valid, yp[:,1] > x, average='micro')

result_valid = minimize(f1_opt_valid, x0 = np.array([0.5]), method='Nelder-Mead')
best_threshold_valid = result_valid['x'].item()

best_threshold_valid

0.484375

In [33]:
tp = model.predict_proba(X_train)

In [34]:
def f1_opt_train(x):
    return -f1_score(y_train, tp[:,1] > x, average='micro')

result_train = minimize(f1_opt_train, x0 = np.array([0.5]), method='Nelder-Mead')
best_threshold_train = result_train['x'].item()

best_threshold_train

0.525

In [35]:
best_score_v = f1_score(y_valid, yp[:,1] > best_threshold_valid)
best_score_v

0.7977839335180055

In [36]:
best_score_t = f1_score(y_train, tp[:,1] > best_threshold_train)
best_score_t

0.9143918822352437

In [37]:
import pickle
# open a file, where you ant to store the data
with open('xgboost_tuned2.pkl', 'wb') as file:
    pickle.dump(optimized_model, file)



In [38]:
with open('xgboost_tuned2.pkl', 'rb') as model:
    xgboost_loaded = pickle.load(model)

In [39]:
y_pred = xgboost_loaded.predict_proba(test_df)
final_y_pred = (y_pred[:,1] > best_threshold_train) 

In [40]:
final_y_pred

array([ True, False,  True, ...,  True,  True,  True])

In [41]:
#Create a  DataFrame with the passengers ids and our prediction
submission_df = pd.read_csv('sample_submission.csv')
submission_df["Transported"] = final_y_pred
submission_df.to_csv('submission5_agebin.csv', index=False)

In [42]:
submission_df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
