In [1]:
import os
import numpy as np
import pandas as pd 
from pathlib import Path
from datetime import datetime


from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

Read the processed data produced by the previous notebook.

In [2]:
# Build the file locations with pathlib instead of a hard-coded backslash:
# '\p' is an invalid string escape (warns on recent Python versions) and a
# backslash separator only resolves on Windows.
DATA_DIR = Path('processed data')
train_df = pd.read_csv(DATA_DIR / 'processed_train.csv')
test_df = pd.read_csv(DATA_DIR / 'processed_test.csv')

# Name of the label column; the following cells split it off from the features.
TARGET = 'Transported'

train_df.sample(5)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Num,group_id,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Side_P,Side_S
2300,0,28.82793,0,0.0,27.0,0.0,3573.0,3643.0,False,85.0,1,0,1,0,0,0,1,1,0
4335,1,25.0,0,0.0,0.0,0.0,0.0,0.0,True,753.0,1,1,0,0,1,0,0,1,0
3099,1,28.0,0,0.0,0.0,0.0,0.0,0.0,True,546.0,1,1,0,0,0,1,0,1,0
2708,0,16.0,0,0.0,20.0,0.0,300.0,681.0,False,601.0,1,1,0,0,0,0,1,1,0
4823,0,4.0,0,0.0,0.0,0.0,0.0,0.0,True,1050.0,4,0,0,1,0,0,1,1,0


In [3]:
# Features are every column except the label; the label is kept as its own Series.
y = train_df[TARGET]
X = train_df.drop(columns=[TARGET])
y

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [4]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape

(6954, 18)

Now we train the base model with default parameters.

In [5]:
# Baseline classifier with default hyper-parameters.
# `eval_metric` is set on the estimator rather than passed to `fit()`:
# the `fit(eval_metric=...)` argument is deprecated in XGBoost 1.6 and is
# no longer accepted by XGBoost 2.x.
xgb_model = XGBClassifier(eval_metric='logloss')
model = xgb_model.fit(X_train, y_train)

# `score` is the estimator's built-in metric evaluated on the rows it was fitted on.
print("Performance on train data:", model.score(X_train, y_train))

Performance on train data: 0.9263733103249928




In [6]:
# Probabilities from the baseline model on the hold-out rows; `yp` is reused
# by the threshold-search cells further down.
yp = model.predict_proba(X_valid)
positive_proba = yp[:, 1]
# Turn the second probability column into hard labels with a 0.5 cut-off.
yp2 = positive_proba > 0.5
valid_micro_f1 = f1_score(y_valid, yp2, average='micro')
print("Performance on validation data:", valid_micro_f1)

Performance on validation data: 0.7906843013225993


In [7]:
# Same measurement as the probability-based cell, but using the labels that
# `predict` returns directly.
y_pred_v = model.predict(X_valid)
micro_f1_from_predict = f1_score(y_valid, y_pred_v, average='micro')
print("Performance on validation data:", micro_f1_from_predict)

Performance on validation data: 0.7906843013225993


In [9]:
# Re-check of the 0.5-threshold micro-F1 computed straight from `yp`.
above_half = yp[:, 1] > 0.5
print(f1_score(y_valid, above_half, average='micro'))

0.7906843013225993


This is the baseline; next we can start fine-tuning the parameters.

In [10]:
# Tuned configuration. The evaluation metric is configured here on the
# estimator (it was previously `eval_metric=None` in the constructor and then
# overridden through `fit(eval_metric=...)`, an argument that is deprecated in
# XGBoost 1.6 and not accepted by XGBoost 2.x).
# NOTE(review): `gpu_id` and `predictor` are, as far as I know, superseded by
# `device` in XGBoost 2.x - confirm against the installed version before
# dropping them.
optimized_xgb = XGBClassifier(
    base_score=0.5, booster='gbtree', callbacks=None,
    colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0,
    early_stopping_rounds=None, enable_categorical=False,
    eval_metric='logloss', gamma=5, gpu_id=-1, grow_policy='depthwise',
    importance_type=None, interaction_constraints='',
    learning_rate=0.02, max_bin=256, max_cat_to_onehot=4,
    max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=5,
    monotone_constraints='()', n_estimators=600,
    n_jobs=1, nthread=1, num_parallel_tree=1, predictor='auto',
    random_state=0, reg_alpha=0,
)

optimized_model = optimized_xgb.fit(X_train, y_train)
print("Performance on train data:", optimized_model.score(X_train, y_train))



Performance on train data: 0.8212539545585275


In [11]:
# Hold-out labels predicted by the tuned model (this rebinds `y_pred_v`).
y_pred_v = optimized_model.predict(X_valid)

tuned_micro_f1 = f1_score(y_valid, y_pred_v, average='micro')
print("Performance on validation data:", tuned_micro_f1)

Performance on validation data: 0.787809085681426


In [16]:
from scipy.optimize import minimize


def f1_opt_valid(x):
    """Return the negated micro-F1 on the validation split at cut-off ``x``.

    The score is negated because ``minimize`` looks for a minimum. The
    probabilities come from ``yp``, i.e. the baseline ``model``.
    """
    predicted_positive = yp[:, 1] > x
    return -f1_score(y_valid, predicted_positive, average='micro')


# Start the simplex search from the conventional 0.5 cut-off.
result_valid = minimize(f1_opt_valid, x0=np.array([0.5]), method='Nelder-Mead')
best_threshold_valid = result_valid.x.item()

best_threshold_valid

0.5281250000000001

In [17]:
# Probabilities from the baseline `model` on the training rows; later cells
# threshold column 1 of `tp` (here `tp` is "train probabilities").
tp = model.predict_proba(X_train)

In [18]:
def f1_opt_train(x):
    """Return the negated micro-F1 on the training split at cut-off ``x``.

    Mirrors the validation objective, but thresholds ``tp`` against
    ``y_train``. Negated so that ``minimize`` maximises the score.
    """
    predicted_positive = tp[:, 1] > x
    return -f1_score(y_train, predicted_positive, average='micro')


# Same search set-up as for the validation split: Nelder-Mead starting at 0.5.
result_train = minimize(f1_opt_train, x0=np.array([0.5]), method='Nelder-Mead')
best_threshold_train = result_train.x.item()

best_threshold_train

0.5128906249999999

In [19]:
# F1 on the validation split at the searched cut-off.
# NOTE(review): no `average` is passed here, so this is scikit-learn's default
# (binary) F1, whereas the cut-off was searched with average='micro' - the
# number is therefore not directly comparable with the micro-F1 printed by
# the baseline cells.
valid_labels_at_best = yp[:, 1] > best_threshold_valid
best_score_v = f1_score(y_valid, valid_labels_at_best)
best_score_v

0.7935222672064777

In [20]:
# F1 on the training split at the cut-off searched on the training split.
# NOTE(review): as no `average` is passed, this is scikit-learn's default
# (binary) F1 rather than the micro-F1 used as the search objective.
train_labels_at_best = tp[:, 1] > best_threshold_train
best_score_t = f1_score(y_train, train_labels_at_best)
best_score_t

0.9285307517084282

In [21]:
import pickle

# Persist the tuned classifier to disk so it can be reloaded without refitting.
with open('xgboost_tuned2.pkl', 'wb') as pkl_out:
    pickle.dump(optimized_model, pkl_out)



In [None]:
# Reload the tuned classifier written by the pickling cell.
# The file handle is deliberately NOT named `model`: that name holds the fitted
# baseline classifier used by other cells, and binding the handle to it would
# replace the classifier with a closed file object.
# pickle.load can execute arbitrary code - only load files this notebook wrote.
with open('xgboost_tuned2.pkl', 'rb') as model_file:
    xgboost_loaded = pickle.load(model_file)

In [29]:
# Score the test rows with the reloaded tuned model.
y_pred = xgboost_loaded.predict_proba(test_df)
# Threshold the probabilities at the searched cut-off. The previous code
# compared against `best_score_v`, which is an F1 *score*, not a probability
# threshold, so far too few rows were labelled True.
# NOTE(review): `best_threshold_valid` was searched on the baseline model's
# probabilities (`yp`), not on this tuned model's - consider re-running the
# search on the tuned model's validation probabilities.
final_y_pred = y_pred[:, 1] > best_threshold_valid

In [30]:
# Display the boolean test-set predictions as a quick visual check.
final_y_pred

array([False, False,  True, ...,  True, False, False])

In [31]:
#Create a  DataFrame with the passengers ids and our prediction
submission_df = pd.read_csv('sample_submission.csv')
submission_df["Transported"] = final_y_pred
submission_df.to_csv('submission3.csv', index=False)

In [32]:
# Preview the first rows of the frame that was written to the submission file.
submission_df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
