In [37]:
#Import Packages
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import sys
import xgboost as xgb

# add folder to system path
sys.path.insert(0, '/Users/timtamothy/Documents/GitHub/adv_dsi_lab_2/')

from src.models.performance import print_class_perf

In [31]:
# Load smote and pre-scaled data from the processed folder
from src.data.sets import load_sets

X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../data/processed/')

### Use detuned hyperparameters to train

In [49]:
xgb_detuned = xgb.XGBClassifier(
    max_depth = 3,
    learning_rate = 0.001,
    min_child_weight = 3,
    subsample = 0.5,
    colsample_bytree = 0.4
)

xgb_detuned.fit(X_train, y_train)

In [50]:
y_train_preds = xgb_detuned.predict(X_train)
y_val_preds = xgb_detuned.predict(X_val)

print_class_perf(y_preds = y_train_preds, y_actuals = y_train, set_name = "Training", average = "weighted")
print_class_perf(y_preds = y_val_preds, y_actuals = y_val, set_name = "Validation", average = "weighted")

Accuracy Training: 0.7062881135659302
F1 Training: 0.7057200257654124
Accuracy Validation: 0.7148425787106447
F1 Validation: 0.7142212797909334


In [51]:
xgb_detuned_roc = roc_auc_score(y_val, y_val_preds)
print(xgb_detuned_roc)

0.714828617010411


### Training Model for Kaggle Submission

In [62]:
# Load unsplit data
X_smote = np.load('../data/processed/X_smote.npy')
y_smote = np.load('../data/processed/y_smote.npy')

df_test_backup = pd.read_csv("../data/raw/2022_test.csv")

In [65]:
# Retrain model on whole dataset using new hyperparameters
# train/fit
xgb_detuned.fit(X_smote, y_smote)

# predict using scaled test data
xgb_detuned_preds= xgb_detuned.predict_proba(X_test)
probabilities_detuned = xgb_detuned_preds[:,1]

# create a dataframe and import back the Ids into with each prediction probability
df_xgb_detuned = pd.DataFrame({'Id':df_test_backup.Id, 'TARGET_5Yrs':probabilities_detuned})

# save to CSV for upload to Kaggle without the index
df_xgb_detuned.to_csv('../data/external/2022_timwang_week4_try2.csv', index = False)