In [20]:
#Import Packages
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

### Data Cleaning Step for input into training and prediction

In [87]:
#Loading Data
df = pd.read_csv('data/raw/2022_train.csv')
df_test = pd.read_csv("data/raw/2022_test.csv")
df_test_backup = pd.read_csv("data/raw/2022_test.csv")

# Improve distribution with skew/tails
df['MIN2'] = df['MIN']**(1/3)
df['AST2'] = df['AST']**(1/3)
df['PTS2'] = df['PTS']**(1/3)
df['FGM2'] = df['FGM']**(1/3)
df['FGA2'] = df['FGA']**(1/3)
df['FTM2'] = df['FTM']**(1/3)
df['FTA2'] = df['FTA']**(1/3)
df['OREB2'] = df['OREB']**(1/3)
df['DREB2'] = df['DREB']**(1/3)
df['REB2'] = df['REB']**(1/3)
df['STL2'] = df['STL']**(1/3)
df['TOV2'] = df['TOV']**(1/3)

df2 = df.drop(['Id', 'MIN', 'AST', 'PTS', 'FGM', 'FGA', 'FTM', 'FTA', 'OREB', 'DREB', 'REB', 'STL', 'TOV'], axis = 1)

# Upsampling using SMOTE
from imblearn.over_sampling import SMOTE

# separate into y and X
y = df2.pop('TARGET_5Yrs')
X = df2

# use SMOTE
su = SMOTE(random_state=42)
X_smote, y_smote = su.fit_resample(X, y)

# Scale the X values for the upsampled and SMOTE sampled data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_smote = scaler.fit_transform(X_smote)

from sklearn.model_selection import train_test_split
Xt_smote, Xv_smote, yt_smote, yv_smote = train_test_split(X_smote, y_smote, test_size = 0.25, stratify=y_smote)

# complete the cleaning steps on the test data
df_test['MIN2'] = df_test['MIN']**(1/3)
df_test['AST2'] = df_test['AST']**(1/3)
df_test['PTS2'] = df_test['PTS']**(1/3)
df_test['FGM2'] = df_test['FGM']**(1/3)
df_test['FGA2'] = df_test['FGA']**(1/3)
df_test['FTM2'] = df_test['FTM']**(1/3)
df_test['FTA2'] = df_test['FTA']**(1/3)
df_test['OREB2'] = df_test['OREB']**(1/3)
df_test['DREB2'] = df_test['DREB']**(1/3)
df_test['REB2'] = df_test['REB']**(1/3)
df_test['STL2'] = df_test['STL']**(1/3)
df_test['TOV2'] = df_test['TOV']**(1/3)

df2_test = df_test.drop(['Id', 'MIN', 'AST', 'PTS', 'FGM', 'FGA', 'FTM', 'FTA', 'OREB', 'DREB', 'REB', 'STL', 'TOV'], axis = 1)

# Run the Standard Scaler on our test features
X_test = scaler.fit_transform(df2_test)

### Save your datat into the processed folder

In [90]:
#np.save('data/processed/Xt_smote', Xt_smote)
#np.save('data/processed/Xv_smote', Xv_smote)
#np.save('data/processed/yt_smote', yt_smote)
#np.save('data/processed/yv_smote', yv_smote)
#np.save('data/processed/X_test', X_test)

from src.data.sets import save_sets

save_sets(X_train = Xt_smote, y_train = yt_smote, X_val = Xv_smote, y_val = yv_smote, X_test = X_test)

np.save('data/processed/X_smote', X_smote)
np.save('data/processed/y_smote', y_smote)

In [89]:
print(yt_smote)

3645     1
9693     0
10279    0
1078     1
779      0
        ..
6921     1
778      0
3884     1
6340     1
11243    0
Name: TARGET_5Yrs, Length: 10003, dtype: int64


### High AUROC XGBoost Model

In [23]:
# XGBoost
from sklearn.ensemble import GradientBoostingClassifier

# instantiate
xgb = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.1, max_depth = 5, random_state=42)

# train/fit
xgb.fit(Xt_smote, yt_smote)

# predict
yv_pred6 = xgb.predict(Xv_smote)

# print auroc
xgb_smote = roc_auc_score(yv_smote, yv_pred6)
print(xgb_smote)

0.8914763809827962


### Better XGBoost Model?
Perhaps a lower AUROC will generalise better

In [24]:
# XGBoost with lower tree depth (4), lower learning rate (0.01)
from sklearn.ensemble import GradientBoostingClassifier

# instantiate
xgb4 = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.01, max_depth = 4, random_state=42)

# train/fit
xgb4.fit(Xt_smote, yt_smote)

# predict
yv_pred8 = xgb4.predict(Xv_smote)

# print auroc
xgb_smote = roc_auc_score(yv_smote, yv_pred8)
print(xgb_smote)

0.7544244748172668


In [26]:
# Training the lower tree depth (4) with lower learning rate (0.01) on whole training set

# instatiate
xgb5 = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.01, max_depth = 4, random_state=42)

# train/fit
xgb5.fit(X_smote, y_smote)

# predict using scaled test data
xgb_train_y_test_5 = xgb5.predict_proba(X_test)
probabilities5 = xgb_train_y_test_5[:,1]

# create a dataframe and import back the Ids into with each prediction probability
xgb_draft_5 = pd.DataFrame({'Id':df_test_backup.Id, 'TARGET_5Yrs':probabilities5})

# save to CSV for upload to Kaggle without the index
xgb_draft_5.to_csv('data/external/2022_timwang_week3try5.csv', index = False)

In [27]:
### Save Model into the model folder

In [28]:
from joblib import dump
dump(xgb5, 'models/xgb5.joblib')

['models/xgb5.joblib']

In [29]:
# Even more punishment for the XGB
from sklearn.ensemble import GradientBoostingClassifier

# instantiate
xgb6 = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.01, max_depth = 4, random_state=42, min_child_weight = 2, subsample = 0.5)

# train/fit
xg64.fit(Xt_smote, yt_smote)

# predict
yv_pred9 = xgb6.predict(Xv_smote)

# print auroc
xgb_smote = roc_auc_score(yv_smote, yv_pred9)
print(xgb_smote)

TypeError: GradientBoostingClassifier.__init__() got an unexpected keyword argument 'min_child_weight'

In [30]:
# Using xgboost from xgboost package
import xgboost as xgb

xgb_original = xgb.XGBClassifier()

xgb_original.fit(Xt_smote, yt_smote)

In [31]:
# Save the original model
from joblib import dump

dump(xgb_original, 'models/xgb_original.joblib')

['models/xgb_originaljoblib']

In [32]:
# Predict for training and validation sets
y_train_preds = xgb_original.predict(Xt_smote)
y_val_preds = xgb_original.predict(Xv_smote)

In [33]:
# Display accuracy and f1 scores for base xgb
from src.models.performance import print_class_perf

print_class_perf(y_preds = y_train_preds, y_actuals = yt_smote, set_name = "Training", average = "weighted")
print_class_perf(y_preds = y_val_preds, y_actuals = yv_smote, set_name = "Validation", average = "weighted")

Accuracy Training: 0.9800059982005398
F1 Training: 0.979998279752876
Accuracy Validation: 0.8893553223388306
F1 Validation: 0.8890007206535906


# Import Hyperopt and perform some Hyperparameter tuning

In [34]:
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin

In [60]:
# Define search/grid space for XGB hyperparameters
space = {
    'max_depth' : hp.choice('max_depth', range(5, 20, 1)),
    'learning_rate' : hp.quniform('learning_rate', 0.001, 0.01, 0.1),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 10, .1),
    'subsample' : hp.quniform('subsample', 0.1, 1, 0.05),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.05)
}

In [61]:
# Objective function
def objective(space):
    from sklearn.model_selection import cross_val_score
    
    xgboost = xgb.XGBClassifier(
        max_depth = int(space['max_depth']),
        learning_rate = space['learning_rate'],
        min_child_weight = space['min_child_weight'],
        subsample = space['subsample'],
        colsample_bytree = space['colsample_bytree']
    )
    
    acc = cross_val_score(xgboost, Xt_smote, yt_smote, cv=10, scoring="accuracy").mean()

    return{'loss': 1-acc, 'status': STATUS_OK }

In [62]:
best = fmin(
    fn=objective,   
    space=space,       
    algo=tpe.suggest,       
    max_evals=5
)

100%|█████████████| 5/5 [01:12<00:00, 14.41s/trial, best loss: 0.5000499500499501]


In [63]:
print("Best: ", best)

Best:  {'colsample_bytree': 0.4, 'learning_rate': 0.0, 'max_depth': 1, 'min_child_weight': 9.8, 'subsample': 0.5}


# Use best set to train

In [75]:
xgb_best = xgb.XGBClassifier(
    max_depth = 4,
    learning_rate = 0.01,
    min_child_weight = 2,
    subsample = 0.5,
    colsample_bytree = 0.4
)

In [76]:
xgb_best.fit(Xt_smote, yt_smote)

In [77]:
y_train_preds_best = xgb_best.predict(Xt_smote)
y_val_preds_best = xgb_best.predict(Xv_smote)

print_class_perf(y_preds = y_train_preds_best, y_actuals = yt_smote, set_name = "Training", average = "weighted")
print_class_perf(y_preds = y_val_preds_best, y_actuals = yv_smote, set_name = "Validation", average = "weighted")

Accuracy Training: 0.7813655903229031
F1 Training: 0.7813595682095444
Accuracy Validation: 0.7718140929535232
F1 Validation: 0.771812615777882


In [78]:
xgb_better_roc = roc_auc_score(yv_smote, y_val_preds_best)
print(xgb_better_roc)

0.7718148816279908


In [79]:
dump(xgb_best, 'models/xgb_best.joblib')

['models/xgb_best.joblib']

In [80]:
# Retrain model on whole dataset using new hyperparameters
# train/fit
xgb_best.fit(X_smote, y_smote)

# predict using scaled test data
xgb_train_y_test_best= xgb_best.predict_proba(X_test)
probabilities_best = xgb_train_y_test_best[:,1]

# create a dataframe and import back the Ids into with each prediction probability
df_xgb_best = pd.DataFrame({'Id':df_test_backup.Id, 'TARGET_5Yrs':probabilities_best})

# save to CSV for upload to Kaggle without the index
df_xgb_best.to_csv('data/external/2022_timwang_week4_try1.csv', index = False)