In [2]:
#Import Packages
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

### Data Cleaning Step for input into training and prediction

In [5]:
#Loading Data
df = pd.read_csv('../data/raw/2022_train.csv')
df_test = pd.read_csv("../data/raw/2022_test.csv")
df_test_backup = pd.read_csv("../data/raw/2022_test.csv")

# Improve distribution with skew/tails
df['MIN2'] = df['MIN']**(1/3)
df['AST2'] = df['AST']**(1/3)
df['PTS2'] = df['PTS']**(1/3)
df['FGM2'] = df['FGM']**(1/3)
df['FGA2'] = df['FGA']**(1/3)
df['FTM2'] = df['FTM']**(1/3)
df['FTA2'] = df['FTA']**(1/3)
df['OREB2'] = df['OREB']**(1/3)
df['DREB2'] = df['DREB']**(1/3)
df['REB2'] = df['REB']**(1/3)
df['STL2'] = df['STL']**(1/3)
df['TOV2'] = df['TOV']**(1/3)

df2 = df.drop(['Id', 'MIN', 'AST', 'PTS', 'FGM', 'FGA', 'FTM', 'FTA', 'OREB', 'DREB', 'REB', 'STL', 'TOV'], axis = 1)

# Upsampling using SMOTE
from imblearn.over_sampling import SMOTE

# separate into y and X
y = df2.pop('TARGET_5Yrs')
X = df2

# use SMOTE
su = SMOTE(random_state=42)
X_smote, y_smote = su.fit_resample(X, y)

# Scale the X values for the upsampled and SMOTE sampled data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_smote = scaler.fit_transform(X_smote)

from sklearn.model_selection import train_test_split
Xt_smote, Xv_smote, yt_smote, yv_smote = train_test_split(X_smote, y_smote, test_size = 0.25, stratify=y_smote)

# complete the cleaning steps on the test data
df_test['MIN2'] = df_test['MIN']**(1/3)
df_test['AST2'] = df_test['AST']**(1/3)
df_test['PTS2'] = df_test['PTS']**(1/3)
df_test['FGM2'] = df_test['FGM']**(1/3)
df_test['FGA2'] = df_test['FGA']**(1/3)
df_test['FTM2'] = df_test['FTM']**(1/3)
df_test['FTA2'] = df_test['FTA']**(1/3)
df_test['OREB2'] = df_test['OREB']**(1/3)
df_test['DREB2'] = df_test['DREB']**(1/3)
df_test['REB2'] = df_test['REB']**(1/3)
df_test['STL2'] = df_test['STL']**(1/3)
df_test['TOV2'] = df_test['TOV']**(1/3)

df2_test = df_test.drop(['Id', 'MIN', 'AST', 'PTS', 'FGM', 'FGA', 'FTM', 'FTA', 'OREB', 'DREB', 'REB', 'STL', 'TOV'], axis = 1)

# Run the Standard Scaler on our test features
X_test = scaler.fit_transform(df2_test)

### High AUROC XGBoost Model

In [8]:
# XGBoost
from sklearn.ensemble import GradientBoostingClassifier

# instantiate
xgb = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.1, max_depth = 5, random_state=42)

# train/fit
xgb.fit(Xt_smote, yt_smote)

# predict
yv_pred6 = xgb.predict(Xv_smote)

# print auroc
xgb_smote = roc_auc_score(yv_smote, yv_pred6)
print(xgb_smote)

0.8941729639683574


### Better XGBoost Model?
Perhaps a lower AUROC will generalise better

In [6]:
# XGBoost with lower tree depth (4), lower learning rate (0.01)
from sklearn.ensemble import GradientBoostingClassifier

# instantiate
xgb4 = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.01, max_depth = 4, random_state=42)

# train/fit
xgb4.fit(Xt_smote, yt_smote)

# predict
yv_pred8 = xgb4.predict(Xv_smote)

# print auroc
xgb_smote = roc_auc_score(yv_smote, yv_pred8)
print(xgb_smote)

0.7511244154046888


In [7]:
# Training the lower tree depth (4) with lower learning rate (0.01) on whole training set

# instatiate
xgb5 = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.01, max_depth = 4, random_state=42)

# train/fit
xgb5.fit(X_smote, y_smote)

# predict using scaled test data
xgb_train_y_test_5 = xgb5.predict_proba(X_test)
probabilities5 = xgb_train_y_test_5[:,1]

# create a dataframe and import back the Ids into with each prediction probability
xgb_draft_5 = pd.DataFrame({'Id':df_test_backup.Id, 'TARGET_5Yrs':probabilities5})

# save to CSV for upload to Kaggle without the index
xgb_draft_5.to_csv('../data/external/2022_timwang_week3try5.csv', index = False)

In [9]:
### Save Model into the model folder

In [None]:
from joblib import dump
dump(xgb5, '../models/xgb5.joblib')