In [10]:
#Import Packages
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

In [11]:
#Loading Data
df = pd.read_csv('../data/raw/2022_train.csv')
df_test = pd.read_csv("../data/raw/2022_test.csv")
df_test_backup = pd.read_csv("../data/raw/2022_test.csv")

In [12]:
df.head()

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,3799,80,24.3,7.8,3.0,6.4,45.7,0.1,0.3,22.6,...,2.9,72.1,2.2,2.0,3.8,3.2,1.1,0.2,1.6,1
1,3800,75,21.8,10.5,4.2,7.9,55.1,-0.3,-1.0,34.9,...,3.6,67.8,3.6,3.7,6.6,0.7,0.5,0.6,1.4,1
2,3801,85,19.1,4.5,1.9,4.5,42.8,0.4,1.2,34.3,...,0.6,75.7,0.6,1.8,2.4,0.8,0.4,0.2,0.6,1
3,3802,63,19.1,8.2,3.5,6.7,52.5,0.3,0.8,23.7,...,1.5,66.9,0.8,2.0,3.0,1.8,0.4,0.1,1.9,1
4,3803,63,17.8,3.7,1.7,3.4,50.8,0.5,1.4,13.7,...,0.5,54.0,2.4,2.7,4.9,0.4,0.4,0.6,0.7,1


In [13]:
# EDA using Profile Report
# ProfileReport(df, title="EDA of full training data")

### Feature Engineering
Improve distribution of certain features that have right tails using cube root. Log is not recommened as some values are 0

In [14]:
df['MIN2'] = df['MIN']**(1/3)
df['AST2'] = df['AST']**(1/3)
df['PTS2'] = df['PTS']**(1/3)
df['FGM2'] = df['FGM']**(1/3)
df['FGA2'] = df['FGA']**(1/3)
df['FTM2'] = df['FTM']**(1/3)
df['FTA2'] = df['FTA']**(1/3)
df['OREB2'] = df['OREB']**(1/3)
df['DREB2'] = df['DREB']**(1/3)
df['REB2'] = df['REB']**(1/3)
df['STL2'] = df['STL']**(1/3)
df['TOV2'] = df['TOV']**(1/3)

df2 = df.drop(['Id', 'MIN', 'AST', 'PTS', 'FGM', 'FGA', 'FTM', 'FTA', 'OREB', 'DREB', 'REB', 'STL', 'TOV'], axis = 1)

### Data Rebalancing

Due to the fact that the target (Y) has fewer 0s than 1s (1333 vs 6669) we can use several tools to rebalance the data.

1. Upsampling to fill in more data

2. Smote to generate synthetic data


In [15]:
# Divide into datasets by the output classes
df_over_5 = df2[df2['TARGET_5Yrs'] == 1]
df_under_5 = df2[df2['TARGET_5Yrs'] == 0]


# Upsampling
from sklearn.utils import resample
df_upsample = resample(df_under_5, replace=True, n_samples=6669, random_state=42)
df_upsample.shape
df_upsample.head()

Unnamed: 0,GP,FG%,3P Made,3PA,3P%,FT%,BLK,TARGET_5Yrs,MIN2,AST2,PTS2,FGM2,FGA2,FTM2,FTA2,OREB2,DREB2,REB2,STL2,TOV2
6670,32,45.7,-0.1,-0.2,14.4,63.8,0.1,0,1.743513,0.464159,1.03228,0.736806,1.03228,0.464159,0.584804,0.736806,0.843433,1.0,0.584804,0.464159
5096,73,34.6,0.6,2.1,20.5,75.6,0.4,0,2.973847,1.169607,2.250617,1.638643,2.30835,1.193483,1.320006,1.193483,1.613429,1.806969,1.0,1.21644
7808,64,44.1,0.2,0.7,11.0,55.0,0.2,0,2.008299,1.091393,1.338866,1.062659,1.375069,0.584804,0.736806,0.736806,0.793701,0.965489,0.669433,0.793701
6758,59,42.0,0.2,1.0,9.6,66.3,0.5,0,2.732394,1.169607,1.709976,1.238562,1.650964,1.03228,1.169607,1.062659,1.375069,1.587401,0.928318,1.0
6478,36,38.5,0.6,1.9,8.2,71.8,0.2,0,2.147229,0.584804,1.392477,1.0,1.375069,0.736806,0.793701,0.584804,1.0,1.03228,0.584804,0.736806


In [16]:
# Combine upsample with the over_5s
df_upsampled = pd.concat([df_over_5, df_upsample])
df_upsampled.shape

(13338, 20)

In [17]:
# Upsampling using SMOTE
from imblearn.over_sampling import SMOTE

# separate into y and X
y = df2.pop('TARGET_5Yrs')
X = df2

# use SMOTE
su = SMOTE(random_state=42)
X_smote, y_smote = su.fit_resample(X, y)

In [18]:
#X_upsamp.shape

In [19]:
#y_upsamp.shape

### Standard Scaling of the features X

In [20]:
# Also split X and y for the upsampled, non-SMOTE data

y_upsamp = df_upsampled.pop('TARGET_5Yrs')
X_upsamp = df_upsampled

In [21]:
# Scale the X values for the upsampled and SMOTE sampled data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_upsamp = scaler.fit_transform(X_upsamp)
X_smote = scaler.fit_transform(X_smote)

### Split into Training and validation datasets with 75% training data

Also will stratify y just in case

In [22]:
from sklearn.model_selection import train_test_split
Xt_upsamp, Xv_upsamp, yt_upsamp, yv_upsamp = train_test_split(X_upsamp, y_upsamp, test_size = 0.25, stratify=y_upsamp)

In [23]:
Xt_smote, Xv_smote, yt_smote, yv_smote = train_test_split(X_smote, y_smote, test_size = 0.25, stratify=y_smote)

### Try balanced datasets using Random Forest

1. Using non-smote upscaled data

In [24]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth = 6, random_state = 42)

In [25]:
# train with the upsampled data
rfc.fit(Xt_upsamp, yt_upsamp)

RandomForestClassifier(max_depth=6, random_state=42)

In [26]:
# predict using the upsampled validation
yv_pred = rfc.predict(Xv_upsamp)

In [27]:
# Print the AUROC score
rfc_upsamp = roc_auc_score(yv_upsamp, yv_pred)
print(rfc_upsamp)

0.7037335338687658


2. Using the smote upscaled data

In [28]:
rfc2 = RandomForestClassifier(max_depth = 6, random_state = 42)

# train
rfc2.fit(Xt_smote, yt_smote)

# predict
yv_pred2 = rfc2.predict(Xv_smote)

# print auroc
rfc_smote = roc_auc_score(yv_smote, yv_pred2)
print(rfc_smote)

0.7427217793851302


### Get prediction probabilities of the rfc using test data

In [29]:
# First complete the cleaning steps on the test data

df_test['MIN2'] = df_test['MIN']**(1/3)
df_test['AST2'] = df_test['AST']**(1/3)
df_test['PTS2'] = df_test['PTS']**(1/3)
df_test['FGM2'] = df_test['FGM']**(1/3)
df_test['FGA2'] = df_test['FGA']**(1/3)
df_test['FTM2'] = df_test['FTM']**(1/3)
df_test['FTA2'] = df_test['FTA']**(1/3)
df_test['OREB2'] = df_test['OREB']**(1/3)
df_test['DREB2'] = df_test['DREB']**(1/3)
df_test['REB2'] = df_test['REB']**(1/3)
df_test['STL2'] = df_test['STL']**(1/3)
df_test['TOV2'] = df_test['TOV']**(1/3)

df2_test = df_test.drop(['Id', 'MIN', 'AST', 'PTS', 'FGM', 'FGA', 'FTM', 'FTA', 'OREB', 'DREB', 'REB', 'STL', 'TOV'], axis = 1)

In [30]:
df2_test.shape

(3799, 19)

In [31]:
# Run the Standard Scaler on our test features
X_test = scaler.fit_transform(df2_test)

In [32]:
# Use the Random Forest Classifier trained with the SMOTE upscaled training set

y_test_predictions = rfc2.predict_proba(X_test)

probabilities = y_test_predictions[:,1]

# create a dataframe and import back the Ids into with each prediction probability

final = pd.DataFrame({'Id':df_test_backup.Id, 'TARGET_5Yrs':probabilities})

# save to CSV for upload to Kaggle without the index

final.to_csv("2022_timwang_week2try.csv", index = False)

In [33]:
final.head()

Unnamed: 0,Id,TARGET_5Yrs
0,0,0.302103
1,1,0.488752
2,2,0.652189
3,3,0.813496
4,4,0.296814


In [34]:
final.shape

(3799, 2)

### Other Models

Using SMOTE data, lets try different ML algorithms: Naive Bayes, KNN, SVM, XGBoost

In [35]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

# fit model/training
gnb.fit(Xt_smote, yt_smote)

# predict
yv_pred3 = gnb.predict(Xv_smote)

# print auroc
gnb_smote = roc_auc_score(yv_smote, yv_pred3)
print(gnb_smote)

0.657852422321291


In [36]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# instantiate model
knn = KNeighborsClassifier(n_neighbors=5)

# train/fit
knn.fit(Xt_smote, yt_smote)

# predict
yv_pred4 = knn.predict(Xv_smote)

# print auroc
knn_smote = roc_auc_score(yv_smote, yv_pred4)
print(knn_smote)

0.7273786249944256


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [37]:
# SVM
from sklearn.svm import SVC

# instantiate
svm = SVC(gamma='auto')

# train/fit
svm.fit(Xt_smote, yt_smote)

# predict
yv_pred5 = svm.predict(Xv_smote)

# print auroc
svm_smote = roc_auc_score(yv_smote, yv_pred5)
print(svm_smote)

0.7175280771183894


In [38]:
# XGBoost
from sklearn.ensemble import GradientBoostingClassifier

# instantiate
xgb = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.1, max_depth = 5, random_state=42)

# train/fit
xgb.fit(Xt_smote, yt_smote)

# predict
yv_pred6 = xgb.predict(Xv_smote)

# print auroc
xgb_smote = roc_auc_score(yv_smote, yv_pred6)
print(xgb_smote)

0.8959740426015516


### Retrain XGBoost Model with whole training data (train + validation)

In [39]:
# instatiate
xgb2 = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.1, max_depth = 5, random_state=42)

# train/fit
xgb2.fit(X_smote, y_smote)

GradientBoostingClassifier(max_depth=5, random_state=42)

In [40]:
# Use the XGB trained with the SMOTE upscaled training set

xgb_y_test_predictions = xgb2.predict_proba(X_test)

probabilities = xgb_y_test_predictions[:,1]

# create a dataframe and import back the Ids into with each prediction probability

xgb_draft = pd.DataFrame({'Id':df_test_backup.Id, 'TARGET_5Yrs':probabilities})

# save to CSV for upload to Kaggle without the index

xgb_draft.to_csv("2022_timwang_week3try.csv", index = False)

In [41]:
xgb_draft.head()

Unnamed: 0,Id,TARGET_5Yrs
0,0,0.01451
1,1,0.040764
2,2,0.185286
3,3,0.423286
4,4,0.055149


### Retrain XGBoost Model with original training data (train)

By using the whole training data, are we overfitting?

In [57]:
# Use the XGB trained with the SMOTE upscaled training set

xgb_train_y_test_predictions = xgb2.predict_proba(X_test)

probabilities = xgb_train_y_test_predictions[:,1]

# create a dataframe and import back the Ids into with each prediction probability

xgb_draft_2 = pd.DataFrame({'Id':df_test_backup.Id, 'TARGET_5Yrs':probabilities})

# save to CSV for upload to Kaggle without the index

xgb_draft_2.to_csv("2022_timwang_week3try2.csv", index = False)

Note: XGBoost with all training, kaggle score 0.62666
      XGBoost with no validation in training, kaggle score 0.6119
      
Most likely XGBoost has high AUROC but may have overfit. Low generalisation on kaggle

How to avoid overfitting with XGBoost
How do you avoid overfitting when building an XGBoost model? Here are some tips you can follow to avoid overfitting when building a XGBoost or gradient boosted tree model.

Use fewer trees. If you find that your XGBoost model is overfitting, one option you have is to reduce the number of trees that are used in your model. Models that are highly complex with many parameters tend to overfit more than models that are small and simple. By reducing the number of trees in your model, you can reduce the complexity of your model and reduce the likelihood of overfitting.

Use shallow trees. Another way to reduce the amount of complexity in a XGBoost model and prevent the model from overfitting is to limit the model to using shallow trees. This reduces the number of splits that are made in each tree, which reduces the complexity of the model.

Use a lower learning rate. If you reduce the learning rate in your XGBoost model, your model will also be less likely to overfit. This will act as a regularization technique that prevents your model from paying too much attention to an unimportant feature.
Reduce the number of features. Reducing the number of features that you model has access to is another great way to reduce complexity in a machine learning model. This is another viable option for preventing an XGboost model from overfitting.

New Model with some different hyperparameters

In [51]:
# XGBoost with lower tree depth (4), lower learning rate (0.05)
from sklearn.ensemble import GradientBoostingClassifier

# instantiate
xgb3 = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.05, max_depth = 4, random_state=42)

# train/fit
xgb3.fit(Xt_smote, yt_smote)

# predict
yv_pred7 = xgb3.predict(Xv_smote)

# print auroc
xgb_smote = roc_auc_score(yv_smote, yv_pred7)
print(xgb_smote)

0.8686823786321872


In [54]:
# XGBoost with lower tree depth (4), lower learning rate (0.01)
from sklearn.ensemble import GradientBoostingClassifier

# instantiate
xgb4 = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.01, max_depth = 4, random_state=42)

# train/fit
xgb4.fit(Xt_smote, yt_smote)

# predict
yv_pred8 = xgb4.predict(Xv_smote)

# print auroc
xgb_smote = roc_auc_score(yv_smote, yv_pred8)
print(xgb_smote)

0.7583281904770126


In [56]:
# Training the lower tree depth (4) with lower learning rate (0.01) on whole training set

# instatiate
xgb5 = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.01, max_depth = 4, random_state=42)

# train/fit
xgb5.fit(X_smote, y_smote)

GradientBoostingClassifier(learning_rate=0.01, max_depth=4, random_state=42)

In [58]:
# Use the XGB trained with the SMOTE upscaled training set

xgb_train_y_test_5 = xgb5.predict_proba(X_test)

probabilities5 = xgb_train_y_test_5[:,1]

# create a dataframe and import back the Ids into with each prediction probability

xgb_draft_5 = pd.DataFrame({'Id':df_test_backup.Id, 'TARGET_5Yrs':probabilities5})

# save to CSV for upload to Kaggle without the index

xgb_draft_5.to_csv('../data/external/2022_timwang_week3try5.csv', index = False)

### Save Model into the model folder

In [59]:
from joblib import dump
dump(xgb5, '../models/xgb5.joblib')

['../models/xgb5.joblib']