In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import pickle
import matplotlib.pyplot as plt
import joblib
import shap

from functions.xgboost import allnba_predict

# Notebook-wide pandas display settings: show up to 50 columns and render
# floats with two decimal places.
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Load the pickled bundle: the training matrix, the feature names and the
# accompanying id data.
# NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
with open('data/allnba_2021.pickle', 'rb') as f:
    training_data, features, id_data = pickle.load(f)

# Every column except the last is a feature; the last column is the label,
# cast to int.
X, Y = training_data[:, :-1], training_data[:, -1].astype(int)

# m = number of rows (examples), n = number of columns (features).
m, n = X.shape

assert m == Y.shape[0], 'X and Y must have the same number of training examples'

# Share of rows whose label sums into the positive count.
positive_rate = Y.sum() / m

print('Number of Features: {}'.format(n))
print('Number of Training Examples: {}'.format(m))
print('Percentage of Positive Examples: {:.1%}'.format(positive_rate))

In [None]:
##### LOAD MODEL SNAPSHOT #####
# Selects which saved snapshot to load. 'kfold' snapshots hold a search result
# object; any other value holds the fitted model itself.
model_type = 'custom'

# NOTE(security): joblib.load and pickle.load can execute arbitrary code;
# only load snapshot files from a trusted source.
if model_type == 'kfold':
    grid_result = joblib.load('data/model_snapshot_{}.pickle'.format(model_type))
    # Later cells always use `best_model`; without this binding the 'kfold'
    # path fails with a NameError on first use.
    # Assumes grid_result exposes `best_estimator_` (e.g. a refit search
    # object) -- TODO confirm against the training notebook.
    best_model = grid_result.best_estimator_
else:
    best_model = joblib.load('data/model_snapshot_{}.pickle'.format(model_type))

# The data snapshot stores the train/test split; non-'kfold' snapshots also
# carry a validation split (three extra entries).
with open('data/data_snapshot_{}.pickle'.format(model_type), 'rb') as f:
    if model_type == 'kfold':
        [ind_train, ind_test, X_train, X_test, Y_train, Y_test] = pickle.load(f)
    else:
        [ind_train, ind_test, X_train, X_test, Y_train, Y_test, ind_val, X_val, Y_val] = pickle.load(f)

In [None]:
# Column 1 of predict_proba for every row of X (presumably the positive-class
# probability -- confirm against best_model.classes_).
positive_proba = best_model.predict_proba(X)[:, 1]

# allnba_predict combines the probabilities with the id data; see
# functions.xgboost for the exact contents of its two return values.
predictions, pred_df = allnba_predict(positive_proba, id_data, pos_field='selected_pos')

In [None]:
# How many of the most important features to keep.
top_x = 10

# Importances come straight from the loaded model. When working from a search
# result instead, read grid_result.best_estimator_.feature_importances_.
features_imp = best_model.feature_importances_

# Indices that order the importances ascending; the last `top_x` entries are
# therefore the largest ones.
sorted_idx = np.argsort(features_imp)
top_features = np.asarray(features)[sorted_idx[-top_x:]]

# Column names used when building the display table in later cells.
pos_field = 'selected_pos'
id_cols = ['team', 'last_name', 'first_name', pos_field, 'season']

# Build the SHAP tree explainer and evaluate it on the full feature matrix.
explainer = shap.TreeExplainer(best_model, feature_names=features)
shap_output = explainer(X)

In [None]:
# Lowest probability among the 22 highest-probability rows.
proba_cutoff = pred_df['proba'].nlargest(22).min()

# Keep a row if it is predicted positive or its probability reaches the cutoff.
is_candidate = (pred_df['y_pred'] == 1) | (pred_df['proba'] >= proba_cutoff)

# Id columns, then the top features, then the prediction columns.
display_cols = [*id_cols, *top_features, 'y_pred', 'proba']

# Sort descending by position field, then probability. reset_index() keeps the
# original pred_df index as an 'index' column, which the waterfall cell reads.
endgame = (
    pred_df.loc[is_candidate, display_cols]
    .sort_values(by=[pos_field, 'proba'], ascending=False)
    .reset_index()
)

# Cursor into `endgame` for the player-by-player cell that follows.
current_idx = 0
endgame

In [None]:
# Each run of this cell shows one row of `endgame`, then moves the cursor to
# the next row (wrapping to the start after the last one).
current_player_details = endgame.loc[current_idx]

# Header line: first name, last name, season, predicted label, probability.
header_fields = ['first_name', 'last_name', 'season', 'y_pred', 'proba']
print('{} {} - {}: {:.2f} ({:.2f})'.format(*(current_player_details[key] for key in header_fields)))

# The 'index' column is the row's original pred_df index (kept by
# reset_index()); it is assumed to match the row position in X that
# shap_output was computed on -- TODO confirm.
shap_row = current_player_details['index']
shap.plots.waterfall(shap_output[shap_row], max_display=15)

current_idx = (current_idx + 1) % len(endgame)