In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import export_graphviz
from basketball_reference_scraper.shot_charts import get_shot_chart
from basketball_reference_scraper.players import get_stats, get_game_logs
from pydotplus import graph_from_dot_data
from IPython.display import Image

In [2]:
%matplotlib inline

In [3]:
def time_to_secs(time_remaining):
    """Convert a ``"minutes:seconds"`` clock string into seconds.

    Only the first two colon-separated fields are read. Both are parsed with
    ``float``, so fractional values such as ``"4:07.5"`` are accepted.

    Parameters
    ----------
    time_remaining : str
        Clock string containing at least one ``':'``.

    Returns
    -------
    float
        ``minutes * 60 + seconds``.
    """
    fields = time_remaining.split(':')
    minutes = float(fields[0])
    seconds = float(fields[1])
    return minutes * 60 + seconds

def annot_max(x, y, ax=None):
    """Annotate the point where ``y`` is largest with a boxed label and arrow.

    Parameters
    ----------
    x : indexable
        Horizontal coordinates; indexed with ``np.argmax(y)``.
    y : numpy.ndarray
        Vertical coordinates; must provide ``.max()``.
    ax : matplotlib axes, optional
        Axes to annotate. When falsy (e.g. ``None``) the current pyplot axes
        are used.
    """
    peak_index = np.argmax(y)
    xmax, ymax = x[peak_index], y.max()
    label = "lr={:.3f}, probability={:.3f}%".format(xmax, ymax*100)

    # Fall back to the current axes only when no (truthy) axes was supplied.
    target_ax = ax or plt.gca()

    target_ax.annotate(
        label,
        xy=(xmax, ymax),          # arrow tip, in data coordinates
        xytext=(0.94, 0.96),      # label position, as a fraction of the axes
        xycoords='data',
        textcoords="axes fraction",
        arrowprops={"arrowstyle": "->",
                    "connectionstyle": "angle,angleA=0,angleB=60"},
        bbox={"boxstyle": "square,pad=0.3", "fc": "w", "ec": "k", "lw": 0.72},
        ha="right",
        va="top",
    )
    
def _shots_to_array(player_shots):
    """Convert one player's shot-chart rows into an ``(n_shots, 7)`` float array."""
    n_shots = len(player_shots)
    shots = np.zeros((n_shots, 7))
    if n_shots == 0:
        return shots

    # Coordinates and distance are strings with a ' ft' suffix to strip off.
    shots[:, 0] = [float(v.strip(' ft')) for v in player_shots['x']]
    shots[:, 1] = [float(v.strip(' ft')) for v in player_shots['y']]
    shots[:, 2] = [float(v.strip(' ft')) for v in player_shots['DISTANCE']]
    shots[:, 3] = [time_to_secs(t) for t in player_shots['TIME_REMAINING']]

    quarter = player_shots['QUARTER'].to_numpy(dtype=float)
    shots[:, 4] = quarter % 2 == 0  # 2nd or 4th quarter "end of half"
    shots[:, 5] = quarter % 4 == 0  # 4th quarter "end of regulation"
    shots[:, 6] = (player_shots['MAKE_MISS'] == 'MAKE').to_numpy()
    return shots


def extract_features(player, start, end, split=0.75, seed=None):
    """Build shuffled train/test shot features for one player.

    Game logs are fetched with ``get_game_logs`` (``playoffs=False``), then the
    shot chart of every listed game is fetched with ``get_shot_chart`` and
    filtered to rows whose ``PLAYER`` equals ``player``.

    Each shot becomes a row of seven values:
    ``x``, ``y``, ``DISTANCE`` (with the ``' ft'`` suffix stripped), seconds
    from ``TIME_REMAINING``, a flag for even-numbered quarters, a flag for
    quarters divisible by four, and the label (1 for ``'MAKE'``, else 0).

    Parameters
    ----------
    player : str
        Player name, matched exactly against the shot chart's ``PLAYER`` column.
    start, end : str
        Date bounds forwarded unchanged to ``get_game_logs``.
    split : float, default 0.75
        Fraction of the *shots* placed in the training set.
    seed : int or None, default None
        When given, the shuffle uses ``np.random.default_rng(seed)`` so the
        split is reproducible. When ``None`` the global NumPy random state is
        used, as before.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
        Feature matrices have six columns; label vectors are one-dimensional.
    """
    game_log = get_game_logs(player, start, end, playoffs=False, ask_matches=False)

    gp_dates = list(game_log['DATE'].astype(str))
    gp_teams = list(game_log['TEAM'])
    gp_opponents = list(game_log['OPPONENT'])

    gp = len(gp_dates)

    print(f'Found shot data for {gp} games played by {player}.')

    # Collect per-game arrays and concatenate once, instead of re-copying the
    # growing aggregate on every game. The empty seed array keeps the
    # concatenate valid when there are no games at all.
    per_game_shots = [np.zeros((0, 7))]

    for idx, (date, team, opponent) in enumerate(zip(gp_dates, gp_teams, gp_opponents)):
        chart = get_shot_chart(date, team, opponent)[team]
        per_game_shots.append(_shots_to_array(chart.loc[chart['PLAYER'] == player]))

        if (idx % 100 == 0) and idx > 0:
            print(f'Data retrieved for {idx} games')

    # Printed after the loop so it is never skipped by the progress message.
    print(f'All shot data retrieved for {gp} games')

    shots_aggregate = np.concatenate(per_game_shots, axis=0)

    if seed is None:
        np.random.shuffle(shots_aggregate)
    else:
        np.random.default_rng(seed).shuffle(shots_aggregate)

    # The rows being split are shots, so the cut point must be computed from
    # the number of shots rather than the number of games played.
    train_range = math.floor(split * len(shots_aggregate))

    X_train = shots_aggregate[:train_range, :-1]
    y_train = shots_aggregate[:train_range, -1]

    X_test = shots_aggregate[train_range:, :-1]
    y_test = shots_aggregate[train_range:, -1]

    return X_train, y_train, X_test, y_test

def percentile_analysis(clf, X_test, percentile):
    """Return the rows of ``X_test`` whose predicted class-1 probability exceeds a threshold.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict_proba``; column 1 of its output is
        read as the probability of interest.
    X_test : array-like of shape (n_samples, n_features)
        Feature rows to score.
    percentile : float
        Probability threshold; a row is kept when its probability is strictly
        greater than this value.

    Returns
    -------
    numpy.ndarray
        Float array of the selected rows, in their original order, with the
        same number of columns as ``X_test``.
    """
    X_test = np.asarray(X_test, dtype=float)

    if X_test.shape[0] == 0:
        # Nothing to score; keep the historical 6-column shape when the input
        # does not carry a column count of its own.
        n_features = X_test.shape[1] if X_test.ndim == 2 else 6
        return np.zeros((0, n_features))

    # Score all rows in one call instead of one predict_proba call per row.
    class_one_probability = np.asarray(clf.predict_proba(X_test))[:, 1]
    return X_test[class_one_probability > percentile]

def plot_percentile_shooting(player, high_chance_buckets, percentile):
    """Scatter high-probability shots on top of a half-court image.

    Parameters
    ----------
    player : str
        Name shown in the plot title.
    high_chance_buckets : numpy.ndarray of shape (n_shots, n_features)
        Shot rows; column 0 and column 1 are used as plot coordinates and the
        last column is read as a flag (non-zero = highlighted).
    percentile : float
        Threshold used to select the shots; shown in the title as
        ``100 * percentile``.

    Notes
    -----
    Reads ``nbahalfcourt.png`` from the current working directory and sets the
    global ``figure.dpi`` / ``savefig.dpi`` rcParams to 150.
    """
    plt.rcParams['figure.dpi'] = 150
    plt.rcParams['savefig.dpi'] = 150

    xlim = 50 # width of NBA court
    ylim = 47 # half the length of NBA court

    court_img = np.flipud(plt.imread('nbahalfcourt.png'))

    # Create the figure BEFORE drawing. Calling plt.figure() after the scatter
    # and imshow opens a second, empty figure and leaves the requested
    # size/dpi unused by the actual chart.
    fig, ax = plt.subplots(figsize=(12, 8), dpi=200, facecolor='w', edgecolor='k')

    # 0 -> blue (flag not set), 1 -> orange (flag set, i.e. 4th-quarter shots).
    colormap = ListedColormap(["blue", "orange"])
    color_indices = (high_chance_buckets[:, -1] != 0).astype(float)

    ax.imshow(court_img, zorder=0, extent=[0, xlim, 0, ylim])
    # vmin/vmax pin the colour scale so the mapping stays 0->blue, 1->orange
    # even when every shot carries the same flag value.
    ax.scatter(1 + high_chance_buckets[:, 0], 1 + high_chance_buckets[:, 1], zorder=1,
               c=color_indices, cmap=colormap, vmin=0, vmax=1)

    ax.set_title(f'{100*percentile}-percentile shooting by {player}')
    ax.set_xlim(0, xlim)
    ax.set_ylim(0, ylim)

    plt.show()
    
def fit_clt(X_train, y_train, X_test, y_test, plot=False):
    """Fit gradient-boosting classifiers over several learning rates and return the best.

    One ``GradientBoostingClassifier`` with 100 trees and ``random_state=23``
    is fitted for each learning rate in 0.01, 0.02, ..., 0.09. Each model is
    scored with ``accuracy_score`` on ``(X_test, y_test)`` and a progress line
    is printed per model.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Data used to score each candidate model.
    plot : bool, default False
        When true, draw learning rate versus accuracy and annotate the maximum.

    Returns
    -------
    GradientBoostingClassifier
        The fitted model with the highest accuracy; ties keep the earliest
        (smallest) learning rate.
    """
    learning_rates = np.array(range(5, 50, 5))/500
    # Defined locally so the function does not rely on a notebook global
    # left behind by some other cell.
    n_estimators = 100
    rand_state = 23

    clf_main = None
    clf_accuracy = 0.0
    accuracies = []

    for lr in learning_rates:
        clf = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=lr, random_state=rand_state)
        clf.fit(X_train, y_train)
        score = accuracy_score(y_test, clf.predict(X_test))

        # `clf_main is None` guarantees a model is returned even if every
        # candidate scores 0.0.
        if clf_main is None or score > clf_accuracy:
            clf_main = clf
            clf_accuracy = score
            print(f'Accuracy: {score*100:.2f}%, learning rate = {lr}, n_trees = {n_estimators} [New peak]')
        else:
            print(f'Accuracy: {score*100:.2f}%, learning rate = {lr}, n_trees = {n_estimators}')

        accuracies.append(score)

    if plot:
        # Plot learning rate vs. test accuracy
        accuracies = np.array(accuracies)
        fig, ax = plt.subplots()
        ax.plot(learning_rates, accuracies)
        ax.set_xlabel('lr')
        ax.set_ylabel('accuracy')
        ax.set_title('lr versus accuracy')
        annot_max(learning_rates, accuracies, ax=ax)
        # Frame the observed scores; a fixed window can hide the whole curve
        # when a player's accuracy falls outside it.
        margin = 0.05
        ax.set_ylim(max(0.0, accuracies.min() - margin), min(1.0, accuracies.max() + margin))

    # Return the best model, not whichever happened to be fitted last.
    return clf_main

In [None]:
# Fetch LeBron James' shots for the date window below and split them into
# train/test arrays (split=0.75 is the training fraction passed through).
player, start, end = 'LeBron James', '2003-10-29', '2021-11-06'

X_train, y_train, X_test, y_test = extract_features(
    player,
    start,
    end,
    split=0.75,
)

In [None]:
# Learning Rate Analysis (LeBron James)
#
# One gradient-boosting model is fitted per learning rate. The former outer
# loop over tree counts never passed its value to the classifier (every model
# used 100 trees), so with a fixed random_state it only repeated identical
# fits and printed a misleading `n_trees`. The tree count is now a single
# explicit constant. Note that models are compared on the test split.

learning_rates = np.array(range(5, 50, 5))/500
n_estimators = 100
rand_state = 23

clf_lebron = None
clf_accuracy = 0.0
score_rows = []  # one [score, lr, n_estimators] row per fitted model

for lr in learning_rates:
    clf = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=lr, random_state=rand_state)
    clf.fit(X_train, y_train)
    score = accuracy_score(y_test, clf.predict(X_test))

    if score > clf_accuracy:
        clf_lebron = clf
        clf_accuracy = score
        print(f'Accuracy: {score*100:.2f}%, learning rate = {lr}, n_trees = {n_estimators} [New peak]')
    else:
        print(f'Accuracy: {score*100:.2f}%, learning rate = {lr}, n_trees = {n_estimators}')

    score_rows.append([score, lr, n_estimators])

plt_scores = np.array(score_rows).reshape(-1, 3)

# Plot learning rate vs. test accuracy
fig, ax = plt.subplots()
ax.set_xlabel('lr')
ax.set_ylabel('accuracy')
ax.set_title('lr versus accuracy')
annot_max(plt_scores[:,1], plt_scores[:,0], ax=ax)
ax.set_ylim(0.6,.7)
ax.plot(plt_scores[:,1], plt_scores[:,0]);

In [None]:
# Select the test shots whose predicted probability exceeds the threshold,
# then draw them on the half-court chart.
percentile = 0.7

high_chance_buckets = percentile_analysis(clf_lebron, X_test, percentile)
plot_percentile_shooting(
    player="LeBron James",
    high_chance_buckets=high_chance_buckets,
    percentile=percentile,
)

In [None]:
# Draw a single tree from the fitted LeBron ensemble.
# Rendering requires graphviz to be installed on the system.
# `graph_from_dot_data` and `Image` come from the import cell at the top of
# the notebook, so they are not re-imported here.
TREE_INDEX = 10  # row of `estimators_` to draw; must be smaller than the number of fitted trees
sub_tree = clf_lebron.estimators_[TREE_INDEX, 0]

dot_data = export_graphviz(
    sub_tree,
    out_file=None, filled=True, rounded=True,
    special_characters=True,
    feature_names=['X-coord (ft)','Y-coord (ft)','distance to basket (ft)','time remaining (sec)','half?','4th?'],
    proportion=True, impurity=False, # enable them if you want
)
graph = graph_from_dot_data(dot_data)
Image(graph.create_png())

In [None]:
# Fetch Kevin Durant's shots for the same date window and split them into
# train/test arrays (split=0.75 is the training fraction passed through).
player, start, end = 'Kevin Durant', '2003-10-29', '2021-11-06'

X_train2, y_train2, X_test2, y_test2 = extract_features(
    player,
    start,
    end,
    split=0.75,
)

In [None]:
# Learning Rate Analysis (Kevin Durant)
#
# One gradient-boosting model is fitted per learning rate. `n_estimators` is
# defined here so the cell runs on a fresh kernel and the printed tree count
# matches the value the classifier is actually built with.

learning_rates = np.array(range(5, 50, 5))/500
n_estimators = 100
rand_state = 23

clf_kd = None
clf_accuracy = 0.0
score_rows = []  # one [score, lr, n_estimators] row per fitted model

for lr in learning_rates:
    clf = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=lr, random_state=rand_state)
    clf.fit(X_train2, y_train2)
    score = accuracy_score(y_test2, clf.predict(X_test2))

    if score > clf_accuracy:
        clf_kd = clf
        clf_accuracy = score
        print(f'Accuracy: {score*100:.2f}%, learning rate = {lr}, n_trees = {n_estimators} [New peak]')
    else:
        print(f'Accuracy: {score*100:.2f}%, learning rate = {lr}, n_trees = {n_estimators}')

    score_rows.append([score, lr, n_estimators])

plt_scores = np.array(score_rows).reshape(-1, 3)

# Plot learning rate vs. test accuracy
fig, ax = plt.subplots()
ax.set_xlabel('lr')
ax.set_ylabel('accuracy')
ax.set_title('lr versus accuracy')
annot_max(plt_scores[:,1], plt_scores[:,0], ax=ax)
ax.set_ylim(0.5,.6)
ax.plot(plt_scores[:,1], plt_scores[:,0]);

In [None]:
# Select the test shots whose predicted probability exceeds the threshold,
# then draw them on the half-court chart.
percentile = 0.70

high_chance_buckets = percentile_analysis(clf_kd, X_test2, percentile)
plot_percentile_shooting(
    player="Kevin Durant",
    high_chance_buckets=high_chance_buckets,
    percentile=percentile,
)

In [None]:
# Fetch Stephen Curry's shots for the same date window and split them into
# train/test arrays (split=0.75 is the training fraction passed through).
player, start, end = 'Stephen Curry', '2003-10-29', '2021-11-06'

X_train3, y_train3, X_test3, y_test3 = extract_features(
    player,
    start,
    end,
    split=0.75,
)


In [None]:
# Fit the Curry model with the shared helper, then chart the test shots whose
# predicted probability exceeds the threshold.
percentile = 0.70

clf_curry = fit_clt(
    X_train=X_train3,
    y_train=y_train3,
    X_test=X_test3,
    y_test=y_test3,
    plot=False,
)

high_chance_buckets = percentile_analysis(clf_curry, X_test3, percentile)
plot_percentile_shooting(player, high_chance_buckets, percentile)