In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import os
import time
from acquire import player_season_3pa
import explore

# Endpoints from NBA API
from nba_api.stats.endpoints import playbyplayv2
from nba_api.stats.endpoints import gamerotation
from nba_api.stats.endpoints import shotchartdetail
from nba_api.stats.endpoints import teamplayerdashboard
from nba_api.stats.endpoints import winprobabilitypbp

# Static Imports from NBA API
from nba_api.stats.static import players
from nba_api.stats.static import teams

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.cluster import KMeans

# Helpful Stuff
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)


In [None]:
# Load the prepared shot data through the project-local acquire.tome_prep helper.
# NOTE(review): the progress output under this cell ("Loading game ... player 1 of 596")
# suggests this may walk every player/game and be slow — confirm whether tome_prep caches.
from acquire import tome_prep
df = tome_prep()

Loading game 12 of 63 for Bogdan Bogdanovic (player 1 of 596) in 2021-2022 Regular Season.                    

In [None]:
# Structural summary of df: columns, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

### Univariates:

In [None]:
# Inspect the distribution shape of the data with the project-local explore.univariate helper.
# NOTE(review): presumably draws one plot per numeric column — confirm in explore.py.
explore.univariate(df)

### Takeaways:
- Play and rest times are clearly right-skewed.
- 3pt shots happen more often from certain locations on the court (see loc_x).
- Score margin appears to be approximately normally distributed.
- Points made by player are right-skewed.

### Splitting the data into train, validate, and test sets to preserve data integrity and to model/predict on only a portion of the data.

In [None]:
# Split df into train / validate / test using the project-local splitter helper.
# NOTE(review): assumes `target` drives stratification and that the two split
# fractions are applied one after the other — confirm in splitter.py.
from splitter import splitter

train, validate, test = splitter(
    df,
    target='shot_result',
    train_split_1=.8,
    train_split_2=.7,
    random_state=123,
)

In [None]:
# Preview the first five rows of the training split.
train.head()

### Exploring Target Variable: shot result

In [None]:
# Raw count of each shot outcome across the full dataset (df, not only train).
df['shot_result'].value_counts()

In [None]:
# Same outcome counts expressed as a share of all non-null shot results.
df['shot_result'].value_counts(normalize=True)

In [None]:
# Overall made-shot rate: share of shots in df whose result is 'Made Shot'.
# The rate is looked up by its label rather than by position: value_counts() orders
# by frequency, so position [1] is only the made-shot rate while 'Made Shot' happens
# to be the second most common outcome, and integer-position access on a
# string-labelled Series is deprecated in recent pandas.
made_shot_rate = df.shot_result.value_counts(normalize=True)['Made Shot']

### Overall Made shot rate is 36%

I want to see the rates based on:
- Player(missed/made)
- Team
- Game
- Shot type
- Period

In [None]:
# Based on a function by Stephen FitzSimon: for every value of each column in
# shot_cols, tabulate how often each shot outcome occurs and at what rate.

def get_pct_outcome(df, shot_cols = ['player']):
    """Tabulate shot-outcome counts and rates for each value of the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'shot_result' column and every column named in shot_cols.
    shot_cols : list of str
        Columns to break the outcomes down by.

    Returns
    -------
    pandas.DataFrame
        One row per (column, column value, outcome) with the keys:
        'player_name' (the column name being broken down — not an individual
        player), 'sub_col' (the value within that column), 'outcome',
        'total_shots_made' (number of rows with that outcome for that value;
        for a missed-shot outcome this counts misses) and 'proportion'
        (share of that value's rows with that outcome).
    """
    outputs = []
    for cat in shot_cols:
        subcats = list(df[cat].unique())
        outcomes = list(df['shot_result'].unique())
        for subcat in subcats:
            # Filter the frame once per sub-category instead of rebuilding the
            # same boolean mask twice for every outcome.
            subcat_results = df[df[cat] == subcat].shot_result
            for outcome in outcomes:
                is_outcome = subcat_results == outcome
                outputs.append({
                    'player_name': cat,
                    'sub_col': subcat,
                    'outcome': outcome,
                    'total_shots_made': is_outcome.sum(),
                    'proportion': is_outcome.mean()
                })
    return pd.DataFrame(outputs)

# Display the outcome table for the function's default column selection.
get_pct_outcome(df)

In [None]:
# Keep the outcome table in a named frame so the following cells can filter and sort it.
proportion_df = get_pct_outcome(df)

# Regular Season Leaders

### Which players have the highest 3pt 'Made Shot' rates?

In [None]:
# 'Made Shot' rows whose proportion beats the overall made_shot_rate.
top_3pt_shooters = proportion_df.loc[
    (proportion_df['proportion'] > made_shot_rate)
    & (proportion_df['outcome'] == 'Made Shot')
]

# Keep entries with at least 87 made shots and show the 60 highest proportions.
# NOTE(review): 87 looks like a volume cutoff (possibly the mean computed in a
# later cell) — confirm and consider naming it as a constant.
(
    top_3pt_shooters
    .loc[top_3pt_shooters['total_shots_made'] >= 87]
    .sort_values(by='proportion', ascending=False)
    .head(60)
)

### What is the average of 3pt 'Made Shots' per season?

In [None]:
# Mean number of made 3pt shots across the rows of top_3pt_shooters
# (i.e. only the above-overall-rate 'Made Shot' rows, not every player).
top_3pt_shooters['total_shots_made'].mean()

### Takeaways:
This gives us a good list of players who are shooting above the NBA's overall made-shot rate of 35.7%.

I'll take out the one-shot wonders, however, so that they do not skew the data.

In [None]:
# How many rows of top_3pt_shooters share each made-shot total.
top_3pt_shooters['total_shots_made'].value_counts()

_____________________________________________

### Which teams have the highest rate of making 3pt shots?

In [None]:
### Looking at proportions of 3pt shot outcomes per sub-category (e.g. per team):
def get_pct_outcome(df, shot_cols = ['player']):
    """Tabulate shot-outcome counts and rates for each value of the given columns.

    This redefinition replaces the earlier get_pct_outcome in the notebook and
    uses generic output column names. The default for shot_cols is ['player'],
    matching the earlier definition, so that a bare get_pct_outcome(df) call
    keeps working after this cell has run (the previous placeholder default
    ['col'] would look up a column named 'col').

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'shot_result' column and every column named in shot_cols.
    shot_cols : list of str
        Columns to break the outcomes down by.

    Returns
    -------
    pandas.DataFrame
        One row per (column, column value, outcome) with the keys
        'main_category' (the column name), 'sub_category' (the value within
        that column), 'outcome', 'total_shots_made' (number of rows with that
        outcome for that value) and 'proportion' (share of that value's rows
        with that outcome).
    """
    outputs = []
    for cat in shot_cols:
        subcats = list(df[cat].unique())
        outcomes = list(df['shot_result'].unique())
        for subcat in subcats:
            # Filter the frame once per sub-category instead of rebuilding the
            # same boolean mask twice for every outcome.
            subcat_results = df[df[cat] == subcat].shot_result
            for outcome in outcomes:
                is_outcome = subcat_results == outcome
                outputs.append({
                    'main_category': cat,
                    'sub_category': subcat,
                    'outcome': outcome,
                    'total_shots_made': is_outcome.sum(),
                    'proportion': is_outcome.mean()
                })
    return pd.DataFrame(outputs)

# Display the outcome table broken down by the 'team' column.
get_pct_outcome(df, shot_cols= ['team'])

In [None]:
# Keep the per-team outcome table in a named frame for the filtering below.
team_prop_df = get_pct_outcome(df, shot_cols= ['team'])

In [None]:
# Teams whose 'Made Shot' proportion beats the overall made_shot_rate, best first.
top_3pt_teams = team_prop_df.loc[
    (team_prop_df['proportion'] > made_shot_rate)
    & (team_prop_df['outcome'] == 'Made Shot')
]
top_3pt_teams.sort_values(by='proportion', ascending=False)

______________________

### Which Shot types have higher 3pt shot rates?

In [None]:
# Outcome table per shot type, then the types whose 'Made Shot' proportion
# beats the overall made_shot_rate, best first.
shot_type_prop = get_pct_outcome(df, shot_cols=['shot_type'])
best_shot_types = shot_type_prop.loc[
    (shot_type_prop['proportion'] > made_shot_rate)
    & (shot_type_prop['outcome'] == 'Made Shot')
]
best_shot_types.sort_values(by='proportion', ascending=False)

________________

### Which zones have higher rates of shots made?

In [None]:
# Outcome table per court zone, then the zones whose 'Made Shot' proportion
# beats the overall made_shot_rate, best first.
zone_prop = get_pct_outcome(df, shot_cols=['zone'])
best_zones = zone_prop.loc[
    (zone_prop['proportion'] > made_shot_rate)
    & (zone_prop['outcome'] == 'Made Shot')
]
best_zones.sort_values(by='proportion', ascending=False)

________________________________

### Hypothesis Tests: 

#### Hypothesis 1): 3pt shots made have a relationship with the type of shot taken.

In [None]:
# Chi-square test of independence between shot outcome and shot type (train split).
alpha = 0.05
null_hypothesis = "3pt shot result and shot type are independent"
alternative_hypothesis = "there is a relationship between 3pt shot results and shot type"

# Contingency table: shot outcome (rows) by shot type (columns)
observed = pd.crosstab(index=train['shot_result'], columns=train['shot_type'])

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("Reject the null hypothesis that", null_hypothesis)
    print("Sufficient evidence to move forward understanding that", alternative_hypothesis)
else:
    print("Fail to reject the null")
    print("Insufficient evidence to reject the null")

# Show the p-value as the cell output
p



#### Hypothesis 2): 3pt shots made have a relationship with where the shot was taken.

In [None]:
# Chi-square test of independence between shot outcome and court zone (train split).
alpha = 0.05
null_hypothesis = "3pt shot result and zone shot from are independent"
alternative_hypothesis = "there is a relationship between 3pt shot results and zone shot from"

# Contingency table: shot outcome (rows) by zone (columns)
observed = pd.crosstab(index=train['shot_result'], columns=train['zone'])

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("Reject the null hypothesis that", null_hypothesis)
    print("Sufficient evidence to move forward understanding that", alternative_hypothesis)
else:
    print("Fail to reject the null")
    print("Insufficient evidence to reject the null")

# Show the p-value as the cell output
p



#### Hypothesis 3): 3pt shots made have a relationship with how much time a player has been in the game.

In [None]:
# play_time values of the made 3pt shots in the train split.
shots_made_sample = train.loc[train['shot_result'] == 'Made Shot', 'play_time']

In [None]:
# Histogram of play_time for made shots; titled and labelled so the figure stands
# on its own, with a trailing ';' to suppress the repr output.
ax = shots_made_sample.hist()
ax.set(title='play_time: made 3pt shots (train)', xlabel='play_time', ylabel='Count');

In [None]:
# play_time values of the missed 3pt shots in the train split.
shots_missed_sample = train.loc[train['shot_result'] == 'Missed Shot', 'play_time']

In [None]:
# Histogram of play_time for missed shots; titled and labelled so the figure stands
# on its own, with a trailing ';' to suppress the repr output.
ax = shots_missed_sample.hist()
ax.set(title='play_time: missed 3pt shots (train)', xlabel='play_time', ylabel='Count');

In [None]:
# Made vs. missed play_time on one histogram (shared bins) for comparison.
# A legend and axis labels are added so the red/blue series can be told apart.
# NOTE(review): the two samples differ in size, so raw counts differ in height;
# consider density=True if the goal is to compare shapes.
x = shots_made_sample
y = shots_missed_sample
fig, ax = plt.subplots()
ax.hist([x, y], color=['r', 'b'], alpha=0.5, label=['Made Shot', 'Missed Shot'])
ax.set(title='play_time: made vs. missed 3pt shots (train)', xlabel='play_time', ylabel='Count')
ax.legend();

#### $H_0$: Mean play time for made shots is greater than or equal to the mean play time for missed shots.
#### $H_a$: Mean play time for made shots is less than the mean play time for missed shots.

In [None]:
alpha = .05

# Compare the two sample variances (made first, then missed) to choose between
# the pooled and the unequal-variance form of the t-test.
print(shots_made_sample.var(), shots_missed_sample.var(), sep='\n')


Variances are not quite equal so we will use equal_var as False:

In [None]:
# Unequal-variance (Welch) t-test of play_time: made sample first, missed sample second,
# so t is negative when the made-shot mean is the lower one.
# ttest_ind reports a two-sided p-value by default; p / 2 is displayed for the one-tailed test.
t, p = stats.ttest_ind(shots_made_sample, shots_missed_sample, equal_var=False)
t, p / 2

In [None]:
# Both conditions must hold to reject H0 in favour of "made mean < missed mean":
# the halved two-sided p-value is below alpha, and t is negative (the t-test was
# run with the made-shot sample first, so a lower made mean gives t < 0).
print("is p/2 < alpha? ", p / 2 < alpha)
print("is t < 0? ", t < 0)

In [None]:
# One-tailed decision for H_a: mean(made) < mean(missed).
# The t-test was run as (made, missed), so evidence for H_a requires a NEGATIVE t.
# The previous rule rejected H0 only when t >= 0, i.e. in the opposite direction
# from the stated alternative.
if p / 2 < alpha and t < 0:
    print("We reject $H_{0}$")
else:
    print("We fail to reject $H_{0}$")


#### Takeaway: It looks like the amount of time a player has been in the game is not related to whether a 3pt shot is made.

______________________________

#### Hypothesis 4): 3pt shots made have a relationship with how much time a player had rested prior to the shot.

In [None]:
# since_rest values of the made 3pt shots in the train split.
shots_made_rest = train.loc[train['shot_result'] == "Made Shot", 'since_rest']

In [None]:
# Histogram of since_rest for made shots; titled and labelled so the figure stands
# on its own, with a trailing ';' to suppress the repr output.
ax = shots_made_rest.hist()
ax.set(title='since_rest: made 3pt shots (train)', xlabel='since_rest', ylabel='Count');

In [None]:
# since_rest values of the missed 3pt shots in the train split.
shots_missed_rest = train.loc[train['shot_result'] == "Missed Shot", 'since_rest']

In [None]:
# Histogram of since_rest for missed shots; titled and labelled so the figure stands
# on its own, with a trailing ';' to suppress the repr output.
ax = shots_missed_rest.hist()
ax.set(title='since_rest: missed 3pt shots (train)', xlabel='since_rest', ylabel='Count');

The shapes look very similar. Let's look at variances:

#### $H_0$: Mean `since_rest` for made shots is greater than or equal to the mean `since_rest` for missed shots.
#### $H_a$: Mean `since_rest` for made shots is less than the mean `since_rest` for missed shots.

In [None]:
alpha = .05

# Compare the two sample variances (made first, then missed) to choose between
# the pooled and the unequal-variance form of the t-test.
print(shots_made_rest.var(), shots_missed_rest.var(), sep='\n')

In [None]:
# Variances differ, so run the unequal-variance (Welch) form: equal_var=False.
# Made sample first, missed sample second, so t is negative when the made-shot mean is lower.
# ttest_ind reports a two-sided p-value by default; p / 2 is displayed for the one-tailed test.
t, p = stats.ttest_ind(shots_made_rest, shots_missed_rest, equal_var=False)
t, p / 2

In [None]:
# Both conditions must hold to reject H0 in favour of "made mean < missed mean":
# the halved two-sided p-value is below alpha, and t is negative (the t-test was
# run with the made-shot sample first, so a lower made mean gives t < 0).
print("is p/2 < alpha? ", p / 2 < alpha)
print("is t < 0? ", t < 0)

In [None]:
# One-tailed decision for H_a: mean(made) < mean(missed).
# The t-test was run as (made, missed), so evidence for H_a requires a NEGATIVE t.
# The previous rule rejected H0 only when t >= 0, i.e. in the opposite direction
# from the stated alternative.
if p / 2 < alpha and t < 0:
    print("We reject $H_{0}$")
else:
    print("We fail to reject $H_{0}$")

_____________________________________

## Rechecking with a stricter alpha and ANOVA testing
for Missed vs Made and play time

I want to make sure the result is not due to random chance, so I'll use both the Missed and Made shots and run an ANOVA test at a 99% confidence level (alpha = 0.01) to check whether anything unusual is going on between the variables.

I can tell from the histograms that there is a difference, but the 95% confidence level (alpha = 0.05) may need to be set differently, since the metrics may be close but still varying. We want to find and see that.

In [None]:
# Hypotheses and significance level for the play_time re-check.
null_hypothesis = "Amount of time played when taking a 3pt shot has no relation to when shot is made or missed"
alternative_hypothesis = "Amount of time played when taking a 3pt shot has a relation to when shot is made or missed"
alpha = 0.01 # stricter 1% significance level (the earlier tests in this notebook used 0.05)

In [None]:
# play_time for made and for missed 3pt shots in the train split.
made_shots = train.loc[train['shot_result'] == "Made Shot", 'play_time']
missed_shots = train.loc[train['shot_result'] == "Missed Shot", 'play_time']

In [None]:
# f_oneway is SciPy's one-way ANOVA test.
# See https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html for more info
# NOTE(review): with only two groups, a one-way ANOVA should be equivalent to a
# two-sided equal-variance t-test, so this may not be an independent check of the
# unequal-variance t-test run earlier on the same play_time samples — confirm intent.
from scipy.stats import f_oneway

# f is the F statistic; p (displayed below) is compared against alpha in the next cell.
f, p = f_oneway(made_shots, missed_shots)
p

In [None]:
# Two-sided decision: reject the null only when the ANOVA p-value is below alpha.
if p < alpha:
    print("We reject the null hypothesis that", null_hypothesis)
    print("We move forward with the alternative hypothesis that", alternative_hypothesis)
else:
    print("We fail to reject the null hypothesis")
    print("Evidence does not support the claim that play time averages are significantly different across made or missed shots.")

______________________________