In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import os
import time
from acquire import player_season_3pa

# Endpoints from NBA API
from nba_api.stats.endpoints import playbyplayv2
from nba_api.stats.endpoints import gamerotation
from nba_api.stats.endpoints import shotchartdetail
from nba_api.stats.endpoints import teamplayerdashboard
from nba_api.stats.endpoints import winprobabilitypbp

# Static Imports from NBA API
from nba_api.stats.static import players
from nba_api.stats.static import teams

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.cluster import KMeans

# Helpful Stuff
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [None]:
df_Beasley = player_season_3pa('Malik Beasley')

In [None]:
df_Beasley

In [None]:
#finding MIN game with the most 3pts shots taken by Malik:
df_gameid = shotchartdetail.ShotChartDetail(team_id = 1610612750, player_id = 1627736).get_data_frames()[0]
df_gameid[df_gameid.GAME_DATE == '20220309']

In [None]:
df_teams = pd.DataFrame(teams.get_teams())
team_id_list = list(df_teams.id)

#### Save the player_id/team_id combo to a csv

In [None]:
filename = 'team_player_ids.csv'
if os.path.isfile(filename):
    team_player_ids =  pd.read_csv(filename, index_col=0)
else:
    players_list = []
    for team in team_id_list:
        df_tpd = teamplayerdashboard.TeamPlayerDashboard(team,
                                                         season = '2021-22').get_data_frames()
        player_list = list(df_tpd[1].PLAYER_ID)
        for player in player_list:
            row = [team,player]
            players_list.append(row)
    team_player_ids = pd.DataFrame(players_list, columns = ['team_id','player_id'])
    team_player_ids.to_csv('team_player_ids.csv')

## Acquiring a dataframe of every shot taken from the 2021-2022 regular NBA season.

In [None]:
players_list = team_player_ids.values.tolist()

In [None]:
"""This function will acquire and cache .csv in notebook directory"""
filename2 = 'all_last_season_shots.csv'
if os.path.isfile(filename2):
    df_all_shots =  pd.read_csv(filename2, index_col=0)
else:
    players_list = []
    df = pd.DataFrame()
    index = 0
    for player in players_list:
        print(f'\rFetching index {index} of 714', end='')
        df_pl = shotchartdetail.ShotChartDetail(team_id = player[0],
                                                        player_id = player[1],
                                                        season_type_all_star='Regular Season',
                                                        season_nullable='2021-22',
                                                        context_measure_simple = 'FG3A').get_data_frames()
        time.sleep(.5)
        index += 1
        #df = pd.concat([df, df_pl[0]])
        #df.to_csv(filename2) these last two lines of code are looping and overwriting the current csv until last time 
        #through the loop.
        
        

## Adding in only 3pt shots into the dataframe:

-We are removing outliers by taking the sum of the lowest and highest quantile and multiplying by 1.5 to create an IQR upper bound (removing the anomaly shots that are opposite court-side and keeping the shots only with the 3pt shot range).

In [None]:
#calling in our df_all_shots dataframe and resetting index:
df_all_shots = df_all_shots.reset_index(drop = True)

In [None]:
#creating a df for only the 3pt shots taken:
df_all_3pt = df_all_shots[df_all_shots.SHOT_TYPE == '3PT Field Goal']

In [None]:
#Creating an IQR upper bound to remove the large anomaly 3pt shots taken.
low = df_all_3pt.SHOT_DISTANCE.quantile(.25)
high = df_all_3pt.SHOT_DISTANCE.quantile(.75)
add = (high-low) * 1.5
bound = high + add
#this is where our bound is:
bound

In [None]:
#creating our bounds: any thing over 29ft in shot distance is in outliers df:
df_outlier_3pt = df_all_3pt[df_all_3pt.SHOT_DISTANCE > 29.0]

In [None]:
#anything less than 29 or equal to is now our df_3pt df:
df_3pt = df_all_3pt[df_all_3pt.SHOT_DISTANCE <= 29.0]

In [None]:
#recreating our main df to only hold those 3pt shots and IQR:
df_shots = df_all_shots[df_all_shots.SHOT_DISTANCE <= 29.0]

### Clustering the 3pt shot locations (once outliers are taken out)

In [None]:
#creating our X variable:
X = df_3pt[['LOC_X','LOC_Y']]

In [None]:
#we found that using 7 kmean clusters gave us not only good 3pt
#area clusters, but also shots made in center court of the arch
kmeans = KMeans(n_clusters=7)
kmeans.fit(X)

clusters = kmeans.predict(X)

In [None]:
#we will add a new column onto our main df to hold these clusters:
df_3pt['three_pt_location'] = clusters

In [None]:
#plotting these 7clusters on the 3pt zone:
plt.figure(figsize = (14,12))
sns.scatterplot(data =df_3pt, x='LOC_X', y = 'LOC_Y', hue = 'three_pt_location')
plt.show()

### We want to have descriptive names of each of these cluster zones. 
We are giving each of these clusters names by map their locations:

In [None]:
# creating a new column with the location names:
df_3pt['location'] = df_3pt['three_pt_location'].map({0: 'R Above Break', 1: 'L Above Break',2:'L Below Break/Corner',3:'R Center',4:'R Below Break/Corner',5:'Center',6:'L Center'})

In [None]:
#creating a variable to use for applying clusters to 3pt only shots
#note: 2pt shots will be NaN
location_column = df_3pt[['location']]

In [None]:
#merging the applying the clusters:
df_shots = df_shots.merge(location_column, how = 'left', left_index = True, right_index = True)

In [None]:
df_shots.head()

## Let's see if this works!
Creating a Test case using the Spurs and Malik Beasley.

In [None]:
team_id = teams.find_team_by_abbreviation('MIN')['id'] # For Minnesota Timberwolves

player_id = players.find_players_by_full_name("Malik Beasley")[0]['id'] # For Malik Beasley

game_id = '0022100986' # First game of year Spurs vs. Magic - note that game id must be converted to an int in df_shots

In [None]:
team_id

### We are using the `win_probability` api endpoint for a base of the analysis dataframe.

In [None]:
# Select for a single game
df_base = winprobabilitypbp.WinProbabilityPBP(game_id).get_data_frames()[0]
df_base

#### We started with the win_prob endpoint as this houses play-by-play by second/minutes. 
We need to then have a `absolute_time` in game seconds since the play started.

In [None]:
# Utilize user function to create absolute time (in seconds) column
df_base['abs_time'] = np.where(df_base.PERIOD <5,
                     ((df_base.PERIOD - 1) * 720 + (720 - df_base.SECONDS_REMAINING)),
                     (2880 + (df_base.PERIOD - 5) * 300 + (300 - df_base.SECONDS_REMAINING)))

### Adding on the Rotation Dataframe so we can have each players' minutes of play vs rest times.

In [None]:
df_rotation = gamerotation.GameRotation(game_id).get_data_frames()

'Game Rotation' returns all the minutes (play time in the court) for every player in the game, seperated into Home and Visitor Dataframes.  As such we search through players for the records with our player_id.  Returns a dataframe holding their rotational stats, most importantly in_time and out_time.

In [None]:
#for-loop to cycle through the players (by ID) in the rotation df
#to then grab their times:
for i in range(2):
    for player in df_rotation[i].PERSON_ID:
        if player == player_id:
            df_player_roto = df_rotation[i][df_rotation[i].PERSON_ID == player]


We need to convert the rotational time in 1/10ths of second, into seconds, then isolate those times out.

In [None]:
df_player_roto['abs_in_time'] = df_player_roto.IN_TIME_REAL/10
df_player_roto['abs_out_time'] = df_player_roto.OUT_TIME_REAL/10
df_player_roto_times = df_player_roto[['abs_in_time','abs_out_time']].reset_index(drop = 'True')

Using a zip function, we are putting together the in and out times together.

In [None]:
zipped = list(zip(df_player_roto_times.abs_in_time, df_player_roto_times.abs_out_time))

In [None]:
#showing what it looks like now:
zipped

#### Let's filter out the times of the game (from the df_base of  win_prob) that align with the player's play time.

In [None]:
# Let me create a holder dataframe as I pull slices off from the base
df_player_game = pd.DataFrame()
for tuplez in zipped:
    df_slice = df_base[(df_base.abs_time >= tuplez[0]) & (df_base.abs_time <= tuplez[1])]
    df_player_game = pd.concat([df_player_game, df_slice])
df_player_game

In [None]:
#testing this out:
df_test = df_player_game.copy()
df_test

In [None]:
df_player_game.info()

In [None]:
#changing GAME_ID to numeric to later add on shot details df:
df_player_game.GAME_ID=pd.to_numeric(df_player_game.GAME_ID)

In [None]:
#checking the dtypes
df_player_game.dtypes

### Creating Shot Details df:

In [None]:
#using abs_time on main df to account for overtime, and then calculates that overtime a bit different than regular 
#period plays
#plus the minutes - seconds remaining
df_shots['abs_time'] = np.where(df_shots.PERIOD < 5,
                                (df_shots.PERIOD - 1) * 720 + (720 - (60 * df_shots.MINUTES_REMAINING) - (df_shots.SECONDS_REMAINING)),
                                2880 + ((df_shots.PERIOD - 5) * 300) + (300 - (60 * df_shots.MINUTES_REMAINING) - (df_shots.SECONDS_REMAINING)))

In [None]:
#setting up GAME_ID for this df to be int:
df_game_shots = df_shots[df_shots.GAME_ID == int(game_id)]

In [None]:
#changing the df PLAYER_ID (will change all to lower down below)
df_game_shots = df_game_shots[df_game_shots.PLAYER_ID == player_id]

In [None]:
#resetting the index here:/
df_game_shots.reset_index(drop = True)

In [None]:
#getting the Spurs abbreviations- 
teams.find_teams_by_full_name(df_game_shots.TEAM_NAME.max())[0]['abbreviation']

In [None]:
#combining home and visitor column with only the Spurs team
df_game_shots['player_h_v'] = np.where(teams.find_teams_by_full_name(df_game_shots.TEAM_NAME.max())[0]['abbreviation'] == df_game_shots.HTM, 'Home', 'Visitor')

### Merging main_df with the Shot details df:

In [None]:
df_almostthere = df_player_game.merge(df_game_shots, how = 'inner', on = 'abs_time')

In [None]:
df_almostthere

In [None]:
#adding in the column score_margin that holds the home score margins:
df_almostthere['score_margin'] = np.where(df_almostthere.player_h_v == 'Home', df_almostthere.HOME_SCORE_MARGIN, df_almostthere.HOME_SCORE_MARGIN * -1)

In [None]:
#since we get two columns for every game (Home vs Visitor)
#we want to drop and only hold the Spurs with whichever
#they are (H or V) and only show those win percentages:

if df_almostthere.loc[0,'player_h_v'] == 'Home':
    df_almostthere = df_almostthere.drop(columns = ['VISITOR_PCT'])
    df_almostthere = df_almostthere.rename(columns = {'HOME_PCT':"WIN_PCT"})
else:
    df_almostthere = df_almostthere.drop(columns = ['HOME_PCT'])
    df_almostthere = df_almostthere.rename(columns = {'VISITOR_PCT':"WIN_PCT"})

In [None]:
#creating a column `play_points` that 
df_almostthere['play_points'] = np.where(df_almostthere.SHOT_TYPE == '2PT Field Goal',
                                    np.where(df_almostthere.SHOT_MADE_FLAG == 1, 2,0),
                                    np.where(df_almostthere.SHOT_MADE_FLAG == 1, 3,0))

In [None]:
df_almostthere['points'] = df_almostthere['play_points'].cumsum()

In [None]:
df_almostthere['shots_taken'] = df_almostthere['SHOT_ATTEMPTED_FLAG'].cumsum()
df_almostthere['shots_hit'] = df_almostthere['SHOT_MADE_FLAG'].cumsum()
df_almostthere['game_pct'] = round(df_almostthere['shots_hit']/df_almostthere['shots_taken'],2)

In [None]:
df_almostthere.info()

In [None]:
columns_to_drop = [
    'GAME_ID_x',
    'EVENT_NUM',
    'HOME_PTS',
    'VISITOR_PTS',
    'HOME_SCORE_MARGIN',
    'PERIOD_x',
    'SECONDS_REMAINING_x',
    'HOME_POSS_IND',
    'HOME_G',
    'LOCATION',
    'PCTIMESTRING',
    'ISVISIBLE',
    'GRID_TYPE',
    'GAME_EVENT_ID',
    'MINUTES_REMAINING',
    'SECONDS_REMAINING_y',
    'SHOT_ZONE_BASIC',
    'SHOT_ZONE_AREA',
    'SHOT_ZONE_RANGE',
    'SHOT_ATTEMPTED_FLAG',
    'SHOT_MADE_FLAG',
    'GAME_DATE',
    'HTM',
    'VTM']

In [None]:
df_almostthere = df_almostthere.drop(columns = columns_to_drop)

In [None]:
df_almostthere

In [None]:
columns_to_rename = {'WIN_PCT':'win_probability',
                     'DESCRIPTION':'play_description',
                     'GAME_ID_y':'game_id',
                     'PLAYER_ID':'player_id',
                     'PLAYER_NAME':'player',
                     'TEAM_ID':'team_id',
                     'TEAM_NAME':'team',
                     'PERIOD_y':'period',
                     'EVENT_TYPE': 'shot_result',
                     'ACTION_TYPE':'shot_type',
                     'SHOT_TYPE':'field_goal',
                     'SHOT_DISTANCE':'distance'}

In [None]:
df_almostthere = df_almostthere.rename(columns = columns_to_rename)

In [None]:
#df_game_player_target = df_almostthere[df_almostthere['2pt_or_3pt'] == '3PT Field Goal']

In [None]:
df_almostthere.head()

### Add on's for clean up:

In [None]:
#let's reset the index to game_id:
#first, setting index:
df_almostthere.set_index('game_id', inplace=True)

In [None]:
#replacing shot_result string to only say 'missed' and 'made'
df_almostthere.shot_result = df_almostthere.shot_result.replace({'Missed Shot':'Missed', 'Made Shot':'Made'})

In [None]:
df_almostthere.head()

________________________________________________

In [None]:
#plotting data for shots in game:
shotsGame = df_almostthere[['abs_time','play_points','points','shots_taken','shots_hit','location','field_goal']].drop_duplicates()

In [None]:
#just looking at what it holds:
shotsGame.head()

#### Looking at where Malik is taking the most 3pt shots this game:

In [None]:
sns.catplot(data=shotsGame, kind='box', x='field_goal', y='abs_time', aspect=1.8, palette ='deep')

In [None]:
sns.catplot(data=shotsGame, kind='box', x='play_points', y='abs_time', aspect=1.8, palette ='deep')

___________________________

### Some simple EDA on TimberWolves vs Thunder game- player Malik Beasley:

In [None]:
df_almostthere

In [None]:
df_almostthere.describe()

In [None]:
df_almostthere.isnull().sum()

In [None]:
df_almostthere.loc[~df_almostthere.index.duplicated(), :]

In [None]:
df_almostthere.info()

In [None]:
#creating a column of shot_result to made = 1, missed = 0 for charts-sake:
df_almostthere['bool_shot_result'] = df_almostthere.shot_result.astype(bool)
#then making this column into integer for the 1 and o
df_almostthere.bool_shot_result=df_almostthere.bool_shot_result.astype(int)

df_almostthere.head()

### Univariate Exploration:

In [None]:
def univariate():
    df_almostthere.hist(bins = 30, figsize = (20, 20), color= 'blue')

In [None]:
univariate()

In [None]:
#create variables that split data by made or missed shots: 
#create variables that split train by attrition value 
left_train = df_almostthere[df_almostthere['bool_shot_result'] == 1]
stayed_train = df_almostthere[df_almostthere['bool_shot_result'] == 0]

In [None]:
#looking at 
df_almostthere.shot_result.value_counts()

In [None]:
def made_3ptshot_rate():
    labels = 'Missed', 'Made'
    data = [6, 11]
    plt.pie(data, labels=labels, colors = ['#17408B', '#C9082A'])
    plt.show()

In [None]:
#simple pie chart to show Malik's 3pt shots made/missed rate:
made_3ptshot_rate()

#### If I drop nulls I drop the whole df for this game, as there are only 17 rows here.

In [None]:
plt.title("Win Probability relating to 2pt and 3pt shots")
sns.barplot(x="win_probability", y="shot_result", data=df_almostthere)

plt.legend()
plt.show()

In [None]:
sns.set(rc={'figure.figsize':(17.7,8.27)})
plt.title("Win Probability relating to type of shot made")
sns.barplot(x="win_probability", y="shot_type", data=df_almostthere)
plt.legend()
plt.show()

In [None]:
#function from  Savvas Tjortjoglou of how to create an NBA sized court:
from matplotlib.patches import Circle, Rectangle, Arc

def draw_court(ax=None, color='black', lw=2, outer_lines=False):
    # If an axes object isn't provided to plot onto, just get current one
    if ax is None:
        ax = plt.gca()

    # Create the various parts of an NBA basketball court

    # Create the basketball hoop
    # Diameter of a hoop is 18" so it has a radius of 9", which is a value
    # 7.5 in our coordinate system
    hoop = Circle((0, 0), radius=7.5, linewidth=lw, color=color, fill=False)

    # Create backboard
    backboard = Rectangle((-30, -7.5), 60, -1, linewidth=lw, color=color)

    # The paint
    # Create the outer box 0f the paint, width=16ft, height=19ft
    outer_box = Rectangle((-80, -47.5), 160, 190, linewidth=lw, color=color,
                          fill=False)
    # Create the inner box of the paint, widt=12ft, height=19ft
    inner_box = Rectangle((-60, -47.5), 120, 190, linewidth=lw, color=color,
                          fill=False)

    # Create free throw top arc
    top_free_throw = Arc((0, 142.5), 120, 120, theta1=0, theta2=180,
                         linewidth=lw, color=color, fill=False)
    # Create free throw bottom arc
    bottom_free_throw = Arc((0, 142.5), 120, 120, theta1=180, theta2=0,
                            linewidth=lw, color=color, linestyle='dashed')
    # Restricted Zone, it is an arc with 4ft radius from center of the hoop
    restricted = Arc((0, 0), 80, 80, theta1=0, theta2=180, linewidth=lw,
                     color=color)

    # Three point line
    # Create the side 3pt lines, they are 14ft long before they begin to arc
    corner_three_a = Rectangle((-220, -47.5), 0, 140, linewidth=lw,
                               color=color)
    corner_three_b = Rectangle((220, -47.5), 0, 140, linewidth=lw, color=color)
    # 3pt arc - center of arc will be the hoop, arc is 23'9" away from hoop
    # I just played around with the theta values until they lined up with the 
    # threes
    three_arc = Arc((0, 0), 475, 475, theta1=22, theta2=158, linewidth=lw,
                    color=color)

    # Center Court
    center_outer_arc = Arc((0, 422.5), 120, 120, theta1=180, theta2=0,
                           linewidth=lw, color=color)
    center_inner_arc = Arc((0, 422.5), 40, 40, theta1=180, theta2=0,
                           linewidth=lw, color=color)

    # List of the court elements to be plotted onto the axes
    court_elements = [hoop, backboard, outer_box, inner_box, top_free_throw,
                      bottom_free_throw, restricted, corner_three_a,
                      corner_three_b, three_arc, center_outer_arc,
                      center_inner_arc]

    if outer_lines:
        # Draw the half court line, baseline and side out bound lines
        outer_lines = Rectangle((-250, -47.5), 500, 470, linewidth=lw,
                                color=color, fill=False)
        court_elements.append(outer_lines)

    # Add the court elements onto the axes
    for element in court_elements:
        ax.add_patch(element)

    return ax

In [None]:
g=sns.relplot(data=df_almostthere.field_goal, kind = 'scatter',
               x = df_almostthere.LOC_X, y= df_almostthere.LOC_Y, hue= df_almostthere.shot_result)

for i, ax in enumerate(g.axes.flat):
    ax = draw_court(ax, outer_lines=True)
    ax.set_xlim(-300, 300)
    ax.set_ylim(-100, 500)

______________________

### Hypothesis testing categorical features:


In [None]:
# Let's run a chi squared to compare proportions, to have more confidence
alpha = 0.05
null_hypothesis = "shot results and shot type are independent"
alternative_hypothesis = "there is a relationship between show results and shot type"

# Setup a crosstab of observed survival to pclass
observed = pd.crosstab(df_almostthere.shot_result, df_almostthere.shot_type)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("Reject the null hypothesis that", null_hypothesis)
    print("Sufficient evidence to move forward understanding that", alternative_hypothesis)
else:
    print("Fail to reject the null")
    print("Insufficient evidence to reject the null")
p


In [None]:
# Let's run a chi squared to compare proportions, to have more confidence
alpha = 0.05
null_hypothesis = "shot results and period of game are independent"
alternative_hypothesis = "there is a relationship between shot results and period of game"

# Setup a crosstab of observed survival to pclass
observed = pd.crosstab(df_almostthere.shot_result, df_almostthere.period)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("Reject the null hypothesis that", null_hypothesis)
    print("Sufficient evidence to move forward understanding that", alternative_hypothesis)
else:
    print("Fail to reject the null")
    print("Insufficient evidence to reject the null")
p

In [None]:
# Let's run a chi squared to compare proportions, to have more confidence
alpha = 0.05
null_hypothesis = "shot results and location of shot are independent"
alternative_hypothesis = "there is a relationship between shot results and location of shot"

# Setup a crosstab of observed survival to pclass
observed = pd.crosstab(df_almostthere.shot_result, df_almostthere.location)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("Reject the null hypothesis that", null_hypothesis)
    print("Sufficient evidence to move forward understanding that", alternative_hypothesis)
else:
    print("Fail to reject the null")
    print("Insufficient evidence to reject the null")
p

### Takeaways:
Because this is only one game and it shows that any sort of results/data to be understood--we need more data.

In [1]:
from acquire import tome_prep
df = tome_prep()
df

KeyError: "['win_prob'] not found in axis"

In [2]:
import pandas as pd
df = pd.read_csv('league_3pa.csv').drop(columns='Unnamed: 0')
df.to_csv('league_3pa.csv', index=False)

KeyError: "['Unnamed: 0'] not found in axis"

__________________________