In [1]:
import pandas as pd
import numpy as np

import requests

import re
import string

from py_ball import league, image, boxscore, player

# static lists to get players and teams
from nba_api.stats.static import players
from nba_api.stats.static import teams

# endpoints to get the list of players on every team and shot chart for each player
from nba_api.stats.endpoints import shotchartdetail
from nba_api.stats.endpoints import teamplayerdashboard

# getting nba defensive statistics
from nba_api.stats.endpoints import boxscoredefensive
from nba_api.stats.endpoints import defensehub
from nba_api.stats.endpoints import leaguedashptdefend
from nba_api.stats.endpoints import playerdashptshotdefend
from nba_api.stats.endpoints import boxscoretraditionalv2


# getting nba player info for their position
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.endpoints import leaguedashplayerstats
from nba_api.stats.endpoints import playerdashboardbyyearoveryear

HEADERS = {'Connection': 'keep-alive',
           'Host': 'stats.nba.com',
           'Origin': 'http://stats.nba.com',
           'Upgrade-Insecure-Requests': '1',
           'Referer': 'stats.nba.com',
           'x-nba-stats-origin': 'stats',
           'x-nba-stats-token': 'true',
           'Accept-Language': 'en-US,en;q=0.9',
           "X-NewRelic-ID": "VQECWF5UChAHUlNTBwgBVw==",
           'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6)' +\
                         ' AppleWebKit/537.36 (KHTML, like Gecko)' + \
                         ' Chrome/81.0.4044.129 Safari/537.36'}

## Finding all the made shots for each player from each game for the 2019/20 nba season


In [2]:
# importing a dictionary of the nba teams to get a list of the team ids

teams_dict = teams.get_teams()

In [3]:
# obtaining list of team ideas for each team in the team dictionary

team_ids = [team['id'] for team in teams_dict]

In [4]:
# creating an empty list to store the dataframes of players

df_list = []

In [5]:
# looping through each team id in the list of team ids
for team in team_ids:
    
    # obtaining the dashboard of the teams with all the active players and their stats
    team_players = teamplayerdashboard.TeamPlayerDashboard(team, season = '2019-20')

    # using that list to grab the player ids from every player on every team
    player_ids = team_players.players_season_totals.get_data_frame().PLAYER_ID

    # looping through all the players on every team
    for player_id in player_ids:

        # getting the shot chart object from the nba api for each player using the team id and the player id
        shot_chart = shotchartdetail.ShotChartDetail(team, player_id)

        # turning that shot chart object into a dataframe
        shot_chart_df = shot_chart.get_data_frames()[0]

        # changing the dates into a datetime to grab the games from 2019-2020, then setting that as the index
        shot_chart_df['GAME_DATE'] = pd.to_datetime(shot_chart_df['GAME_DATE'], format = '%Y%m%d', errors = 'ignore')
        shot_chart_df.set_index('GAME_DATE', inplace = True)

        # locating only games for the 2019/20 nba season
        shot_chart_2020 = shot_chart_df.loc['2019-10-22':'2020-03-11']

        # appending it to a list to later concat into a single DataFrame
        df_list.append(shot_chart_2020)

In [6]:
# concating the list of dataframes into a single dataframe

df = pd.concat(df_list)

In [7]:
# storing that in a csv

df.to_csv('csv_files/2019-20_nba_shot_charts.csv')

# Cleaning the NBA shot charts to the amount of made shots for each shooting location

## Messing with the data to try and get an idea of court locations

For x and y location, 10 units in the LOC_X or LOC_Y value is equal to 1 ft on the court. 

-250, -50 = left corner

250, -50 = right corner

0, 238 = top of the key (any value for LOC_Y that is 238 or higher would be great)

Shot Areas: 

### 3 Pointers

X(-250 to -220) & Y(-50 to 87) = left corner 3

X(220 to 250) & Y(-50 to 87) = right corner 3

X(-250 to -80) & Y(87 to 280) & Shot_zone(above the break) = left wing 3

X(80 to 250) & Y(87 to 280) & Shot_zone(above the break) = right wing 3

X(-80 to 80) and Y(87 to 280) & Shot_zone(above the break) = Center 3

X(any) and Y(280 to 350) = Deep 3

X(any) and Y(350+) = Heave

### Mid Range 

X(-220 to -150) & Y(-50 to 90) = left baseline deep midrange

X(150 to 220) & Y(-50 to 90) = right baseline deep midrange

((X(-220 to 150) & Y(90+)) & (X(-150 to 80) & Y(150+))) & Shotzone(Mid-Range) = left wing deep mid ranger

((X(150 to 220) & Y(90+)) & (X(80 to 150) & Y(150+))) & Shotzone(Mid-Range) = right wing deep mid ranger

X(-150 to -80) & Y(-50 to 90) = short left baseline midranger

X(80 to 150) & Y(-50 to 90) = short right baseline midranger

X(-150 to -80) & Y(90 to 150) = short left wing mid ranger

X(80 to 150) & Y(90 to 150) = short right wing mid ranger

X(-80 to 80) & Y(210+) & Shotzone(Mid-Range) = deep center midranger

X(-80 to 80) & Y(150 to 210) = short center midrange

### Paint

X(-80 to 80) & Y(90 to 150) = Floater

X(-80 to 80) & Y(-50) & Shotzone(In The Paint (Non-RA)) = layup/in the paint

Shotzone(restricted area) = restricted area

## Organizing the data and using the function to group the data in the format i am looking for

In [20]:
df.PLAYER_ID = pd.to_numeric(df.PLAYER_ID)

In [21]:
# Creating a new df with just the players and the year they played as the index, only 
# included the player id in case it was necessary to concat into the future

players_df = df.groupby('PLAYER_NAME')['PLAYER_ID'].mean().to_frame()

In [22]:
# dataframe needs to be in the format of the shot chart data that is provided by the nba api

from mod_5_functions import shooting_positions

In [23]:
# running the function which returns the amount of shots made for each player for each position, so i need to set
# the output of the function to all of the different positions on the court

left_corner_3, right_corner_3, left_wing_3, right_wing_3, center_3, deep_3,heave, left_baseline_deep_2, right_baseline_deep_2, left_wing_deep_2, right_wing_deep_2, left_baseline_short_2, right_baseline_short_2, left_wing_short_2, right_wing_short_2, deep_center_2, short_center_2, floater_range, in_the_paint, restricted_area = shooting_positions(df)


In [24]:
# taking all of those values, as well as the player names and concating them to a single dataframe

player_shot_profile = pd.concat([players_df, left_corner_3, right_corner_3, left_wing_3, right_wing_3, center_3,
                                 deep_3,heave, left_baseline_deep_2, right_baseline_deep_2, left_wing_deep_2, 
                                 right_wing_deep_2, left_baseline_short_2, right_baseline_short_2, left_wing_short_2,
                                 right_wing_short_2, deep_center_2, short_center_2, floater_range, in_the_paint, 
                                 restricted_area], axis = 1)



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [26]:
# taking the na values, areas where players didnt have a single shot (big men and 3s) and replacing them with 0s

player_shot_profile.fillna(0, inplace = True)

In [27]:
# saving it to a csv file

player_shot_profile.to_csv('csv_files/Player_comparison_Shot_Profiles.csv')

## Getting Defensive statistics through py_ball

This worked but only gave me data for over 500 players, this is due to the fact that there is no defensive tracking data available for older players. So i decided to keep it out of the classification model for now and just stick to the offensive stats, will use this for the similarity models i will run later on

In [28]:
# getting a list of unique players to loop through and get their defensive stats

unique_players = df.PLAYER_ID.unique()

In [29]:
# looping through all the different players to get the dataframe of their defensive shot 
# location metrics and store the dataframe to a list (this only works for the newer players)

defensive_shooting = []

league_id = '00'

for ids in unique_players:
    
    # saving the data as the defense object in pyball for each player 
    defense = player.Player(headers = HEADERS, endpoint = 'playerdashptshotdefend', 
                           league_id = league_id, player_id = ids)
    
    # storing the data in the defensive shooting list
    defensive_shooting.append(pd.DataFrame(defense.data['DefendingShots']))


In [30]:
# concating the list of dataframes into a single dataframe

defensive_shots_df = pd.concat(defensive_shooting)

In [31]:
# storing into a csv

defensive_shots_df.to_csv('csv_files/defensive_shooting_by_player.csv')

# Cleaning the Defensive Statistics

### Even though i didnt end up using the defensive stats for the mvp presentation i still want to use it for my my similarty matrix going into the future so i wanted to clean up the data and get it in the format that makes the most sense. So i end up breaking it down similary to the shot charts by creating dummies for each category. Then multiplying that category by the players defensive scores and grouping by the player so in the end i just have a row with a specific player and his defensive shooting numbers in every single category.

In [32]:
# removing the overall as i want to look at each area individually

defensive_shots_df = defensive_shots_df[defensive_shots_df['DEFENSE_CATEGORY'] != 'Overall']

In [33]:
#replacing the name of the column for player id to match the other ones

defensive_shots_df.rename(columns = {'CLOSE_DEF_PERSON_ID': 'PLAYER_ID'}, inplace = True)

In [34]:
# reset index so it goes from 1 to the end instead of going by 5s and then dropping the old index column

defensive_shots_df.reset_index(inplace = True)
defensive_shots_df.drop(columns = 'index', inplace = True)

In [35]:
# creating dummy columns of the different defensive shot locations

shot_defense_dummies = pd.get_dummies(defensive_shots_df['DEFENSE_CATEGORY'], prefix = 'def')

In [36]:
# concating the dummies with the original

full_df = pd.concat([defensive_shots_df, shot_defense_dummies], axis = 1)

In [37]:
# for each of the dummies i want to get the actual value for the different categories im looking at so i .multiply all of them
# by the specific value i want and set that to a new column in the data frame

full_df[['def_2 Pointers_freq','def_3 Pointers_freq', 'def_Greater_Than_15_Ft_freq', 'def_Less_Than_10_Ft_freq', 'def_Less_Than_6_Ft_freq']] = full_df[['def_2 Pointers','def_3 Pointers', 'def_Greater Than 15 Ft', 'def_Less Than 10 Ft', 'def_Less Than 6 Ft']].multiply(full_df['FREQ'], axis = 'index')
full_df[['def_2 Pointers_dfgm','def_3 Pointers_dfgm', 'def_Greater_Than_15_Ft_dfgm', 'def_Less_Than_10_Ft_dfgm', 'def_Less_Than_6_Ft_dfgm']] = full_df[['def_2 Pointers','def_3 Pointers', 'def_Greater Than 15 Ft', 'def_Less Than 10 Ft', 'def_Less Than 6 Ft']].multiply(full_df['D_FGM'], axis = 'index')
full_df[['def_2 Pointers_dfga','def_3 Pointers_dfga', 'def_Greater_Than_15_Ft_dfga', 'def_Less_Than_10_Ft_dfga', 'def_Less_Than_6_Ft_dfga']] = full_df[['def_2 Pointers','def_3 Pointers', 'def_Greater Than 15 Ft', 'def_Less Than 10 Ft', 'def_Less Than 6 Ft']].multiply(full_df['D_FGA'], axis = 'index')
full_df[['def_2 Pointers_plus_minus','def_3 Pointers_plus_minus', 'def_Greater_Than_15_Ft_plus_minus', 'def_Less_Than_10_Ft_plus_minus', 'def_Less_Than_6_Ft_plus_minus']] = full_df[['def_2 Pointers','def_3 Pointers', 'def_Greater Than 15 Ft', 'def_Less Than 10 Ft', 'def_Less Than 6 Ft']].multiply(full_df['PCT_PLUSMINUS'], axis = 'index')

In [38]:
# checking the columns to see what needs to be dropped and kept

full_df.columns

Index(['PLAYER_ID', 'GP', 'G', 'DEFENSE_CATEGORY', 'FREQ', 'D_FGM', 'D_FGA',
       'D_FG_PCT', 'NORMAL_FG_PCT', 'PCT_PLUSMINUS', 'def_2 Pointers',
       'def_3 Pointers', 'def_Greater Than 15 Ft', 'def_Less Than 10 Ft',
       'def_Less Than 6 Ft', 'def_2 Pointers_freq', 'def_3 Pointers_freq',
       'def_Greater_Than_15_Ft_freq', 'def_Less_Than_10_Ft_freq',
       'def_Less_Than_6_Ft_freq', 'def_2 Pointers_dfgm', 'def_3 Pointers_dfgm',
       'def_Greater_Than_15_Ft_dfgm', 'def_Less_Than_10_Ft_dfgm',
       'def_Less_Than_6_Ft_dfgm', 'def_2 Pointers_dfga', 'def_3 Pointers_dfga',
       'def_Greater_Than_15_Ft_dfga', 'def_Less_Than_10_Ft_dfga',
       'def_Less_Than_6_Ft_dfga', 'def_2 Pointers_plus_minus',
       'def_3 Pointers_plus_minus', 'def_Greater_Than_15_Ft_plus_minus',
       'def_Less_Than_10_Ft_plus_minus', 'def_Less_Than_6_Ft_plus_minus'],
      dtype='object')

In [39]:
# dropping unnecessary columns

full_df.drop(columns = [ 'GP', 'G', 'DEFENSE_CATEGORY', 'FREQ', 'D_FGM', 'D_FGA', 'D_FG_PCT', 'NORMAL_FG_PCT', 
                        'PCT_PLUSMINUS', 'def_2 Pointers', 'def_3 Pointers', 'def_Greater Than 15 Ft', 
                        'def_Less Than 10 Ft', 'def_Less Than 6 Ft'], inplace = True)

In [40]:
# grouping by the individual player so now i have a dataframe with just the player id and the dummy columns i wanted 
# of different defensive statistics in the 5 different areas of the court

defensive_shooting = full_df.groupby('PLAYER_ID').sum()

In [41]:
# to csv

defensive_shooting.to_csv('csv_files/defensive_stats_for_new_players.csv')

## Merging the defensive stats and the shooting Stats

In [47]:
#resetting index to keep the player names as they will be used later on to concat with the basketball reference data

player_shot_profile.reset_index(inplace = True)
player_shot_profile.rename(columns = {'index': 'Player'}, inplace = True)

In [56]:
player_stats = pd.merge(defensive_shooting, player_shot_profile, on = 'PLAYER_ID')

#### Because the defensive stats are so much harder to get there were only 316 players from the possible 497 that had both shooting data and defensive data, for now i will stick to it but likely will just use the player shot profile and basic stats as well later to see if it does better

## Getting basic stats from the players for the 2019-20 nba season

In [74]:
# Getting player data from the 2020 season, dropping duplicates because the players have individual stats for each
# team they were on for that season, so i just want the total for the year, regardless of team

player_bios = pd.read_html('https://www.basketball-reference.com/leagues/NBA_2020_per_game.html')[0].drop_duplicates(keep = 'first')

# dropping all the unnecessary columns that i do not want to use in the final data
player_bios.drop(columns = ['Rk', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', 
                            '2P', '2PA', 'eFG%', 'FT', 'FTA', 'FT%', 'TRB'], inplace = True)

# dropping every 20 rows which shows the column names again (from the website)
player_bios = player_bios[player_bios['Player'] != 'Player']

# changing the values from strings to all numeric values
player_bios = player_bios.apply(pd.to_numeric, errors = 'ignore')


In [78]:
# fixing the multiple position players using knowledge of the game to pick their positions

positions = []

for x in player_bios.Pos:
    if x == 'SF-SG':
        positions.append('SF')
    elif x == 'C-PF':
        positions.append('PF')
    elif x == 'SF-PF':
        positions.append('PF')
    elif x == 'PF-SF':
        positions.append('SF')
    else:
        positions.append(x)

In [81]:
# replacing the players positions with the new list that doesnt have have duplicates

player_bios.Pos = positions

## Concating the basketball reference player bios with the shooting stats

In [84]:
# applying the cleaning for both of the dataframes.
# the punctuation replacment is for both, the special characters are mostly for the basketball reference players

def clean_names(name):
    name = re.sub('[%s]' % re.escape(string.punctuation), '', name)
    name = re.sub('ć', 'c', name)
    name = re.sub('Ć', 'C', name)
    name = re.sub('Ž', 'Z', name)
    name = re.sub('č', 'c', name)
    name = re.sub('Č', 'C', name)
    return name

# creating an object that applys the cleaning function to all of the rows of a column instead of on the column
cleaning = lambda x: clean_names(x)

In [94]:
# applying the cleaner to both the shot charts/defensive stats from nba_api and basketball reference

player_stats['Player'] = pd.DataFrame(player_stats.Player.apply(cleaning))
player_bios['Player'] = pd.DataFrame(player_bios.Player.apply(cleaning))

In [89]:
# merging the 2 dataframes on the player column 

all_stats = pd.merge(player_stats, player_bios, on = 'Player')

In [92]:
# filling any remaining na values with 0s

all_stats = all_stats.fillna(0)

In [93]:
all_stats.to_csv('csv_files/Player_comparison_df.csv')