In [1]:
import pandas as pd
import numpy as np

import requests

import re
import string

from py_ball import league, image, boxscore, player

# static lists to get players and teams
from nba_api.stats.static import players
from nba_api.stats.static import teams

# endpoints to get the list of players on every team and shot chart for each player
from nba_api.stats.endpoints import shotchartdetail
from nba_api.stats.endpoints import teamplayerdashboard

# getting nba defensive statistics
from nba_api.stats.endpoints import boxscoredefensive
from nba_api.stats.endpoints import defensehub
from nba_api.stats.endpoints import leaguedashptdefend
from nba_api.stats.endpoints import playerdashptshotdefend
from nba_api.stats.endpoints import boxscoretraditionalv2


# getting nba player info for their position
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.endpoints import leaguedashplayerstats
from nba_api.stats.endpoints import playerdashboardbyyearoveryear

# Browser-style HTTP request headers targeting stats.nba.com.
# Not referenced by any of the cells visible in this notebook.
HEADERS = {
    'Connection': 'keep-alive',
    'Host': 'stats.nba.com',
    'Origin': 'http://stats.nba.com',
    'Upgrade-Insecure-Requests': '1',
    'Referer': 'stats.nba.com',
    'x-nba-stats-origin': 'stats',
    'x-nba-stats-token': 'true',
    'Accept-Language': 'en-US,en;q=0.9',
    'X-NewRelic-ID': 'VQECWF5UChAHUlNTBwgBVw==',
    # adjacent string literals are joined by the parser into one User-Agent value
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6)'
        ' AppleWebKit/537.36 (KHTML, like Gecko)'
        ' Chrome/81.0.4044.129 Safari/537.36'
    ),
}

## Finding all shot charts for all players from 1999-00 season to present

In [2]:
# static team listing from nba_api; each entry is indexed by 'id' in the next cell
# to collect the team ids

teams_dict = teams.get_teams()

In [3]:
# obtaining the list of team ids, one for each team in the team listing

team_ids = [team['id'] for team in teams_dict]

In [None]:
# NBA season labels from 2019-20 back to 1999-00 (newest first), in the 'YYYY-YY'
# form that is passed as the `season` argument when the team dashboards are requested

seasons = ['{}-{:02d}'.format(start_year, (start_year + 1) % 100)
           for start_year in range(2019, 1998, -1)]

In [None]:
# The except clause below names JSONDecodeError, which was never imported anywhere in
# the notebook; without this import the first failed request raised NameError instead
# of being handled.
from json import JSONDecodeError

# inclusive window of game dates kept from every downloaded shot chart
FIRST_GAME_DATE = pd.Timestamp('1999-11-02')
LAST_GAME_DATE = pd.Timestamp('2020-03-11')

# list to append all the dataframes of player shot charts to
all_seasons_list = []

# (team id, player id) pairs that were already downloaded. The shot chart is requested
# per team, so remembering the player id alone skipped a traded player's shots for every
# team after the first one encountered.
# NOTE(review): assumes ShotChartDetail only returns shots taken for the given team id
# - confirm against the nba_api endpoint documentation.
used_players = set()

# (team id, season) pairs whose requests could not be decoded, kept so they can be
# inspected or retried instead of disappearing silently
failed_requests = []


# looping through each team id in the list of team ids
for team in team_ids:

    # looping through each season to grab the players from the team in that season
    for season in seasons:

        try:
            # dashboard of the team for that season, with one row per player
            team_players = teamplayerdashboard.TeamPlayerDashboard(team, season = season)

            # player ids of everyone listed on that dashboard
            player_ids = team_players.players_season_totals.get_data_frame().PLAYER_ID

            for player_id in player_ids:

                # this player's chart for this team was already fetched in another season
                if (team, player_id) in used_players:
                    continue

                # shot chart for this player on this team. No season is passed here.
                # NOTE: per the notes below this cell, the result only contained made
                # shots - presumably the endpoint's default measure; verify in nba_api.
                shot_chart = shotchartdetail.ShotChartDetail(team, player_id)

                # turning that shot chart object into a dataframe
                shot_chart_df = shot_chart.get_data_frames()[0]

                # GAME_DATE arrives as a 'YYYYMMDD' string; unparseable values become NaT
                # (errors='ignore' is deprecated and would leave strings in the index)
                shot_chart_df['GAME_DATE'] = pd.to_datetime(shot_chart_df['GAME_DATE'], format = '%Y%m%d', errors = 'coerce')
                shot_chart_df.set_index('GAME_DATE', inplace = True)

                # keep only games inside the date window; a boolean mask is used because
                # label slicing needs a sorted index, and NaT rows compare False and drop out
                in_window = (shot_chart_df.index >= FIRST_GAME_DATE) & (shot_chart_df.index <= LAST_GAME_DATE)
                shot_chart_all = shot_chart_df[in_window]

                # appending it to a list to later concat into a single DataFrame
                all_seasons_list.append(shot_chart_all)

                # remember the pair so later seasons do not request it again
                used_players.add((team, player_id))

        except JSONDecodeError:
            # a response in this team/season could not be decoded; the remaining players
            # of this team/season are skipped, so record it rather than hide it
            failed_requests.append((team, season))

In [None]:
# concatenating the list of per-player shot chart dataframes into a single dataframe
# (rows are stacked; GAME_DATE stays as the index)

all_seasons = pd.concat(all_seasons_list)

In [None]:
# saving the combined shot charts to a csv; the GAME_DATE index is written out as well

all_seasons.to_csv('csv_files/2000-2020_shot_charts.csv')

This gave me an issue, as it only output the made shots. I used it temporarily for my data, but later took just the game IDs to use with the py_ball library, which gave all the shots I wanted. I will eventually do the same for the 2019-20 season when comparing players later on.

I ended up sticking with this approach: gathering the data with py_ball took so long, even for a single season, that it was impossible to use within the timeframe. I will develop and run that for just the 2019-20 season to see if it helps.

# Cleaning the NBA shot charts to the amount of made shots for each shooting location

## Messing with the data to try and get an idea of court locations

In [5]:
# distinct SHOT_ZONE_BASIC labels that occur in the collected shot charts

all_seasons['SHOT_ZONE_BASIC'].unique()

array(['Restricted Area', 'Right Corner 3', 'In The Paint (Non-RA)',
       'Left Corner 3', 'Mid-Range', 'Above the Break 3', 'Backcourt'],
      dtype=object)

For x and y location, 10 units in the LOC_X or LOC_Y value is equal to 1 ft on the court. 

-250, -50 = left corner

250, -50 = right corner

0, 238 = top of the key (any value for LOC_Y that is 238 or higher would be great)

Shot Areas: 

### 3 Pointers

X(-250 to -220) & Y(-50 to 87) = left corner 3

X(220 to 250) & Y(-50 to 87) = right corner 3

X(-250 to -80) & Y(87 to 280) & Shot_zone(above the break) = left wing 3

X(80 to 250) & Y(87 to 280) & Shot_zone(above the break) = right wing 3

X(-80 to 80) and Y(87 to 280) & Shot_zone(above the break) = Center 3

X(any) and Y(280 to 350) = Deep 3

X(any) and Y(350+) = Heave

### Mid Range 

X(-220 to -150) & Y(-50 to 90) = left baseline deep midrange

X(150 to 220) & Y(-50 to 90) = right baseline deep midrange

((X(-220 to -150) & Y(90+)) or (X(-150 to -80) & Y(150+))) & Shotzone(Mid-Range) = left wing deep mid ranger

((X(150 to 220) & Y(90+)) or (X(80 to 150) & Y(150+))) & Shotzone(Mid-Range) = right wing deep mid ranger

X(-150 to -80) & Y(-50 to 90) = short left baseline midranger

X(80 to 150) & Y(-50 to 90) = short right baseline midranger

X(-150 to -80) & Y(90 to 150) = short left wing mid ranger

X(80 to 150) & Y(90 to 150) = short right wing mid ranger

X(-80 to 80) & Y(210+) & Shotzone(Mid-Range) = deep center midranger

X(-80 to 80) & Y(150 to 210) = short center midrange

### Paint

X(-80 to 80) & Y(90 to 150) = Floater

X(-80 to 80) & Y(-50) & Shotzone(In The Paint (Non-RA)) = layup/in the paint

Shotzone(restricted area) = restricted area

## Organizing the data and using the function to group the data in the format i am looking for

In [13]:
# reset the index so GAME_DATE becomes a regular column again; the result is the frame
# handed to the grouping steps below

new_df = all_seasons.reset_index()

In [15]:
# One row per player name (the index) holding that player's PLAYER_ID; the id is only
# carried along in case it is needed for joins later on.
# first() keeps a real id. The previous mean() turned the ids into floats and, when two
# players share a name, produced an average of their ids that belongs to nobody.

players_df = new_df.groupby('PLAYER_NAME')['PLAYER_ID'].first().to_frame()


In [1]:
# dataframe needs to be in the format of the shot chart data that is provided by the nba api

from mod_5_functions import shooting_positions

In [None]:
# shooting_positions returns the amount of shots made by each player for each court
# position, as twenty separate results in a fixed order; they are unpacked here into
# one name per position

(
    left_corner_3, right_corner_3, left_wing_3, right_wing_3, center_3, deep_3, heave,
    left_baseline_deep_2, right_baseline_deep_2, left_wing_deep_2, right_wing_deep_2,
    left_baseline_short_2, right_baseline_short_2, left_wing_short_2, right_wing_short_2,
    deep_center_2, short_center_2,
    floater_range, in_the_paint, restricted_area,
) = shooting_positions(new_df)


In [18]:
# placing the player ids and every per-position result side by side, aligned on their
# shared index, to form one shot profile table

player_shot_profile = pd.concat(
    [
        players_df,
        # three pointers
        left_corner_3, right_corner_3, left_wing_3, right_wing_3, center_3, deep_3, heave,
        # mid range
        left_baseline_deep_2, right_baseline_deep_2, left_wing_deep_2, right_wing_deep_2,
        left_baseline_short_2, right_baseline_short_2, left_wing_short_2, right_wing_short_2,
        deep_center_2, short_center_2,
        # paint
        floater_range, in_the_paint, restricted_area,
    ],
    axis = 'columns',
)



In [19]:
# a missing value means the player had no shot from that area (e.g. big men and 3s),
# so every NaN is replaced with 0

player_shot_profile = player_shot_profile.fillna(0)

In [20]:
# saving the shot profiles to a csv file; this file is read back in before the merge
# with the basketball reference data

player_shot_profile.to_csv('csv_files/Shot_Profiles.csv')

## Gathering remaining data from Basketball Reference

#### The NBA API did not have a clean method to get the basic stats that are readily available from Basketball Reference. This led to some issues: the names of foreign players do not match exactly, because Basketball Reference keeps the accented letters while the NBA API does not. It is worth losing some data points for the data I get from Basketball Reference.

In [42]:
# getting an idea of which columns the 2020 per-game page provides: the first table
# parsed from the page is used, and only its column labels are displayed

pd.read_html('https://www.basketball-reference.com/leagues/NBA_2020_per_game.html')[0].drop_duplicates(keep = 'first').columns

Index(['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [43]:
# season years 2020 down to 2000 as strings, substituted into the per-game page URL
# in the scraping loop; note that this rebinds the earlier `seasons` list of
# 'YYYY-YY' labels

seasons = [str(season_year) for season_year in range(2020, 1999, -1)]

In [44]:
# empty list that collects one dataframe per season in the scraping loop

player_bio_list = []

In [45]:
# for every season: read the first table of that season's per-game page, drop exact
# duplicate rows, remove the columns that are not used later and collect the result

for season in seasons:
    season_url = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'.format(season)
    player_bios = pd.read_html(season_url)[0].drop_duplicates(keep = 'first')
    player_bios = player_bios.drop(columns = [
        'Rk', 'Age', 'Tm', 'G', 'GS', 'MP',
        'FG', 'FGA', 'FG%',
        '3P', '3PA',
        '2P', '2PA',
        'eFG%',
        'FT', 'FTA', 'FT%',
        'TRB',
    ])
    player_bio_list.append(player_bios)

In [46]:
# concatenating all the per-season dataframes in the list into a single dataframe
# (row labels of the individual tables are kept, so they repeat across seasons)

player_bio = pd.concat(player_bio_list)

In [47]:
# the website repeats its header inside the table body; such rows carry the literal
# text 'Player' in the Player column and are removed here
player_bio = player_bio.loc[player_bio['Player'].ne('Player')]

In [48]:
# Changing the stat columns from strings to ints/floats.
# errors='ignore' is deprecated in recent pandas and silently kept a whole column as
# text when a single value failed to parse. The two text columns are therefore passed
# through untouched, and an unparseable stat becomes NaN instead.

player_bio = player_bio.apply(
    lambda column: column if column.name in ('Player', 'Pos')
    else pd.to_numeric(column, errors = 'coerce')
)

In [49]:
# Averaging every numeric column per player across all of the player's rows.
# numeric_only=True is required on pandas >= 2.0, where mean() raises a TypeError on
# the text 'Pos' column instead of silently leaving it out.

player_bio_agg = player_bio.groupby('Player').mean(numeric_only = True)

In [50]:
# function to find the most common item in a list
# this is for players who are labeled as different positions over the course of their careers, i am only taking the position 
# they were referred to the most out of any of them

def most_frequent(List):
    """Return the item that occurs most often in ``List``.

    When several items share the highest count, the one that appears first in
    ``List`` wins.

    Parameters
    ----------
    List : list
        Non-empty sequence of items (position labels in this notebook).

    Returns
    -------
    The most frequent item.

    Raises
    ------
    IndexError
        If ``List`` is empty.
    """
    from collections import Counter

    if not List:
        raise IndexError('most_frequent() needs at least one item')

    try:
        # one counting pass instead of calling List.count() for every element
        occurrences = Counter(List)
    except TypeError:
        # unhashable items cannot be counted in a dict; fall back to the pairwise scan
        return max(List, key=List.count)

    # keys iterate in first-seen order and max() keeps the first maximum,
    # so ties resolve to the earliest item
    return max(occurrences, key=occurrences.get)

In [51]:
# one row per player whose 'Pos' value is every listed position of that player,
# joined into a single string with ', ' as the separator

player_bio_pos = player_bio.groupby('Player')['Pos'].agg(', '.join).to_frame()

In [52]:
# each joined position string is split back into a list on ', ' and reduced to the
# position that occurs most often for that player

player_bio_pos['Pos'] = player_bio_pos['Pos'].map(
    lambda joined_positions: most_frequent(joined_positions.split(', '))
)

In [53]:
# concatenating the 2 dataframes side by side; both are indexed by Player, so the
# position and the averaged stats of each player end up in the same row.
# note: this rebinds player_bios, which the scraping loop used for one season's table

player_bios = pd.concat([player_bio_pos, player_bio_agg], axis = 1)

In [54]:
# move Player out of the index into a regular column so it can be cleaned and merged on
player_bios = player_bios.reset_index()

In [55]:
# saving each player's most frequent position and averaged stats to csv

player_bios.to_csv('csv_files/Player_bios.csv')

## Cleaning up and combining the 2 data frames

#### As noted in the Basketball Reference data-gathering section, the names in these 2 sources don't match exactly, mostly for foreign players. So it is important to first clean up the names: replace some common foreign characters and remove punctuation, so that players like PJ Tucker (bbref) and P.J. Tucker (NBA API) match. This is done using some basic regex code.

In [56]:
# reloading the saved shot profiles; the first csv column (the player names written
# as the index) becomes the index again
player_shot_profile = pd.read_csv('csv_files/Shot_Profiles.csv', index_col = 0)

In [57]:
# turn the player-name index into a regular column (inserted as the first column)
player_shot_profile = player_shot_profile.reset_index()

In [58]:
# Renaming the column of player names to match that of the basketball reference page.
# After reset_index() that column is called 'index' when the saved index had no name,
# or 'PLAYER_NAME' when the groupby key name survived the csv round trip. Both are
# mapped, so the rename can no longer silently do nothing and break the merge later.

player_shot_profile = player_shot_profile.rename(
    columns = {'index': 'Player', 'PLAYER_NAME': 'Player'}
)

In [59]:
# applying the cleaning for both of the dataframes.
# the punctuation replacment is for both, the special characters are mostly for the basketball reference players

def clean_names(name):
    """Normalise a player name so both data sources spell it the same way.

    Accented letters are reduced to their base letter (e.g. 'ć' -> 'c',
    'Ž' -> 'Z', 'ö' -> 'o') and every ASCII punctuation character is removed
    (e.g. 'P.J. Tucker' -> 'PJ Tucker'). Letters that have no Unicode
    decomposition (such as 'đ' or 'ø') are left unchanged.

    Parameters
    ----------
    name : str
        Player name as found in either data source.

    Returns
    -------
    str
        The cleaned name.
    """
    import unicodedata

    # split every accented letter into base letter + combining mark(s), then drop the
    # marks; this covers the five letters that used to be replaced one by one, plus
    # every other accented letter
    decomposed = unicodedata.normalize('NFKD', name)
    name = ''.join(char for char in decomposed if not unicodedata.combining(char))

    # remove punctuation after normalising, so punctuation produced by the
    # decomposition is stripped as well
    name = re.sub('[%s]' % re.escape(string.punctuation), '', name)
    return name

# single-argument callable used with Series.apply to clean a column row by row
cleaning = clean_names

In [60]:
# applying the cleaner to the player names of both the shot profiles from nba_api and
# the player bios from basketball reference

player_shot_profile['Player'] = player_shot_profile['Player'].apply(cleaning)
player_bios['Player'] = player_bios['Player'].apply(cleaning)

In [61]:
# joining the shot profiles with the basketball reference stats on the cleaned player
# name; merge defaults to an inner join, so a player found in only one of the two
# frames is left out

finalized_df = player_shot_profile.merge(player_bios, on = 'Player')

In [64]:
# any value still missing after the merge is replaced with 0
finalized_df = finalized_df.fillna(0)

In [65]:
# saving the merged table as the final dataframe to use for eda and modeling
# (the default integer row index is written as the first csv column)

finalized_df.to_csv('csv_files/Final_df.csv')