In [1]:
#Documentation
#https://github.com/vishaalagartha/basketball_reference_scraper/blob/master/API.md

In [130]:
import pandas as pd
from tqdm import tqdm
import pickle
from datetime import datetime, timedelta
import sys
from dateutil.relativedelta import relativedelta
import unicodedata

sys.path.append('./basketball_reference_scraper/basketball_reference_scraper')

In [3]:
# Regular-season date windows, one [start, end] pair of 'YYYY-MM-DD' strings per season.
# Index 0 is the season that starts in 2016; the functions below look a season up with
# `starting_year - 2016`. Where a play-in was held, the end date is pushed out to its
# last day (see the per-row notes).
reg_season_dates = [
    ["2016-10-25", "2017-04-12"],  # No play-in
    ["2017-10-17", "2018-04-11"],  # No play-in
    ["2018-10-16", "2019-04-10"],  # No play-in
    ["2019-10-22", "2020-08-15"],  # Play-in: Aug 15 (Blazers vs. Grizzlies)
    ["2020-12-22", "2021-05-21"],  # Play-in: May 18–21
    ["2021-10-19", "2022-04-15"],  # Play-in: Apr 12–15
    ["2022-10-18", "2023-04-14"],  # Play-in: Apr 11–14
    ["2023-10-24", "2024-04-19"]   # Play-in: Apr 16–19
]

In [4]:
def save_pickle(structure, name):
    """Pickle `structure` into the file `<name>.pkl` (the extension is appended here)."""
    with open(f'{name}.pkl', 'wb') as handle:
        handle.write(pickle.dumps(structure))

def load_pickle(file):
    """Read the pickle stored at path `file` and return the object it contains.

    Only use this on files this notebook wrote itself: unpickling can execute
    arbitrary code.
    """
    with open(file, 'rb') as handle:
        raw_bytes = handle.read()
    return pickle.loads(raw_bytes)

In [131]:
def normalize_name(name):
    """Return `name` with diacritics removed.

    The string is NFKD-decomposed and every combining mark is dropped, so
    accented letters collapse to their base letter.
    """
    decomposed = unicodedata.normalize('NFKD', name)
    base_characters = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return ''.join(base_characters)

In [5]:
# Team codes used as the keys of every daily snapshot built by create_injury_timeline.
# 'NBA' is the code Teams.team_abbreviations assigns to 'Free Agent'.
nba_teams = {
    'ATL', 'BOS', 'BRK', 'CHI', 'CHA', 'CLE', 'DAL', 'DEN', 'DET', 'GSW',
    'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK',
    'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS', 'NBA'
}

In [6]:
class Teams:
    """Lookup tables from team names to three-letter team codes.

    team_city_abbreviations is keyed by the full team name; it is the table
    find_playoff_rotation_level_players reads to build its get_roster_stats call.
    team_abbreviations is keyed by the nickname only; it is the table
    create_injury_timeline reads for the injury CSV's Team column.
    """

    # Full team name -> code.
    # NOTE(review): Charlotte Hornets ('CHO') and Phoenix Suns ('PHO') use different codes
    # here than in team_abbreviations / nba_teams ('CHA', 'PHX'). Presumably these are the
    # codes the scraper expects — confirm this difference is intentional.
    team_city_abbreviations = {
        'Atlanta Hawks': 'ATL',
        'Boston Celtics': 'BOS',
        'Brooklyn Nets': 'BRK',
        'Charlotte Bobcats': 'CHA',
        'Charlotte Hornets': 'CHO',
        'Chicago Bulls': 'CHI',
        'Cleveland Cavaliers': 'CLE',
        'Dallas Mavericks': 'DAL',
        'Denver Nuggets': 'DEN',
        'Detroit Pistons': 'DET',
        'Golden State Warriors': 'GSW',
        'Houston Rockets': 'HOU',
        'Indiana Pacers': 'IND',
        'LA Clippers': 'LAC',
        'Los Angeles Clippers': 'LAC',
        'LA Lakers': 'LAL',
        'Los Angeles Lakers': 'LAL',
        'Memphis Grizzlies': 'MEM',
        'Miami Heat': 'MIA',
        'Milwaukee Bucks': 'MIL',
        'Minnesota Timberwolves': 'MIN',
        'New Orleans Pelicans': 'NOP',
        'New York Knicks': 'NYK',
        'Oklahoma City Thunder': 'OKC',
        'Orlando Magic': 'ORL',
        'Philadelphia 76ers': 'PHI',
        'Phoenix Suns': 'PHO',
        'Portland Trail Blazers': 'POR',
        'Sacramento Kings': 'SAC',
        'San Antonio Spurs': 'SAS',
        'Toronto Raptors': 'TOR',
        'Utah Jazz': 'UTA',
        'Washington Wizards': 'WAS',
        'Free Agent': 'NBA'
    }

    # Team nickname -> code. Every value here is a member of nba_teams.
    team_abbreviations = {
        'Hawks': 'ATL',
        'Celtics': 'BOS',
        'Nets': 'BRK',
        'Bobcats': 'CHA',
        'Hornets': 'CHA',
        'Bulls': 'CHI',
        'Cavaliers': 'CLE',
        'Mavericks': 'DAL',
        'Nuggets': 'DEN',
        'Pistons': 'DET',
        'Warriors': 'GSW',
        'Rockets': 'HOU',
        'Pacers': 'IND',
        'Clippers': 'LAC',
        'Lakers': 'LAL',
        'Grizzlies': 'MEM',
        'Heat': 'MIA',
        'Bucks': 'MIL',
        'Timberwolves': 'MIN',
        'Pelicans': 'NOP',
        'Knicks': 'NYK',
        'Thunder': 'OKC',
        'Magic': 'ORL',
        '76ers': 'PHI',
        'Suns': 'PHX',
        'Trail Blazers': 'POR',
        'Kings': 'SAC',
        'Spurs': 'SAS',
        'Raptors': 'TOR',
        'Jazz': 'UTA',
        'Wizards': 'WAS',
        'Free Agent': 'NBA'
    }

In [7]:
from teams import *
from players import *
from seasons import *
from shot_charts import *
from box_scores import *

In [8]:
def get_all_playoff_teams(start_year=2014, end_year=2024):
    """Collect the teams that appear as visitors in each year's playoff schedule.

    Args:
        start_year: first year passed to get_schedule (inclusive).
        end_year: last year passed to get_schedule (inclusive).

    Returns:
        A list with one entry per year, in order; index 0 corresponds to
        `start_year`. Each entry is the list of distinct values of the
        schedule's 'VISITOR' column, in first-appearance order.
    """
    playoff_pictures = []
    for year in range(start_year, end_year + 1):
        year_playoff_scores = get_schedule(year, playoffs=True)
        # One pass collects every distinct visitor. The previous
        # `while len(...) < 16` wrapper added nothing when 16 teams were present
        # and spun forever when the schedule listed fewer than 16 visitors.
        playoff_teams = list(dict.fromkeys(year_playoff_scores['VISITOR']))
        playoff_pictures.append(playoff_teams)
    return playoff_pictures

In [9]:
def find_playoff_rotation_level_players(team_name):
    """Return the playoff per-game rows of players with more than 10.0 in 'MP'.

    Args:
        team_name: a '<year>_<full team name>' tag, e.g. '2019_Toronto Raptors'.

    Returns:
        The rows of the get_roster_stats frame whose 'MP' value exceeds 10.0.
    """
    season, franchise = team_name.split('_')
    abbreviation = Teams.team_city_abbreviations[franchise]
    print(abbreviation)
    playoff_stats = get_roster_stats(
        abbreviation,
        int(season),
        data_format='PER_GAME',
        playoffs=True,
    )
    plays_rotation_minutes = playoff_stats['MP'] > 10.0
    return playoff_stats[plays_rotation_minutes]

In [10]:
def create_dictionary_fprlp(teams):
    """Map every team tag in `teams` to its list of playoff rotation player names.

    Each tag is passed to find_playoff_rotation_level_players and the values of
    the returned frame's 'PLAYER' column become that tag's list.
    """
    # Reserve every key first so the result keeps the input order of the tags.
    rosters = dict((tag, []) for tag in teams)
    for tag in teams:
        rotation_frame = find_playoff_rotation_level_players(tag)
        rosters[tag] = list(rotation_frame['PLAYER'])
    return rosters

In [None]:
# Disabled one-off pipeline steps, kept as string literals so that they do not execute.
# NOTE(review): the second snippet reads `all_playoff_teams`, which the first snippet never
# assigns (it builds `all_study_teams` / `flat_tagged`), and it passes `injuries_2016_2023`
# and `csv_injuries`, which are only assigned in later cells. These snippets will not run
# as written if they are re-enabled in this position.
'''
all_study_teams = get_all_playoff_teams()
teams_tagged = [
    [f"{2014 + i}_{team}" for team in team_list]
    for i, team_list in enumerate(all_study_teams)
]
flat_tagged = [team for season in teams_tagged for team in season]
save_pickle(flat_tagged, 'playoff_teams')
'''

'''
rotation_players = create_dictionary_fprlp(all_playoff_teams)
load_management = log_load_management(rotation_players, injuries_2016_2023, csv_injuries)
save_pickle(load_management, 'load_management')
'''

'''
for year in injuries_2016_2023:
    injury_list = injuries_2016_2023[year][0]
    rest_list = injuries_2016_2023[year][1]

    injury_list['Player'] = injury_list['Player'].apply(normalize_name)
    rest_list['Player'] = rest_list['Player'].apply(normalize_name)

for team in rotation_players:
    for index, name in enumerate(rotation_players[team]):
        rotation_players[team][index] = normalize_name(name)
'''

'\nrotation_players = create_dictionary_fprlp(all_playoff_teams)\n'

In [181]:
# Load previously saved intermediate results from disk.
# NOTE(review): only 'playoff_teams.pkl' has a visible (disabled) save step in this notebook;
# the cells that wrote the two '*_normalized.pkl' files are not shown here.
all_playoff_teams = load_pickle('playoff_teams.pkl')
rotation_players = load_pickle('rotation_players_normalized.pkl')
injuries_2016_2023 = load_pickle('seasons_normalized.pkl')
# The pickled season data is keyed by the season's starting year,
# while the injury CSV is indexed by the ending year.

In [None]:
# Generational suffixes removed from the player-name columns of the injury CSV.
suffixes = ['Jr.', 'Jr', 'Sr.', 'Sr', 'II', 'III', 'IV', 'V']

# Whole-word suffix matcher. The dots are escaped so that 'Jr.' / 'Sr.' only match a
# literal period; unescaped, '.' matched any character and could eat part of a name.
_suffix_pattern = r'\b(?:' + '|'.join(s.replace('.', r'\.') for s in suffixes) + r')\b'

csv_injuries = pd.read_csv('2013-2024_injury_stats.csv')

# Same clean-up for both name columns: strip suffixes, then periods and commas, then
# surrounding whitespace.
for _name_column in ('Relinquished', 'Acquired'):
    csv_injuries[_name_column] = (
        csv_injuries[_name_column]
        .str.replace(_suffix_pattern, '', regex=True)
        .str.replace(r'[.,]', '', regex=True)
        .str.strip()
    )

csv_injuries['Team'] = csv_injuries['Team'].str.strip()
csv_injuries['Date'] = pd.to_datetime(csv_injuries['Date'])

# Keep only rows on or after 2016-10-25, the first start date in reg_season_dates.
csv_injuries = csv_injuries[csv_injuries['Date'] >= pd.to_datetime('2016-10-25')]

# Drop the saved index column. The earlier `csv_injuries.drop([...], axis=1)` call
# discarded its result and therefore did nothing, so it has been removed.
csv_injuries = csv_injuries.drop(columns=['Unnamed: 0'])

In [None]:
# NOTE(review): log_load_management is defined in a later cell of this notebook, so on a
# fresh kernel that cell has to be executed before this one.
load_management = log_load_management(rotation_players, injuries_2016_2023, csv_injuries)
save_pickle(load_management, 'load_management_fix_normalized_3')

# Reload the saved result, remove every (name, value) pair whose name is 'Team Totals',
# and save the filtered dictionary under a new file name.
load_management_stats = load_pickle('load_management_fix_normalized_3.pkl')
for team in load_management_stats:
    load_management_stats[team] = [pair for pair in load_management_stats[team] if pair[0] != 'Team Totals']
save_pickle(load_management_stats, 'load_management_final_fix')

In [200]:
# Final per-team lists of (player, value) pairs with the 'Team Totals' entries removed.
lms = load_pickle('load_management_final_fix.pkl')

In [217]:
lms['2019_Toronto Raptors']

[('Kawhi Leonard', 7),
 ('Kyle Lowry', 1),
 ('Pascal Siakam', 0),
 ('Marc Gasol', 0),
 ('Danny Green', 0),
 ('Fred VanVleet', 0),
 ('Serge Ibaka', 1),
 ('Norman Powell', 0)]

In [187]:
# Spot check: parse the missed-date intervals for one player in the season starting 2016.
# NOTE(review): parse_days_missed is defined in a later cell; run that cell first.
df = injuries_2016_2023[2016][0][injuries_2016_2023[2016][0]['Player'] == 'Tony Parker']
_, target = parse_days_missed('Tony Parker', df, 2016)

In [189]:
# Spot check: count rest games for the same player that fall outside the `target` intervals.
# NOTE(review): double_check_csv is defined in a later cell; run that cell first.
rest_days = double_check_csv('Tony Parker', target, 2016, csv_injuries)

You searched for "Tony Parker"
17 results found.
/players/p/parketo01


  df = pd.read_html(str(table))[0]


In [193]:
target

[['2016-11-04', '2016-11-10'],
 ['2016-11-21', '2016-11-22'],
 ['2016-12-06', '2016-12-07'],
 ['2016-12-23', '2017-12-07'],
 ['2017-01-19', '2017-01-26'],
 ['2017-03-01', '2017-03-02'],
 ['2017-03-09', '2017-03-17']]

In [191]:
# Spot check: number of in-season CSV rows for this player whose notes mention both 'rest' and 'DTD'.
# NOTE(review): find_all_rests is defined in a later cell; run that cell first.
find_all_rests('Tony Parker', 2016, csv_injuries)

5

In [196]:
def find_all_rests(player, season, csv_injuries):
    """Count a player's in-season CSV rows whose notes mention both 'rest' and 'DTD'.

    Args:
        player: player name; matched as a literal substring of 'Relinquished'.
        season: starting year of the season (2016 selects reg_season_dates[0]).
        csv_injuries: frame with 'Date' (datetime), 'Relinquished' and 'Notes' columns.

    Returns:
        The number of matching rows dated within the season window (inclusive).

    Raises:
        ValueError: if `season` is before 2016. A negative index would otherwise
            wrap around and silently select a different season's window.
    """
    index = season - 2016
    if index < 0:
        raise ValueError("Season cannot be before 2016-2017 season")
    reg_season_start = pd.to_datetime(reg_season_dates[index][0])
    reg_season_end = pd.to_datetime(reg_season_dates[index][1])  # Includes Play-In

    in_season = (csv_injuries['Date'] >= reg_season_start) & (csv_injuries['Date'] <= reg_season_end)
    season_injury_list = csv_injuries[in_season]

    # regex=False: names can contain regex metacharacters such as '.'.
    is_player = season_injury_list['Relinquished'].str.contains(player, na=False, regex=False)
    player_season_injuries = season_injury_list[is_player]

    # na=False: a missing note would otherwise put NaN into the boolean mask.
    notes = player_season_injuries['Notes']
    is_rest = notes.str.contains('rest', na=False) & notes.str.contains('DTD', na=False)
    return int(is_rest.sum())

def log_load_management(playoff_rotations, pickle, csv, debug = 0):
    """Build, per playoff team, a list of (player, rest games) tuples.

    Args:
        playoff_rotations: dict mapping '<playoff year>_<team>' tags to lists of
            player names.
        pickle: mapping keyed by the season's starting year; each value is indexed
            with [0] for the injury frame and [1] for the rest frame. (The name
            shadows the `pickle` module inside this function only.)
        csv: injury CSV frame forwarded to find_all_rests and double_check_csv.
        debug: tags whose playoff year is below this value are skipped.

    Returns:
        A dict with the same keys as `playoff_rotations`. Tags with a playoff
        year before 2017 (or below `debug`) keep an empty list.
    """
    load_management = {team: [] for team in playoff_rotations.keys()}  # lists of tuples
    for team in tqdm(playoff_rotations.keys(), desc="Processing Teams"):
        playoff_year = int(team.split('_')[0])
        if playoff_year < 2017 or playoff_year < debug:
            continue

        # The pickled data is keyed by the season's starting year.
        season = playoff_year - 1
        injury_info = pickle[season][0]
        rest_info = pickle[season][1]

        for player in playoff_rotations[team]:
            rest_column = rest_info[rest_info['Player'] == player]['Games Missed']
            all_rests = find_all_rests(player, season, csv)
            if rest_column.empty:
                # No entry in the rest frame: fall back to the CSV count alone.
                load_management[team].append((player, all_rests))
                continue

            rest_days = int(rest_column.iloc[0])  # initial value for rest days
            _, days_missed_intervals = parse_days_missed(player, injury_info, season)
            try:
                rest_days += double_check_csv(player, days_missed_intervals, season, csv)
            except Exception as error:
                # Best effort: keep the initial value, but report what went wrong
                # instead of discarding the exception.
                print(player)
                print("Season: " + str(season) + "-" + str(season + 1))
                print(f"double_check_csv failed: {error!r}")
            load_management[team].append((player, max(rest_days, all_rests)))
    return load_management
            
def parse_days_missed(player, injury_info, year):
    dm_column = injury_info[injury_info['Player'] == player]['Days Missed'].iloc[0].split(' ')
    days_missed = dm_column[0]

    dates_missed_strs = dm_column[1:]
    dates_missed_arr = [str.split('-') for str in dates_missed_strs]

    i = 1
    for intervals in dates_missed_arr:
        # Assume intervals[0] is a partial date like '01/10' (MM/DD)
        try:
            # Try parsing normally (if already complete)
            parsed = pd.to_datetime(intervals[0], format='%m/%d/%y')
            intervals[0] = parsed.strftime('%Y/%m/%d')  # Reformat
        except ValueError:
            # Handle partial date — needs year appended
            partial_date = intervals[0]  # e.g., '01/10'
            
            # Try both year and year+1, check which one fits the season
            try_this_year = f"{partial_date}/{year}"
            try_next_year = f"{partial_date}/{year + 1}"

            dt_this_year = pd.to_datetime(try_this_year, format='%m/%d/%Y', errors='coerce')
            dt_next_year = pd.to_datetime(try_next_year, format='%m/%d/%Y', errors='coerce')

            # Season interval
            season_start = pd.to_datetime(reg_season_dates[year - 2016][0])
            season_end = pd.to_datetime(reg_season_dates[year - 2016][1])

            if season_start <= dt_this_year <= season_end:
                intervals[0] = dt_this_year.strftime('%Y/%m/%d')
            elif season_start <= dt_next_year <= season_end:
                intervals[0] = dt_next_year.strftime('%Y/%m/%d')
            else:
                raise ValueError(f"Could not determine correct year for interval: {intervals[0]}")
        if i != len(dates_missed_arr):
            intervals[1] = intervals[1][:-1] + f"/{datetime.strptime(intervals[0], '%Y/%m/%d').year}"
        else:
            intervals[1] = intervals[1] + f"/{datetime.strptime(intervals[0], '%Y/%m/%d').year}"
        intervals[1] = datetime.strptime(intervals[1], '%m/%d/%Y').strftime('%Y/%m/%d')
        if pd.to_datetime(intervals[1]) < pd.to_datetime(intervals[0]):
            intervals[1] = datetime.strptime(intervals[1], '%Y/%m/%d')
            # Now add 1 year
            if intervals[1] < pd.to_datetime(intervals[0]):
                intervals[1] += relativedelta(years=1)
            # Then convert back to string
            intervals[1] = intervals[1].strftime('%Y-%m-%d')
            intervals[0] = intervals[0].replace('/', '-')
        intervals[0] = intervals[0].replace('/', '-')
        intervals[1] = intervals[1].replace('/', '-')
        i += 1

    return days_missed, dates_missed_arr

def double_check_csv(player, out_dates, year, csv_injuries):
    """Count missed games that follow 'rest' CSV entries not covered by `out_dates`.

    Args:
        player: player name; matched as a literal substring of 'Relinquished'.
        out_dates: iterable of [start, end] date pairs already accounted for.
        year: starting year of the season (2016 selects reg_season_dates[0]).
        csv_injuries: frame with 'Date' (datetime), 'Relinquished' and 'Notes' columns.

    Returns:
        The number of distinct game-log rows that fall in an unbroken run of
        missing 'Gcar' values starting at an unaccounted rest date. Returns 0 when
        the player has no in-season rows, no rest rows, or every rest date is
        already inside one of `out_dates`.

    Raises:
        Exception: if `year` is before 2016.
    """
    index = year - 2016  # index 0 = season starting in 2016
    if index < 0:
        raise Exception("Season cannot be before 2016-2017 season")
    reg_season_start = pd.to_datetime(reg_season_dates[index][0])
    reg_season_end = pd.to_datetime(reg_season_dates[index][1])  # Includes Play-In

    in_season = (csv_injuries['Date'] >= reg_season_start) & (csv_injuries['Date'] <= reg_season_end)
    season_injury_list = csv_injuries[in_season]
    # regex=False: names can contain regex metacharacters such as '.'.
    is_player = season_injury_list['Relinquished'].str.contains(player, na=False, regex=False)
    player_season_injuries = season_injury_list[is_player]
    if player_season_injuries.empty:
        print("No season injuries")
        return 0

    # na=False: a missing note would otherwise put NaN into the boolean mask.
    notes = player_season_injuries['Notes']
    player_rests = player_season_injuries[
        notes.str.contains('rest', na=False) & notes.str.contains('DTD', na=False)
    ]
    if player_rests.empty:
        print("No rests")
        return 0

    unaccounted = set()
    for rest_date in player_rests['Date']:
        covered = any(
            pd.to_datetime(interval[0]) <= rest_date <= pd.to_datetime(interval[1])
            for interval in out_dates
        )
        if not covered:
            unaccounted.add(rest_date)

    if len(unaccounted) == 0:
        print("All injuries accounted for")
        return 0

    # Fetch the game log only now that there is something to check against it.
    # get_game_logs is called with year + 1 — presumably the season's ending year; confirm.
    game_logs = get_game_logs(player, year + 1)
    # NOTE(review): a missing 'Gcar' value is treated as "did not play" — confirm
    # against the scraper's game-log format.
    did_not_play = game_logs['Gcar'].isna().tolist()

    # Collect row positions instead of adding up per-date streak lengths: two rest
    # dates inside the same missed streak used to count the later games twice.
    rested_rows = set()
    for rest_date in unaccounted:
        on_or_after = (game_logs['DATE'] >= rest_date).tolist()
        for position, (eligible, missed) in enumerate(zip(on_or_after, did_not_play)):
            if not eligible:
                continue
            if not missed:
                break  # the player was back; the streak is over
            rested_rows.add(position)

    return len(rested_rows)

def is_valid_date_format(date_str, fmt='%m/%d/%y'):
    """Tell whether `date_str` can be parsed by datetime.strptime with `fmt`."""
    try:
        datetime.strptime(date_str, fmt)
    except ValueError:
        return False
    else:
        return True

In [None]:
from tqdm import tqdm
import pandas as pd

def create_injury_timeline():
    """Build a per-date snapshot of which players each team has listed as out.

    Reads '2013-2024_injury_stats.csv', walks its rows in date order and returns
    a dict keyed by 'YYYY-MM-DD'. Each value maps every code in nba_teams to a
    set of player names. A new date starts from a copy of the previous date's
    snapshot. A row with only 'Relinquished' adds that player to the row's team.
    A row with only 'Acquired' removes the player from the team they were last
    added under (or from the row's team if they were never added).
    """
    records = pd.read_csv('2013-2024_injury_stats.csv')
    records['Date'] = pd.to_datetime(records['Date'])
    records = records.sort_values(by='Date')
    for name_column in ('Relinquished', 'Acquired'):
        records[name_column] = records[name_column].str.strip()

    injury_timeline = {}
    listed_under = {}  # player -> team code they were most recently added to
    previous_key = None

    rows = zip(records['Date'], records['Team'], records['Acquired'], records['Relinquished'])
    for date, team, acquired, relinquished in tqdm(rows, total=len(records), desc="Building Injury Timeline"):
        day_key = date.strftime('%Y-%m-%d')

        # Resolve the row's team code; a missing team counts as a free agent.
        if pd.isna(team):
            code = Teams.team_abbreviations['Free Agent']
        else:
            nickname = team.split()[-1]
            # 'Trail Blazers' is the only two-word nickname in the lookup table.
            lookup_key = 'Trail Blazers' if nickname == 'Blazers' else nickname
            code = Teams.team_abbreviations[lookup_key]

        # First row of a new date: start from empty sets or from yesterday's state.
        if day_key not in injury_timeline:
            if previous_key is None:
                snapshot = {nba_team: set() for nba_team in nba_teams}
            else:
                snapshot = {
                    team_code: set(out_players)
                    for team_code, out_players in injury_timeline[previous_key].items()
                }
            injury_timeline[day_key] = snapshot
        previous_key = day_key
        today = injury_timeline[day_key]

        if pd.isna(acquired) and pd.notna(relinquished):
            # Player goes onto the injury list.
            today[code].add(relinquished)
            listed_under[relinquished] = code
        elif pd.notna(acquired) and pd.isna(relinquished):
            # Player returns; discard is a no-op when they are not listed.
            today[listed_under.get(acquired, code)].discard(acquired)

    return injury_timeline