In [12]:
import os
import pandas as pd
import requests
import numpy as np

def find_year(year: int) -> bool:
    # searches the data folder for specified year
    for file in os.listdir('./data'):
        if file.startswith(year2szn(year)): # must check for season formatted year
            return True
    return False


def year2szn(year: int) -> str:
    # changes an integer year into a string for that year's season
    # example: 2022 -> 2021-22'
    return f"{str(year-1)}-{str(year)[-2:]}"


def write_data(dfs: list, names: list, year: int) -> None:
    # Input: A list of DataFrames, a list of names, and a integer year
    # Function: Writes each dataframe to a csv with corresponding name for specified year
    
    dir_str = f'data/{year2szn(year)}/' # define directory string
    
    os.mkdir(dir_str) # create directory
    
    for i in range(len(dfs)):
        dfs[i].to_csv(dir_str+names[i]+'.csv') # concats year, name, and .csv as string


def html2csv(html_content, year: int):
    dfs = pd.read_html(html_content) # changes html file into dataframes
    
    # defines names for tables
    names = ['standings', 'per_game', 'per_game_against',
             'szn_totals', 'szn_totals_against',
             'szn_per100poss', 'szn_per100poss_against',
             'advanced', 'shooting', 'shooting_against']
    # 1971 separates east and west into separate tables
    if(year<=1970):
        first = dfs[0] # grabs first table, typically standings
        # finds western conference teams
        west_index = np.where(first['Team']=='Western Division')[0][0] 
        first.loc[:west_index, 'Conference'] = 'East' # Assigns everything under west conf as East
        first.loc[:west_index, 'Seed'] = range(west_index+1) # Assigns seed numbers to eastern teams
        first.loc[west_index:, 'Seed'] = range(len(first)-west_index) # assigns seed numbers to western teams
        first.loc[west_index:, 'Conference'] = 'West' # Assigns western teams respective conferences
        
        # remove random non-teams
        first = first[(first['Team']!='Eastern Division') & (first['Team']!='Western Division')]
        # call write_data function
        write_data([first]+dfs[1:], names[:-5]+['advanced'], year)
        return
    
    if(year < 2016): # 2016 introduced league wide table in addition to each conference
        # remove division names
        east = dfs[0][(dfs[0]['W'] != 'Atlantic Division') &
                      (dfs[0]['W'] != 'Central Division')].copy() 
        west = dfs[1][(dfs[1]['W'] != 'Midwest Division') &
                      (dfs[1]['W'] != 'Pacific Division')].copy()
        
        frames = dfs[2:] # get rest of dataframes
        
        # weird edge case managing
        if((year < 2000) & (year > 1996)):
            names = names[:-2]
            frames = frames[:-2]
        if(year <= 1973):
            names = names[:-3]+['advanced']
        
        combined = pd.concat([east, west]).sort_values(by='W',
                                               ascending=False, ignore_index=True)
        # create list of dataframes for usage
        final_dfs = [combined]+frames

        #turn them into csv files
        write_data(final_dfs, names, year)
        
    else: # 2016-present
        east = dfs[0]
        west = dfs[1]
        frames = dfs[4:]
        # assigning conference and seeds
        east['Conference'] = 'East'
        west['Conference'] = 'West'
        east['Seed'] = range(1, len(east)+1)
        west['Seed'] = range(1, len(west)+1)
        # renaming conference to just team name
        west.rename(columns={'Western Conference': 'Team'}, inplace=True)
        east.rename(columns={'Eastern Conference': 'Team'}, inplace=True)
        # combine the east and west dataframes sorted by Number of Wins followed by conference seed number
        combined = pd.concat([east, west]).sort_values(by=['W', 'Seed'],
                                                       ascending=[False, True], ignore_index=True)
        # create list of dataframes for usage
        final_dfs = [combined]+frames

        #turn them into csv files
        write_data(final_dfs, names, year)


def download_schedule_table(): # MAIN
    # if imperfect airflow or weird pipeline errors, this for loop finds the year not yet registered
    for year in range(2023, 2023, -1):
        if(find_year(year) == False):
            break
        
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}.html"
    response = requests.get(url) # grab html from url
    
    if response.status_code == 200: # found table
        html2csv(response.content, year) # call helper
        download_schedule_table() # recursive
    elif response.status_code == 429: # soft banned
        print('Hit request limit')
    else:
        print(f"{year} data does not exist or has been downloaded.")
    return os.getcwd()


if __name__ == "__main__":
    download_schedule_table()
    print('All done')
''' Side Notes:
<1972-73 no per 100 poss stats
2001> introduced accurate shooting stts

2015-16> conference'''

FileExistsError: [Errno 17] File exists: 'data/1950-51/'

In [177]:
year = 2001
url = f"https://www.basketball-reference.com/leagues/NBA_2001_standings.html#all_expanded_standings"
response = requests.get(url)

In [180]:
dfs = pd.read_html(response.content)
dfs[0]

Unnamed: 0,Eastern Conference,W,L,W/L%,GB,PS/G,PA/G,SRS
0,Atlantic Division,Atlantic Division,Atlantic Division,Atlantic Division,Atlantic Division,Atlantic Division,Atlantic Division,Atlantic Division
1,Philadelphia 76ers*,56,26,.683,—,94.7,90.4,3.64
2,Miami Heat*,50,32,.610,6.0,88.9,86.6,1.73
3,New York Knicks*,48,34,.585,8.0,88.7,86.1,1.98
4,Orlando Magic*,43,39,.524,13.0,97.5,96.5,0.39
5,Boston Celtics,36,46,.439,20.0,94.6,96.8,-2.40
6,New Jersey Nets,26,56,.317,30.0,92.1,97.1,-5.30
7,Washington Wizards,19,63,.232,37.0,93.2,99.9,-6.75
8,Central Division,Central Division,Central Division,Central Division,Central Division,Central Division,Central Division,Central Division
9,Milwaukee Bucks*,52,30,.634,—,100.7,96.9,3.14


In [120]:
dfs = pd.read_html(response.content)
ranking = dfs[8]
ranking.columns = ranking.columns.droplevel()
ranking['Rank'] = range(ranking.shape[0])
pd.merge(ranking[['Team','Rank']], dfs[])

Unnamed: 0,Team,Rank
0,Los Angeles Lakers*,0
1,Portland Trail Blazers*,1
2,San Antonio Spurs*,2
3,Phoenix Suns*,3
4,Utah Jazz*,4
5,Indiana Pacers*,5
6,Miami Heat*,6
7,Sacramento Kings*,7
8,Charlotte Hornets*,8
9,Minnesota Timberwolves*,9


In [117]:
ranking.shape

(30, 31)

In [26]:
if(year < 2016): # 2016 introduced league wide table in addition to each conference
    # remove division names
    east_teams = dfs[0]['Eastern Conference']

    frames = dfs[2:] # get rest of dataframes

    # weird edge case managing
    if((year < 2000) & (year > 1996)):
        names = names[:-2]
        frames = frames[:-2]
    if(year <= 1973):
        names = names[:-3]+['advanced']


Unnamed: 0,Western Conference,W,L,W/L%,GB,PS/G,PA/G,SRS
1,Utah Jazz*,55,27,0.671,—,96.5,92.0,4.52
2,San Antonio Spurs*,53,29,0.646,2.0,96.2,90.2,5.92
3,Minnesota Timberwolves*,50,32,0.61,5.0,98.5,96.0,2.67
4,Dallas Mavericks,40,42,0.488,15.0,101.4,102.0,-0.29
5,Denver Nuggets,35,47,0.427,20.0,99.0,101.1,-1.76
6,Houston Rockets,34,48,0.415,21.0,99.5,100.3,-0.57
7,Vancouver Grizzlies,22,60,0.268,33.0,93.9,99.5,-5.1
9,Los Angeles Lakers*,67,15,0.817,—,100.8,92.3,8.41
10,Portland Trail Blazers*,59,23,0.72,8.0,97.5,91.0,6.36
11,Phoenix Suns*,53,29,0.646,14.0,98.9,93.7,5.24


In [23]:
# assigning conference and seeds
east['Conference'] = 'East'
west['Conference'] = 'West'
east['Seed'] = range(1, len(east)+1)
west['Seed'] = range(1, len(west)+1)
# renaming conference to just team name
west.rename(columns={'Western Conference': 'Team'}, inplace=True)
east.rename(columns={'Eastern Conference': 'Team'}, inplace=True)
# combine the east and west dataframes sorted by Number of Wins followed by conference seed number
combined = pd.concat([east, west]).sort_values(by=['W', 'Seed'],
                                               ascending=[False, True], ignore_index=True)
# create list of dataframes for usage

In [24]:
combined

Unnamed: 0,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,Conference,Seed
0,Los Angeles Lakers*,67,15,0.817,—,100.8,92.3,8.41,West,8
1,Portland Trail Blazers*,59,23,0.72,8.0,97.5,91.0,6.36,West,9
2,Indiana Pacers*,56,26,0.683,—,101.3,96.7,4.15,East,8
3,Utah Jazz*,55,27,0.671,—,96.5,92.0,4.52,West,1
4,San Antonio Spurs*,53,29,0.646,2.0,96.2,90.2,5.92,West,2
5,Phoenix Suns*,53,29,0.646,14.0,98.9,93.7,5.24,West,10
6,Miami Heat*,52,30,0.634,—,94.4,91.3,2.75,East,1
7,New York Knicks*,50,32,0.61,2.0,92.1,90.7,1.3,East,2
8,Minnesota Timberwolves*,50,32,0.61,5.0,98.5,96.0,2.67,West,3
9,Philadelphia 76ers*,49,33,0.598,3.0,94.8,93.4,1.02,East,3
