This notebook scrapes All-NBA voting-share data from Basketball Reference.

First, the relevant packages are loaded.

In [2]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [3]:
# get the url for the voting shares webpage
def getURL(year):
    """Build the Basketball Reference awards-page URL for one season.

    Args:
        year: Season identifier interpolated into the URL (for example 2023).

    Returns:
        str: The full URL of the ``awards_<year>.html`` page.
    """
    page_template = "https://www.basketball-reference.com/awards/awards_{}.html"
    return page_template.format(year)

# get soup from url 
def getSoup(year, timeout=30):
    """Download the awards page for ``year`` and parse it with BeautifulSoup.

    Args:
        year: Season identifier forwarded to ``getURL``.
        timeout: Seconds to wait for the server before giving up. Defaults
            to 30 so a stalled connection cannot block the scrape forever.

    Returns:
        BeautifulSoup: The page parsed with the ``html.parser`` backend.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status (for example when the site rate-limits the client).
        requests.exceptions.Timeout: If no response arrives within
            ``timeout`` seconds.
    """
    url = getURL(year)
    # requests has no default timeout; without one a dead connection hangs.
    response = requests.get(url, timeout=timeout)
    # Fail here with the real HTTP error instead of parsing an error page
    # and crashing later when the expected table is missing.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

# given a table soup, get the headers for the table
def headers(table):
    """Return the column labels of a table as a list of strings.

    The labels are the ``.text`` of every ``th`` cell found in the second
    ``tr`` of the table's ``thead``.

    Args:
        table: Parsed table element exposing ``find`` / ``find_all``.

    Returns:
        list: One label per ``th`` cell, in document order.
    """
    header_rows = table.find('thead').find_all('tr')
    # Index 1 = second header row; presumably the first row is a grouping
    # "over-header" on this site -- TODO confirm against the page markup.
    label_row = header_rows[1]
    return [cell.text for cell in label_row.find_all('th')]

# given a table, get the data of the table
def tableData(table):
    """Collect the body rows of a table into a DataFrame.

    Column names come from ``headers(table)``. For every ``tr`` in the
    ``tbody`` the row's ``th`` text fills the first column and the ``td``
    texts fill the following columns in order. Values are the cells'
    ``.text``; no type conversion happens here.

    Args:
        table: Parsed table element exposing ``find`` / ``find_all``.

    Returns:
        pandas.DataFrame: One column per header label, one row per kept ``tr``.
    """
    labels = headers(table)
    columns = [[] for _ in labels]

    for row in table.find('tbody').find_all('tr'):
        # Only rows without an ``id`` attribute are treated as data rows.
        if row.get('id') is None:
            leading_text = row.find('th').text
            columns[0].append(leading_text)
            # The ``td`` cells map onto the columns after the first one.
            for position, cell in enumerate(row.find_all('td'), start=1):
                columns[position].append(cell.text)

    return pd.DataFrame(dict(zip(labels, columns)))

# convert strings to numeric values for relevant columns
def cleanCols(table):
    """Convert the statistic columns of ``table`` to numeric dtypes in place.

    Every column except ``'# Tm'``, ``'Pos'``, ``'Player'`` and ``'Tm'`` is
    passed through ``pd.to_numeric``. Missing values in ``'3P%'`` are then
    replaced with 0.

    Args:
        table: DataFrame to convert; it is modified in place.

    Returns:
        pandas.DataFrame: The same ``table`` object that was passed in.

    Raises:
        KeyError: If ``table`` has no ``'3P%'`` column.
    """
    text_columns = ('# Tm', 'Pos', 'Player', 'Tm')
    numeric_columns = [
        name for name in table.columns.tolist() if name not in text_columns
    ]
    for name in numeric_columns:
        table[name] = pd.to_numeric(table[name])

    three_point_pct = table['3P%']
    table['3P%'] = three_point_pct.fillna(0)
    return table

# get soup, get table data, and clean data for one season
def allNBATable(year):
    """Fetch, extract and clean the All-NBA voting table for one season.

    Downloads the awards page via ``getSoup``, locates the table whose
    ``id`` is ``leading_all_nba``, converts it with ``tableData`` and
    makes the statistic columns numeric with ``cleanCols``.

    Args:
        year: Season identifier forwarded to ``getSoup``.

    Returns:
        pandas.DataFrame: The cleaned voting table for that season.

    Raises:
        ValueError: If the page contains no ``leading_all_nba`` table.
    """
    soup = getSoup(year)
    table = soup.find('table', {'id' : 'leading_all_nba'})
    if table is None:
        # ``find`` returns None when nothing matches; report which season
        # failed instead of crashing later on ``None.find(...)``.
        raise ValueError(
            f"No 'leading_all_nba' table found on the awards page for {year}"
        )
    return cleanCols(tableData(table))

# get stats for all seasons between a given start and end
def gatherStats(start, end):
    """Scrape every season from ``start`` through ``end`` into one DataFrame.

    Each season's table is fetched with ``allNBATable``, tagged with a
    ``'Season'`` column, and the per-season frames are concatenated. The
    ``'Season'`` column ends up at position 3 (the fourth column). Row
    indices of the individual seasons are kept as-is; the index is not reset.

    Args:
        start: First season to scrape (inclusive).
        end: Last season to scrape (inclusive).

    Returns:
        pandas.DataFrame: All seasons stacked in chronological order.
    """
    season_frames = []
    for season in tqdm(range(start, end + 1)):
        season_df = allNBATable(season)
        # Two-second pause after each page fetch -- presumably to stay
        # polite to the site / avoid rate limiting.
        time.sleep(2)
        season_df['Season'] = season
        season_frames.append(season_df)

    combined = pd.concat(season_frames)
    # Move 'Season' from the last position to column index 3.
    season_column = combined.pop('Season')
    combined.insert(3, 'Season', season_column)
    return combined

In [4]:
# Scrape every season from 1980 through 2023 (both endpoints included).
allnba = gatherStats(start=1980, end=2023)

100%|███████████████████████████████████████████| 44/44 [01:44<00:00,  2.37s/it]


In [5]:
# Preview the combined table of all scraped seasons.
allnba

Unnamed: 0,# Tm,Pos,Player,Season,Age,Tm,Pts Won,Pts Max,Share,1st Tm,...,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1st,C,Kareem Abdul-Jabbar,1980,32,LAL,130,132,0.985,,...,24.8,10.8,4.5,1.0,3.4,0.604,0.000,0.765,14.8,0.227
1,1st,F,Julius Erving,1980,29,PHI,119,132,0.898,,...,26.9,7.4,4.6,2.2,1.8,0.519,0.200,0.787,12.5,0.213
2,1st,F,Larry Bird,1980,23,BOS,110,132,0.833,,...,21.3,10.4,4.5,1.7,0.6,0.474,0.406,0.836,11.2,0.182
3,1st,G,George Gervin,1980,27,SAS,90,132,0.682,,...,33.1,5.2,2.6,1.4,1.0,0.528,0.314,0.852,10.6,0.173
4,1st,G,Paul Westphal,1980,29,PHO,68,132,0.515,,...,21.9,2.3,5.1,1.5,0.4,0.525,0.280,0.862,10.5,0.189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31,ORV,F,Paul George,2023,32,LAC,1,500,0.002,0.0,...,23.8,6.1,5.1,1.5,0.4,0.457,0.371,0.871,4.6,0.114
32,ORV,G,Tyrese Haliburton,2023,22,IND,1,500,0.002,0.0,...,20.7,3.7,10.4,1.6,0.4,0.490,0.400,0.871,7.6,0.195
33,ORV,F,Zach LaVine,2023,27,CHI,1,500,0.002,0.0,...,24.8,4.5,4.2,0.9,0.2,0.485,0.375,0.848,7.1,0.123
34,ORV,C,Brook Lopez,2023,34,MIL,1,500,0.002,0.0,...,15.9,6.7,1.3,0.5,2.5,0.531,0.374,0.784,8.0,0.161


In [8]:
# Save the combined table without the row index.
# Create the target directory first so the export cannot fail on a fresh
# checkout after the long scrape has already finished.
output_path = Path('data/allnba.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
allnba.to_csv(output_path, index = False)