In [1]:
import requests 
from bs4 import BeautifulSoup
import html
import pandas as pd, numpy as np 
from io import StringIO
import math  
import sqlite3, sqlalchemy 
import re 

pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', 15)  

In [2]:
def clean_player(name): 
    i = name.find('*')   
    name = name[:i] if i > -1 else name 
    return name 

def load_soup(URL): 
    r = requests.get(URL) 
    soup = BeautifulSoup(r.content, 'html') 

    return soup 

def select_team(group):
    value = group.head(1) 
    if 'Team' in value.columns:
        if len(group) > 1: 
            value['Team'] = "" 
            for i in range(1, len(group)):
                value['Team'] += group.iloc[i]['Team'] 
                value['Team'] += '-' if i < len(group)-1 else ''  
    else: 
        if len(group) > 1: 
            value['Tm'] = "" 
            for i in range(1, len(group)):
                value['Tm'] += group.iloc[i]['Tm'] 
                value['Tm'] += '-' if i < len(group)-1 else ''  
    return value 

def html_df(soup, key, value, remove_cols=['Rk'], drop_cols=True, drop_lvl=False): 
    stat_df = soup.find('table', attrs={key:value}) 
    stat_df = pd.read_html(StringIO(str(stat_df))) 
    stat_df = pd.DataFrame(stat_df[0]) 
    if drop_lvl: 
        stat_df.columns = stat_df.columns.droplevel() 
    stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True) 
    stat_df['Player'] = stat_df['Player'].apply(lambda x: clean_player(x)) 
    if drop_cols==True:  
        stat_df = stat_df.drop(remove_cols, axis=1) 

    return stat_df 

def combine_dfs(dfs):
    curr = None
    for name in dfs.keys():
        if curr is None: 
            curr = dfs[name]
        else:
            diff = dfs[name].columns.difference(curr.columns).tolist()
            diff.append('Player') 
            curr = pd.merge(curr, dfs[name].loc[:,diff], on = 'Player') 
    return curr  

def clean_if_dirty(master):
    filtered = master[master['Player']=='Player']
    pos = filtered.index
    master = master.drop(pos, axis=0) 
    master['Player'] = master['Player'].apply(lambda x: clean_player(x)) 
    master = master.dropna(axis=1, how='all')   
    if ('Tm' in master.columns) & ('Team' in master.columns): 
        master = master.drop('Tm', axis=1) 
         
    return master 

def mod_types_pct(master): 
    for col_name in master.columns: 
        if col_name not in ['Player', 'Team', 'Tm', 'Pos', 'Awards']:  
            master[col_name] = master[col_name].astype("float64")
        if col_name in ['FG%', '3P%', 'FT%', 'eFG%', 'TS%', '2P%']:
            master[col_name] *= 100 
    return master

def process_totals(totals, names): 
    if '3P' in totals.columns:  
        totals = totals.loc[:,['PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB']] 
    elif 'BLK' in totals.columns: 
        totals = totals.loc[:,['PTS', 'TRB', 'AST', 'STL', 'BLK', 'PF', 'MP', 'FG', 'FGA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB']] 
    elif 'TRB':
        totals = totals.loc[:,['PTS', 'TRB', 'AST', 'PF', 'FG', 'FGA', '2P', '2PA', 'FT', 'FTA']] 
    else: 
        totals = totals.loc[:,['PTS', 'AST', 'PF', 'FG', 'FGA', '2P', '2PA', 'FT', 'FTA']] 
    totals.columns = 'T' + totals.columns 
    totals['Player'] =  names 

    return totals 

def process_averages(master):
    if 'TOV' in master.columns: 
        new = master.rename(columns={'G':'GP', 'MP':'MPG', 'ORB':'ORPG', 'DRB':'DRPG', 'TRB':'RPG', 'AST':'APG', 'STL':'SPG', 'BLK':'BPG', 'TOV':'ToPG', 'PTS':'PPG'}, errors="raise")
    elif 'BLK' in master.columns: 
        new = master.rename(columns={'G':'GP', 'MP':'MPG', 'ORB':'ORPG', 'DRB':'DRPG', 'TRB':'RPG', 'AST':'APG', 'STL':'SPG', 'BLK':'BPG', 'PTS':'PPG'}, errors="raise") 
    elif 'TRB' in master.columns: 
        new = master.rename(columns={'G':'GP', 'TRB':'RPG', 'AST':'APG', 'PTS':'PPG'}, errors="raise") 
    else: 
        new = master.rename(columns={'G':'GP', 'AST':'APG','PTS':'PPG'}, errors="raise") 

    return new  

In [7]:
engine = sqlalchemy.create_engine('sqlite:///../../DB/ballbase.db') 

start_ssn = 2024     
stop_ssn = 2025                    

for season in range(start_ssn, stop_ssn): 
    
    URL_averages = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_per_game.html' 
    soup_averages = load_soup(URL_averages) 
    averages = html_df(soup_averages, 'id', 'per_game_stats', ['Rk'])  

    URL_totals = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_totals.html' 
    soup_totals = load_soup(URL_totals) 
    totals = html_df(soup_totals, 'id', 'totals_stats', ['Rk']) 
    totals = process_totals(totals, averages['Player'])   
    
    if season >= 1973: 
        URL_per100 = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_per_poss.html' 
        soup_per100 = load_soup(URL_per100) 
        try:
            per100 = html_df(soup_per100, 'id', 'per_poss', ['Rk', 'Unnamed: 29']) 
        except:
            per100 = html_df(soup_per100, 'id', 'per_poss', ['Rk']) 

    URL_advanced = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_advanced.html'  
    soup_advanced = load_soup(URL_advanced) 
    try: 
        advanced = html_df(soup_advanced, 'id', 'advanced_stats', ['Rk', 'Unnamed: 19', 'Unnamed: 24'])  
    except:
        advanced = html_df(soup_advanced, 'id', 'advanced_stats', ['Rk'])  

    if season >= 1973: 
        dfs = {"averages": averages, "totals": totals, "per100": per100, "advanced": advanced} 
    else: 
        dfs = {"averages": averages, "totals": totals, "advanced": advanced} 

    season_str = str(season) + '_' + str(season+1)[2:]
    master = combine_dfs(dfs) 
    master = clean_if_dirty(master) 
    master = mod_types_pct(master) 
    master = process_averages(master) 
    
    master.to_sql('master_'+season_str, con=engine, if_exists='replace', index=False) 
    
    



  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)


ValueError: No tables found

# Clean Tables 

In [4]:
engine = sqlalchemy.create_engine('sqlite:///../../DB/ballbase.db') 

start_ssn = 1973   
stop_ssn = 2025     

for season in range(start_ssn, stop_ssn):   
    season_str = str(season) + '_' + str(season+1)[2:] 
    master = pd.read_sql("master"+"_"+season_str, con=engine)  

    # Cleaning code here ... 
    master = master.sort_values(by='PPG', ascending=False)
            
    master.to_sql('master'+'_'+season_str, con = engine, if_exists = 'replace', index = False)  
        

In [5]:
pd.read_sql('master_1993_94', con=engine).head(10)   

Unnamed: 0,Player,Pos,Age,Team,GP,GS,MPG,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORPG,DRPG,RPG,APG,SPG,BPG,ToPG,PF,PPG,T2P,T2PA,T3P,T3PA,TAST,TBLK,TDRB,TFG,TFGA,TFT,TFTA,TMP,TORB,TPF,TPTS,TSTL,TTOV,TTRB,DRtg,ORtg,3PAr,AST%,BLK%,BPM,DBPM,DRB%,DWS,FTr,OBPM,ORB%,OWS,PER,STL%,TOV%,TRB%,TS%,USG%,VORP,WS,WS/48
0,David Robinson,C,28.0,SAS,80.0,80.0,40.5,10.5,20.7,50.7,0.1,0.4,34.5,10.4,20.4,51.0,51.0,8.7,11.6,74.9,3.0,7.7,10.7,4.8,1.7,3.3,3.2,2.9,29.8,830.0,1629.0,10.0,29.0,381.0,265.0,614.0,840.0,1658.0,693.0,925.0,3241.0,241.0,228.0,2383.0,139.0,253.0,855.0,98.0,119.0,0.017,21.6,5.4,11.9,3.8,20.3,6.7,0.558,8.1,8.8,13.3,30.7,2.3,10.9,14.8,57.7,32.0,11.4,20.0,0.296
1,Shaquille O'Neal,C,21.0,ORL,81.0,81.0,39.8,11.8,19.6,59.9,0.0,0.0,0.0,11.8,19.6,60.0,59.9,5.8,10.5,55.4,4.7,8.5,13.2,2.4,0.9,2.9,2.7,3.5,29.3,953.0,1589.0,0.0,2.0,195.0,231.0,688.0,953.0,1591.0,471.0,850.0,3224.0,384.0,281.0,2377.0,76.0,222.0,1072.0,102.0,120.0,0.001,11.0,4.5,6.8,0.6,23.7,4.8,0.534,6.2,13.5,12.1,28.5,1.2,10.2,18.7,60.5,29.0,7.2,16.9,0.252
2,Hakeem Olajuwon,C,31.0,HOU,80.0,80.0,41.0,11.2,21.2,52.8,0.1,0.2,42.1,11.1,20.9,52.9,53.0,4.9,6.8,71.6,2.9,9.1,11.9,3.6,1.6,3.7,3.4,3.6,27.3,886.0,1675.0,8.0,19.0,287.0,297.0,726.0,894.0,1694.0,388.0,542.0,3277.0,229.0,289.0,2184.0,128.0,271.0,955.0,95.0,109.0,0.011,16.4,5.7,6.8,3.3,23.3,7.9,0.32,3.5,8.2,6.4,25.3,2.0,12.3,16.2,56.5,29.8,7.3,14.3,0.21
3,Dominique Wilkins,SF,34.0,ATL-LAC,74.0,74.0,35.6,9.4,21.5,44.0,1.1,4.0,28.8,8.3,17.5,47.4,46.6,6.0,7.1,84.7,2.5,4.0,6.5,2.3,1.2,0.4,2.3,1.7,26.0,613.0,1293.0,85.0,295.0,169.0,30.0,299.0,698.0,1588.0,442.0,522.0,2635.0,182.0,126.0,1923.0,92.0,172.0,481.0,106.0,110.0,0.186,11.3,0.7,2.6,-1.6,12.3,2.8,0.329,4.2,7.5,5.6,21.4,1.7,8.6,9.9,52.9,32.0,3.0,8.4,0.153
4,Karl Malone,PF,30.0,UTA,82.0,82.0,40.6,9.4,18.9,49.7,0.1,0.4,25.0,9.3,18.5,50.3,50.0,6.2,9.0,69.4,2.9,8.6,11.5,4.0,1.5,1.5,2.9,3.3,25.2,764.0,1520.0,8.0,32.0,328.0,126.0,705.0,772.0,1552.0,511.0,736.0,3329.0,235.0,268.0,2063.0,125.0,234.0,940.0,100.0,111.0,0.021,17.1,2.6,5.6,1.5,24.1,6.1,0.474,4.1,8.3,7.4,22.9,1.9,11.1,16.3,55.0,28.0,6.4,13.4,0.193
5,Patrick Ewing,C,31.0,NYK,79.0,79.0,37.6,9.4,19.0,49.6,0.1,0.2,28.6,9.4,18.8,49.8,49.7,5.6,7.4,76.5,2.8,8.4,11.2,2.3,1.1,2.7,3.3,3.5,24.5,741.0,1489.0,4.0,14.0,179.0,217.0,666.0,745.0,1503.0,445.0,582.0,2972.0,219.0,275.0,1939.0,90.0,260.0,885.0,93.0,108.0,0.009,11.3,5.1,5.2,2.7,24.9,8.0,0.387,2.5,8.5,5.1,22.9,1.6,12.9,16.9,55.1,29.8,5.5,13.1,0.211
6,Mitch Richmond,SG,28.0,SAC,78.0,78.0,37.1,8.1,18.3,44.5,1.6,4.0,40.7,6.5,14.3,45.5,48.9,5.5,6.6,83.4,0.9,2.8,3.7,4.0,1.3,0.2,2.8,2.7,23.4,508.0,1116.0,127.0,312.0,313.0,17.0,216.0,635.0,1428.0,426.0,511.0,2897.0,70.0,211.0,1823.0,103.0,216.0,286.0,111.0,109.0,0.218,18.5,0.4,1.6,-1.1,8.3,1.2,0.358,2.7,2.6,5.2,17.7,1.8,11.6,5.4,55.1,27.2,2.7,6.4,0.106
7,Scottie Pippen,SF,28.0,CHI,72.0,72.0,38.3,8.7,17.8,49.1,0.9,2.7,32.0,7.8,15.0,52.2,51.5,3.8,5.7,66.0,2.4,6.3,8.7,5.6,2.9,0.8,3.2,3.2,22.0,564.0,1081.0,63.0,197.0,403.0,58.0,456.0,627.0,1278.0,270.0,409.0,2759.0,173.0,227.0,1587.0,211.0,232.0,629.0,97.0,109.0,0.154,24.6,1.4,7.7,3.2,19.4,6.0,0.32,4.5,7.3,5.2,23.2,4.0,13.7,13.3,54.4,27.1,6.8,11.2,0.194
8,Charles Barkley,PF,30.0,PHO,65.0,65.0,35.4,8.0,16.1,49.5,0.7,2.7,27.0,7.2,13.4,54.1,51.8,4.9,7.0,70.4,3.0,8.1,11.2,4.6,1.6,0.6,3.2,2.5,21.6,470.0,868.0,48.0,178.0,296.0,37.0,529.0,518.0,1046.0,318.0,452.0,2298.0,198.0,160.0,1402.0,101.0,206.0,727.0,103.0,113.0,0.17,20.0,1.0,5.2,0.7,25.6,3.3,0.432,4.5,9.8,5.6,22.8,2.2,14.2,17.8,56.3,26.5,4.2,8.8,0.185
9,Alonzo Mourning,C,23.0,CHH,60.0,59.0,33.6,7.1,14.1,50.5,0.0,0.0,0.0,7.1,14.1,50.7,50.5,7.2,9.5,76.2,3.0,7.2,10.2,1.4,0.5,3.1,3.3,3.5,21.5,427.0,843.0,0.0,2.0,86.0,188.0,433.0,427.0,845.0,433.0,568.0,2018.0,177.0,207.0,1287.0,27.0,199.0,610.0,105.0,109.0,0.002,6.6,5.7,1.4,0.2,23.0,2.5,0.672,1.2,10.0,3.8,21.7,0.7,15.4,16.7,58.8,27.3,1.7,6.3,0.151
