In [1]:
import requests 
from bs4 import BeautifulSoup
import html
import pandas as pd, numpy as np 
from io import StringIO
import math  
import sqlalchemy 
import re 

pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', 50)  

In [2]:
def clean_player(name): 
    i = name.find('*')   
    name = name[:i] if i > -1 else name 
    return name 

def load_soup(URL): 
    r = requests.get(URL) 
    soup = BeautifulSoup(r.content, 'html') 

    return soup 

def select_team(group):
    value = group.head(1) 
    if 'Team' in value.columns:
        if len(group) > 1: 
            value['Team'] = "" 
            for i in range(1, len(group)):
                value['Team'] += group.iloc[i]['Team'] 
                value['Team'] += '-' if i < len(group)-1 else ''  
    else: 
        if len(group) > 1: 
            value['Tm'] = "" 
            for i in range(1, len(group)):
                value['Tm'] += group.iloc[i]['Tm'] 
                value['Tm'] += '-' if i < len(group)-1 else ''  
    return value 

def html_df(soup, key, value, remove_cols=['Rk'], drop_cols=True, drop_lvl=False): 
    stat_df = soup.find('table', attrs={key:value}) 
    stat_df = pd.read_html(StringIO(str(stat_df))) 
    stat_df = pd.DataFrame(stat_df[0]) 
    if drop_lvl: 
        stat_df.columns = stat_df.columns.droplevel() 
    stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True) 
    stat_df['Player'] = stat_df['Player'].apply(lambda x: clean_player(x)) 
    if drop_cols==True:  
        stat_df = stat_df.drop(remove_cols, axis=1) 

    return stat_df 

def combine_dfs(dfs):
    curr = None
    for name in dfs.keys():
        if curr is None: 
            curr = dfs[name]
        else:
            diff = dfs[name].columns.difference(curr.columns).tolist()
            diff.append('Player') 
            curr = pd.merge(curr, dfs[name].loc[:,diff], on = 'Player') 
    return curr  

def clean_if_dirty(master):
    filtered = master[master['Player']=='Player']
    pos = filtered.index
    master = master.drop(pos, axis=0) 
    master['Player'] = master['Player'].apply(lambda x: clean_player(x)) 
    master = master.dropna(axis=1, how='all')   
    if ('Tm' in master.columns) & ('Team' in master.columns): 
        master = master.drop('Tm', axis=1) 
         
    return master 

def mod_types_pct(master): 
    for col_name in master.columns: 
        if col_name not in ['Player', 'Team', 'Tm', 'Pos', 'Awards']:  
            master[col_name] = master[col_name].astype("float64")
        if col_name in ['FG%', '3P%', 'FT%', 'eFG%', 'TS%', '2P%']:
            master[col_name] *= 100 
    return master

def process_totals(totals, names): 
    if '3P' in totals.columns:  
        totals = totals.loc[:,['PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB']] 
    elif 'BLK' in totals.columns: 
        totals = totals.loc[:,['PTS', 'TRB', 'AST', 'STL', 'BLK', 'PF', 'MP', 'FG', 'FGA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB']] 
    elif 'TRB':
        totals = totals.loc[:,['PTS', 'TRB', 'AST', 'PF', 'FG', 'FGA', '2P', '2PA', 'FT', 'FTA']] 
    else: 
        totals = totals.loc[:,['PTS', 'AST', 'PF', 'FG', 'FGA', '2P', '2PA', 'FT', 'FTA']] 
    totals.columns = 'T' + totals.columns 
    totals['Player'] =  names 

    return totals 

def process_averages(master):
    if 'TOV' in master.columns: 
        new = master.rename(columns={'G':'GP', 'MP':'MPG', 'ORB':'ORPG', 'DRB':'DRPG', 'TRB':'RPG', 'AST':'APG', 'STL':'SPG', 'BLK':'BPG', 'TOV':'ToPG', 'PTS':'PPG'}, errors="raise")
    elif 'BLK' in master.columns: 
        new = master.rename(columns={'G':'GP', 'MP':'MPG', 'ORB':'ORPG', 'DRB':'DRPG', 'TRB':'RPG', 'AST':'APG', 'STL':'SPG', 'BLK':'BPG', 'PTS':'PPG'}, errors="raise") 
    elif 'TRB' in master.columns: 
        new = master.rename(columns={'G':'GP', 'TRB':'RPG', 'AST':'APG', 'PTS':'PPG'}, errors="raise") 
    else: 
        new = master.rename(columns={'G':'GP', 'AST':'APG','PTS':'PPG'}, errors="raise") 

    return new  

In [None]:
engine = sqlalchemy.create_engine("sqlite:///../../DB/ballbase.db") 
engine = sqlalchemy.create_engine('postgresql+psycopg2://brandon:access@localhost:5432/ballbase' ) 
start_ssn = 2024     
stop_ssn = 2025                    

for season in range(start_ssn, stop_ssn): 
    
    URL_averages = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_per_game.html' 
    soup_averages = load_soup(URL_averages) 
    averages = html_df(soup_averages, 'id', 'per_game_stats', ['Rk'])  

    URL_totals = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_totals.html' 
    soup_totals = load_soup(URL_totals) 
    totals = html_df(soup_totals, 'id', 'totals_stats', ['Rk']) 
    totals = process_totals(totals, averages['Player'])   
    
    if season >= 1973: 
        URL_per100 = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_per_poss.html' 
        soup_per100 = load_soup(URL_per100) 
        try:
            per100 = html_df(soup_per100, 'id', 'per_poss', ['Rk', 'Unnamed: 29']) 
        except:
            per100 = html_df(soup_per100, 'id', 'per_poss', ['Rk']) 

    URL_advanced = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_advanced.html'  
    soup_advanced = load_soup(URL_advanced) 
    try: 
        advanced = html_df(soup_advanced, 'id', 'advanced', ['Rk', 'Unnamed: 19', 'Unnamed: 24'])  
    except:
        advanced = html_df(soup_advanced, 'id', 'advanced', ['Rk'])  

    if season >= 1973: 
        dfs = {"averages": averages, "totals": totals, "per100": per100, "advanced": advanced} 
    else: 
        dfs = {"averages": averages, "totals": totals, "advanced": advanced} 

    season_str = str(season) + '_' + str(season+1)[2:]
    master = combine_dfs(dfs) 
    master = clean_if_dirty(master) 
    master = mod_types_pct(master) 
    master = process_averages(master) 
    
    master.to_sql('master_'+season_str, con=engine, if_exists='replace', index=False) 
    
    



  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)


# Clean Tables 

In [8]:
engine = sqlalchemy.create_engine('postgresql+psycopg2://brandon:access@localhost:5432/ballbase' ) 

start_ssn = 1973   
stop_ssn = 2025     

for season in range(start_ssn, stop_ssn):   
    season_str = str(season) + '_' + str(season+1)[2:] 
    master = pd.read_sql("master"+"_"+season_str, con=engine)  

    # Cleaning code here ... 
    try:
        master = master.sort_values(by='ppg', ascending=False)
    except:
        master = master.sort_values(by='PPG', ascending=False)
    master.to_sql('master'+'_'+season_str, con = engine, if_exists = 'replace', index = False)  
        

In [9]:
pd.read_sql('master_2024_25', con=engine).head(20)   

Unnamed: 0,Player,Age,Team,Pos,GP,GS,MPG,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORPG,DRPG,RPG,APG,SPG,BPG,ToPG,PF,PPG,T2P,T2PA,T3P,T3PA,TAST,TBLK,TDRB,TFG,TFGA,TFT,TFTA,TMP,TORB,TPF,TPTS,TSTL,TTOV,TTRB,DRtg,ORtg,3PAr,AST%,BLK%,BPM,DBPM,DRB%,DWS,FTr,OBPM,ORB%,OWS,PER,STL%,TOV%,TRB%,TS%,USG%,VORP,WS,WS/48
0,Giannis Antetokounmpo,30.0,MIL,PF,27.0,27.0,35.2,12.7,21.2,60.1,0.1,0.7,21.1,12.6,20.5,61.5,60.5,6.7,11.1,60.5,2.0,9.6,11.6,5.9,0.8,1.5,3.4,2.4,32.3,340.0,553.0,4.0,19.0,158.0,41.0,259.0,344.0,572.0,181.0,299.0,950.0,55.0,66.0,873.0,22.0,91.0,314.0,108.0,119.0,0.033,33.9,4.1,8.7,2.4,28.8,1.6,0.523,6.3,6.4,2.9,30.7,1.1,11.5,17.9,62.0,36.6,2.6,4.5,0.228
1,Nikola Jokić,29.0,DEN,C,31.0,31.0,37.1,12.0,21.7,55.3,2.3,4.8,47.3,9.7,16.9,57.6,60.5,5.2,6.5,80.6,3.5,9.5,13.0,9.7,1.7,0.6,3.3,2.1,31.5,301.0,523.0,71.0,150.0,301.0,19.0,296.0,372.0,673.0,162.0,201.0,1150.0,108.0,65.0,977.0,53.0,101.0,404.0,110.0,130.0,0.223,43.3,1.5,13.0,2.6,27.0,1.6,0.299,10.4,10.6,5.6,32.3,2.2,11.7,19.1,64.2,31.5,4.3,7.1,0.298
2,Shai Gilgeous-Alexander,26.0,OKC,PG,35.0,35.0,34.5,11.0,21.1,52.4,2.2,6.0,36.0,8.9,15.0,58.9,57.5,7.1,7.9,88.8,0.9,4.7,5.6,6.1,2.0,1.1,2.6,2.1,31.3,310.0,526.0,76.0,211.0,214.0,40.0,165.0,386.0,737.0,247.0,278.0,1209.0,30.0,72.0,1095.0,70.0,92.0,195.0,102.0,126.0,0.286,30.7,3.4,11.9,3.5,14.2,2.8,0.377,8.4,2.6,5.0,30.3,2.8,9.7,8.5,63.7,33.7,4.3,7.7,0.307
3,LaMelo Ball,23.0,CHO,PG,23.0,23.0,33.7,10.3,24.4,42.2,4.4,12.9,34.3,5.9,11.5,51.1,51.3,4.8,5.7,84.0,0.9,4.3,5.2,7.3,1.3,0.2,4.0,3.7,29.8,135.0,264.0,102.0,297.0,167.0,5.0,100.0,237.0,561.0,110.0,131.0,775.0,20.0,85.0,686.0,29.0,93.0,120.0,115.0,110.0,0.529,44.6,0.7,5.3,-0.5,14.3,0.7,0.234,5.7,2.7,1.0,21.6,1.8,13.1,8.3,55.4,38.9,1.4,1.7,0.104
4,Paolo Banchero,22.0,ORL,PF,5.0,5.0,36.4,9.6,19.4,49.5,2.2,6.4,34.4,7.4,13.0,56.9,55.2,7.6,11.8,64.4,2.4,6.4,8.8,5.6,0.6,0.8,2.2,2.6,29.0,37.0,65.0,11.0,32.0,28.0,4.0,32.0,48.0,97.0,38.0,59.0,182.0,12.0,13.0,145.0,3.0,11.0,44.0,109.0,121.0,0.33,29.2,2.1,6.6,0.8,21.6,0.3,0.608,5.8,7.3,0.6,25.4,0.8,8.2,14.1,59.0,32.0,0.4,0.8,0.222
5,Jayson Tatum,26.0,BOS,PF,33.0,33.0,36.4,9.3,20.1,46.2,3.9,10.5,36.9,5.4,9.6,56.5,55.9,5.6,7.1,79.8,0.6,8.8,9.4,5.5,1.3,0.4,2.7,2.4,28.1,179.0,317.0,128.0,347.0,180.0,14.0,289.0,307.0,664.0,186.0,233.0,1200.0,21.0,80.0,928.0,44.0,90.0,310.0,108.0,120.0,0.523,24.7,1.1,7.0,1.2,26.3,2.0,0.351,5.8,1.9,3.3,23.9,1.8,10.5,14.0,60.5,30.9,2.7,5.4,0.214
6,Luka Dončić,25.0,DAL,PG,22.0,22.0,35.7,9.8,21.2,46.4,3.4,9.6,35.4,6.4,11.5,55.5,54.4,5.1,6.6,76.7,0.7,7.6,8.3,7.8,2.0,0.4,3.4,2.6,28.1,141.0,254.0,75.0,212.0,172.0,9.0,167.0,216.0,466.0,112.0,146.0,785.0,16.0,58.0,619.0,44.0,75.0,183.0,109.0,117.0,0.455,36.0,1.0,6.9,1.2,22.4,1.2,0.313,5.7,2.3,1.9,24.8,2.7,12.4,12.6,58.4,32.8,1.8,3.1,0.191
7,Kevin Durant,36.0,PHO,PF,23.0,23.0,36.1,9.7,18.9,51.6,2.3,5.7,40.9,7.4,13.1,56.3,57.8,5.8,6.9,84.2,0.5,6.1,6.6,4.0,0.7,1.3,3.4,1.7,27.6,170.0,302.0,54.0,132.0,91.0,30.0,140.0,224.0,434.0,133.0,158.0,830.0,11.0,40.0,635.0,17.0,78.0,151.0,117.0,114.0,0.304,19.7,3.4,2.8,-0.7,18.5,0.5,0.364,3.5,1.5,1.3,21.8,1.0,13.4,10.1,63.1,30.9,1.0,1.9,0.109
8,De'Aaron Fox,27.0,SAC,PG,35.0,35.0,37.3,9.9,20.3,48.6,2.0,6.3,32.1,7.8,14.0,56.0,53.6,4.9,6.1,80.8,1.1,3.9,4.9,6.2,1.6,0.3,3.3,2.8,26.7,274.0,489.0,71.0,221.0,216.0,12.0,136.0,345.0,710.0,172.0,213.0,1305.0,37.0,98.0,933.0,57.0,115.0,173.0,114.0,114.0,0.311,26.7,0.9,1.8,-0.9,11.8,1.2,0.3,2.7,3.2,2.2,20.3,2.1,12.5,7.5,58.0,30.2,1.2,3.4,0.125
9,Anthony Davis,31.0,LAL,C,33.0,33.0,34.9,9.4,18.0,52.3,0.7,2.2,32.4,8.7,15.7,55.1,54.3,6.5,8.2,78.6,2.6,9.2,11.8,3.5,1.3,2.2,2.3,2.0,26.0,286.0,519.0,24.0,74.0,116.0,71.0,304.0,310.0,593.0,213.0,271.0,1152.0,87.0,66.0,857.0,43.0,76.0,391.0,109.0,120.0,0.125,17.5,5.6,5.6,1.1,29.4,1.8,0.457,4.4,8.5,3.0,26.9,1.8,9.6,19.0,60.2,29.9,2.2,4.8,0.2
