In [1]:
import requests 
from bs4 import BeautifulSoup
import html
import pandas as pd, numpy as np 
from io import StringIO
import math  
import sqlalchemy 
import re 

pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', 50)  

In [2]:
def clean_player(name): 
    i = name.find('*')   
    name = name[:i] if i > -1 else name 
    return name 

def load_soup(URL): 
    r = requests.get(URL) 
    soup = BeautifulSoup(r.content, 'html') 

    return soup 

def select_team(group):
    value = group.head(1) 
    if 'Team' in value.columns:
        if len(group) > 1: 
            value['Team'] = "" 
            for i in range(1, len(group)):
                value['Team'] += group.iloc[i]['Team'] 
                value['Team'] += '-' if i < len(group)-1 else ''  
    else: 
        if len(group) > 1: 
            value['Tm'] = "" 
            for i in range(1, len(group)):
                value['Tm'] += group.iloc[i]['Tm'] 
                value['Tm'] += '-' if i < len(group)-1 else ''  
    return value 

def html_df(soup, key, value, remove_cols=['Rk'], drop_cols=True, drop_lvl=False): 
    stat_df = soup.find('table', attrs={key:value}) 
    stat_df = pd.read_html(StringIO(str(stat_df))) 
    stat_df = pd.DataFrame(stat_df[0]) 
    if drop_lvl: 
        stat_df.columns = stat_df.columns.droplevel() 
    stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True) 
    stat_df['Player'] = stat_df['Player'].apply(lambda x: clean_player(x)) 
    if drop_cols==True:  
        stat_df = stat_df.drop(remove_cols, axis=1) 

    return stat_df 

def combine_dfs(dfs):
    curr = None
    for name in dfs.keys():
        if curr is None: 
            curr = dfs[name]
        else:
            diff = dfs[name].columns.difference(curr.columns).tolist()
            diff.append('Player') 
            curr = pd.merge(curr, dfs[name].loc[:,diff], on = 'Player') 
    return curr  

def clean_if_dirty(master):
    filtered = master[master['Player']=='Player']
    pos = filtered.index
    master = master.drop(pos, axis=0) 
    master['Player'] = master['Player'].apply(lambda x: clean_player(x)) 
    master = master.dropna(axis=1, how='all')   
    if ('Tm' in master.columns) & ('Team' in master.columns): 
        master = master.drop('Tm', axis=1) 
         
    return master 

def mod_types_pct(master): 
    for col_name in master.columns: 
        if col_name not in ['Player', 'Team', 'Tm', 'Pos', 'Awards']:  
            master[col_name] = master[col_name].astype("float64")
        if col_name in ['FG%', '3P%', 'FT%', 'eFG%', 'TS%', '2P%']:
            master[col_name] *= 100 
    return master

def process_totals(totals, names): 
    if '3P' in totals.columns:  
        totals = totals.loc[:,['PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB']] 
    elif 'BLK' in totals.columns: 
        totals = totals.loc[:,['PTS', 'TRB', 'AST', 'STL', 'BLK', 'PF', 'MP', 'FG', 'FGA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB']] 
    elif 'TRB':
        totals = totals.loc[:,['PTS', 'TRB', 'AST', 'PF', 'FG', 'FGA', '2P', '2PA', 'FT', 'FTA']] 
    else: 
        totals = totals.loc[:,['PTS', 'AST', 'PF', 'FG', 'FGA', '2P', '2PA', 'FT', 'FTA']] 
    totals.columns = 'T' + totals.columns 
    totals['Player'] =  names 

    return totals 

def process_averages(master):
    if 'TOV' in master.columns: 
        new = master.rename(columns={'G':'GP', 'MP':'MPG', 'ORB':'ORPG', 'DRB':'DRPG', 'TRB':'RPG', 'AST':'APG', 'STL':'SPG', 'BLK':'BPG', 'TOV':'ToPG', 'PTS':'PPG'}, errors="raise")
    elif 'BLK' in master.columns: 
        new = master.rename(columns={'G':'GP', 'MP':'MPG', 'ORB':'ORPG', 'DRB':'DRPG', 'TRB':'RPG', 'AST':'APG', 'STL':'SPG', 'BLK':'BPG', 'PTS':'PPG'}, errors="raise") 
    elif 'TRB' in master.columns: 
        new = master.rename(columns={'G':'GP', 'TRB':'RPG', 'AST':'APG', 'PTS':'PPG'}, errors="raise") 
    else: 
        new = master.rename(columns={'G':'GP', 'AST':'APG','PTS':'PPG'}, errors="raise") 

    return new  

In [10]:
engine1 = sqlalchemy.create_engine("sqlite:///../../DB/ballbase.db") 
engine2 = sqlalchemy.create_engine('postgresql+psycopg2://brandon:access@localhost:5432/ballbase' ) 
start_ssn = 2024     
stop_ssn = 2025                    

for season in range(start_ssn, stop_ssn): 
    
    URL_averages = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_per_game.html' 
    soup_averages = load_soup(URL_averages) 
    averages = html_df(soup_averages, 'id', 'per_game_stats', ['Rk'])  

    URL_totals = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_totals.html' 
    soup_totals = load_soup(URL_totals) 
    totals = html_df(soup_totals, 'id', 'totals_stats', ['Rk']) 
    totals = process_totals(totals, averages['Player'])   
    
    if season >= 1973: 
        URL_per100 = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_per_poss.html' 
        soup_per100 = load_soup(URL_per100) 
        try:
            per100 = html_df(soup_per100, 'id', 'per_poss', ['Rk', 'Unnamed: 29']) 
        except:
            per100 = html_df(soup_per100, 'id', 'per_poss', ['Rk']) 

    URL_advanced = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_advanced.html'  
    soup_advanced = load_soup(URL_advanced) 
    try: 
        advanced = html_df(soup_advanced, 'id', 'advanced', ['Rk', 'Unnamed: 19', 'Unnamed: 24'])  
    except:
        advanced = html_df(soup_advanced, 'id', 'advanced', ['Rk'])  

    if season >= 1973: 
        dfs = {"averages": averages, "totals": totals, "per100": per100, "advanced": advanced} 
    else: 
        dfs = {"averages": averages, "totals": totals, "advanced": advanced} 

    season_str = str(season) + '_' + str(season+1)[2:]
    master = combine_dfs(dfs) 
    master = clean_if_dirty(master) 
    master = mod_types_pct(master) 
    master = process_averages(master) 
    
    master.to_sql('master_'+season_str, con=engine1, if_exists='replace', index=False) 
    master.to_sql('master_'+season_str, con=engine2, if_exists='replace', index=False)
    
    



  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_team(x)).reset_index(drop=True)


# Clean Tables 

In [11]:
engine1 = sqlalchemy.create_engine("sqlite:///../../DB/ballbase.db") 
engine2 = sqlalchemy.create_engine('postgresql+psycopg2://brandon:access@localhost:5432/ballbase' ) 

start_ssn = 1973   
stop_ssn = 2025     

for season in range(start_ssn, stop_ssn):   
    season_str = str(season) + '_' + str(season+1)[2:] 
    master = pd.read_sql("master"+"_"+season_str, con=engine1)  

    # Cleaning code here ... 
    master.columns = master.columns.str.upper()    
    try:
        master = master.sort_values(by='ppg', ascending=False)
    except:
        master = master.sort_values(by='PPG', ascending=False)
    master.to_sql('master'+'_'+season_str, con = engine1, if_exists = 'replace', index = False)  
        

In [12]:
engine1 = sqlalchemy.create_engine("sqlite:///../../DB/ballbase.db")  
engine2 = sqlalchemy.create_engine('postgresql+psycopg2://brandon:access@localhost:5432/ballbase' ) 
pd.read_sql('master_2024_25', con=engine1).head(20)   

Unnamed: 0,PLAYER,AGE,TEAM,POS,GP,GS,MPG,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,EFG%,FT,FTA,FT%,ORPG,DRPG,RPG,APG,SPG,BPG,TOPG,PF,PPG,T2P,T2PA,T3P,T3PA,TAST,TBLK,TDRB,TFG,TFGA,TFT,TFTA,TMP,TORB,TPF,TPTS,TSTL,TTOV,TTRB,DRTG,ORTG,3PAR,AST%,BLK%,BPM,DBPM,DRB%,DWS,FTR,OBPM,ORB%,OWS,PER,STL%,TOV%,TRB%,TS%,USG%,VORP,WS,WS/48
0,Shai Gilgeous-Alexander,26.0,OKC,PG,43.0,43.0,34.3,11.3,21.3,52.9,2.0,5.7,34.8,9.3,15.6,59.6,57.6,7.4,8.2,90.4,0.9,4.4,5.3,6.1,2.0,1.1,2.7,2.2,32.0,399.0,670.0,86.0,247.0,263.0,47.0,190.0,485.0,917.0,320.0,354.0,1476.0,38.0,95.0,1376.0,88.0,116.0,228.0,103.0,127.0,0.269,30.9,3.3,11.8,3.4,13.6,3.2,0.386,8.4,2.8,6.4,30.8,2.9,9.8,8.3,64.1,34.4,5.2,9.6,0.313
1,Giannis Antetokounmpo,30.0,MIL,PF,37.0,37.0,34.8,12.5,20.7,60.2,0.1,0.7,14.8,12.4,20.0,61.8,60.4,6.4,10.8,59.3,2.2,9.8,12.0,5.8,0.7,1.4,3.5,2.4,31.4,457.0,739.0,4.0,27.0,216.0,52.0,363.0,461.0,766.0,237.0,400.0,1287.0,81.0,88.0,1163.0,27.0,129.0,444.0,108.0,118.0,0.035,32.9,3.8,8.1,2.1,29.3,2.2,0.522,6.0,7.1,3.7,29.8,1.0,12.0,18.6,61.7,36.3,3.3,5.9,0.22
2,Nikola Jokić,29.0,DEN,C,39.0,39.0,36.3,11.4,20.1,56.6,2.1,4.3,47.9,9.3,15.8,59.0,61.8,5.2,6.4,81.0,3.4,9.8,13.1,10.1,1.8,0.6,3.1,2.1,29.9,363.0,615.0,80.0,167.0,393.0,25.0,381.0,443.0,782.0,201.0,248.0,1417.0,131.0,80.0,1167.0,71.0,121.0,512.0,109.0,134.0,0.214,44.8,1.6,14.3,3.4,28.1,2.2,0.317,10.9,10.6,7.5,33.2,2.4,12.0,19.8,65.5,30.2,5.8,9.7,0.328
3,LaMelo Ball,23.0,CHO,PG,30.0,30.0,34.1,10.0,23.9,41.9,4.2,12.6,33.4,5.8,11.3,51.3,50.7,4.7,5.7,82.4,1.0,4.4,5.4,7.5,1.4,0.3,3.8,3.5,28.9,174.0,339.0,126.0,377.0,225.0,10.0,133.0,300.0,716.0,140.0,170.0,1023.0,30.0,106.0,866.0,42.0,114.0,163.0,114.0,111.0,0.527,43.4,1.0,5.0,-0.2,14.4,1.0,0.237,5.2,3.1,1.4,21.3,2.0,12.6,8.5,54.8,37.3,1.8,2.4,0.113
4,Luka Dončić,25.0,DAL,PG,22.0,22.0,35.7,9.8,21.2,46.4,3.4,9.6,35.4,6.4,11.5,55.5,54.4,5.1,6.6,76.7,0.7,7.6,8.3,7.8,2.0,0.4,3.4,2.6,28.1,141.0,254.0,75.0,212.0,172.0,9.0,167.0,216.0,466.0,112.0,146.0,785.0,16.0,58.0,619.0,44.0,75.0,183.0,109.0,116.0,0.455,36.8,1.0,7.1,1.4,22.3,1.2,0.313,5.7,2.3,1.8,24.7,2.7,12.4,12.6,58.4,32.9,1.8,3.0,0.182
5,Kevin Durant,36.0,PHO,PF,34.0,34.0,35.9,9.8,18.7,52.4,2.2,5.7,39.4,7.6,13.1,58.1,58.4,5.3,6.4,83.0,0.5,6.0,6.4,4.1,0.8,1.3,3.1,1.7,27.2,258.0,444.0,76.0,193.0,140.0,45.0,203.0,334.0,637.0,181.0,218.0,1222.0,16.0,57.0,925.0,26.0,105.0,219.0,116.0,115.0,0.303,20.3,3.4,3.4,-0.5,17.7,0.9,0.342,3.9,1.5,2.2,22.1,1.1,12.5,9.9,63.1,30.3,1.7,3.1,0.123
6,Jayson Tatum,26.0,BOS,PF,43.0,43.0,36.3,9.1,20.1,45.4,3.6,10.3,35.4,5.5,9.8,55.9,54.5,5.2,6.6,79.5,0.6,8.5,9.0,5.4,1.3,0.5,2.8,2.3,27.1,236.0,422.0,156.0,441.0,233.0,22.0,364.0,392.0,863.0,225.0,283.0,1560.0,25.0,100.0,1165.0,56.0,121.0,389.0,108.0,117.0,0.511,24.7,1.3,5.8,1.0,25.5,2.6,0.328,4.7,1.7,3.4,22.4,1.8,10.9,13.5,59.0,30.8,3.0,6.0,0.185
7,Tyrese Maxey,24.0,PHI,PG,37.0,37.0,37.9,9.2,21.4,43.2,3.3,9.6,33.9,5.9,11.7,50.8,50.8,4.9,5.7,86.3,0.2,3.2,3.5,6.0,2.0,0.4,2.4,2.2,26.6,220.0,433.0,121.0,357.0,223.0,16.0,120.0,341.0,790.0,182.0,211.0,1403.0,9.0,80.0,985.0,73.0,90.0,129.0,115.0,113.0,0.452,28.7,1.2,2.8,-0.7,10.5,1.1,0.267,3.5,0.7,2.1,20.2,2.6,9.3,5.4,55.8,30.6,1.7,3.3,0.111
8,Anthony Edwards,23.0,MIN,SG,45.0,45.0,36.7,9.0,20.3,44.5,4.2,9.8,42.5,4.9,10.5,46.3,54.7,4.1,5.0,83.4,0.7,4.9,5.7,4.4,1.1,0.5,3.4,1.9,26.3,219.0,473.0,187.0,440.0,198.0,24.0,222.0,406.0,913.0,186.0,223.0,1650.0,33.0,86.0,1185.0,51.0,151.0,255.0,112.0,111.0,0.482,20.9,1.4,3.0,-0.4,14.6,2.0,0.244,3.4,2.3,1.9,18.6,1.5,13.0,8.6,58.6,30.8,2.1,3.9,0.113
9,Anthony Davis,31.0,LAL,C,40.0,40.0,34.8,9.5,18.2,52.4,0.7,2.3,30.4,8.8,15.9,55.6,54.3,6.1,7.8,78.8,2.7,9.1,11.9,3.5,1.3,2.2,2.2,2.0,25.9,353.0,635.0,28.0,92.0,139.0,88.0,365.0,381.0,727.0,245.0,311.0,1393.0,109.0,78.0,1035.0,53.0,88.0,474.0,108.0,120.0,0.127,17.4,5.8,5.6,1.2,29.3,2.3,0.428,4.4,9.0,3.6,27.1,1.9,9.2,19.3,59.9,30.1,2.7,5.9,0.203


# Output to HTML Templates 

In [13]:
import sys, os 
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('functions.py'), '..', '..', 'PY')))   
import functions 

engine1 = sqlalchemy.create_engine("sqlite:///../../DB/ballbase.db") 
seasons = functions.generate_seasons(1949, 2025)  
for season in seasons:
    season_str = season 
    master = pd.read_sql("master_"+season, con=engine1).fillna(0) 
    if '3P' in master.columns: 
        averages = master[['PLAYER', 'POS', 'TEAM', 'AGE', 'PPG', 'RPG', 'APG', 'SPG', 'BPG', 'TOPG', 'MPG', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P%', 'FT', 'FTA', 'FT%', 'PF']] 
        totals = master.loc[:, ['PLAYER', 'TPTS', 'TSTL',
            'TTOV', 'TTRB', 'T2P', 'T2PA', 'T3P', 'T3PA', 'TAST', 'TBLK', 'TDRB',
            'TFG', 'TFGA', 'TFT', 'TFTA', 'TMP', 'TORB', 'TPF']]
        advanced = master.loc[:, ['PLAYER', 'DRTG', 'ORTG',
            '3PAR', 'AST%', 'BLK%', 'BPM', 'DBPM', 'DRB%', 'DWS', 'FTR', 'OBPM',
            'ORB%', 'OWS', 'PER', 'STL%', 'TOV%', 'TRB%', 'TS%', 'EFG%', 'USG%', 'VORP',
            'WS', 'WS/48']] 
    elif 'BPG' in master.columns:  
        averages = master[['PLAYER', 'POS', 'TEAM', 'AGE', 'PPG', 'RPG', 'APG', 'SPG', 'BPG', 'MPG', 'FGA', 'FG%', '2P%', 'FTA', 'FT%', 'PF']] 
        totals = master.loc[:, ['PLAYER', 'POS', 'TEAM', 'AGE', 'TPTS', 'TSTL', 'TTRB', 'T2P', 'T2PA', 'TAST', 'TBLK', 'TDRB', 'TFG', 'TFGA', 'TFT', 'TFTA', 'TMP', 'TORB', 'TPF']]
        advanced = master.loc[:, ['PLAYER', 'POS', 'TEAM', 'AGE', 'DRTG', 'AST%', 'BLK%', 'BPM', 'DBPM', 'DRB%', 'DWS', 'FTR', 'OBPM',
            'ORB%', 'OWS', 'PER', 'STL%', 'TRB%', 'TS%', 'EFG%', 'VORP',
            'WS', 'WS/48']] 
    elif 'RPG' in master.columns: 
        averages = master[['PLAYER', 'POS', 'TEAM', 'AGE', 'PPG', 'RPG', 'APG', 'GP', 'FGA', 'FG%', '2P%', 'FTA', 'FT%', 'PF']]  
        totals = master.loc[:, ['PLAYER', 'POS', 'TEAM', 'AGE', 'TPTS', 'TAST', 'TTRB', 'T2P', 'T2PA', 'TFG', 'TFGA', 'TFT', 'TFTA', 'TPF']] 
        advanced = master.loc[:, ['PLAYER', 'POS', 'TEAM', 'AGE', 'DWS', 'FTR', 'OWS', 'TS%', 'EFG%', 'WS']]   
    else: 
        averages = master[['PLAYER', 'POS', 'TEAM', 'AGE', 'PPG', 'APG', 'FGA', 'FG%', '2P%', 'FTA', 'FT%', 'PF']] 
        totals = master.loc[:, ['PLAYER', 'POS', 'TEAM', 'AGE', 'T2P', 'T2PA', 'TAST', 'TFG', 'TFGA', 'TFT', 'TFTA', 'TPF', 'TPTS']] 
        advanced = master.loc[:, ['PLAYER', 'POS', 'TEAM', 'AGE', 'DWS', 'FTR', 'OWS', 'TS%', 'WS']] 


    contents = [averages, totals, advanced] 
    # Create HTML table version 
    for i in range(len(contents)):
        contents[i] = contents[i].to_html(table_id="table", classes = "table table-hover table-stripped table-bordered table-striped", border=1, index=False)
        contents[i] = contents[i].replace('<table border="1" class="dataframe table table-hover table-stripped table-bordered table-striped" id="table">', '<table border="1" class="dataframe table table-hover table-stripped table-bordered" style="width:100%" id="table">')
    # Create page content 
    for i in range(len(contents)):
        contents[i] = f"""
{{%extends 'Main/TableBase.html'%}}
{{% block table %}} 

{contents[i]}
{{% endblock %}}
"""
    def write_to_file(path, content): 
        file = open(path, "w", encoding="utf-8") 
        file.write(content) 
        file.close 


    file_names=[f"Averages_{season}.html", f"Totals_{season}.html", f"Advanced_{season}.html"] 

    for i in range(len(contents)): 
        path = f"../../templates/Tables/{file_names[i]}" 
        write_to_file(path, contents[i]) 