In [20]:
import requests 
from bs4 import BeautifulSoup
import html
import pandas as pd, numpy as np 
from io import StringIO
import math  
import sqlite3, sqlalchemy 
import re 

pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', 15)  

In [21]:
def clean_player(name): 
    i = name.find('*')   
    name = name[:i] if i > -1 else name 
    return name 

def load_soup(URL): 
    r = requests.get(URL) 
    soup = BeautifulSoup(r.content, 'html') 

    return soup 

def select_tm(group):
    value = group.head(1) 
    if len(group) > 1: 
        value['Tm'] = "" 
        for i in range(1, len(group)):
            value['Tm'] += group.iloc[i]['Tm'] 
            value['Tm'] += '-' if i < len(group)-1 else ''  
    return value 

def html_df(soup, key, value, remove_cols=['Rk'], drop_cols=True, drop_lvl=False): 
    stat_df = soup.find('table', attrs={key:value}) 
    stat_df = pd.read_html(StringIO(str(stat_df))) 
    stat_df = pd.DataFrame(stat_df[0]) 
    if drop_lvl: 
        stat_df.columns = stat_df.columns.droplevel() 
    stat_df = stat_df.groupby('Player').apply(lambda x: select_tm(x)).reset_index(drop=True) 
    stat_df['Player'] = stat_df['Player'].apply(lambda x: clean_player(x)) 
    if drop_cols==True:  
        stat_df = stat_df.drop(remove_cols, axis=1) 

    return stat_df 

def combine_dfs(dfs):
    curr = None
    for name in dfs.keys():
        if curr is None: 
            curr = dfs[name]
        else:
            diff = dfs[name].columns.difference(curr.columns).tolist()
            diff.append('Player') 
            curr = pd.merge(curr, dfs[name].loc[:,diff], on = 'Player') 
    return curr  

def clean_if_dirty(master):
    filtered = master[master['Player']=='Player']
    pos = filtered.index
    master = master.drop(pos, axis=0) 
    master['Player'] = master['Player'].apply(lambda x: clean_player(x)) 
    return master 

def mod_types_pct(master): 
    for col_name in master.columns: 
        if col_name not in ['Player', 'Tm', 'Pos']: 
            master[col_name] = master[col_name].astype("float64")
        if col_name in ['FG%', '3P%', 'FT%', 'eFG%', 'TS%', '2P%']:
            master[col_name] *= 100 
    return master

def process_totals(totals, names): 
    totals = totals.loc[:,['PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB']] 
    totals.columns = 'T' + totals.columns 
    totals['Player'] =  names 

    return totals 

def process_averages(master):
    new = master.rename(columns={'G':'GP', 'MP':'MPG', 'ORB':'ORPG', 'DRB':'DRPG', 'TRB':'RPG', 'AST':'APG', 'STL':'SPG', 'BLK':'BPG', 'TOV':'ToPG', 'PTS':'PPG'}, errors="raise") 
    return new  

In [30]:
engine = sqlalchemy.create_engine('sqlite:///../../DB/ballbase.db') 

start_ssn = 1985   
stop_ssn = 1986                

for season in range(start_ssn, stop_ssn): 
    
    URL_averages = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_per_game.html' 
    soup_averages = load_soup(URL_averages) 
    averages = html_df(soup_averages, 'id', 'per_game_stats', ['Rk']) 

    URL_totals = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_totals.html' 
    soup_totals = load_soup(URL_totals) 
    totals = html_df(soup_totals, 'id', 'totals_stats', ['Rk']) 
    totals = process_totals(totals, averages['Player']) 
    

    URL_per100 = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_per_poss.html' 
    soup_per100 = load_soup(URL_per100) 
    per100 = html_df(soup_per100, 'id', 'per_poss_stats', ['Rk', 'Unnamed: 29'])

    URL_advanced = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_advanced.html'  
    soup_advanced = load_soup(URL_advanced) 
    advanced = html_df(soup_advanced, 'id', 'advanced_stats', ['Rk', 'Unnamed: 19', 'Unnamed: 24']) 

    dfs = {"averages": averages, "totals": totals, "per100": per100, "advanced": advanced} 

    season_str = str(season) + '_' + str(season+1)[2:]
    master = combine_dfs(dfs) 
    master = clean_if_dirty(master) 
    master = mod_types_pct(master) 
    master = process_averages(master) 
    
    master.to_sql('master_'+season_str, con=engine, if_exists='replace', index=False) 
    
    



  stat_df = stat_df.groupby('Player').apply(lambda x: select_tm(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_tm(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_tm(x)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: select_tm(x)).reset_index(drop=True)


# Clean Tables 

In [4]:
engine = sqlalchemy.create_engine('sqlite:///../../DB/ballbase.db') 

start_ssn = 1979   
stop_ssn = 1985   

for season in range(start_ssn, stop_ssn):   
    season_str = str(season) + '_' + str(season+1)[2:] 
    master = pd.read_sql("master"+"_"+season_str, con=engine)  

    # Clean here 
            
    master.to_sql('master'+'_'+season_str, con = engine, if_exists = 'replace', index = False)  
        

In [29]:
pd.read_sql('master_1979_80', con=engine) 

Unnamed: 0,Player,Pos,Age,Tm,GP,GS,MPG,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORPG,DRPG,RPG,APG,SPG,BPG,ToPG,PF,PPG,T2P,T2PA,T3P,T3PA,TAST,TBLK,TDRB,TFG,TFGA,TFT,TFTA,TMP,TORB,TPF,TPTS,TSTL,TTOV,TTRB,DRtg,ORtg,3PAr,AST%,BLK%,BPM,DBPM,DRB%,DWS,FTr,OBPM,ORB%,OWS,PER,STL%,TOV%,TRB%,TS%,USG%,VORP,WS,WS/48
0,Abdul Jeelani,SF,25.0,POR,77.0,,16.7,3.7,7.3,51.0,0.0,0.1,0.0,3.7,7.3,51.5,51.0,2.1,2.6,78.9,1.5,2.0,3.5,1.2,0.5,0.5,1.5,2.0,9.6,288.0,559.0,0.0,6.0,95.0,40.0,156.0,288.0,565.0,161.0,204.0,1286.0,114.0,155.0,737.0,40.0,117.0,270.0,104.0,109.0,0.011,11.6,1.8,1.5,-0.1,13.5,1.5,0.361,1.7,9.6,2.3,18.7,1.5,15.2,11.5,56.3,24.6,1.1,3.8,0.141
1,Adrian Dantley,SF,24.0,UTA,68.0,,39.3,10.7,18.6,57.6,0.0,0.0,0.0,10.7,18.6,57.7,57.6,6.5,7.7,84.2,2.7,4.9,7.6,2.8,1.4,0.2,3.4,3.1,28.0,730.0,1265.0,0.0,2.0,191.0,14.0,333.0,730.0,1267.0,443.0,526.0,2674.0,183.0,211.0,1903.0,96.0,233.0,516.0,110.0,119.0,0.002,12.3,0.3,3.7,-1.6,14.0,1.1,0.415,5.3,8.3,9.5,24.3,1.8,13.5,11.3,63.5,27.8,3.8,10.5,0.189
2,Al Skinner,SG,27.0,PHI,2.0,0.0,5.0,0.5,1.0,50.0,0.0,0.0,,0.5,1.0,50.0,50.0,0.0,0.0,,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.5,1.0,1.0,2.0,0.0,0.0,2.0,0.0,0.0,1.0,2.0,0.0,0.0,10.0,0.0,1.0,2.0,0.0,2.0,0.0,111.0,65.0,0.000,25.4,0.0,-4.8,-0.2,0.0,0.0,0.000,-4.6,0.0,0.0,-0.6,0.0,50.0,0.0,50.0,16.0,0.0,0.0,-0.152
3,Alex English,SF,26.0,IND-DEN,78.0,,30.8,7.1,14.3,49.7,0.0,0.1,33.3,7.1,14.2,49.8,49.8,2.7,3.4,78.9,3.4,4.3,7.8,2.9,0.9,0.8,2.7,2.6,16.9,551.0,1107.0,2.0,6.0,224.0,62.0,336.0,553.0,1113.0,210.0,266.0,2401.0,269.0,206.0,1318.0,73.0,214.0,605.0,107.0,106.0,0.005,13.9,1.4,0.4,-1.2,14.9,2.0,0.239,1.6,11.3,3.3,18.0,1.4,14.8,13.1,53.6,23.4,1.5,5.3,0.105
4,Allan Bristow,SF,28.0,UTA,82.0,,28.1,4.6,9.6,48.0,0.0,0.1,28.6,4.6,9.5,48.2,48.2,2.4,3.0,81.1,2.1,4.2,6.2,4.2,1.1,0.1,2.2,2.6,11.6,375.0,778.0,2.0,7.0,341.0,6.0,342.0,377.0,785.0,197.0,243.0,2304.0,170.0,211.0,953.0,88.0,179.0,512.0,110.0,108.0,0.009,21.4,0.1,0.6,-0.8,16.7,1.2,0.310,1.4,9.0,3.2,16.6,1.9,16.7,13.0,53.4,20.0,1.5,4.4,0.091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,Wes Unseld,C,33.0,WSB,82.0,82.0,36.3,4.0,7.8,51.3,0.0,0.0,50.0,4.0,7.7,51.3,51.4,1.7,2.5,66.5,4.1,9.3,13.3,4.5,0.8,0.7,1.9,3.0,9.7,326.0,635.0,1.0,2.0,366.0,61.0,760.0,327.0,637.0,139.0,209.0,2973.0,334.0,249.0,794.0,65.0,153.0,1094.0,103.0,114.0,0.003,15.6,1.1,2.5,1.5,25.9,4.1,0.328,0.9,11.2,4.3,15.4,1.0,17.3,18.5,54.5,11.7,3.3,8.4,0.135
282,Wiley Peck,SF,22.0,SAS,52.0,,12.1,1.4,3.3,43.2,0.0,0.0,0.0,1.4,3.2,43.7,43.2,0.7,1.1,61.8,1.3,2.3,3.5,0.6,0.3,0.4,0.9,1.9,3.5,73.0,167.0,0.0,2.0,33.0,23.0,117.0,73.0,169.0,34.0,55.0,628.0,66.0,100.0,180.0,17.0,48.0,183.0,108.0,94.0,0.012,6.1,1.9,-4.2,-0.9,19.6,0.5,0.325,-3.4,11.5,-0.2,8.9,1.2,19.9,15.6,46.6,14.5,-0.4,0.3,0.023
283,Willie Smith,PG,26.0,CLE,62.0,,17.0,2.0,5.1,38.4,0.3,1.1,23.9,1.7,3.9,42.6,41.1,0.6,0.8,76.9,0.9,1.0,2.0,4.2,1.2,0.0,1.5,1.8,4.8,104.0,244.0,17.0,71.0,259.0,1.0,65.0,121.0,315.0,40.0,52.0,1051.0,56.0,110.0,299.0,75.0,95.0,121.0,106.0,97.0,0.225,29.3,0.1,0.0,0.7,6.8,1.0,0.165,-0.7,5.4,0.0,12.2,3.2,21.9,6.1,44.2,15.8,0.5,1.0,0.047
284,Winford Boynes,SG,22.0,NJN,64.0,,17.2,3.5,7.3,47.3,0.0,0.1,0.0,3.5,7.2,47.7,47.3,1.6,2.1,76.5,0.8,1.3,2.1,1.5,0.9,0.3,1.5,2.1,8.5,221.0,463.0,0.0,4.0,95.0,19.0,82.0,221.0,467.0,104.0,136.0,1102.0,51.0,132.0,546.0,59.0,96.0,133.0,104.0,99.0,0.009,12.8,0.9,-1.2,0.3,7.7,1.3,0.291,-1.5,4.8,0.3,13.1,2.4,15.4,6.3,51.8,21.8,0.2,1.7,0.072
