In [5]:
import requests 
from bs4 import BeautifulSoup
import html
import pandas as pd, numpy as np 
from io import StringIO
import math  
import sqlite3, sqlalchemy 
import re 

pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', 15)  

In [6]:
def clean_player(name): 
    i = name.find('*')   
    name = name[:i] if i > -1 else name 
    return name 

def load_soup(URL): 
    r = requests.get(URL) 
    soup = BeautifulSoup(r.content, 'html') 

    return soup 

def html_df(soup, key, value, remove_cols=['Rk'], drop_cols=True, drop_lvl=False): 
    stat_df = soup.find('table', attrs={key:value}) 
    stat_df = pd.read_html(StringIO(str(stat_df))) 
    stat_df = pd.DataFrame(stat_df[0]) 
    if drop_lvl: 
        stat_df.columns = stat_df.columns.droplevel() 
    stat_df = stat_df.groupby('Player').apply(lambda x: x.head(1)).reset_index(drop=True) 
    stat_df['Player'] = stat_df['Player'].apply(lambda x: clean_player(x)) 
    if drop_cols==True:  
        stat_df = stat_df.drop(remove_cols, axis=1) 

    return stat_df 

def combine_dfs(dfs):
    curr = None
    for name in dfs.keys():
        if curr is None: 
            curr = dfs[name]
        else:
            diff = dfs[name].columns.difference(curr.columns).tolist()
            diff.append('Player') 
            curr = pd.merge(curr, dfs[name].loc[:,diff], on = 'Player') 
    return curr    

def process_totals(totals, names): 
    totals = totals.loc[:,['PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB']] 
    totals.columns = 'T' + totals.columns 
    totals['Player'] =  names 
    
    return totals 

In [7]:
engine = sqlalchemy.create_engine('sqlite:///../../DB/ballbase.db') 

start_ssn = 1990  
stop_ssn = 1995      

for season in range(start_ssn, stop_ssn): 
    
    URL_averages = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_per_game.html' 
    soup_averages = load_soup(URL_averages) 
    averages = html_df(soup_averages, 'id', 'per_game_stats', ['Rk']) 

    URL_totals = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_totals.html' 
    soup_totals = load_soup(URL_totals) 
    totals = html_df(soup_totals, 'id', 'totals_stats', ['Rk']) 
    totals = process_totals(totals, averages['Player']) 
    

    URL_per100 = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_per_poss.html' 
    soup_per100 = load_soup(URL_per100) 
    per100 = html_df(soup_per100, 'id', 'per_poss_stats', ['Rk', 'Unnamed: 29'])

    URL_advanced = 'https://www.basketball-reference.com/leagues/NBA_' + str(season+1) + '_advanced.html'  
    soup_advanced = load_soup(URL_advanced) 
    advanced = html_df(soup_advanced, 'id', 'advanced_stats', ['Rk', 'Unnamed: 19', 'Unnamed: 24']) 

    dfs = {"averages": averages, "totals": totals, "per100": per100, "advanced": advanced} 

    season_str = str(season) + '_' + str(season+1)[2:]
    master = combine_dfs(dfs)
    for name in dfs.keys():
        dfs[name].to_sql(name+'_'+season_str, con = engine, if_exists = 'replace', index = False)  
        
    master.to_sql('master_'+season_str, con=engine, if_exists='replace', index=False) 
    
    



  stat_df = stat_df.groupby('Player').apply(lambda x: x.head(1)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: x.head(1)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: x.head(1)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: x.head(1)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: x.head(1)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: x.head(1)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: x.head(1)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: x.head(1)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: x.head(1)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: x.head(1)).reset_index(drop=True)
  stat_df = stat_df.groupby('Player').apply(lambda x: x.head(1)).reset_index(drop=True)
  stat_df = stat_df.groupby('Pla

# Clean Tables 

In [8]:
engine = sqlalchemy.create_engine('sqlite:///../../DB/ballbase.db') 

for season in range(start_ssn, stop_ssn):   
    season_str = str(season) + '_' + str(season+1)[2:] 
    
    averages = pd.read_sql("averages"+"_"+season_str, con=engine) 
    totals = pd.read_sql("totals"+"_"+season_str, con=engine) 
    per100 = pd.read_sql("per100"+"_"+season_str, con=engine) 
    advanced = pd.read_sql("advanced"+"_"+season_str, con=engine) 
    master = pd.read_sql("master"+"_"+season_str, con=engine)  

    dfs = {"averages": averages, "totals": totals, "per100": per100, "advanced": advanced, "master": master} 

    for name in dfs.keys():
        filtered = dfs[name][dfs[name]['Player']=='Player']
        pos = filtered.index
        dfs[name] = dfs[name].drop(pos, axis=0) 
        dfs[name]['Player'] = dfs[name]['Player'].apply(lambda x: clean_player(x)) 
        
        for col_name in dfs[name].columns: 
            if col_name not in ['Player', 'Tm', 'Pos']: 
                dfs[name][col_name] = dfs[name][col_name].astype("float64")
        dfs[name].to_sql(name+'_'+season_str, con = engine, if_exists = 'replace', index = False)  
        