In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
def _extract_data_from_div(d):
    """
    Returns list of data points
    """
    position = d['data-position']
    team = d['data-team']
    first = d['data-first-name']
    last = d['data-last-name']
    # convert full suffix to player-specific
    suffix = d.find('a', href=True)['href']
    suffix = suffix[suffix.find('players/')+8:]
    
    return [first, last, position, team, suffix]

def _build_df(data):
    """
    Build dataframe by iterating through player list
    """
    df = pd.DataFrame(columns=['first', 'last', 'position', 'team', 'url_suffix'])
    for d in data:
        df.loc[df.shape[0]] = _extract_data_from_div(d)
    return df

def download_nf_players(timeout=30):
    """
    Downloads players from numberfire.com/nba/players

    Parameters
    ----------
    timeout : seconds passed to ``requests.get``; without a timeout a
        stalled connection would block the notebook indefinitely.

    Returns dataframe: first, last, position, team, url_suffix

    Raises
    ------
    requests.HTTPError : if the server answers with a 4xx/5xx status.
    """
    page = requests.get('https://www.numberfire.com/nba/players', timeout=timeout)
    # surface HTTP errors here instead of parsing an error page into an
    # empty player table
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # find data via class attribute
    plys = soup.find_all(class_='all-players__indiv')
    return _build_df(plys)

In [4]:
def get_game_log(url_suffix):
    """
    Returns dict of tables from numberfire.com/nba/players/daily-fantasy/<player-suffix>

    Keys: 'past', 'upcoming'

    Parameters
    ----------
    url_suffix : player-specific part of the URL (e.g. 'a-j-green').

    Returns
    -------
    dict with
        'upcoming': page tables 0 and 1 joined column-wise
        'past': page tables 2 and 3 joined column-wise
    (presumably each pair is the two halves of one on-page table -- verify
    against the site layout).

    Raises
    ------
    ValueError : if the page exposes fewer than the four tables indexed below.
    """
    url = 'https://www.numberfire.com/nba/players/daily-fantasy/' + url_suffix
    tables = pd.read_html(url)

    # tables[0]..tables[3] are indexed below; fail with the URL and count
    # instead of a bare "list index out of range"
    if len(tables) < 4:
        raise ValueError(
            f'Expected at least 4 tables at {url}, found {len(tables)}'
        )

    return {
        'past': pd.concat([tables[2], tables[3]], axis='columns'),
        'upcoming': pd.concat([tables[0], tables[1]], axis='columns')
    }

In [5]:
def concatenate_logs(urls):
    """
    Gets game logs for all players and concatenates tables

    Parameters
    ----------
    urls : iterable of player url suffixes (a list, a pandas Series, ...).

    Returns dict
    Keys:
        'past': dataframe of all past logs, tagged with a 'url_suffix' column
        'upcoming': dataframe of all upcoming logs, tagged with 'url_suffix'
        'errors': list of url suffixes whose download or parsing failed

    'past' / 'upcoming' are empty dataframes when no log could be collected.
    """
    # materialise once so generators work and len() is available for progress
    urls = list(urls)
    total = len(urls)

    past_logs = []
    upcoming_logs = []
    errored_urls = []

    for i, url_suffix in enumerate(urls):
        try:
            logs = get_game_log(url_suffix)

            # add identifying column to each
            logs['past']['url_suffix'] = url_suffix
            logs['upcoming']['url_suffix'] = url_suffix
        except Exception as exc:
            # Best effort: record the failure and keep going. Catching
            # Exception (not a bare except) lets KeyboardInterrupt stop the
            # long-running loop, and the reason is reported, not discarded.
            print(f'Error with {i}, {url_suffix}: {exc!r}')
            errored_urls.append(url_suffix)
        else:
            # only append once both tables were tagged successfully
            past_logs.append(logs['past'])
            upcoming_logs.append(logs['upcoming'])

        # status bar
        print(f"{(i+1)/total:6.1%} completed\r", end="")

    # pd.concat raises on an empty list, so fall back to an empty frame
    return {
        'past': pd.concat(past_logs) if past_logs else pd.DataFrame(),
        'upcoming': pd.concat(upcoming_logs) if upcoming_logs else pd.DataFrame(),
        'errors': errored_urls,
    }
        

In [6]:
# %time r = concatenate_logs(df.loc[:, 'url_suffix'])

In [7]:
# from concurrent.futures import ThreadPoolExecutor

# def multithread_(urls):
#     p = ThreadPoolExecutor(max_workers=16)
#     results = p.map(concatenate_logs, urls)
        
#     return results, p.join()

In [8]:
# from multiprocessing import Pool
# from multiprocessing import cpu_count

# def simple_multiprocess(urls):
#     with Pool(cpu_count()) as p:
#         results = p.map(concatenate_logs, urls)
#     return results

# def multiprocess_(urls):
#     p = Pool(cpu_count())
#     results = p.map(concatenate_logs, urls)
#     p.terminate()
#     p.join()
#     return results

In [9]:
# Download the player index (one HTTP request); %time reports the cost
%time plys = download_nf_players()
# display the resulting players dataframe
plys

CPU times: total: 609 ms
Wall time: 2.21 s


Unnamed: 0,first,last,position,team,url_suffix
0,A.J.,Green,SG,MIL,a-j-green
1,A.J.,Lawson,SG,DAL,a-j-lawson
2,Aaron,Gordon,F,DEN,aaron-gordon
3,Aaron,Holiday,PG,HOU,aaron-holiday
4,Aaron,Nesmith,SF,IND,aaron-nesmith
...,...,...,...,...,...
487,Yuta,Watanabe,PF,MEM,yuta-watanabe
488,Zach,Collins,C,SA,zach-collins
489,Zeke,Nnaji,PF,DEN,zeke-nnaji
490,Ziaire,Williams,SF,MEM,ziaire-williams


In [11]:
# Fetch and combine the game logs for every player in plys.
# Slow: concatenate_logs loads one page per url_suffix.
%time logs = concatenate_logs(plys.loc[:, 'url_suffix'])

Error with 203, jalen-crutcher
CPU times: total: 14.6 s
Wall time: 8min 51s


In [22]:
# with pd.ExcelWriter('20240228 NumberFire.xlsx') as writer:
#     plys.to_excel(writer, sheet_name='players', index=False)
#     logs['past'].to_excel(writer, sheet_name='past', index=False)
#     logs['upcoming'].to_excel(writer, sheet_name='upcoming', index=False)

In [25]:
# Mean FP and MIN over the first 10 logged rows of each player,
# ranked by mean FP (highest first).
(
    logs['past']
    .groupby('url_suffix')
    .head(10)
    .groupby('url_suffix')
    .agg({'FP': 'mean', 'MIN': 'mean'})
    .sort_values('FP', ascending=False)
)

Unnamed: 0_level_0,FP,MIN
url_suffix,Unnamed: 1_level_1,Unnamed: 2_level_1
luka-doncic,63.55,37.348
nikola-jokic,62.68,35.159
shai-gilgeous-alexander,55.34,34.840
domantas-sabonis,54.98,37.220
anthony-davis,53.97,33.935
...,...,...
pete-nance,0.00,2.075
pat-spencer,0.00,1.750
isaiah-wong,0.00,1.170
onuralp-bitim,0.00,2.865


In [16]:
# List the column labels of the combined past-games table
logs['past'].columns

Index(['Date', 'OPP', 'MIN', 'PTS', 'FGM-A', '3PM-A', 'FTM-A', 'REB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'Salary', 'FP', 'Value', 'url_suffix'],
      dtype='object')