In [1]:
# takes 60s per 40 api calls.
# download player tournament history. skip if already downloaded. player_id list is from BridgePowerRatings.

# previous steps:
# none

# next steps:
# create sql db of player tournament data.

# todo:
# Improve the method of issuing api calls only when needed. Write update info to file?
# Identify the correct player_id from list for each elite player name. Issue is that multiple players can have same name.

In [2]:
import pandas as pd
import pathlib
from collections import defaultdict
import mlBridgeLib

In [3]:
# Override pandas display options for this notebook.
# NOTE(review): the specific options are set inside mlBridgeLib.pd_options_display(), which is not shown here.
mlBridgeLib.pd_options_display()

In [4]:
# Data directory layout: <root>/acbl holds ACBL API downloads, <root>/bpr holds BridgePowerRatings files.
rootPath = pathlib.Path('e:/bridge/data')
acblPath = rootPath / 'acbl'
bprPath = rootPath / 'bpr'
# Make sure both directories exist (no error if they already do).
for data_dir in (acblPath, bprPath):
    data_dir.mkdir(parents=True, exist_ok=True)

In [5]:
# takes 60s per 40 api calls.
# call acbl api to retrieve player's tournament data.
# todo: unexplained variance between 'total' in the json and sessions_count. Why? Variance is roughly 0 to -15.
import requests
import urllib
import time
import json
def download_tournaments(df,bearer_file):
    headers = {'accept':'application/json', 'Authorization':bearer[len('Authorization: '):]}
    start_time = time.time()
    get_count = 0
    for n,r in df.iterrows():
        player_ids = r['player_id']
        assert type(player_ids) is list, [type(player_ids), player_ids]
        #print(f"{time.strftime('%X')} player_id list:{player_ids} Name:{r['Name']} Rank:{r['Rank']}")
        print(f"{time.strftime('%X')} Name:{r['Name']} player_id list:{player_ids}")
        for player_id in player_ids:
            if player_id.startswith('tmp:') or player_id.startswith('#'): # somehow #139 crept into player_id
                print(f'Skipping player_id:{player_id}')
                continue
            else:
                print(f'Processing player_id:{player_id}')
            dirPath = acblPath.joinpath('players/'+player_id+'/tournaments')
            if dirPath.exists():
                session_file_count = len(list(dirPath.glob('*.session.json')))
                print(f'dir exists: file count:{session_file_count} dir:{dirPath}')
                if session_file_count > 0: # todo: temp?
                    print(f'dir not empty -- skipping')
                    continue
            else:
                print(f'Creating dir:{dirPath}')
                dirPath.mkdir(parents=True,exist_ok=True)
                session_file_count = 0
            path = 'https://api.acbl.org/v1/tournament/player/history_query'
            query = {'acbl_number':player_id,'page':1,'page_size':50,'start_date':'1900-01-01'}
            params = urllib.parse.urlencode(query)
            url = path+'?'+params
            sessions_count = 0
            while url:
                get_count += 1
                print(f"{n}/{len(df)} gets:{get_count} rate:{round((time.time()-start_time)/get_count,2)} url:{url}")
                #time.sleep(1) # throttle api calling. Maybe not needed as api is taking longer than 1s.
                response = requests.get(url, headers=headers)
                if response.status_code in [500]: # 500 is unknown response code. try skipping player
                    print(f'500: count:{get_count} skipping') # 4476921 - Thx Merle.
                    next_page_url = None
                    break
                assert response.status_code == 200, (n, url, response.status_code)
                json_response = response.json()
                json_pretty = json.dumps(json_response, indent=4)
                #print(json_pretty)
                next_page_url = json_response['next_page_url']
                sessions_total = json_response['total'] # is same for every page
                if sessions_total == session_file_count: # sometimes won't agree because identical sessions. older scores?
                    print(f'File count correct: {dirPath}: terminating {player_id} early.')
                    next_page_url = None
                    sessions_count = sessions_total
                    break
                for data in json_response['data']:
                    sessions_count += 1 # todo: oops, starts first one at 2. need to move
                    session_id = data['session_id']
                    filePath = dirPath.joinpath(session_id+'.session.json')
                    if filePath.exists():
                        print(f'{sessions_count}/{sessions_total}: File exists: {filePath}: skipping')
                        continue
                    print(f'{sessions_count}/{sessions_total}: Writing:',filePath)
                    with open(filePath,'w',encoding='UTF8') as f:
                        f.write(json.dumps(data, indent=4))
                url = next_page_url
            if sessions_count != sessions_total:
                print(f'Session count mismatch: {dirPath}: variance:{sessions_count-sessions_total}')

In [6]:
# Load the API bearer secret from disk. ACBL calls it a "Personal Access Token"; available at https://api.acbl.org/
# The file content is kept as raw bytes in `bearer`.
bearer_file = pathlib.Path('../../../acbl/acbl_api_authorization_bearer_secret.txt')
bearer = bearer_file.read_bytes()

In [10]:
# Read every pickle file downloaded from BridgePowerRatings and download its players' tournament history.
# todo: rewrite to pass unique list of player_ids, names.
# todo: implement download history of pairs and each partners.
# NOTE: pd.read_pickle can execute arbitrary code; only load .pkl files that come from a trusted source.
files = sorted(bprPath.glob('*.pkl')) # glob order is arbitrary; sort so progress is reproducible
for f in files:
    # glob already yields paths that include bprPath; joining it again duplicates the prefix when bprPath is relative.
    df = pd.read_pickle(f)
    display(f'Processing file:{f} len:{len(df)}',df.head())
    if 'player_id' in df:
        assert 'Name' in df
        # todo: sort df (e.g. Rank) to better track progress of downloads?
        #df = df['Rank'].astype('float32').sort_values()
        download_tournaments(df,bearer_file)
print(f'Done: Total files processed:{len(files)}')

Done: Total files processed:0
