In [None]:
# takes 6h
# ACBL throttles requests after roughly 45 club files(?), then returns 403 Forbidden. Their throttling is a moving target.
# Performs following steps:
# 1) Read club-results file for each club in list. List of clubs is predefined in config.py.
# 2) Create list of club-result-detail files for each club.
# 3) Download the html file containing game details unless its json file already exists locally.
# 4) Save downloaded game details to local html file. They can be deleted/archived once json is created.
# 5) Search the game details file for the script tag containing embedded json info (var data = ...).
# 6) Write json data to json file.

# Next steps:
# acbl-club-results-json-to-sql.ipynb creates a sql script file from each json file. The scripts are executed producing a single sql db.

# Previous steps:
# Requires a list of club numbers to be processed. See config.py for instructions.

# todo:
# implement error checking for request? Throttling produces errors such as 'Forbidden'.
#    try
#        r = requests.get(url)
#        r.raise_for_status()
#    except requests.exceptions.HTTPError as e:
#        print (e.response.text)
# automatically download newer club file if a new month has started. Download new data up to the 1st day of the current month.

In [None]:
import io
import json
import pathlib
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, inspect
from sqlalchemy_utils.functions import database_exists, create_database

import config # contains configurations/settings.
import mlBridgeLib

In [None]:
# override pandas display options
# NOTE(review): pd_options_display lives in the project-local mlBridgeLib module;
# which pandas options it actually changes is not visible from this notebook.
mlBridgeLib.pd_options_display()

In [None]:
# Root of the local data tree; everything downloaded from ACBL is stored in its 'acbl' subdirectory.
rootPath = pathlib.Path('e:/bridge/data')
acblPath = rootPath / 'acbl'

In [None]:
# takes 2m to read 2517 existing (local file) clubs. 2h to download all files.
# May require 3 or more attempts to get all urls.
# DEL/S club-results\*.html files to refresh club-results. e.g. Do DEL/S each month.
# Request club html files. read_local chooses between reading the cached local html and downloading it again.
# There doesn't seem to be any permanent failures to read any club. Just try again.
#
# Result of this cell:
#   htmls       -- dict mapping club number (as str) to the html text of that club's results page.
#   failed_urls -- urls that could not be downloaded (network error or non-200 status). Rerun to retry them.

read_local = config.option_read_local
#read_local = False # force requesting of all club html files from web. do this each month to update club results.
htmls = {}
total_clubs = len(config.option_club_numbers)
failed_urls = []
headers={"user-agent":None} # Not sure why this has become necessary
# NOTE(review): a header value of None is presumably meant to make requests omit its default User-Agent -- confirm.
request_timeout = 60 # seconds. Without a timeout a stalled connection would hang this cell forever.
for ncn,cn in enumerate(sorted(config.option_club_numbers), start=1):
    url = config.option_base_url+str(cn)+'/'
    file = url.replace(config.option_acbl_url,'')+str(cn)+'.html'
    print(f'Processing file ({ncn}/{total_clubs}): {file}')
    path = acblPath.joinpath(file)
    # Local files of 200 bytes or less are treated as invalid and are requested again.
    if read_local and path.exists() and path.stat().st_size > 200:
        html = path.read_text(encoding="utf-8")
        print(f'Reading local {file}: len={len(html)}')
    else:
        print(f'Requesting {url}')
        try:
            r = requests.get(url,headers=headers,timeout=request_timeout)
        except requests.exceptions.RequestException as e:
            # Connection errors and timeouts must not abort a multi-hour run; record the url and move on.
            print(f'Error: request failed: {e} {url}')
            failed_urls.append(url)
            continue
        if r.status_code != 200:
            print(f'Error: status:{r.status_code} {url}')
            time.sleep(60) # obsolete?
            failed_urls.append(url)
            continue
        html = r.text
        # Only report file creation once the response is known to be good.
        print(f'Creating {file}: len={len(html)}')
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(html, encoding="utf-8")
        time.sleep(1) # need to self-throttle otherwise acbl returns 403 "forbidden". obsolete?
    htmls[str(cn)] = html
print(len(failed_urls),failed_urls)
print(f"Done: Total clubs processed:{total_clubs}: Total url failures:{len(failed_urls)}")

In [None]:
# takes 6m
# Extract list of result html links embedded within each club's results file. Download any new results.
#
# Result of this cell:
#   dfs       -- dict mapping club number to a DataFrame built from the club's results table, with
#                Club, EventID, ResultID and ResultUrl columns added.
#   ClubInfos -- dict mapping club number to a dict with 'Name', 'Location' and (when present) 'WebSite'.

dfs = {}
ClubInfos = {}
total_htmls = len(htmls)
# Only links to result detail pages are wanted. Raw string so that \d is a regex escape and not an
# invalid string escape. Compiled once because the pattern is loop-invariant.
details_href_re = re.compile(r"^/club-results/details/\d*$")
for n,(cn,html) in enumerate(htmls.items(), start=1):
    print(f'Processing club ({n}/{total_htmls}) {cn}')
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever is installed -- consider pinning one.
    bs = BeautifulSoup(html)
    html_table = bs.find('table')
    if html_table is None:
        print(f'Invalid club-result for {cn}')
        continue
    # /html/body/div[2]/div/div[2]/div[1]/div[2]
    ClubInfo = bs.find('div', 'col-md-8')
    name_tag = ClubInfo.find('h1') if ClubInfo is not None else None
    location_tag = ClubInfo.find('h5') if ClubInfo is not None else None
    # A page without the expected club header is skipped, the same way a page without a table is.
    if name_tag is None or location_tag is None or not name_tag.contents or not location_tag.contents:
        print(f'Invalid club-info for {cn}')
        continue
    ci = {}
    ci['Name'] = name_tag.contents[0].strip() # get first text and strip
    ci['Location'] = location_tag.contents[0].strip() # get first text and strip
    website_tag = ClubInfo.find('a')
    if website_tag is not None and website_tag.has_attr('href'):
        ci['WebSite'] = website_tag['href'] # get href of first a
    ClubInfos[cn] = ci
    print(f'{ci}')
    # assumes first table is our target
    # Wrapped in StringIO because passing literal html text to read_html is deprecated in recent pandas.
    d = pd.read_html(io.StringIO(str(html_table)))
    assert len(d) == 1
    # 'Unnamed: 6' is dropped and replaced by the ResultID/ResultUrl columns added below.
    df = d[0].drop(columns='Unnamed: 6')
    df.insert(0,'Club',cn)
    df.insert(1,'EventID','?')
    # href starts with '/', which is stripped before appending to option_acbl_url.
    hrefs = [config.option_acbl_url+link.get('href')[1:] for link in html_table.find_all('a', href=details_href_re)]
    df['ResultID'] = [result.rsplit('/', 1)[-1] for result in hrefs]
    df['ResultUrl'] = hrefs
    dfs[cn] = df
print(f"Done: Total clubs processed:{len(dfs)}")

In [None]:
# takes 1h15m for zero updates. 6h for a 1 month update.
# Skips files already downloaded. Ordering is newest result to oldest.
# For each html result file, extract the embedded json data, write json to file.
#
# For every result url of every club in dfs:
#   - if config.option_read_local is set and the json file exists, the json file is read;
#   - otherwise the html is downloaded, saved, and the json embedded in a 'var data = ...;' script is written.
# A failed download ends processing of that club's remaining results and is recorded in failed_urls.

# todo:
# sort enumerate(filtered_clubs.items())

starting_nclub = 0 # beginning slice
ending_nclub = 0 # ending slice. 0 means all files

total_execution_time = 0 # todo:
total_executed = 0 # todo:
if ending_nclub == 0: ending_nclub = len(dfs)
filtered_clubs = dfs # todo
total_filtered_clubs = len(filtered_clubs)
total_urls_processed = 0
total_urls_to_process = '?' # todo: total sum of all result files in all clubs
total_local_files = '?' # todo: total number of all local files to be processed
total_local_files_read = 0
start_time = time.time() # todo:

total_clubs = len(filtered_clubs)
failed_urls = []
#headers={"user-agent":None} # Not sure why this has become necessary. Failed 2021-Sep-02 so using Chrome curl user-agent.
headers={"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
request_timeout = 60 # seconds. Without a timeout a stalled connection would hang this cell forever.
for ndf,(kdf,df) in enumerate(filtered_clubs.items()):
    if ndf < starting_nclub or ndf >= ending_nclub:
        print(f"Skipping club #{ndf} {kdf}") # obsolete when filtered_clubs works
        continue
    ndf += 1 # 1-based from here on, for progress messages only
    total_results = len(df['ResultUrl'])
    for nurl,(cn,url) in enumerate(zip(df['Club'],df['ResultUrl']), start=1):
        total_urls_processed += 1
        # e.g. club-results/details/123 becomes club-results/<club>/details/123.html
        html_file = url.replace(config.option_acbl_url,'').replace('club-results','club-results/'+str(cn))+'.html'
        json_file = html_file.replace('.html','.data.json')
        print(f'Processing club ({ndf}/{total_clubs}): result file ({nurl}/{total_results}): {html_file}')
        html_path = acblPath.joinpath(html_file)
        json_path = acblPath.joinpath(json_file)
        html = None
        data_json = None
        if config.option_read_local and json_path.exists():
            if html_path.exists():
                print(f'Found local html file: {html_file}')
            else:
                print(f'Missing local html file: {html_file}')
            try:
                with open(json_path, 'r', encoding='utf-8') as f:
                    data_json = json.load(f)
            except json.JSONDecodeError as e:
                # e.g. a file truncated by an interrupted run. data_json stays None so the cleanup
                # below deletes the file and it is downloaded again on the next run.
                print(f'Error: invalid local json file:{json_path}: {e}')
                data_json = None
            else:
                total_local_files_read += 1
                print(f'Reading local ({total_local_files_read}/{total_local_files}) file:{json_path}: len:{json_path.stat().st_size}')
        else:
            print(f'Requesting {url}')
            try:
                r = requests.get(url,headers=headers,timeout=request_timeout)
            except requests.exceptions.RequestException as e:
                # Connection errors and timeouts are handled like a bad status: record and skip the rest of the club.
                print(f'Error: request failed: {e} {url}')
                failed_urls.append(url)
                break
            html = r.text
            print(f'Creating {html_file}: len={len(html)}')
            # some clubs return 200 (ok) but with instructions to login (len < 200).
            # skip clubs returning errors or tiny files. assumes one failed club result will be true for all club's results.
            if r.status_code != 200 or len(html) < 200:
                failed_urls.append(url)
                break
            html_path.parent.mkdir(parents=True, exist_ok=True)
            html_path.write_text(html, encoding="utf-8")
            bs = BeautifulSoup(html) # do this once and reuse? does it matter?
            for script in bs.find_all('script'):
                if not script.string: # not defined for all scripts
                    continue
                vardata = re.search('var data = (.*);\n', script.string)
                if not vardata:
                    continue
                try:
                    parsed_json = json.loads(vardata.group(1))
                except json.JSONDecodeError as e:
                    # Unparsable embedded data must not abort the whole run; keep looking at other scripts.
                    print(f'Error: invalid embedded json in {html_file}: {e}')
                    continue
                data_json = parsed_json
                print(f"Writing {json_path}")
                with open(json_path, 'w', encoding='utf-8') as f:
                    json.dump(data_json, f, indent=2)
                # Informational only, so a missing key is reported as None instead of raising.
                bbo_tournament_id = data_json.get("bbo_tournament_id") if isinstance(data_json, dict) else None
                print(f'bbo_tournament_id: {bbo_tournament_id}')
            #time.sleep(1) # obsolete?
        # if no data_json file read, must be an error so delete both html and json files.
        if not data_json:
            html_path.unlink(missing_ok=True)
            json_path.unlink(missing_ok=True)
        print(f'Files processed ({total_urls_processed}/{total_local_files_read}/{total_urls_to_process})')
print(len(failed_urls),failed_urls)
print(f"Done: Totals: clubs:{total_clubs} urls:{total_urls_processed} local files read:{total_local_files_read}: failed urls:{len(failed_urls)}")