In [None]:
# This scrapes the ACHD covid-19 reporting site.  It was checked into github 
# on Randy's laptop on 4/14/20, and modified on Anne's laptop to try to include new 
# data on race 

# WARNING
### Be sure to change data_date at the bottom of this notebook
### before scraping

In [None]:
import datetime, json, re, time
from selenium import webdriver
import pandas as pd
from sqlitedict import SqliteDict
from collections import defaultdict
from shapely.geometry import Polygon, MultiPolygon, shape, Point
import math
from datetime import timedelta
#!conda install -y selenium
#!pip install sqlitedict

In [None]:
def init():
    """Start a fresh Chrome WebDriver session and store it in the global ``driver``.

    Any session left over from an earlier call is shut down first, so this
    can be re-run without accumulating browser windows or driver processes.
    """
    global driver
    try:
        # quit() (rather than close()) also ends the chromedriver process, so
        # re-running this cell does not leak one process per call.
        driver.quit()
    except Exception:
        # No previous driver (NameError on a fresh kernel) or the old session
        # is already gone; either way there is nothing left to clean up.
        pass
    options = webdriver.ChromeOptions()
    # presumably gives the automated window a distinctive theme colour -- TODO confirm
    options.add_argument('install-autogenerated-theme=255,0,255')
    driver = webdriver.Chrome('chromedriver', options=options)
    #driver.implicitly_wait(10)

In [None]:
def achd_get_display_selections():
    """Open the statistic-type dropdown and return its selectable options.

    Uses the global ``driver``.  The dropdown and its menu container are
    looked up under two known ids/selectors (``ParameterControl_1`` and
    ``ParameterControl_3``); the first one that is found is used.

    Returns:
        dict mapping each option's visible text to the ``span`` element that
        has to be clicked to select it.

    Raises:
        RuntimeError: if the dropdown menu container cannot be found under
            any of the known selectors.
    """
    # Sometimes there's a clear-glass widget that traps all the clicks.  Check if it's there
    # and if so click on it
    for retries in range(0,5):
        try:
            glass_elt = driver.find_element_by_css_selector('body > div.tab-glass.clear-glass.tab-widget')
            if glass_elt:
                print("Clicking away clear-glass widget")
                glass_elt.click()
                time.sleep(1)
            else:
                break
        except Exception:
            # Lookup failed (no glass widget) or the click failed: move on.
            break

    # Need to click once on the thing that looks like a dropdown to open it up to find the selections
    for try_id in ['tableau_base_widget_ParameterControl_1','tableau_base_widget_ParameterControl_3']:
        try:
            stat_type_dropdown = driver.find_element_by_id(try_id)
            if stat_type_dropdown:
                print("Found dropdown with id %r"%(try_id))
                stat_type_dropdown.click()
                break
        except Exception as e:
            print("Failed to find dropdown with id %r, exception %r"%(try_id,e))

    # Need to get the parent element to find the selection options
    display_sel_parent = None
    for try_css in ['body > div.tabMenu.tab-widget.tabMenuComboDropdownTheme.tab-ctrl-formatted-widget.tableau_base_widget_ParameterControl_1_menu.tabMenuNoIcons.tabMenuNoDesc.tabComboBoxMenu',
                    'body > div.tabMenu.tab-widget.tabMenuComboDropdownTheme.tab-ctrl-formatted-widget.tableau_base_widget_ParameterControl_3_menu.tabMenuNoIcons.tabMenuNoDesc.tabComboBoxMenu'
                   ]:
        try:
            display_sel_parent = driver.find_element_by_css_selector(try_css)
            if display_sel_parent:
                print("Found display selector parent with css selector %r"%(try_css))
                break
        except Exception as e:
            print("Failed to find display selector parent with css selector %r, exception %r"%(try_css,e))

    if not display_sel_parent:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise RuntimeError("Could not find the display selection menu under any known css selector")

    # Get all the spans under the parent.  These are the selection options.
    display_sel_spans = display_sel_parent.find_elements_by_css_selector('span')

    # Create a map of type names to selection elements
    ret_map = {x.text:x for x in display_sel_spans}

    return ret_map

In [None]:
def achd_get_display_panes():
    """Look up the dashboard panes that get scanned and return them by name.

    Uses the global ``driver``.  The element ids are hard-coded; any failed
    lookup propagates the exception raised by the driver.

    Returns:
        dict with keys 'map', 'age_bars', 'race_pie' and 'gender_bars', each
        mapped to the corresponding page element.
    """
    by_css = driver.find_element_by_css_selector
    # Lookups run in the order written (dict displays evaluate top to bottom).
    return {
        'map': driver.find_element_by_id('tabZoneId3'),
        'age_bars': by_css('#tabZoneId111'),
        'race_pie': by_css('#view5368947392126195861_4899206460912332246'),
        'gender_bars': by_css('#view5368947392126195861_3289867524579464219'),
    }

In [None]:
def get_age_bars_coord(elt, i):
    """Return the i-th probe point for the age bar chart.

    Probe points sit at a fixed x of 80 and step down 15 pixels per index,
    starting at y=15.  Returns None once the point would fall at or below
    the bottom edge of ``elt`` (read from ``elt.size['height']``).
    """
    probe_y = 15 + i*15
    if probe_y >= elt.size['height']:
        return None
    return Point(80, probe_y)

In [None]:
def get_gender_bars_coord(elt, i):
    """Return the i-th probe point for the gender bar chart.

    Probe points sit at a fixed x of 40 and step down 15 pixels per index,
    starting at y=10.  Returns None once the point would fall at or below
    the bottom edge of ``elt`` (read from ``elt.size['height']``).
    """
    probe_y = 10 + i*15
    if probe_y >= elt.size['height']:
        return None
    return Point(40, probe_y)

In [None]:
def get_race_pie_coord(elt, i):
    """Return the i-th probe point on a circle inside the race pie chart.

    The circle is centred on ``elt`` with a radius of 80% of the smaller
    half-dimension.  Each index advances the angle by pi/100, so indices
    0..200 cover one full turn; any larger index returns None.
    """
    # Number of samples per half turn (pi radians)
    steps_per_half_turn = 100
    if i > steps_per_half_turn*2:
        # Already went all the way around the circle once
        return None

    elt_height = elt.size['height']
    elt_width = elt.size['width']
    mid_x, mid_y = elt_width/2, elt_height/2

    reach = min(mid_x, mid_y)*0.8
    angle = (float(i)/steps_per_half_turn)*math.pi
    return Point(reach*math.sin(angle)+mid_x, reach*math.cos(angle)+mid_y)

In [None]:
def get_age_bars_info_name(info):
    """Return the 'Age Group' entry of a tooltip info dict, or None if absent."""
    return info['Age Group'] if 'Age Group' in info else None

In [None]:
def get_gender_bars_info_name(info):
    """Return the 'Gender' entry of a tooltip info dict, or None if absent."""
    return info['Gender'] if 'Gender' in info else None

In [None]:
def get_race_pie_info_name(info):
    """Return the 'Race' entry of a tooltip info dict, or None if absent."""
    return info['Race'] if 'Race' in info else None

In [None]:
# For each scannable pane, the function that yields the i-th point to probe
# (it returns None once the scan path is exhausted).
pane_path_func_map = dict(
    age_bars=get_age_bars_coord,
    gender_bars=get_gender_bars_coord,
    race_pie=get_race_pie_coord,
)

In [None]:
# For each scannable pane, the function that extracts the item name from a
# tooltip info dict.  The key order here is also the order panes are scanned in.
pane_name_func_map = dict(
    age_bars=get_age_bars_info_name,
    gender_bars=get_gender_bars_info_name,
    race_pie=get_race_pie_info_name,
)

In [None]:
def achd_scan_pane(pane_name):
    """Hover along one pane's scan path and collect the tooltip info found.

    The pane element comes from the global ``disp_pane_map``, the probe
    points from ``pane_path_func_map[pane_name]`` and the item name from
    ``pane_name_func_map[pane_name]``.  Scanning stops when the path
    function returns None.

    If probing a point raises, the pane elements are re-acquired (up to 5
    attempts, 10 seconds apart), ``disp_pane_map`` is replaced, and the scan
    continues with the next point.

    Args:
        pane_name: key present in disp_pane_map and both pane function maps.

    Returns:
        (info_map, pixels_map): info_map maps each item name to the first
        tooltip info dict seen for it; pixels_map maps each item name to the
        list of (x, y) offsets at which it was seen.

    Raises:
        AssertionError: if pane_name is missing from one of the maps.
        Exception: the original probing error, re-raised when the panes
            cannot be re-acquired after 5 attempts.
    """
    global disp_pane_map

    assert pane_name in disp_pane_map, "Missing %s in disp_pane_map"%(pane_name)
    assert pane_name in pane_path_func_map, "Missing %s in pane_path_func_map"%(pane_name)
    assert pane_name in pane_name_func_map, "Missing %s in pane_name_func_map"%(pane_name)

    elt = disp_pane_map[pane_name]
    path_func = pane_path_func_map[pane_name]
    name_func = pane_name_func_map[pane_name]
    i = 0
    retry = 0

    info_map = {}
    pixels_map = defaultdict(list)
    while True:
        try:
            chk_point = path_func(elt, i)
            if chk_point is None:
                break
            location_info = get_info(elt, chk_point.x,chk_point.y)
            if location_info:
                # Get name for this info map for this pane
                name = name_func(location_info)
                if not name:
                    # Still recorded below, under the missing name
                    print("WARNING: pane %r, no name for %r"%(pane_name,location_info))

                if not name in info_map:
                    print(location_info)
                    info_map[name] = location_info

                # Keep track of the pixels where each element was found
                pixels_map[name].append((chk_point.x,chk_point.y))
        except Exception as scan_err:
            print("    %s: Disp retry %d: Exception %r"%(pane_name, retry,scan_err))
            panes_ok=False
            for pane_retry in range(0,5):
                try:
                    # Wait 10 seconds for the display to stabilize
                    time.sleep(10)
                    # Get the display elements to scan
                    disp_pane_map = achd_get_display_panes()
                    panes_ok=True
                    break
                except Exception as pane_err:
                    # Report under pane_name: there is no disp_name in this function
                    print("  %s: Pane retry %d: Exception %r"%(pane_name, pane_retry,pane_err))
            if not panes_ok:
                # Still inside the outer handler, so this re-raises scan_err
                raise
            # Get the new element pointer for the pane
            assert pane_name in disp_pane_map, "Missing %s in disp_pane_map"%(pane_name)
            elt = disp_pane_map[pane_name]
            retry+=1
        # The point that failed is not retried; move on to the next one
        i+=1

    return info_map, pixels_map

In [None]:
def achd_scan_display_type(disp_name):
    """Select one display type in the dropdown and scan every pane for it.

    Updates the globals ``disp_sel_map`` and ``disp_pane_map``.  Selecting
    the display type is attempted up to 5 times; after a successful click
    the panes are looked up up to 5 times, 10 seconds apart.

    Args:
        disp_name: visible text of the dropdown option to select.

    Returns:
        (info_map, pixels_map): each a dict keyed by pane name (the keys of
        pane_name_func_map) holding that pane's achd_scan_pane() results.

    Raises:
        RuntimeError: if the panes could not be obtained; the last error
            seen while trying is attached as the cause.
    """
    global disp_sel_map
    global disp_pane_map

    got_pane = False
    last_error = None
    for disp_retry in range(0,5):
        try:
            # Get the display selections map
            disp_sel_map = achd_get_display_selections()

            assert disp_name in disp_sel_map and disp_sel_map[disp_name], "Missing %s in disp_sel_map"%(disp_name)
            # Select the desired display type
            disp_sel_map[disp_name].click()

            for pane_retry in range(0,5):
                try:
                    # Wait 10 seconds for the display to stabilize
                    time.sleep(10)
                    # Get the display elements to scan
                    disp_pane_map = achd_get_display_panes()
                    got_pane = True
                    break
                except Exception as pane_err:
                    last_error = pane_err
                    print("  %s: Pane retry %d: Exception %r"%(disp_name, pane_retry,pane_err))

            # The selection itself succeeded, so do not click again
            break
        except Exception as disp_err:
            last_error = disp_err
            print("  %s: Disp retry %d: Exception %r"%(disp_name, disp_retry,disp_err))

    if not got_pane:
        # A bare `raise` here has no active exception to re-raise and would
        # only produce a context-free RuntimeError; raise the same type with
        # a useful message and the underlying cause instead.
        raise RuntimeError("Could not select display type %r and locate its panes"%(disp_name,)) from last_error

    info_map = {}
    pixels_map = {}

    # Scan the various pane types
    for pane_name in pane_name_func_map.keys():
        pane_info_map, pane_pixels_map = achd_scan_pane(pane_name)
        info_map[pane_name] = pane_info_map
        pixels_map[pane_name] = pane_pixels_map

    return info_map, pixels_map

In [None]:
def get_info(elt, x,y):
    """Hover at offset (x, y) inside ``elt`` and parse the tooltip shown there.

    The tooltip text is read as a first line holding the name, followed by
    ``key: value`` lines.  Each such line is split at its first colon only,
    so values that themselves contain a colon are kept whole; a line with no
    colon is stored as a key with an empty value.

    Args:
        elt: page element to hover over.
        x, y: offset within ``elt`` passed to move_to_element_with_offset.

    Returns:
        dict of the parsed ``key: value`` pairs plus 'name', 'x' and 'y', or
        None when no tooltip is found or the tooltip has no text.

    Raises:
        AssertionError: if the tooltip is not within 2 pixels of the expected
            (+16, +16) offset from the probed point.
    """
    actions = webdriver.common.action_chains.ActionChains(driver)
    actions.move_to_element_with_offset(elt, x, y)
    actions.perform()
    try:
        popup = driver.find_element_by_css_selector('.tab-tooltipBR')
    except Exception:
        # No tooltip at this position
        return None
    offset_x = popup.location['x'] - elt.location['x'] - x
    offset_y = popup.location['y'] - elt.location['y'] - y
    # presumably guards against reading a tooltip that belongs to a different
    # hover position -- TODO confirm
    assert abs(offset_x-16)<2 and abs(offset_y-16)<2, \
        "Tooltip at unexpected offset (%r, %r) from probe point (%r, %r)"%(offset_x, offset_y, x, y)
    info = popup.find_element_by_css_selector('span').text

    # Split on newlines and ignore blank lines
    lines = [line.strip() for line in info.split('\n') if len(line.strip())]
    if not lines:
        # Tooltip present but empty: treat it like no tooltip
        return None
    location_name = lines[0]
    location_info = {}
    for line in lines[1:]:
        key, _sep, value = line.partition(':')
        location_info[key.strip()] = value.strip()
    location_info['name'] = location_name
    location_info['x'] = x
    location_info['y'] = y
    return location_info

In [None]:
def watch_for_info(elt,timeout):
    """Poll for tooltips for ``timeout`` seconds while the user hovers manually.

    Every tooltip seen is parsed the same way as in get_info(): first line is
    the name, remaining lines are ``key: value`` pairs split at the first
    colon.  The hover position is derived from the tooltip position minus a
    fixed 16 pixel offset, relative to ``elt``.

    Args:
        elt: element the recorded pixel offsets are relative to.
        timeout: how long to keep watching, in seconds.

    Returns:
        (seen_locations, seen_pixels): seen_locations maps each name to the
        first info dict recorded for it; seen_pixels maps each name to the
        set of (x, y) offsets at which it was seen.  Both are empty when no
        tooltip appeared, so the result can always be unpacked.
    """
    seen_locations = {}
    seen_pixels = defaultdict(set)
    end_time = datetime.datetime.now() + timedelta(seconds=timeout)
    # Announce once rather than on every polling iteration
    print("Watching until %s"%(end_time))

    while datetime.datetime.now()<end_time:
        try:
            popup = driver.find_element_by_css_selector('.tab-tooltipBR')
        except Exception:
            # No tooltip on screen right now; keep polling
            continue

        try:
            x = popup.location['x'] - elt.location['x'] - 16
            y = popup.location['y'] - elt.location['y'] - 16
            info = popup.find_element_by_css_selector('span').text
        except Exception:
            # May have lost the element, assume this is the map.  Skip this
            # reading rather than reuse text from an earlier tooltip.
            elt = driver.find_element_by_id('tabZoneId3')
            continue

        # Split on newlines and ignore blank lines
        lines = [line.strip() for line in info.split('\n') if len(line.strip())]
        if not lines:
            continue
        location_name = lines[0]

        # Add pixel location for this location
        seen_pixels[location_name].add((x,y))

        if location_name in seen_locations:
            continue

        location_info = {}
        for line in lines[1:]:
            key, _sep, value = line.partition(':')
            location_info[key.strip()] = value.strip()
        location_info['name'] = location_name
        location_info['x'] = x
        location_info['y'] = y

        seen_locations[location_name] = location_info
        print(location_info)

    return seen_locations, seen_pixels

In [None]:
# Results of this scraping session, one entry per type of info.
# Keys are the display types plus 'Map'.
scrape_info_map, scrape_pixels_map = dict(), dict()

In [None]:
# Start a fresh browser session, then load the ACHD Tableau dashboard.
init()
url = (
    'https://tableau.alleghenycounty.us/t/PublicSite/views/COVID-19AlleghenyCounty/COVID-19'
    '?iframeSizedToWindow=true'
    '&:embed=y'
    '&:showAppBanner=false'
    '&:display_count=no'
    '&:showVizHome=no'
    '&:origin=viz_share_link'
)
driver.get(url)

In [None]:
# Scrape every display type.  A type is skipped when scrape_info_map already
# holds race-pie data for it, so this cell can be re-run after a failure
# without redoing the types that already completed.
for disp_type in ['All Cases', 'All Tests', 'Hospitalizations', 'Admitted to ICU', 'Deaths']:
    if not (disp_type in scrape_info_map and len(scrape_info_map[disp_type]['race_pie'])>0):
        print("Scraping %s"%(disp_type))
        info_map, pixels_map = achd_scan_display_type(disp_type)
        scrape_info_map[disp_type] = info_map
        scrape_pixels_map[disp_type] = pixels_map
    else:
        print("Skipping %s"%(disp_type))

In [None]:
# Inspect the per-pane info for the display type scraped most recently.
info_map

In [None]:
# Scrape map info
#   locations: location name -> tooltip info dict for that location
#   pixels:    location name -> set of (x, y) map offsets where it was seen
locations = {}
# defaultdict(set) instead of a lambda factory: same behaviour, but the
# container can be pickled.
pixels = defaultdict(set)

In [None]:
# Look the map element up directly by id; the two commented-out lines are the
# alternative of going through the display pane map.
# achd_get_display_panes()
# map_elt = disp_pane_map['map']
map_elt = driver.find_element_by_id('tabZoneId3')
# Keep the size: it is saved in the metadata when the results are stored.
map_elt_size = map_elt.size
print(f'map_elt.size: {map_elt_size}')

In [None]:
# Hover over every pixel of the map, row by row.  The first tooltip info seen
# for each location goes into `locations`; every pixel at which a location's
# tooltip appeared goes into `pixels`.
for y in range(map_elt.size['height']):
    for x in range(map_elt.size['width']):
        location_info = get_info(map_elt, x,y)
        if not location_info:
            # No tooltip at this pixel
            continue
        loc_name = location_info['name']
        if loc_name not in locations:
            print(location_info)
            locations[loc_name] = location_info
        pixels[loc_name].add((x,y))

In [None]:
# Sanity check on the pixel scan: if len(locations)<219 we have an issue
# (the manual hover pass below is how missing areas get filled in).
len(locations)

In [None]:
# Run this then manually scan for missing areas.  
# This is generally going to be 
#   Lincoln-Lemington-Belmar (Pittsburgh) (to the east of the zoo)
# It watches for tooltips for 5 seconds (the timeout argument) and gathers
# any entries that you hover over with the mouse during that window.
seen_locations, seen_pixels = watch_for_info(map_elt, 5)

In [None]:
# Inspect the pixel offsets recorded per location during the manual hover pass.
seen_pixels

In [None]:
# Fold the results of the manual hover pass into the scan results: locations
# that the pixel scan missed are added, and the pixel sets are merged.
for loc_name, info in seen_locations.items():
    if loc_name not in locations:
        print("Adding %s: %r"%(loc_name,info))
        locations[loc_name] = info
    pixels[loc_name] = pixels[loc_name] | seen_pixels[loc_name]

#### fixup (don't do this in general)

In [None]:
# When locations are missing, comparing against an earlier scrape can show
# which ones.  This needs a copy of the original achd_covid.db to compare
# against.  Start with the names found in the current scrape.
curr_names = set(locations)
curr_names

In [None]:
# Open the original database that earlier scrape sessions were saved to.
orig_db_path = 'achd_covid.db'
orig_achd_covid_table = SqliteDict(orig_db_path, autocommit=True)
print(f"Opened {orig_db_path}")

In [None]:
# Location names recorded on the map in the second-to-last scrape session
# stored in the original database.
prev_session_key = orig_achd_covid_table['scrape_sessions'][-2]
prev_session_map = orig_achd_covid_table[prev_session_key]['Map']
prev_names = set(prev_session_map.keys())
prev_names

In [None]:
# Names present in the earlier session but absent from the current scrape.
prev_names.difference(curr_names)

In [None]:
# Get an earlier version of a missing item
# achd_covid_table[achd_covid_table['scrape_sessions'][-2]]['Lincoln-Lemington-Belmar (Pittsburgh)']
                 

### Save scrape_info_map to achd_covid_table

In [None]:
# File the map scan results under the 'Map' key, next to the display types.
scrape_info_map.update({'Map': locations})
scrape_pixels_map.update({'Map': pixels})

In [None]:
# IMPORTANT: Manually set data_date by reading it off the ACHD page; it can't be parsed
data_date = '2020-04-17'
# Guard against typos: data_date becomes part of the output database filename
# and is stored in the metadata, so insist on a real, zero-padded YYYY-MM-DD date.
assert datetime.datetime.strptime(data_date, '%Y-%m-%d').strftime('%Y-%m-%d') == data_date, \
    "data_date must be a valid date formatted as YYYY-MM-DD, got %r"%(data_date,)

In [None]:
# Open the SqliteDict database for this data date; the date is part of the
# file name.  autocommit=True is passed so each assignment is committed
# without an explicit commit call.
db_path = f'achd_covid-{data_date}.db'
achd_covid_table = SqliteDict(db_path, autocommit=True)
print(f"Opened {db_path}")

In [None]:
# Persist every section of scrape_info_map (each display type plus 'Map')
# as its own entry in achd_covid_table.
for section_name, section_data in scrape_info_map.items():
    achd_covid_table[section_name] = section_data

In [None]:
# Record the map size and the data date alongside the scraped data.
md_elt = dict(map_size=map_elt_size, data_date=data_date)
achd_covid_table['metadata'] = md_elt

In [None]:
# Read the metadata back to confirm what was stored.
achd_covid_table['metadata']

In [None]:
##### Below this line isn't ready for prime time yet
# Build a copy of scrape_pixels_map whose second-level values are converted
# with dict().  Both `for` clauses now sit in their own comprehension level,
# so the outer key k1 is bound by the comprehension instead of relying on a
# leftover global.
# NOTE(review): under 'Map' the second-level values look like sets of (x, y)
# tuples rather than mappings, so dict() would turn them into {x: y} -- confirm
# whether 'Map' needs separate handling before this is used for saving.
scrape_pixels_save_map = {
    k1: {k2: dict(v2) for k2, v2 in v1.items()}
    for k1, v1 in scrape_pixels_map.items()
}
scrape_pixels_save_map

In [None]:
# Inspect the raw pixel maps as collected during scraping.
scrape_pixels_map

In [None]:
# Rebuild the pixel maps with every second-level value converted via dict().
# NOTE(review): under 'Map' those values look like sets of (x, y) tuples, for
# which dict() yields {x: y} -- confirm this is what should be saved.
scrape_pixels_save_map = {}
for k1, v1 in scrape_pixels_map.items():
    scrape_pixels_save_map[k1] = {k2: dict(v2) for k2, v2 in v1.items()}

scrape_pixels_save_map

In [None]:
# Spot-check the converted entry for one map location.
scrape_pixels_save_map['Map']['Lincoln-Lemington-Belmar (Pittsburgh)']