In [1]:
import requests
import time
import pandas as pd
import random
from lxml import html
import lmdbm
import lzma
import pickle


In [2]:
http_cache = lmdbm.open("wnv_http_cache.lmdb", flag="c")
def cache_get(url):
    if url not in http_cache:
        http_cache[url] = lzma.compress(pickle.dumps(requests.get(url).text))
        time.sleep(1)
        http_cache.sync()
    return pickle.loads(lzma.decompress(http_cache[url]))

In [3]:

def get_counties_for_year(year):
    text = cache_get(f"http://www.idph.state.il.us/envhealth/wnvsurveillance_data_{year:02}.htm")
    root = html.fromstring(text)
    return set([e.text.strip() for e in root.xpath("//td[@width='475']//table//td[position()=1]/font/a")])

def get_all_counties():
    counties = set()
    for year in range(2, 16):
        counties |= get_counties_for_year(year)
    return counties

all_counties = get_all_counties()


In [4]:

def get_tables(logs):
    human_tables = []
    animal_tables = []

    for county in all_counties:
        for year in range(2, 16):
            def emit(m):
                print(f"{county} {year}: {m}")
                logs.append((county, year, m))
            emit("start")
            try:
                clean_county_name = county.lower().replace(".", "").replace(" ", "")
                table_page_text = cache_get(f"http://www.idph.state.il.us/envhealth/wnvcounty/wnv{clean_county_name}{year:02}.htm")
            except Exception as e:
                emit(("Failed to download ", e))
                continue
            try:
                table_page_root = html.fromstring(table_page_text)
            except Exception as e:
                emit(("Failed to parse ", e))
                continue
            try:
                tables_in_detail_page = [
                    t
                    for t in table_page_root.xpath("//table")
                    if "municipality" in html.tostring(t.xpath("//tr[position()=1]")).lower()
                ]
                
            except Exception as e:
                emit(("Failed to extract ", e))
                continue
            
            if len(tables_in_detail_page) == 1:
                # only animals
                animal_table = tables_in_detail_page[0]
                animal_table = html.tostring(animal_table)
                animal_table = pd.read_html(animal_table, header=0)[0]
                animal_tables.append(animal_table)
                emit("has animals")
            elif len(tables_in_detail_page) == 2:
                # humans and animals
                human_table = tables_in_detail_page[0]
                human_table = html.tostring(human_table)
                human_table = pd.read_html(human_table, header=0)[0]
                human_tables.append(human_table)
                emit("has humans")

                animal_table = tables_in_detail_page[1]
                animal_table = html.tostring(animal_table)
                animal_table = pd.read_html(animal_table, header=0)[0]
                animal_tables.append(animal_table)
                emit("has animals")
            else:
                emit("has no tables")
            
            emit("success")
    
    return pd.concat(human_tables) if human_tables else None, pd.concat(animal_tables) if animal_tables else None


In [5]:
logs = []
human_tables, animal_tables = get_tables(logs)

Jefferson 2: start


NameError: name 'e' is not defined

In [6]:
http_cache.sync()

In [7]:
text = requests.get("http://www.idph.state.il.us/envhealth/wnvsurveillance_data_09.htm").text
root = html.fromstring(text)
[e.text for e in root.xpath("//td[@width='475']//table//td[position()=1]/font/a")]

[' Adams',
 'Bond',
 ' Bureau',
 ' Champaign',
 ' Clinton',
 ' Cook',
 ' DuPage',
 ' Edgar',
 ' Franklin',
 ' Gallatin',
 ' Grundy',
 ' Jackson',
 ' Jersey',
 ' Kane',
 ' Kendall',
 ' Knox',
 ' Lake',
 ' LaSalle',
 ' Macon',
 ' Macoupin',
 ' Madison',
 ' Marion',
 ' Massac',
 ' Mercer',
 ' Ogle',
 ' Perry',
 ' St. Clair',
 ' Sangamon',
 ' Stephenson',
 ' Tazewell',
 ' Warren',
 ' Washington',
 ' Wayne',
 ' Will',
 ' Williamson',
 ' Winnebago']

In [8]:
human_tables.to_csv("/Users/ericliao/Desktop/human_tables.csv")
animal_tables.to_csv("/Users/ericliao/Desktop/animal_tables.csv")