In [19]:
import datetime, dateutil, glob, math, os, re, time, urllib2
from dateutil import parser, rrule, tz

# scrapy web scraper
# To install, run
# !pip install scrapy
# then restart kernel
# If you see an error about zope version, double-check that you restarted the kernel
from scrapy.selector import Selector

def exec_ipynb(url):
    """Execute every code cell of another notebook in this module's globals.

    url -- either an http(s) URL or a local file path to an .ipynb file.

    Handles both the nbformat 4 layout (top-level 'cells', code under 'source')
    and the older layout (cells under 'worksheets'[0], code under 'input').
    Returns None; the effect is whatever names the executed code defines.

    NOTE(review): this runs whatever code the notebook contains, including code
    fetched over the network -- only point it at trusted notebooks.
    """
    import contextlib, json, re

    if re.match(r'https?:', url):
        # Only needed for remote notebooks, so import it lazily
        import urllib2
        # closing() because the urlopen result is not itself a context manager in Python 2
        with contextlib.closing(urllib2.urlopen(url)) as handle:
            nb = handle.read()
    else:
        with open(url) as handle:
            nb = handle.read()

    jsonNb = json.loads(nb)
    #check for the modified formatting of Jupyter Notebook v4
    if jsonNb['nbformat'] == 4:
        cells = jsonNb['cells']
        sourceKey = 'source'
    else:
        cells = jsonNb['worksheets'][0]['cells']
        sourceKey = 'input'

    code = '\n'.join([''.join(cell[sourceKey]) for cell in cells if cell['cell_type'] == 'code'])
    # Tuple form of exec: equivalent to `exec code in globals()` on Python 2
    exec(code, globals())

# Execute the code cells of the ESDR helper notebook in this notebook's global namespace
# (presumably this is where the Esdr class used below is defined -- confirm)
exec_ipynb('python-utils/esdr-library.ipynb')

In [4]:
# First time uploading, create a new client like so:

# Esdr.save_client('esdr-auth-fenceline-uploader.json', 'fenceline.org uploader for timemachine1')

# and then follow the directions it prints, which include visiting esdr.cmucreatelab.org and creating
# a client with the given parameters, and also editing esdr-auth-fenceline-uploader.json to include your
# username and password

# Do not add esdr-auth-fenceline-uploader.json to the git repo

In [6]:
# Build the ESDR client from the saved auth file, then obtain the product (via
# get_or_create_product) that the station devices and feeds below are registered under.
# Both `esdr` and `product` are read as globals by the upload functions further down.
esdr = Esdr('esdr-auth-fenceline-uploader.json')
product = esdr.get_or_create_product('fenceline_org', 'fenceline_org', 'Sensor networks run fenceline.org')
# Bare expression so the product record is shown as the cell output
product

{u'created': u'2015-05-30T12:00:35.000Z',
 u'creatorUserId': 3,
 u'defaultChannelSpecs': {},
 u'description': u'Sensor networks run fenceline.org',
 u'id': 36,
 u'modified': u'2015-05-30T12:00:35.000Z',
 u'name': u'fenceline_org',
 u'prettyName': u'fenceline_org',
 u'vendor': u'fenceline_org'}

In [23]:
def makeIdentifier(prettyName):
    """Turn a human-readable name into an identifier-style string.

    Every run of non-word characters becomes a single underscore, and
    leading/trailing underscores are removed,
    e.g. 'Rodeo South Fenceline' -> 'Rodeo_South_Fenceline'.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence
    return re.sub(r'\W+', '_', prettyName).strip('_')

def tableToArray(sel):
    """Flatten a table selector into a list of rows, each a list of cell strings.

    Rows are the selector's 'tr' children and cells are each row's 'td'
    children. A cell's text is all of its descendant text nodes that are not
    inside a <script>, joined with spaces, with whitespace runs collapsed to
    one space and the ends trimmed.
    """
    def cellText(cell):
        # Gather the text fragments first, then normalise the whitespace
        fragments = cell.xpath('.//text()[not(ancestor::script)]').extract()
        return re.sub(r'\s+', ' ', ' '.join(fragments)).strip()

    return [[cellText(cell) for cell in tr.xpath('td')]
            for tr in sel.xpath('tr')]

def fetchTable(selector, title):
    """Return, as rows of cell strings, the table whose <thead> contains `title`."""
    # The title is substituted directly into the XPath expression
    query = "//table[thead[contains(.,'%s')]]" % title
    return tableToArray(selector.xpath(query))

# Convert to float, if possible
def tryConvertToFloat(datum):
    """Return float(datum) when the conversion succeeds, otherwise datum unchanged.

    Only conversion failures are treated as "not a number"; anything else
    (e.g. KeyboardInterrupt) propagates instead of being silently swallowed.
    """
    try:
        return float(datum)
    except (TypeError, ValueError, OverflowError):
        return datum

def hardConvertToFloat(datum):
    """Return datum as a float, or False when it is not a usable number.

    False is returned when the conversion fails (non-numeric text, None, ...)
    and also when the value parses as NaN. Note that a legitimate 0.0 is
    returned as 0.0, which compares equal to False -- callers that need to
    tell them apart must test with `is False`.
    """
    try:
        val = float(datum)
    except (TypeError, ValueError, OverflowError):
        #if datum is string, upload nothing
        return False

    #if datum is NAN, upload nothing
    if math.isnan(val):
        return False
    return val
    
def parseColumn(table, col, columnPrefix=None, weather=False):
    """Extract one column of readings from a table (list of rows of strings).

    table        -- rows as produced by tableToArray; the first row is skipped
    col          -- index of the column to read; rows too short to have it are ignored
    columnPrefix -- when not None, prepended (with '_') to every channel name
    weather      -- when True, an 'Offline' reading aborts the parse and an
                    empty result ({'channel_names': [], 'data': []}) is returned

    Rows labelled 'Date'/'Data Date' and 'Time'/'Data Time' supply the sample
    timestamp, which is interpreted as America/Los_Angeles local time. Every
    other row becomes a channel named after its label. An 'ND' reading becomes 0,
    unless an earlier row in this column was 'Offline', in which case it becomes
    'Offline'.

    Returns {'channel_names': [...], 'data': [[epoch_seconds, value, ...]]}.
    Raises Exception when no date row or no time row was found.
    """
    dateText = None
    timeText = None
    channelNames = []
    values = []
    sawOffline = False

    for cells in table[1:]:
        if col >= len(cells):
            continue

        label = cells[0]
        if label in ('Date', 'Data Date'):
            dateText = cells[col]
            continue
        if label in ('Time', 'Data Time'):
            timeText = cells[col]
            continue

        name = makeIdentifier(label)
        if columnPrefix is not None:
            name = columnPrefix + '_' + name
        channelNames.append(name)

        value = tryConvertToFloat(cells[col])
        if value == 'ND':
            # "Not detected" counts as zero unless the instrument is already known to be offline
            value = 'Offline' if sawOffline else 0
        elif value == 'Offline':
            if weather == True:
                return {'channel_names':[], 'data':[]}
            sawOffline = True

        values.append(value)

    pacific = tz.gettz('America/Los_Angeles')
    if dateText is None:
        raise Exception('Did not find date')
    if timeText is None:
        raise Exception('Did not find time')

    localStamp = datetime.datetime.strptime(dateText + ' ' + timeText,'%Y-%m-%d %H:%M:%S')
    localStamp = localStamp.replace(tzinfo = pacific)
    unixEpoch = datetime.datetime(1970, 1, 1, tzinfo=tz.tzutc())
    epochSeconds = (localStamp - unixEpoch).total_seconds()

    return {
        'channel_names': channelNames,
        'data': [[epochSeconds] + values]
    }

In [8]:
def getStationFeed(name):
    """Return the ESDR feed for the named station, creating device and feed if missing.

    The device serial number is makeIdentifier(name); the device is looked up
    under the global `product` using the global `esdr` client.
    """
    serial = makeIdentifier(name)

    def findDevice():
        return esdr.get_device_by_serial_number(product, serial)

    station = findDevice()
    if not station:
        # Not registered yet: create it, then look it up again
        esdr.create_device(product, serial, name=name)
        station = findDevice()

    stationFeed = esdr.get_feed(station)
    if not stationFeed:
        esdr.create_feed(station)
        stationFeed = esdr.get_feed(station)
    return stationFeed

def upload(site, html):
    """Dispatch a saved status page to the uploader for its site.

    site -- 'rodeo' or 'richmond'; anything else raises Exception('unknown site')
    html -- the page contents, handed to the site-specific uploader unchanged
    """
    if site not in ('rodeo', 'richmond'):
        raise Exception('unknown site')

    handler = uploadRodeo if site == 'rodeo' else uploadRichmond
    handler(html)

def uploadRodeo(html):
    """Parse a saved Rodeo status page and upload its readings via the global `esdr` client.

    Three uploads are made:
      * 'Rodeo South Fenceline' -- column 1 of the FTIR, UV and TDL tables
      * 'Rodeo North Fenceline' -- column 2 of the same tables
      * 'Rodeo'                 -- column 1 of the OGD table plus the Weather table,
                                   with a 'Wind_Direction' channel taken from any
                                   "Wind is ... <digits>" text found in the weather table

    Raises whatever parseColumn raises (e.g. when a table has no date/time row).
    """
    # Repair problem with commented out <tr> on July 7
    html = re.sub(r'<tr>\s*-->\s*<td align="left"><strong>Date', '--><tr><td align="left"><strong>Date', html)

    selector = Selector(text = html)

    # Fence lines
    for (feed_name, column) in [('Rodeo South Fenceline', 1), ('Rodeo North Fenceline', 2)]:
        # Named so it does not shadow the upload() function
        fencelineUpload = []
        for sensor in ['FTIR', 'UV', 'TDL']:
            table = fetchTable(selector, sensor)
            fencelineUpload.append(parseColumn(table, column, sensor))
        feed = getStationFeed(feed_name)
        print('Uploading to %s' % (feed_name))
        esdr.upload(feed, fencelineUpload)

    # OGDs and Weather
    ogd = fetchTable(selector, 'OGD')
    ogdUpload = parseColumn(ogd, 1, 'OGD')

    weather = fetchTable(selector, 'Weather')
    weatherUpload = parseColumn(weather, 1, None, True)

    # Extract wind direction
    # parseColumn returns an empty 'data' list when the weather station is Offline;
    # in that case there is no sample row to extend, so skip instead of raising IndexError.
    if weatherUpload['data']:
        for row in weather:
            for col in row:
                for match in re.finditer(r'Wind is.*?(\d+)', col):
                    weatherUpload['channel_names'].append('Wind_Direction')
                    weatherUpload['data'][0].append(float(match.group(1)))

    stationUpload = [ogdUpload, weatherUpload]

    feed = getStationFeed('Rodeo')

    print('Uploading to Rodeo')
    esdr.upload(feed, stationUpload)

def uploadRichmond(html):
    """Parse a saved Richmond status page and upload one sample per site table.

    The sample time is read from the "System Status as of ..." text and
    interpreted as America/Los_Angeles local time. The page section starting at
    'Atchison Village Area' must contain exactly 3 site headings and 6 'Chemical'
    tables (two per site: 'Refinery Fence Line' then 'Community'); each table is
    uploaded to its own feed through the global `esdr` client.

    Per table, the uploaded channels are: one per row (label from column 0,
    value from column 1 with any parenthesised suffix removed, 'Nothing detected'
    mapped to 0), a 'Skies' channel from the first row's third column, and one
    channel per "key: value" field found in columns 2+ (except Dew_Point and
    Wind_Origin), whose value is False when it is not numeric.

    Raises Exception when the timestamp, the 6 tables or the 3 site names are not found.
    """
    selector = Selector(text = html)

    stampMatch = re.search(r'System Status as of\s+(\w+\s+.*?(am|pm))', html)
    if stampMatch is None:
        # Fail with a clear message rather than an AttributeError on None
        raise Exception('Did not find system status timestamp')
    timestampText = stampMatch.group(1)
    timestamp = dateutil.parser.parse(timestampText).replace(tzinfo=tz.gettz('America/Los_Angeles'))
    epochTimestamp = (timestamp - datetime.datetime(1970, 1, 1, tzinfo=tz.tzutc())).total_seconds()

    s = selector.xpath("//div[h3/text()[contains(.,'Atchison Village Area')]]")
    # Capture 6 tables, two for each site (fenceline and community)
    tables = s.xpath(".//table[.//text()[contains(.,'Chemical')]]")
    if len(tables) != 6:
        raise Exception('Should have found 6 tables')
    # Capture 3 site names
    sites = s.xpath(".//h3/text()").extract()
    if len(sites) != 3:
        raise Exception('Should have found 3 sites')

    for i in range(0, 6):
        table = tables[i]
        # Tables come in pairs per site; // keeps the index an integer on any Python version
        site_name = sites[i // 2].replace('Area', '')
        site_name += ['Refinery Fence Line', 'Community'][i % 2]
        feed = getStationFeed(site_name)

        t = tableToArray(table)
        # Named so it does not shadow the upload() function
        siteUpload = {'channel_names':[], 'data':[[epochTimestamp]]}

        for row in t:
            # Drop any parenthesised annotation before attempting the numeric conversion
            datum = tryConvertToFloat(re.sub(r'\s*\(.*?\)', '', row[1]))
            if datum == 'Nothing detected':
                datum = 0
            siteUpload['channel_names'].append(makeIdentifier(row[0]))
            siteUpload['data'][0].append(datum)

        # Skies
        siteUpload['channel_names'].append('Skies')
        siteUpload['data'][0].append(t[0][2])

        # Weather
        for row in t:
            for field in row[2:]:
                split = field.split(':')
                if len(split) == 2:
                    key = makeIdentifier(split[0])
                    value = hardConvertToFloat(split[1])
                    if key == 'Dew_Point' or key == 'Wind_Origin':
                        continue
                    siteUpload['channel_names'].append(key)
                    siteUpload['data'][0].append(value)

        esdr.upload(feed, siteUpload)
#uploadRodeo(open('rodeo/2016-12-05/23_02_01-UTC.html').read())

In [24]:
def uploadDate(date):
    """Upload every not-yet-processed saved page for the given day.

    date -- a date/datetime; pages are looked up as '<site>/<YYYY-MM-DD>/*.html'
            (relative to the current directory) for the sites 'rodeo' and 'richmond'.

    A page is skipped when '<page>.done' already exists. Pages shorter than
    10000 characters are reported and not uploaded. In both the uploaded and the
    too-short case an empty '<page>.done' marker is then created, so the page is
    not looked at again. If upload() raises, no marker is written.
    """
    for site in ['rodeo', 'richmond']:
        # glob order is arbitrary; sort so pages are handled in filename order
        paths = sorted(glob.glob('%s/%s/*.html' % (site, date.strftime('%Y-%m-%d'))))

        for path in paths:
            donePath = path + '.done'
            if os.path.exists(donePath):
                continue

            print('Uploading %s' % path)
            with open(path) as htmlFile:
                html = htmlFile.read()

            if len(html) < 10000:
                print('Skipping %s, too short (%d bytes)' % (path, len(html)))
            else:
                upload(site, html)

            # Create the empty marker file and close the handle straight away
            with open(donePath, 'w'):
                pass

In [25]:
# Upload the saved pages for a single day; pages that already have a .done marker are skipped
uploadDate(datetime.datetime(2016,12,12))

Uploading richmond/2016-12-12/23:25:04-UTC.html
Uploading richmond/2016-12-12/23:26:03-UTC.html
Uploading richmond/2016-12-12/23:29:04-UTC.html
Uploading richmond/2016-12-12/23:30:03-UTC.html
Uploading richmond/2016-12-12/23:31:03-UTC.html
Uploading richmond/2016-12-12/23:32:04-UTC.html
Uploading richmond/2016-12-12/23:33:04-UTC.html
Uploading richmond/2016-12-12/23:34:03-UTC.html
Uploading richmond/2016-12-12/23:35:04-UTC.html
Uploading richmond/2016-12-12/23:36:04-UTC.html
Uploading richmond/2016-12-12/23:37:03-UTC.html
Uploading richmond/2016-12-12/23:38:03-UTC.html


KeyboardInterrupt: 

In [None]:
# Upload previous 50 days

# Walk day by day from 50 days ago up to today. Safe to re-run: uploadDate skips
# any page that already has a .done marker next to it.
for date in rrule.rrule(rrule.DAILY, datetime.date.today() - datetime.timedelta(days=50), until=datetime.date.today()):
    uploadDate(date)