In [38]:
import pandas as pd
from bs4 import BeautifulSoup
from collections import deque, OrderedDict
import os,datetime, pytz, calendar, urllib2, json, re

In [39]:
#area name and associated feed IDs
areas = [{
    "Atchison Village" : {
        4910 : {},
        4909 : {}},
    "North Richmond" : {
        4911 : {},
        4912 : {}},
    "Point Richmond" : {
        4913 : {},
        4914 : {}}
    },{
    "Rodeo" : {
        4901 : {},
        4902 : {},
        10011 : {}}
    }];

#chemical names and associated health threshold
channels = OrderedDict([("Benzene", 1), ("Black_Carbon", 5), ("Ethylbenzene", 60), ("Hydrogen_Sulfide", 8),("Sulfur_Dioxide", 75),
            ("Toluene", 70),("Xylene", 50)])
channelNames = channels.keys()

detectionLimits = pd.DataFrame.from_items([
        ('Fenceline', [5,None,None,30,5,5,5]),
        ('Community', [0.5,0.05,0.5,2,None,0.5,0.5]),
        ('Hound', [30,None,None,30,None,30,30]),  
    ],orient='index', columns=channelNames)

In [40]:
#-------------------------------------------------------------------
#
#HELPER FUNCTIONS
#
#-------------------------------------------------------------------

In [41]:
def isRodeo(feed):
    return 4900 < feed < 4903

In [42]:
def isRichmondCommunity(feed):
    return feed == 4910 or feed == 4912 or feed == 4914

In [43]:
def getFeedType(feed):
    feedTypes = {"Fenceline" : [4909,4911,4913,4901,4902], "Community" : [4910,4912,4914],
                 "Hound": [10011]}
    for t in feedTypes:
        if feed in feedTypes[t]:
            return t
#getFeedType(10011)

In [44]:
def addPrefix(feed, chemical):
    if isRodeo(feed) and chemical == "Hydrogen_Sulfide":
        return "TDL_"+chemical
    elif isRodeo(feed):
        return "UV_"+chemical
    elif isRichmondCommunity(feed) and chemical == "Xylene":
        return "m_p_Xylene,o_Xylene"
    else:
        return chemical

In [45]:
def slicePrefix(chan):
    if re.match('[A-Z]+_',chan):
        return chan[chan.find('_') + 1:]
    else:
        return chan

In [46]:
def unit(chemical):
    if chemical == "Black_Carbon":
        return "µg/m³"
    else:
        return "ppb"

In [47]:
def generateHealthFactor(value, chemical):
    healthLimit = channels[chemical]
    factor = value/healthLimit
    if factor % 1 >= 0.5:
        prepend = "almost "
    else:
        prepend = "over "
    return prepend + str(int(round(factor))) + "x"
#generateHealthFactor(1.59655, "Benzene")

In [48]:
def noData(n):
    ret = []
    for i in range (0,n):
        ret.append("No data")
    return ret

In [49]:
#returns start and end timestamps of provided date
def getEpochTimeBounds(d, duration=1):
    dt = assignPacificTimeZone(datetime.datetime(d.year,d.month,d.day))

    start = calendar.timegm(dt.utctimetuple())
    end = calendar.timegm((dt + datetime.timedelta(days=duration)).utctimetuple())
    return {'start' : start, 'end': end}
#getEpochTimeBounds(datetime.date(2017,2,1))

In [50]:
#attach DST aware timezone offset
#Note: does not convert time
def assignPacificTimeZone(dt):
    pacific = pytz.timezone("US/Pacific")
    dt = pacific.localize(dt)
    return dt

In [51]:
#convert either a unix timestamp or a datetime with tzinfo to a datetime in Pacific time
def convertToPacific(time):
    if not isinstance(time,datetime.datetime):
        time = datetime.datetime.fromtimestamp(time,tz=pytz.utc)
    pacific = pytz.timezone("US/Pacific")
    inPacific = time.astimezone(pacific)
    return inPacific

In [52]:
def fillDatum(inputArr,hole):
    data = inputArr.popleft()
    if not (re.match('0.00+',data) or re.search('\s0x',data) or re.match('nan',data)):
        hole.string = data

In [53]:
#-------------------------------------------------------------------
#
#ANALYSIS FUNCTIONS
#
#-------------------------------------------------------------------

In [54]:
#def calcDownTime(df):

In [55]:
def maxHourlyAverage(df, windFeed, channel):
    ret = []
    halfHourInSecs = 30 * 60
    def avg(x, delta):
        ser = df.iloc[(df.index >= x - delta) & (df.index <= x + delta), 0]
        return ser.mean()
    
    healthLimit = 1
    df['avg'] = pd.Series(data = df.index, index = df.index).apply(lambda x: avg(x,delta=halfHourInSecs))
    maxAvg = df.nlargest(1,'avg')
    maxAvgValue = maxAvg.avg.iloc[0]
    ret.append("%.2f" % maxAvgValue + unit(channel))
    
    #don't calculate time and wind data if max average was 0 (below detection limit)
    if maxAvgValue != 0:
        ret.append(generateHealthFactor(maxAvgValue,channel))
    
        hourStart = convertToPacific(maxAvg.index[0] - halfHourInSecs).strftime('%I:%M')
        hourEnd = convertToPacific(maxAvg.index[0] + halfHourInSecs).strftime('%I:%M%p')
        ret.append(hourStart + "-" + hourEnd)
    
        #get wind data for hour with highest average
        bounds = {'start':maxAvg.index[0] - halfHourInSecs,'end':maxAvg.index[0] + halfHourInSecs}
        wind = makeDataFrameFromEsdr(windFeed,"Wind_Direction,Wind_Speed_MPH","Wind_Direction,Wind_Speed_MPH",{'bounds':bounds})
        
        if len(wind) == 0 or wind['Wind_Direction'].mean() == 0:
            ret.append("No data")
        else:
            #break into quadrants and select the prevailing one
            quads = [0,90,180,270,360]
            quad_names = ['NE','SE','SW','NW']
            wind['Compass_Dir'] = pd.cut(wind['Wind_Direction'],quads,labels=quad_names)
            direction = wind.groupby('Compass_Dir').sum().nlargest(1,'Wind_Speed_MPH').index[0]
            ret.append(direction)
    else:
        ret.extend(['0.00','0.00','0.00'])
    return ret
#maxHourlyAverage(df, 4914, "Benzene")

In [56]:
#total time, in hours, that a detection was present of given chemical or aggregated set of chemicals
def calcHoursDetected(df, chemical, sampleFrequency = 1):
    detected = df.loc[df[chemical] > 0, [chemical]]
    fullDecimal = sampleFrequency * len(detected) / float(60)
    return "%.2f" % fullDecimal + " hours"
#calcHoursDetected(df, "Benzene")

In [57]:
#total time, in hours, that detection was greater than health threshold of given chemical
def calcHoursAboveHealthLimit(df, chemical, sampleFrequency = 1):
    limit = channels[chemical]
    detected = df.loc[df[chemical] > limit, [chemical]]
    fullDecimal = sampleFrequency * len(detected) / float(60)
    return "%.2f" % fullDecimal + " hours"
#calcHoursAboveHealthLimit(df, "Benzene")

In [58]:
def calcDailyMean(df, chemical, nd=0):
    if nd == 0:
        fullDecimal = df[chemical].mean()
    else:
        #substitute readings of 0 for the passed-in non-detect value
        #(which should represent that chemicals' detection limit)
        fullDecimal = df.replace(0.0,nd)[chemical].mean()
    return "%.2f" % fullDecimal + unit(chemical)
#calcDailyMean(df,"Benzene", 0.5)

In [59]:
#-------------------------------------------------------------------
#
#ESDR FUNCTIONS
#
#-------------------------------------------------------------------

In [60]:
def loadFeeds():
    for index in range(0,2):
        for locale in areas[index]:
            for feed in areas[index][locale]:
                url = "https://esdr.cmucreatelab.org/api/v1/feeds/%s" % feed
                ogChannels = json.loads(urllib2.urlopen(url).read())['data']['channelBounds']['channels'].keys()
                for chan in ogChannels:
                    formattedChannel = slicePrefix(chan)
                    if formattedChannel in channels:
                        areas[index][locale][feed][formattedChannel] = chan

In [61]:
def makeDataFrameFromEsdr(feed, formattedChannel, esdrChannel, timeOptions={}):
    if timeOptions.get('bounds') == None:
        duration = timeOptions.get('duration') or 1
        bounds = getEpochTimeBounds(timeOptions.get('day'), duration)
    else:
        bounds = timeOptions.get('bounds')
    url = "https://esdr.cmucreatelab.org/api/v1/feeds/%s/channels/%s/export?from=%s&to=%s&format=json" % (feed, esdrChannel, bounds['start'], bounds['end'])
    try:
        r = json.loads(urllib2.urlopen(url).read())
        print "loaded " + str(len(r['data'])) + " data points for feed " + str(feed) + ", channel " + esdrChannel + ", time " + str(bounds['start'])
    except:
        print "error loading data from ESDR: feed " + str(feed) + ", channel " + esdrChannel + ", time " + str(bounds['start'])
    isXylene = esdrChannel == "m_p_Xylene,o_Xylene"
    cols = ['Time']
    cols.extend(esdrChannel.split(",") if isXylene else formattedChannel.split(","))
    df = pd.DataFrame(r["data"],columns=cols).set_index(['Time'])
    if(isXylene):
        df["Xylene"] = df["m_p_Xylene"] + df["o_Xylene"]
        del df["m_p_Xylene"]
        del df["o_Xylene"]
    df = df.replace("[^0-9]+",0,regex=True)
    return df
#makeDataFrameFromEsdr(4911,"Benzene","Benzene",{'day':datetime.date(2017,3,18)})

In [62]:
#-------------------------------------------------------------------
#
#REPORTING FUNCTIONS
#
#-------------------------------------------------------------------

In [63]:
def compileCalculations(df, feed, chemical):
    sampleRate = 5 if isRodeo(feed) else 1
    windFeed = 4903 if isRodeo(feed) else feed
    t = getFeedType(feed)
    dl = detectionLimits[chemical][t]
    result = [t, str(dl) + unit(chemical)]
    if len(df) == 0:
        result.extend(noData(8))
    else:
        result.extend(maxHourlyAverage(df, windFeed, chemical))
        result.append(calcHoursDetected(df, chemical, sampleRate))
        result.append(calcHoursAboveHealthLimit(df,chemical, sampleRate))
        result.append(calcDailyMean(df, chemical))
        result.append(calcDailyMean(df,chemical,dl))
    return result
#compileCalculations(df,4902,"Benzene")

In [64]:
def fillInThreeMonitorTemplate(calculations, dataHoles):
    for i, hole in enumerate(dataHoles):
        if i % 3 == 0:
            fillDatum(calculations[2],hole)
        elif (i-1) % 3 == 0:
            fillDatum(calculations[1],hole)
        else:
            fillDatum(calculations[0],hole)
#fillInThreeMonitorTemplate(calculations, dataHoles)

In [65]:
def fillInTwoMonitorTemplate(calculations, dataHoles):
    for i, hole in enumerate(dataHoles):
        if i % 2 == 0:
            fillDatum(calculations[0],hole)
        else:
            fillDatum(calculations[1],hole)

In [66]:
loadFeeds()
pacific = pytz.timezone("US/Pacific")
dt = datetime.datetime.now(tz=pytz.utc).astimezone(pacific)
date = dt.date() - datetime.timedelta(days=1)
dateDir = 'archived/' + date.strftime('%Y-%m-%d')
if not os.path.exists(dateDir):
    os.makedirs(dateDir)
for a, area in enumerate(areas):
    for locale in area:
        allCalculations = []
        for chemical in channels:
            if a == 1 and (chemical == "Ethylbenzene" or chemical == "Black_Carbon"):
                continue
            chemCalculations = []
            for feed in area[locale]:
                df = makeDataFrameFromEsdr(feed,chemical,addPrefix(feed,chemical),{'day':date})
                chemCalculations.append(deque(compileCalculations(df, feed, chemical)))
            for i in range(0,len(chemCalculations)):
                if i > len(allCalculations) - 1:
                    allCalculations.append(deque([]))
                allCalculations[i].extend(chemCalculations[i])
        areaName = "Richmond" if a==0 else "Rodeo"
        soup = BeautifulSoup(open("templates/" + areaName  + ".html"), "lxml")
        prettyDate = "Day of " + date.strftime('%B %d, %Y')
        for hole in soup.findAll("td", class_='locale'):
            hole.string = locale
        for hole in soup.findAll("td", class_='date'):
            hole.string = prettyDate
        dataHoles = deque(soup.findAll("td", class_='data'))
        if a==0:
            fillInTwoMonitorTemplate(allCalculations, dataHoles)
        elif a==1:
            fillInThreeMonitorTemplate(allCalculations,dataHoles)

        html = soup.prettify("utf-8")
        outfile = 'yesterday/' + re.sub('\W+', '_', locale) + '.html'
        with open(outfile, "wb") as file:
            file.write(html)
        archived = 'archived/' + date.strftime('%Y-%m-%d') +'/'+ re.sub('\W+', '_', locale) + '.html'
        with open(archived, "wb") as file:
            file.write(html)

loaded 1430 data points for feed 4913, channel Benzene, time 1489734000
loaded 1430 data points for feed 4914, channel Benzene, time 1489734000
loaded 0 data points for feed 4913, channel Black_Carbon, time 1489734000
loaded 1430 data points for feed 4914, channel Black_Carbon, time 1489734000
loaded 59 data points for feed 4914, channel Wind_Direction,Wind_Speed_MPH, time 1489752060
loaded 0 data points for feed 4913, channel Ethylbenzene, time 1489734000
loaded 1430 data points for feed 4914, channel Ethylbenzene, time 1489734000
loaded 1430 data points for feed 4913, channel Hydrogen_Sulfide, time 1489734000
loaded 1430 data points for feed 4914, channel Hydrogen_Sulfide, time 1489734000
loaded 61 data points for feed 4914, channel Wind_Direction,Wind_Speed_MPH, time 1489809900
loaded 1430 data points for feed 4913, channel Sulfur_Dioxide, time 1489734000
loaded 61 data points for feed 4913, channel Wind_Direction,Wind_Speed_MPH, time 1489781400
error loading data from ESDR: feed 49

UnboundLocalError: local variable 'r' referenced before assignment