# Parse emissions reports

This notebook parses reports scraped from the Texas Commission on Environmental Quality's Air Emission Event Reporting Database, and saves the parsed data to two CSV files.

In [1]:
import lxml.html
import pandas as pd
import glob
import re

In [2]:
report_paths = glob.glob("../inputs/scraped-reports/*.html")
len(report_paths)

1066

In [3]:
def get_cell_text(el):
    return re.sub(r"\s+", " ", el.text_content()).strip()

In [4]:
class Report(object):
    def __init__(self, path):
        self.path = path
        self.report_id = path.split("/")[-1].split(".")[0]
        self.dom = lxml.html.fromstring(open(path).read())
        self.tables = self.dom.cssselect("#content table")
    
    @property
    def metadata(self):
        cells = self.tables[0].cssselect("th, td")
        d = dict(zip(
            map(get_cell_text, cells[0::2]),
            map(get_cell_text, cells[1::2])
        ))
        d["report_id"] = self.report_id
        return d
    
    @property
    def emissions(self):
        arr = []
        for table in self.tables[1:]:
            rows = table.cssselect("tr")
            for row in rows:
                cells = row.cssselect("td")
                if len(cells) > 0:
                    arr.append([self.report_id] + list(map(get_cell_text, cells)))
        return arr

## Test on one report

In [5]:
report = Report(report_paths[0])

In [6]:
report.metadata

{'Action taken': '',
 'Cause': 'Unauthorized discharge at 150 Persimmon Manhole. Due to power outage. 9100 Gallons.',
 'City, County': 'BAYTOWN, HARRIS',
 'Emissions estimation method': '',
 'Event began:': '08/10/2017 8:43PM',
 'Event ended:': '08/10/2017 11:45PM',
 'Physical location': '',
 'Regulated entity RN number': 'RN101611457',
 'Regulated entity name': 'EAST DISTRICT',
 'This is based on the:': 'FINAL REPORT',
 'Type(s) of air emissions event:': 'WASTEWATER BYPASS',
 'report_id': '265500'}

In [7]:
report.emissions

[['265500', 'Sewage', '', '0.0', '9100.0 gals (est.)']]

## Parse all reports

In [8]:
report_metadata = pd.DataFrame([ Report(path).metadata for path in report_paths ])
report_metadata.columns = [ x.strip(":") for x in report_metadata.columns ]
report_metadata.head()

Unnamed: 0,Action taken,Cause,"City, County",Emissions estimation method,Event began,Event ended,Physical location,Regulated entity RN number,Regulated entity name,This is based on the,Type(s) of air emissions event,report_id
0,,Unauthorized discharge at 150 Persimmon Manhol...,"BAYTOWN, HARRIS",,08/10/2017 8:43PM,08/10/2017 11:45PM,,RN101611457,EAST DISTRICT,FINAL REPORT,WASTEWATER BYPASS,265500
1,,Chlorinated Excursion; Cleared Private Line; C...,"HOUSTON, HARRIS",,08/04/2017,08/04/2017,,RN101607596,BELTWAY WWTP,FINAL REPORT,WASTEWATER BYPASS,265502
2,,Scheduled for Further Repairs;,"HOUSTON, HARRIS",,08/04/2017,08/04/2017,,RN101612158,FWSD 23 WWTP,FINAL REPORT,WASTEWATER BYPASS,265503
3,,Unauthorized Discharge at 1016 Applewood manho...,"FRIENDSWOOD, HARRIS",,08/14/2017 7:00PM,08/14/2017 10:00PM,,RN102183340,BLACKHAWK REGIONAL WTP,FINAL REPORT,WASTEWATER BYPASS,265504
4,,Chlorinated Excursion; Cleared Private Line; C...,"HOUSTON, HARRIS",,08/04/2017,08/04/2017,"9400 White Chapel Ln, Houston, TX",RN101614113,KEEGANS BAYOU WWTP,FINAL REPORT,WASTEWATER BYPASS,265505


In [9]:
report_emissions = pd.concat([ pd.DataFrame(Report(path).emissions) for path in report_paths ])
report_emissions.columns = [ "report_id", "contaminant", "authorization", "limit", "amount_released" ]
report_emissions.head()

Unnamed: 0,report_id,contaminant,authorization,limit,amount_released
0,265500,Sewage,,0.0,9100.0 gals (est.)
0,265502,Sewage,,0.0,0.00001 gals (est.)
0,265503,Sewage,,0.0,0.00001 gals (est.)
0,265504,Sewage,,0.0,11000.0 gals (est.)
0,265505,Sewage,,0.0,0.00001 gals (est.)


## Write parsed data

In [10]:
report_metadata.to_csv("../outputs/report-metadata-raw.csv", index=False)
report_emissions.to_csv("../outputs/report-emissions-raw.csv", index=False)

# Do the same for historical (2015 and 2016) reports

Note: These reports correspond only to the industrial facilities that reported Harvey-related emissions.

In [11]:
historical_report_metadata = pd.DataFrame([ Report(path).metadata
    for path in glob.glob("../inputs/scraped-reports-historical/*.html") ])
historical_report_metadata.columns = [ x.strip(":") for x in historical_report_metadata.columns ]
historical_report_metadata.head()

Unnamed: 0,Action taken,Cause,"City, County",Emissions estimation method,Event began,Event ended,Physical location,Regulated entity RN number,Regulated entity name,This is based on the,Type(s) of air emissions event,report_id
0,"During start-up activities, TPC Operators and ...",Reconstruction of the Dehydrogenation Unit #2 ...,"HOUSTON, HARRIS",CEMS units and engineering calculations were u...,02/01/2015,04/19/2015 1:00AM,,RN100219526,HOUSTON PLANT,FINAL REPORT,EMISSIONS EVENT,207108
1,Enterprise recovered as much hydrocarbon as po...,Enterprise conducted maintenance in the Splitt...,"MONT BELVIEU, CHAMBERS",The emissions in this report are based on meas...,12/05/2014 10:00AM,01/16/2015 8:00AM,,RN102984911,ENTERPRISE EAST,FINAL REPORT,MAINTENANCE,207120
2,EU-1592 and U-1092 followed shutdown procedure...,The EU-1592 and U-1092 units were shutdown for...,"BAYTOWN, HARRIS","On-line flare instrumentation, sampling and en...",01/06/2015 4:00PM,01/26/2015 5:00PM,,RN103919817,CHEVRON PHILLIPS CHEMICAL CEDAR BAYOU PLANT,FINAL REPORT,AIR SHUTDOWN,207729
3,The derivative and polyethylene unit purge gas...,This Maintenance Notice is being retracted bec...,"BAYTOWN, HARRIS","On-line flare instrumentation, measurements an...",01/12/2015 8:00AM,02/11/2015 1:30PM,,RN103919817,CHEVRON PHILLIPS CHEMICAL CEDAR BAYOU PLANT,FINAL REPORT,MAINTENANCE,207732
4,Proceeded through startup as quickly as possib...,Emissions associated with Start-up of the plan...,"FREEPORT, BRAZORIA",Flare and cooling tower flowmeters and analyze...,01/08/2015 7:00AM,01/11/2015 6:00AM,,RN100225945,DOW TEXAS OPERATIONS FREEPORT,FINAL REPORT,AIR STARTUP,207797


In [12]:
historical_report_emissions = pd.concat([ pd.DataFrame(Report(path).emissions)
    for path in glob.glob("../inputs/scraped-reports-historical/*.html") ])
historical_report_emissions.columns = [ "report_id", "contaminant", "authorization", "limit", "amount_released" ]
historical_report_emissions.head()

Unnamed: 0,report_id,contaminant,authorization,limit,amount_released
0,207108,Carbon Monoxide,Initial Air Start-Up Notification #207108 Date...,1200.0 bbls,3414.0 lbs (est.)
1,207108,Nitrogen Oxides,Initial Air Start-Up Notification #207108 Date...,1250.0 lbs,3538.0 lbs (est.)
2,207108,Particulate Matter,Initial Air Start-Up Notification #207108 Date...,300.0 lbs,879.0 lbs (est.)
3,207108,Sulfur dioxide,Initial Air Start-Up Notification #207108 Date...,25.0 lbs,69.0 lbs (est.)
4,207108,VOCs,Initial Air Start-Up Notification #207108 Date...,225.0 lbs,636.0 lbs (est.)


In [13]:
historical_report_metadata.to_csv("../outputs/report-metadata-raw-historical.csv", index=False)
historical_report_emissions.to_csv("../outputs/report-emissions-raw-historical.csv", index=False)

---

---

---