**Status**
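* Starter notebook for parsing Public Hearings notices from the Administration for Children's Services. It currently only renders each notice and records rows whose description is not text; no fields are extracted yet.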



In [1]:
from bs4 import BeautifulSoup as Soup
from IPython.display import display, HTML
import pandas as pd
import re
import pprint
import json

In [2]:
# Monkey-patch BeautifulSoup.prettify so its output can be re-indented.
# If this cell has already been run, Soup.prettify is our wrapper rather than
# the library method; recover the saved original from the wrapper so that
# re-running the cell does not wrap the wrapper and multiply the indent again.
orig_prettify = getattr(Soup.prettify, '_orig_prettify', Soup.prettify)
# Captures the leading whitespace of each line (MULTILINE makes ^ match per line).
r = re.compile(r'^(\s*)', re.MULTILINE)
def prettify(self, encoding=None, formatter="minimal", indent_width=4):
    """Return prettified markup with each line's leading whitespace widened.

    Calls the original ``prettify`` and then repeats every line's leading
    whitespace ``indent_width`` times.

    Args:
        self: the soup/tag being rendered.
        encoding: passed through to the original ``prettify``.
        formatter: passed through to the original ``prettify``.
        indent_width: how many times each line's leading whitespace is repeated
            (presumably the resulting spaces per nesting level, assuming the
            library indents by one space per level; confirm against bs4).

    Returns:
        The re-indented output of the original ``prettify``.

    NOTE(review): the pattern is a ``str`` regex, so this presumably only works
    when the original returns text (``encoding=None``); confirm before passing
    an encoding.
    """
    return r.sub(r'\1' * indent_width, orig_prettify(self, encoding, formatter))
# Remember the unpatched method on the wrapper so a re-run can find it again.
prettify._orig_prettify = orig_prettify
Soup.prettify = prettify

In [3]:
# CSV export of publication requests; the path is relative, so it is resolved
# against the notebook's working directory.
fn = 'procPublicationRequest Oct-Dec 2014 (Updated) - Sheet1-2.csv'
# header=0: the first row of the file supplies the column names.
rows = pd.read_csv(fn, header=0)

In [4]:
# Boolean masks: one selects the agency, the other the notice type.
agency = rows['AgencyName'].eq("Administration for Children's Services")
description = rows['TypeOfNoticeDescription'].eq("Public Hearings")
# Keep only the rows that satisfy both masks.
target_rows = rows.loc[agency & description]


In [5]:
def scrape(row):
    """Render one notice's HTML description and attach a JSON ``output`` field.

    Args:
        row: a record exposing ``AdditionalDescription`` as an attribute and
            supporting ``copy()`` and item assignment (a pandas Series when
            called through ``DataFrame.apply(..., 1)``).

    Returns:
        A copy of ``row`` with an extra ``output`` entry holding a JSON string:
        ``{}`` when the description is a string (it is displayed as HTML,
        followed by a horizontal rule), or ``{"error": ...}`` when it is not
        (for example NaN for a missing cell).

    Side effects:
        Displays the description via IPython when it is a string.
    """
    text = row.AdditionalDescription
    if isinstance(text, str):
        # Nothing is extracted yet; the notice is only rendered for inspection.
        output = {}
        display(HTML(text))
        display(HTML('<hr/>'))
    else:
        output = {'error': 'source is not a string: {}'.format(text)}

    # Write to a copy: pandas does not support functions passed to apply()
    # mutating the object they are handed.
    result = row.copy()
    result['output'] = json.dumps(output)
    return result

In [6]:
# Run scrape once per matching notice; axis=1 applies the function row by row.
processed_rows = target_rows.apply(scrape, axis=1)

In [7]:
# Collect every processed row whose scrape output carries an 'error' entry,
# together with the fields needed to inspect it later.
errors = []
cols = ['RequestID', 'output', 'AdditionalDescription', 'StartDate', 'EndDate', 'DueDate']
for rec in processed_rows[cols].values:
    # request_id (not `id`) so the builtin id() is not shadowed in later cells.
    request_id, output, desc, dtStart, dtEnd, dtDue = rec
    # 'output' was stored as a JSON string by scrape; decode it back to a dict.
    output = json.loads(output)
    if output.get('error'):
        # Note: 'error' holds the whole decoded output dict, not just the message.
        errors.append({'RequestID' : request_id, 'error': output, 'desc' : desc,
                       'StartDate' : dtStart, 'EndDate' : dtEnd, 'DueDate' :  dtDue
                      })

In [8]:
# Headline with the number of rows that failed, rendered as HTML.
err_summary = "<h1>{} Errors parsing Public Hearings::Administration for Children's Services</h1>".format(len(errors))
display(HTML(err_summary))

# For each failure: dump the record, then render its description when it is
# text, and close with a horizontal rule.
for error in errors:
    pprint.pprint(error)
    print('\n\n')
    desc = error['desc']
    if isinstance(desc, str):
        display(HTML(desc))
    display(HTML('<hr/>'))
