**Status**
* Starter notebook for Parsing: Citywide Administrative Services Notice



In [1]:
from bs4 import BeautifulSoup as Soup
from IPython.display import display, HTML
import pandas as pd
import re
import pprint
import json

In [2]:
# Path of the CSV export to parse; row 0 of the file is used as the header.
fn = 'procPublicationRequest Oct-Dec 2014 (Updated) - Sheet1-2.csv'
rows = pd.read_csv(
    fn,
    header=0,
)

In [3]:
# One boolean mask per filter criterion; a row is kept only when both match.
agency = rows['AgencyName'].eq("Citywide Administrative Services")
description = rows['TypeOfNoticeDescription'].eq("Notice")
target_rows = rows.loc[agency & description]


In [4]:
# Monkey-patch Soup.prettify so the indent width can be chosen by the caller.
#
# The unpatched method is remembered on the replacement function itself, so
# re-running this cell picks the real original back up instead of wrapping the
# wrapper (which would make prettify call itself without end).
orig_prettify = getattr(Soup.prettify, '_unpatched', Soup.prettify)

# Leading run of spaces on each line. Spaces only: `\s` would also swallow
# newlines, so a blank line would be duplicated `indent_width` times.
r = re.compile(r'^( *)', re.MULTILINE)


def prettify(self, encoding=None, formatter="minimal", indent_width=4):
    """Pretty-print the document, widening each line's leading indentation.

    Delegates to the unpatched ``prettify`` and then repeats every line's
    leading run of spaces ``indent_width`` times.

    Args:
        encoding: Passed through to the unpatched ``prettify``. When that
            call yields bytes, the result is re-encoded with this encoding
            after re-indenting.
        formatter: Passed through to the unpatched ``prettify``.
        indent_width: How many times each leading run of spaces is repeated.

    Returns:
        The re-indented markup, as the same type (str or bytes) that the
        unpatched ``prettify`` produced.
    """
    pretty = orig_prettify(self, encoding, formatter)
    widened = r'\1' * indent_width
    if isinstance(pretty, bytes):
        # The str pattern cannot be applied to bytes; round-trip through text.
        return r.sub(widened, pretty.decode(encoding)).encode(encoding)
    return r.sub(widened, pretty)


prettify._unpatched = orig_prettify
Soup.prettify = prettify

In [5]:
def scrape(row):
    """Attach a JSON-encoded ``output`` field to a copy of ``row``.

    Reads ``row.AdditionalDescription``. When it is not a string, ``output``
    records an error message containing the offending value. When it is a
    string, it is rendered as HTML in the notebook, followed by a horizontal
    rule, and ``output`` stays an empty object.

    Args:
        row: A record exposing ``AdditionalDescription`` and supporting
            ``copy()`` and item assignment (a DataFrame row when used with
            ``DataFrame.apply(..., axis=1)``).

    Returns:
        A copy of ``row`` with an added ``'output'`` entry holding the JSON
        string; the row passed in is left untouched.
    """
    # DataFrame.apply does not support functions that mutate the object they
    # are handed, so all changes are made on a copy.
    row = row.copy()
    output = {}
    text = row.AdditionalDescription
    if not isinstance(text, str):
        output = {'error': 'source is not a string: {}'.format(text)}
    else:
        # NOTE(review): the description is rendered as raw, unescaped HTML
        # straight from the data file.
        display(HTML(text))
        display(HTML('<hr/>'))

    row['output'] = json.dumps(output)
    return row

In [6]:
# Run scrape once per row (axis=1) and collect the returned rows.
processed_rows = target_rows.apply(scrape, axis=1)

In [7]:
# Collect one record per processed row whose JSON output carries an error.
errors = []
cols = ['RequestID', 'output', 'AdditionalDescription', 'StartDate', 'EndDate', 'DueDate']
# The loop targets follow the order of `cols`. `request_id` is used rather than
# `id` so the builtin stays usable in the rest of the notebook.
for request_id, output, desc, start_date, end_date, due_date in processed_rows[cols].values:
    output = json.loads(output)
    if output.get('error'):
        # The whole decoded output dict is stored under 'error', not just the message.
        errors.append({'RequestID': request_id, 'error': output, 'desc': desc,
                       'StartDate': start_date, 'EndDate': end_date, 'DueDate': due_date
                      })

In [9]:
# Headline with the number of failures, then a dump of every failed record.
error_count = len(errors)
err_summary = '''<h1>{} Errors parsing Notice::Citywide Administrative Services</h1>'''.format(error_count)
display(HTML(err_summary))
for failure in errors:
    pprint.pprint(failure)
    desc = failure['desc']
    # Render the description only when it is a string.
    if isinstance(desc, str):
        display(HTML(desc))
    display(HTML('<hr/>'))


{'DueDate': nan,
 'EndDate': '10/3/2014 0:00:00',
 'RequestID': 20140929102,
 'StartDate': '10/3/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}


{'DueDate': nan,
 'EndDate': '10/10/2014 0:00:00',
 'RequestID': 20141006101,
 'StartDate': '10/10/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}


{'DueDate': nan,
 'EndDate': '10/8/2014 0:00:00',
 'RequestID': 20141007109,
 'StartDate': '10/8/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}


{'DueDate': nan,
 'EndDate': '10/17/2014 0:00:00',
 'RequestID': 20141014103,
 'StartDate': '10/17/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}


{'DueDate': nan,
 'EndDate': '10/24/2014 0:00:00',
 'RequestID': 20141020107,
 'StartDate': '10/24/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}


{'DueDate': nan,
 'EndDate': '10/31/2014 0:00:00',
 'RequestID': 20141027108,
 'StartDate': '10/31/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}


{'DueDate': nan,
 'EndDate': '11/7/2014 0:00:00',
 'RequestID': 20141103111,
 'StartDate': '11/7/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}


{'DueDate': nan,
 'EndDate': '11/14/2014 0:00:00',
 'RequestID': 20141110105,
 'StartDate': '11/14/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}


{'DueDate': nan,
 'EndDate': '11/21/2014 0:00:00',
 'RequestID': 20141117107,
 'StartDate': '11/21/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}


{'DueDate': nan,
 'EndDate': '11/28/2014 0:00:00',
 'RequestID': 20141124105,
 'StartDate': '11/28/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}


{'DueDate': nan,
 'EndDate': '12/5/2014 0:00:00',
 'RequestID': 20141201102,
 'StartDate': '12/5/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}


{'DueDate': nan,
 'EndDate': '12/12/2014 0:00:00',
 'RequestID': 20141208104,
 'StartDate': '12/12/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}


{'DueDate': nan,
 'EndDate': '12/19/2014 0:00:00',
 'RequestID': 20141215111,
 'StartDate': '12/19/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}


{'DueDate': nan,
 'EndDate': '12/26/2014 0:00:00',
 'RequestID': 20141222102,
 'StartDate': '12/26/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}
