**Status**
* Starter notebook for Parsing: Housing Preservation and Development Notice



In [2]:
from bs4 import BeautifulSoup as Soup
from IPython.display import display, HTML
import pandas as pd
import re
import pprint
import json

In [3]:
# CSV export to parse; the first line of the file supplies the column names.
fn = 'procPublicationRequest Oct-Dec 2014 (Updated) - Sheet1-2.csv'
rows = pd.read_csv(filepath_or_buffer=fn, header=0)

In [4]:
# Boolean masks: one per filter criterion, combined row-wise below.
agency = rows['AgencyName'].eq("Housing Preservation and Development")
description = rows['TypeOfNoticeDescription'].eq("Notice")
# Keep only the rows that satisfy both criteria.
target_rows = rows.loc[agency & description]


A `<strong>` element starts a new section.
The document ends when a `<strong>` element contains the notice date, e.g. `Notice Date:\xa0\xa0 October 10\n`.

**tokenize**
* iterate over paragraphs
* if strong detected
  * if date is detected
    * emit transition(section_end)
    * emit transition(date)
    * emit text
  * otherwise
    * emit transition(section_start | section_end)
    * emit strong.text (this is the section name)
* otherwise emit the paragraph text
* after the last paragraph, emit transition(eof)

In [5]:
class Token:
    """A lexical token: a kind (``name``) plus an optional payload (``value``).

    ``value`` defaults to the empty string for tokens that carry no text.
    """

    def __init__(self, name, value=''):
        self.name, self.value = name, value

    def __repr__(self):
        # Rendered as "<name> : <value>" behind two leading tabs.
        return '\t\t{kind} : {payload}'.format(kind=self.name, payload=self.value)
    
def tokenize(fragment, parser='html.parser'):
    """Yield a stream of ``Token`` objects for the ``<p>`` elements of ``fragment``.

    Parameters
    ----------
    fragment : str
        HTML markup to scan.
    parser : str, optional
        Name of the BeautifulSoup parser to use. It is passed explicitly so
        that the result does not depend on which optional parsers happen to
        be installed (and so that bs4 does not warn about guessing one).

    Yields
    ------
    Token
        * ``TEXT``         - text of a paragraph that has no ``<strong>`` child.
        * ``SECTION_NAME`` - text of the ``<strong>`` that heads a paragraph.
        * ``END_SECTION``  - emitted before every ``SECTION_NAME`` except the
          first one, and before every ``DATE``.
        * ``DATE``         - full text of a paragraph whose ``<strong>`` text
          starts with "notice date" (case-insensitive).
        * ``EOF``          - always the last token.
    """
    parsing = False  # True once the first section heading has been seen
    for para in Soup(fragment, parser).find_all('p'):
        strong = para.find('strong')
        if not strong:
            yield Token('TEXT', para.get_text(strip=True))
            continue

        # get_text(strip=True) already strips each text piece, so no extra
        # .strip() is needed on the result.
        text = strong.get_text(strip=True)
        if text.lower().startswith('notice date'):
            yield Token('END_SECTION')
            yield Token('DATE', para.get_text(strip=True))
            continue

        # A heading closes the previous section (if any) and opens a new one.
        if parsing:
            yield Token('END_SECTION')
        parsing = True
        yield Token('SECTION_NAME', text)

    yield Token('EOF')
        

In [6]:
class State:
    """Base class for the states of the token-driven state machine.

    Subclasses are expected to override ``next`` (and ``flush`` if they use
    it); the base implementations only raise.
    """

    def __repr__(self):
        # Upper-cased "State: <ClassName>", e.g. "STATE: SECTION".
        return 'State: {}'.format(self.__class__.__name__).upper()

    def run(self):
        """No-op hook; subclasses may override it."""
        pass

    def next(self, input, record):
        """Consume one token and return the state to use for the next token.

        Raises
        ------
        NotImplementedError
            Always, in the base class. An explicit raise is used instead of
            ``assert 0`` because assertions are removed under ``python -O``,
            which would make this method silently return ``None``.
        """
        raise NotImplementedError('implement me')

    def flush(self, record):
        """Finalize ``record``; the base class has nothing to flush and raises."""
        raise Exception('wtf!')


In [7]:
class Section(State):
    """State that collects section names and text into ``record['sections']``.

    ``record`` must already contain a ``'sections'`` list. Each closed section
    is appended to it as ``{'name': ..., 'text': ...}``.
    """

    def __init__(self):
        State.__init__(self)
        self.name = None   # heading of the section being collected
        self.text = ''     # concatenated TEXT payloads of that section
        self.date = ''
        print ('\n\nState: {}'.format(self))

    def _close(self, record):
        """Append the section collected so far to ``record`` and reset it."""
        record['sections'].append({'name' : self.name, 'text' : self.text })
        self.name, self.text = None, ''

    def next(self, tok, record):
        """Apply ``tok`` to ``record`` and stay in this state.

        Raises
        ------
        Exception
            If ``tok.name`` is not one of the token names handled below.
        """
        print(tok)
        if tok.name == 'SECTION_NAME':
            self.name = tok.value
        elif tok.name == 'END_SECTION':
            self._close(record)
        elif tok.name == 'TEXT':
            self.text += tok.value
        elif tok.name == 'DATE':
            record['date'] = tok.value
        elif tok.name == 'EOF':
            print('EOF detected')
            # Without this, a section that is still open at end of input
            # (no END_SECTION arrived for it) would be lost.
            if self.name is not None or self.text:
                self._close(record)
        else:
            raise Exception( '{}: Unknown token [{}]'.format(self, tok))
        return self

    def flush(self, record):
        """Mark ``record`` with an error naming this state and return it."""
        record['error'] = 'no transition from {}'.format(self)
        return record
                
        

In [8]:
class Init(State):
    """Start state: ignores tokens until a section begins."""

    def next(self, tok, record):
        """Return the state that should handle the token after ``tok``.

        ``BEGIN_SECTION`` switches to a fresh ``Section``. ``SECTION_NAME``
        does the same and is also forwarded to the new ``Section`` so the
        heading is not lost; this matters because ``tokenize`` announces a
        section with ``SECTION_NAME`` and never emits ``BEGIN_SECTION``, so
        without it the machine would stay in ``Init`` forever. Every other
        token is ignored.
        """
        if tok.name == 'BEGIN_SECTION':
            return Section()
        if tok.name == 'SECTION_NAME':
            return Section().next(tok, record)
        return self
        

In [9]:
# Monkey-patch Soup.prettify so it accepts an ``indent_width`` argument.
#
# The true original is recovered from a marker attribute when the patch is
# already installed. Capturing ``Soup.prettify`` blindly on a second run of
# this cell would make ``orig_prettify`` point at the wrapper itself, and the
# wrapper would then call itself without end.
orig_prettify = getattr(Soup.prettify, '_unpatched_prettify', Soup.prettify)
r = re.compile(r'^(\s*)', re.MULTILINE)
def prettify(self, encoding=None, formatter="minimal", indent_width=4):
    """Prettify as the original does, widening each line's indentation.

    The leading whitespace of every line of the original output is repeated
    ``indent_width`` times.
    NOTE(review): ``r`` is a str pattern, so this presumably only works when
    the original returns str (``encoding=None``) - confirm before passing an
    encoding.
    """
    return r.sub(r'\1' * indent_width, orig_prettify(self, encoding, formatter))
prettify._unpatched_prettify = orig_prettify
Soup.prettify = prettify

In [10]:
def xscrape(row):
    """Parse ``row.AdditionalDescription`` with the state machine, verbosely.

    Prints the prettified markup, renders the raw HTML, feeds every token
    from ``tokenize`` through the state machine (starting in ``Init``) and
    pretty-prints the resulting record. The record - or an ``'error'`` entry
    when the description is not a string - is stored as JSON in
    ``row['output']``.

    Returns
    -------
    The same ``row`` object, with ``'output'`` set.
    """
    output = {'sections' : []}
    text = row.AdditionalDescription
    if not isinstance(text, str):
        output = { 'error' : 'source is not a string: {}'.format(text) }
    else:
        # Name the parser explicitly so the dump does not depend on which
        # optional parsers are installed (and bs4 does not warn about it).
        pprint.pprint(Soup(text, 'html.parser').prettify(indent_width=5))
        print ('---'*15)
        display(HTML(text))
        state = Init()
        for tok in tokenize(text):
            state.run()
            state = state.next(tok, output)
            print (tok)
        pprint.pprint(output)
    display(HTML('<hr/>'))

    row['output'] = json.dumps(output)
    return row

In [11]:
def scrape(row):
    """Render ``row.AdditionalDescription`` as HTML and attach a JSON result.

    A string description is displayed, followed by a horizontal rule, and the
    result is an empty dict. Anything else produces an ``'error'`` entry
    describing the offending value. Either way the result is stored as JSON
    in ``row['output']`` and the same ``row`` is returned.
    """
    source = row.AdditionalDescription
    if isinstance(source, str):
        result = {}
        display(HTML(source))
        display(HTML('<hr/>'))
    else:
        result = {'error': 'source is not a string: {}'.format(source)}

    row['output'] = json.dumps(result)
    return row

In [12]:
# Run scrape on every selected row (axis=1 hands each row to the function).
processed_rows = target_rows.apply(scrape, axis=1)

In [13]:
# Collect one entry per processed row whose JSON output carries an 'error'.
errors = []
cols = ['RequestID', 'output', 'AdditionalDescription', 'StartDate', 'EndDate', 'DueDate']
# ``request_id`` rather than ``id``: the loop variable lives at notebook scope
# and would otherwise shadow the ``id`` builtin for every later cell.
for request_id, raw_output, desc, dtStart, dtEnd, dtDue in processed_rows[cols].values:
    output = json.loads(raw_output)
    if output.get('error'):
        errors.append({'RequestID' : request_id, 'error': output, 'desc' : desc,
                       'StartDate' : dtStart, 'EndDate' : dtEnd, 'DueDate' :  dtDue
                      })

In [14]:
# Headline with the number of failures, then one report per failure.
err_summary = '<h1>{} Errors parsing Notice::Housing Preservation and Development</h1>'.format(len(errors))
display(HTML(err_summary))
for error in errors:
    pprint.pprint(error)
    print('\n\n')
    desc = error['desc']
    # Only a string description can be rendered as HTML.
    if isinstance(desc, str):
        display(HTML(desc))
    display(HTML('<hr/>'))


{'DueDate': nan,
 'EndDate': '10/16/2014 0:00:00',
 'RequestID': 20141003106,
 'StartDate': '10/16/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': nan,
 'EndDate': '10/27/2014 0:00:00',
 'RequestID': 20141020105,
 'StartDate': '10/27/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': nan,
 'EndDate': '10/24/2014 0:00:00',
 'RequestID': 20141022107,
 'StartDate': '10/24/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': nan,
 'EndDate': '11/3/2014 0:00:00',
 'RequestID': 20141027101,
 'StartDate': '11/3/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': nan,
 'EndDate': '11/24/2014 0:00:00',
 'RequestID': 20141103108,
 'StartDate': '11/14/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': nan,
 'EndDate': '11/28/2014 0:00:00',
 'RequestID': 20141118108,
 'StartDate': '11/28/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': nan,
 'EndDate': '11/25/2014 0:00:00',
 'RequestID': 20141118110,
 'StartDate': '11/25/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': nan,
 'EndDate': '12/22/2014 0:00:00',
 'RequestID': 20141202102,
 'StartDate': '12/12/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': nan,
 'EndDate': '12/9/2014 0:00:00',
 'RequestID': 20141202111,
 'StartDate': '12/9/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': nan,
 'EndDate': '12/12/2014 0:00:00',
 'RequestID': 20141210101,
 'StartDate': '12/12/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}



