**Status**
* Some entries seem to be missing context

* ![](http://yuml.me/822a77d9)


In [5]:
from bs4 import BeautifulSoup as Soup
from IPython.display import display, HTML
import pandas as pd
import re
import pprint
import json

In [6]:
class Token:
    def __init__(self, name, value=''):
        self.name = name
        self.value = value
    
    def __repr__(self):
        return '{} : {}'.format(self.name, self.value)
        
# we will use these tokens:
# NOTICE -> when NOTICE IS HEREBY GIVEN is detected
# MATTER -> when IN THE MATTER OF is detected
def tokenize(fragment):
    soup = Soup(fragment)
    for para in soup.find_all('p'):
        text = para.get_text(strip=True)
        
        if 'notice is hereby given' in text.lower():
            yield Token('NOTICE')
            
            # 'in the matter' could be part of the same text
            # that contained the 'notice is hereby given'
            # so check immediately after yielding NOTICE
            if 'in the matter' in text.lower():
                loc = text.lower().find('in the matter')
                yield Token('TEXT', text[:loc])
                yield Token('MATTER')
                yield Token('TEXT', ' ' + text[loc:])
            else:
                yield Token('TEXT', text) 
        elif 'in the matter' in text.lower():
            loc = text.lower().find('in the matter')
            yield Token('TEXT', text[:loc])
            yield Token('MATTER')
            yield Token('TEXT', ' ' + text[loc:])
        else:
            yield Token('TEXT', text)


In [7]:
fn = 'procPublicationRequest Oct-Dec 2014 (Updated) - Sheet1-2.csv'
rows = pd.read_csv(fn, header=0)

In [8]:
class State:
    def run(self):
        # implement this for printing trace statements
        assert 0, "implement me"
        
    def next(self, input):
        assert 0, "implement me"
        
    def flush(self, record):
        raise Exception('wtf!')


In [9]:
class Matters(State):
    
    def __init__(self):
        State.__init__(self)
        self.matter = ''
    
    def run(self):
        print('Process MATTERS')
        
    def flush(self, record):
        record['matters'].append({ 'matter' : self.matter })
        self.matter = ''
        
    def next(self, tok, record):
        if tok.name == 'TEXT':
            self.matter += tok.value
        if tok.name == 'MATTER':
            self.flush(record)
        return self
            

class Notice(State):
    
    def __init__(self, notice=''):
        State.__init__(self)
        self.notice = notice
        
    def run(self):
        print('NOTICE')
        
    def next(self, tok, record):
        if tok.name == 'TEXT':
            self.notice += tok.value
            return self
        if tok.name == 'MATTER':
            record['context'] = self.notice
            return Matters()
        return self
    
    def flush(self, record):
        record['error'] = 'no transition out of NOTICE'
        
    
class Init(State):
    
    def run(self):
        print('INIT')
        
    def next(self, tok, record):
        if tok.name == 'NOTICE':
            return Notice()
        return self
                                  
    def flush(self, record):
        record['error'] = 'no transition out of INIT'
                                  

In [10]:
agency = rows['AgencyName'] == "Citywide Administrative Services"
description = rows['TypeOfNoticeDescription'] == "Public Hearings"
target_rows = rows[agency & description]


In [11]:
def scrape(row):
    output = {}
    if not isinstance(row.AdditionalDescription, str):
        output = { 'error' : 'source is not a string: {}'.format(row.AdditionalDescription) }
    else:   
        state = Init()
        record = { 'matters' :[], 'context' : '' }
        for tok in tokenize(row.AdditionalDescription):
            state = state.next(tok, record)
        state.flush(record)
        output = record

    row['output'] = json.dumps(output)
    return row

In [12]:
processed_rows = target_rows.apply(scrape,1)

In [13]:
errors = []
cols = ['RequestID', 'output', 'AdditionalDescription', 'StartDate', 'EndDate', 'DueDate']
for rec in processed_rows[cols].values:
    id, output, desc, dtStart, dtEnd, dtDue = rec
    output = json.loads(output)
    if output.get('error', None):
        errors.append({'RequestID' : id, 'error': output, 'desc' : desc, 
                       'StartDate' : dtStart, 'EndDate' : dtEnd, 'DueDate' :  dtDue
                      })
        continue
    print('RequestID: {}'.format(id))
    pprint.pprint(output)
    display(HTML(desc))
    
    display(HTML('<hr/>'))

RequestID: 20141006102
{'context': 'NOTICE IS HEREBY GIVEN THAT A REAL PROPERTY ACQUISITIONS AND '
            'DISPOSITIONS PUBLIC HEARING, in accordance with Section 824 of '
            'the New York City Charter, will be held on Wednesday October '
            '22, 2014 at 10:00 a.m., 22 Reade Street, 2ndfloor conference '
            'room, Borough of Manhattan',
 'matters': [{'matter': ' IN THE MATTER OFa lease for The City of New '
                        'York, as Tenant, of approximately 10,826 rentable '
                        'square feet of space on part of the second\xa0 '
                        'floor in a building located at 1 Teleport '
                        'Drive\xa0 (Block 2165, Lot 170), in the Borough '
                        'of Staten Island, for the Traffic Enforcement '
                        'Division of the Police Department to use as an '
                        'office.The proposed lease shall be for a period '
                        'of twenty (20) 

RequestID: 20141105101
{'context': 'NOTICE IS HEREBY GIVEN THAT A REAL PROPERTY ACQUISITIONS AND '
            'DISPOSITIONS PUBLIC HEARING, in accordance with Section 824 of '
            'the New York City Charter, will be held on November 24, 2014 at '
            '10:00 a.m., 22 Reade Street, 2ndfloor conference room, Borough '
            'of Manhattan.',
 'matters': [{'matter': ' IN THE MATTER OFa lease renewal and amendment '
                        'agreement for The City of New York, as Tenant, of '
                        'approximately 21,741 rentable square feet of '
                        'space on the tenth floor in a building located at '
                        '200 Varick Street (Block 520, Lot 1) in the '
                        'Borough of Manhattan for the Board of Elections '
                        'to use as an office, or any other use that the '
                        'Department of Citywide Administrative Services '
                        'may determine.The 

RequestID: 20141118109
{'context': 'NOTICE IS HEREBY GIVEN THAT A REAL PROPERTY ACQUISITIONS AND '
            'DISPOSITIONS PUBLIC HEARING, in accordance with Section 824 of '
            'the New York City Charter, will be held on December 10, 2014 at '
            '10:30 a.m., 22 Reade Street, 2ndfloor conference room, Borough '
            'of Manhattan, ',
 'matters': [{'matter': ' in the matter of a lease for the City of New '
                        'York, as tenant, of approximately 16,334 rentable '
                        'square feet of space on the entire fifth (5th) '
                        'floor of the building located at 118-35 Queens '
                        'Boulevard (Block 2270, Lot 41) in the Borough of '
                        'Queens for a Computerized Testing and '
                        'Applications Center, or for such other use as the '
                        'Commissioner of the Department of Citywide '
                        'Administrative Services m

In [14]:
err_summary = '<h1>{} Errors parsing Citywide Administrative Services::Public Hearings'.format(len(errors))
display(HTML(err_summary))
for error in errors:
    pprint.pprint(error)
    print('\n\n')
    desc = error['desc']
    if isinstance(desc, str):
        display(HTML(error['desc']))
    display(HTML('<hr/>'))


{'DueDate': nan,
 'EndDate': '10/8/2014 0:00:00',
 'RequestID': 20140930111,
 'StartDate': '10/8/2014 0:00:00',
 'desc': '<p align=center"> &nbsp;</p> <p> <strong>NOTICE IS HEREBY GIVEN '
         'THAT A REAL PROPERTY ACQUISITIONS AND DISPOSITIONS PUBLIC '
         'HEARING</strong>',
 'error': {'context': '',
           'error': 'no transition out of NOTICE',
           'matters': []}}





{'DueDate': nan,
 'EndDate': '10/10/2014 0:00:00',
 'RequestID': 20141007108,
 'StartDate': '10/10/2014 0:00:00',
 'desc': '<p align=center"> <strong>Notice of Public Hearing</strong></p> '
         '<p> &nbsp;</p> <p> <strong>NOTICE IS HEREBY GIVEN THAT A REAL '
         'PROPERTY ACQUISITIONS AND DISPOSITIONS PUBLIC HEARING</strong>',
 'error': {'context': '',
           'error': 'no transition out of NOTICE',
           'matters': []}}





{'DueDate': '11/13/2014 10:00:00',
 'EndDate': '11/3/2014 0:00:00',
 'RequestID': 20141030107,
 'StartDate': '11/3/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': '11/13/2014 10:00:00',
 'EndDate': '11/3/2014 0:00:00',
 'RequestID': 20141031112,
 'StartDate': '11/3/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': '11/13/2014 10:00:00',
 'EndDate': '11/6/2014 0:00:00',
 'RequestID': 20141103110,
 'StartDate': '11/6/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': nan,
 'EndDate': '11/13/2014 0:00:00',
 'RequestID': 20141106101,
 'StartDate': '11/13/2014 0:00:00',
 'desc': '<p align=center"> &nbsp;</p> <p> <strong>NOTICE IS HEREBY GIVEN '
         'THAT A REAL PROPERTY ACQUISITIONS AND DISPOSITIONS PUBLIC '
         'HEARING</strong>',
 'error': {'context': '',
           'error': 'no transition out of NOTICE',
           'matters': []}}





{'DueDate': nan,
 'EndDate': '11/26/2014 0:00:00',
 'RequestID': 20141119103,
 'StartDate': '11/26/2014 0:00:00',
 'desc': '<p align=center"> &nbsp;</p> <p> <strong>NOTICE IS HEREBY GIVEN '
         'THAT A REAL PROPERTY ACQUISITIONS AND DISPOSITIONS PUBLIC '
         'HEARING</strong>',
 'error': {'context': '',
           'error': 'no transition out of NOTICE',
           'matters': []}}





{'DueDate': nan,
 'EndDate': '11/28/2014 0:00:00',
 'RequestID': 20141120105,
 'StartDate': '11/28/2014 0:00:00',
 'desc': '<p align=center"> &nbsp;</p> <p> <strong>NOTICE IS HEREBY GIVEN '
         'THAT A REAL PROPERTY ACQUISITIONS AND DISPOSITIONS PUBLIC '
         'HEARING</strong>',
 'error': {'context': '',
           'error': 'no transition out of NOTICE',
           'matters': []}}





{'DueDate': '12/11/2014 10:00:00',
 'EndDate': '11/28/2014 0:00:00',
 'RequestID': 20141120108,
 'StartDate': '11/28/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': '12/11/2014 10:00:00',
 'EndDate': '12/1/2014 0:00:00',
 'RequestID': 20141128101,
 'StartDate': '12/1/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': nan,
 'EndDate': '12/8/2014 0:00:00',
 'RequestID': 20141128107,
 'StartDate': '12/8/2014 0:00:00',
 'desc': '<p align=center"> <strong>Notice of Public Hearing</strong></p> '
         '<p> &nbsp;</p> <p> <strong>NOTICE IS HEREBY GIVEN THAT A REAL '
         'PROPERTY ACQUISITIONS AND DISPOSITIONS PUBLIC HEARING</strong>',
 'error': {'context': '',
           'error': 'no transition out of NOTICE',
           'matters': []}}





{'DueDate': nan,
 'EndDate': '12/4/2014 0:00:00',
 'RequestID': 20141201104,
 'StartDate': '12/4/2014 0:00:00',
 'desc': '<p align=center"> <strong>Notice of Public Hearing</strong></p> '
         '<p> &nbsp;</p> <p> <strong>NOTICE IS HEREBY GIVEN THAT A REAL '
         'PROPERTY ACQUISITIONS AND DISPOSITIONS PUBLIC HEARING</strong>',
 'error': {'context': '',
           'error': 'no transition out of NOTICE',
           'matters': []}}





{'DueDate': nan,
 'EndDate': '12/8/2014 0:00:00',
 'RequestID': 20141201109,
 'StartDate': '12/8/2014 0:00:00',
 'desc': '<p align=center"> <strong>Notice of Public Hearing</strong></p> '
         '<p> &nbsp;</p> <p> <strong>NOTICE IS HEREBY GIVEN THAT A REAL '
         'PROPERTY ACQUISITIONS AND DISPOSITIONS PUBLIC HEARING</strong>',
 'error': {'context': '',
           'error': 'no transition out of NOTICE',
           'matters': []}}





{'DueDate': '12/11/2014 10:00:00',
 'EndDate': '12/4/2014 0:00:00',
 'RequestID': 20141203109,
 'StartDate': '12/4/2014 0:00:00',
 'desc': nan,
 'error': {'error': 'source is not a string: nan'}}





{'DueDate': nan,
 'EndDate': '12/26/2014 0:00:00',
 'RequestID': 20141219102,
 'StartDate': '12/26/2014 0:00:00',
 'desc': '<p align=center"> <strong>Notice of Public Hearing</strong></p> '
         '<p> &nbsp;</p> <p> <strong>NOTICE IS HEREBY GIVEN THAT A REAL '
         'PROPERTY ACQUISITIONS AND DISPOSITIONS PUBLIC HEARING</strong>',
 'error': {'context': '',
           'error': 'no transition out of NOTICE',
           'matters': []}}





{'DueDate': nan,
 'EndDate': '12/29/2014 0:00:00',
 'RequestID': 20141219106,
 'StartDate': '12/29/2014 0:00:00',
 'desc': '<p align=center"> &nbsp;</p> <p> <strong>NOTICE IS HEREBY GIVEN '
         'THAT A REAL PROPERTY ACQUISITIONS AND DISPOSITIONS PUBLIC '
         'HEARING</strong>',
 'error': {'context': '',
           'error': 'no transition out of NOTICE',
           'matters': []}}





{'DueDate': nan,
 'EndDate': '12/31/2014 0:00:00',
 'RequestID': 20141223103,
 'StartDate': '12/31/2014 0:00:00',
 'desc': '<p align=center"> <strong>Corrected Notice of Public '
         'Hearing</strong></p> <p> &nbsp;</p> <p> <strong>NOTICE IS HEREBY '
         'GIVEN THAT A REAL PROPERTY ACQUISITIONS AND DISPOSITIONS PUBLIC '
         'HEARING</strong>',
 'error': {'context': '',
           'error': 'no transition out of NOTICE',
           'matters': []}}



