![State diagram](http://yuml.me/c961b9c3)

# States
* INIT
  * reads underlined text
  * extracts context
    * set next state = PREAMBLE
    
* PREAMBLE
  * reads PREAMBLE (NOTICE IS HEREBY...)
  * extracts preamble
    * set next state = PROCESS_ADVERTS

* PROCESS_ADVERTS
  * reads first key and first chunk of value
    * set next state = PROCESS_ADVERT
    
* PROCESS_ADVERT
  * reads the next chunk and evaluates it:
    ```
    if ':' in chunk then 
       store current key/value
       set current key, and initial value
    else
       append chunk to current-value
    ```



In [351]:
from bs4 import BeautifulSoup as Soup
from IPython.display import HTML
import pandas as pd
import re
import pprint

In [352]:
class State:
    """Base class for the states of the notice-parsing state machine.

    Subclasses override ``run`` (trace output) and ``next`` (consume one
    chunk of input and return the state to use for the following chunk).
    States that accumulate data also override ``flush``.
    """

    def run(self):
        """Print a trace statement for this state; subclasses must override."""
        # Raise explicitly rather than ``assert 0``: assertions are removed
        # under ``python -O``, which would turn this into a silent no-op.
        raise NotImplementedError("implement me")

    def next(self, input, record=None):
        """Consume one chunk of input and return the next state.

        Subclasses must override.  ``record`` is optional here only so the
        base signature also accepts the two-argument ``next(para, record)``
        form; the original one-argument call keeps working.
        """
        raise NotImplementedError("implement me")

    def flush(self, record):
        """Write any pending data into ``record``.

        The base implementation always raises, because a state that does
        not override ``flush`` has nothing to write.

        Raises:
            RuntimeError: always; the message names the offending state.
        """
        raise RuntimeError(
            'flush() called on state %s, which has no pending data to write'
            % type(self).__name__
        )



In [353]:
class Adverts(State):
    """State that collects advertisements as dicts of key/value pairs.

    A paragraph whose text contains ':' starts a new key/value pair, split
    at the first ':'.  A paragraph without ':' is a continuation chunk of
    the current value.  A key starting with 'agency' (case-insensitive)
    marks the start of a new advertisement; the previous advertisement, if
    any, is appended to ``record['adverts']``.
    """

    def __init__(self):
        super().__init__()

        # Current key and value.  A value can span multiple
        # lines/paragraphs, so it is built up one chunk at a time and is
        # considered complete when the next key/value pair is detected.
        self.key = None
        self.value = None

        # Advertisement currently being assembled.
        self.advert = {}
        # Becomes True once the first 'agency' key has been seen.
        self.processingAdvert = False

    def run(self):
        """Print a trace statement identifying this state."""
        print('Process ADVERTS')

    def _store_pair(self):
        """Move the pending key/value pair into the current advertisement."""
        # No key yet means there is nothing to store.  Without this guard
        # the code either crashed on ``None.strip()`` or filed orphan text
        # under a ``None`` key.
        if self.key is None:
            return
        self.advert[self.key] = (self.value or '').strip()

    def _finish_advert(self, record):
        """Store the pending pair and append the advertisement to the record."""
        self._store_pair()
        if self.advert:
            record['adverts'].append(self.advert)
        self.advert = {}

    def flush(self, record):
        """Append the advertisement still being assembled to ``record['adverts']``.

        Nothing is appended when no key/value pair was ever read.
        """
        self._finish_advert(record)
        # Clear the pending pair so a repeated flush cannot re-add it.
        self.key = None
        self.value = None

    def next(self, para, record):
        """Consume one paragraph and return this state.

        Args:
            para: object providing ``get_text(strip=True)``.
            record: dict with an 'adverts' list that completed
                advertisements are appended to.
        """
        text = para.get_text(strip=True)

        if ':' not in text:
            # A value can span multiple paragraphs: keep adding chunks to
            # the current value until a new key/value pair is detected.
            if not text:
                return self  # empty paragraph, nothing to add
            if self.value:
                # Separate paragraphs with a space so the last word of one
                # is not fused with the first word of the next.
                self.value = self.value + ' ' + text
            else:
                self.value = text
            return self

        k, v = text.split(':', 1)
        k = k.strip()
        if k.lower().startswith('agency'):
            # The first 'agency' key only switches processingAdvert on;
            # every later one also closes the advertisement before it.
            if self.processingAdvert:
                self._finish_advert(record)
            self.processingAdvert = True
        else:
            # A new key/value pair is detected: store the current one.
            self._store_pair()

        # Initialize the current key/value pair.
        self.key, self.value = k, v
        return self
        

class Preamble(State):
    """State that looks for the paragraph containing 'NOTICE IS HEREBY'."""

    def run(self):
        """Print a trace statement identifying this state."""
        print('PREAMBLE')

    def next(self, para, record):
        """Consume one paragraph.

        Stays in this state until a paragraph whose text contains
        'NOTICE IS HEREBY' arrives; that text is stored under
        ``record['preamble']`` and a fresh ``Adverts`` state is returned.
        """
        content = para.get_text(strip=True)
        content = content.strip()
        if 'NOTICE IS HEREBY' not in content:
            return self
        record['preamble'] = content
        return Adverts()
            

class Init(State):
    """Initial state: looks for the first paragraph with underlined text."""

    def run(self):
        """Print a trace statement identifying this state."""
        print('INIT')

    def next(self, para, record):
        """Consume one paragraph.

        Stays in this state until a paragraph containing at least one
        ``<u>`` element arrives; that paragraph's text is stored under
        ``record['context']`` and a fresh ``Preamble`` state is returned.
        """
        underlined = para.find_all('u')
        if not underlined:
            return self
        record['context'] = para.get_text(strip=True).strip()
        return Preamble()

        

In [354]:
def parse_notice(soup):
    """Run the state machine over every ``<p>`` element of ``soup``.

    Starts in ``Init`` and hands each paragraph from ``soup.find_all('p')``
    to the current state's ``next`` together with the shared record.  The
    state left at the end is asked to ``flush`` into the record.

    Returns the record dict.  It starts out as ``{'adverts': []}``; any
    other keys are added by the states.
    """
    notice = {'adverts': []}
    current = Init()

    for paragraph in soup.find_all('p'):
        # To trace the machine, call current.run() and print(paragraph) here.
        current = current.next(paragraph, notice)

    current.flush(notice)
    return notice



In [355]:
# CSV file to load; its first line supplies the column names (header=0).
fn = 'procPublicationRequest Oct-Dec 2014 (Updated) - Sheet1-2.csv'
rows = pd.read_csv(filepath_or_buffer=fn, header=0)

In [356]:
# Column names used to subset the loaded rows, in display order.
cols = [
    'RequestID',
    'StartDate',
    'EndDate',
    'AgencyCode',
    'AgencyName',
    'AgencyDivision',
    'TypeOfNoticeCode',
    'TypeOfNoticeDescription',
    'ShortTitle',
    'SectionID',
    'SectionName',
    'DueDate',
    'ConfirmationNumber',
    'AdditionalDescription',
    'Address1',
]

In [357]:
# Keep only the columns listed in `cols`.  Note that this rebinds `rows`,
# so the full frame read from the CSV is no longer reachable by that name.
rows = rows[cols]

In [358]:
# Boolean masks over `rows`: one for the agency, one for the notice type.
mocs = rows['AgencyName'].eq("Mayor's Office of Contract Services")
# meets = rows['TypeOfNoticeDescription'] == "Meeting"
notices = rows['TypeOfNoticeDescription'].eq("Notice")
# Filtering `rows` by the masks is currently disabled:
# rows = rows[mocs]
# rows = rows[notices]

In [359]:
# from IPython.display import display
# badcount = 0
# for html in rows.AdditionalDescription.values:
#     if not isinstance(html, str):
#         badcount += 1
#     else:
#         display(HTML(html))
#         try:
#             parse_notice(Soup(html))
#         except Exception:
#             badcount += 1

In [360]:
import json
def scrape(row):
    """Parse one row's notice HTML and attach the result as a JSON string.

    Only rows whose 'AgencyName' is "Mayor's Office of Contract Services"
    and whose 'TypeOfNoticeDescription' is "Notice" are parsed.

    Args:
        row: one row of the requests table; read via ``row['AgencyName']``,
            ``row['TypeOfNoticeDescription']`` and
            ``row.AdditionalDescription``.

    Returns:
        A copy of ``row`` with an added 'output' entry holding the
        ``json.dumps`` of one of:
          * the record returned by ``parse_notice`` for a matching row,
          * ``{'error': 'bad input'}`` when parsing raised,
          * ``None`` (the string 'null') for rows that are not parsed.
    """
    is_moc_notice = (
        row['AgencyName'] == "Mayor's Office of Contract Services"
        and row['TypeOfNoticeDescription'] == "Notice"
    )

    output = None
    if is_moc_notice:
        try:
            # NOTE(review): no parser is named, so BeautifulSoup picks
            # whichever one is installed - confirm whether that is intended.
            output = parse_notice(Soup(row.AdditionalDescription))
        except Exception:
            # Best effort: a row that cannot be parsed is reported, not fatal.
            output = {'error': 'bad input'}

    # Work on a copy: pandas does not support apply() callbacks that
    # mutate the object they are given.
    result = row.copy()
    result['output'] = json.dumps(output)
    return result

In [361]:
# Run scrape on every row (axis=1 passes rows rather than columns).
fix = rows.apply(scrape, axis=1)


In [362]:
# Bare last expression: shows the scraped frame as this cell's output.
fix


Unnamed: 0,RequestID,StartDate,EndDate,AgencyCode,AgencyName,AgencyDivision,TypeOfNoticeCode,TypeOfNoticeDescription,ShortTitle,SectionID,SectionName,DueDate,ConfirmationNumber,AdditionalDescription,Address1,output
0,20130621104,1/2/2014 0:00:00,12/31/2014 0:00:00,56,Police,,13,Notice,The following listed property is in the custod...,3,Property Disposition,,20130621104,<p> <strong><strong><strong><strong><strong><s...,,
1,20131104111,9/29/2014 0:00:00,10/8/2014 0:00:00,OCS,Mayor's Office of Contract Services,,12,Meeting,FCRC Public Meeting Notice,1,Public Hearings and Meetings,10/8/2014 14:30:00,20131104111,<p> &nbsp;</p> <p> &nbsp;&nbsp;&nbsp;&nbsp;&nb...,,
2,20131104112,10/30/2014 0:00:00,11/12/2014 0:00:00,OCS,Mayor's Office of Contract Services,,12,Meeting,FCRC Public Meeting Notice,1,Public Hearings and Meetings,11/12/2014 14:30:00,20131104112,"<p align=center""> &nbsp;</p> <p> &nbsp;&nbsp;&...",,
3,20131104113,12/1/2014 0:00:00,12/10/2014 0:00:00,OCS,Mayor's Office of Contract Services,,12,Meeting,FCRC Public Meeting Notice,1,Public Hearings and Meetings,12/10/2014 14:30:00,20131104113,<p> &nbsp;</p> <p> &nbsp;</p> <p> &nbsp;&nbsp;...,,
4,20140109107,1/2/2014 0:00:00,12/31/2014 0:00:00,856,Citywide Administrative Services,Office of Citywide Procurement,13,Notice,The Department of Citywide Administrative Serv...,3,Property Disposition,,20140109107,<p> The Department of Citywide Administrative ...,,
5,20140623102,10/6/2014 0:00:00,10/6/2014 0:00:00,NBM,Board Meetings,,12,Meeting,BOARD MEETINGS,1,Public Hearings and Meetings,,20140623102,<p> <strong>City Planning Commission</strong><...,,
6,20140623103,10/20/2014 0:00:00,10/20/2014 0:00:00,NBM,Board Meetings,,12,Meeting,BOARD MEETINGS,1,Public Hearings and Meetings,,20140623103,<p> <strong>City Planning Commission</strong><...,,
7,20140623104,10/27/2014 0:00:00,10/27/2014 0:00:00,NBM,Board Meetings,,12,Meeting,BOARD MEETINGS,1,Public Hearings and Meetings,,20140623104,<p> <strong>City Planning Commission</strong><...,,
8,20140814108,11/3/2014 0:00:00,11/3/2014 0:00:00,NBM,Board Meetings,,12,Meeting,BOARD MEETINGS,1,Public Hearings and Meetings,,20140814108,<p> <strong>City Planning Commission</strong><...,,
9,20140814109,11/10/2014 0:00:00,11/10/2014 0:00:00,NBM,Board Meetings,,12,Meeting,BOARD MEETINGS,1,Public Hearings and Meetings,,20140814109,<p> <strong>City Planning Commission</strong><...,,
