In [31]:
import re
import pandas as pd
import json
from bs4 import BeautifulSoup as Soup
import pprint

#pd.options.display.max_rows=100

In [32]:
fn = 'procPublicationRequest Oct-Dec 2014 (Updated) - Sheet1-2.csv'
fixed = pd.read_csv(fn, header=0)

In [33]:
agencyNames = sorted(list(fixed['AgencyName'].unique()))
fixed['AgencyName'].value_counts().head(10)

Mayor's Office of Contract Services       67
Community Boards                          36
Citywide Administrative Services          35
Landmarks Preservation Commission         18
Housing Preservation and Development      18
Human Resources Administration            17
Transportation                            16
Health and Mental Hygiene                 15
Administration for Children's Services    14
City Planning                             13
dtype: int64

In [34]:
noticeDescription = pd.unique(fixed.TypeOfNoticeDescription.ravel())
noticeDescription

array(['Notice', 'Meeting', 'Public Hearings', 'Sale'], dtype=object)

# Parse Notices

In [35]:
mocs = fixed['AgencyName'] == "Mayor's Office of Contract Services" 
meets = fixed['TypeOfNoticeDescription'] == "Meeting"
notices = fixed['TypeOfNoticeDescription'] == "Notice"

In [36]:
fixed[mocs].TypeOfNoticeDescription.value_counts()

Notice     64
Meeting     3
dtype: int64

In [37]:
fixed[mocs].TypeOfNoticeDescription.value_counts().keys()

Index(['Notice', 'Meeting'], dtype='object')

In [39]:
cols = ['RequestID', 'AgencyCode', 'AgencyName',  'TypeOfNoticeCode', 'TypeOfNoticeDescription', 'ShortTitle', 'SectionID', 'SectionName', 'ConfirmationNumber', 'AdditionalDescription']
fixed = fixed[cols]
fixed = fixed[mocs]
ad_notices = fixed[notices]
ad_meetings = fixed[meets]
ad_notices



Unnamed: 0,RequestID,AgencyCode,AgencyName,TypeOfNoticeCode,TypeOfNoticeDescription,ShortTitle,SectionID,SectionName,ConfirmationNumber,AdditionalDescription
47,20140930106,OCS,Mayor's Office of Contract Services,13,Notice,DOITT extension,5,Special Materials,20140930106,<p> <u>Notice of Intent to Extend Contract(s) ...
55,20141001102,OCS,Mayor's Office of Contract Services,13,Notice,LL 63 Posting,5,Special Materials,20141001102,<p> <u>Notice of Intent to Issue New Solicitat...
60,20141001124,OCS,Mayor's Office of Contract Services,13,Notice,LL63--DDC Amendment Extensions,5,Special Materials,20141001124,<p> <u>Notice of Intent to Extend Contract(s) ...
61,20141001125,OCS,Mayor's Office of Contract Services,13,Notice,LL63-- DDC New Procurements,5,Special Materials,20141001125,<p> <u>Notice of Intent to Issue New Solicitat...
67,20141002106,OCS,Mayor's Office of Contract Services,13,Notice,LL63-- DDC New Procurement,5,Special Materials,20141002106,<p> <u>Notice of Intent to Issue New Solicitat...
68,20141002107,OCS,Mayor's Office of Contract Services,13,Notice,LL63 -- OMB New Procurement,5,Special Materials,20141002107,<p> <u>Notice of Intent to Issue New Solicitat...
69,20141002108,OCS,Mayor's Office of Contract Services,13,Notice,LL63 - Department of Parks and Recreation- New...,5,Special Materials,20141002108,<p> <u>Notice of Intent to Issue New Solicitat...
73,20141003102,OCS,Mayor's Office of Contract Services,13,Notice,LL 63 Posting,5,Special Materials,20141003102,<p> <u>Notice of Intent to Extend Contract(s) ...
75,20141003105,OCS,Mayor's Office of Contract Services,13,Notice,LL 63 Posting,5,Special Materials,20141003105,<p> <u>Notice of Intent to Issue New Solicitat...
77,20141003107,OCS,Mayor's Office of Contract Services,13,Notice,LL63 -Human Resources Administration- New Cont...,5,Special Materials,20141003107,<p> <u>Notice of Intent to Issue New Solicitat...


In [40]:
def parse_notice(soup):
    ''' A record starts with: 
    p -> u -> "text"             # Underlined text
    p -> NOTICE is HEREBY ...    # Preamble
    
    One or more notice contents, where each content
    begins with 'Agency:'
    
    Agency: blah blah blah            # Begining of an ad
    key1: single line value
    key2: this is a multi-line        # Build value string until
          value                       #  another another key is detected
    key3: another single line value   #  and then add previ key:value pair to record
    
    Agency: ...                       # Signals previous record ends
    
    '''
    record = { 'adverts' : [] }
    ad = {}
    key, val = None, None
    for para in soup.find_all('p'):
        if para.find_all('u'):
            record['context'] = para.get_text(strip=True).strip() 
        else:
            txt = para.get_text(strip=True)
            if txt.strip() == '':
                if key:
                    val += txt
            elif 'NOTICE IS HEREBY' in txt:
                ad['preamble'] = txt.strip()
                
            elif ':' in txt:
                k, v = txt.split(':', 1)
                k, v = k.strip(), v.strip()
                
                if key:
                    ad[key] = val
                else:
                    key, val = k, v
                    
                if k.lower().startswith('agency'):
                    # new record
                    if ad.keys():
                        record['adverts'].append(ad)
                        ad = {}
                key, val = k, v
                    
    if ad.keys():
        ad[key] = val
        record['adverts'].append(ad)
    return record

In [49]:
from IPython.display import HTML, display
output = []
baddata = 0
for doc in ad_notices[['AdditionalDescription']].values:
    if not isinstance(doc[0], str):
        baddata+=1
        continue
    soup = Soup(doc[0])
#     print(soup.prettify())
#     display(HTML(doc[0]))
    record = parse_notice(soup)
    output.append(record)
#     pprint.pprint(record)    
#     display(HTML('<hr />'))
print ('{} rows of adverts. {} duds'.format(len(output), baddata))
    

62 rows of adverts. 2 duds


In [46]:
pprint.pprint(output)

[{'adverts': [{'preamble': 'NOTICE IS HEREBY GIVENthat the Mayor will be '
                           'entering into the following extension(s) of '
                           '(a) contract(s) not included in the FY 2015 '
                           'Annual Contracting Plan and Schedule that is '
                           'published pursuant to New York City Charter § '
                           '312(a):'},
              {'Agency': 'Department of Information Technology & '
                         'Telecommunications',
               'Award method of original contract': 'Intergovernmental',
               'Description of services': 'Design, development and '
                                          'deployment of application '
                                          'enhancements and extensions to '
                                          'the existing APT system along '
                                          'with the appropriate '
                                          '

#Parse Meetings (todo)

In [48]:
output = []
baddata = 0
for doc in ad_meetings[['AdditionalDescription']].values:
    if not isinstance(doc[0], str):
        baddata+=1
        continue
    soup = Soup(doc[0])
    print(soup.prettify())
    display(HTML(doc[0]))
#     record = parse_html(soup)
#     output.append(record)
#     pprint.pprint(record)
    
#     display(HTML('<hr />'))
# print ('{} rows of adverts. {} duds'.format(len(output), baddata))


<p>
</p>
<p>
 <strong>
  PUBLIC NOTICE IS HEREBY GIVEN THAT
 </strong>
 the
</p>
<p>
 Franchise and Concession Review Committee will hold a
</p>
<p>
 Public Meeting on Wednesday, October 8, 2014 at 2:30 p.m.,
</p>
<p>
 at 22 Reade Street, Spector Hall, Borough of Manhattan.
</p>
<p>
</p>
<p>
 NOTE: Individuals requesting Sign Language Interpreters
</p>
<p>
 should contact the Mayor's Office of Contracts Services,
</p>
<p>
 Public Hearings Unit, 253 Broadway, 9th Floor,
</p>
<p>
 New York, NY 10007 (212) 788-7490, no later
</p>
<p>
 than SEVEN (7) BUSINESS DAYS PRIOR TO THE PUBLIC MEETING. TDD
</p>
<p>
 users should call Verizon relay service.
</p>


<p align='center"'>
</p>
<p>
 <strong>
  PUBLIC NOTICE IS HEREBY GIVEN THAT
 </strong>
 the
</p>
<p>
 Franchise and Concession Review Committee will hold a
</p>
<p>
 Public Meeting on Wednesday
</p>


<p>
</p>
<p>
</p>
<p>
 <strong>
  PUBLIC NOTICE IS HEREBY GIVEN THAT
 </strong>
 the
</p>
<p>
 Franchise and Concession Review Committee will hold a
</p>
<p>
 Public Meeting on Wednesday, December 10, 2014 at 2:30 p.m.,
</p>
<p>
 at 22 Reade Street, Spector Hall, Borough of Manhattan.
</p>
<p>
</p>
<p>
 NOTE: Individuals requesting Sign Language Interpreters
</p>
<p>
 should contact the Mayor's Office of Contracts Services,
</p>
<p>
 Public Hearings Unit, 253 Broadway, 9th Floor,
</p>
<p>
 New York, NY 10007 (212) 788-7490, no later
</p>
<p>
 than SEVEN (7) BUSINESS DAYS PRIOR TO THE PUBLIC MEETING. TDD
</p>
<p>
 users should call Verizon relay service.
</p>
