In [1]:
import re
import pandas as pd

In [2]:
# Publication-request export for Oct-Dec 2014; row 0 of the file supplies
# the column names.
fn = 'procPublicationRequest Oct-Dec 2014 (Updated) - Sheet1-2.csv'
t1 = pd.read_csv(filepath_or_buffer=fn, header=0)

In [3]:
from tidylib import tidy_document
import html2text

In [4]:
def cleanup(txt):
    """Turn an HTML fragment into lower-cased plain text.

    The value is stringified, run through tidylib's ``tidy_document`` and
    then through ``html2text.html2text``. Every ``*`` and ``_`` character is
    removed from the result (presumably to drop Markdown emphasis markers),
    and the text is lower-cased and stripped of surrounding whitespace.

    Parameters
    ----------
    txt : object
        The raw HTML (anything accepted by ``str``), or a missing value.

    Returns
    -------
    str
        The cleaned text; ``''`` when ``txt`` is ``None`` or a float NaN.
    """
    # A missing cell (None, or the float NaN pandas yields for an empty CSV
    # field) would otherwise be stringified and scraped as the literal word
    # "none"/"nan". `x != x` is true only for NaN.
    if txt is None or (isinstance(txt, float) and txt != txt):
        return ''
    # tidy_document also returns an error report, which is not used here.
    doc, _errors = tidy_document(str(txt))
    doc = html2text.html2text(doc)
    return doc.replace('*','').replace('_','').lower().strip()
    
def add_clean_text(row):
    """Store the cleaned ``AdditionalDescription`` of ``row`` under 'scrape'.

    The row is modified in place and the same object is returned, so the
    function can be used with ``DataFrame.apply(..., axis=1)``.
    """
    cleaned = cleanup(row.AdditionalDescription)
    row['scrape'] = cleaned
    return row

In [5]:
# Clean every description column-wise instead of with a row-wise
# `apply(axis=1)` whose callback mutates the row it is handed (pandas
# documents mutating inside a UDF as unsupported, and building a Series per
# row is slow). `assign` returns a new frame with 'scrape' appended as the
# last column and leaves `t1` untouched.
t1_fix = t1.assign(scrape=t1['AdditionalDescription'].map(cleanup))

In [6]:
# Narrow the frame to the identifying columns plus the cleaned ad text
cols = [
    'RequestID',
    'ConfirmationNumber',
    'AgencyCode',
    'AgencyName',
    'AgencyDivision',
    'SectionID',
    'SectionName',
    'scrape',
]
fixed = t1_fix.loc[:, cols]

# The ten agencies with the most ads
fixed['AgencyName'].value_counts().head(10)

Mayor's Office of Contract Services       67
Community Boards                          36
Citywide Administrative Services          35
Housing Preservation and Development      18
Landmarks Preservation Commission         18
Human Resources Administration            17
Transportation                            16
Health and Mental Hygiene                 15
Administration for Children's Services    14
City Planning                             13
dtype: int64

In [27]:
# Focus on the Mayor's Office of Contract Services, because its ads seem to
# follow a fixed format. Can a regex be useful?
#
# Example entry:
#     public notice is hereby given that the
#      franchise and concession review committee will hold a
#      public meeting on wednesday, october 8, 2014 at 2:30 p.m.,
#       at 22 reade street, spector hall, borough of manhattan.
#

# Boolean row mask, then rows and columns picked in a single .loc call
mocs = fixed['AgencyName'] == "Mayor's Office of Contract Services"
mocs_ads = fixed.loc[mocs, ['RequestID', 'scrape']]

# build a regex
# Targets text shaped like "on wednesday, october 8, 2014 at 2:30 p.m.".
# The pieces are raw strings so that \s, \d, \w and \. reach the regex engine
# as written; in a normal string literal they are invalid escape sequences
# (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
# The pattern text and flags are unchanged, including the unnamed `(at)?`
# group, so findall() still yields 8-tuples.
rex_time = re.compile(
    r'on\s+(?P<day>[^\s,]+)[\s,]\s+'        # weekday name
    r'(?P<month>[^\s,]+)\s+'                # month name
    r'(?P<date>\d+)[,\s]+'                  # day of month
    r'(?P<year>\d+)\s+'                     # year
    r'(at)?\s*'                             # optional "at" (captured)
    r'(?P<hour>\d+):(?P<minute>\d+)\s+'     # h:mm
    r'(?P<tod>\w+\.?\w+\.?)',               # time of day, e.g. "p.m."
    re.IGNORECASE|re.DOTALL|re.MULTILINE,
)



In [43]:
# Run the date/time regex over every MOCS ad. When nothing is extracted the
# normalised ad text is printed instead, so the misses can be read by eye.
#
moc_ads = mocs_ads['scrape'].values
for ad in moc_ads:
    # squeeze runs of spaces first, then runs of three or more newlines
    txt = re.sub('\n{3,}', '\n', re.sub(' +', ' ', ad))
    # extract ... (findall gives an empty, falsy list when there is no hit)
    match = rex_time.findall(txt)
    print('Extracted: {}'.format(match) if match else txt)
    print('--------\n\n')


Extracted: [('wednesday', 'october', '8', '2014', 'at', '2', '30', 'p.m.')]
--------


public notice is hereby given that the 

 franchise and concession review committee will hold a 

 public meeting on wednesday
--------


Extracted: [('wednesday', 'december', '10', '2014', 'at', '2', '30', 'p.m.')]
--------


notice of intent to extend contract(s) not included in fy 2015 annual
contracting plan and schedule
notice is hereby given that the mayor will be entering into the following extension(s) of (a) contract(s) not included in the fy 2015 annual contracting plan and schedule that is published pursuant to new york city charter § 312(a): 
agency: department of information technology & telecommunications

vendor: accenture llp

description of services: design, development and deployment of application
enhancements and extensions to the existing apt system along with the
appropriate documentation required.

award method of original contract: intergovernmental

fms contract type: consult