In [1]:
import os, re
import numpy as np
import pandas as pd
import dateutil.parser as dparser

In [2]:
# Gridley-specific header info
all_section_headers = [
    'CALL TO ORDER',
    'ROLL CALL',
    'PLEDGE OF ALLEGIANCE',
    'INVOCATION',
    'PROCLAMATIONS',
    'COMMUNITY PARTICIPATION FORUM',
    'CONSENT AGENDA',    
    'ANNOUNCEMENT OF NEW EMPLOYEES AND PROMOTIONS',
    'NEW AND PROMOTED EMPLOYEES',
    'ANNOUNCEMENT OF NEW AND PROMOTED EMPLOYEES',    
    'INTRODUCTION OF NEW OR PROMOTED EMPLOYEES',
    'NEW AND PROMOTED EMPLOYEES'
    'PUBLIC HEARING',
    'ITEMS FOR COUNCIL CONSIDERATION',
    'CITY STAFF AND COUNCIL COMMITTEE REPORTS',
    'POTENTIAL FUTURE CITY COUNCIL ITEMS',
    'CLOSED SESSION',
    'ADJOURNMENT',
    'NOTE 1',
    'NOTE 2'
]
mtg_vars = [
    'MTG_TYPE',
    'MTG_DATETIME',
    'MTG_LOCATION',
    'DOC_NUM_PAGES',
]
footer_start = 'GRIDLEY CITY COUNCIL AGENDA'
head_suffixes = [
    '\s+[\x2d\u2013\u2014]',
    ':',
    '',
]
strip_patterns = [
    '\n',
    '[0-9]\.',
    'Brief updates from City staff and brief reports on conferences, seminars, and meetings attended by the Mayor and City Council members, if any.',
    '\(Appearing on the Agenda within 30-90 days\):',
]
nan_values = [
    'None',
    ''
]

In [3]:
# get all Gridley agendas

data_dir =  '../data/docs/gridley/'
required_substrings = ['Gridley', 'Agenda', 'City-Council', '.txt']

matches_substrings = lambda s: all([ss in s for ss in required_substrings])
doc_list = [s for s in os.listdir(data_dir) if matches_substrings(s)]
doc_paths = [os.path.join(data_dir, doc) for doc in doc_list]

In [4]:
def parse_txt(doc):

    # remove inserted characters
    doc = re.sub('\(cid:[0-9]\)','', doc)
    
    # the number of pages is the last character on the first page
    page_break_re = re.compile('\f')
    page_breaks = [m.start() for m in page_break_re.finditer(doc)]
    page_count_re = re.compile('Page\s[\d]\sof\s[\d]')
    n0, n1 = zip(*[[int(d) for d in s[5:].split(' of ')] for s in page_count_re.findall(doc)])
    num_pages = n0[np.where(np.array(n0)==np.array(n1))[0][0]]
    
    # trim extraneous pages and remove page footers
    agenda = doc[:page_breaks[num_pages-1]+1]
#     agenda = re.sub(footer_start + '[^()]*\x0c', '', agenda)
#     agenda = re.sub('Page\s[\d]\sof\s[\d][\s\n\t]*\x0c', '', agenda)    
    agenda = re.sub(footer_start +'(.*\n?)', '', agenda)
    agenda = re.sub('Page\s[\d]\sof\s[\d]', '', agenda)
    agenda = re.sub('\f', '', agenda)
    
    # get section breaks in document
    all_section_starts = [agenda.find(h) for h in all_section_headers]
    section_heads, section_starts = zip(*[(k,t) for k, t in zip(all_section_headers, all_section_starts) if t>0])
    
    # sort sections
    sort_idx = np.argsort(section_starts)
    headers = list(np.array(section_heads)[sort_idx])
    breaks = list(np.array(section_starts)[sort_idx]) + [len(agenda)]    
    
    # get section ranges
    section_ranges = [('HEADER', 0, breaks[0])]
    section_ranges.extend([(headers[i], breaks[i], breaks[i+1]) for i in range(len(headers))])
    
    # store section

    sections = {}
    for head, start, end in section_ranges:
        section = agenda[start:end]
        if head=='HEADER':
            header = section.split('\n')
            header = [h for h in header if h.strip()]
        else:
            for s in head_suffixes:                
                section = re.sub(head+s, '', section)
            for s in strip_patterns:                
                section = re.sub(s, '', section)
            section = section.strip()
            sections[head] = section
        
        for v in mtg_vars:
            if v=='MTG_TYPE':
                for s in ['regular', 'special', 'amended']:
                    if s in header[0].lower():
                        sections[v] = s 
            elif v=='MTG_DATETIME':
                sections[v] = dparser.parse(header[1])
            elif v=='MTG_LOCATION':            
                sections[v] = header[2]
            elif v=='DOC_NUM_PAGES':
                sections[v] = num_pages
        sections['HEADER'] = header
        
    return sections

In [5]:
docs = []
segmented_docs = []
for path in doc_paths:
    with open(path, 'r') as f:
        doc = f.read()
    docs.append(doc)
    sections = parse_txt(doc)
    segmented_docs.append(sections)

In [6]:
data = pd.DataFrame(segmented_docs, columns=mtg_vars + all_section_headers)
for s in nan_values:
    data[data==s] = np.nan
data = data.sort_values('MTG_DATETIME', ascending=False).reset_index(drop=False, )

  result = method(y)


In [7]:
data

Unnamed: 0,index,MTG_TYPE,MTG_DATETIME,MTG_LOCATION,DOC_NUM_PAGES,CALL TO ORDER,ROLL CALL,PLEDGE OF ALLEGIANCE,INVOCATION,PROCLAMATIONS,...,ANNOUNCEMENT OF NEW AND PROMOTED EMPLOYEES,INTRODUCTION OF NEW OR PROMOTED EMPLOYEES,NEW AND PROMOTED EMPLOYEESPUBLIC HEARING,ITEMS FOR COUNCIL CONSIDERATION,CITY STAFF AND COUNCIL COMMITTEE REPORTS,POTENTIAL FUTURE CITY COUNCIL ITEMS,CLOSED SESSION,ADJOURNMENT,NOTE 1,NOTE 2
0,21,regular,2019-10-07 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",3,Mayor Johnson,Recording Secretary,Council member Borges,"Bishop Joshua McLean, Church of Jesus Christ, ...",• Appreciation and Support of the Gridley Are...,...,,,,Acceptance of US Department of Homeland Securi...,Police Department Digital Radio System Potenti...,,11/4/2019 11/18/2019 11/18/2019 Conference wi...,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
1,0,regular,2019-09-16 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Johnson,Recording Secretary,Mayor Johnson,,,...,,,,Program to Manage Unclaimed Cats,,Environmental Document Approval and Adoption –...,Conference with Labor Negotiators pursuant to ...,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
2,47,amended,2019-09-03 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Johnson,Recording Secretary,Councilmember Torres,"Pastor Bill Hammond, Lighthouse Tabernacle",,...,,,,Police Vehicle PurchaseConsideration and Appro...,,,Animal Control Program Revisions Police Depart...,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
3,20,regular,2019-08-19 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Johnson,Recording Secretary,Councilmember Crye,,,...,,,,Approval of Butte Subbasin Cooperation Agreeme...,,Potential Cessation of Net Metering Program Po...,Conference with Labor Negotiators pursuant to ...,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS — In accordance ...
4,24,regular,2019-08-05 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Johnson,Recording Secretary,Mayor Johnson,,,...,,• New Police Officer Ruben Quihuiz,,Approval of Letter to Butte County Board of Su...,Potential Cessation of Net Metering Program Po...,,8/19/2019 9/3/2019 Conference with Labor Nego...,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
5,42,regular,2019-07-01 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Johnson,Recording Secretary,Vice Mayor Williams,"Pastor Brad Roberts, Calvary Chapel of Gridley",,...,,,,Interviews of Candidates to fill Planning Comm...,,8/5/2019 8/5/2019 8/5/2019 8/19/2019 Seating o...,,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
6,3,regular,2019-06-17 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Johnson,Recording Secretary,Councilmember Borges,"Pastor Brad Roberts, Calvary Chapel of Gridley",,...,,,,Purchase and Installation of new Vierra Park P...,,Interview Planning Commissioners (Special Meet...,Conference with Legal Counsel concerning exist...,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
7,13,regular,2019-06-03 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Johnson,Recording Secretary,Councilmember Zachary Torres,"Bishop Pono Nako, Church of Jesus Christ, Latt...",,...,,,,City Council acceptance of Resolution No. 2019...,,Adoption of Budget Approval of New Playground ...,,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
8,14,amended,2019-05-20 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Johnson,Recording Secretary,Mayor Bruce Johnson,"Pastor Branden Heskett, Christian Life Church ...",None *,...,,,,Approval of New Playground Equipment for Vierr...,,,Budget presentation Adoption of Budget Potenti...,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
9,19,regular,2019-05-06 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Johnson,Recording Secretary,Vice Mayor Williams,Pastor Ed Lucas of the Gridley Christian Church,,...,,,,Council Appointment Process of Partial Term Co...,• FEMA Update by Administrator Eckert,,Approval of new playground equipment for Vierr...,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...


In [13]:
for i, r in data.iterrows():
    print(i, r['ITEMS FOR COUNCIL CONSIDERATION'])
    print()

0 Acceptance of US Department of Homeland Security SAFER Grant Award Valley Oaks Estates, Tentative Subdivision Map 1-19; Applicant is proposing a 21-parcel single-family residential subdivision on three parcels totaling ±7 acres, APN: 022-230-022, -024 and -02  Determine the project is Categorically Exempt per the California Environmental Quality Act, Section 15332 (a-e), Class 32, Infill Development Projects. •  Adopt Resolution Number 2019-R-025:  A Resolution Approving Tentative Subdivision Map No. 1-19 to Subdivide Three Parcels Consisting of Approximately 7 Acres into Twenty-One (21) Parcels Consisting of One 25 Acre Parcel for a Detention Basin and Twenty (20) Parcels for a Residential Housing Development Located at the Northeast Corner of Peach Street and West Biggs Gridley Road in the Single Family Residential District (R-1)  and Residential, Low Density (RLD) General Plan Land Use Designation  (APN: 022-230-022, -024 & -025)  •  Adopt Resolution Number 2019-R-026:  A Resoluti

In [10]:
hex(ord('•'))

'0x2022'

In [9]:
for i, r in data.iterrows():
    print(i, r['CITY STAFF AND COUNCIL COMMITTEE REPORTS'])

0 Police Department Digital Radio System Potential Cessation of Net Metering Program Temporary Residential Housing Incentive Program #2
1 nan
2 nan
3 nan
4 Potential Cessation of Net Metering Program Police Department Digital radio system
5 nan
6 nan
7 nan
8 nan
9 •  FEMA Update by Administrator Eckert
10 Oral Update of FEMA and Gridley Camp Fire Community Expanded Use of Eagle Meadows Park for Organized Sports CAL Fire Contract to add one Additional Firefighter Potential Cessation of Net Metering Program
11 nan
12 nan
13 Approval of FY 17/18 Audit Fire Vehicle Purchase City Council selection of a new Councilmember Midyear Budget Update Swearing-in of new City Councilmember Overview of Council Review Process for Budget and CIP
14 nan
15 nan
16 Council Committee Appointments Fire Vehicle Purchase Update on Finance Software Selection Process Approval of Selection Process for Police Chief Approval of FY 17/18 Audit
17 nan
18 nan
19 CITY ADMINISTRATOR UPDATE REGARDING EVACUATION CENTER
20 

In [21]:

agenda = 'GRIDLEY CITY COUNCIL AGENDA: Regular Meeting of 3-18-19                      Page 1 of 2'
agenda = re.sub(footer_start + '.*Page\s[\d]\sof\s[\d]', '', agenda)

In [22]:
agenda

''

In [12]:
re.sub('\(cid:[0-9]\)','', docs[9][:3000])

'Gridley City Council — Regular City Council Meeting Agenda \n\nMonday, July 17, 2017; 6:00 pm \n\nGridley City Hall, 685 Kentucky Street, Gridley, CA 95948 \n\n"Our purpose is to continuously enhance our community\'s vitality and overall quality of life. We \n\nare committed to providing high quality, cost-effective municipal services and forming \n\nproductive partnerships with our residents and regional organizations. We collectively develop, \n\nshare, and are guided by a clear vision, values, and meaningful objectives." \n\nCALL TO ORDER - Mayor Hall \nROLL CALL - Recording Secretary \nPLEDGE OF ALLEGIANCE — Councilmember Williams \nINVOCATION — TBD \nPROCLAMATIONS - None \nCOMMUNITY PARTICIPATION FORUM - Members of the public may address the City \nCouncil on matters not listed on the agenda. The City Council may not discuss nor take action on \nany community participation item brought forward by a member of the community. Comments are \nrequested to be limited to three (3) minut

In [38]:
agenda = 'GRIDLEY CITY COUNCIL AGENDA: Regular Meeting of 7-1 7-1 7 \n\nPage 1 of 3'
agenda = re.sub(footer_start +'(.*\n?)', '', agenda)
agenda

'\nPage 1 of 3'