In [1]:
import os, re
import numpy as np
import pandas as pd
import dateutil.parser as dparser

In [10]:
# Gridley-specific header info
all_section_headers = [
    'CALL TO ORDER',
    'ROLL CALL',
    'PLEDGE OF ALLEGIANCE',
    'INVOCATION',
    'PROCLAMATIONS',
    'COMMUNITY PARTICIPATION FORUM',
    'CONSENT AGENDA',    
    'ANNOUNCEMENT OF NEW EMPLOYEES AND PROMOTIONS',
    'NEW AND PROMOTED EMPLOYEES',
    'ANNOUNCEMENT OF NEW AND PROMOTED EMPLOYEES',    
    'INTRODUCTION OF NEW OR PROMOTED EMPLOYEES',
    'NEW AND PROMOTED EMPLOYEES',
    'PUBLIC HEARING',
    'ITEMS FOR COUNCIL CONSIDERATION',
    'CITY STAFF AND COUNCIL COMMITTEE REPORTS',
    'POTENTIAL FUTURE CITY COUNCIL ITEMS',
    'CLOSED SESSION',
    'ADJOURNMENT',
    'NOTE 1',
    'NOTE 2'
]
mtg_vars = [
    'MTG_TYPE',
    'MTG_DATETIME',
    'MTG_LOCATION',
    'DOC_NUM_PAGES',
]
footer_start = 'GRIDLEY CITY COUNCIL AGENDA'
head_suffixes = [
    '\s+[\x2d\u2013\u2014]',
    ':',
    '',
]
strip_patterns = [
    '\n',
    '[0-9]\.',
    'Brief updates from City staff and brief reports on conferences, seminars, and meetings attended by the Mayor and City Council members, if any.',
    '\(Appearing on the Agenda within 30-90 days\):',
]
nan_values = [
    'None',
    ''
]

In [11]:
# get all Gridley agendas

data_dir =  '../data/docs/gridley/'
required_substrings = ['Gridley', 'Agenda', 'City-Council', '.txt']

matches_substrings = lambda s: all([ss in s for ss in required_substrings])
doc_list = [s for s in os.listdir(data_dir) if matches_substrings(s)]
doc_paths = [os.path.join(data_dir, doc) for doc in doc_list]

In [12]:
def parse_txt(doc):

    # remove inserted characters
    doc = re.sub('\(cid:[0-9]\)','', doc)
    
    # the number of pages is the last character on the first page
    page_break_re = re.compile('\f')
    page_breaks = [m.start() for m in page_break_re.finditer(doc)]
    page_count_re = re.compile('Page\s[\d]\sof\s[\d]')
    n0, n1 = zip(*[[int(d) for d in s[5:].split(' of ')] for s in page_count_re.findall(doc)])
    num_pages = n0[np.where(np.array(n0)==np.array(n1))[0][0]]
    
    # trim extraneous pages and remove page footers
    agenda = doc[:page_breaks[num_pages-1]+1]
#     agenda = re.sub(footer_start + '[^()]*\x0c', '', agenda)
#     agenda = re.sub('Page\s[\d]\sof\s[\d][\s\n\t]*\x0c', '', agenda)    
    agenda = re.sub(footer_start +'(.*\n?)', '', agenda)
    agenda = re.sub('Page\s[\d]\sof\s[\d]', '', agenda)
    agenda = re.sub('\f', '', agenda)
    
    # get section breaks in document
    all_section_starts = [agenda.find(h) for h in all_section_headers]
    section_heads, section_starts = zip(*[(k,t) for k, t in zip(all_section_headers, all_section_starts) if t>0])
    
    # sort sections
    sort_idx = np.argsort(section_starts)
    headers = list(np.array(section_heads)[sort_idx])
    breaks = list(np.array(section_starts)[sort_idx]) + [len(agenda)]    
    
    # get section ranges
    section_ranges = [('HEADER', 0, breaks[0])]
    section_ranges.extend([(headers[i], breaks[i], breaks[i+1]) for i in range(len(headers))])
    
    # store section

    sections = {}
    for head, start, end in section_ranges:
        section = agenda[start:end]
        if head=='HEADER':
            header = section.split('\n')
            header = [h for h in header if h.strip()]
        else:
            for s in head_suffixes:                
                section = re.sub(head+s, '', section)
            for s in strip_patterns:                
                section = re.sub(s, '', section)
            section = section.strip()
            sections[head] = section
        
        for v in mtg_vars:
            if v=='MTG_TYPE':
                for s in ['regular', 'special', 'amended']:
                    if s in header[0].lower():
                        sections[v] = s 
            elif v=='MTG_DATETIME':
                sections[v] = dparser.parse(header[1])
            elif v=='MTG_LOCATION':            
                sections[v] = header[2]
            elif v=='DOC_NUM_PAGES':
                sections[v] = num_pages
        sections['HEADER'] = header
        
    return sections

In [13]:
docs = []
segmented_docs = []
for path in doc_paths:
    with open(path, 'r') as f:
        doc = f.read()
    docs.append(doc)
    sections = parse_txt(doc)
    segmented_docs.append(sections)

In [14]:
data = pd.DataFrame(segmented_docs, columns=mtg_vars + all_section_headers)
for s in nan_values:
    data[data==s] = np.nan
data = data.sort_values('MTG_DATETIME', ascending=False).reset_index(drop=False, )

  result = method(y)


ValueError: cannot reindex from a duplicate axis

In [15]:
data

Unnamed: 0,MTG_TYPE,MTG_DATETIME,MTG_LOCATION,DOC_NUM_PAGES,CALL TO ORDER,ROLL CALL,PLEDGE OF ALLEGIANCE,INVOCATION,PROCLAMATIONS,COMMUNITY PARTICIPATION FORUM,...,INTRODUCTION OF NEW OR PROMOTED EMPLOYEES,NEW AND PROMOTED EMPLOYEES,PUBLIC HEARING,ITEMS FOR COUNCIL CONSIDERATION,CITY STAFF AND COUNCIL COMMITTEE REPORTS,POTENTIAL FUTURE CITY COUNCIL ITEMS,CLOSED SESSION,ADJOURNMENT,NOTE 1,NOTE 2
0,regular,2019-09-16 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Johnson,Recording Secretary,Mayor Johnson,,,Members of the public may address the City Cou...,...,,,Public Hearing: Applicant is proposing a 21-p...,Program to Manage Unclaimed Cats,,Environmental Document Approval and Adoption –...,Conference with Labor Negotiators pursuant to ...,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
1,regular,2018-03-19 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Hall,Recording Secretary,Vice Mayor Bruce Johnson,"Pastor Brad Roberts, Calvary Chapel of Gridley",,Members of the public may address the City Cou...,...,,,,Authorization for Execution of Certifications ...,,,,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
2,regular,2019-04-15 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Johnson,Recording Secretary,Councilmember Borges,Otto Behunin of the Church of Jesus Christ Lat...,"National Sexual Assault Awareness Month, April...",Members of the public may address the City Cou...,...,,,,Annual Independent Financial Audit Report Swea...,Oral Update of FEMA and Gridley Camp Fire Comm...,,,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
3,regular,2019-06-17 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Johnson,Recording Secretary,Councilmember Borges,"Pastor Brad Roberts, Calvary Chapel of Gridley",,Members of the public may address the City Cou...,...,,,,Purchase and Installation of new Vierra Park P...,,Interview Planning Commissioners (Special Meet...,Conference with Legal Counsel concerning exist...,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
4,special,2018-06-21 13:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",1,Mayor Hall,,,,,Members of the public may address the City Cou...,...,,,,"Special Budget Meeting – Introduction, Finance...",,,,adjourning to the next Special Meeting on June...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
5,regular,2018-03-05 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Hall,Recording Secretary,Councilmember Borges,"Reverend Dan Boeger, St. Timothy’s Episcopal C...",,Members of the public may address the City Cou...,...,,,,Annual Independent Financial Audit Report,,,,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
6,amended,2019-02-19 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",2,Mayor Johnson,Recording Secretary,Councilmember Hall,"Pastor Branden Heskett, Christian Life Church ...",,Members of the public may address the City Cou...,...,,• Swearing in of Officer Homero Rodriguez,,Council Authorization to fund Butte County Int...,,,Discussion to fill Council Vacancy Approval of...,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
7,special,2018-12-28 16:00:00,"Gridley Community Center, 200 E. Spruce Street...",1,Mayor Johnson,Recording Secretary,,,,,...,,,,Update Regarding Enhanced Law Enforcement in t...,,,,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
8,special,2018-06-25 13:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",1,Mayor Hall,,,,,Members of the public may address the City Cou...,...,,,,Council approval of Fiscal Year 2018-2019 Budg...,,,,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS – In accordance ...
9,regular,2017-07-17 18:00:00,"Gridley City Hall, 685 Kentucky Street, Gridle...",3,Mayor Hall,Recording Secretary,Councilmember Williams,TBD,,Members of the public may address the City Cou...,...,,,Public Hearing to receive public comment on th...,Information Report related to the status of th...,,Biggs Police Contract Resolutions for Assessme...,Government Code 54959: Conference with Legal C...,adjourning to the next regularly scheduled mee...,POSTING OF AGENDA- This agenda was posted on t...,REGARDING UNSCHEDULED MATTERS — In accordance ...


In [16]:
for i, r in data.iterrows():
    print(i, r['CONSENT AGENDA'])
    print()

0 Items on the Consent Agenda are considered routine and acted upon by one motion. Any Council member may request that an item be removed for separate consideration. The City Council may only make minor comments; otherwise the item should be removed from the Consent Agenda and placed as the first item(s) under “Items for Council Consideration”.    City Council minutes dated September 3, 2019

1 The Consent Agenda is considered routine and acted upon by one motion. Any Council member may request that an item be removed for separate consideration. The City Council may only make minor comments; otherwise the item should be removed from the Consent Agenda and placed as the first item(s) under “Items for Council Consideration”.    City Council minutes dated February 28 and March 5, 2018

2 Items on the Consent Agenda are considered routine and acted upon by one motion. Any Council member may request that an item be removed for separate consideration. The City Council may only make minor com

In [10]:
hex(ord('•'))

'0x2022'

In [9]:
for i, r in data.iterrows():
    print(i, r['CITY STAFF AND COUNCIL COMMITTEE REPORTS'])

0 Police Department Digital Radio System Potential Cessation of Net Metering Program Temporary Residential Housing Incentive Program #2
1 nan
2 nan
3 nan
4 Potential Cessation of Net Metering Program Police Department Digital radio system
5 nan
6 nan
7 nan
8 nan
9 •  FEMA Update by Administrator Eckert
10 Oral Update of FEMA and Gridley Camp Fire Community Expanded Use of Eagle Meadows Park for Organized Sports CAL Fire Contract to add one Additional Firefighter Potential Cessation of Net Metering Program
11 nan
12 nan
13 Approval of FY 17/18 Audit Fire Vehicle Purchase City Council selection of a new Councilmember Midyear Budget Update Swearing-in of new City Councilmember Overview of Council Review Process for Budget and CIP
14 nan
15 nan
16 Council Committee Appointments Fire Vehicle Purchase Update on Finance Software Selection Process Approval of Selection Process for Police Chief Approval of FY 17/18 Audit
17 nan
18 nan
19 CITY ADMINISTRATOR UPDATE REGARDING EVACUATION CENTER
20 

In [21]:

agenda = 'GRIDLEY CITY COUNCIL AGENDA: Regular Meeting of 3-18-19                      Page 1 of 2'
agenda = re.sub(footer_start + '.*Page\s[\d]\sof\s[\d]', '', agenda)

In [22]:
agenda

''

In [12]:
re.sub('\(cid:[0-9]\)','', docs[9][:3000])

'Gridley City Council — Regular City Council Meeting Agenda \n\nMonday, July 17, 2017; 6:00 pm \n\nGridley City Hall, 685 Kentucky Street, Gridley, CA 95948 \n\n"Our purpose is to continuously enhance our community\'s vitality and overall quality of life. We \n\nare committed to providing high quality, cost-effective municipal services and forming \n\nproductive partnerships with our residents and regional organizations. We collectively develop, \n\nshare, and are guided by a clear vision, values, and meaningful objectives." \n\nCALL TO ORDER - Mayor Hall \nROLL CALL - Recording Secretary \nPLEDGE OF ALLEGIANCE — Councilmember Williams \nINVOCATION — TBD \nPROCLAMATIONS - None \nCOMMUNITY PARTICIPATION FORUM - Members of the public may address the City \nCouncil on matters not listed on the agenda. The City Council may not discuss nor take action on \nany community participation item brought forward by a member of the community. Comments are \nrequested to be limited to three (3) minut

In [38]:
agenda = 'GRIDLEY CITY COUNCIL AGENDA: Regular Meeting of 7-1 7-1 7 \n\nPage 1 of 3'
agenda = re.sub(footer_start +'(.*\n?)', '', agenda)
agenda

'\nPage 1 of 3'