In [1]:
import os, re
import numpy as np

In [2]:
# Gridley-specific layout constants used by parse_txt.

# Section headings searched for in each agenda; parse_txt cuts the text at
# the first occurrence of every heading it finds.
all_section_headers = [
    'CALL TO ORDER',
    'ROLL CALL',
    'PLEDGE OF ALLEGIANCE',
    'INVOCATION',
    'PROCLAMATIONS',
    'INTRODUCTION OF NEW OR PROMOTED EMPLOYEES',
    'COMMUNITY PARTICIPATION FORUM',
    'CONSENT AGENDA',
    'PUBLIC HEARING',
    'ITEMS FOR COUNCIL CONSIDERATION',
    'CITY STAFF AND COUNCIL COMMITTEE REPORTS',
    'POTENTIAL FUTURE CITY COUNCIL ITEMS',
    'CLOSED SESSION',
    'ADJOURNMENT',
    'NOTE 1',
    'NOTE 2',
]

# Text that opens the page footer; parse_txt uses it to strip footers.
footer_start = 'GRIDLEY CITY COUNCIL AGENDA'

In [3]:
# Collect the Gridley city-council agenda text files.

data_dir = '../data/docs/gridley/'
required_substrings = ['Gridley', 'Agenda', 'City-Council', '.txt']


def matches_substrings(s):
    """Return True when every entry of required_substrings occurs in s."""
    return all(ss in s for ss in required_substrings)


# os.listdir returns names in arbitrary order; sort them so that doc_list
# (and anything indexed from it later, e.g. docs[0]) is stable between runs.
doc_list = sorted(s for s in os.listdir(data_dir) if matches_substrings(s))
doc_paths = [os.path.join(data_dir, doc) for doc in doc_list]

In [4]:
def parse_txt(doc, section_headers=None, footer=None):
    """Split the text of one agenda document into titled sections.

    The document is expected to use form-feed characters as page breaks and
    to contain "Page N of M" markers. The first marker with N == M gives the
    number of agenda pages; text after that page is discarded. Page footers
    are then removed and the remaining text is cut at the first occurrence
    of each known section header.

    Parameters
    ----------
    doc : str
        Full text of the document.
    section_headers : sequence of str, optional
        Header strings to look for. Defaults to the module-level
        ``all_section_headers``.
    footer : str, optional
        Literal text that starts every page footer. Defaults to the
        module-level ``footer_start``.

    Returns
    -------
    list of dict
        One ``{'title': ..., 'content': ...}`` dict per section, in document
        order. The first entry is always titled ``'HEADER'`` and holds the
        text that precedes the first recognised header (possibly empty).

    Raises
    ------
    ValueError
        If the document contains no "Page N of N" marker.
    """
    if section_headers is None:
        section_headers = all_section_headers
    if footer is None:
        footer = footer_start

    # offsets of the form feeds that terminate each page
    page_breaks = [m.start() for m in re.finditer('\f', doc)]

    # Multi-digit page numbers must be captured whole: a single-digit
    # pattern reads "Page 1 of 10" as "Page 1 of 1" and truncates the agenda.
    page_marks = re.findall(r'Page\s+(\d+)\s+of\s+(\d+)', doc)
    final_pages = [int(cur) for cur, total in page_marks if int(cur) == int(total)]
    if not final_pages:
        raise ValueError('no "Page N of N" marker found; cannot determine agenda length')
    num_pages = final_pages[0]

    # trim extraneous pages (keep the whole text when it has fewer form
    # feeds than agenda pages, e.g. no trailing page break)
    if 1 <= num_pages <= len(page_breaks):
        agenda = doc[:page_breaks[num_pages - 1] + 1]
    else:
        agenda = doc

    # Remove page footers: from the footer text to the nearest form feed.
    # The match is lazy so it cannot run on through later pages, and it
    # still refuses to cross parentheses, as the original pattern did.
    agenda = re.sub(re.escape(footer) + r'[^()]*?\f', '', agenda)

    # Locate the headers that are present (str.find returns -1 when absent,
    # and 0 is a valid hit), then order them by position in the text.
    located = [(agenda.find(h), h) for h in section_headers]
    located = sorted(((pos, h) for pos, h in located if pos >= 0),
                     key=lambda item: item[0])

    first_break = located[0][0] if located else len(agenda)
    sections = [{'title': 'HEADER', 'content': agenda[:first_break]}]

    # each section runs up to the start of the next one (or the end of text)
    ends = [pos for pos, _ in located[1:]] + [len(agenda)]
    for (start, head), end in zip(located, ends):
        # drop only the heading itself; later mentions of the same words
        # inside the section body are kept
        sections.append({'title': head, 'content': agenda[start + len(head):end]})

    return sections

In [5]:
# Read every agenda file and split it into sections with parse_txt.
docs = []
for path in doc_paths:
    # Decode explicitly rather than with the platform default encoding: the
    # agendas contain en dashes and curly quotes.
    # NOTE(review): assumes the .txt files are UTF-8 — confirm.
    with open(path, 'r', encoding='utf-8') as f:
        doc = f.read()
    sections = parse_txt(doc)
    docs.append(sections)

In [6]:
# Preview the parsed sections of the first document in `docs`.
docs[0]

[{'title': 'HEADER',
  'content': 'Gridley City Council – Regular City Council Meeting Agenda \n\nMonday, September 16, 2019; 6:00 pm \n\nGridley City Hall, 685 Kentucky Street, Gridley, CA 95948 \n\n \n\n \n\n \n\n“Our purpose is to continuously enhance our community’s vitality and overall quality of life.  We \n\nare committed to providing high quality,  cost-effective municipal services and forming \n\nproductive partnerships with our residents and regional organizations.  We collectively develop, \n\nshare, and are guided by a clear vision, values, and meaningful objectives.” \n\n \n\n \n'},
 {'title': 'CALL TO ORDER', 'content': ' - Mayor Johnson \n'},
 {'title': 'ROLL CALL', 'content': ' - Recording Secretary \n'},
 {'title': 'PLEDGE OF ALLEGIANCE', 'content': ' – Mayor Johnson \n'},
 {'title': 'PROCLAMATIONS', 'content': ' – None \n\n'},
 {'title': 'INTRODUCTION OF NEW OR PROMOTED EMPLOYEES',
  'content': ' - None \n\n'},
 {'title': 'COMMUNITY PARTICIPATION FORUM',
  'content': 