In [1]:
import urllib
from bs4 import BeautifulSoup
import pandas as pd

In [37]:
class Scraper(object): 
    def __init__ (self, site_url, table_url, mtg_type = None):
        self.site_url = site_url
        self.table_url = table_url
        self.mtg_type = mtg_type
        self.next_table_url = None
        self.table_page = None
        self.table_data = None
        self.data_headers = [
            'City','Date','Type',
            'Committee','Agendas','Minutes',
            'Revised' 'document?','Meeting cancelled?',
            'Time'
        ]
        self.scrape()
        
    def scrape(self):
        self.read_table_data()
        self.parse_table_data()
        self.data = self.convert_table_data()
        
    def read_table_data(self):
        with urllib.request.urlopen(self.table_url) as f:
            self.table_page = f.read()
        self.table_data = BeautifulSoup(self.table_page)
        
    def build_url(self, x):
        if x.a:
            return self.site_url + x.a["href"]
        
class GridleyScraper(Scraper):
            
    def parse_table_data(self):
        table_of_docs = self.table_data.body.find('table', attrs={'class': 'table-responsve'})
        table_headers = [x.text for x in table_of_docs.find_all('th')]
        table_rows = table_of_docs.tbody.find_all('tr')
        table_data = {h: [] for h in table_headers}
        for row in table_rows:
            elements = row.find_all('td')
            for i, h in enumerate(table_headers):
                element = elements[i]
                table_data[h].append(element)
        self.table_data = pd.DataFrame(table_data)

    def convert_table_data(self):
        def split_date_and_name(x):
            words = x.split(" ")
            return " ".join(words[:3]), " ".join(words[3:])
        def parse_table_row(df):
            date_and_name = df["Date"].text
            meeting_date, meeting_type = split_date_and_name(date_and_name)
            cancelled = "cancel" in meeting_type
            agenda_elem = df["Agenda"]
            minutes_elem = df["Minutes"]
            return meeting_date, cancelled, self.build_url(agenda_elem), self.build_url(minutes_elem)
        data = pd.DataFrame({"id": range(0, self.table_data.shape[0])})
        data["Date"], data["Meeting cancelled?"], data["Agenda"], data["Minutes"] = zip(*self.table_data.apply(parse_table_row, axis=1))
        data["Type"] = self.mtg_type
        return data

gridley_cc_scraper = GridleyScraper(
    site_url = "http://gridley.ca.us",
    mtg_type = "City Council",
    table_url = "http://gridley.ca.us/government-and-departments/city-council/")
gridley_pc_scraper = GridleyScraper(
    site_url = "http://gridley.ca.us",
    mtg_type = "Planning Commission",
    table_url = "http://gridley.ca.us/government-and-departments/planning-commission/")

In [38]:
gridley_cc_scraper.data

Unnamed: 0,id,Date,Meeting cancelled?,Agenda,Minutes,Type
0,0,"October 7, 2019",False,http://gridley.ca.us/public/uploads/pdfs/10-7-...,,City Council
1,1,"September 16, 2019",False,http://gridley.ca.us/public/uploads/pdfs/9-16-...,http://gridley.ca.us/public/uploads/pdfs/9-16-...,City Council
2,2,"September 3, 2019",False,http://gridley.ca.us/public/uploads/pdfs/9-3-1...,http://gridley.ca.us/public/uploads/pdfs/9-3-1...,City Council
3,3,"August 19, 2019",False,http://gridley.ca.us/public/uploads/pdfs/8-19-...,http://gridley.ca.us/public/uploads/pdfs/8-19-...,City Council
4,4,"August 5, 2019",False,http://gridley.ca.us/public/uploads/pdfs/8-5-1...,http://gridley.ca.us/public/uploads/pdfs/8-5-1...,City Council
5,5,"July 1, 2019",False,http://gridley.ca.us/public/uploads/pdfs/7-1-1...,http://gridley.ca.us/public/uploads/pdfs/7-1-1...,City Council
6,6,"June 17, 2019",False,http://gridley.ca.us/public/uploads/pdfs/6-17-...,http://gridley.ca.us/public/uploads/pdfs/6-17-...,City Council
7,7,"June 3, 2019",False,http://gridley.ca.us/public/uploads/pdfs/6-3-1...,http://gridley.ca.us/public/uploads/pdfs/6-3-1...,City Council
8,8,"June 3, 2019",False,,,City Council
9,9,"May 20, 2019",False,http://gridley.ca.us/public/uploads/pdfs/5-20-...,http://gridley.ca.us/public/uploads/pdfs/5-20-...,City Council


In [39]:
class BiggsScraper(Scraper): 
        
    def scrape(self):
        while self.table_url != self.next_table_url:
            if self.next_table_url:
                self.table_url = self.next_table_url
            self.read_table_data()
            self.parse_table_data()
            self.data = self.convert_table_data()
    
    def parse_table_data(self):
        table_of_docs = self.table_data.body.find('table')
        table_headers = [x.text.strip() for x in table_of_docs.find("tr").find_all("td")]
        table_rows = table_of_docs.tbody.find_all('tr')[1:]
        table_data = {h: [] for h in table_headers}
        for row in table_rows:
            elements = row.find_all('td')
            if len(elements) == len(table_headers):
                for i, h in enumerate(table_headers):
                    element = elements[i]
                    table_data[h].append(element)
            else:
                print("next url to lookup: ", self.build_url(elements[0]))
                self.next_table_url = self.build_url(elements[0])
        self.table_data = pd.DataFrame(table_data)

    def convert_table_data(self):
        def split_date_and_name(x):
            words = x.split(" ")
            return " ".join(words[:3]), " ".join(words[3:])
        def parse_table_row(df):
            date = df["Meeting Date"].text.strip()
            time = df["Time"].text.strip()
            if "Type" in df:
                mtg_type = df["Type"].text.strip()
            elif "TYpe" in df:
                mtg_type = df["TYpe"].text.strip()
            else:
                mtg_type = ""
            cancelled = "cancel" in mtg_type.lower()
            if cancelled:
                mtg_type = mtg_type[11:]
            agenda_elem = df["Agendas"]
            minutes_elem = df["Minutes"]
            return date, time, mtg_type, cancelled, self.build_url(agenda_elem), self.build_url(minutes_elem)
        data = pd.DataFrame({"id": range(0, self.table_data.shape[0])})
        data["Date"], data["Time"], data["Type"], data["Meeting cancelled?"], data["Agenda"], data["Minutes"] = zip(*self.table_data.apply(parse_table_row, axis=1))
        return data

biggs_scraper = BiggsScraper(
    site_url = "https://www.biggs-ca.gov/",
    table_url = "https://www.biggs-ca.gov/Government/Agendas--Minutes/index.html")
#     table_url = "https://www.biggs-ca.gov/Government/Agendas--Minutes/2009-Agendas--Minutes/index.html")

next url to lookup:  https://www.biggs-ca.gov/Government/Agendas--Minutes/2018--Agendas--Minutes/index.html
next url to lookup:  https://www.biggs-ca.gov/Government/Agendas--Minutes/2017-Agendas--Minutes/index.html
next url to lookup:  https://www.biggs-ca.gov/Government/Agendas--Minutes/2016-Agendas--Minutes/index.html
next url to lookup:  https://www.biggs-ca.gov/Government/Agendas--Minutes/2015-Agendas--Minutes/index.html
next url to lookup:  https://www.biggs-ca.gov/Government/Agendas--Minutes/2014-Agendas--Minutes/index.html
next url to lookup:  https://www.biggs-ca.gov/Government/Agendas--Minutes/2013-Agendas--Minutes/index.html
next url to lookup:  https://www.biggs-ca.gov/Government/Agendas--Minutes/2012-Agendas--Minutes/index.html
next url to lookup:  https://www.biggs-ca.gov/Government/Agendas--Minutes/2011-Agendas--Minutes/index.html
next url to lookup:  https://www.biggs-ca.gov/Government/Agendas--Minutes/2010-Agendas--Minutes/index.html
next url to lookup:  https://www.big

In [40]:
biggs_scraper.data

Unnamed: 0,id,Date,Time,Type,Meeting cancelled?,Agenda,Minutes
0,0,12-15-08,6:00 pm,Regular City Council Meeting,False,https://www.biggs-ca.gov/documents/Government/...,https://www.biggs-ca.gov/documents/Government/...
1,1,12-12-08,3:00 pm,Special City Council Meeting,False,https://www.biggs-ca.gov/documents/Government/...,https://www.biggs-ca.gov/documents/Government/...
2,2,11-17-08,6:00 pm,Regular City Council Meeting,False,https://www.biggs-ca.gov/documents/Government/...,https://www.biggs-ca.gov/documents/Government/...
3,3,10-20-08,6:00 pm,Regular City Council Meeting,False,https://www.biggs-ca.gov/documents/Government/...,https://www.biggs-ca.gov/documents/Government/...
4,4,10-06-08,6:00 pm,Special City Council Meeting,False,https://www.biggs-ca.gov/documents/Government/...,https://www.biggs-ca.gov/documents/Government/...
5,5,09-15-08,6:00 pm,Regular City Council Meeting,False,https://www.biggs-ca.gov/documents/Government/...,https://www.biggs-ca.gov/documents/Government/...
6,6,08-18-08,6:00 pm,Regular City Council Meeting,False,https://www.biggs-ca.gov/documents/Government/...,https://www.biggs-ca.gov/documents/Government/...
7,7,08-13-08,5:30 pm,Special City Council Meeting,False,https://www.biggs-ca.gov/documents/Government/...,https://www.biggs-ca.gov/documents/Government/...
8,8,07-25-08,3:00 pm,Special City Council Meeting,False,https://www.biggs-ca.gov/documents/Government/...,https://www.biggs-ca.gov/documents/Government/...
9,9,07-21-08,6:00 pm,Regular City Council Meeting,False,https://www.biggs-ca.gov/documents/Government/...,https://www.biggs-ca.gov/documents/Government/...
