In [1]:
import urllib
from bs4 import BeautifulSoup
import pandas as pd

In [7]:
class Scraper(object): 
    def __init__ (self, site_url, table_url, mtg_type = None):
        self.site_url = site_url
        self.table_url = table_url
        self.mtg_type = mtg_type
        self.table_html = None
        self.table_page = None
        self.table_data = None
        self.next_table_url = None
        self.data_headers = [
            'city',
            'committee',
            'date',
            'doc_type',
            'url',
            'local_path_pdf',
            'local_path_txt'
        ]
        
    def scrape(self):
        self.read_table_page()
        self.parse_table_html()
        self.data = self.convert_table_data()
        
    def run(self):
        # on a loop
        self.scrape()
        self.update_files()
        pass
    
    def update_files():
        pass
        
    def read_table_page(self):
        with urllib.request.urlopen(self.table_url) as f:
            self.table_page = f.read()
        self.table_html = BeautifulSoup(self.table_page)
        
    def build_url(self, x):
        if x.a:
            return self.site_url + x.a["href"]

In [8]:
class GridleyScraper(Scraper):
            
    def parse_table_html(self):
        table_of_docs = self.table_html.body.find('table', attrs={'class': 'table-responsve'})
        table_headers = [x.text for x in table_of_docs.find_all('th')]
        table_rows = table_of_docs.tbody.find_all('tr')
        table_data = {h: [] for h in table_headers}
        for row in table_rows:
            elements = row.find_all('td')
            for i, h in enumerate(table_headers):
                element = elements[i]
                table_data[h].append(element)
        self.table_data = pd.DataFrame(table_data)

    def convert_table_data(self):
        def split_date_and_name(x):
            words = x.split(" ")
            return " ".join(words[:3]), " ".join(words[3:])
        def parse_table_row(df):
            date_and_name = df["Date"].text
            meeting_date, meeting_type = split_date_and_name(date_and_name)
            cancelled = "cancel" in meeting_type
            agenda_elem = df["Agenda"]
            minutes_elem = df["Minutes"]
            return meeting_date, cancelled, self.build_url(agenda_elem), self.build_url(minutes_elem)
        data = pd.DataFrame({"id": range(0, self.table_data.shape[0])})
        data["Date"], data["Meeting cancelled?"], data["Agenda"], data["Minutes"] = zip(*self.table_data.apply(parse_table_row, axis=1))
        data["Type"] = self.mtg_type
        return data

In [10]:
gridley_cc_scraper.data

Unnamed: 0,id,Date,Meeting cancelled?,Agenda,Minutes,Type
0,0,"October 7, 2019",False,http://gridley.ca.us/public/uploads/pdfs/10-7-...,,City Council
1,1,"September 16, 2019",False,http://gridley.ca.us/public/uploads/pdfs/9-16-...,http://gridley.ca.us/public/uploads/pdfs/9-16-...,City Council
2,2,"September 3, 2019",False,http://gridley.ca.us/public/uploads/pdfs/9-3-1...,http://gridley.ca.us/public/uploads/pdfs/9-3-1...,City Council
3,3,"August 19, 2019",False,http://gridley.ca.us/public/uploads/pdfs/8-19-...,http://gridley.ca.us/public/uploads/pdfs/8-19-...,City Council
4,4,"August 5, 2019",False,http://gridley.ca.us/public/uploads/pdfs/8-5-1...,http://gridley.ca.us/public/uploads/pdfs/8-5-1...,City Council
5,5,"July 1, 2019",False,http://gridley.ca.us/public/uploads/pdfs/7-1-1...,http://gridley.ca.us/public/uploads/pdfs/7-1-1...,City Council
6,6,"June 17, 2019",False,http://gridley.ca.us/public/uploads/pdfs/6-17-...,http://gridley.ca.us/public/uploads/pdfs/6-17-...,City Council
7,7,"June 3, 2019",False,http://gridley.ca.us/public/uploads/pdfs/6-3-1...,http://gridley.ca.us/public/uploads/pdfs/6-3-1...,City Council
8,8,"June 3, 2019",False,,,City Council
9,9,"May 20, 2019",False,http://gridley.ca.us/public/uploads/pdfs/5-20-...,http://gridley.ca.us/public/uploads/pdfs/5-20-...,City Council


In [37]:
class GridleyScraper(Scraper):
            
    def parse_table_html(self):
        table_of_docs = self.table_html.body.find('table', attrs={'class': 'table-responsve'})
        table_headers = [x.text for x in table_of_docs.find_all('th')]
        table_rows = table_of_docs.tbody.find_all('tr')
        table_data = {h: [] for h in table_headers}
        for row in table_rows:
            elements = row.find_all('td')
            for i, h in enumerate(table_headers):
                element = elements[i]
                table_data[h].append(element)
        self.table_data = pd.DataFrame(table_data)

    def convert_table_data(self):
        def split_date_and_name(x):
            words = x.split(" ")
            return " ".join(words[:3]), " ".join(words[3:])
        def parse_table_row(df):
            date_and_name = df["Date"].text
            meeting_date, meeting_type = split_date_and_name(date_and_name)
            cancelled = "cancel" in meeting_type
            agenda_elem = df["Agenda"]
            minutes_elem = df["Minutes"]
            doc_types = ["Agenda", "Minutes"]
            return pd.Series({
                "city": "Gridley",
                "committee": self.mtg_type,
                "date": meeting_date,
                "doc_type": doc_types,
                'url': [self.build_url(df[x]) for x in doc_types]})
        print(self.table_data.apply(parse_table_row, axis=1))

gridley_cc_scraper = GridleyScraper(
    site_url = "http://gridley.ca.us",
    mtg_type = "City Council",
    table_url = "http://gridley.ca.us/government-and-departments/city-council/")
print(gridley_cc_scraper.data)

                                                    0  \
0   city                 Gridley
committee       C...   
1   city                    Gridley
committee     ...   
2   city                   Gridley
committee      ...   
3   city                 Gridley
committee       C...   
4   city                Gridley
committee      Cit...   
5   city              Gridley
committee    City Co...   
6   city               Gridley
committee     City ...   
7   city              Gridley
committee    City Co...   
8   city              Gridley
committee    City Co...   
9   city              Gridley
committee    City Co...   
10  city              Gridley
committee    City Co...   
11  city                Gridley
committee      Cit...   
12  city               Gridley
committee     City ...   
13  city                Gridley
committee      Cit...   
14  city               Gridley
committee     City ...   
15  city                   Gridley
committee      ...   
16  city                  Gridl

In [177]:
# gridley_cc_scraper.data

In [45]:
class BiggsScraper(Scraper): 
        
    def scrape(self):
        while self.table_url != self.next_table_url:
            if self.next_table_url:
                self.table_url = self.next_table_url
            self.read_table_page()
            self.parse_table_html()
            self.data = self.convert_table_data()
    
    def parse_table_html(self):
        table_of_docs = self.table_html.body.find('table')
        table_headers = [x.text.strip() for x in table_of_docs.find("tr").find_all("td")]
        table_rows = table_of_docs.tbody.find_all('tr')[1:]
        table_data = {h: [] for h in table_headers}
        for row in table_rows:
            elements = row.find_all('td')
            if len(elements) == len(table_headers):
                for i, h in enumerate(table_headers):
                    element = elements[i]
                    table_data[h].append(element)
            else:
                print("next url to lookup: ", self.build_url(elements[0]))
                self.next_table_url = self.build_url(elements[0])
        self.table_data = pd.DataFrame(table_data)

    def convert_table_data(self):
        def split_date_and_name(x):
            words = x.split(" ")
            return " ".join(words[:3]), " ".join(words[3:])
        def parse_table_row(df):
            date = df["Meeting Date"].text.strip()
            time = df["Time"].text.strip()
            if "Type" in df:
                mtg_type = df["Type"].text.strip()
            elif "TYpe" in df:
                mtg_type = df["TYpe"].text.strip()
            else:
                mtg_type = ""
            cancelled = "cancel" in mtg_type.lower()
            if cancelled:
                mtg_type = mtg_type[11:]
            agenda_elem = df["Agendas"]
            minutes_elem = df["Minutes"]
            return date, time, mtg_type, cancelled, self.build_url(agenda_elem), self.build_url(minutes_elem)
        data = pd.DataFrame({"id": range(0, self.table_data.shape[0])})
        data["Date"], data["Time"], data["Type"], data["Meeting cancelled?"], data["Agenda"], data["Minutes"] = zip(*self.table_data.apply(parse_table_row, axis=1))
        return data

biggs_scraper = BiggsScraper(
    site_url = "https://www.biggs-ca.gov/",
#     table_url = "https://www.biggs-ca.gov/Government/Agendas--Minutes/index.html")
    table_url = "https://www.biggs-ca.gov/Government/Agendas--Minutes/2009-Agendas--Minutes/index.html")

next url to lookup:  https://www.biggs-ca.gov/Government/Agendas--Minutes/2008-Agendas--Minutes/index.html


In [176]:
# biggs_scraper.data

In [47]:
table_url = "http://liveoakca.iqm2.com/Citizens/Calendar.aspx?From=1/1/1900&To=12/31/9999"
with urllib.request.urlopen(table_url) as f:
    table_page = f.read()
table_html = BeautifulSoup(table_page)

In [178]:
site_url = "http://liveoakca.iqm2.com/Citizens/"
        
def build_url(x):
    if x.a:
        return site_url + x.a["href"]
    
table_data = []
        
month = None
year = None
rows = table_html.find("div", {"id": "ContentPlaceholder1_pnlMeetings"}).find_all("div", {"class": "Row"})
for row in rows:
    if "MonthHeader" in row["class"]:
        month, year = row.text.strip().split(", ")
    elif "MeetingRow" in row["class"]:
        cancelled = False
        links = {}
        for row_part in row.find_all("div", recursive=False):
            for div in row_part.find_all("div", recursive=False):
                if "RowIcon" in div["class"]:
                    pass
                elif "RowLink" in div["class"]:
                    # title of RowLink has lots of info
                    links["mtg_page"] = build_url(div)
                    date = div.text.strip()
                elif "MeetingLinks" in div["class"]:
                    for link in div.find_all("div"):
                        doc_type = div.a.text.strip()
                        partial_url = div.a["href"]
                        if not partial_url in ["javascript:void(0)", "", "#"]:
                            doc_url = build_url(div)
                            links[doc_type] = doc_url
                elif "RowDetails" in div["class"]:
                    mtg_type = div.text.strip()
                    pass
                elif "RowRight" in div["class"]:
                    cancelled = True
        table_data.append((month, year, date, cancelled, links, mtg_type))
table_data = pd.DataFrame(table_data, columns=["Month", "Year", "Date", "Cancelled", "Links", "Type"])
table_data

Unnamed: 0,Month,Year,Date,Cancelled,Links,Type
0,November,2016,"Nov 16, 2016 6:00 PM",False,{'mtg_page': 'http://liveoakca.iqm2.com/Citize...,City Council - Regular Meeting
1,December,2016,"Dec 7, 2016 6:00 PM",False,{'mtg_page': 'http://liveoakca.iqm2.com/Citize...,City Council - Regular Meeting
2,December,2016,"Dec 21, 2016 6:00 PM",False,{'mtg_page': 'http://liveoakca.iqm2.com/Citize...,City Council - Regular Meeting
3,January,2017,"Jan 4, 2017 6:00 PM",False,{'mtg_page': 'http://liveoakca.iqm2.com/Citize...,City Council - Regular Meeting
4,January,2017,"Jan 18, 2017",True,{'mtg_page': 'http://liveoakca.iqm2.com/Citize...,City Council - Regular Meeting
...,...,...,...,...,...,...
119,October,2019,"Oct 16, 2019 6:00 PM",False,{'mtg_page': 'http://liveoakca.iqm2.com/Citize...,City Council - Regular Meeting
120,November,2019,"Nov 6, 2019 6:00 PM",False,{'mtg_page': 'http://liveoakca.iqm2.com/Citize...,City Council - Regular Meeting
121,November,2019,"Nov 20, 2019 6:00 PM",False,{'mtg_page': 'http://liveoakca.iqm2.com/Citize...,City Council - Regular Meeting
122,December,2019,"Dec 4, 2019 6:00 PM",False,{'mtg_page': 'http://liveoakca.iqm2.com/Citize...,City Council - Regular Meeting


In [180]:
def parse_table_row(df):
    agenda = None
    minutes = None
    links = df["Links"]
    if "Agenda" in links:
        agenda = links["Agenda"]
    if "Minutes" in links:
        minutes = links["Minutes"]
    return df["Date"], df["Cancelled"], agenda, minutes, df["Type"]
data = pd.DataFrame({"id": range(0, table_data.shape[0])})
data["Date"], data["Meeting cancelled?"], data["Agenda"], data["Minutes"], data["Type"] = zip(*table_data.apply(parse_table_row, axis=1))
# return data
data

Unnamed: 0,id,Date,Meeting cancelled?,Agenda,Minutes,Type
0,0,"Nov 16, 2016 6:00 PM",False,http://liveoakca.iqm2.com/Citizens/FileOpen.as...,,City Council - Regular Meeting
1,1,"Dec 7, 2016 6:00 PM",False,http://liveoakca.iqm2.com/Citizens/FileOpen.as...,,City Council - Regular Meeting
2,2,"Dec 21, 2016 6:00 PM",False,http://liveoakca.iqm2.com/Citizens/FileOpen.as...,,City Council - Regular Meeting
3,3,"Jan 4, 2017 6:00 PM",False,http://liveoakca.iqm2.com/Citizens/FileOpen.as...,,City Council - Regular Meeting
4,4,"Jan 18, 2017",True,,,City Council - Regular Meeting
...,...,...,...,...,...,...
119,119,"Oct 16, 2019 6:00 PM",False,,,City Council - Regular Meeting
120,120,"Nov 6, 2019 6:00 PM",False,,,City Council - Regular Meeting
121,121,"Nov 20, 2019 6:00 PM",False,,,City Council - Regular Meeting
122,122,"Dec 4, 2019 6:00 PM",False,,,City Council - Regular Meeting


In [182]:
class LiveOaksScraper(Scraper):
            
    def parse_table_html(self):
        table_data = []

        month = None
        year = None
        rows = self.table_html.find("div", {"id": "ContentPlaceholder1_pnlMeetings"}).find_all("div", {"class": "Row"})
        for row in rows:
            if "MonthHeader" in row["class"]:
                month, year = row.text.strip().split(", ")
            elif "MeetingRow" in row["class"]:
                cancelled = False
                links = {}
                date = None
                mtg_type = None
                for row_part in row.find_all("div", recursive=False):
                    for div in row_part.find_all("div", recursive=False):
                        if "RowIcon" in div["class"]:
                            pass
                        elif "RowLink" in div["class"]:
                            # title of RowLink has lots of info
                            links["mtg_page"] = self.build_url(div)
                            date = div.text.strip()
                        elif "MeetingLinks" in div["class"]:
                            for link in div.find_all("div"):
                                doc_type = div.a.text.strip()
                                partial_url = div.a["href"]
                                if not partial_url in ["javascript:void(0)", "", "#"]:
                                    doc_url = self.build_url(div)
                                    links[doc_type] = doc_url
                        elif "RowDetails" in div["class"]:
                            mtg_type = div.text.strip()
                            pass
                        elif "RowRight" in div["class"]:
                            cancelled = True
                table_data.append((month, year, cancelled, links, mtg_type, date))
        table_data = pd.DataFrame(table_data, columns=["Month", "Year", "Cancelled", "Links", "Type", "Date"])
        self.table_data = table_data

    def convert_table_data(self):
        def parse_table_row(df):
            agenda = None
            minutes = None
            links = df["Links"]
            if "Agenda" in links:
                agenda = links["Agenda"]
            if "Minutes" in links:
                minutes = links["Minutes"]
            return df["Date"], df["Cancelled"], agenda, minutes, df["Type"]
        data = pd.DataFrame({"id": range(0, self.table_data.shape[0])})
        data["Date"], data["Meeting cancelled?"], data["Agenda"], data["Minutes"], data["Type"] = zip(*self.table_data.apply(parse_table_row, axis=1))
        return data
    
live_oaks_scraper = LiveOaksScraper(
    site_url = "http://liveoakca.iqm2.com/Citizens/",
    table_url = "http://liveoakca.iqm2.com/Citizens/Calendar.aspx?From=1/1/1900&To=12/31/9999")

In [183]:
# live_oaks_scraper.data

Unnamed: 0,id,Date,Meeting cancelled?,Agenda,Minutes,Type
0,0,"Nov 16, 2016 6:00 PM",False,http://liveoakca.iqm2.com/Citizens/FileOpen.as...,,City Council - Regular Meeting
1,1,"Dec 7, 2016 6:00 PM",False,http://liveoakca.iqm2.com/Citizens/FileOpen.as...,,City Council - Regular Meeting
2,2,"Dec 21, 2016 6:00 PM",False,http://liveoakca.iqm2.com/Citizens/FileOpen.as...,,City Council - Regular Meeting
3,3,"Jan 4, 2017 6:00 PM",False,http://liveoakca.iqm2.com/Citizens/FileOpen.as...,,City Council - Regular Meeting
4,4,"Jan 18, 2017",True,,,City Council - Regular Meeting
...,...,...,...,...,...,...
119,119,"Oct 16, 2019 6:00 PM",False,,,City Council - Regular Meeting
120,120,"Nov 6, 2019 6:00 PM",False,,,City Council - Regular Meeting
121,121,"Nov 20, 2019 6:00 PM",False,,,City Council - Regular Meeting
122,122,"Dec 4, 2019 6:00 PM",False,,,City Council - Regular Meeting


In [None]:
table_url = "https://www.hillsborough.net/DocumentCenter/"
# HillsboroughScraper

In [None]:
def init_srapers():
    return [
        GridleyScraper(
            site_url = "http://gridley.ca.us",
            mtg_type = "City Council",
            table_url = "http://gridley.ca.us/government-and-departments/city-council/"),
        GridleyScraper(
            site_url = "http://gridley.ca.us",
            mtg_type = "Planning Commission",
            table_url = "http://gridley.ca.us/government-and-departments/planning-commission/"),
        BiggsScraper(
            site_url = "https://www.biggs-ca.gov/",
            table_url = "https://www.biggs-ca.gov/Government/Agendas--Minutes/index.html"),
#             table_url = "https://www.biggs-ca.gov/Government/Agendas--Minutes/2009-Agendas--Minutes/index.html")
         LiveOaksScraper(
            site_url = "http://liveoakca.iqm2.com/Citizens/",
            table_url = "http://liveoakca.iqm2.com/Citizens/Calendar.aspx?From=1/1/1900&To=12/31/9999")
    ]