Commit

All scrapers can be run from scraper.py. Various fixes and improvements.
43South committed Sep 30, 2021
1 parent f3ca635 commit c79860b
Showing 23 changed files with 687 additions and 515 deletions.
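
The commit message says the scrapers can all be run from scraper.py, and each council module in this diff now exposes a councildas() function that returns a list of record dicts, with saving moved into an `if __name__ == '__main__':` block. scraper.py itself is not among the files shown here, so the following is only a hedged sketch of what such an orchestrator might look like, assuming the seven modules visible in this diff and the councildas()/scraperwiki.sqlite.save() conventions they use; the module list, the loop, and the exception handling are illustrative assumptions, not the author's code.

    # Hypothetical scraper.py sketch (not part of the files shown in this commit):
    # call each council module's councildas() and save the combined records.
    import logging
    import os

    import scraperwiki

    import breakoday
    import brighton
    import burnie
    import centralcoast
    import centralhighlands
    import circularhead
    import dorset

    COUNCILS = [breakoday, brighton, burnie, centralcoast,
                centralhighlands, circularhead, dorset]

    if __name__ == '__main__':
        logging.basicConfig(level=logging.DEBUG)
        os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
        for council in COUNCILS:
            try:
                records = council.councildas()
            except Exception:
                # assumption: one broken council page should not stop the rest
                logging.exception('scrape failed for %s', council.__name__)
                continue
            for record in records:
                logging.debug(record)
                scraperwiki.sqlite.save(unique_keys=['council_reference'],
                                        data=record, table_name='data')

Only the councildas() interface and the scraperwiki.sqlite.save() call are taken from the diffs below; everything else in the sketch is an assumption.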
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
+.idea/
+scraperwiki.sqlite
+__pycache__
8 changes: 4 additions & 4 deletions breakoday.py
@@ -14,11 +14,11 @@ def councildas():
     records = []
     for da in das:
         lines = da('td')
-        council_reference = lines[3].get_text().strip()
-        address = lines[1].get_text() + ', Tasmania, Australia'
-        description = lines[0].get_text()
+        council_reference = lines[3].text.strip()
+        address = lines[1].text + ', Tasmania, Australia'
+        description = lines[0].text
         info_url = lines[3].find('a')['href']
-        on_notice_to = datetime.strptime(lines[2].get_text(), '%d %B %Y').strftime('%Y-%m-%d')
+        on_notice_to = datetime.strptime(lines[2].text, '%d %B %Y').strftime('%Y-%m-%d')
         record = {
             'council_reference': council_reference,
             'address': address,
9 changes: 4 additions & 5 deletions brighton.py
@@ -10,17 +10,16 @@ def councildas():
     html = scraperwiki.scrape(applications_url)
     date_scraped = datetime.now().isoformat()
     page = BeautifulSoup(html, 'html.parser')
-
     das = page.find('table')('tr')[1:]
     records = []
     for da in das:
         lines = da('td')
-        council_reference = lines[3].get_text().strip()
-        address = lines[1].get_text() + ', Tasmania, Australia'
-        description = lines[0].get_text()
+        council_reference = lines[3].text.strip()
+        address = lines[1].text + ', Tasmania, Australia'
+        description = lines[0].text
         info_url = lines[3].find('a')['href']
         try:
-            on_notice_to = parse(lines[2].get_text()).strftime('%Y-%m-%d')
+            on_notice_to = parse(lines[2].text).strftime('%Y-%m-%d')
         except ParserError:
             on_notice_to = ''
         record = {
58 changes: 32 additions & 26 deletions burnie.py
@@ -5,30 +5,36 @@
 from dateutil.parser import parse, ParserError
 import logging

-logging.basicConfig(level=logging.DEBUG)
+def councildas():
+    applications_url = 'https://www.burnie.net/Development/Planning/Permit-applications-on-exhibition'
+    html = scraperwiki.scrape(applications_url)
+    date_scraped = datetime.now().isoformat()
+    page = BeautifulSoup(html, 'html.parser')
+    das = page.find_all('article')
+    ondisplayprefix = len('On display until ') + 2
+    records = []
+    for da in das:
+        lines = da.text.splitlines()
+        council_reference = da.find('p', 'da-application-number').text
+        description = da('p')[-1].text
+        info_url = da.find('a')['href']
+        address = da.find('p', 'list-item-address').text + ', Tasmania, Australia'
+        on_notice_to = parse(da.find('p', 'display-until-date').text[ondisplayprefix:]).strftime('%Y-%m-%d')
+        record = {
+            'council_reference': council_reference,
+            'address': address,
+            'description': description,
+            'info_url': info_url,
+            'date_scraped': date_scraped,
+            'on_notice_to': on_notice_to
+        }
+        records = records + [record]
+    return records

-os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
-applications_url = 'https://www.burnie.net/Development/Planning/Permit-applications-on-exhibition'
-html = scraperwiki.scrape(applications_url)
-date_scraped = datetime.now().isoformat()
-page = BeautifulSoup(html, 'html.parser')
-das = page.find_all('article')
-ondisplayprefix = len('On display until ') + 2
-for da in das:
-    lines = da.get_text().splitlines()
-    council_reference = da.find('p', 'da-application-number').get_text()
-    description = da('p')[-1].get_text()
-    info_url = da.find('a')['href']
-    address = da.find('p', 'list-item-address').get_text() + ', Tasmania, Australia'
-    on_notice_to = parse(da.find('p', 'display-until-date').get_text()[ondisplayprefix:]).strftime('%Y-%m-%d')
-    record = {
-        'council_reference': council_reference,
-        'address': address,
-        'description': description,
-        'info_url': info_url,
-        'date_scraped': date_scraped,
-        'on_notice_to': on_notice_to
-    }
-    logging.debug(record)
-    scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name="data")
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
+    records = councildas()
+    for record in records:
+        logging.debug(record)
+        scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name='data')
57 changes: 33 additions & 24 deletions centralcoast.py
@@ -7,28 +7,37 @@

 logging.basicConfig(level=logging.DEBUG)

+def councildas():
+    applications_url = 'https://www.centralcoast.tas.gov.au/current-planning-applications'
+    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0'
+    html = scraperwiki.scrape(applications_url, user_agent=user_agent)
+    date_scraped = datetime.now().isoformat()
+    page = BeautifulSoup(html, 'html.parser')
+    das = page.find_all('div', 'listing-item')
+    expiryprefixlen = len('Notification expiry date - ')
+    records = []
+    for da in das:
+        description, expiryraw = da.find('span', 'excerpt').text.split(')')
+        on_notice_to = parse(expiryraw.split('[')[1][expiryprefixlen:-1]).strftime('%Y-%m-%d')
+        council_reference, dummy, address = da.find('a').text.split(' ', 2)
+        address = address + ', Tasmania, Australia'
+        info_url = da.find('a')['href']
+        record = {
+            'council_reference': council_reference,
+            'address': address,
+            'description': description,
+            'info_url': info_url,
+            'date_scraped': date_scraped,
+            'on_notice_to': on_notice_to
+        }
+        records = records + [record]
+    return records

-os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
-applications_url = 'https://www.centralcoast.tas.gov.au/current-planning-applications'
-user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0'
-html = scraperwiki.scrape(applications_url, user_agent=user_agent)
-date_scraped = datetime.now().isoformat()
-page = BeautifulSoup(html, 'html.parser')
-das = page.find_all('div', 'listing-item')
-expiryprefixlen = len('Notification expiry date - ')
-for da in das:
-    description, expiryraw = da.find('span', 'excerpt').get_text().split('\xa0')
-    on_notice_to = parse(expiryraw.split('[')[1][expiryprefixlen:-1]).strftime('%Y-%m-%d')
-    council_reference, dummy, address = da.find('a').get_text().split(' ', 2)
-    address = address + ', Tasmania, Australia'
-    info_url = da.find('a')['href']
-    record = {
-        'council_reference': council_reference,
-        'address': address,
-        'description': description,
-        'info_url': info_url,
-        'date_scraped': date_scraped,
-        'on_notice_to': on_notice_to
-    }
-    logging.debug(record)
-    scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name="data")
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
+    records = councildas()
+    for record in records:
+        logging.debug(record)
+        scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name='data')
67 changes: 37 additions & 30 deletions centralhighlands.py
@@ -5,35 +5,42 @@
 from dateutil.parser import parse, ParserError
 import logging

-logging.basicConfig(level=logging.DEBUG)
-
 # This all feels horribly fragile. Smallest council = most work

-os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
-applications_url = 'https://centralhighlands.tas.gov.au/development-applications/'
-html = scraperwiki.scrape(applications_url)
-date_scraped = datetime.now().isoformat()
-page = BeautifulSoup(html, 'html.parser')
-das = page.find('div', 'twelve columns')
-locations = das(text='Location:')
-for location in locations:
-    addresselement = location.parent.next_sibling
-    address = addresselement.rstrip().rstrip('/').strip() + ', Tasmania, Australia'
-    info_url = addresselement.previous_sibling.find_next('a')['href']
-    council_reference, description = location.find_next(text='Proposal:').parent.next_sibling.split('–', 2)
-    paras = location.parent.find_all_next('p')
-    if paras:
-        for para in paras:
-            if para.get_text().startswith('The relevant documents'):
-                on_notice_to = parse(' '.join(para.get_text().split()[-3:])).strftime('%Y-%m-%d')
-                break
-    record = {
-        'council_reference': council_reference.strip(),
-        'address': address,
-        'description': description.strip(),
-        'info_url': info_url,
-        'date_scraped': date_scraped,
-        'on_notice_to': on_notice_to
-    }
-    logging.debug(record)
-    scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name="data")
+def councildas():
+    applications_url = 'https://centralhighlands.tas.gov.au/development-applications/'
+    html = scraperwiki.scrape(applications_url)
+    date_scraped = datetime.now().isoformat()
+    page = BeautifulSoup(html, 'html.parser')
+    das = page.find('div', 'twelve columns')
+    locations = das(text='Location:')
+    records = []
+    for location in locations:
+        addresselement = location.parent.next_sibling
+        address = addresselement.rstrip().rstrip('/').strip() + ', Tasmania, Australia'
+        info_url = addresselement.previous_sibling.find_next('a')['href']
+        council_reference, description = location.find_next(text='Proposal:').parent.next_sibling.split('–', 2)
+        paras = location.parent.find_all_next('p')
+        if paras:
+            for para in paras:
+                if para.text.startswith('The relevant documents'):
+                    on_notice_to = parse(' '.join(para.text.split()[-3:])).strftime('%Y-%m-%d')
+                    break
+        record = {
+            'council_reference': council_reference.strip(),
+            'address': address,
+            'description': description.strip(),
+            'info_url': info_url,
+            'date_scraped': date_scraped,
+            'on_notice_to': on_notice_to
+        }
+        records = records + [record]
+    return records
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
+    records = councildas()
+    for record in records:
+        logging.debug(record)
+        scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name='data')
71 changes: 38 additions & 33 deletions circularhead.py
@@ -6,36 +6,41 @@

 # This is horribly fragile

-logging.basicConfig(level=logging.DEBUG)
-os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
-applications_url = 'https://www.circularhead.tas.gov.au/council-services/development/planning'
-html = scraperwiki.scrape(applications_url)
-date_scraped = datetime.now().isoformat()
-page = BeautifulSoup(html, 'html.parser')
-das = page.find(text='Current Planning Permit Applications').find_all_next('li')
-for da in das:
-    address = ''
-    description = ''
-    info_url = da.find('a')['href']
-    refaddressdesc = da.find('a').get_text()
-    # this is so horrible I can hardly make myself do it
-    bits = refaddressdesc.split('-')
-    if not bits[1] == '2021':
-        if bits[0].endswith('2021'):
-            bits = [bits[0][0:2], bits[0][2:]] + bits[1:]
-        else:
-            # impossible, skip this one
-            continue
-    council_reference = '-'.join(bits[0:3])
-    address = ' '.join(bits[3: 6]) + ', ' + bits[6] + ', Tasmania, Australia'
-    description = ' '.join(bits[7:-1] + [bits[-1].split('.')[0]])
-    logging.debug(bits)
-    record = {
-        'council_reference': council_reference,
-        'address': address,
-        'description': description,
-        'info_url': info_url,
-        'date_scraped': date_scraped,
-    }
-    logging.debug(record)
-    scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name="data")
+def councildas():
+    applications_url = 'https://www.circularhead.tas.gov.au/council-services/development/planning'
+    html = scraperwiki.scrape(applications_url)
+    date_scraped = datetime.now().isoformat()
+    page = BeautifulSoup(html, 'html.parser')
+    das = page.find(text='Current Planning Permit Applications').find_all_next('li')
+    records = []
+    for da in das:
+        info_url = da.find('a')['href']
+        refaddressdesc = da.find('a').text
+        # this is so horrible I can hardly make myself do it
+        bits = refaddressdesc.split('-')
+        if not bits[1] == '2021':
+            if bits[0].endswith('2021'):
+                bits = [bits[0][0:2], bits[0][2:]] + bits[1:]
+            else:
+                # impossible, skip this one
+                continue
+        council_reference = '-'.join(bits[0:3])
+        address = ' '.join(bits[3: 6]) + ', ' + bits[6] + ', Tasmania, Australia'
+        description = ' '.join(bits[7:-1] + [bits[-1].split('.')[0]])
+        record = {
+            'council_reference': council_reference,
+            'address': address,
+            'description': description,
+            'info_url': info_url,
+            'date_scraped': date_scraped,
+        }
+        records = records + [record]
+    return records
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
+    records = councildas()
+    for record in records:
+        logging.debug(record)
+        scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name='data')
55 changes: 31 additions & 24 deletions dorset.py
@@ -5,28 +5,35 @@
 from dateutil.parser import parse
 import logging

+def councildas():
+    applications_url = \
+        'https://eservices.dorset.tas.gov.au/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=12237'
+    html = scraperwiki.scrape(applications_url)
+    date_scraped = datetime.now().isoformat()
+    page = BeautifulSoup(html, 'html.parser')
+    das = page('h4', 'non_table_headers')
+    records = []
+    for da in das:
+        address = da.text
+        info_url = 'https://eservices.dorset.tas.gov.au' + da.find('a')['href']
+        council_reference = da.find_next('span', text='Application No.').next_sibling.text
+        description = da.find_next('span', text='Type of Work').next_sibling.text
+        date_received = parse(da.find_next('span', text='Date Lodged').next_sibling.text).strftime('%Y-%m-%d')
+        record = {
+            'council_reference': council_reference,
+            'address': address,
+            'description': description,
+            'info_url': info_url,
+            'date_scraped': date_scraped,
+            'date_received': date_received,
+        }
+        records = records + [record]
+    return records

-logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
-os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
-applications_url = \
-    'https://eservices.dorset.tas.gov.au/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=12237'
-html = scraperwiki.scrape(applications_url)
-date_scraped = datetime.now().isoformat()
-page = BeautifulSoup(html, 'html.parser')
-das = page('h4', 'non_table_headers')
-for da in das:
-    address = da.get_text()
-    info_url = 'https://eservices.dorset.tas.gov.au' + da.find('a')['href']
-    council_reference = da.find_next('span', text='Application No.').next_sibling.get_text()
-    description = da.find_next('span', text='Type of Work').next_sibling.get_text()
-    date_received = parse(da.find_next('span', text='Date Lodged').next_sibling.get_text()).strftime('%Y-%m-%d')
-    record = {
-        'council_reference': council_reference,
-        'address': address,
-        'description': description,
-        'info_url': info_url,
-        'date_scraped': date_scraped,
-        'date_received': date_received,
-    }
-    logging.debug(record)
-    scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name="data")
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
+    records = councildas()
+    for record in records:
+        logging.debug(record)
+        scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name='data')
