Commit

All scrapers can be run from scraper.py. Various fixes and improvements.
43South committed Sep 30, 2021
1 parent f3ca635 commit c79860b
Showing 23 changed files with 687 additions and 515 deletions.
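
The commit message says the scrapers can all be run from scraper.py, and each council module in this diff now exposes a councildas() function that returns a list of record dicts, with saving moved into an `if __name__ == '__main__':` block. scraper.py itself is not among the files shown here, so the following is only a hedged sketch of what such an orchestrator might look like, assuming the seven modules visible in this diff and the councildas()/scraperwiki.sqlite.save() conventions they use; the module list, the loop, and the exception handling are illustrative assumptions, not the author's code.

    # Hypothetical scraper.py sketch (not part of the files shown in this commit):
    # call each council module's councildas() and save the combined records.
    import logging
    import os

    import scraperwiki

    import breakoday
    import brighton
    import burnie
    import centralcoast
    import centralhighlands
    import circularhead
    import dorset

    COUNCILS = [breakoday, brighton, burnie, centralcoast,
                centralhighlands, circularhead, dorset]

    if __name__ == '__main__':
        logging.basicConfig(level=logging.DEBUG)
        os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
        for council in COUNCILS:
            try:
                records = council.councildas()
            except Exception:
                # assumption: one broken council page should not stop the rest
                logging.exception('scrape failed for %s', council.__name__)
                continue
            for record in records:
                logging.debug(record)
                scraperwiki.sqlite.save(unique_keys=['council_reference'],
                                        data=record, table_name='data')

Only the councildas() interface and the scraperwiki.sqlite.save() call are taken from the diffs below; everything else in the sketch is an assumption.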
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
+.idea/
+scraperwiki.sqlite
+__pycache__
8 changes: 4 additions & 4 deletions breakoday.py
@@ -14,11 +14,11 @@ def councildas():
     records = []
     for da in das:
         lines = da('td')
-        council_reference = lines[3].get_text().strip()
-        address = lines[1].get_text() + ', Tasmania, Australia'
-        description = lines[0].get_text()
+        council_reference = lines[3].text.strip()
+        address = lines[1].text + ', Tasmania, Australia'
+        description = lines[0].text
         info_url = lines[3].find('a')['href']
-        on_notice_to = datetime.strptime(lines[2].get_text(), '%d %B %Y').strftime('%Y-%m-%d')
+        on_notice_to = datetime.strptime(lines[2].text, '%d %B %Y').strftime('%Y-%m-%d')
         record = {
             'council_reference': council_reference,
             'address': address,
9 changes: 4 additions & 5 deletions brighton.py
@@ -10,17 +10,16 @@ def councildas():
     html = scraperwiki.scrape(applications_url)
     date_scraped = datetime.now().isoformat()
     page = BeautifulSoup(html, 'html.parser')
-
     das = page.find('table')('tr')[1:]
     records = []
     for da in das:
         lines = da('td')
-        council_reference = lines[3].get_text().strip()
-        address = lines[1].get_text() + ', Tasmania, Australia'
-        description = lines[0].get_text()
+        council_reference = lines[3].text.strip()
+        address = lines[1].text + ', Tasmania, Australia'
+        description = lines[0].text
         info_url = lines[3].find('a')['href']
         try:
-            on_notice_to = parse(lines[2].get_text()).strftime('%Y-%m-%d')
+            on_notice_to = parse(lines[2].text).strftime('%Y-%m-%d')
         except ParserError:
             on_notice_to = ''
         record = {
58 changes: 32 additions & 26 deletions burnie.py
@@ -5,30 +5,36 @@
 from dateutil.parser import parse, ParserError
 import logging

-logging.basicConfig(level=logging.DEBUG)
+def councildas():
+    applications_url = 'https://www.burnie.net/Development/Planning/Permit-applications-on-exhibition'
+    html = scraperwiki.scrape(applications_url)
+    date_scraped = datetime.now().isoformat()
+    page = BeautifulSoup(html, 'html.parser')
+    das = page.find_all('article')
+    ondisplayprefix = len('On display until ') + 2
+    records = []
+    for da in das:
+        lines = da.text.splitlines()
+        council_reference = da.find('p', 'da-application-number').text
+        description = da('p')[-1].text
+        info_url = da.find('a')['href']
+        address = da.find('p', 'list-item-address').text + ', Tasmania, Australia'
+        on_notice_to = parse(da.find('p', 'display-until-date').text[ondisplayprefix:]).strftime('%Y-%m-%d')
+        record = {
+            'council_reference': council_reference,
+            'address': address,
+            'description': description,
+            'info_url': info_url,
+            'date_scraped': date_scraped,
+            'on_notice_to': on_notice_to
+        }
+        records = records + [record]
+    return records

-os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
-applications_url = 'https://www.burnie.net/Development/Planning/Permit-applications-on-exhibition'
-html = scraperwiki.scrape(applications_url)
-date_scraped = datetime.now().isoformat()
-page = BeautifulSoup(html, 'html.parser')
-das = page.find_all('article')
-ondisplayprefix = len('On display until ') + 2
-for da in das:
-    lines = da.get_text().splitlines()
-    council_reference = da.find('p', 'da-application-number').get_text()
-    description = da('p')[-1].get_text()
-    info_url = da.find('a')['href']
-    address = da.find('p', 'list-item-address').get_text() + ', Tasmania, Australia'
-    on_notice_to = parse(da.find('p', 'display-until-date').get_text()[ondisplayprefix:]).strftime('%Y-%m-%d')
-    record = {
-        'council_reference': council_reference,
-        'address': address,
-        'description': description,
-        'info_url': info_url,
-        'date_scraped': date_scraped,
-        'on_notice_to': on_notice_to
-    }
-    logging.debug(record)
-    scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name="data")
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
+    records = councildas()
+    for record in records:
+        logging.debug(record)
+        scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name='data')
57 changes: 33 additions & 24 deletions centralcoast.py
@@ -7,28 +7,37 @@

 logging.basicConfig(level=logging.DEBUG)

+def councildas():
+    applications_url = 'https://www.centralcoast.tas.gov.au/current-planning-applications'
+    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0'
+    html = scraperwiki.scrape(applications_url, user_agent=user_agent)
+    date_scraped = datetime.now().isoformat()
+    page = BeautifulSoup(html, 'html.parser')
+    das = page.find_all('div', 'listing-item')
+    expiryprefixlen = len('Notification expiry date - ')
+    records = []
+    for da in das:
+        description, expiryraw = da.find('span', 'excerpt').text.split(')')
+        on_notice_to = parse(expiryraw.split('[')[1][expiryprefixlen:-1]).strftime('%Y-%m-%d')
+        council_reference, dummy, address = da.find('a').text.split(' ', 2)
+        address = address + ', Tasmania, Australia'
+        info_url = da.find('a')['href']
+        record = {
+            'council_reference': council_reference,
+            'address': address,
+            'description': description,
+            'info_url': info_url,
+            'date_scraped': date_scraped,
+            'on_notice_to': on_notice_to
+        }
+        records = records + [record]
+    return records

-os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
-applications_url = 'https://www.centralcoast.tas.gov.au/current-planning-applications'
-user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0'
-html = scraperwiki.scrape(applications_url, user_agent=user_agent)
-date_scraped = datetime.now().isoformat()
-page = BeautifulSoup(html, 'html.parser')
-das = page.find_all('div', 'listing-item')
-expiryprefixlen = len('Notification expiry date - ')
-for da in das:
-    description, expiryraw = da.find('span', 'excerpt').get_text().split('\xa0')
-    on_notice_to = parse(expiryraw.split('[')[1][expiryprefixlen:-1]).strftime('%Y-%m-%d')
-    council_reference, dummy, address = da.find('a').get_text().split(' ', 2)
-    address = address + ', Tasmania, Australia'
-    info_url = da.find('a')['href']
-    record = {
-        'council_reference': council_reference,
-        'address': address,
-        'description': description,
-        'info_url': info_url,
-        'date_scraped': date_scraped,
-        'on_notice_to': on_notice_to
-    }
-    logging.debug(record)
-    scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name="data")
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
+    records = councildas()
+    for record in records:
+        logging.debug(record)
+        scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name='data')
67 changes: 37 additions & 30 deletions centralhighlands.py
@@ -5,35 +5,42 @@
 from dateutil.parser import parse, ParserError
 import logging

-logging.basicConfig(level=logging.DEBUG)
-
 # This all feels horribly fragile. Smallest council = most work

-os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
-applications_url = 'https://centralhighlands.tas.gov.au/development-applications/'
-html = scraperwiki.scrape(applications_url)
-date_scraped = datetime.now().isoformat()
-page = BeautifulSoup(html, 'html.parser')
-das = page.find('div', 'twelve columns')
-locations = das(text='Location:')
-for location in locations:
-    addresselement = location.parent.next_sibling
-    address = addresselement.rstrip().rstrip('/').strip() + ', Tasmania, Australia'
-    info_url = addresselement.previous_sibling.find_next('a')['href']
-    council_reference, description = location.find_next(text='Proposal:').parent.next_sibling.split('–', 2)
-    paras = location.parent.find_all_next('p')
-    if paras:
-        for para in paras:
-            if para.get_text().startswith('The relevant documents'):
-                on_notice_to = parse(' '.join(para.get_text().split()[-3:])).strftime('%Y-%m-%d')
-                break
-    record = {
-        'council_reference': council_reference.strip(),
-        'address': address,
-        'description': description.strip(),
-        'info_url': info_url,
-        'date_scraped': date_scraped,
-        'on_notice_to': on_notice_to
-    }
-    logging.debug(record)
-    scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name="data")
+def councildas():
+    applications_url = 'https://centralhighlands.tas.gov.au/development-applications/'
+    html = scraperwiki.scrape(applications_url)
+    date_scraped = datetime.now().isoformat()
+    page = BeautifulSoup(html, 'html.parser')
+    das = page.find('div', 'twelve columns')
+    locations = das(text='Location:')
+    records = []
+    for location in locations:
+        addresselement = location.parent.next_sibling
+        address = addresselement.rstrip().rstrip('/').strip() + ', Tasmania, Australia'
+        info_url = addresselement.previous_sibling.find_next('a')['href']
+        council_reference, description = location.find_next(text='Proposal:').parent.next_sibling.split('–', 2)
+        paras = location.parent.find_all_next('p')
+        if paras:
+            for para in paras:
+                if para.text.startswith('The relevant documents'):
+                    on_notice_to = parse(' '.join(para.text.split()[-3:])).strftime('%Y-%m-%d')
+                    break
+        record = {
+            'council_reference': council_reference.strip(),
+            'address': address,
+            'description': description.strip(),
+            'info_url': info_url,
+            'date_scraped': date_scraped,
+            'on_notice_to': on_notice_to
+        }
+        records = records + [record]
+    return records
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
+    records = councildas()
+    for record in records:
+        logging.debug(record)
+        scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name='data')
71 changes: 38 additions & 33 deletions circularhead.py
@@ -6,36 +6,41 @@

 # This is horribly fragile

-logging.basicConfig(level=logging.DEBUG)
-os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
-applications_url = 'https://www.circularhead.tas.gov.au/council-services/development/planning'
-html = scraperwiki.scrape(applications_url)
-date_scraped = datetime.now().isoformat()
-page = BeautifulSoup(html, 'html.parser')
-das = page.find(text='Current Planning Permit Applications').find_all_next('li')
-for da in das:
-    address = ''
-    description = ''
-    info_url = da.find('a')['href']
-    refaddressdesc = da.find('a').get_text()
-    # this is so horrible I can hardly make myself do it
-    bits = refaddressdesc.split('-')
-    if not bits[1] == '2021':
-        if bits[0].endswith('2021'):
-            bits = [bits[0][0:2], bits[0][2:]] + bits[1:]
-        else:
-            # impossible, skip this one
-            continue
-    council_reference = '-'.join(bits[0:3])
-    address = ' '.join(bits[3: 6]) + ', ' + bits[6] + ', Tasmania, Australia'
-    description = ' '.join(bits[7:-1] + [bits[-1].split('.')[0]])
-    logging.debug(bits)
-    record = {
-        'council_reference': council_reference,
-        'address': address,
-        'description': description,
-        'info_url': info_url,
-        'date_scraped': date_scraped,
-    }
-    logging.debug(record)
-    scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name="data")
+def councildas():
+    applications_url = 'https://www.circularhead.tas.gov.au/council-services/development/planning'
+    html = scraperwiki.scrape(applications_url)
+    date_scraped = datetime.now().isoformat()
+    page = BeautifulSoup(html, 'html.parser')
+    das = page.find(text='Current Planning Permit Applications').find_all_next('li')
+    records = []
+    for da in das:
+        info_url = da.find('a')['href']
+        refaddressdesc = da.find('a').text
+        # this is so horrible I can hardly make myself do it
+        bits = refaddressdesc.split('-')
+        if not bits[1] == '2021':
+            if bits[0].endswith('2021'):
+                bits = [bits[0][0:2], bits[0][2:]] + bits[1:]
+            else:
+                # impossible, skip this one
+                continue
+        council_reference = '-'.join(bits[0:3])
+        address = ' '.join(bits[3: 6]) + ', ' + bits[6] + ', Tasmania, Australia'
+        description = ' '.join(bits[7:-1] + [bits[-1].split('.')[0]])
+        record = {
+            'council_reference': council_reference,
+            'address': address,
+            'description': description,
+            'info_url': info_url,
+            'date_scraped': date_scraped,
+        }
+        records = records + [record]
+    return records
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
+    records = councildas()
+    for record in records:
+        logging.debug(record)
+        scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name='data')
55 changes: 31 additions & 24 deletions dorset.py
@@ -5,28 +5,35 @@
 from dateutil.parser import parse
 import logging

+def councildas():
+    applications_url = \
+        'https://eservices.dorset.tas.gov.au/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=12237'
+    html = scraperwiki.scrape(applications_url)
+    date_scraped = datetime.now().isoformat()
+    page = BeautifulSoup(html, 'html.parser')
+    das = page('h4', 'non_table_headers')
+    records = []
+    for da in das:
+        address = da.text
+        info_url = 'https://eservices.dorset.tas.gov.au' + da.find('a')['href']
+        council_reference = da.find_next('span', text='Application No.').next_sibling.text
+        description = da.find_next('span', text='Type of Work').next_sibling.text
+        date_received = parse(da.find_next('span', text='Date Lodged').next_sibling.text).strftime('%Y-%m-%d')
+        record = {
+            'council_reference': council_reference,
+            'address': address,
+            'description': description,
+            'info_url': info_url,
+            'date_scraped': date_scraped,
+            'date_received': date_received,
+        }
+        records = records + [record]
+    return records

-logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
-os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
-applications_url = \
-    'https://eservices.dorset.tas.gov.au/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=12237'
-html = scraperwiki.scrape(applications_url)
-date_scraped = datetime.now().isoformat()
-page = BeautifulSoup(html, 'html.parser')
-das = page('h4', 'non_table_headers')
-for da in das:
-    address = da.get_text()
-    info_url = 'https://eservices.dorset.tas.gov.au' + da.find('a')['href']
-    council_reference = da.find_next('span', text='Application No.').next_sibling.get_text()
-    description = da.find_next('span', text='Type of Work').next_sibling.get_text()
-    date_received = parse(da.find_next('span', text='Date Lodged').next_sibling.get_text()).strftime('%Y-%m-%d')
-    record = {
-        'council_reference': council_reference,
-        'address': address,
-        'description': description,
-        'info_url': info_url,
-        'date_scraped': date_scraped,
-        'date_received': date_received,
-    }
-    logging.debug(record)
-    scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name="data")
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    os.environ["SCRAPERWIKI_DATABASE_NAME"] = "sqlite:///data.sqlite"
+    records = councildas()
+    for record in records:
+        logging.debug(record)
+        scraperwiki.sqlite.save(unique_keys=['council_reference'], data=record, table_name='data')
