Create scraper.py
Creation of scraper. The SQL works when run remotely, but I'm not sure how to get it to work online.
Ekunga committed Jun 13, 2017
1 parent 316eeac commit 17abe9a
Showing 1 changed file with 135 additions and 21 deletions.
156 changes: 135 additions & 21 deletions scraper.py
@@ -1,24 +1,138 @@
#############
# Scraper to pull data from the Department of Environment, Land, Water and Planning
#
# https://www2.delwp.vic.gov.au/
#
# Register found at:
# https://lodgement.planning-permits.delwp.vic.gov.au/
#############

# This is a template for a Python scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# import scraperwiki
# import lxml.html
#
# # Read in a page
# html = scraperwiki.scrape("http://foo.com")
#
# # Find something on the page using css selectors
# root = lxml.html.fromstring(html)
# root.cssselect("div[align='left']")
#
# # Write out to the sqlite database using scraperwiki library
# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
#
# # An arbitrary query against the database
# scraperwiki.sql.select("* from data where 'name'='peter'")

# You don't have to do things with the ScraperWiki and lxml libraries.
# You can use whatever libraries you want: https://morph.io/documentation/python
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
##############
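
# A minimal sketch tying those snippets to that requirement (the record here
# is hypothetical; scraperwiki.sqlite.save creates data.sqlite and the "data"
# table automatically on the first call):
#
#   import scraperwiki
#   scraperwiki.sqlite.save(
#       unique_keys=['Application'],
#       data={'Application': 'PA1700001', 'Status': 'Lodged'})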

import scraperwiki
from lxml import html
import requests

sub_url = 'https://lodgement.planning-permits.delwp.vic.gov.au'
url = 'https://lodgement.planning-permits.delwp.vic.gov.au/search-register'

item_ref = []        # [column name, XPath] pairs describing what to scrape
item = []            # values scraped from the current page
debug = False        # set to True for verbose output
max_pages = 32       # number of search-register pages to walk through
column_names = []    # column names for the record being built
column_values = []   # matching values for the record being built
finished = False     # set once an already-saved record is found

item_ref.append(['Application','//span[@class="app-name"]/a/text()'])
item_ref.append(['Application_Link','//span[@class="app-name"]/a/@href'])
item_ref.append(['Created','//td[@data-label="Created"]/div/text()'])
item_ref.append(['Description','//td[@data-label="Description"]/div/text()'])
item_ref.append(['Properties','//td[@data-label="Properties"]/div/text()'])
item_ref.append(['Status','//td[@data-label="Status"]/div/text()'])
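
# A quick way to sanity-check one of these XPaths by hand (a throwaway
# snippet, not part of the scraper; index 0 is just an example):
#
#   page = requests.get(url)
#   tree = html.fromstring(page.content)
#   print(tree.xpath(item_ref[0][1]))   # application numbers on the first page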

# Make sure the "data" table exists before it is queried below; a fresh
# database (e.g. a first run on morph.io) will not have it yet.
scraperwiki.sql.execute("create table if not exists data "
                        "(Application, Application_Link, Created, Description, Properties, Status)")

if debug: print(item_ref)
if debug: print(len(item_ref))

page_scrape = 1
while page_scrape <= max_pages:
    url = ''.join(['https://lodgement.planning-permits.delwp.vic.gov.au/search-register?page=', str(page_scrape)])
    item = []
    if debug: print(url)
    page = requests.get(url)
    page_details = html.fromstring(page.content)

    # Scrape data: collect each column's values for the whole page
    item_to_scrape = 0
    while item_to_scrape < len(item_ref):
        item.append([item_ref[item_to_scrape][0], page_details.xpath(item_ref[item_to_scrape][1])])
        if debug: print('Item Name:', item[item_to_scrape][0])
        if debug: print('Item:', item[item_to_scrape][1][0])
        item_to_scrape = item_to_scrape + 1

    if debug: print('Number of Records:', len(item[0][1]))

    item_to_scrape = 0
    item_to_display = 0

    # Loop through each record on the page and collect its column values
    while item_to_display < len(item[0][1]):

        if debug: print('Next Record')
        column_names = []
        column_values = []
        while item_to_scrape < len(item_ref):

            value = item[item_to_scrape][1][item_to_display]
            value = ' '.join(value.splitlines())

            column_names.append(item[item_to_scrape][0])

            # Check whether this Application is already in the database; if it
            # is, assume everything older was scraped on a previous run and
            # stop entirely by pushing every loop counter past its limit.
            if item[item_to_scrape][0] == 'Application':
                existing = scraperwiki.sql.select(
                    "Application from data where Application = '" + str(value).replace("'", "''") + "'")
                if existing:
                    print('All Complete')
                    finished = True
                    item_to_scrape = len(item_ref)
                    item_to_display = len(item[0][1]) + 1
                    page_scrape = max_pages + 1

            if not finished:
                if item[item_to_scrape][0] == 'Application_Link':
                    fixed_value = ''.join([sub_url, value.strip()])
                else:
                    fixed_value = value.strip()
                fixed_value = fixed_value.replace("'", "''")    # escape quotes for the SQL below
                fixed_value = fixed_value.replace("’", "''")
                fixed_value = fixed_value.replace("é", "e")     # normalise accented characters
                column_values.append(fixed_value)

            item_to_scrape = item_to_scrape + 1


        # Insert the collected record into the "data" table
        if not finished:
            sql_part_1 = "Insert into data ("
            sql_part_2 = ','.join(column_names)
            sql_part_3 = ") Values ('"
            sql_part_4 = "','".join(column_values)
            sql_part_5 = "')"
            sql = ''.join([sql_part_1, sql_part_2, sql_part_3, sql_part_4, sql_part_5])
            scraperwiki.sql.execute(sql)

        item_to_scrape = 0
        item_to_display = item_to_display + 1

    page_scrape = page_scrape + 1
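
# Note: instead of building the INSERT by hand, scraperwiki.sqlite.save (used
# in the template snippet above) creates the table automatically and upserts
# on a key, which is likely the simplest way to get the SQL working on
# morph.io. A sketch of what the insert step above could use instead:
#
#   record = dict(zip(column_names, column_values))
#   scraperwiki.sqlite.save(unique_keys=['Application'], data=record)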
