Initial commit of Newcastle Development Application scraper
nik3daz committed Jun 17, 2015
1 parent ad2f786 commit b3381c4
Showing 4 changed files with 107 additions and 26 deletions.
1 change: 1 addition & 0 deletions Gemfile
@@ -8,3 +8,4 @@ ruby "2.0.0"

gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "mechanize"
gem "pdftohtmlr"
6 changes: 6 additions & 0 deletions Gemfile.lock
@@ -31,6 +31,8 @@ GEM
    nokogiri (1.6.6.2)
      mini_portile (~> 0.6.0)
    ntlm-http (0.1.1)
    pdftohtmlr (0.4.2)
      nokogiri (>= 1.3.3)
    sqlite3 (1.3.10)
    sqlite_magic (0.0.3)
      sqlite3
@@ -44,4 +46,8 @@ PLATFORMS

DEPENDENCIES
  mechanize
  pdftohtmlr
  scraperwiki!

BUNDLED WITH
   1.10.3
4 changes: 3 additions & 1 deletion README.md
@@ -1 +1,3 @@
This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation)
This is a scraper that runs on [Morph](https://morph.io) and scrapes Newcastle development applications.

The council publishes the applications as PDF documents, so this involves PDF scraping.
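
For context, the PDF handling in scraper.rb below boils down to this pdftohtmlr flow (a minimal sketch; the URL is a placeholder):

    require 'pdftohtmlr'
    require 'nokogiri'
    include PDFToHTMLR

    # Download the PDF, convert it to HTML, then flatten it to plain text.
    html = PdfFileUrl.new('http://example.com/exhibition.pdf').convert
    text = Nokogiri::HTML(html).at('body').inner_text
    puts text
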
122 changes: 97 additions & 25 deletions scraper.rb
@@ -1,25 +1,97 @@
# This is a template for a Ruby scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# require 'scraperwiki'
# require 'mechanize'
#
# agent = Mechanize.new
#
# # Read in a page
# page = agent.get("http://foo.com")
#
# # Find something on the page using CSS selectors
# p page.at('div.content')
#
# # Write out to the sqlite database using scraperwiki library
# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
#
# # An arbitrary query against the database
# ScraperWiki.select("* from data where 'name'='peter'")

# You don't have to do things with the Mechanize or ScraperWiki libraries.
# You can use whatever gems you want: https://morph.io/documentation/ruby
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
require 'scraperwiki'
require 'mechanize'
require 'open-uri'
require 'pdftohtmlr'
require 'nokogiri'
require 'cgi'
require 'date'

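# PDFToHTMLR provides PdfFileUrl, which downloads a PDF and converts it to
# HTML by shelling out to the pdftohtml command-line tool.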
include PDFToHTMLR

comment_url = 'mailto:mail@ncc.nsw.gov.au?subject='
starting_url = 'http://www.newcastle.nsw.gov.au/building_and_planning/da_assessment/current_das/current_das'

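# Save one development application record to data.sqlite, keyed on
# council_reference. `date` is the exhibition date string taken from the PDF URL.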
def commit(pdf_url, reference, address, description, comment_url, date)
  # Nothing to save until the first reference has been parsed.
  return unless reference
  record = {
    'info_url' => pdf_url,
    'comment_url' => comment_url + CGI::escape("Development Application Enquiry: " + reference),
    'council_reference' => reference,
    'date_received' => Date.strptime(date, '%d_%B_%Y').to_s,
    'address' => address + ", NSW",
    'description' => description,
    'date_scraped' => Date.today.to_s
  }
  # The rescue covers the very first run, before the data table exists.
  if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true)
    ScraperWiki.save_sqlite(['council_reference'], record)
    puts "Saving " + reference
  else
    puts "Skipping already saved record " + reference
  end
end

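# Fetch one exhibition PDF, convert it to HTML, and walk the text line by
# line, committing a record each time a new application reference is found.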
def scrape_pdf(pdf_url, comment_url)
  puts "Scraping " + pdf_url
  # Parse the exhibition date out of the URL, e.g. "17_June_2015".
  /(?<date>\d+_\w+_\d{4})/ =~ pdf_url

  # Download the PDF and convert it to HTML.
  doc = Nokogiri::HTML(PdfFileUrl.new(pdf_url).convert)

  # Re-encode to strip invalid byte sequences.
  content = doc.at('body').inner_text.encode("UTF-16be", :invalid => :replace, :replace => "?").encode('UTF-8')

  # Each page of the PDF runs from the "Exhibition Period" heading to the
  # "Newcastle City Council" footer.
  pages = content.scan(/Exhibition.*Period.*?Newcastle.*City.*Council/m)
  pages.each do |data|
    # Split into lines.
    page = data.split("\n")
    reference = address = description = nil
    # The data is laid out like this:
    #   reference
    #   address
    #   suburb
    #   description
    #   cost
    #   exhibition period
    #   (optionally repeated) more description
    ref_regexp = /\d{2}\/\d{4}/  # references look like e.g. 15/0123
    i = 1
    while i < page.size - 2 do
      line = page[i]
      if line =~ ref_regexp
        # A new reference starts a new record, so flush the previous one.
        commit(pdf_url, reference, address, description, comment_url, date)
        reference = line
        address = page[i + 1] + " " + page[i + 2]
        description = page[i + 3]
        i += 4
        while i < page.size - 2 and !(page[i] =~ ref_regexp) do
          # Skip over the cost and the exhibition dates.
          if !(page[i].strip =~ /^\$[\d,]+$/ or page[i].strip =~ /\d+.*to.*\d+.*\d{4}/)
            description += " " + page[i]
          end
          i += 1
        end
        # Step back so the outer loop re-examines the line that stopped us.
        i -= 1
      end
      i += 1
    end
    # Flush the last record on the page.
    commit(pdf_url, reference, address, description, comment_url, date)
  end
end

agent = Mechanize.new

# The current DAs page is served as a feed; each <item> <link> points at a PDF.
doc = agent.get(starting_url)
doc.search('item link').each do |link|
  begin
    scrape_pdf(link.inner_text, comment_url)
  rescue => ex
    # Keep going if a single PDF fails to download or parse.
    puts link.inner_text + " failed"
    puts ex.message
  end
end
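
As a quick sanity check after a run, something like the following (a sketch; it assumes at least one record has been saved, so the data table exists) prints the first few stored records:

    require 'scraperwiki'

    ScraperWiki.select("* from data limit 5").each do |row|
      puts "#{row['council_reference']}: #{row['address']} - #{row['description']}"
    end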
