Permalink
Browse files

Download attachments and save them in ‘attachments’

  • Loading branch information...
1 parent 0057cb4 commit 3cc0df4f9ed668bc272ac21da6fc9bda0e1afd0f @ahx ahx committed May 31, 2014
Showing with 36 additions and 3 deletions.
  1. +1 −0 Gemfile
  2. +6 −0 Gemfile.lock
  3. +2 −0 application.rb
  4. +27 −3 resolution.rb
View
@@ -1,6 +1,7 @@
source "https://rubygems.org"
gem 'addressable'
+gem 'typhoeus'
gem 'pupa'
gem 'nokogiri'
gem 'pry'
View
@@ -12,10 +12,13 @@ GEM
coderay (1.1.0)
colored (1.2)
connection_pool (2.0.0)
+ ethon (0.7.0)
+ ffi (>= 1.3.0)
faraday (0.9.0)
multipart-post (>= 1.2, < 3)
faraday_middleware (0.9.1)
faraday (>= 0.7.4, < 0.10)
+ ffi (1.9.3)
i18n (0.6.9)
json-schema (2.1.9)
libv8 (3.16.14.3)
@@ -62,6 +65,8 @@ GEM
treetop (1.4.15)
polyglot
polyglot (>= 0.3.1)
+ typhoeus (0.6.8)
+ ethon (>= 0.7.0)
tzinfo (0.3.39)
PLATFORMS
@@ -73,3 +78,4 @@ DEPENDENCIES
pry
pupa
therubyracer
+ typhoeus
View
@@ -5,6 +5,8 @@
require 'bundler/setup'
require 'pupa'
+require 'typhoeus'
+require 'typhoeus/adapters/faraday'
require 'nokogiri'
# Use Addressable::URI to handle URIs with umlauts
View
@@ -31,17 +31,41 @@ def scrape_objects
resolution.anlagen_text = doc.css('table:contains("Download") ~ table:first td:first').text
script = doc.css('table:contains("Download") ~ table:first script').text
- if pdf_urls = extract_js_array(:URL, script)
- resolution.anlagen_urls = pdf_urls
- end
+
+ pdf_urls = extract_js_array(:URL, script).map! { |path| build_url(path.strip!) if path }
+ resolution.anlagen_urls = pdf_urls if pdf_urls.present?
dispatch(resolution)
+
+ download_attachments! resolution.anlagen_urls
end
+ end
# Download the attachments to the filesystem and cache them forever.
#
# Every URL is requested through the shared attachment downloader client
# inside a single in_parallel block; the responses themselves are not used
# here. Faraday client errors are logged instead of being raised, so a failed
# download does not abort the caller.
#
# @param urls [Array<String>, nil] attachment URLs; nothing is done when blank
# @return [Object] not meaningful to callers
def download_attachments!(urls)
  return if urls.blank?

  # Send HTTP requests in parallel. See Pupa's README to learn more.
  attachment_downloader.in_parallel(attachment_download_manager) do
    urls.each { |url| attachment_downloader.get(url) }
  end
rescue Faraday::Error::ClientError => e
  # e.response can be nil (e.g. no HTTP response was received), in which case
  # logging it alone would only print "nil"; include the exception itself.
  error("#{e.class}: #{e.message} (response: #{e.response.inspect})")
end
private
# Returns the Typhoeus::Hydra passed to in_parallel when downloading
# attachments. The instance is created on first use, with max_concurrency
# set to 20, and reused on every later call.
def attachment_download_manager
  return @attachment_download_manager if @attachment_download_manager

  @attachment_download_manager = Typhoeus::Hydra.new(max_concurrency: 20)
end
+
# Returns the HTTP client used to fetch attachments, building it on first use
# and reusing it afterwards. Its cache directory is 'attachments' resolved
# against the current working directory.
# NOTE(review): expires_in: nil presumably means cached responses never
# expire -- confirm against Pupa::Processor::Client.
def attachment_downloader
  return @attachment_downloader if @attachment_downloader

  cache_path = File.expand_path('attachments', Dir.pwd)
  @attachment_downloader = Pupa::Processor::Client.new(cache_dir: cache_path, expires_in: nil)
end
+
require 'v8'
def extract_js_array(name, js_source)
context_shim = "document = { write: function() {} };"

0 comments on commit 3cc0df4

Please sign in to comment.