require 'addressable/uri'
require 'digest/md5'
require 'net/https'
require 'thread'
require 'timeout'
require 'yaml'
require 'zlib'
require 'planet/log'
module Planet
# map a URI to a readable and (relatively) unique filename
# Steps: normalize the URI, strip the leading scheme (plus an optional
# "word:" or "www." prefix), turn runs of ?, /, : and | into commas, and trim
# leading/trailing commas and dots.  Names longer than 250 characters keep
# their leading comma-separated components (at most 220 characters worth) and
# have the remaining components replaced by an MD5 hex digest.
#
# uri:: the URI (a String) to derive a filename from; it is never modified
#
# Returns the filename as a String.
def Planet.filename uri
  # Build the name with non-mutating calls only: when normalization fails,
  # uri_norm hands back the caller's own string, and in-place edits
  # (sub!/gsub!) would then rewrite the caller's URI -- or raise if that
  # string is frozen.
  name = uri_norm(uri)
  name = name.sub(/^\w+:\/*(\w+:|www\.)?/, '') # remove scheme and www.
  name = name.gsub(/[?\/:|]+/, ',')            # replace separator characters
  name = name.sub(/^[,.]*/, '')                # remove initial junk
  name = name.sub(/[,.]*$/, '')                # remove final junk

  if name.length > 250
    # Pop trailing components until what is left fits in 220 characters, then
    # append a digest of the removed components (in the order they were
    # popped, i.e. last component first) so distinct long URIs stay distinct.
    parts, excess = name.split(','), []
    excess << parts.pop while parts.join(',').length > 220
    parts << Digest::MD5.hexdigest(excess.join(','))
    name = parts.join(',')
  end

  name
end
# Fetches URIs over HTTP(S), keeping YAML-serialized copies of selected
# responses in a cache directory so that later fetches can issue conditional
# requests and honor previously seen permanent redirects / gone responses.
class Fido
  # Status codes whose Location header fetch follows (GET only).
  REDIRECT_CODES = %w[301 302 303 307 308].freeze

  # Status codes that write_to_cache persists.
  CACHEABLE_CODES = %w[200 301 410].freeze

  attr_accessor :cache, :redirect_limit, :threads, :timeout

  # cache:: directory in which cache files are read and written
  def initialize cache
    @cache = cache
    @timeout = 30          # passed to Timeout::timeout around each request
    @threads = 6           # number of worker threads started by each
    @redirect_limit = 10   # redirect budget that each hands to fetch
  end

  # invoke fetch on a list of uris in parallel
  #
  # Starts @threads worker threads which take uris from the end of a private
  # copy of the list.  For every uri the block is called with the original
  # uri and the response, after which the response is offered to
  # write_to_cache.  Returns the array of (joined) worker threads.
  def each(uris)
    lock = Mutex.new
    queue = uris.clone

    workers = Array.new(@threads) do
      Thread.new do
        # a nil (or false) entry ends this worker, as does an empty queue
        while uri = lock.synchronize { queue.pop }
          begin
            response = fetch(Planet::uri_norm(uri), redirect_limit)
            yield uri, response
            write_to_cache uri, response
          rescue Exception => e
            # Rescues Exception (not just StandardError): whatever goes wrong
            # for one uri is logged and the worker moves on to the next one.
            Planet.log.error e.inspect
            Planet.log.error uri
            e.backtrace.each { |line| Planet.log.error line }
          end
        end
      end
    end

    # wait for each to complete
    workers.each { |thread| thread.join }
  end

  # fetch a uri, processing up to redirect_limit number of redirects
  #
  # uri::            the URI to fetch, as a String
  # redirect_limit:: how many more redirects (cached or live) may be followed
  #
  # Returns a Net::HTTPResponse (possibly one loaded from the cache, or a
  # synthesized 304/408/500).  Timeout::Error raised outside the request
  # itself is re-raised; any other exception is logged and converted into a
  # synthesized 500 response.
  def fetch uri, redirect_limit=10
    cache = load_cache_entry(cache_path(uri))

    # handle permanent redirects and gone
    if cache.respond_to? :code
      return cache if cache.code == '410'
      if cache.code == '301' && redirect_limit > 0
        location = cache['location']
        if location
          return fetch(Planet::uri_norm(uri, location), redirect_limit - 1)
        end
      end
    end

    # issue the request, handling timeout, ssl, etc.
    uri = URI.parse(uri)
    response = issue_request(uri, cache)

    # expand gzip and deflated responses
    expand_body response

    # Not all servers handle conditional GETs.  Nothing can be done about the
    # wasted bandwidth, but if the response body is identical to the cached
    # one, the downstream processing (parsing, caching, ...) can be avoided.
    if response.code == '200' && cache.respond_to?(:body)
      if response.body == cache.body
        response = Net::HTTPNotModified.new('1.0', '304', 'Not Modified')
      end
    end

    # handle redirects
    if REDIRECT_CODES.include?(response.code) && redirect_limit > 0
      location = response['location']
      if location
        return fetch(Planet::uri_norm(uri.to_s, location), redirect_limit - 1)
      end
    end

    # log the response and save the actual content location used
    level = (response.code < '400' ? :info : :warn)
    Planet.log.send level, "#{response.code} #{uri}"
    response['Content-Location'] ||= uri.to_s
    response
  rescue Timeout::Error
    raise
  rescue Exception => e
    # Rescues Exception (not just StandardError): every failure other than a
    # timeout is reported to the caller as a 500 response.
    response = Net::HTTPInternalServerError.new('1.0', '500', e.to_s)
    response['Content-Location'] ||= uri.to_s
    Planet.log.error "#{response.code} #{uri}"
    Planet.log.error e.inspect
    e.backtrace.each { |line| Planet.log.error line }
    response
  end

  # update cache with successful and permanent responses
  #
  # Returns the number of bytes written, or nil when the response code is not
  # one of CACHEABLE_CODES.  Errors from serialization or file I/O propagate.
  def write_to_cache uri, response
    return unless CACHEABLE_CODES.include? response.code

    # Serialize before opening the file: mode 'w' truncates, so a to_yaml
    # failure after the open would wipe out the previous (good) entry and
    # leave an empty cache file behind.
    data = response.to_yaml
    File.open(cache_path(uri), 'w') { |file| file.write(data) }
  end

  # fetch previous successful response from cache
  #
  # Returns whatever YAML::load produces for the cache file; raises if the
  # file cannot be opened or parsed.
  def read_from_cache uri
    File.open(cache_path(uri)) { |file| YAML::load file.read }
  end

  private

  # location of the cache file for a uri
  def cache_path uri
    File.join(@cache, Planet.filename(uri))
  end

  # Load the cache entry stored in cachefile.  Returns {} when there is no
  # file, or when the file is unreadable or does not hold something that looks
  # like a response, so that a damaged entry degrades to an unconditional
  # request instead of making the uri fail on every fetch.
  def load_cache_entry cachefile
    return {} unless File.exist? cachefile

    # NOTE(review): YAML::load instantiates the objects described in the file;
    # the cache directory must only ever contain files written by this class.
    # Newer YAML versions may refuse to load cached response objects -- TODO
    # confirm; such a failure is treated as a cache miss below.
    entry = File.open(cachefile) { |file| YAML::load file.read }
    return entry if entry.respond_to?(:code) && entry.respond_to?(:[])

    Planet.log.warn "ignoring unusable cache entry #{cachefile}"
    {}
  rescue Timeout::Error
    raise
  rescue StandardError => e
    Planet.log.warn "ignoring unreadable cache entry #{cachefile}: #{e}"
    {}
  end

  # Perform a conditional GET for uri (a parsed URI), using the validators of
  # the cache entry.  Timeouts become a synthesized 408 response; socket
  # errors and connection resets become a synthesized 500 response.
  def issue_request uri, cache
    Timeout::timeout(@timeout) {
      http = Net::HTTP::new(uri.host, uri.port)
      if uri.scheme == 'https'
        http.use_ssl = true
        # NOTE(review): VERIFY_NONE disables certificate verification, so the
        # identity of https servers is not checked.  Left unchanged because
        # tightening it would start rejecting feeds that are fetched today.
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
      http.start {
        request = Net::HTTP::Get.new(uri.request_uri)
        request['If-None-Match'] ||= cache['Etag']
        request['If-Modified-Since'] ||= cache['Last-Modified']
        request['USER-Agent'] = 'Mars'
        request['Accept-Encoding'] = 'gzip, deflate'
        http.request(request)
      }
    }
  rescue Timeout::Error => error
    Net::HTTPRequestTimeOut.new '1.1', '408', error.to_s
  rescue SocketError, Errno::ECONNRESET => error
    Net::HTTPInternalServerError.new '1.1', '500', error.to_s
  end

  # Replace a gzip- or deflate-encoded body of a 200 response with the
  # decoded bytes, and drop the content-encoding header to match.
  def expand_body response
    return unless response.code == '200' && response.body

    case response['content-encoding']
    when 'gzip', 'x-gzip'
      gz = Zlib::GzipReader.new(StringIO.new(response.body))
      response.instance_eval { @body = gz.read }
      gz.close
      response.delete('content-encoding')
    when 'deflate'
      response.instance_eval { @body = Zlib::Inflate.inflate(response.body) }
      response.delete('content-encoding')
    end
  end
end
# convenience method to normalize a URI
# Joins the given parts with Addressable::URI.join and returns the normalized
# result as a String.  If joining or normalizing fails, the error is logged
# as a warning and the last part is returned as-is (the very same object the
# caller passed in, not a copy).
def Planet.uri_norm *parts
  Addressable::URI.join(*parts).normalize.to_s
rescue StandardError => e
  # Only StandardError is handled: a malformed URI should fall back to the
  # raw input, but Interrupt, SystemExit, NoMemoryError and the like must not
  # be swallowed by a URI helper.
  Planet.log.warn "#{e} #{parts.inspect}"
  parts.last
end
end