public
Description: A versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
Homepage: http://spidr.rubyforge.org/
Clone URL: git://github.com/postmodern/spidr.git
spidr / spec / helpers / course.rb
100644 96 lines (80 sloc) 2.307 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
require 'open-uri'
require 'json'
 
module Helpers
  #
  # Spec helpers for exercising an agent against the spidering test course.
  #
  # Including this module into an example group downloads the spec list at
  # SPECS_URL and defines one +it+ example per entry. The instance methods
  # read the agent under test from <tt>@agent</tt>, which the including spec
  # is expected to assign (this module never sets it).
  #
  module Course
    # URL the agent starts from; relative course links are resolved against it.
    COURSE_URL = URI('http://spidr.rubyforge.org/course/start.html')

    # Location of the JSON document listing the examples to generate.
    SPECS_URL = 'http://spidr.rubyforge.org/course/specs.json'

    #
    # Defines one example on _base_ for every entry of the JSON Array found
    # at SPECS_URL. Each entry is read for its 'message', 'url', 'behavior'
    # and (for unrecognised behaviors) 'link' keys. Nothing is defined when
    # the parsed document is not an Array.
    #
    def self.included(base)
      specs = JSON.parse(read_specs)
      return unless specs.kind_of?(Array)

      specs.each do |spec|
        # NOTE(review): the generated source below is built from remotely
        # fetched data. String#dump turns each value into an escaped,
        # double-quoted literal (including "#{"), which is what keeps the
        # module_eval from executing arbitrary content. Do not interpolate
        # any spec value without dumping it first.
        message = spec['message'].to_s.dump
        url = spec['url'].to_s.dump

        assertions = case spec['behavior']
                     when 'follow'   then "should_visit_link(#{url})"
                     when 'nofollow' then "should_visit_once(#{url})"
                     when 'fail'     then "should_fail_link(#{url})"
                     else
                       # Any other behavior: neither the link nor the url
                       # may have been visited.
                       link = spec['link'].to_s.dump
                       "should_ignore_link(#{link})\nshould_ignore_link(#{url})"
                     end

        base.module_eval %{
it #{message} do
#{assertions}
end
}
      end
    end

    #
    # Returns the body of the document at SPECS_URL as a String.
    #
    # Kernel#open stopped accepting URLs in Ruby 3.0, so URI.open is used
    # whenever open-uri provides it; older Rubies fall back to Kernel#open.
    # The block form closes the handle once the body has been read.
    #
    def self.read_specs
      if URI.respond_to?(:open)
        URI.open(SPECS_URL, &:read)
      else
        open(SPECS_URL, &:read)
      end
    end
    private_class_method :read_specs

    #
    # Starts an Agent at COURSE_URL, passing the course host as the only
    # entry of the <tt>:hosts</tt> option, and prints every URL and every
    # failed URL the agent reports. Returns the result of Agent.start_at.
    #
    def run_course
      Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host]) do |agent|
        agent.every_failed_url { |url| puts "[FAILED] #{url}" }
        agent.every_url { |url| puts url }
      end
    end

    #
    # Returns +true+ if exactly one of the agent's visited URLs equals the
    # specified _link_, returns +false+ otherwise.
    #
    def visited_once?(link)
      url = course_url(link)

      @agent.visited_urls.count { |visited_url| visited_url == url } == 1
    end

    #
    # Returns +true+ if the agent has visited the specified _link_, returns
    # +false+ otherwise.
    #
    def visited_link?(link)
      @agent.visited?(course_url(link))
    end

    #
    # Returns the result of asking the agent whether visiting the specified
    # _link_ failed.
    #
    def visit_failed?(link)
      @agent.failed?(course_url(link))
    end

    #
    # Expects the specified _link_ to have been visited.
    #
    def should_visit_link(link)
      visited_link?(link).should == true
    end

    #
    # Expects the specified _link_ to not have been visited.
    #
    def should_ignore_link(link)
      visited_link?(link).should == false
    end

    #
    # Expects the specified _link_ to have been visited exactly once.
    #
    def should_visit_once(link)
      visited_once?(link).should == true
    end

    #
    # Expects the specified _link_ to not have been visited and to be
    # reported as failed by the agent.
    #
    def should_fail_link(link)
      visited_link?(link).should == false
      visit_failed?(link).should == true
    end

    private

    #
    # Resolves the specified _link_ against COURSE_URL and returns the
    # resulting URI. The link is percent-escaped first so that characters
    # such as spaces survive URI parsing. URI.encode was removed in
    # Ruby 3.0; URI::DEFAULT_PARSER.escape applies the same escaping.
    #
    def course_url(link)
      COURSE_URL.merge(URI::DEFAULT_PARSER.escape(link))
    end
  end
end