Skip to content

Commit

Permalink
Added --exclude-body option
Browse files Browse the repository at this point in the history
 [Closes #313]

To exclude pages whose bodies match the given patterns.
  • Loading branch information
Zapotek committed Feb 9, 2013
1 parent 5b52651 commit 58ac31a
Show file tree
Hide file tree
Showing 15 changed files with 431 additions and 34 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## _Under development_
- Options
- Added ```--https-only``` to disallow downgrades to HTTP when the seed URL uses HTTPS.
- Added ```--exclude-body``` to exclude pages whose bodies match the given patterns.
- Updated exceptions thrown by the framework, removed ```Arachni::Exceptions```
namespace and replaced it with the ```Arachni::Error``` base exception from
which all component specific exceptions inherit.
Expand Down Expand Up @@ -44,6 +45,10 @@
- Updated to retry a few times when the server fails to respond when trying to
request a page for an audit.
- Failed requests returned by ```#failures```.
- The following methods have been updated to enforce scope criteria:
- ```#audit_page```
- ```#push_to_page_queue```
- ```#push_to_url_queue```
- HTTP
- Fixed corruption of binary response bodies due to aggressive sanitization.
- Custom-404 page detection updated to:
Expand Down
30 changes: 23 additions & 7 deletions lib/arachni/framework.rb
Original file line number Diff line number Diff line change
Expand Up @@ -203,17 +203,15 @@ def run( &block )
def audit_page( page )
return if !page

# we may end up ignoring it but being included in the auditmap means that
# it has been considered but didn't fit the criteria
if skip_page? page
print_info "Ignoring page due to exclusion criteria: #{page.url}"
return false
end

@auditmap << page.url
@sitemap |= @auditmap
@sitemap.uniq!

if Options.exclude_binaries? && !page.text?
print_info "Ignoring page due to non text-based content-type: #{page.url}"
return
end

print_line
print_status "Auditing: [HTTP: #{page.code}] #{page.url}"

Expand Down Expand Up @@ -332,23 +330,41 @@ def stats( refresh_time = false, override_refresh = false )
#
# Adds a page to the page audit queue and bumps {#page_queue_total_size}.
#
# @param    [Page]  page
#
# @return   [Bool]
#   +true+ if the +page+ was queued, +false+ if it matched any exclusion
#   criteria and was therefore rejected.
#
def push_to_page_queue( page )
    # Pages failing the scope checks never reach the queue.
    if skip_page?( page )
        false
    else
        @page_queue << page
        @page_queue_total_size = @page_queue_total_size + 1

        # Record the page's URL in the sitemap.
        @sitemap = @sitemap | [page.url]
        true
    end
end

#
# Pushes a URL to the URL audit queue and updates {#url_queue_total_size}
#
# @param    [String]  url
#
# @return   [Bool]
#   +true+ if push was successful, +false+ if the +url+ matched any
#   exclusion criteria.
#
def push_to_url_queue( url )
    return false if skip_path? url

    # Queue the absolute form of the URL when one can be computed,
    # otherwise fall back to the URL exactly as it was given.
    @url_queue.push( to_absolute( url ) || url )
    @url_queue_total_size += 1

    @sitemap |= [url]

    # Signal success, as documented and as #push_to_page_queue does; this
    # used to return +false+ even though the URL had been queued.
    true
end

#
Expand Down
35 changes: 32 additions & 3 deletions lib/arachni/options.rb
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,13 @@ class InvalidURL < Error
#
attr_accessor :exclude

#
# Page bodies matching any of these patterns will be ignored.
#
# @return [Array]
#
attr_accessor :exclude_body

#
# Cookies to exclude from the audit
#
Expand Down Expand Up @@ -551,8 +558,9 @@ def reset
@reports = {}

@exclude = []
@exclude_cookies = []
@exclude_vectors = []
@exclude_body = []
@exclude_cookies = []
@exclude_vectors = []

@include = []

Expand Down Expand Up @@ -587,6 +595,8 @@ def https_only?
#
# @return [Bool] true if the url is redundant, false otherwise
#
# @see #redundant
#
def redundant?( url, &block )
redundant.each do |regexp, count|
next if !(url =~ regexp)
Expand All @@ -599,6 +609,21 @@ def redundant?( url, &block )
false
end

#
# Tells whether the given string is matched by at least one of the
# configured {#exclude_body} patterns.
#
# @param    [String]    body
#   Body to check, converted with +#to_s+ before matching.
#
# @return   [Bool]
#   +true+ if +body+ matches an {#exclude_body} pattern, +false+ otherwise.
#
# @see #exclude_body
#
def exclude_body?( body )
    Options.exclude_body.any? { |pattern| body.to_s =~ pattern }
end

# Predicate-style reader for the +exclude_binaries+ option.
#
# @return   the current value of +exclude_binaries+, as is
def exclude_binaries?
    setting = self.exclude_binaries
    setting
end
Expand Down Expand Up @@ -790,7 +815,7 @@ def redundant=( filters )
alias :modules= :mods=

# these options need to contain Array<Regexp>
[ :include, :exclude, :lsmod, :lsrep, :lsplug ].each do |m|
[ :exclude_body, :include, :exclude, :lsmod, :lsrep, :lsplug ].each do |m|
define_method( "#{m}=".to_sym ) do |arg|
arg = [arg].flatten.map { |s| s.is_a?( Regexp ) ? s : Regexp.new( s.to_s ) }
instance_variable_set( "@#{m}".to_sym, arg )
Expand Down Expand Up @@ -831,6 +856,7 @@ def parse( require_url = true )
[ '--cookie-string' , GetoptLong::REQUIRED_ARGUMENT ],
[ '--user-agent', '-b', GetoptLong::REQUIRED_ARGUMENT ],
[ '--exclude', '-e', GetoptLong::REQUIRED_ARGUMENT ],
[ '--exclude-body', GetoptLong::REQUIRED_ARGUMENT ],
[ '--include', '-i', GetoptLong::REQUIRED_ARGUMENT ],
[ '--exclude-cookie', GetoptLong::REQUIRED_ARGUMENT ],
[ '--exclude-vector', GetoptLong::REQUIRED_ARGUMENT ],
Expand Down Expand Up @@ -1029,6 +1055,9 @@ def parse( require_url = true )
when '--exclude'
@exclude << Regexp.new( arg )

when '--exclude-body'
@exclude_body << Regexp.new( arg )

when '--include'
@include << Regexp.new( arg )

Expand Down
5 changes: 1 addition & 4 deletions lib/arachni/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -242,9 +242,7 @@ def page
alias :run :page

# Tells whether the parsed response is text-based.
#
# @return   [Bool]  the result of +@response.text?+
def text?
    # The response knows how to inspect its own content-type, so simply
    # delegate. The previous hand-rolled check computed a value that was
    # never used and depended on the non-core String#substring? extension.
    @response.text?
end

def doc
Expand All @@ -266,7 +264,6 @@ def headers
'Accept' => 'text/html,application/xhtml+xml,application' +
'/xml;q=0.9,*/*;q=0.8',
'Accept-Charset' => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept-Language' => 'en-gb,en;q=0.5',
'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
'From' => @opts.authed_by || '',
'User-Agent' => @opts.user_agent || '',
Expand Down
9 changes: 7 additions & 2 deletions lib/arachni/spider.rb
Original file line number Diff line number Diff line change
Expand Up @@ -402,8 +402,13 @@ def visit( url, opts = {}, &block )
end

print_status "[HTTP: #{res.code}] #{effective_url}"
@sitemap[effective_url] = res.code
block.call( res )

if skip_response?( res )
print_info 'Ignoring due to exclusion criteria.'
else
@sitemap[effective_url] = res.code
block.call( res )
end

decrease_pending
end
Expand Down
2 changes: 1 addition & 1 deletion lib/arachni/trainer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def push( res )

return false if !@parser.text? ||
@trainings_per_url[@parser.url] >= MAX_TRAININGS_PER_URL ||
redundant?( @parser.url ) || skip_path?( @parser.url )
redundant?( @parser.url ) || skip_resource?( res )

analyze( res )
true
Expand Down
8 changes: 8 additions & 0 deletions lib/arachni/typhoeus/response.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
module Typhoeus
class Response

alias :url :effective_url

alias :old_initialize :initialize
def initialize( *args )
old_initialize( *args )
Expand All @@ -35,6 +37,12 @@ def each( &block )
headers_hash.each( &block )
end

# Checks whether the response carries text-based content, judging by its
# content-type.
#
# @return   [Bool]
#   +true+ if the media type is +text/*+, +false+ if it is anything else
#   or if there is no content-type at all.
def text?
    type = content_type
    return false if !type

    # Media types are case-insensitive (RFC 7231, section 3.1.1.1), so
    # normalize the value before comparing; otherwise something like
    # "Text/HTML" would be treated as binary.
    type.to_s.strip.downcase.start_with?( 'text/' )
end

# Looks up the value of the +content-type+ header.
#
# @return   whatever +find_header_value+ yields for that header
def content_type
    header_name = 'content-type'
    find_header_value( header_name )
end
Expand Down
3 changes: 3 additions & 0 deletions lib/arachni/ui/cli/cli.rb
Original file line number Diff line number Diff line change
Expand Up @@ -698,6 +698,9 @@ def usage
--exclude=<regexp> Exclude urls matching <regexp>.
(Can be used multiple times.)
--exclude-body=<regexp> Exclude pages whose body matches <regexp>.
(Can be used multiple times.)
-i <regexp>
--include=<regexp> Include *only* urls matching <regex>.
(Can be used multiple times.)
Expand Down
63 changes: 62 additions & 1 deletion lib/arachni/utilities.rb
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def follow_protocol?( url, reference = Options.url )
def skip_path?( path )
return true if !path

parsed = uri_parse( path )
parsed = uri_parse( path.to_s )
begin
return true if !include_path?( parsed )
return true if exclude_path?( parsed )
Expand All @@ -303,6 +303,67 @@ def skip_path?( path )
end
end

#
# Determines whether or not a given {Arachni::Page} or {Typhoeus::Response}
# should be skipped, based on:
#   * The {Options#exclude_binaries} option, combined with whether the
#     object reports itself as text-based (+#text?+).
#   * Its URL (see {#skip_path?}).
#   * Its body (see {Options#exclude_body}).
#
# @param    [Page,Typhoeus::Response,#body]    page_or_response
#
# @return   [Bool]
#   +true+ if the given object matches any of the above exclusion
#   criteria, +false+ otherwise.
#
# @see #skip_path?
# @see Options#exclude_body
# @see Options#exclude_body?
# @see Options#exclude_binaries?
#
def skip_page?( page_or_response )
    # Non-text content only matters when binaries are to be excluded.
    return true if Options.exclude_binaries? && !page_or_response.text?

    out_of_scope = skip_path?( page_or_response.url )
    return out_of_scope if out_of_scope

    Options.exclude_body?( page_or_response.body )
end
alias :skip_response? :skip_page?

#
# Determines whether or not the given +resource+ should be skipped,
# depending on its type and content.
#
# @param    [Page,Typhoeus::Response,String]    resource
#   If given a:
#       * {Page}: it is checked via {#skip_page?}.
#       * {Typhoeus::Response}: it is checked via {#skip_response?}.
#       * Anything else: its +#to_s+ form is treated as a response body
#           if it contains a line break, otherwise as a path.
#
# @return   [Bool]
#   +true+ if the resource should be ignored, +false+ otherwise.
#
# @see #skip_path?
# @see #skip_page?
# @see #skip_response?
# @see Options#exclude_body?
#
def skip_resource?( resource )
    return skip_page?( resource )     if resource.is_a?( Page )
    return skip_response?( resource ) if resource.is_a?( Typhoeus::Response )

    text = resource.to_s

    # A line break means this can't be a path, so treat it as a body.
    if text =~ /[\r\n]/
        Options.exclude_body?( text )
    else
        skip_path?( text )
    end
end

#
# Returns a random available port
#
Expand Down
62 changes: 47 additions & 15 deletions spec/arachni/framework_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -782,29 +782,61 @@
end

describe '#audit_page' do
it 'should audit an individual page' do
@f.opts.audit :links, :forms, :cookies
context 'when the page does not match exclusion criteria' do
it 'should audit it and return true' do
@f.opts.audit :links, :forms, :cookies

@f.modules.load :taint
@f.modules.load :taint

@f.audit_page Arachni::Page.from_url( @url + '/link' )
@f.auditstore.issues.size.should == 1
@f.audit_page( Arachni::Page.from_url( @url + '/link' ) ).should be_true
@f.auditstore.issues.size.should == 1
end
end
context 'when the page matches exclusion criteria' do
it 'should not audit it and return false' do
@f.opts.exclude << /link/
@f.opts.audit :links, :forms, :cookies

@f.modules.load :taint

@f.audit_page( Arachni::Page.from_url( @url + '/link' ) ).should be_false
@f.auditstore.issues.size.should == 0
end
end
end

describe '#push_to_page_queue' do
it 'should push a page to the page audit queue' do
page = Arachni::Page.from_url( @url + '/train/true' )
context 'when the page does not match exclusion criteria' do
it 'should push it to the page audit queue and return true' do
page = Arachni::Page.from_url( @url + '/train/true' )

@f.opts.audit :links, :forms, :cookies
@f.modules.load :taint
@f.opts.audit :links, :forms, :cookies
@f.modules.load :taint

@f.page_queue_total_size.should == 0
@f.push_to_page_queue( page )
@f.run
@f.auditstore.issues.size.should == 3
@f.page_queue_total_size.should > 0
@f.modules.clear
@f.page_queue_total_size.should == 0
@f.push_to_page_queue( page ).should be_true
@f.run
@f.auditstore.issues.size.should == 3
@f.page_queue_total_size.should > 0
@f.modules.clear
end
end
context 'when the page matches exclusion criteria' do
it 'should not push it to the page audit queue and return false' do
page = Arachni::Page.from_url( @url + '/train/true' )

@f.opts.audit :links, :forms, :cookies
@f.modules.load :taint

@f.opts.exclude << /train/

@f.page_queue_total_size.should == 0
@f.push_to_page_queue( page ).should be_false
@f.run
@f.auditstore.issues.size.should == 0
@f.page_queue_total_size.should == 0
@f.modules.clear
end
end
end

Expand Down
Loading

0 comments on commit 58ac31a

Please sign in to comment.