From 244f96f089e3aa7dc19b0382314d3c5cd6ef1863 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Tue, 9 Mar 2010 01:29:36 -0500 Subject: [PATCH] allowing #text to return unescaped html entities if the user very carefully asks for it. --- lib/loofah/html/document.rb | 24 +++++++++++++++--- lib/loofah/html/document_fragment.rb | 24 +++++++++++++++--- test/integration/test_scrubbers.rb | 37 ++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 8 deletions(-) diff --git a/lib/loofah/html/document.rb b/lib/loofah/html/document.rb index d7b4fc76..e70381a9 100644 --- a/lib/loofah/html/document.rb +++ b/lib/loofah/html/document.rb @@ -19,8 +19,24 @@ class Document < Nokogiri::HTML::Document # Loofah.document("

Title

Content
").text # # => "TitleContent" # - def text - encode_special_chars xpath("/html/body").inner_text + # By default, the returned text will have HTML entities + # escaped. If you want unescaped entities, and you understand + # that the result is unsafe to render in a browser, then you + # can pass an argument as shown: + # + # frag = Loofah.fragment("&lt;script&gt;alert('EVIL');&lt;/script&gt;") + # # ok for browser: + # frag.text # => "&lt;script&gt;alert('EVIL');&lt;/script&gt;" + # # decidedly not ok for browser: + # frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>" + # + def text(options={}) + result = xpath("/html/body").inner_text + if options[:encode_special_chars] == false + result # possibly dangerous if rendered in a browser + else + encode_special_chars result + end end alias :inner_text :text alias :to_str :text @@ -35,8 +51,8 @@ def text # Loofah.document("

Title

Content
").to_text # # => "\nTitle\n\nContent\n" # - def to_text - Loofah::Helpers.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text + def to_text(options={}) + Loofah::Helpers.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options) end end end diff --git a/lib/loofah/html/document_fragment.rb b/lib/loofah/html/document_fragment.rb index 87fac7c9..12562455 100644 --- a/lib/loofah/html/document_fragment.rb +++ b/lib/loofah/html/document_fragment.rb @@ -35,8 +35,24 @@ def to_s # Loofah.fragment("

Title

Content
").text # # => "TitleContent" # - def text - encode_special_chars serialize_roots.children.inner_text + # By default, the returned text will have HTML entities + # escaped. If you want unescaped entities, and you understand + # that the result is unsafe to render in a browser, then you + # can pass an argument as shown: + # + # frag = Loofah.fragment("&lt;script&gt;alert('EVIL');&lt;/script&gt;") + # # ok for browser: + # frag.text # => "&lt;script&gt;alert('EVIL');&lt;/script&gt;" + # # decidedly not ok for browser: + # frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>" + # + def text(options={}) + result = serialize_roots.children.inner_text + if options[:encode_special_chars] == false + result # possibly dangerous if rendered in a browser + else + encode_special_chars result + end end alias :inner_text :text alias :to_str :text @@ -51,8 +67,8 @@ def text # Loofah.fragment("

Title

Content
").to_text # # => "\nTitle\n\nContent\n" # - def to_text - Loofah::Helpers.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text + def to_text(options={}) + Loofah::Helpers.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options) end private diff --git a/test/integration/test_scrubbers.rb b/test/integration/test_scrubbers.rb index 9c68b48f..1278f15d 100644 --- a/test/integration/test_scrubbers.rb +++ b/test/integration/test_scrubbers.rb @@ -18,6 +18,7 @@ class TestScrubbers < Test::Unit::TestCase ENTITY_HACK_ATTACK = "
Hack attack!
&lt;script&gt;alert('evil')&lt;/script&gt;
" ENTITY_HACK_ATTACK_TEXT_SCRUB = "Hack attack!&lt;script&gt;alert('evil')&lt;/script&gt;" + ENTITY_HACK_ATTACK_TEXT_SCRUB_UNESC = "Hack attack!<script>alert('evil')</script>" context "Document" do context "#scrub!" do @@ -89,6 +90,24 @@ class TestScrubbers < Test::Unit::TestCase assert_equal ENTITY_HACK_ATTACK_TEXT_SCRUB, result end + + context "with encode_special_chars => false" do + should "leave behind only inner text with html entities unescaped" do + doc = Loofah::HTML::Document.parse "#{ENTITY_HACK_ATTACK}" + result = doc.text(:encode_special_chars => false) + + assert_equal ENTITY_HACK_ATTACK_TEXT_SCRUB_UNESC, result + end + end + + context "with encode_special_chars => true" do + should "leave behind only inner text with html entities still escaped" do + doc = Loofah::HTML::Document.parse "#{ENTITY_HACK_ATTACK}" + result = doc.text(:encode_special_chars => true) + + assert_equal ENTITY_HACK_ATTACK_TEXT_SCRUB, result + end + end end context "#to_s" do @@ -239,6 +258,24 @@ class TestScrubbers < Test::Unit::TestCase assert_equal ENTITY_HACK_ATTACK_TEXT_SCRUB, result end + + context "with encode_special_chars => false" do + should "leave behind only inner text with html entities unescaped" do + doc = Loofah::HTML::DocumentFragment.parse "
#{ENTITY_HACK_ATTACK}
" + result = doc.text(:encode_special_chars => false) + + assert_equal ENTITY_HACK_ATTACK_TEXT_SCRUB_UNESC, result + end + end + + context "with encode_special_chars => true" do + should "leave behind only inner text with html entities still escaped" do + doc = Loofah::HTML::DocumentFragment.parse "
#{ENTITY_HACK_ATTACK}
" + result = doc.text(:encode_special_chars => true) + + assert_equal ENTITY_HACK_ATTACK_TEXT_SCRUB, result + end + end end context "#to_s" do