Skip to content

Commit

Permalink
Make tidy_bytes work on 1.9 and improve its performance. [#4350 state…
Browse files Browse the repository at this point in the history
…:resolved]

Signed-off-by: Jeremy Kemper <jeremy@bitsweat.net>
  • Loading branch information
norman authored and jeremy committed Apr 9, 2010
1 parent ad22017 commit e416f1d
Show file tree
Hide file tree
Showing 3 changed files with 119 additions and 44 deletions.
5 changes: 5 additions & 0 deletions activesupport/CHANGELOG
@@ -1,3 +1,8 @@
*Rails 3.0.0 [beta 3] (pending)*

* Speed up and add Ruby 1.9 support for ActiveSupport::Multibyte::Chars#tidy_bytes. #4350 [Norman Clarke]


*Rails 3.0.0 [beta 2] (April 1st, 2010)*

* Reduced load time by deferring configuration of classes using
Expand Down
85 changes: 67 additions & 18 deletions activesupport/lib/active_support/multibyte/chars.rb
Expand Up @@ -19,7 +19,7 @@ module Multibyte #:nodoc:
# bad.explicit_checking_method "T".mb_chars.downcase.to_s
#
# The default Chars implementation assumes that the encoding of the string is UTF-8, if you want to handle different
# encodings you can write your own multibyte string handler and configure it through
# encodings you can write your own multibyte string handler and configure it through
# ActiveSupport::Multibyte.proxy_class.
#
# class CharsForUTF32
Expand Down Expand Up @@ -458,8 +458,10 @@ def g_length
end

# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
def tidy_bytes
chars(self.class.tidy_bytes(@wrapped_string))
#
# Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.
def tidy_bytes(force = false)
chars(self.class.tidy_bytes(@wrapped_string, force))
end

%w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize).each do |method|
Expand Down Expand Up @@ -528,7 +530,7 @@ def g_unpack(string)
unpacked << codepoints[marker..pos-1]
marker = pos
end
end
end
unpacked
end

Expand Down Expand Up @@ -644,33 +646,80 @@ def compose_codepoints(codepoints)
codepoints
end

# Returns the UTF-8 byte sequence (as an array of integers) for a single byte
# that is read as a CP1252 / ISO-8859-1 character.
#
# * 192 and above: emitted as the two bytes 195, byte - 64.
# * 160..191: emitted as the two bytes 194, byte.
# * below 160: the byte is looked up in UCD.cp1252; when the table has no
#   entry the byte value itself is used as the codepoint, which is then
#   UTF-8 encoded via pack("U").
def tidy_byte(byte)
  return [195, byte - 64] if byte >= 192
  return [194, byte] if byte >= 160

  codepoint = UCD.cp1252[byte] || byte
  [codepoint].pack("U").unpack("C*")
end
private :tidy_byte

# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
def tidy_bytes(string)
string.split(//u).map do |c|
c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding)

if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
n = c.unpack('C')[0]
n < 128 ? n.chr :
n < 160 ? [UCD.cp1252[n] || n].pack('U') :
n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr
#
# Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP-1252 or ISO-8859-1.
def tidy_bytes(string, force = false)
if force
return string.unpack("C*").map do |b|
tidy_byte(b)
end.flatten.compact.pack("C*").unpack("U*").pack("U*")
end

bytes = string.unpack("C*")
conts_expected = 0
last_lead = 0

bytes.each_index do |i|

byte = bytes[i]
is_ascii = byte < 128
is_cont = byte > 127 && byte < 192
is_lead = byte > 191 && byte < 245
is_unused = byte > 240
is_restricted = byte > 244

# Impossible or highly unlikely byte? Clean it.
if is_unused || is_restricted
bytes[i] = tidy_byte(byte)
elsif is_cont
# Not expecting continuation byte? Clean up. Otherwise, now expect one less.
conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
else
c
if conts_expected > 0
# Expected continuation, but got ASCII or leading? Clean backwards up to
# the leading byte.
(1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
conts_expected = 0
end
if is_lead
# Final byte is leading? Clean it.
if i == bytes.length - 1
bytes[i] = tidy_byte(bytes.last)
else
# Valid leading byte? Expect continuations determined by position of
# first zero bit, with max of 3.
conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
last_lead = i
end
end
end
end.join
end
bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
end
end

protected

def translate_offset(byte_offset) #:nodoc:
return nil if byte_offset.nil?
return 0 if @wrapped_string == ''

if @wrapped_string.respond_to?(:force_encoding)
@wrapped_string = @wrapped_string.dup.force_encoding(Encoding::ASCII_8BIT)
end

begin
@wrapped_string[0...byte_offset].unpack('U*').length
rescue ArgumentError => e
Expand Down
73 changes: 47 additions & 26 deletions activesupport/test/multibyte_chars_test.rb
Expand Up @@ -107,7 +107,7 @@ def setup
# Ruby 1.9 only supports basic whitespace
@whitespace = "\n\t ".force_encoding(Encoding::UTF_8)
end

@byte_order_mark = [65279].pack('U')
end

Expand Down Expand Up @@ -468,14 +468,6 @@ def test_acts_like_string
class MultibyteCharsExtrasTest < Test::Unit::TestCase
include MultibyteTestHelpers

if RUBY_VERSION >= '1.9'
def test_tidy_bytes_is_broken_on_1_9_0
assert_raise(ArgumentError) do
assert_equal_codepoints [0xfffd].pack('U'), chars("\xef\xbf\xbd").tidy_bytes
end
end
end

def test_upcase_should_be_unicode_aware
assert_equal "АБВГД\0F", chars("аБвгд\0f").upcase
assert_equal 'こにちわ', chars('こにちわ').upcase
Expand Down Expand Up @@ -504,7 +496,7 @@ def test_limit_should_not_break_on_blank_strings
def test_limit_should_work_on_a_multibyte_string
example = chars(UNICODE_STRING)
bytesize = UNICODE_STRING.respond_to?(:bytesize) ? UNICODE_STRING.bytesize : UNICODE_STRING.size

assert_equal UNICODE_STRING, example.limit(bytesize)
assert_equal '', example.limit(0)
assert_equal '', example.limit(1)
Expand All @@ -531,7 +523,7 @@ def test_limit_should_keep_under_the_specified_byte_limit
assert example.limit(limit).to_s.length <= limit
end
end

def test_composition_exclusion_is_set_up_properly
# Normalization of DEVANAGARI LETTER QA breaks when composition exclusion isn't used correctly
qa = [0x915, 0x93c].pack('U*')
Expand Down Expand Up @@ -607,28 +599,57 @@ def test_should_compute_grapheme_length
end

def test_tidy_bytes_should_tidy_bytes

single_byte_cases = {
"\x21" => "!", # Valid ASCII byte, low
"\x41" => "A", # Valid ASCII byte, mid
"\x7E" => "~", # Valid ASCII byte, high
"\x80" => "€", # Continuation byte, low (cp1252)
"\x94" => "”", # Continuation byte, mid (cp1252)
"\x9F" => "Ÿ", # Continuation byte, high (cp1252)
"\xC0" => "À", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
"\xC1" => "Á", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
"\xC2" => "Â", # Start of 2-byte sequence, low
"\xC8" => "È", # Start of 2-byte sequence, mid
"\xDF" => "ß", # Start of 2-byte sequence, high
"\xE0" => "à", # Start of 3-byte sequence, low
"\xE8" => "è", # Start of 3-byte sequence, mid
"\xEF" => "ï", # Start of 3-byte sequence, high
"\xF0" => "ð", # Start of 4-byte sequence
"\xF1" => "ñ", # Unused byte
"\xFF" => "ÿ", # Restricted byte
"\x00" => "\x00" # null char
}

single_byte_cases.each do |bad, good|
assert_equal good, chars(bad).tidy_bytes.to_s
assert_equal "#{good}#{good}", chars("#{bad}#{bad}").tidy_bytes
assert_equal "#{good}#{good}#{good}", chars("#{bad}#{bad}#{bad}").tidy_bytes
assert_equal "#{good}a", chars("#{bad}a").tidy_bytes
assert_equal "#{good}á", chars("#{bad}á").tidy_bytes
assert_equal "a#{good}a", chars("a#{bad}a").tidy_bytes
assert_equal "á#{good}á", chars("á#{bad}á").tidy_bytes
assert_equal "a#{good}", chars("a#{bad}").tidy_bytes
assert_equal "á#{good}", chars("á#{bad}").tidy_bytes
end

byte_string = "\270\236\010\210\245"
tidy_string = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].pack('U*')
ascii_padding = 'aa'
utf8_padding = 'éé'

assert_equal_codepoints tidy_string, chars(byte_string).tidy_bytes

assert_equal_codepoints ascii_padding.dup.insert(1, tidy_string),
chars(ascii_padding.dup.insert(1, byte_string)).tidy_bytes
assert_equal_codepoints utf8_padding.dup.insert(2, tidy_string),
chars(utf8_padding.dup.insert(2, byte_string)).tidy_bytes
assert_nothing_raised { chars(byte_string).tidy_bytes.to_s.unpack('U*') }

assert_equal_codepoints "\xC3\xA7", chars("\xE7").tidy_bytes # iso_8859_1: small c cedilla
assert_equal_codepoints "\xE2\x80\x9C", chars("\x93").tidy_bytes # win_1252: left smart quote
assert_equal_codepoints "\xE2\x82\xAC", chars("\x80").tidy_bytes # win_1252: euro
assert_equal_codepoints "\x00", chars("\x00").tidy_bytes # null char
assert_equal_codepoints [0xfffd].pack('U'), chars("\xef\xbf\xbd").tidy_bytes # invalid char
rescue ArgumentError => e
raise e if RUBY_VERSION < '1.9'
# UTF-8 leading byte followed by too few continuation bytes
assert_equal_codepoints "\xc3\xb0\xc2\xa5\xc2\xa4\x21", chars("\xf0\xa5\xa4\x21").tidy_bytes
end

# Verifies that tidy_bytes(true) converts every byte as CP-1252/ISO-8859-1, even when
# the input already happens to be a valid UTF-8 sequence.
def test_tidy_bytes_should_forcibly_tidy_bytes_if_specified
  # These four bytes are valid both as UTF-8 (a single 4-byte character) and as
  # CP-1252 (the four characters "ð¥¤¤"), but with different interpretations.
  byte_string = "\xF0\xA5\xA4\xA4"
  # Without force the valid UTF-8 sequence is kept, so the result must NOT be the
  # CP-1252 reading of the bytes.
  assert_not_equal "ð¥¤¤", chars(byte_string).tidy_bytes
  # Forcible conversion to UTF-8: each byte is transcoded individually.
  assert_equal "ð¥¤¤", chars(byte_string).tidy_bytes(true)
end


private

def string_from_classes(classes)
Expand Down

0 comments on commit e416f1d

Please sign in to comment.