Skip to content

Commit 9a73630

Browse files
committed
Add verify and clean methods to ActiveSupport::Multibyte.
When accepting character input from outside of your application you can't blindly trust that all strings are properly encoded. With these methods you can check incoming strings and clean them up if necessary. Signed-off-by: Michael Koziarski <michael@koziarski.com> Conflicts: activesupport/lib/active_support/multibyte.rb
1 parent 5e6dab8 commit 9a73630

File tree

4 files changed

+239
-18
lines changed

4 files changed

+239
-18
lines changed

activesupport/lib/active_support/multibyte.rb

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,35 @@ module Multibyte
2929
#
3030
# Example:
3131
# ActiveSupport::Multibyte.proxy_class = CharsForUTF32
32-
mattr_accessor :proxy_class
33-
self.proxy_class = ActiveSupport::Multibyte::Chars
32+
def self.proxy_class=(klass)
33+
@proxy_class = klass
34+
end
35+
36+
# Returns the current proxy class
37+
def self.proxy_class
38+
@proxy_class ||= ActiveSupport::Multibyte::Chars
39+
end
40+
41+
# Regular expressions that describe valid byte sequences for a character
42+
VALID_CHARACTER = {
43+
# Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
44+
'UTF-8' => /\A(?:
45+
[\x00-\x7f] |
46+
[\xc2-\xdf] [\x80-\xbf] |
47+
\xe0 [\xa0-\xbf] [\x80-\xbf] |
48+
[\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
49+
\xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
50+
[\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
51+
\xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf])\z /xn,
52+
# Quick check for valid Shift-JIS characters, disregards the odd-even pairing
53+
'Shift_JIS' => /\A(?:
54+
[\x00-\x7e \xa1-\xdf] |
55+
[\x81-\x9f \xe0-\xef] [\x40-\x7e \x80-\x9e \x9f-\xfc])\z /xn
56+
}
3457
end
3558
end
59+
60+
require 'active_support/multibyte/chars'
61+
require 'active_support/multibyte/exceptions'
62+
require 'active_support/multibyte/unicode_database'
63+
require 'active_support/multibyte/utils'

activesupport/lib/active_support/multibyte/chars.rb

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -74,16 +74,7 @@ def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
7474
UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
7575
UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
7676

77-
# Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
78-
UTF8_PAT = /\A(?:
79-
[\x00-\x7f] |
80-
[\xc2-\xdf] [\x80-\xbf] |
81-
\xe0 [\xa0-\xbf] [\x80-\xbf] |
82-
[\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
83-
\xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
84-
[\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
85-
\xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
86-
)*\z/xn
77+
UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8']
8778

8879
attr_reader :wrapped_string
8980
alias to_s wrapped_string
@@ -308,31 +299,31 @@ def center(integer, padstr=' ')
308299
def rstrip
309300
chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, ''))
310301
end
311-
302+
312303
# Strips entire range of Unicode whitespace from the left of the string.
313304
def lstrip
314305
chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, ''))
315306
end
316-
307+
317308
# Strips entire range of Unicode whitespace from the right and left of the string.
318309
def strip
319310
rstrip.lstrip
320311
end
321-
312+
322313
# Returns the number of codepoints in the string
323314
def size
324315
self.class.u_unpack(@wrapped_string).size
325316
end
326317
alias_method :length, :size
327-
318+
328319
# Reverses all characters in the string.
329320
#
330321
# Example:
331322
# 'Café'.mb_chars.reverse.to_s #=> 'éfaC'
332323
def reverse
333324
chars(self.class.u_unpack(@wrapped_string).reverse.pack('U*'))
334325
end
335-
326+
336327
# Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
337328
# character.
338329
#
@@ -647,7 +638,7 @@ def tidy_bytes(string)
647638
string.split(//u).map do |c|
648639
c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding)
649640

650-
if !UTF8_PAT.match(c)
641+
if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
651642
n = c.unpack('C')[0]
652643
n < 128 ? n.chr :
653644
n < 160 ? [UCD.cp1252[n] || n].pack('U') :
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# encoding: utf-8
2+
3+
module ActiveSupport #:nodoc:
4+
module Multibyte #:nodoc:
5+
if Kernel.const_defined?(:Encoding)
6+
# Returns a regular expression that matches valid characters in the current encoding
7+
def self.valid_character
8+
VALID_CHARACTER[Encoding.default_internal.to_s]
9+
end
10+
else
11+
def self.valid_character
12+
case $KCODE
13+
when 'UTF8'
14+
VALID_CHARACTER['UTF-8']
15+
when 'SJIS'
16+
VALID_CHARACTER['Shift_JIS']
17+
end
18+
end
19+
end
20+
21+
if 'string'.respond_to?(:valid_encoding?)
22+
# Verifies the encoding of a string
23+
def self.verify(string)
24+
string.valid_encoding?
25+
end
26+
else
27+
# Verifies the encoding of a string on Rubies without String#valid_encoding?.
#
# Looks up the single-character pattern for the current encoding once and
# checks every character of +string+ against it. Returns +true+ when all
# characters match, or when no pattern is known for the current encoding
# (nothing can be checked in that case); returns +false+ as soon as one
# character fails to match.
def self.verify(string)
  expression = valid_character
  return true unless expression

  string.split(//).all? { |c| expression.match(c) }
end
35+
end
36+
37+
# Verifies the encoding of the string and raises an exception when it's not valid
38+
def self.verify!(string)
39+
raise EncodingError.new("Found characters with invalid encoding") unless verify(string)
40+
end
41+
42+
if 'string'.respond_to?(:force_encoding)
43+
# Removes all invalid characters from the string.
44+
#
45+
# Note: this method is a no-op in Ruby 1.9
46+
def self.clean(string)
47+
string
48+
end
49+
else
50+
# Removes all invalid characters from the string.
#
# Looks up the single-character pattern for the current encoding once and
# keeps only the characters of +string+ that match it, returning them joined
# into a new string. When no pattern is known for the current encoding the
# original +string+ is returned untouched.
def self.clean(string)
  expression = valid_character
  return string unless expression

  string.split(//).select { |c| expression.match(c) }.join
end
59+
end
60+
end
61+
end
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
# encoding: utf-8
2+
3+
require 'abstract_unit'
4+
require 'multibyte_test_helpers'
5+
6+
class MultibyteUtilsTest < ActiveSupport::TestCase
7+
include MultibyteTestHelpers
8+
9+
test "valid_character returns an expression for the current encoding" do
10+
with_encoding('None') do
11+
assert_nil ActiveSupport::Multibyte.valid_character
12+
end
13+
with_encoding('UTF8') do
14+
assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'], ActiveSupport::Multibyte.valid_character
15+
end
16+
with_encoding('SJIS') do
17+
assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['Shift_JIS'], ActiveSupport::Multibyte.valid_character
18+
end
19+
end
20+
21+
test "verify verifies ASCII strings are properly encoded" do
22+
with_encoding('None') do
23+
examples.each do |example|
24+
assert ActiveSupport::Multibyte.verify(example)
25+
end
26+
end
27+
end
28+
29+
test "verify verifies UTF-8 strings are properly encoded" do
30+
with_encoding('UTF8') do
31+
assert ActiveSupport::Multibyte.verify(example('valid UTF-8'))
32+
assert !ActiveSupport::Multibyte.verify(example('invalid UTF-8'))
33+
end
34+
end
35+
36+
test "verify verifies Shift-JIS strings are properly encoded" do
37+
with_encoding('SJIS') do
38+
assert ActiveSupport::Multibyte.verify(example('valid Shift-JIS'))
39+
assert !ActiveSupport::Multibyte.verify(example('invalid Shift-JIS'))
40+
end
41+
end
42+
43+
test "verify! raises an exception when it finds an invalid character" do
44+
with_encoding('UTF8') do
45+
assert_raises(ActiveSupport::Multibyte::EncodingError) do
46+
ActiveSupport::Multibyte.verify!(example('invalid UTF-8'))
47+
end
48+
end
49+
end
50+
51+
test "verify! doesn't raise an exception when the encoding is valid" do
52+
with_encoding('UTF8') do
53+
assert_nothing_raised do
54+
ActiveSupport::Multibyte.verify!(example('valid UTF-8'))
55+
end
56+
end
57+
end
58+
59+
if RUBY_VERSION < '1.9'
60+
test "clean leaves ASCII strings intact" do
61+
with_encoding('None') do
62+
[
63+
'word', "\270\236\010\210\245"
64+
].each do |string|
65+
assert_equal string, ActiveSupport::Multibyte.clean(string)
66+
end
67+
end
68+
end
69+
70+
test "clean cleans invalid characters from UTF-8 encoded strings" do
71+
with_encoding('UTF8') do
72+
cleaned_utf8 = [8].pack('C*')
73+
assert_equal example('valid UTF-8'), ActiveSupport::Multibyte.clean(example('valid UTF-8'))
74+
assert_equal cleaned_utf8, ActiveSupport::Multibyte.clean(example('invalid UTF-8'))
75+
end
76+
end
77+
78+
test "clean cleans invalid characters from Shift-JIS encoded strings" do
79+
with_encoding('SJIS') do
80+
cleaned_sjis = [184, 0, 136, 165].pack('C*')
81+
assert_equal example('valid Shift-JIS'), ActiveSupport::Multibyte.clean(example('valid Shift-JIS'))
82+
assert_equal cleaned_sjis, ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
83+
end
84+
end
85+
else
86+
test "clean is a no-op" do
87+
with_encoding('UTF8') do
88+
assert_equal example('invalid Shift-JIS'), ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
89+
end
90+
end
91+
end
92+
93+
private
94+
95+
STRINGS = {
96+
'valid ASCII' => [65, 83, 67, 73, 73].pack('C*'),
97+
'invalid ASCII' => [128].pack('C*'),
98+
'valid UTF-8' => [227, 129, 147, 227, 129, 171, 227, 129, 161, 227, 130, 143].pack('C*'),
99+
'invalid UTF-8' => [184, 158, 8, 136, 165].pack('C*'),
100+
'valid Shift-JIS' => [131, 122, 129, 91, 131, 128].pack('C*'),
101+
'invalid Shift-JIS' => [184, 158, 8, 0, 255, 136, 165].pack('C*')
102+
}
103+
104+
if Kernel.const_defined?(:Encoding)
105+
def example(key)
106+
STRINGS[key].force_encoding(Encoding.default_internal)
107+
end
108+
109+
def examples
110+
STRINGS.values.map { |s| s.force_encoding(Encoding.default_internal) }
111+
end
112+
else
113+
def example(key)
114+
STRINGS[key]
115+
end
116+
117+
def examples
118+
STRINGS.values
119+
end
120+
end
121+
122+
if 'string'.respond_to?(:encoding)
123+
# Runs the given block with Encoding.default_internal temporarily switched:
# 'UTF8' selects UTF-8, 'SJIS' selects Shift_JIS and any other value selects
# BINARY. The previous default is restored in an +ensure+ clause so that a
# failing assertion or any other exception raised inside the block cannot
# leak the temporary encoding into later tests.
def with_encoding(enc)
  before = Encoding.default_internal

  Encoding.default_internal =
    case enc
    when 'UTF8' then Encoding::UTF_8
    when 'SJIS' then Encoding::Shift_JIS
    else Encoding::BINARY
    end

  yield
ensure
  Encoding.default_internal = before
end
138+
else
139+
alias with_encoding with_kcode
140+
end
141+
end

0 commit comments

Comments
 (0)