Use multibyte proxy class on 1.9, refactor Unicode.

Makes String#mb_chars on Ruby 1.9 return an instance of ActiveSupport::Multibyte::Chars to work around 1.9's lack of Unicode case folding. Refactors class methods from ActiveSupport::Multibyte::Chars into new Unicode module, adding other related functionality for consistency. [#4594 state:resolved] Signed-off-by: Jeremy Kemper <jeremy@bitsweat.net>
rails · May 21, 2010 · f3abc8a · f3abc8a
1 parent ad4be3d
commit f3abc8a
Show file tree

Hide file tree

Showing 13 changed files with 715 additions and 701 deletions.
diff --git a/activesupport/CHANGELOG b/activesupport/CHANGELOG
@@ -1,8 +1,10 @@
 *Rails 3.0.0 [beta 4/release candidate] (unreleased)*
 
+* Ruby 1.9: support UTF-8 case folding.  #4595 [Norman Clarke]
+
 * Renames Array#rand -> Array#random_element. [Santiago Pastorino, Rizwan Reza]
 
-* 1.9 compat: Renames last_(month|year) to prev_(month|year) in Date and Time. [fxn]
+* Ruby 1.9: Renames last_(month|year) to prev_(month|year) in Date and Time. [fxn]
 
 * Aliases Date#sunday to Date#end_of_week. [fxn]
 

diff --git a/activesupport/bin/generate_tables b/activesupport/bin/generate_tables
@@ -11,135 +11,138 @@ require 'tmpdir'
 
 module ActiveSupport
   module Multibyte
-    class UnicodeDatabase
-      def load; end
-    end
-
-    class UnicodeDatabaseGenerator
-      BASE_URI = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::UNICODE_VERSION}/ucd/"
-      SOURCES = {
-        :codepoints => BASE_URI + 'UnicodeData.txt',
-        :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
-        :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
-        :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
-      }
-
-      def initialize
-        @ucd = UnicodeDatabase.new
-
-        default = Codepoint.new
-        default.combining_class = 0
-        default.uppercase_mapping = 0
-        default.lowercase_mapping = 0
-        @ucd.codepoints = Hash.new(default)
-      end
+    module Unicode
 
-      def parse_codepoints(line)
-        codepoint = Codepoint.new
-        raise "Could not parse input." unless line =~ /^
-          ([0-9A-F]+);        # code
-          ([^;]+);            # name
-          ([A-Z]+);           # general category
-          ([0-9]+);           # canonical combining class
-          ([A-Z]+);           # bidi class
-          (<([A-Z]*)>)?       # decomposition type
-          ((\ ?[0-9A-F]+)*);  # decompomposition mapping
-          ([0-9]*);           # decimal digit
-          ([0-9]*);           # digit
-          ([^;]*);            # numeric
-          ([YN]*);            # bidi mirrored
-          ([^;]*);            # unicode 1.0 name
-          ([^;]*);            # iso comment
-          ([0-9A-F]*);        # simple uppercase mapping
-          ([0-9A-F]*);        # simple lowercase mapping
-          ([0-9A-F]*)$/ix     # simple titlecase mapping
-        codepoint.code              = $1.hex
-        #codepoint.name              = $2
-        #codepoint.category          = $3
-        codepoint.combining_class   = Integer($4)
-        #codepoint.bidi_class        = $5
-        codepoint.decomp_type       = $7
-        codepoint.decomp_mapping    = ($8=='') ? nil : $8.split.collect { |element| element.hex }
-        #codepoint.bidi_mirrored     = ($13=='Y') ? true : false
-        codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
-        codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
-        #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
-        @ucd.codepoints[codepoint.code] = codepoint
+      class UnicodeDatabase
+        def load; end
       end
 
-      def parse_grapheme_break_property(line)
-        if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
-          type = $2.downcase.intern
-          @ucd.boundary[type] ||= []
-          if $1.include? '..'
-            parts = $1.split '..'
-            @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
-          else
-            @ucd.boundary[type] << $1.hex
+      class DatabaseGenerator
+        BASE_URI = "http://www.unicode.org/Public/#{UNICODE_VERSION}/ucd/"
+        SOURCES = {
+          :codepoints => BASE_URI + 'UnicodeData.txt',
+          :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
+          :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
+          :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
+        }
+
+        def initialize
+          @ucd = Unicode::UnicodeDatabase.new
+
+          default = Codepoint.new
+          default.combining_class = 0
+          default.uppercase_mapping = 0
+          default.lowercase_mapping = 0
+          @ucd.codepoints = Hash.new(default)
+        end
+
+        def parse_codepoints(line)
+          codepoint = Codepoint.new
+          raise "Could not parse input." unless line =~ /^
+            ([0-9A-F]+);        # code
+            ([^;]+);            # name
+            ([A-Z]+);           # general category
+            ([0-9]+);           # canonical combining class
+            ([A-Z]+);           # bidi class
+            (<([A-Z]*)>)?       # decomposition type
+            ((\ ?[0-9A-F]+)*);  # decomposition mapping
+            ([0-9]*);           # decimal digit
+            ([0-9]*);           # digit
+            ([^;]*);            # numeric
+            ([YN]*);            # bidi mirrored
+            ([^;]*);            # unicode 1.0 name
+            ([^;]*);            # iso comment
+            ([0-9A-F]*);        # simple uppercase mapping
+            ([0-9A-F]*);        # simple lowercase mapping
+            ([0-9A-F]*)$/ix     # simple titlecase mapping
+          codepoint.code              = $1.hex
+          #codepoint.name              = $2
+          #codepoint.category          = $3
+          codepoint.combining_class   = Integer($4)
+          #codepoint.bidi_class        = $5
+          codepoint.decomp_type       = $7
+          codepoint.decomp_mapping    = ($8=='') ? nil : $8.split.collect { |element| element.hex }
+          #codepoint.bidi_mirrored     = ($13=='Y') ? true : false
+          codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
+          codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
+          #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
+          @ucd.codepoints[codepoint.code] = codepoint
+        end
+
+        def parse_grapheme_break_property(line)
+          if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
+            type = $2.downcase.intern
+            @ucd.boundary[type] ||= []
+            if $1.include? '..'
+              parts = $1.split '..'
+              @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
+            else
+              @ucd.boundary[type] << $1.hex
+            end
           end
         end
-      end
 
-      def parse_composition_exclusion(line)
-        if line =~ /^([0-9A-F]+)/i
-          @ucd.composition_exclusion << $1.hex
+        def parse_composition_exclusion(line)
+          if line =~ /^([0-9A-F]+)/i
+            @ucd.composition_exclusion << $1.hex
+          end
         end
-      end
 
-      def parse_cp1252(line)
-        if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
-          @ucd.cp1252[$1.hex] = $2.hex
+        def parse_cp1252(line)
+          if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
+            @ucd.cp1252[$1.hex] = $2.hex
+          end
         end
-      end
 
-      def create_composition_map
-        @ucd.codepoints.each do |_, cp|
-          if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
-            @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
-            @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
+        def create_composition_map
+          @ucd.codepoints.each do |_, cp|
+            if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
+              @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
+              @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
+            end
           end
         end
-      end
 
-      def normalize_boundary_map
-        @ucd.boundary.each do |k,v|
-          if [:lf, :cr].include? k
-            @ucd.boundary[k] = v[0]
+        def normalize_boundary_map
+          @ucd.boundary.each do |k,v|
+            if [:lf, :cr].include? k
+              @ucd.boundary[k] = v[0]
+            end
           end
         end
-      end
 
-      def parse
-        SOURCES.each do |type, url|
-          filename =  File.join(Dir.tmpdir, "#{url.split('/').last}")
-          unless File.exist?(filename)
-            $stderr.puts "Downloading #{url.split('/').last}"
-            File.open(filename, 'wb') do |target|
-              open(url) do |source|
-                source.each_line { |line| target.write line }
+        def parse
+          SOURCES.each do |type, url|
+            filename =  File.join(Dir.tmpdir, "#{url.split('/').last}")
+            unless File.exist?(filename)
+              $stderr.puts "Downloading #{url.split('/').last}"
+              File.open(filename, 'wb') do |target|
+                open(url) do |source|
+                  source.each_line { |line| target.write line }
+                end
               end
             end
+            File.open(filename) do |file|
+              file.each_line { |line| send "parse_#{type}".intern, line }
+            end
           end
-          File.open(filename) do |file|
-            file.each_line { |line| send "parse_#{type}".intern, line }
-          end
+          create_composition_map
+          normalize_boundary_map
         end
-        create_composition_map
-        normalize_boundary_map
-      end
 
-      def dump_to(filename)
-        File.open(filename, 'wb') do |f|
-          f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
+        def dump_to(filename)
+          File.open(filename, 'wb') do |f|
+            f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
+          end
         end
       end
     end
   end
 end
 
 if __FILE__ == $0
-  filename = ActiveSupport::Multibyte::UnicodeDatabase.filename
-  generator = ActiveSupport::Multibyte::UnicodeDatabaseGenerator.new
+  filename = ActiveSupport::Multibyte::Unicode::UnicodeDatabase.filename
+  generator = ActiveSupport::Multibyte::Unicode::DatabaseGenerator.new
   generator.parse
   print "Writing to: #{filename}"
   generator.dump_to filename

diff --git a/activesupport/lib/active_support/core_ext/string/multibyte.rb b/activesupport/lib/active_support/core_ext/string/multibyte.rb
@@ -2,7 +2,7 @@
 require 'active_support/multibyte'
 
 class String
-  unless '1.9'.respond_to?(:force_encoding)
+  if '1.9'.respond_to?(:force_encoding)
     # == Multibyte proxy
     #
     # +mb_chars+ is a multibyte safe proxy for string methods.
@@ -37,23 +37,13 @@ class String
     # For more information about the methods defined on the Chars proxy see ActiveSupport::Multibyte::Chars. For
     # information about how to change the default Multibyte behaviour see ActiveSupport::Multibyte.
     def mb_chars
-      if ActiveSupport::Multibyte.proxy_class.wants?(self)
+      if ActiveSupport::Multibyte.proxy_class.consumes?(self)
         ActiveSupport::Multibyte.proxy_class.new(self)
       else
         self
       end
     end
-
-    # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have
-    # them), returns false otherwise.
-    def is_utf8?
-      ActiveSupport::Multibyte::Chars.consumes?(self)
-    end
-  else
-    def mb_chars #:nodoc
-      self
-    end
-
+
     def is_utf8? #:nodoc
       case encoding
       when Encoding::UTF_8
@@ -64,5 +54,19 @@ def is_utf8? #:nodoc
         false
       end
     end
+  else
+    def mb_chars
+      if ActiveSupport::Multibyte.proxy_class.wants?(self)
+        ActiveSupport::Multibyte.proxy_class.new(self)
+      else
+        self
+      end
+    end
+
+    # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have
+    # them), returns false otherwise.
+    def is_utf8?
+      ActiveSupport::Multibyte::Chars.consumes?(self)
+    end
   end
 end
diff --git a/activesupport/lib/active_support/inflector/transliterate.rb b/activesupport/lib/active_support/inflector/transliterate.rb
@@ -58,8 +58,9 @@ module Inflector
     #   transliterate("Jürgen")
     #   # => "Juergen"
     def transliterate(string, replacement = "?")
-      I18n.transliterate(Multibyte::Chars.normalize(
-        Multibyte::Chars.tidy_bytes(string), :c), :replacement => replacement)
+      I18n.transliterate(ActiveSupport::Multibyte::Unicode.normalize(
+        ActiveSupport::Multibyte::Unicode.tidy_bytes(string), :c),
+          :replacement => replacement)
     end
 
     # Replaces special characters in a string so that it may be used as part of a 'pretty' URL.

diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb
@@ -1,30 +1,12 @@
 # encoding: utf-8
-
 require 'active_support/core_ext/module/attribute_accessors'
 
 module ActiveSupport #:nodoc:
   module Multibyte
     autoload :EncodingError, 'active_support/multibyte/exceptions'
     autoload :Chars, 'active_support/multibyte/chars'
-    autoload :UnicodeDatabase, 'active_support/multibyte/unicode_database'
-    autoload :Codepoint, 'active_support/multibyte/unicode_database'
-    autoload :UCD, 'active_support/multibyte/unicode_database'
+    autoload :Unicode, 'active_support/multibyte/unicode'
 
-    # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more
-    # information about normalization.
-    NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
-
-    # The Unicode version that is supported by the implementation
-    UNICODE_VERSION = '5.1.0'
-
-    # The default normalization used for operations that require normalization. It can be set to any of the
-    # normalizations in NORMALIZATION_FORMS.
-    #
-    # Example:
-    #   ActiveSupport::Multibyte.default_normalization_form = :c
-    mattr_accessor :default_normalization_form
-    self.default_normalization_form = :kc
-
     # The proxy class returned when calling mb_chars. You can use this accessor to configure your own proxy
     # class so you can support other encodings. See the ActiveSupport::Multibyte::Chars implementation for
     # an example how to do this.