public
Description: Ruby Multibyte library extracted from ActiveSupport
Homepage:
Clone URL: git://github.com/mattetti/multibyte.git
multibyte / bin / database_generator.rb
100644 154 lines (137 sloc) 4.821 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env ruby
 
# Put the sibling lib/ directory at the front of the load path, then load the
# Multibyte library together with its table generator and UTF-8 handler.
begin
  $:.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib'))
  require 'multibyte'
  require 'multibyte/generators/generate_tables'
  require 'multibyte/handlers/utf8_handler'
  
  # Mix the Handlers module into Multibyte.
  Multibyte.send :include, Multibyte::Handlers
# An IOError raised by any statement above is deliberately ignored; the
# statements after the failing one are skipped.
# NOTE(review): presumably the library raises IOError while the Unicode tables
# file (the file this script generates) does not exist yet -- confirm.
rescue IOError
end
 
require 'open-uri'
require 'tmpdir'
 
module Multibyte
  
  class Multibyte::Handlers::UnicodeDatabase
    def load; end
  end
  
  class UnicodeDatabaseGenerator
    # Base URL of the data files for the Unicode version named by
    # Multibyte::UNICODE_VERSION.
    BASE_URI = "http://www.unicode.org/Public/#{Multibyte::UNICODE_VERSION}/ucd/"
    # Data sets to load, keyed by name, each mapped to the URL of its source
    # file. #parse hands every line of a file to the parse_<key> method.
    SOURCES = {
      :codepoints => BASE_URI + 'UnicodeData.txt',
      :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
      :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
      # The CP1252 mapping is not under the versioned BASE_URI directory.
      :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
    }
 
    def initialize
      @ucd = Multibyte::Handlers::UnicodeDatabase.new
 
      default = Multibyte::Handlers::Codepoint.new
      default.combining_class = 0
      default.uppercase_mapping = 0
      default.lowercase_mapping = 0
      @ucd.codepoints = Hash.new(default)
    end
 
    def parse_codepoints(line)
      codepoint = Multibyte::Handlers::Codepoint.new
      raise "Could not parse input." unless line =~ /^
([0-9A-F]+); # code
([^;]+); # name
([A-Z]+); # general category
([0-9]+); # canonical combining class
([A-Z]+); # bidi class
(<([A-Z]*)>)? # decomposition type
((\ ?[0-9A-F]+)*); # decompomposition mapping
([0-9]*); # decimal digit
([0-9]*); # digit
([^;]*); # numeric
([YN]*); # bidi mirrored
([^;]*); # unicode 1.0 name
([^;]*); # iso comment
([0-9A-F]*); # simple uppercase mapping
([0-9A-F]*); # simple lowercase mapping
([0-9A-F]*)$/ix # simple titlecase mapping
      codepoint.code = $1.hex
      #codepoint.name = $2
      #codepoint.category = $3
      codepoint.combining_class = Integer($4)
      #codepoint.bidi_class = $5
      codepoint.decomp_type = $7
      codepoint.decomp_mapping = ($8=='') ? nil : $8.split.collect { |element| element.hex }
      #codepoint.bidi_mirrored = ($13=='Y') ? true : false
      codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
      codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
      #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
      @ucd.codepoints[codepoint.code] = codepoint
    end
 
    def parse_grapheme_break_property(line)
      if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
        type = $2.downcase.intern
        @ucd.boundary ||= {}
        @ucd.boundary[type] ||= []
        if $1.include? '..'
          parts = $1.split '..'
          @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
        else
          @ucd.boundary[type] << $1.hex
        end
      end
    end
 
    def parse_composition_exclusion(line)
      if line =~ /^([0-9A-F]+)/i
        @ucd.composition_exclusion ||= []
        @ucd.composition_exclusion << $1.hex
      end
    end
 
    def parse_cp1252(line)
      if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
        @ucd.cp1252 ||= {}
        @ucd.cp1252[$1.hex] = $2.hex
      end
    end
 
    def create_composition_map
      @ucd.codepoints.each do |_, cp|
        if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
          @ucd.composition_map ||= {}
          @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
          @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
        end
      end
    end
 
    def normalize_boundary_map
      @ucd.boundary.each do |k,v|
        if [:lf, :cr].include? k
          @ucd.boundary[k] = v[0]
        end
      end
    end
 
    def parse
      SOURCES.each do |type, url|
        filename = File.join(Dir.tmpdir, "#{url.split('/').last}")
        unless File.exist?(filename)
          $stderr.puts "Downloading #{url.split('/').last}"
          File.open(filename, 'wb') do |target|
            open(url) do |source|
              source.each_line { |line| target.write line }
            end
          end
        end
        File.open(filename) do |file|
          file.each_line { |line| send "parse_#{type}".intern, line }
        end
      end
      create_composition_map
      normalize_boundary_map
    end
 
    def dump_to(filename)
      File.open(filename, 'wb') do |f|
        f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
      end
    end
  end
end
 
# When run as a script: build the database from the downloaded sources and
# write it to the path reported by UnicodeDatabase.filename, printing that
# path and the resulting file size.
if $PROGRAM_NAME == __FILE__
  output_path = Multibyte::Handlers::UnicodeDatabase.filename
  builder = Multibyte::UnicodeDatabaseGenerator.new
  builder.parse
  $stdout.print "Writing to: #{output_path}"
  builder.dump_to(output_path)
  $stdout.puts " (#{File.size(output_path)} bytes)"
end