pluskid / rmmseg-cpp

a re-implementation of rmmseg (Chinese word segmentation library for Ruby) in C++

This URL has Read+Write access

pluskid (author)
Wed Sep 17 07:20:39 -0700 2008
commit  32787caf7efaaae0ff4feb9e3326d4bb5b736ec2
tree    c23315b048667f007455ea5079d5a22bc2a7a6db
parent  dc2b32e9169b094a81d8a53718dc152ace6b2846
rmmseg-cpp / misc / convert.rb
100755 115 lines (99 sloc) 2.255 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/ruby
 
# A utility used to convert the old RMMSeg dictionary
# to rmmseg-cpp format.
 
# There are several constraints for the new rmmseg-cpp
# dictionary format.
# - length of word should be specified in the dict
# - number and string should be separated by ONE space
# - there should be a newline at the end of file
 
$KCODE='u'
require 'jcode'
 
# Print the usage text (optionally preceded by an error message) and
# terminate the script.
#
# msg:: optional error description. When given, it is printed first with
#       an "***ERROR:" prefix and the process exits with status 1 so the
#       calling shell can detect the failure. Without a message only the
#       usage text is printed and the exit status is 0.
def usage(msg=nil)
  puts "***ERROR: #{msg}\n\n" if msg
  puts <<EOT
Usage:
 
#{$0} action type input.dic output.dic
 
action: either 'convert' or 'normalize'
- 'convert' is used to convert the dict from
old RMMSeg format.
- 'normalize' is used to normalize an existing
rmmseg-cpp dict.
 
type: either 'words' or 'chars'
 
EOT
  # A reported error must not look like success to the caller.
  exit(msg ? 1 : 0)
end
 
# Validate the command line: exactly four arguments, a known action in
# ARGV[0] and a known type in ARGV[1]. Any violation prints the usage
# text and terminates the script.
usage unless ARGV.size == 4
usage("unknown action #{ARGV[0]}") unless %w[convert normalize].include?(ARGV[0])
usage("unknown type #{ARGV[1]}") unless %w[words chars].include?(ARGV[1])
 
# Write the dictionary entries to the file named by ARGV[3].
#
# data:: a collection of [num, word] pairs. Each pair with a non-nil word
#        is written as "num word" on its own line; pairs whose word is
#        nil (placeholders for skipped input lines) are ignored.
def output(data)
  File.open(ARGV[3], "w") do |out|
    data.each do |num, word|
      next unless word
      out.puts "#{num} #{word}"
    end
  end
end
 
# Read an old-format RMMSeg character dictionary from the file named by
# ARGV[2]. Each valid line has the form "<char> <frequency>".
#
# Returns one pair per input line: [scaled_frequency, char] for a valid
# line, [nil, nil] for a line that does not match. Frequencies are
# rescaled so that the largest one in the file maps to 65535 (integer
# division).
def read_RMMSeg_chars
  entries = File.readlines(ARGV[2]).map do |line|
    line =~ /^(.)\s+(\d+)$/ ? [$2.to_i, $1] : [nil, nil]
  end

  # Largest frequency seen; stays 0 when nothing matched or all are 0.
  max = entries.inject(0) { |best, (num, _)| num && num > best ? num : best }

  entries.map do |num, word|
    if word
      # When every frequency is 0 there is nothing to scale against;
      # keep 0 instead of dividing by zero.
      [max.zero? ? 0 : num * 65535 / max, word]
    else
      [nil, nil]
    end
  end
end
 
# Read an old-format RMMSeg word list (one word per line) from the file
# named by ARGV[2].
#
# Returns one pair per input line: [character_count, word] for a
# non-empty line (line terminator removed), [nil, nil] for an empty one.
def read_RMMSeg_words
  File.readlines(ARGV[2]).map do |line|
    word = line.chomp
    if word.empty?
      [nil, nil]
    else
      # String#jlength is provided by the jcode library, which only exists
      # on Ruby 1.8. Where it is unavailable, fall back to String#length,
      # which counts characters rather than bytes for strings in a
      # multibyte encoding on newer Rubies.
      count = word.respond_to?(:jlength) ? word.jlength : word.length
      [count, word]
    end
  end
end
 
# Read an rmmseg-cpp character dictionary from the file named by ARGV[2].
# Each valid line has the form "<frequency> <char>".
#
# Returns one pair per input line: [scaled_frequency, char] for a valid
# line, [nil, nil] for a line that does not match. Frequencies are
# rescaled so that the largest one in the file maps to 65535 (integer
# division).
def read_rmmseg_cpp_chars
  entries = File.readlines(ARGV[2]).map do |line|
    line =~ /^(\d+)\s+(.)$/ ? [$1.to_i, $2] : [nil, nil]
  end

  # Largest frequency seen; stays 0 when nothing matched or all are 0.
  max = entries.inject(0) { |best, (num, _)| num && num > best ? num : best }

  entries.map do |num, word|
    if word
      # When every frequency is 0 there is nothing to scale against;
      # keep 0 instead of dividing by zero.
      [max.zero? ? 0 : num * 65535 / max, word]
    else
      [nil, nil]
    end
  end
end
 
# Read an rmmseg-cpp word dictionary from the file named by ARGV[2].
# Each valid line has the form "<length> <word>".
#
# Returns one pair per input line: [length, word] (both as the strings
# captured from the line) for a valid line, [nil, nil] otherwise.
def read_rmmseg_cpp_words
  entry_pattern = /^(\d+)\s+(\w+)$/
  File.readlines(ARGV[2]).map do |entry|
    found = entry_pattern.match(entry)
    found ? [found[1], found[2]] : [nil, nil]
  end
end
 
# Pick the reader that matches the (action, type) pair given on the
# command line and write its result to the output file. An unknown
# combination does nothing.
reader = {
  ['convert', 'chars']   => :read_RMMSeg_chars,
  ['convert', 'words']   => :read_RMMSeg_words,
  ['normalize', 'chars'] => :read_rmmseg_cpp_chars,
  ['normalize', 'words'] => :read_rmmseg_cpp_words
}[ARGV[0,2]]
output(send(reader)) if reader