Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

More cleanup

  • Loading branch information...
commit 6dd8dc3364bda6ffc6db4d7ece41f1009262a215 1 parent 140ac78
@CraigCottingham authored
View
372 lib/CharDistributionAnalysis.rb
@@ -28,7 +28,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-# require "UniversalDetector"
require "EUCTWFreq"
require "EUCKRFreq"
require "GB2312Freq"
@@ -36,212 +35,217 @@
require "JISFreq"
module UniversalDetector
- class CharDistributionAnalysis
- ENOUGH_DATA_THRESHOLD = 1024
- SURE_YES = 0.99
- SURE_NO = 0.01
+ class CharDistributionAnalysis
- def initialize
- @_mCharToFreqOrder = nil # Mapping table to get frequency order from char order (get from GetOrder())
- @_mTableSize = nil # Size of above table
- @_mTypicalDistributionRatio = nil # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
- reset()
- end
-
- def reset
- #"""reset analyser, clear any state"""
- @_mDone = false # If this flag is set to constants.True, detection is done and conclusion has been made
- @_mTotalChars = 0 # Total characters encountered
- @_mFreqChars = 0 # The number of characters whose frequency order is less than 512
- end
-
- def feed(aStr, aCharLen)
- #"""feed a character with known length"""
- if aCharLen == 2
- # we only care about 2-bytes character in our distribution analysis
- order = get_order(aStr)
- else
- order = -1
- end
- if order >= 0
- @_mTotalChars += 1
- # order is valid
- if order < @_mTableSize
- if 512 > @_mCharToFreqOrder[order]
- @_mFreqChars += 1
- end
- end
- end
- end
+ ENOUGH_DATA_THRESHOLD = 1024
+ SURE_YES = 0.99
+ SURE_NO = 0.01
- def get_confidence
- #"""return confidence based on existing data"""
- # if we didn"t receive any character in our consideration range, return negative answer
- if @_mTotalChars <= 0
- return SURE_NO
- end
-
- if @_mTotalChars != @_mFreqChars
- r = @_mFreqChars / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
- if r < SURE_YES
- return r
- end
- end
-
- # normalize confidence (we don"t want to be 100% sure)
- return SURE_YES
- end
+ def initialize
+ @_mCharToFreqOrder = nil # Mapping table to get frequency order from char order (get from GetOrder())
+ @_mTableSize = nil # Size of above table
+ @_mTypicalDistributionRatio = nil # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
+ reset()
+ end
- def got_enough_data
- # It is not necessary to receive all data to draw conclusion. For charset detection,
- # certain amount of data is enough
- return @_mTotalChars > ENOUGH_DATA_THRESHOLD
- end
+ def reset
+ #"""reset analyser, clear any state"""
+ @_mDone = false # If this flag is set to constants.True, detection is done and conclusion has been made
+ @_mTotalChars = 0 # Total characters encountered
+ @_mFreqChars = 0 # The number of characters whose frequency order is less than 512
+ end
- def get_order(aStr)
- # We do not handle characters based on the original encoding string, but
- # convert this encoding string to a number, here called order.
- # This allows multiple encodings of a language to share one frequency table.
- return -1
- end
+ def feed(aStr, aCharLen)
+ #"""feed a character with known length"""
+ if aCharLen == 2
+ # we only care about 2-bytes character in our distribution analysis
+ order = get_order(aStr)
+ else
+ order = -1
+ end
+ if order >= 0
+ @_mTotalChars += 1
+ # order is valid
+ if order < @_mTableSize
+ if 512 > @_mCharToFreqOrder[order]
+ @_mFreqChars += 1
+ end
+ end
+ end
end
- class EUCTWDistributionAnalysis < CharDistributionAnalysis
- def initialize
- super
- @_mCharToFreqOrder = EUCTWCharToFreqOrder
- @_mTableSize = EUCTW_TABLE_SIZE
- @_mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
- end
+ def get_confidence
+ #"""return confidence based on existing data"""
+ # if we didn't receive any character in our consideration range, return a negative answer
+ if @_mTotalChars <= 0
+ return SURE_NO
+ end
- def get_order(aStr)
- # for euc-TW encoding, we are interested
- # first byte range: 0xc4 -- 0xfe
- # second byte range: 0xa1 -- 0xfe
- # no validation needed here. State machine has done that
- if aStr[0] >= 0xC4
- return 94 * (aStr[0] - 0xC4) + aStr[1] - 0xA1
- else
- return -1
- end
- end
- end
+ UniversalDetector::log.debug("total chars = #{@_mTotalChars}")
+ UniversalDetector::log.debug("freq chars = #{@_mFreqChars}")
+ UniversalDetector::log.debug("typical distribution ratio = #{@_mTypicalDistributionRatio}")
- class EUCKRDistributionAnalysis < CharDistributionAnalysis
- def initialize
- super
- @_mCharToFreqOrder = EUCKRCharToFreqOrder
- @_mTableSize = EUCKR_TABLE_SIZE
- @_mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
+ if @_mTotalChars != @_mFreqChars
+ r = @_mFreqChars / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
+ if r < SURE_YES
+ return r
end
+ end
- def get_order(aStr)
- # for euc-KR encoding, we are interested
- # first byte range: 0xb0 -- 0xfe
- # second byte range: 0xa1 -- 0xfe
- # no validation needed here. State machine has done that
- if aStr[0] >= 0xB0
- return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
- else
- return -1;
- end
- end
+ # normalize confidence (we don't want to be 100% sure)
+ return SURE_YES
end
- class GB2312DistributionAnalysis < CharDistributionAnalysis
- def initialize
- super
- @_mCharToFreqOrder = GB2312CharToFreqOrder
- @_mTableSize = GB2312_TABLE_SIZE
- @_mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
- end
+ def got_enough_data
+ # It is not necessary to receive all data to draw a conclusion. For charset detection,
+ # a certain amount of data is enough
+ return @_mTotalChars > ENOUGH_DATA_THRESHOLD
+ end
- def get_order(aStr)
- # for GB2312 encoding, we are interested
- # first byte range: 0xb0 -- 0xfe
- # second byte range: 0xa1 -- 0xfe
- # no validation needed here. State machine has done that
- if (aStr[0] >= 0xB0) and (aStr[1] >= 0xA1)
- return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
- else
- return -1;
- end
- end
+ def get_order(aStr)
+ # We do not handle characters based on the original encoding string, but
+ # convert this encoding string to a number, here called order.
+ # This allows multiple encodings of a language to share one frequency table.
+ return -1
+ end
+ end
+
+ class EUCTWDistributionAnalysis < CharDistributionAnalysis
+ def initialize
+ super
+ @_mCharToFreqOrder = EUCTWCharToFreqOrder
+ @_mTableSize = EUCTW_TABLE_SIZE
+ @_mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
end
- class Big5DistributionAnalysis < CharDistributionAnalysis
- def initialize
- super
- @_mCharToFreqOrder = Big5CharToFreqOrder
- @_mTableSize = BIG5_TABLE_SIZE
- @_mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
- end
+ def get_order(aStr)
+ # for euc-TW encoding, we are interested
+ # first byte range: 0xc4 -- 0xfe
+ # second byte range: 0xa1 -- 0xfe
+ # no validation needed here. State machine has done that
+ if aStr[0] >= 0xC4
+ return 94 * (aStr[0] - 0xC4) + aStr[1] - 0xA1
+ else
+ return -1
+ end
+ end
+ end
+
+ class EUCKRDistributionAnalysis < CharDistributionAnalysis
+ def initialize
+ super
+ @_mCharToFreqOrder = EUCKRCharToFreqOrder
+ @_mTableSize = EUCKR_TABLE_SIZE
+ @_mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
+ end
- def get_order(aStr)
- # for big5 encoding, we are interested
- # first byte range: 0xa4 -- 0xfe
- # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
- # no validation needed here. State machine has done that
- if aStr[0] >= 0xA4
- if aStr[1] >= 0xA1
- return 157 * (aStr[0] - 0xA4) + aStr[1] - 0xA1 + 63
- else
- return 157 * (aStr[0] - 0xA4) + aStr[1] - 0x40
- end
- else
- return -1
- end
- end
+ def get_order(aStr)
+ # for euc-KR encoding, we are interested
+ # first byte range: 0xb0 -- 0xfe
+ # second byte range: 0xa1 -- 0xfe
+ # no validation needed here. State machine has done that
+ if aStr[0] >= 0xB0
+ return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
+ else
+ return -1;
+ end
+ end
+ end
+
+ class GB2312DistributionAnalysis < CharDistributionAnalysis
+ def initialize
+ super
+ @_mCharToFreqOrder = GB2312CharToFreqOrder
+ @_mTableSize = GB2312_TABLE_SIZE
+ @_mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
end
- class SJISDistributionAnalysis < CharDistributionAnalysis
- def initialize
- super
- @_mCharToFreqOrder = JISCharToFreqOrder
- @_mTableSize = JIS_TABLE_SIZE
- @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
- end
+ def get_order(aStr)
+ # for GB2312 encoding, we are interested
+ # first byte range: 0xb0 -- 0xfe
+ # second byte range: 0xa1 -- 0xfe
+ # no validation needed here. State machine has done that
+ if (aStr[0] >= 0xB0) and (aStr[1] >= 0xA1)
+ return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
+ else
+ return -1;
+ end
+ end
+ end
+
+ class Big5DistributionAnalysis < CharDistributionAnalysis
+ def initialize
+ super
+ @_mCharToFreqOrder = Big5CharToFreqOrder
+ @_mTableSize = BIG5_TABLE_SIZE
+ @_mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
+ end
- def get_order(aStr)
- # for sjis encoding, we are interested
- # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
- # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
- # no validation needed here. State machine has done that
- if (aStr[0] >= 0x81) && (aStr[0] <= 0x9F)
- order = 188 * (aStr[0] - 0x81)
- elsif (aStr[0] >= 0xE0) and (aStr[0] <= 0xEF)
- order = 188 * (aStr[0] - 0xE0 + 31)
- else
- return -1;
- end
- order = order + aStr[1] - 0x40
- if aStr[1] > 0x7F
- order =- 1
- end
- return order
- end
+ def get_order(aStr)
+ # for big5 encoding, we are interested
+ # first byte range: 0xa4 -- 0xfe
+ # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
+ # no validation needed here. State machine has done that
+ if aStr[0] >= 0xA4
+ if aStr[1] >= 0xA1
+ return 157 * (aStr[0] - 0xA4) + aStr[1] - 0xA1 + 63
+ else
+ return 157 * (aStr[0] - 0xA4) + aStr[1] - 0x40
+ end
+ else
+ return -1
+ end
+ end
+ end
+
+ class SJISDistributionAnalysis < CharDistributionAnalysis
+ def initialize
+ super
+ @_mCharToFreqOrder = JISCharToFreqOrder
+ @_mTableSize = JIS_TABLE_SIZE
+ @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
end
- class EUCJPDistributionAnalysis < CharDistributionAnalysis
- def initialize
- super
- @_mCharToFreqOrder = JISCharToFreqOrder
- @_mTableSize = JIS_TABLE_SIZE
- @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
- end
+ def get_order(aStr)
+ # for sjis encoding, we are interested
+ # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
+ # second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
+ # no validation needed here. State machine has done that
+ if (aStr[0] >= 0x81) && (aStr[0] <= 0x9F)
+ order = 188 * (aStr[0] - 0x81)
+ elsif (aStr[0] >= 0xE0) and (aStr[0] <= 0xEF)
+ order = 188 * (aStr[0] - 0xE0 + 31)
+ else
+ return -1;
+ end
+ order = order + aStr[1] - 0x40
+ if aStr[1] > 0x7F
+ order =- 1
+ end
+ return order
+ end
+ end
+
+ class EUCJPDistributionAnalysis < CharDistributionAnalysis
+ def initialize
+ super
+ @_mCharToFreqOrder = JISCharToFreqOrder
+ @_mTableSize = JIS_TABLE_SIZE
+ @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
+ end
- def get_order(aStr)
- # for euc-JP encoding, we are interested
- # first byte range: 0xa0 -- 0xfe
- # second byte range: 0xa1 -- 0xfe
- # no validation needed here. State machine has done that
- if aStr[0] >= 0xA0
- return 94 * (aStr[0] - 0xA1) + aStr[1] - 0xa1
- else
- return -1
- end
- end
+ def get_order(aStr)
+ # for euc-JP encoding, we are interested
+ # first byte range: 0xa0 -- 0xfe
+ # second byte range: 0xa1 -- 0xfe
+ # no validation needed here. State machine has done that
+ if aStr[0] >= 0xA0
+ return (94 * (aStr[0] - 0xA1)) + (aStr[1] - 0xA1)
+ else
+ return -1
+ end
end
+ end
end
View
143 lib/CharSetGroupProber.rb
@@ -28,89 +28,88 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-# require 'UniversalDetector'
require 'CharSetProber'
module UniversalDetector
- class CharSetGroupProber < CharSetProber
- def initialize
- @_mActiveNum = 0
- @_mProbers = []
- @_mBestGuessProber = nil
- end
+ class CharSetGroupProber < CharSetProber
+ def initialize
+ @_mActiveNum = 0
+ @_mProbers = []
+ @_mBestGuessProber = nil
+ end
- def reset
- super
- @_mActiveNum = 0
- for prober in @_mProbers
- if prober
- prober.reset()
- prober.active = true
- @_mActiveNum += 1
- end
- end
- @_mBestGuessProber = nil
+ def reset
+ super
+ @_mActiveNum = 0
+ for prober in @_mProbers
+ if prober
+ prober.reset()
+ prober.active = true
+ @_mActiveNum += 1
end
+ end
+ @_mBestGuessProber = nil
+ end
- def get_charset_name
- unless @_mBestGuessProber
- get_confidence()
- unless @_mBestGuessProber then return nil end
- # @_mBestGuessProber = @_mProbers[0]
- end
- return @_mBestGuessProber.get_charset_name()
- end
+ def get_charset_name
+ unless @_mBestGuessProber
+ get_confidence()
+ unless @_mBestGuessProber then return nil end
+ # @_mBestGuessProber = @_mProbers[0]
+ end
+ return @_mBestGuessProber.get_charset_name()
+ end
- def feed(aBuf)
- for prober in @_mProbers
- unless prober then next end
- unless prober.active then next end
- st = prober.feed(aBuf)
- unless st then next end
- if st == :FoundIt
- @_mBestGuessProber = prober
- return get_state()
- elsif st == :NotMe
- prober.active = false
- @_mActiveNum -= 1
- if @_mActiveNum <= 0
- @_mState = :NotMe
- return get_state()
- end
- end
- end
+ def feed(aBuf)
+ for prober in @_mProbers
+ unless prober then next end
+ unless prober.active then next end
+ st = prober.feed(aBuf)
+ unless st then next end
+ if st == :FoundIt
+ @_mBestGuessProber = prober
+ return get_state()
+ elsif st == :NotMe
+ prober.active = false
+ @_mActiveNum -= 1
+ if @_mActiveNum <= 0
+ @_mState = :NotMe
return get_state()
+ end
end
+ end
+ return get_state()
+ end
- def get_confidence
- st = get_state()
- if st == :FoundIt
- return 0.99
- elsif st == :NotMe
- return 0.01
- end
+ def get_confidence
+ st = get_state()
+ if st == :FoundIt
+ return 0.99
+ elsif st == :NotMe
+ return 0.01
+ end
- bestConf = 0.0
- @_mBestGuessProber = nil
- for prober in @_mProbers
- unless prober then next end
- unless prober.active
- if UniversalDetector::DEBUG
- p(prober.get_charset_name() + ' not active\n')
- end
- next
- end
- cf = prober.get_confidence()
- if UniversalDetector::DEBUG
- p('%s confidence = %s\n' % [prober.get_charset_name(), cf])
- end
- if bestConf < cf
- bestConf = cf
- @_mBestGuessProber = prober
- end
- end
- unless @_mBestGuessProber then return 0.0 end
- return bestConf
+ bestConf = 0.0
+ @_mBestGuessProber = nil
+ for prober in @_mProbers
+ unless prober then next end
+ unless prober.active
+ if UniversalDetector::DEBUG
+ p(prober.get_charset_name() + ' not active\n')
+ end
+ next
+ end
+ cf = prober.get_confidence()
+ if UniversalDetector::DEBUG
+ p('%s confidence = %s\n' % [prober.get_charset_name(), cf])
+ end
+ if bestConf < cf
+ bestConf = cf
+ @_mBestGuessProber = prober
end
+ end
+ unless @_mBestGuessProber then return 0.0 end
+ return bestConf
end
+ end
end
View
83 lib/CodingStateMachine.rb
@@ -28,49 +28,58 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-# require 'UniversalDetector'
+module UniversalDetector
-module UniversalDetector
- class CodingStateMachine
- def initialize(sm)
- @_mModel = sm
- @_mCurrentBytePos = 0
- @_mCurrentCharLen = 0
- reset()
- end
+ class CodingStateMachine
+ def initialize(sm)
+ @_mModel = sm
+ @_mCurrentBytePos = 0
+ @_mCurrentCharLen = 0
+ reset()
+ end
+
+ def reset
+ @_mCurrentState = :Start
+ end
+
+ def next_state(b)
+ UniversalDetector::log.debug("in CodingStateMachine::next_state with byte #{[b].pack('c').unpack('H*')}")
+
+ # for each byte we get its class
+ # if it is first byte, we also get byte length
+ byteCls = @_mModel['classTable'][b]
+ if @_mCurrentState == :Start
+ @_mCurrentBytePos = 0
+ @_mCurrentCharLen = @_mModel['charLenTable'][byteCls]
+ end
- def reset
- @_mCurrentState = :Start
- end
+ UniversalDetector::log.debug("current state = #{@_mCurrentState}")
- def next_state(c)
- # for each byte we get its class
- # if it is first byte, we also get byte length
- byteCls = @_mModel['classTable'][c]
+ # from byte's class and stateTable, we get its next state
+ stateValue = {:Start => 0, :Error => 1, :ItsMe => 2}
+ if stateValue[@_mCurrentState]
+ v = stateValue[@_mCurrentState]
+ else
+ v = @_mCurrentState
+ end
- if @_mCurrentState == :Start
- @_mCurrentBytePos = 0
- @_mCurrentCharLen = @_mModel['charLenTable'][byteCls]
- end
- # from byte's class and stateTable, we get its next state
- stateValue = {:Start => 0, :Error => 1, :ItsMe => 2}
- unless stateValue[@_mCurrentState]
- v = @_mCurrentState
- else
- v = stateValue[@_mCurrentState]
- end
- @_mCurrentState = @_mModel['stateTable'][v * @_mModel['classFactor'] + byteCls]
+ UniversalDetector::log.debug("v = #{v}")
- @_mCurrentBytePos += 1
- return @_mCurrentState
- end
+ @_mCurrentState = @_mModel['stateTable'][v * @_mModel['classFactor'] + byteCls]
- def get_current_charlen
- return @_mCurrentCharLen
- end
+ UniversalDetector::log.debug("new state = #{@_mCurrentState}")
- def get_coding_state_machine
- return @_mModel['name']
- end
+ @_mCurrentBytePos += 1
+ return @_mCurrentState
end
+
+ def get_current_charlen
+ return @_mCurrentCharLen
+ end
+
+ def get_coding_state_machine
+ return @_mModel['name']
+ end
+ end
+
end
View
106 lib/EUCJPProber.rb
@@ -28,7 +28,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-# require 'UniversalDetector'
require 'MultiByteCharSetProber'
require 'CodingStateMachine'
require 'CharDistributionAnalysis'
@@ -36,64 +35,65 @@
require 'JapaneseContextAnalysis'
module UniversalDetector
- class EUCJPProber < MultiByteCharSetProber
- def initialize
- super
- @_mCodingSM = CodingStateMachine.new(EUCJPSMModel)
- @_mDistributionAnalyzer = EUCJPDistributionAnalysis.new
- @_mContextAnalyzer = EUCJPContextAnalysis.new
- reset()
- end
-
- def reset
- super
- @_mContextAnalyzer.reset()
- end
- def get_charset_name
- return "EUC-JP"
- end
+ class EUCJPProber < MultiByteCharSetProber
+ def initialize
+ super
+ @_mCodingSM = CodingStateMachine.new(EUCJPSMModel)
+ @_mDistributionAnalyzer = EUCJPDistributionAnalysis.new
+ @_mContextAnalyzer = EUCJPContextAnalysis.new
+ reset()
+ end
- def feed(aBuf)
- aLen = aBuf.length
- for i in 0...aLen
- codingState = @_mCodingSM.next_state(aBuf[i])
- if codingState == :Error
- if DEBUG
- p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
- end
- @_mState = :NotMe
- break
- elsif codingState == :ItsMe
- @_mState = :FoundIt
- break
- elsif codingState == :Start
- charLen = @_mCodingSM.get_current_charlen()
- if i == 0
- @_mLastChar[1] = aBuf[0]
- @_mContextAnalyzer.feed(@_mLastChar, charLen)
- @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
- else
- @_mContextAnalyzer.feed(aBuf[i-1..i+1], charLen)
- @_mDistributionAnalyzer.feed(aBuf[i-1..i+1], charLen)
- end
- end
- end
+ def reset
+ super
+ @_mContextAnalyzer.reset()
+ end
- @_mLastChar[0] = aBuf[aLen - 1]
+ def get_charset_name
+ return "EUC-JP"
+ end
- if get_state() == :Detecting
- if @_mContextAnalyzer.got_enough_data() && (get_confidence() > SHORTCUT_THRESHOLD)
- @_mState = :FoundIt
- end
- end
- return get_state()
+ def feed(aBuf)
+ aBuf.each_with_index do | b, i |
+ codingState = @_mCodingSM.next_state(b)
+ if codingState == :Error
+ UniversalDetector::log.debug '%s prober hit error at byte %s' % [ get_charset_name(), i.to_s ]
+ @_mState = :NotMe
+ break
+ elsif codingState == :ItsMe
+ @_mState = :FoundIt
+ break
+ elsif codingState == :Start
+ charLen = @_mCodingSM.get_current_charlen()
+ if i == 0
+ @_mLastChar[1] = b
+ @_mContextAnalyzer.feed(@_mLastChar, charLen)
+ @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
+ else
+ @_mContextAnalyzer.feed(aBuf[i-1..i+1], charLen)
+ @_mDistributionAnalyzer.feed(aBuf[i-1..i+1], charLen)
+ end
end
+ end
+
+ @_mLastChar[0] = aBuf[-1]
- def get_confidence
- contxtCf = @_mContextAnalyzer.get_confidence()
- distribCf = @_mDistributionAnalyzer.get_confidence()
- return [contxtCf, distribCf].max
+ if get_state() == :Detecting
+ if @_mContextAnalyzer.got_enough_data() && (get_confidence() > SHORTCUT_THRESHOLD)
+ @_mState = :FoundIt
end
+ end
+
+ get_state()
+ end
+
+ def get_confidence
+ contxtCf = @_mContextAnalyzer.get_confidence()
+ UniversalDetector::log.debug('EUCJPProber - context confidence = %s' % contxtCf)
+ distribCf = @_mDistributionAnalyzer.get_confidence()
+ UniversalDetector::log.debug('EUCJPProber - distribution confidence = %s' % distribCf)
+ return [contxtCf, distribCf].max
end
+ end
end
View
41 lib/JISFreq.rb
@@ -28,31 +28,29 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-# require 'UniversalDetector'
-
module UniversalDetector
- # Sampling from about 20M text materials include literature and computer technology
- #
- # Japanese frequency table, applied to both S-JIS and EUC-JP
- # They are sorted in order.
+ # Sampled from about 20M of text material, including literature and computer technology
+ #
+ # Japanese frequency table, applied to both S-JIS and EUC-JP
+ # They are sorted in order.
- # 128 --> 0.77094
- # 256 --> 0.85710
- # 512 --> 0.92635
- # 1024 --> 0.97130
- # 2048 --> 0.99431
- #
- # Ideal Distribution Ratio = 0.92635 / (1-0.92635) = 12.58
- # Random Distribution Ration = 512 / (2965+62+83+86-512) = 0.191
- #
- # Typical Distribution Ratio, 25% of IDR
+ # 128 --> 0.77094
+ # 256 --> 0.85710
+ # 512 --> 0.92635
+ # 1024 --> 0.97130
+ # 2048 --> 0.99431
+ #
+ # Ideal Distribution Ratio = 0.92635 / (1-0.92635) = 12.58
+ # Random Distribution Ratio = 512 / (2965+62+83+86-512) = 0.191
+ #
+ # Typical Distribution Ratio, 25% of IDR
- JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0
+ JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0
- # Char to FreqOrder table ,
- JIS_TABLE_SIZE = 4368
+ # Char to FreqOrder table ,
+ JIS_TABLE_SIZE = 4368
- JISCharToFreqOrder = [ \
+ JISCharToFreqOrder = [
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32
1262,5072, 619, 127,3431,3512,3230,1899,1700, 232, 228,1294,1298, 284, 283,2041, # 48
@@ -570,6 +568,7 @@ module UniversalDetector
8208,8209,8210,8211,8212,8213,8214,8215,8216,8217,8218,8219,8220,8221,8222,8223, # 8224
8224,8225,8226,8227,8228,8229,8230,8231,8232,8233,8234,8235,8236,8237,8238,8239, # 8240
8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, # 8256
- 8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271] # 8272
+ 8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271, # 8272
+ ]
end
View
207 lib/JapaneseContextAnalysis.rb
@@ -28,17 +28,15 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-# require 'UniversalDetector'
-
module UniversalDetector
- NUM_OF_CATEGORY = 6
- DONT_KNOW = -1
- ENOUGH_REL_THRESHOLD = 100
- MAX_REL_THRESHOLD = 1000
- MINIMUM_DATA_THRESHOLD = 4
-
- # This is hiragana 2-char sequence table, the number in each cell represents its frequency category
- Jp2CharContext = [ \
+ NUM_OF_CATEGORY = 6
+ DONT_KNOW = -1
+ ENOUGH_REL_THRESHOLD = 100
+ MAX_REL_THRESHOLD = 1000
+ MINIMUM_DATA_THRESHOLD = 4
+
+ # This is hiragana 2-char sequence table, the number in each cell represents its frequency category
+ Jp2CharContext = [
[0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1],
[2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4],
[0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2],
@@ -121,111 +119,112 @@ module UniversalDetector
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,1,1,0,0,0,4,2,1,1,0,1,0,3,2,0,0,3,1,1,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,2,0,0,0,1,4,0,4,2,1,0,0,0,0,0,1],
[0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,3,1,0,0,0,2,0,2,1,0,0,1,2,1,0,1,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,3,1,0,0,0,0,0,1,0,0,2,1,0,0,0,0,0,0,0,0,2],
[0,4,0,4,0,4,0,3,0,4,4,3,4,2,4,3,2,0,4,4,4,3,5,3,5,3,3,2,4,2,4,3,4,3,1,4,0,2,3,4,4,4,3,3,3,4,4,4,3,4,1,3,4,3,2,1,2,1,3,3,3,4,4,3,3,5,0,4,0,3,0,4,3,3,3,2,1,0,3,0,0,3,3],
- [0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1],]
-
- class JapaneseContextAnalysis
- def initialize
- reset()
- end
+ [0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1],
+ ]
- def reset
- @_mTotalRel = 0 # total sequence received
- @_mRelSample = [0] * NUM_OF_CATEGORY # category counters, each interger counts sequence in its category
- @_mNeedToSkipCharNum = 0 # if last byte in current buffer is not the last byte of a character, we need to know how many bytes to skip in next buffer
- @_mLastCharOrder = -1 # The order of previous char
- @_mDone = false # If this flag is set to constants.True, detection is done and conclusion has been made
- end
-
- def feed(aBuf, aLen)
- if @_mDone then return end
-
- # The buffer we got is byte oriented, and a character may span in more than one
- # buffers. In case the last one or two byte in last buffer is not complete, we
- # record how many byte needed to complete that character and skip these bytes here.
- # We can choose to record those bytes as well and analyse the character once it
- # is complete, but since a character will not make much difference, by simply skipping
- # this character will simply our logic and improve performance.
- i = @_mNeedToSkipCharNum
- while i < aLen
- order, charLen = get_order(aBuf[i..i+2])
- i += charLen
- if i > aLen
- @_mNeedToSkipCharNum = i - aLen
- @_mLastCharOrder = -1
- else
- if (order != -1) and (@_mLastCharOrder != -1)
- @_mTotalRel += 1
- if @_mTotalRel > MAX_REL_THRESHOLD
- @_mDone = true
- break
- end
- @_mRelSample[Jp2CharContext[@_mLastCharOrder][order]] += 1
- end
- @_mLastCharOrder = order
- end
- end
- end
+ class JapaneseContextAnalysis
+ def initialize
+ reset()
+ end
- def got_enough_data
- return @_mTotalRel > ENOUGH_REL_THRESHOLD
- end
+ def reset
+ @_mTotalRel = 0 # total sequence received
+ @_mRelSample = [0] * NUM_OF_CATEGORY # category counters, each interger counts sequence in its category
+ @_mNeedToSkipCharNum = 0 # if last byte in current buffer is not the last byte of a character, we need to know how many bytes to skip in next buffer
+ @_mLastCharOrder = -1 # The order of previous char
+ @_mDone = false # If this flag is set to constants.True, detection is done and conclusion has been made
+ end
- def get_confidence
- # This is just one way to calculate confidence. It works well for me.
- if @_mTotalRel > MINIMUM_DATA_THRESHOLD
- return (@_mTotalRel - @_mRelSample[0]) / @_mTotalRel
- else
- return DONT_KNOW
+ def feed(aBuf, aLen)
+ if @_mDone then return end
+
+ # The buffer we get is byte-oriented, and a character may span more than one
+ # buffer. If the last one or two bytes of the previous buffer did not complete a
+ # character, we record how many bytes are needed to complete it and skip those bytes here.
+ # We could record those bytes as well and analyse the character once it
+ # is complete, but since a single character will not make much difference, simply skipping
+ # it simplifies our logic and improves performance.
+ i = @_mNeedToSkipCharNum
+ while i < aLen
+ order, charLen = get_order(aBuf[i..i+2])
+ i += charLen
+ if i > aLen
+ @_mNeedToSkipCharNum = i - aLen
+ @_mLastCharOrder = -1
+ else
+ if (order != -1) and (@_mLastCharOrder != -1)
+ @_mTotalRel += 1
+ if @_mTotalRel > MAX_REL_THRESHOLD
+ @_mDone = true
+ break
end
+ @_mRelSample[Jp2CharContext[@_mLastCharOrder][order]] += 1
+ end
+ @_mLastCharOrder = order
end
-
- def get_order(aStr)
- return -1, 1
- end
+ end
end
- class SJISContextAnalysis < JapaneseContextAnalysis
- def get_order(aStr)
- unless aStr then return -1, 1 end
- # find out current char's byte length
- if ((aStr[0] >= 0x81) and (aStr[0] <= 0x9F)) or ((aStr[0] >= 0xE0) and (aStr[0] <= 0xFC))
- charLen = 2
- else
- charLen = 1
- end
-
- # return its order if it is hiragana
- if aStr.length > 1
- if (aStr[0] == '\202') and (aStr[1] >= 0x9F) and (aStr[1] <= 0xF1)
- return ord(aStr[1]) - 0x9F, charLen
- end
- end
-
- return -1, charLen
- end
+ def got_enough_data
+ return @_mTotalRel > ENOUGH_REL_THRESHOLD
end
- class EUCJPContextAnalysis < JapaneseContextAnalysis
- def get_order(aStr)
- unless aStr then return -1, 1 end
- # find out current char's byte length
- # aStr = aStr.to_s
- if (aStr[0] == 0x8E) or ((aStr[0] >= 0xA1) and (aStr[0] <= 0xFE))
- charLen = 2
- elsif aStr[0] == 0x8F
- charLen = 3
- else
- charLen = 1
- end
+ def get_confidence
+ # This is just one way to calculate confidence. It works well for me.
+ if @_mTotalRel > MINIMUM_DATA_THRESHOLD
+ return (@_mTotalRel - @_mRelSample[0]) / @_mTotalRel
+ else
+ return DONT_KNOW
+ end
+ end
- # return its order if it is hiragana
- if aStr.length > 1
- if (aStr[0] == 0xA4) and (aStr[1] >= 0xA1) and (aStr[1] <= 0xF3)
- return aStr[1][0] - 0xA1, charLen
- end
- end
+ def get_order(aStr)
+ return -1, 1
+ end
+ end
+
+ class SJISContextAnalysis < JapaneseContextAnalysis
+ def get_order(aStr)
+ unless aStr then return -1, 1 end
+ # find out current char's byte length
+ if ((aStr[0] >= 0x81) and (aStr[0] <= 0x9F)) or ((aStr[0] >= 0xE0) and (aStr[0] <= 0xFC))
+ charLen = 2
+ else
+ charLen = 1
+ end
+
+ # return its order if it is hiragana
+ if aStr.length > 1
+ if (aStr[0] == '\202') and (aStr[1] >= 0x9F) and (aStr[1] <= 0xF1)
+ return ord(aStr[1]) - 0x9F, charLen
+ end
+ end
- return -1, charLen
+ return -1, charLen
+ end
+ end
+
+ class EUCJPContextAnalysis < JapaneseContextAnalysis
+ def get_order(aStr)
+ unless aStr then return -1, 1 end
+ # find out current char's byte length
+ # aStr = aStr.to_s
+ if (aStr[0] == 0x8E) or ((aStr[0] >= 0xA1) and (aStr[0] <= 0xFE))
+ charLen = 2
+ elsif aStr[0] == 0x8F
+ charLen = 3
+ else
+ charLen = 1
+ end
+
+ # return its order if it is hiragana
+ if aStr.length > 1
+ if (aStr[0] == 0xA4) and (aStr[1] >= 0xA1) and (aStr[1] <= 0xF3)
+ return aStr[1][0] - 0xA1, charLen
end
+ end
+
+ return -1, charLen
end
+ end
end
View
23 lib/MBCSGroupProber.rb
@@ -28,7 +28,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-# require 'UniversalDetector'
require 'CharSetGroupProber'
require 'UTF8Prober'
require 'SJISProber'
@@ -39,21 +38,13 @@
require 'EUCTWProber'
module UniversalDetector
- class MBCSGroupProber < CharSetGroupProber
+ class MBCSGroupProber < CharSetGroupProber
+ attr_reader :mProbers
- attr_reader :mProbers
-
- def initialize
- super
- @mProbers = [ \
- UTF8Prober.new,
- SJISProber.new,
- EUCJPProber.new,
- GB2312Prober.new,
- EUCKRProber.new,
- Big5Prober.new,
- EUCTWProber.new]
- reset()
- end
+ def initialize
+ super
+ @mProbers = [ UTF8Prober.new, SJISProber.new, EUCJPProber.new, GB2312Prober.new, EUCKRProber.new, Big5Prober.new, EUCTWProber.new ]
+ reset()
end
+ end
end
View
1,023 lib/MBCSSM.rb
@@ -29,487 +29,544 @@
######################### END LICENSE BLOCK #########################
module UniversalDetector
- BIG5_cls = [ \
- 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
- 1,1,1,1,1,1,0,0, # 08 - 0f
- 1,1,1,1,1,1,1,1, # 10 - 17
- 1,1,1,0,1,1,1,1, # 18 - 1f
- 1,1,1,1,1,1,1,1, # 20 - 27
- 1,1,1,1,1,1,1,1, # 28 - 2f
- 1,1,1,1,1,1,1,1, # 30 - 37
- 1,1,1,1,1,1,1,1, # 38 - 3f
- 2,2,2,2,2,2,2,2, # 40 - 47
- 2,2,2,2,2,2,2,2, # 48 - 4f
- 2,2,2,2,2,2,2,2, # 50 - 57
- 2,2,2,2,2,2,2,2, # 58 - 5f
- 2,2,2,2,2,2,2,2, # 60 - 67
- 2,2,2,2,2,2,2,2, # 68 - 6f
- 2,2,2,2,2,2,2,2, # 70 - 77
- 2,2,2,2,2,2,2,1, # 78 - 7f
- 4,4,4,4,4,4,4,4, # 80 - 87
- 4,4,4,4,4,4,4,4, # 88 - 8f
- 4,4,4,4,4,4,4,4, # 90 - 97
- 4,4,4,4,4,4,4,4, # 98 - 9f
- 4,3,3,3,3,3,3,3, # a0 - a7
- 3,3,3,3,3,3,3,3, # a8 - af
- 3,3,3,3,3,3,3,3, # b0 - b7
- 3,3,3,3,3,3,3,3, # b8 - bf
- 3,3,3,3,3,3,3,3, # c0 - c7
- 3,3,3,3,3,3,3,3, # c8 - cf
- 3,3,3,3,3,3,3,3, # d0 - d7
- 3,3,3,3,3,3,3,3, # d8 - df
- 3,3,3,3,3,3,3,3, # e0 - e7
- 3,3,3,3,3,3,3,3, # e8 - ef
- 3,3,3,3,3,3,3,3, # f0 - f7
- 3,3,3,3,3,3,3,0] # f8 - ff
-
- BIG5_st = [ \
- :Error,:Start,:Start, 3,:Error,:Error,:Error,:Error,#00-07
- :Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:Error,#08-0f
- :Error,:Start,:Start,:Start,:Start,:Start,:Start,:Start]#10-17
-
- Big5CharLenTable = [0, 1, 1, 2, 0]
-
- Big5SMModel = {'classTable' => BIG5_cls,
- 'classFactor' => 5,
- 'stateTable' => BIG5_st,
- 'charLenTable' => Big5CharLenTable,
- 'name' => 'Big5'}
-
- # EUC-JP
-
- EUCJP_cls = [ \
- 4,4,4,4,4,4,4,4, # 00 - 07
- 4,4,4,4,4,4,5,5, # 08 - 0f
- 4,4,4,4,4,4,4,4, # 10 - 17
- 4,4,4,5,4,4,4,4, # 18 - 1f
- 4,4,4,4,4,4,4,4, # 20 - 27
- 4,4,4,4,4,4,4,4, # 28 - 2f
- 4,4,4,4,4,4,4,4, # 30 - 37
- 4,4,4,4,4,4,4,4, # 38 - 3f
- 4,4,4,4,4,4,4,4, # 40 - 47
- 4,4,4,4,4,4,4,4, # 48 - 4f
- 4,4,4,4,4,4,4,4, # 50 - 57
- 4,4,4,4,4,4,4,4, # 58 - 5f
- 4,4,4,4,4,4,4,4, # 60 - 67
- 4,4,4,4,4,4,4,4, # 68 - 6f
- 4,4,4,4,4,4,4,4, # 70 - 77
- 4,4,4,4,4,4,4,4, # 78 - 7f
- 5,5,5,5,5,5,5,5, # 80 - 87
- 5,5,5,5,5,5,1,3, # 88 - 8f
- 5,5,5,5,5,5,5,5, # 90 - 97
- 5,5,5,5,5,5,5,5, # 98 - 9f
- 5,2,2,2,2,2,2,2, # a0 - a7
- 2,2,2,2,2,2,2,2, # a8 - af
- 2,2,2,2,2,2,2,2, # b0 - b7
- 2,2,2,2,2,2,2,2, # b8 - bf
- 2,2,2,2,2,2,2,2, # c0 - c7
- 2,2,2,2,2,2,2,2, # c8 - cf
- 2,2,2,2,2,2,2,2, # d0 - d7
- 2,2,2,2,2,2,2,2, # d8 - df
- 0,0,0,0,0,0,0,0, # e0 - e7
- 0,0,0,0,0,0,0,0, # e8 - ef
- 0,0,0,0,0,0,0,0, # f0 - f7
- 0,0,0,0,0,0,0,5] # f8 - ff
-
- EUCJP_st = [ \
- 3, 4, 3, 5,:Start,:Error,:Error,:Error,#00-07
- :Error,:Error,:Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,#08-0f
- :ItsMe,:ItsMe,:Start,:Error,:Start,:Error,:Error,:Error,#10-17
- :Error,:Error,:Start,:Error,:Error,:Error, 3,:Error,#18-1f
- 3,:Error,:Error,:Error,:Start,:Start,:Start,:Start]#20-27
-
- EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]
-
- EUCJPSMModel = {'classTable' => EUCJP_cls,
- 'classFactor' => 6,
- 'stateTable' => EUCJP_st,
- 'charLenTable' => EUCJPCharLenTable,
- 'name' => 'EUC-JP'}
-
- # EUC-KR
-
- EUCKR_cls = [ \
- 1,1,1,1,1,1,1,1, # 00 - 07
- 1,1,1,1,1,1,0,0, # 08 - 0f
- 1,1,1,1,1,1,1,1, # 10 - 17
- 1,1,1,0,1,1,1,1, # 18 - 1f
- 1,1,1,1,1,1,1,1, # 20 - 27
- 1,1,1,1,1,1,1,1, # 28 - 2f
- 1,1,1,1,1,1,1,1, # 30 - 37
- 1,1,1,1,1,1,1,1, # 38 - 3f
- 1,1,1,1,1,1,1,1, # 40 - 47
- 1,1,1,1,1,1,1,1, # 48 - 4f
- 1,1,1,1,1,1,1,1, # 50 - 57
- 1,1,1,1,1,1,1,1, # 58 - 5f
- 1,1,1,1,1,1,1,1, # 60 - 67
- 1,1,1,1,1,1,1,1, # 68 - 6f
- 1,1,1,1,1,1,1,1, # 70 - 77
- 1,1,1,1,1,1,1,1, # 78 - 7f
- 0,0,0,0,0,0,0,0, # 80 - 87
- 0,0,0,0,0,0,0,0, # 88 - 8f
- 0,0,0,0,0,0,0,0, # 90 - 97
- 0,0,0,0,0,0,0,0, # 98 - 9f
- 0,2,2,2,2,2,2,2, # a0 - a7
- 2,2,2,2,2,3,3,3, # a8 - af
- 2,2,2,2,2,2,2,2, # b0 - b7
- 2,2,2,2,2,2,2,2, # b8 - bf
- 2,2,2,2,2,2,2,2, # c0 - c7
- 2,3,2,2,2,2,2,2, # c8 - cf
- 2,2,2,2,2,2,2,2, # d0 - d7
- 2,2,2,2,2,2,2,2, # d8 - df
- 2,2,2,2,2,2,2,2, # e0 - e7
- 2,2,2,2,2,2,2,2, # e8 - ef
- 2,2,2,2,2,2,2,2, # f0 - f7
- 2,2,2,2,2,2,2,0] # f8 - ff
-
- EUCKR_st = [
- :Error,:Start, 3,:Error,:Error,:Error,:Error,:Error,#00-07
- :ItsMe,:ItsMe,:ItsMe,:ItsMe,:Error,:Error,:Start,:Start]#08-0f
-
- EUCKRCharLenTable = [0, 1, 2, 0]
-
- EUCKRSMModel = {'classTable' => EUCKR_cls,
- 'classFactor' => 4,
- 'stateTable' => EUCKR_st,
- 'charLenTable' => EUCKRCharLenTable,
- 'name' => 'EUC-KR'}
-
- # EUC-TW
-
- EUCTW_cls = [ \
- 2,2,2,2,2,2,2,2, # 00 - 07
- 2,2,2,2,2,2,0,0, # 08 - 0f
- 2,2,2,2,2,2,2,2, # 10 - 17
- 2,2,2,0,2,2,2,2, # 18 - 1f
- 2,2,2,2,2,2,2,2, # 20 - 27
- 2,2,2,2,2,2,2,2, # 28 - 2f
- 2,2,2,2,2,2,2,2, # 30 - 37
- 2,2,2,2,2,2,2,2, # 38 - 3f
- 2,2,2,2,2,2,2,2, # 40 - 47
- 2,2,2,2,2,2,2,2, # 48 - 4f
- 2,2,2,2,2,2,2,2, # 50 - 57
- 2,2,2,2,2,2,2,2, # 58 - 5f
- 2,2,2,2,2,2,2,2, # 60 - 67
- 2,2,2,2,2,2,2,2, # 68 - 6f
- 2,2,2,2,2,2,2,2, # 70 - 77
- 2,2,2,2,2,2,2,2, # 78 - 7f
- 0,0,0,0,0,0,0,0, # 80 - 87
- 0,0,0,0,0,0,6,0, # 88 - 8f
- 0,0,0,0,0,0,0,0, # 90 - 97
- 0,0,0,0,0,0,0,0, # 98 - 9f
- 0,3,4,4,4,4,4,4, # a0 - a7
- 5,5,1,1,1,1,1,1, # a8 - af
- 1,1,1,1,1,1,1,1, # b0 - b7
- 1,1,1,1,1,1,1,1, # b8 - bf
- 1,1,3,1,3,3,3,3, # c0 - c7
- 3,3,3,3,3,3,3,3, # c8 - cf
- 3,3,3,3,3,3,3,3, # d0 - d7
- 3,3,3,3,3,3,3,3, # d8 - df
- 3,3,3,3,3,3,3,3, # e0 - e7
- 3,3,3,3,3,3,3,3, # e8 - ef
- 3,3,3,3,3,3,3,3, # f0 - f7
- 3,3,3,3,3,3,3,0] # f8 - ff
-
- EUCTW_st = [ \
- :Error,:Error,:Start, 3, 3, 3, 4,:Error,#00-07
- :Error,:Error,:Error,:Error,:Error,:Error,:ItsMe,:ItsMe,#08-0f
- :ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:Error,:Start,:Error,#10-17
- :Start,:Start,:Start,:Error,:Error,:Error,:Error,:Error,#18-1f
- 5,:Error,:Error,:Error,:Start,:Error,:Start,:Start,#20-27
- :Start,:Error,:Start,:Start,:Start,:Start,:Start,:Start]#28-2f
-
- EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]
-
- EUCTWSMModel = {'classTable' => EUCTW_cls,
- 'classFactor' => 7,
- 'stateTable' => EUCTW_st,
- 'charLenTable' => EUCTWCharLenTable,
- 'name' => 'x-euc-tw'}
-
- # GB2312
-
- GB2312_cls = [ \
- 1,1,1,1,1,1,1,1, # 00 - 07
- 1,1,1,1,1,1,0,0, # 08 - 0f
- 1,1,1,1,1,1,1,1, # 10 - 17
- 1,1,1,0,1,1,1,1, # 18 - 1f
- 1,1,1,1,1,1,1,1, # 20 - 27
- 1,1,1,1,1,1,1,1, # 28 - 2f
- 3,3,3,3,3,3,3,3, # 30 - 37
- 3,3,1,1,1,1,1,1, # 38 - 3f
- 2,2,2,2,2,2,2,2, # 40 - 47
- 2,2,2,2,2,2,2,2, # 48 - 4f
- 2,2,2,2,2,2,2,2, # 50 - 57
- 2,2,2,2,2,2,2,2, # 58 - 5f
- 2,2,2,2,2,2,2,2, # 60 - 67
- 2,2,2,2,2,2,2,2, # 68 - 6f
- 2,2,2,2,2,2,2,2, # 70 - 77
- 2,2,2,2,2,2,2,4, # 78 - 7f
- 5,6,6,6,6,6,6,6, # 80 - 87
- 6,6,6,6,6,6,6,6, # 88 - 8f
- 6,6,6,6,6,6,6,6, # 90 - 97
- 6,6,6,6,6,6,6,6, # 98 - 9f
- 6,6,6,6,6,6,6,6, # a0 - a7
- 6,6,6,6,6,6,6,6, # a8 - af
- 6,6,6,6,6,6,6,6, # b0 - b7
- 6,6,6,6,6,6,6,6, # b8 - bf
- 6,6,6,6,6,6,6,6, # c0 - c7
- 6,6,6,6,6,6,6,6, # c8 - cf
- 6,6,6,6,6,6,6,6, # d0 - d7
- 6,6,6,6,6,6,6,6, # d8 - df
- 6,6,6,6,6,6,6,6, # e0 - e7
- 6,6,6,6,6,6,6,6, # e8 - ef
- 6,6,6,6,6,6,6,6, # f0 - f7
- 6,6,6,6,6,6,6,0] # f8 - ff
-
- GB2312_st = [ \
- :Error,:Start,:Start,:Start,:Start,:Start, 3,:Error,#00-07
- :Error,:Error,:Error,:Error,:Error,:Error,:ItsMe,:ItsMe,#08-0f
- :ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:Error,:Error,:Start,#10-17
- 4,:Error,:Start,:Start,:Error,:Error,:Error,:Error,#18-1f
- :Error,:Error, 5,:Error,:Error,:Error,:ItsMe,:Error,#20-27
- :Error,:Error,:Start,:Start,:Start,:Start,:Start,:Start]#28-2f
-
- # To be accurate, the length of class 6 can be either 2 or 4.
- # But it is not necessary to discriminate between the two since
- # it is used for frequency analysis only, and we are validing
- # each code range there as well. So it is safe to set it to be
- # 2 here.
- GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]
-
- GB2312SMModel = {'classTable' => GB2312_cls,
- 'classFactor' => 7,
- 'stateTable' => GB2312_st,
- 'charLenTable' => GB2312CharLenTable,
- 'name' => 'GB2312'}
-
- # Shift_JIS
-
- SJIS_cls = [ \
- 1,1,1,1,1,1,1,1, # 00 - 07
- 1,1,1,1,1,1,0,0, # 08 - 0f
- 1,1,1,1,1,1,1,1, # 10 - 17
- 1,1,1,0,1,1,1,1, # 18 - 1f
- 1,1,1,1,1,1,1,1, # 20 - 27
- 1,1,1,1,1,1,1,1, # 28 - 2f
- 1,1,1,1,1,1,1,1, # 30 - 37
- 1,1,1,1,1,1,1,1, # 38 - 3f
- 2,2,2,2,2,2,2,2, # 40 - 47
- 2,2,2,2,2,2,2,2, # 48 - 4f
- 2,2,2,2,2,2,2,2, # 50 - 57
- 2,2,2,2,2,2,2,2, # 58 - 5f
- 2,2,2,2,2,2,2,2, # 60 - 67
- 2,2,2,2,2,2,2,2, # 68 - 6f
- 2,2,2,2,2,2,2,2, # 70 - 77
- 2,2,2,2,2,2,2,1, # 78 - 7f
- 3,3,3,3,3,3,3,3, # 80 - 87
- 3,3,3,3,3,3,3,3, # 88 - 8f
- 3,3,3,3,3,3,3,3, # 90 - 97
- 3,3,3,3,3,3,3,3, # 98 - 9f
- #0xa0 is illegal in sjis encoding, but some pages does
- #contain such byte. We need to be more error forgiven.
- 2,2,2,2,2,2,2,2, # a0 - a7
- 2,2,2,2,2,2,2,2, # a8 - af
- 2,2,2,2,2,2,2,2, # b0 - b7
- 2,2,2,2,2,2,2,2, # b8 - bf
- 2,2,2,2,2,2,2,2, # c0 - c7
- 2,2,2,2,2,2,2,2, # c8 - cf
- 2,2,2,2,2,2,2,2, # d0 - d7
- 2,2,2,2,2,2,2,2, # d8 - df
- 3,3,3,3,3,3,3,3, # e0 - e7
- 3,3,3,3,3,4,4,4, # e8 - ef
- 4,4,4,4,4,4,4,4, # f0 - f7
- 4,4,4,4,4,0,0,0] # f8 - ff
-
- SJIS_st = [ \
- :Error,:Start,:Start, 3,:Error,:Error,:Error,:Error,#00-07
- :Error,:Error,:Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,#08-0f
- :ItsMe,:ItsMe,:Error,:Error,:Start,:Start,:Start,:Start]#10-17
-
- SJISCharLenTable = [0, 1, 1, 2, 0, 0]
-
- SJISSMModel = {'classTable' => SJIS_cls,
- 'classFactor' => 6,
- 'stateTable' => SJIS_st,
- 'charLenTable' => SJISCharLenTable,
- 'name' => 'Shift_JIS'}
-
- # UCS2-BE
-
- UCS2BE_cls = [ \
- 0,0,0,0,0,0,0,0, # 00 - 07
- 0,0,1,0,0,2,0,0, # 08 - 0f
- 0,0,0,0,0,0,0,0, # 10 - 17
- 0,0,0,3,0,0,0,0, # 18 - 1f
- 0,0,0,0,0,0,0,0, # 20 - 27
- 0,3,3,3,3,3,0,0, # 28 - 2f
- 0,0,0,0,0,0,0,0, # 30 - 37
- 0,0,0,0,0,0,0,0, # 38 - 3f
- 0,0,0,0,0,0,0,0, # 40 - 47
- 0,0,0,0,0,0,0,0, # 48 - 4f
- 0,0,0,0,0,0,0,0, # 50 - 57
- 0,0,0,0,0,0,0,0, # 58 - 5f
- 0,0,0,0,0,0,0,0, # 60 - 67
- 0,0,0,0,0,0,0,0, # 68 - 6f
- 0,0,0,0,0,0,0,0, # 70 - 77
- 0,0,0,0,0,0,0,0, # 78 - 7f
- 0,0,0,0,0,0,0,0, # 80 - 87
- 0,0,0,0,0,0,0,0, # 88 - 8f
- 0,0,0,0,0,0,0,0, # 90 - 97
- 0,0,0,0,0,0,0,0, # 98 - 9f
- 0,0,0,0,0,0,0,0, # a0 - a7
- 0,0,0,0,0,0,0,0, # a8 - af
- 0,0,0,0,0,0,0,0, # b0 - b7
- 0,0,0,0,0,0,0,0, # b8 - bf
- 0,0,0,0,0,0,0,0, # c0 - c7
- 0,0,0,0,0,0,0,0, # c8 - cf
- 0,0,0,0,0,0,0,0, # d0 - d7
- 0,0,0,0,0,0,0,0, # d8 - df
- 0,0,0,0,0,0,0,0, # e0 - e7
- 0,0,0,0,0,0,0,0, # e8 - ef
- 0,0,0,0,0,0,0,0, # f0 - f7
- 0,0,0,0,0,0,4,5] # f8 - ff
-
- UCS2BE_st = [ \
- 5, 7, 7,:Error, 4, 3,:Error,:Error,#00-07
- :Error,:Error,:Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,#08-0f
- :ItsMe,:ItsMe, 6, 6, 6, 6,:Error,:Error,#10-17
- 6, 6, 6, 6, 6,:ItsMe, 6, 6,#18-1f
- 6, 6, 6, 6, 5, 7, 7,:Error,#20-27
- 5, 8, 6, 6,:Error, 6, 6, 6,#28-2f
- 6, 6, 6, 6,:Error,:Error,:Start,:Start]#30-37
-
- UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]
-
- UCS2BESMModel = {'classTable' => UCS2BE_cls,
- 'classFactor' => 6,
- 'stateTable' => UCS2BE_st,
- 'charLenTable' => UCS2BECharLenTable,
- 'name' => 'UTF-16BE'}
-
- # UCS2-LE
-
- UCS2LE_cls = [ \
- 0,0,0,0,0,0,0,0, # 00 - 07
- 0,0,1,0,0,2,0,0, # 08 - 0f
- 0,0,0,0,0,0,0,0, # 10 - 17
- 0,0,0,3,0,0,0,0, # 18 - 1f
- 0,0,0,0,0,0,0,0, # 20 - 27
- 0,3,3,3,3,3,0,0, # 28 - 2f
- 0,0,0,0,0,0,0,0, # 30 - 37
- 0,0,0,0,0,0,0,0, # 38 - 3f
- 0,0,0,0,0,0,0,0, # 40 - 47
- 0,0,0,0,0,0,0,0, # 48 - 4f
- 0,0,0,0,0,0,0,0, # 50 - 57
- 0,0,0,0,0,0,0,0, # 58 - 5f
- 0,0,0,0,0,0,0,0, # 60 - 67
- 0,0,0,0,0,0,0,0, # 68 - 6f
- 0,0,0,0,0,0,0,0, # 70 - 77
- 0,0,0,0,0,0,0,0, # 78 - 7f
- 0,0,0,0,0,0,0,0, # 80 - 87
- 0,0,0,0,0,0,0,0, # 88 - 8f
- 0,0,0,0,0,0,0,0, # 90 - 97
- 0,0,0,0,0,0,0,0, # 98 - 9f
- 0,0,0,0,0,0,0,0, # a0 - a7
- 0,0,0,0,0,0,0,0, # a8 - af
- 0,0,0,0,0,0,0,0, # b0 - b7
- 0,0,0,0,0,0,0,0, # b8 - bf
- 0,0,0,0,0,0,0,0, # c0 - c7
- 0,0,0,0,0,0,0,0, # c8 - cf
- 0,0,0,0,0,0,0,0, # d0 - d7
- 0,0,0,0,0,0,0,0, # d8 - df
- 0,0,0,0,0,0,0,0, # e0 - e7
- 0,0,0,0,0,0,0,0, # e8 - ef
- 0,0,0,0,0,0,0,0, # f0 - f7
- 0,0,0,0,0,0,4,5] # f8 - ff
-
- UCS2LE_st = [ \
- 6, 6, 7, 6, 4, 3,:Error,:Error,#00-07
- :Error,:Error,:Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,#08-0f
- :ItsMe,:ItsMe, 5, 5, 5,:Error,:ItsMe,:Error,#10-17
- 5, 5, 5,:Error, 5,:Error, 6, 6,#18-1f
- 7, 6, 8, 8, 5, 5, 5,:Error,#20-27
- 5, 5, 5,:Error,:Error,:Error, 5, 5,#28-2f
- 5, 5, 5,:Error, 5,:Error,:Start,:Start]#30-37
-
- UCS2LECharLenTable = [2, 2, 2, 2, 2, 2]
-
- UCS2LESMModel = {'classTable' => UCS2LE_cls,
- 'classFactor' => 6,
- 'stateTable' => UCS2LE_st,
- 'charLenTable' => UCS2LECharLenTable,
- 'name' => 'UTF-16LE'}
-
- # UTF-8
-
- UTF8_cls = [ \
- 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
- 1,1,1,1,1,1,0,0, # 08 - 0f
- 1,1,1,1,1,1,1,1, # 10 - 17
- 1,1,1,0,1,1,1,1, # 18 - 1f
- 1,1,1,1,1,1,1,1, # 20 - 27
- 1,1,1,1,1,1,1,1, # 28 - 2f
- 1,1,1,1,1,1,1,1, # 30 - 37
- 1,1,1,1,1,1,1,1, # 38 - 3f
- 1,1,1,1,1,1,1,1, # 40 - 47
- 1,1,1,1,1,1,1,1, # 48 - 4f
- 1,1,1,1,1,1,1,1, # 50 - 57
- 1,1,1,1,1,1,1,1, # 58 - 5f
- 1,1,1,1,1,1,1,1, # 60 - 67
- 1,1,1,1,1,1,1,1, # 68 - 6f
- 1,1,1,1,1,1,1,1, # 70 - 77
- 1,1,1,1,1,1,1,1, # 78 - 7f
- 2,2,2,2,3,3,3,3, # 80 - 87
- 4,4,4,4,4,4,4,4, # 88 - 8f
- 4,4,4,4,4,4,4,4, # 90 - 97
- 4,4,4,4,4,4,4,4, # 98 - 9f
- 5,5,5,5,5,5,5,5, # a0 - a7
- 5,5,5,5,5,5,5,5, # a8 - af
- 5,5,5,5,5,5,5,5, # b0 - b7
- 5,5,5,5,5,5,5,5, # b8 - bf
- 0,0,6,6,6,6,6,6, # c0 - c7
- 6,6,6,6,6,6,6,6, # c8 - cf
- 6,6,6,6,6,6,6,6, # d0 - d7
- 6,6,6,6,6,6,6,6, # d8 - df
- 7,8,8,8,8,8,8,8, # e0 - e7
- 8,8,8,8,8,9,8,8, # e8 - ef
- 10,11,11,11,11,11,11,11, # f0 - f7
- 12,13,13,13,14,15,0,0] # f8 - ff
-
- UTF8_st = [ \
- :Error,:Start,:Error,:Error,:Error,:Error, 12, 10,#00-07
- 9, 11, 8, 7, 6, 5, 4, 3,#08-0f
- :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#10-17
- :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#18-1f
- :ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,#20-27
- :ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,#28-2f
- :Error,:Error, 5, 5, 5, 5,:Error,:Error,#30-37
- :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#38-3f
- :Error,:Error,:Error, 5, 5, 5,:Error,:Error,#40-47
- :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#48-4f
- :Error,:Error, 7, 7, 7, 7,:Error,:Error,#50-57
- :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#58-5f
- :Error,:Error,:Error,:Error, 7, 7,:Error,:Error,#60-67
- :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#68-6f
- :Error,:Error, 9, 9, 9, 9,:Error,:Error,#70-77
- :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#78-7f
- :Error,:Error,:Error,:Error,:Error, 9,:Error,:Error,#80-87
- :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#88-8f
- :Error,:Error, 12, 12, 12, 12,:Error,:Error,#90-97
- :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#98-9f
- :Error,:Error,:Error,:Error,:Error, 12,:Error,:Error,#a0-a7
- :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#a8-af
- :Error,:Error, 12, 12, 12,:Error,:Error,:Error,#b0-b7
- :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error,#b8-bf
- :Error,:Error,:Start,:Start,:Start,:Start,:Error,:Error,#c0-c7
- :Error,:Error,:Error,:Error,:Error,:Error,:Error,:Error]#c8-cf
-
- UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]
-
- UTF8SMModel = {'classTable' => UTF8_cls,
- 'classFactor' => 16,
- 'stateTable' => UTF8_st,
- 'charLenTable' => UTF8CharLenTable,
- 'name' => 'UTF-8'}
+
+ ##
+ ## BIG5
+ ##
+
+ # Byte-class table for Big5: 256 entries, one per byte value 0x00-0xff
+ # (the trailing comment on each row gives the byte range it covers).
+ # Every entry is a class number in 0..4.
+ BIG5_cls = [
+ 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
+ 1,1,1,1,1,1,0,0, # 08 - 0f
+ 1,1,1,1,1,1,1,1, # 10 - 17
+ 1,1,1,0,1,1,1,1, # 18 - 1f
+ 1,1,1,1,1,1,1,1, # 20 - 27
+ 1,1,1,1,1,1,1,1, # 28 - 2f
+ 1,1,1,1,1,1,1,1, # 30 - 37
+ 1,1,1,1,1,1,1,1, # 38 - 3f
+ 2,2,2,2,2,2,2,2, # 40 - 47
+ 2,2,2,2,2,2,2,2, # 48 - 4f
+ 2,2,2,2,2,2,2,2, # 50 - 57
+ 2,2,2,2,2,2,2,2, # 58 - 5f
+ 2,2,2,2,2,2,2,2, # 60 - 67
+ 2,2,2,2,2,2,2,2, # 68 - 6f
+ 2,2,2,2,2,2,2,2, # 70 - 77
+ 2,2,2,2,2,2,2,1, # 78 - 7f
+ 4,4,4,4,4,4,4,4, # 80 - 87
+ 4,4,4,4,4,4,4,4, # 88 - 8f
+ 4,4,4,4,4,4,4,4, # 90 - 97
+ 4,4,4,4,4,4,4,4, # 98 - 9f
+ 4,3,3,3,3,3,3,3, # a0 - a7
+ 3,3,3,3,3,3,3,3, # a8 - af
+ 3,3,3,3,3,3,3,3, # b0 - b7
+ 3,3,3,3,3,3,3,3, # b8 - bf
+ 3,3,3,3,3,3,3,3, # c0 - c7
+ 3,3,3,3,3,3,3,3, # c8 - cf
+ 3,3,3,3,3,3,3,3, # d0 - d7
+ 3,3,3,3,3,3,3,3, # d8 - df
+ 3,3,3,3,3,3,3,3, # e0 - e7
+ 3,3,3,3,3,3,3,3, # e8 - ef
+ 3,3,3,3,3,3,3,3, # f0 - f7
+ 3,3,3,3,3,3,3,0, # f8 - ff
+ ]
+
+ # State table for Big5. Each entry is either one of the symbols
+ # :Start / :Error / :ItsMe or an integer state number; the trailing
+ # comment on each row gives the range of table indices it covers.
+ # NOTE(review): presumably indexed by state * classFactor + byte class --
+ # confirm against the state-machine code that consumes this model.
+ BIG5_st = [
+ :Error,:Start,:Start, 3,:Error,:Error,:Error,:Error, #00-07
+ :Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:Error, #08-0f
+ :Error,:Start,:Start,:Start,:Start,:Start,:Start,:Start, #10-17
+ ]
+
+ # One entry per byte class (0..4).
+ # NOTE(review): presumably the length in bytes of a character whose first
+ # byte belongs to that class -- confirm.
+ Big5CharLenTable = [ 0, 1, 1, 2, 0 ]
+
+ # Bundles the Big5 tables above under string keys; 'classFactor' equals the
+ # number of byte classes used by BIG5_cls.
+ Big5SMModel = {
+ 'classTable' => BIG5_cls,
+ 'classFactor' => 5,
+ 'stateTable' => BIG5_st,
+ 'charLenTable' => Big5CharLenTable,
+ 'name' => 'Big5',
+ }
+
+ ##
+ ## EUC-JP
+ ##
+
+ # Byte-class table for EUC-JP: 256 entries, one per byte value 0x00-0xff
+ # (the trailing comment on each row gives the byte range it covers).
+ # Every entry is a class number in 0..5.
+ EUCJP_cls = [
+ 4,4,4,4,4,4,4,4, # 00 - 07
+ 4,4,4,4,4,4,5,5, # 08 - 0f
+ 4,4,4,4,4,4,4,4, # 10 - 17
+ 4,4,4,5,4,4,4,4, # 18 - 1f
+ 4,4,4,4,4,4,4,4, # 20 - 27
+ 4,4,4,4,4,4,4,4, # 28 - 2f
+ 4,4,4,4,4,4,4,4, # 30 - 37
+ 4,4,4,4,4,4,4,4, # 38 - 3f
+ 4,4,4,4,4,4,4,4, # 40 - 47
+ 4,4,4,4,4,4,4,4, # 48 - 4f
+ 4,4,4,4,4,4,4,4, # 50 - 57
+ 4,4,4,4,4,4,4,4, # 58 - 5f
+ 4,4,4,4,4,4,4,4, # 60 - 67
+ 4,4,4,4,4,4,4,4, # 68 - 6f
+ 4,4,4,4,4,4,4,4, # 70 - 77
+ 4,4,4,4,4,4,4,4, # 78 - 7f
+ 5,5,5,5,5,5,5,5, # 80 - 87
+ 5,5,5,5,5,5,1,3, # 88 - 8f
+ 5,5,5,5,5,5,5,5, # 90 - 97
+ 5,5,5,5,5,5,5,5, # 98 - 9f
+ 5,2,2,2,2,2,2,2, # a0 - a7
+ 2,2,2,2,2,2,2,2, # a8 - af
+ 2,2,2,2,2,2,2,2, # b0 - b7
+ 2,2,2,2,2,2,2,2, # b8 - bf
+ 2,2,2,2,2,2,2,2, # c0 - c7
+ 2,2,2,2,2,2,2,2, # c8 - cf
+ 2,2,2,2,2,2,2,2, # d0 - d7
+ 2,2,2,2,2,2,2,2, # d8 - df
+ 0,0,0,0,0,0,0,0, # e0 - e7
+ 0,0,0,0,0,0,0,0, # e8 - ef
+ 0,0,0,0,0,0,0,0, # f0 - f7
+ 0,0,0,0,0,0,0,5, # f8 - ff
+ ]
+
+ # State table for EUC-JP. Each entry is either one of the symbols
+ # :Start / :Error / :ItsMe or an integer state number; the trailing
+ # comment on each row gives the range of table indices it covers.
+ # NOTE(review): presumably indexed by state * classFactor + byte class --
+ # confirm against the state-machine code that consumes this model.
+ EUCJP_st = [
+ 3, 4, 3, 5,:Start,:Error,:Error,:Error, #00-07
+ :Error,:Error,:Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe, #08-0f
+ :ItsMe,:ItsMe,:Start,:Error,:Start,:Error,:Error,:Error, #10-17
+ :Error,:Error,:Start,:Error,:Error,:Error, 3,:Error, #18-1f
+ 3,:Error,:Error,:Error,:Start,:Start,:Start,:Start, #20-27
+ ]
+
+ # One entry per byte class (0..5).
+ # NOTE(review): presumably the length in bytes of a character whose first
+ # byte belongs to that class -- confirm.
+ EUCJPCharLenTable = [ 2, 2, 2, 3, 1, 0 ]
+
+ # Bundles the EUC-JP tables above under string keys; 'classFactor' equals
+ # the number of byte classes used by EUCJP_cls.
+ EUCJPSMModel = {
+ 'classTable' => EUCJP_cls,
+ 'classFactor' => 6,
+ 'stateTable' => EUCJP_st,
+ 'charLenTable' => EUCJPCharLenTable,
+ 'name' => 'EUC-JP',
+ }
+
+ ##
+ ## EUC-KR
+ ##
+
+ # Byte-class table for EUC-KR: 256 entries, one per byte value 0x00-0xff
+ # (the trailing comment on each row gives the byte range it covers).
+ # Every entry is a class number in 0..3.
+ EUCKR_cls = [
+ 1,1,1,1,1,1,1,1, # 00 - 07
+ 1,1,1,1,1,1,0,0, # 08 - 0f
+ 1,1,1,1,1,1,1,1, # 10 - 17
+ 1,1,1,0,1,1,1,1, # 18 - 1f
+ 1,1,1,1,1,1,1,1, # 20 - 27
+ 1,1,1,1,1,1,1,1, # 28 - 2f
+ 1,1,1,1,1,1,1,1, # 30 - 37
+ 1,1,1,1,1,1,1,1, # 38 - 3f
+ 1,1,1,1,1,1,1,1, # 40 - 47
+ 1,1,1,1,1,1,1,1, # 48 - 4f
+ 1,1,1,1,1,1,1,1, # 50 - 57
+ 1,1,1,1,1,1,1,1, # 58 - 5f
+ 1,1,1,1,1,1,1,1, # 60 - 67
+ 1,1,1,1,1,1,1,1, # 68 - 6f
+ 1,1,1,1,1,1,1,1, # 70 - 77
+ 1,1,1,1,1,1,1,1, # 78 - 7f
+ 0,0,0,0,0,0,0,0, # 80 - 87
+ 0,0,0,0,0,0,0,0, # 88 - 8f
+ 0,0,0,0,0,0,0,0, # 90 - 97
+ 0,0,0,0,0,0,0,0, # 98 - 9f
+ 0,2,2,2,2,2,2,2, # a0 - a7
+ 2,2,2,2,2,3,3,3, # a8 - af
+ 2,2,2,2,2,2,2,2, # b0 - b7
+ 2,2,2,2,2,2,2,2, # b8 - bf
+ 2,2,2,2,2,2,2,2, # c0 - c7
+ 2,3,2,2,2,2,2,2, # c8 - cf
+ 2,2,2,2,2,2,2,2, # d0 - d7
+ 2,2,2,2,2,2,2,2, # d8 - df
+ 2,2,2,2,2,2,2,2, # e0 - e7
+ 2,2,2,2,2,2,2,2, # e8 - ef
+ 2,2,2,2,2,2,2,2, # f0 - f7
+ 2,2,2,2,2,2,2,0, # f8 - ff
+ ]
+
+ # State table for EUC-KR. Each entry is either one of the symbols
+ # :Start / :Error / :ItsMe or an integer state number; the trailing
+ # comment on each row gives the range of table indices it covers.
+ # NOTE(review): presumably indexed by state * classFactor + byte class --
+ # confirm against the state-machine code that consumes this model.
+ EUCKR_st = [
+ :Error,:Start, 3,:Error,:Error,:Error,:Error,:Error, #00-07
+ :ItsMe,:ItsMe,:ItsMe,:ItsMe,:Error,:Error,:Start,:Start, #08-0f
+ ]
+
+ # One entry per byte class (0..3).
+ # NOTE(review): presumably the length in bytes of a character whose first
+ # byte belongs to that class -- confirm.
+ EUCKRCharLenTable = [ 0, 1, 2, 0 ]
+
+ # Bundles the EUC-KR tables above under string keys; 'classFactor' equals
+ # the number of byte classes used by EUCKR_cls.
+ EUCKRSMModel = {
+ 'classTable' => EUCKR_cls,
+ 'classFactor' => 4,
+ 'stateTable' => EUCKR_st,
+ 'charLenTable' => EUCKRCharLenTable,
+ 'name' => 'EUC-KR'
+ }
+
+ ##
+ ## EUC-TW
+ ##
+
+ # Byte-class table for EUC-TW: 256 entries, one per byte value 0x00-0xff
+ # (the trailing comment on each row gives the byte range it covers).
+ # Every entry is a class number in 0..6.
+ EUCTW_cls = [
+ 2,2,2,2,2,2,2,2, # 00 - 07
+ 2,2,2,2,2,2,0,0, # 08 - 0f
+ 2,2,2,2,2,2,2,2, # 10 - 17
+ 2,2,2,0,2,2,2,2, # 18 - 1f
+ 2,2,2,2,2,2,2,2, # 20 - 27
+ 2,2,2,2,2,2,2,2, # 28 - 2f
+ 2,2,2,2,2,2,2,2, # 30 - 37
+ 2,2,2,2,2,2,2,2, # 38 - 3f
+ 2,2,2,2,2,2,2,2, # 40 - 47
+ 2,2,2,2,2,2,2,2, # 48 - 4f
+ 2,2,2,2,2,2,2,2, # 50 - 57
+ 2,2,2,2,2,2,2,2, # 58 - 5f
+ 2,2,2,2,2,2,2,2, # 60 - 67
+ 2,2,2,2,2,2,2,2, # 68 - 6f
+ 2,2,2,2,2,2,2,2, # 70 - 77
+ 2,2,2,2,2,2,2,2, # 78 - 7f
+ 0,0,0,0,0,0,0,0, # 80 - 87
+ 0,0,0,0,0,0,6,0, # 88 - 8f
+ 0,0,0,0,0,0,0,0, # 90 - 97
+ 0,0,0,0,0,0,0,0, # 98 - 9f
+ 0,3,4,4,4,4,4,4, # a0 - a7
+ 5,5,1,1,1,1,1,1, # a8 - af
+ 1,1,1,1,1,1,1,1, # b0 - b7
+ 1,1,1,1,1,1,1,1, # b8 - bf
+ 1,1,3,1,3,3,3,3, # c0 - c7
+ 3,3,3,3,3,3,3,3, # c8 - cf
+ 3,3,3,3,3,3,3,3, # d0 - d7
+ 3,3,3,3,3,3,3,3, # d8 - df
+ 3,3,3,3,3,3,3,3, # e0 - e7
+ 3,3,3,3,3,3,3,3, # e8 - ef
+ 3,3,3,3,3,3,3,3, # f0 - f7
+ 3,3,3,3,3,3,3,0, # f8 - ff
+ ]
+
+ # State table for EUC-TW. Each entry is either one of the symbols
+ # :Start / :Error / :ItsMe or an integer state number; the trailing
+ # comment on each row gives the range of table indices it covers.
+ # NOTE(review): presumably indexed by state * classFactor + byte class --
+ # confirm against the state-machine code that consumes this model.
+ EUCTW_st = [
+ :Error,:Error,:Start, 3, 3, 3, 4,:Error, #00-07
+ :Error,:Error,:Error,:Error,:Error,:Error,:ItsMe,:ItsMe, #08-0f
+ :ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:Error,:Start,:Error, #10-17
+ :Start,:Start,:Start,:Error,:Error,:Error,:Error,:Error, #18-1f
+ 5,:Error,:Error,:Error,:Start,:Error,:Start,:Start, #20-27
+ :Start,:Error,:Start,:Start,:Start,:Start,:Start,:Start, #28-2f
+ ]
+
+ # One entry per byte class (0..6).
+ # NOTE(review): presumably the length in bytes of a character whose first
+ # byte belongs to that class -- confirm.
+ EUCTWCharLenTable = [ 0, 0, 1, 2, 2, 2, 3 ]
+
+ # Bundles the EUC-TW tables above under string keys; 'classFactor' equals
+ # the number of byte classes used by EUCTW_cls.
+ EUCTWSMModel = {
+ 'classTable' => EUCTW_cls,
+ 'classFactor' => 7,
+ 'stateTable' => EUCTW_st,
+ 'charLenTable' => EUCTWCharLenTable,
+ 'name' => 'x-euc-tw',
+ }
+
+ ##
+ ## GB2312
+ ##
+
+ # Byte-class table for GB2312: 256 entries, one per byte value 0x00-0xff
+ # (the trailing comment on each row gives the byte range it covers).
+ # Every entry is a class number in 0..6.
+ GB2312_cls = [
+ 1,1,1,1,1,1,1,1, # 00 - 07
+ 1,1,1,1,1,1,0,0, # 08 - 0f
+ 1,1,1,1,1,1,1,1, # 10 - 17
+ 1,1,1,0,1,1,1,1, # 18 - 1f
+ 1,1,1,1,1,1,1,1, # 20 - 27
+ 1,1,1,1,1,1,1,1, # 28 - 2f
+ 3,3,3,3,3,3,3,3, # 30 - 37
+ 3,3,1,1,1,1,1,1, # 38 - 3f
+ 2,2,2,2,2,2,2,2, # 40 - 47
+ 2,2,2,2,2,2,2,2, # 48 - 4f
+ 2,2,2,2,2,2,2,2, # 50 - 57
+ 2,2,2,2,2,2,2,2, # 58 - 5f
+ 2,2,2,2,2,2,2,2, # 60 - 67
+ 2,2,2,2,2,2,2,2, # 68 - 6f
+ 2,2,2,2,2,2,2,2, # 70 - 77
+ 2,2,2,2,2,2,2,4, # 78 - 7f
+ 5,6,6,6,6,6,6,6, # 80 - 87
+ 6,6,6,6,6,6,6,6, # 88 - 8f
+ 6,6,6,6,6,6,6,6, # 90 - 97
+ 6,6,6,6,6,6,6,6, # 98 - 9f
+ 6,6,6,6,6,6,6,6, # a0 - a7
+ 6,6,6,6,6,6,6,6, # a8 - af
+ 6,6,6,6,6,6,6,6, # b0 - b7
+ 6,6,6,6,6,6,6,6, # b8 - bf
+ 6,6,6,6,6,6,6,6, # c0 - c7
+ 6,6,6,6,6,6,6,6, # c8 - cf
+ 6,6,6,6,6,6,6,6, # d0 - d7
+ 6,6,6,6,6,6,6,6, # d8 - df
+ 6,6,6,6,6,6,6,6, # e0 - e7
+ 6,6,6,6,6,6,6,6, # e8 - ef
+ 6,6,6,6,6,6,6,6, # f0 - f7
+ 6,6,6,6,6,6,6,0, # f8 - ff
+ ]
+
+ # State table for GB2312. Each entry is either one of the symbols
+ # :Start / :Error / :ItsMe or an integer state number; the trailing
+ # comment on each row gives the range of table indices it covers.
+ # NOTE(review): presumably indexed by state * classFactor + byte class --
+ # confirm against the state-machine code that consumes this model.
+ GB2312_st = [
+ :Error,:Start,:Start,:Start,:Start,:Start, 3,:Error, #00-07
+ :Error,:Error,:Error,:Error,:Error,:Error,:ItsMe,:ItsMe, #08-0f
+ :ItsMe,:ItsMe,:ItsMe,:ItsMe,:ItsMe,:Error,:Error,:Start, #10-17
+ 4,:Error,:Start,:Start,:Error,:Error,:Error,:Error, #18-1f
+ :Error,:Error, 5,:Error,:Error,:Error,:ItsMe,:Error, #20-27
+ :Error,:Error,:Start,:Start,:Start,:Start,:Start,:Start, #28-2f
+ ]
+
+ # One entry per byte class (0..6).
+ # To be accurate, the length of class 6 can be either 2 or 4.
+ # But it is not necessary to discriminate between the two since
+ # it is used for frequency analysis only, and we are validating
+ # each code range there as well. So it is safe to set it to be
+ # 2 here.
+ GB2312CharLenTable = [ 0, 1, 1, 1, 1, 1, 2 ]
+
+ # Bundles the GB2312 tables above under string keys; 'classFactor' equals
+ # the number of byte classes used by GB2312_cls.
+ GB2312SMModel = {
+ 'classTable' => GB2312_cls,
+ 'classFactor' => 7,
+ 'stateTable' => GB2312_st,
+ 'charLenTable' => GB2312CharLenTable,
+ 'name' => 'GB2312',
+ }
+
+ ##
+ ## Shift_JIS
+ ##
+
+ # Byte-class table for Shift_JIS: 256 entries, one per byte value 0x00-0xff
+ # (the trailing comment on each row gives the byte range it covers).
+ # Every entry is a class number in 0..4.
+ SJIS_cls = [
+ 1,1,1,1,1,1,1,1, # 00 - 07
+ 1,1,1,1,1,1,0,0, # 08 - 0f
+ 1,1,1,1,1,1,1,1, # 10 - 17
+ 1,1,1,0,1,1,1,1, # 18 - 1f
+ 1,1,1,1,1,1,1,1, # 20 - 27
+ 1,1,1,1,1,1,1,1, # 28 - 2f
+ 1,1,1,1,1,1,1,1, # 30 - 37
+ 1,1,1,1,1,1,1,1, # 38 - 3f
+ 2,2,2,2,2,2,2,2, # 40 - 47
+ 2,2,2,2,2,2,2,2, # 48 - 4f
+ 2,2,2,2,2,2,2,2, # 50 - 57
+ 2,2,2,2,2,2,2,2, # 58 - 5f
+ 2,2,2,2,2,2,2,2, # 60 - 67
+ 2,2,2,2,2,2,2,2, # 68 - 6f
+ 2,2,2,2,2,2,2,2, # 70 - 77
+ 2,2,2,2,2,2,2,1, # 78 - 7f
+ 3,3,3,3,3,3,3,3, # 80 - 87
+ 3,3,3,3,3,3,3,3, # 88 - 8f
+ 3,3,3,3,3,3,3,3, # 90 - 97
+ 3,3,3,3,3,3,3,3, # 98 - 9f
+ # 0xa0 is illegal in the Shift_JIS encoding, but some pages do
+ # contain such a byte, so we need to be more forgiving of errors.
+ 2,2,2,2,2,2,2,2, # a0 - a7
+ 2,2,2,2,2,2,2,2, # a8 - af
+ 2,2,2,2,2,2,2,2, # b0 - b7
+ 2,2,2,2,2,2,2,2, # b8 - bf
+ 2,2,2,2,2,2,2,2, # c0 - c7
+ 2,2,2,2,2,2,2,2, # c8 - cf
+ 2,2,2,2,2,2,2,2, # d0 - d7
+ 2,2,2,2,2,2,2,2, # d8 - df
+ 3,3,3,3,3,3,3,3, # e0 - e7
+ 3,3,3,3,3,4,4,4, # e8 - ef
+ 4,4,4,4,4,4,4,4, # f0 - f7
+ 4,4,4,4,4,0,0,0, # f8 - ff
+ ]
+
+ # State table for Shift_JIS. Each entry is either one of the symbols
+ # :Start / :Error / :ItsMe or an integer state number; the trailing
+ # comment on each row gives the range of table indices it covers.
+ # NOTE(review): presumably indexed by state * classFactor + byte class --
+ # confirm against the state-machine code that consumes this model.
+ SJIS_st = [
+ :Error,:Start,:Start, 3,:Error,:Error,:Error,:Error, #00-07
+ :Error,:Error,:Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe, #08-0f
+ :ItsMe,:ItsMe,:Error,:Error,:Start,:Start,:Start,:Start, #10-17
+ ]
+
+ # Six entries, matching 'classFactor' below, although SJIS_cls only uses
+ # classes 0..4.
+ # NOTE(review): presumably the length in bytes of a character whose first
+ # byte belongs to that class -- confirm.
+ SJISCharLenTable = [0, 1, 1, 2, 0, 0]
+
+ # Bundles the Shift_JIS tables above under string keys.
+ SJISSMModel = {
+ 'classTable' => SJIS_cls,
+ 'classFactor' => 6,
+ 'stateTable' => SJIS_st,
+ 'charLenTable' => SJISCharLenTable,
+ 'name' => 'Shift_JIS',
+ }
+
+ ##
+ ## UCS2-BE
+ ##
+
+ # Byte-class table for UCS2-BE: 256 entries, one per byte value 0x00-0xff
+ # (the trailing comment on each row gives the byte range it covers).
+ # Every entry is a class number in 0..5.
+ UCS2BE_cls = [
+ 0,0,0,0,0,0,0,0, # 00 - 07
+ 0,0,1,0,0,2,0,0, # 08 - 0f
+ 0,0,0,0,0,0,0,0, # 10 - 17
+ 0,0,0,3,0,0,0,0, # 18 - 1f
+ 0,0,0,0,0,0,0,0, # 20 - 27
+ 0,3,3,3,3,3,0,0, # 28 - 2f
+ 0,0,0,0,0,0,0,0, # 30 - 37
+ 0,0,0,0,0,0,0,0, # 38 - 3f
+ 0,0,0,0,0,0,0,0, # 40 - 47
+ 0,0,0,0,0,0,0,0, # 48 - 4f
+ 0,0,0,0,0,0,0,0, # 50 - 57
+ 0,0,0,0,0,0,0,0, # 58 - 5f
+ 0,0,0,0,0,0,0,0, # 60 - 67
+ 0,0,0,0,0,0,0,0, # 68 - 6f
+ 0,0,0,0,0,0,0,0, # 70 - 77
+ 0,0,0,0,0,0,0,0, # 78 - 7f
+ 0,0,0,0,0,0,0,0, # 80 - 87
+ 0,0,0,0,0,0,0,0, # 88 - 8f
+ 0,0,0,0,0,0,0,0, # 90 - 97
+ 0,0,0,0,0,0,0,0, # 98 - 9f
+ 0,0,0,0,0,0,0,0, # a0 - a7
+ 0,0,0,0,0,0,0,0, # a8 - af
+ 0,0,0,0,0,0,0,0, # b0 - b7
+ 0,0,0,0,0,0,0,0, # b8 - bf
+ 0,0,0,0,0,0,0,0, # c0 - c7
+ 0,0,0,0,0,0,0,0, # c8 - cf
+ 0,0,0,0,0,0,0,0, # d0 - d7
+ 0,0,0,0,0,0,0,0, # d8 - df
+ 0,0,0,0,0,0,0,0, # e0 - e7
+ 0,0,0,0,0,0,0,0, # e8 - ef
+ 0,0,0,0,0,0,0,0, # f0 - f7
+ 0,0,0,0,0,0,4,5, # f8 - ff
+ ]
+
+ # State table for UCS2-BE. Each entry is either one of the symbols
+ # :Start / :Error / :ItsMe or an integer state number; the trailing
+ # comment on each row gives the range of table indices it covers.
+ # NOTE(review): presumably indexed by state * classFactor + byte class --
+ # confirm against the state-machine code that consumes this model.
+ UCS2BE_st = [
+ 5, 7, 7,:Error, 4, 3,:Error,:Error, #00-07
+ :Error,:Error,:Error,:Error,:ItsMe,:ItsMe,:ItsMe,:ItsMe, #08-0f
+ :ItsMe,:ItsMe, 6, 6, 6, 6,:Error,:Error, #10-17
+ 6, 6, 6, 6, 6,:ItsMe, 6, 6, #18-1f
+ 6, 6, 6, 6, 5, 7, 7,:Error, #20-27
+ 5, 8, 6, 6,:Error, 6, 6, 6, #28-2f
+ 6, 6, 6, 6,:Error,:Error,:Start,:Start, #30-37
+ ]
+
+ # One entry per byte class (0..5).
+ # NOTE(review): presumably the length in bytes of a character whose first
+ # byte belongs to that class -- confirm.
+ UCS2BECharLenTable = [ 2, 2, 2, 0, 2, 2 ]
+
+ # Bundles the UCS2-BE tables above under string keys; 'classFactor' equals
+ # the number of byte classes used by UCS2BE_cls. The model is reported
+ # under the name 'UTF-16BE'.
+ UCS2BESMModel = {
+ 'classTable' => UCS2BE_cls,
+ 'classFactor' => 6,
+ 'stateTable' => UCS2BE_st,
+ 'charLenTable' => UCS2BECharLenTable,
+ 'name' => 'UTF-16BE',
+ }
+
+ ##
+ ## UCS2-LE
+ ##
+
+ UCS2LE_cls = [ \
+ 0,0,0,0,0,0,0,0, # 00 - 07
+ 0,0,1,0,0,2,0,0, # 08 - 0f
+ 0,0,0,0,0,0,0,0, # 10 - 17
+ 0,0,0,3,0,0,0,0, # 18 - 1f
+ 0,0,0,0,0,0,0,0, # 20 - 27
+ 0,3,3,3,3,3,0,0, # 28 - 2f
+ 0,0,0,0,0,0,0,0, # 30 - 37
+ 0,0,0,0,0,0,0,0, # 38 - 3f
+ 0,0,0,0,0,0,0,0, # 40 - 47
+ 0,0,0,0,0,0,0,0, # 48 - 4f
+ 0,0,0,0,0,0,0,0, # 50 - 57
+ 0,0,0,0,0,0,0,0, # 58 - 5f
+ 0,0,0,0,0,0,0,0, # 60 - 67
+ 0,0,0