Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

We’re showing branches in this repository, but you can also compare across forks.

...
  • 2 commits
  • 5 files changed
  • 0 commit comments
  • 1 contributor
12 lib/CharSetProber.rb
View
@@ -53,18 +53,18 @@ def get_confidence
end
def filter_high_bit_only(aBuf)
- aBuf = aBuf.gsub(/([\x00-\x7F])+/, '')
- return aBuf
+ # aBuf.gsub(/([\x00-\x7F])+/, '')
+ aBuf.select { | b | (b & 0x80) != 0 }
end
def filter_without_english_letters(aBuf)
- aBuf = aBuf.gsub(/([A-Za-z])+/, '')
- return aBuf
+ # aBuf.gsub(/([A-Za-z])+/, '')
+ aBuf.reject { | b | ((b >= 0x41) && (b <= 0x5A)) || ((b >= 0x61) && (b <= 0x7A)) }
end
def filter_with_english_letters(aBuf)
- # TODO
- return aBuf
+ # aBuf.gsub(/([^A-Za-z])+/, '')
+ aBuf.select { | b | ((b >= 0x41) && (b <= 0x5A)) || ((b >= 0x61) && (b <= 0x7A)) }
end
end #class
13 lib/JapaneseContextAnalysis.rb
View
@@ -188,8 +188,7 @@ class SJISContextAnalysis < JapaneseContextAnalysis
def get_order(aStr)
unless aStr then return -1, 1 end
# find out current char's byte length
- if ((aStr[0] >= 0x81) and (aStr[0] <= 0x9F)) or \
- ((aStr[0] >= 0xE0) and (aStr[0] <= 0xFC))
+ if ((aStr[0] >= 0x81) and (aStr[0] <= 0x9F)) or ((aStr[0] >= 0xE0) and (aStr[0] <= 0xFC))
charLen = 2
else
charLen = 1
@@ -197,9 +196,7 @@ def get_order(aStr)
# return its order if it is hiragana
if aStr.length > 1
- if (aStr[0] == '\202') and \
- (aStr[1] >= 0x9F) and \
- (aStr[1] <= 0xF1)
+ if (aStr[0] == '\202') and (aStr[1] >= 0x9F) and (aStr[1] <= 0xF1)
return ord(aStr[1]) - 0x9F, charLen
end
end
@@ -212,7 +209,7 @@ class EUCJPContextAnalysis < JapaneseContextAnalysis
def get_order(aStr)
unless aStr then return -1, 1 end
# find out current char's byte length
- aStr = aStr.to_s
+ # aStr = aStr.to_s
if (aStr[0] == 0x8E) or ((aStr[0] >= 0xA1) and (aStr[0] <= 0xFE))
charLen = 2
elsif aStr[0] == 0x8F
@@ -223,9 +220,7 @@ def get_order(aStr)
# return its order if it is hiragana
if aStr.length > 1
- if (aStr[0] == 0xA4) and \
- (aStr[1] >= 0xA1) and \
- (aStr[1] <= 0xF3)
+ if (aStr[0] == 0xA4) and (aStr[1] >= 0xA1) and (aStr[1] <= 0xF3)
return aStr[1][0] - 0xA1, charLen
end
end
4 lib/MultiByteCharSetProber.rb
View
@@ -38,7 +38,7 @@ def initialize
super
@_mDistributionAnalyzer = nil
@_mCodingSM = nil
- @_mLastChar = ['\x00', '\x00']
+ @_mLastChar = [ 0x00, 0x00 ]
end
def reset
@@ -49,7 +49,7 @@ def reset
if @_mDistributionAnalyzer
@_mDistributionAnalyzer.reset()
end
- @_mLastChar = ['\x00', '\x00']
+ @_mLastChar = [ 0x00, 0x00 ]
end
def get_charset_name
13 lib/UniversalDetector.rb
View
@@ -75,7 +75,7 @@ class Detector
def initialize
# @_highBitDetector = Regexp.new('[\x80-\xFF]', nil, 'n')
- @_escDetector = /\033|~\{/
+ # @_escDetector = /\033|~\{/
@_mEscCharSetProber = nil
@_mCharSetProbers = []
reset
@@ -96,6 +96,15 @@ def reset
end
end
# Reports whether +data+ contains an escape marker: either the ESC byte
# (0x1B) anywhere in the buffer, or the two-byte sequence 0x7E 0x7B ("~{").
#
# data - an Array of byte values (Integers).
#
# Returns true when either marker is present, false otherwise
# (including for an empty or single-element buffer without ESC).
def contains_escape?(data)
  return true if data.include?(0x1B)

  # Examine every adjacent pair rather than only the first 0x7E: a lone "~"
  # earlier in the buffer must not hide a "~{" sequence that appears later.
  data.each_cons(2).any? { |pair| pair == [0x7E, 0x7B] }
end
+
# Reports whether any element of +data+ has the 0x80 bit set.
#
# data - a collection of Integers that responds to any?.
#
# Returns true if at least one element has bit 0x80 set, false otherwise.
def contains_high_bit?(data)
  data.any? do |byte|
    masked = byte & 0x80
    !masked.zero?
  end
end
@@ -138,7 +147,7 @@ def feed(data)
if @_mInputState == :PureAscii
if contains_high_bit?(data)
@_mInputState = :Highbyte
- elsif (@_mLastChar + data) =~ @_escDetector
+ elsif contains_escape?(@_mLastChar + data)
@_mInputState = :EscAscii
end
end
29 spec/universal_detector_spec.rb
View
@@ -17,4 +17,33 @@
u.contains_high_bit?([ 0x00, 0x7F ]).should eq(false)
end
+end
+
+describe UniversalDetector, '#contains_escape?' do
+
+ it 'returns true if data contains an escape character' do
+ u = UniversalDetector::Detector.instance
+ u.contains_escape?([ 0x77, 0x30, 0x30, 0x1B, 0x74 ]).should eq(true)
+ end
+
+ it 'returns true if data contains an escape sequence' do
+ u = UniversalDetector::Detector.instance
+ u.contains_escape?([ 0x77, 0x30, 0x30, 0x7E, 0x7B, 0x74 ]).should eq(true)
+ end
+
+ it 'returns true if data contains more than one escape character' do
+ u = UniversalDetector::Detector.instance
+ u.contains_escape?([ 0x77, 0x1B, 0x30, 0x30, 0x1B, 0x74 ]).should eq(true)
+ end
+
+ it 'returns true if data contains more than one escape sequence' do
+ u = UniversalDetector::Detector.instance
+ u.contains_escape?([ 0x77, 0x7E, 0x7B, 0x30, 0x30, 0x7E, 0x7B, 0x74 ]).should eq(true)
+ end
+
+ it 'returns false if data does not contain any escape characters or sequences' do
+ u = UniversalDetector::Detector.instance
+ u.contains_escape?([ 0x77, 0x30, 0x30, 0x74 ]).should eq(false)
+ end
+
end

No commit comments for this range

Something went wrong with that request. Please try again.