-
Notifications
You must be signed in to change notification settings - Fork 2
/
unicode.vim
155 lines (138 loc) · 4.63 KB
/
unicode.vim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
" Vim file
" Maintainer: janus_wel <janus.wel.3@gmail.com>
" Last Change: 2009/12/03 19:39:41.
" Version: 0.10
" Remark: Provides the functions listed below. All of them operate on the
" character under the cursor.
"
" * GetUtf8ByteSequence()
" get the UTF-8 byte sequence as a list of numbers
" * GetUtf8ByteSequenceStr()
" get the UTF-8 byte sequence as a string of hexadecimal byte values
" * GetUnicodeCodePoint()
" get the Unicode code point as a number
" * GetUnicodePattern()
" get a search pattern that matches the character
"
" Refer: http://d.hatena.ne.jp/krogue/20080616/1213590577
" http://homepage1.nifty.com/nomenclator/unicode/ucs_utf.htm
" preparation {{{1
" check if this plugin is already loaded or not
if exists('g:loaded_unicode')
    finish
endif
let g:loaded_unicode = 1

" bail out when this Vim was built without multi-byte support
if has('multi_byte') == 0
    finish
endif

" remember the user's 'cpoptions' and switch to the Vim default while the
" script is being sourced; the saved value is restored at the end of the file
let s:save_cpoptions = &cpo
set cpo&vim
" main {{{1
" about UTF-8 byte sequence {{{2
function! GetUtf8ByteSequence()
    " Return the UTF-8 encoding of the character under the cursor as a List
    " of byte values (Numbers).
    " When nothing is matched at the cursor column (e.g. on an empty line)
    " the result is [0].
    let l:target = matchstr(getline('.'), '.', col('.') - 1)
    if l:target == ''
        return [0]
    endif

    " convert from the current 'encoding' so the bytes are UTF-8 regardless
    " of how Vim stores the text internally
    let l:encoded = iconv(l:target, &encoding, 'utf-8')

    " String indexing is byte-wise, so every index yields a one-byte slice
    " that char2nr() turns into its numeric value
    return map(range(strlen(l:encoded)), 'char2nr(l:encoded[v:val])')
endfunction
" for display
function! GetUtf8ByteSequenceStr()
    " Return the UTF-8 bytes of the character under the cursor as a String:
    " each byte is formatted as two lower-case hexadecimal digits and the
    " bytes are separated by a single space (the default of join()).
    let l:hex = map(GetUtf8ByteSequence(), 'printf(''%02x'', v:val)')
    return join(l:hex)
endfunction
" about Unicode code point {{{2
function! GetUnicodeCodePoint()
    " Return the Unicode code point (a Number) of the character under the
    " cursor by decoding its UTF-8 byte sequence.
    "
    " The sequence is first validated against s:conditions: for the given
    " sequence length, at least one alternative must accept EVERY byte
    " (lower <= byte <= upper at each position).  Only then is the decoder
    " for that length (s:funcs) applied.
    "
    " Throws: 'Found the malformed utf-8 byte sequence: [...]' when the
    "         sequence has an unsupported length or no alternative matches.
    let utf8 = GetUtf8ByteSequence()
    let idx = len(utf8) - 1

    " only 1..4 byte sequences have an entry in the tables; anything else
    " (including an empty list) must not be used as an index
    if idx < 0 || idx >= len(s:conditions)
        throw 'Found the malformed utf-8 byte sequence: ' . string(utf8)
    endif

    " the alternatives to try are determined by the number of bytes
    for condition in s:conditions[idx]
        let valid = 1
        let i = 0
        for [lower, upper] in condition
            " a single out-of-range byte rejects this alternative
            if utf8[i] < lower || upper < utf8[i]
                let valid = 0
                break
            endif
            let i += 1
        endfor
        if valid
            return s:funcs[idx](utf8)
        endif
    endfor

    " string() is required: a List cannot be concatenated to a String
    throw 'Found the malformed utf-8 byte sequence: ' . string(utf8)
endfunction
function! GetUnicodePattern()
    " Return a Vim search pattern that matches the character under the
    " cursor by its code point.
    "
    " Vim's pattern atoms for this are '\%uXXXX' (up to four hex digits) and
    " '\%UXXXXXXXX' (up to eight hex digits), so code points above U+FFFF
    " use the long form.
    "
    " Throws: whatever GetUnicodeCodePoint() throws for a malformed sequence.
    let codepoint = GetUnicodeCodePoint()
    if codepoint > 0xffff
        return printf('\%%U%08x', codepoint)
    endif
    return printf('\%%u%04x', codepoint)
endfunction
" stuff
function! s:OneByteToUnicode(utf8)
    " Decode a one-byte UTF-8 sequence: the byte is the code point itself.
    "   0xxxxxxx -> 00000000-0xxxxxxx
    " a:utf8 is a List whose first item is the byte value.
    let l:only = a:utf8[0]
    return l:only
endfunction
function! s:TwoBytesToUnicode(utf8)
    " Decode a two-byte UTF-8 sequence given as a List of byte values.
    "   110xxxxx 10yyyyyy -> 00000xxx-xxyyyyyy
    let l:lead = a:utf8[0]
    let l:tail = a:utf8[1]

    " upper result byte: drop the '110' marker, keep the top three x bits
    let l:upper = s:RS(s:LS(l:lead, 0x08), 0x20)

    " lower result byte: the last two x bits followed by the six y bits
    let l:lower = s:LS(l:lead, 0x40) + s:RS(s:LS(l:tail, 0x04), 0x04)

    return l:upper * 0x0100 + l:lower
endfunction
function! s:ThreeBytesToUnicode(utf8)
    " Decode a three-byte UTF-8 sequence given as a List of byte values.
    "   1110xxxx 10yyyyyy 10zzzzzz -> xxxxyyyy-yyzzzzzz
    let l:lead = a:utf8[0]
    let l:mid  = a:utf8[1]
    let l:tail = a:utf8[2]

    " upper result byte: the four x bits followed by the top four y bits
    let l:upper = s:LS(l:lead, 0x10) + s:RS(s:LS(l:mid, 0x04), 0x10)

    " lower result byte: the last two y bits followed by the six z bits
    let l:lower = s:LS(l:mid, 0x40) + s:RS(s:LS(l:tail, 0x04), 0x04)

    return l:upper * 0x0100 + l:lower
endfunction
function! s:FourBytesToUnicode(utf8)
    " Decode a four-byte UTF-8 sequence given as a List of byte values.
    "   11110www 10xxxxxx 10yyyyyy 10zzzzzz
    "       -> 00000000-000wwwxx-xxxxyyyy-yyzzzzzz
    "
    " Each result byte is assembled from the neighbouring input bytes:
    "   top    byte: www (byte 0) + upper two x bits (byte 1)
    "   middle byte: lower four x bits (byte 1) + upper four y bits (byte 2)
    "   low    byte: lower two y bits (byte 2) + six z bits (byte 3)
    " The low byte must therefore read bytes 2 and 3; reading bytes 1 and 2
    " there would ignore the last byte of the sequence entirely.
    return (s:RS(s:LS(a:utf8[0], 0x20), 0x08) + s:RS(s:LS(a:utf8[1], 0x04), 0x40)) * 0x010000
        \ + (s:LS(a:utf8[1], 0x10) + s:RS(s:LS(a:utf8[2], 0x04), 0x10)) * 0x0100
        \ + s:LS(a:utf8[2], 0x40) + s:RS(s:LS(a:utf8[3], 0x04), 0x04)
endfunction
" left shift (8 bits)
function! s:LS(nr, bits)
    " Left shift confined to one byte.
    " a:bits is the multiplier (a power of two such as 0x04 or 0x40), not a
    " bit count; the product is reduced modulo 0x100 so that only the low
    " eight bits survive.
    let l:shifted = a:nr * a:bits
    return l:shifted % 0x0100
endfunction
" right shift
function! s:RS(nr, bits)
    " Right shift expressed as an integer division.
    " a:bits is the divisor (a power of two such as 0x04 or 0x20), not a
    " bit count.
    let l:quotient = a:nr / a:bits
    return l:quotient
endfunction
" constants
" the list that has Funcrefs to calculate Unicode code point
" decoders indexed by (number of bytes in the sequence - 1)
let s:funcs = []
call add(s:funcs, function('s:OneByteToUnicode'))
call add(s:funcs, function('s:TwoBytesToUnicode'))
call add(s:funcs, function('s:ThreeBytesToUnicode'))
call add(s:funcs, function('s:FourBytesToUnicode'))
" conditions that be used to check if the byte sequence are valid or not
" Indexed by (number of bytes in the sequence - 1).  Every entry is a List
" of alternatives; an alternative holds one [lower, upper] pair per byte
" position, in order.  The ranges follow the well-formed UTF-8 byte
" sequences (cf. RFC 3629).
let s:conditions = []

" one-byte sequences
call add(s:conditions, [
    \ [[0, 0x7f]],
    \ ])

" two-byte sequences
call add(s:conditions, [
    \ [[0xc2, 0xdf], [0x80, 0xbf]],
    \ ])

" three-byte sequences (lead bytes e0 and ed narrow the second byte)
call add(s:conditions, [
    \ [[0xe0, 0xe0], [0xa0, 0xbf], [0x80, 0xbf]],
    \ [[0xe1, 0xec], [0x80, 0xbf], [0x80, 0xbf]],
    \ [[0xed, 0xed], [0x80, 0x9f], [0x80, 0xbf]],
    \ [[0xee, 0xef], [0x80, 0xbf], [0x80, 0xbf]],
    \ ])

" four-byte sequences (lead bytes f0 and f4 narrow the second byte)
call add(s:conditions, [
    \ [[0xf0, 0xf0], [0x90, 0xbf], [0x80, 0xbf], [0x80, 0xbf]],
    \ [[0xf1, 0xf3], [0x80, 0xbf], [0x80, 0xbf], [0x80, 0xbf]],
    \ [[0xf4, 0xf4], [0x80, 0x8f], [0x80, 0xbf], [0x80, 0xbf]],
    \ ])
" post-processing {{{1
" restore the value of 'cpoptions'
let &cpo = s:save_cpoptions
" the saved copy is only needed while the script is being sourced
unlet s:save_cpoptions
" }}}1
" vim: ts=4 sw=4 sts=0 et fdm=marker fdc=3