Skip to content

Commit

Permalink
Handle Punctuation in get_*_text (#3)
Browse files Browse the repository at this point in the history
* Handle Punctuation in `get_*_text`

* Handle Negative Numbers and Optimize
  • Loading branch information
graphemecluster committed Mar 10, 2023
1 parent ea6a43b commit 2e411a4
Show file tree
Hide file tree
Showing 4 changed files with 190 additions and 24 deletions.
34 changes: 22 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,27 @@ Usage:

```python
>>> import ToJyutping
>>> ToJyutping.get_jyutping_list('一瓩係乜嘢嚟㗎?')
[('一', 'jat1'), ('瓩', 'cin1 ngaa5'), ('係', 'hai6'), ('乜', 'mat1'), ('嘢', 'je5'), ('嚟', 'lai4'), ('㗎', 'gaa3'), ('?', None)]
>>> ToJyutping.get_jyutping('一瓩係乜嘢嚟㗎?')
'一(jat1)瓩(cin1 ngaa5)係(hai6)乜(mat1)嘢(je5)嚟(lai4)㗎(gaa3)?'
>>> ToJyutping.get_jyutping_text('一瓩係乜嘢嚟㗎?')
'jat1 cin1 ngaa5 hai6 mat1 je5 lai4 gaa3'
>>> ToJyutping.get_ipa_list('一瓩係乜嘢嚟㗎?')
[('一', 'jɐt̚˥'), ('瓩', 't͡sʰiːn˥.ŋaː˩˧'), ('係', 'hɐi̯˨'), ('乜', 'mɐt̚˥'), ('嘢', 'jɛː˩˧'), ('嚟', 'lɐi̯˨˩'), ('㗎', 'kaː˧'), ('?', None)]
>>> ToJyutping.get_ipa('一瓩係乜嘢嚟㗎?')
'一[jɐt̚˥]瓩[t͡sʰiːn˥.ŋaː˩˧]係[hɐi̯˨]乜[mɐt̚˥]嘢[jɛː˩˧]嚟[lɐi̯˨˩]㗎[kaː˧]?'
>>> ToJyutping.get_ipa_text('一瓩係乜嘢嚟㗎?')
'jɐt̚˥.t͡sʰiːn˥.ŋaː˩˧.hɐi̯˨.mɐt̚˥.jɛː˩˧.lɐi̯˨˩.kaː˧'
>>> ToJyutping.get_jyutping_list('咁啱老世要求佢等陣要開會,剩低嘅嘢我會搞掂㗎喇。')
[('咁', 'gam3'), ('啱', 'ngaam1'), ('老', 'lou5'), ('世', 'sai3'), ('要', 'jiu1'), ('求', 'kau4'), ('佢', 'keoi5'), ('等', 'dang2'), ('陣', 'zan6'), ('要', 'jiu3'), ('開', 'hoi1'), ('會', 'wui2'), (',', None), ('剩', 'zing6'), ('低', 'dai1'), ('嘅', 'ge2'), ('嘢', 'je5'), ('我', 'ngo5'), ('會', 'wui5'), ('搞', 'gaau2'), ('掂', 'dim6'), ('㗎', 'ga3'), ('喇', 'laa3'), ('。', None)]
>>> ToJyutping.get_jyutping('咁啱老世要求佢等陣要開會,剩低嘅嘢我會搞掂㗎喇。')
'咁(gam3)啱(ngaam1)老(lou5)世(sai3)要(jiu1)求(kau4)佢(keoi5)等(dang2)陣(zan6)要(jiu3)開(hoi1)會(wui2),剩(zing6)低(dai1)嘅(ge2)嘢(je5)我(ngo5)會(wui5)搞(gaau2)掂(dim6)㗎(ga3)喇(laa3)。'
>>> ToJyutping.get_jyutping_text('咁啱老世要求佢等陣要開會,剩低嘅嘢我會搞掂㗎喇。')
'gam3 ngaam1 lou5 sai3 jiu1 kau4 keoi5 dang2 zan6 jiu3 hoi1 wui2, zing6 dai1 ge2 je5 ngo5 wui5 gaau2 dim6 ga3 laa3.'
>>> ToJyutping.get_ipa_list('咁啱老世要求佢等陣要開會,剩低嘅嘢我會搞掂㗎喇。')
[('咁', 'kɐm˧'), ('啱', 'ŋaːm˥'), ('老', 'lou̯˩˧'), ('世', 'sɐi̯˧'), ('要', 'jiːu̯˥'), ('求', 'kʰɐu̯˨˩'), ('佢', 'kʰɵy̑˩˧'), ('等', 'tɐŋ˧˥'), ('陣', 't͡sɐn˨'), ('要', 'jiːu̯˧'), ('開', 'hɔːi̯˥'), ('會', 'wuːi̯˧˥'), (',', None), ('剩', 't͡seŋ˨'), ('低', 'tɐi̯˥'), ('嘅', 'kɛː˧˥'), ('嘢', 'jɛː˩˧'), ('我', 'ŋɔː˩˧'), ('會', 'wuːi̯˩˧'), ('搞', 'kaːu̯˧˥'), ('掂', 'tiːm˨'), ('㗎', 'kɐ˧'), ('喇', 'laː˧'), ('。', None)]
>>> ToJyutping.get_ipa('咁啱老世要求佢等陣要開會,剩低嘅嘢我會搞掂㗎喇。')
'咁[kɐm˧]啱[ŋaːm˥]老[lou̯˩˧]世[sɐi̯˧]要[jiːu̯˥]求[kʰɐu̯˨˩]佢[kʰɵy̑˩˧]等[tɐŋ˧˥]陣[t͡sɐn˨]要[jiːu̯˧]開[hɔːi̯˥]會[wuːi̯˧˥],剩[t͡seŋ˨]低[tɐi̯˥]嘅[kɛː˧˥]嘢[jɛː˩˧]我[ŋɔː˩˧]會[wuːi̯˩˧]搞[kaːu̯˧˥]掂[tiːm˨]㗎[kɐ˧]喇[laː˧]。'
>>> ToJyutping.get_ipa_text('咁啱老世要求佢等陣要開會,剩低嘅嘢我會搞掂㗎喇。')
'kɐm˧.ŋaːm˥.lou̯˩˧.sɐi̯˧.jiːu̯˥.kʰɐu̯˨˩.kʰɵy̑˩˧.tɐŋ˧˥.t͡sɐn˨.jiːu̯˧.hɔːi̯˥.wuːi̯˧˥ | t͡seŋ˨.tɐi̯˥.kɛː˧˥.jɛː˩˧.ŋɔː˩˧.wuːi̯˩˧.kaːu̯˧˥.tiːm˨.kɐ˧.laː˧'
```

In rare cases, the pronunciation of a single character can contain more than one syllable:

```python
>>> ToJyutping.get_jyutping_list('一瓩')
[('一', 'jat1'), ('瓩', 'cin1 ngaa5')]
>>> ToJyutping.get_ipa_list('一瓩')
[('一', 'jɐt̚˥'), ('瓩', 't͡sʰiːn˥.ŋaː˩˧')]
```

Helper:
Expand All @@ -34,3 +43,4 @@ Helper:
```

Note that autocorrection is intentionally not included in this helper, and an error is thrown if strings like `jyt6` are passed into the function.
Punctuation is ignored in the helper.
15 changes: 4 additions & 11 deletions src/ToJyutping/ToJyutping.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from os import path
import pygtrie
import re
import utils

here = path.abspath(path.dirname(__file__))

t = pygtrie.CharTrie()
with open(path.join(here, 'jyut6ping3.simple.dict.yaml')) as f:
with open(path.join(here, 'jyut6ping3.simple.dict.yaml'), encoding='utf-8') as f:
for line in f:
k, v = line.rstrip().split('\t')
t[k] = v
Expand All @@ -30,11 +31,7 @@ def get_jyutping(s):
return l

def get_jyutping_text(s):
l = []
for k, v in get_jyutping_list(s):
if v:
l += [v]
return ' '.join(l)
return utils.format_romanization_text(s, get_jyutping_list)

def get_ipa_list(s):
l = []
Expand All @@ -49,11 +46,7 @@ def get_ipa(s):
return l

def get_ipa_text(s):
l = []
for k, v in get_jyutping_list(s):
if v:
l += [jyutping2ipa(v)]
return '.'.join(l)
return utils.format_ipa_text(s, get_ipa_list)

# Lookup table from a romanized syllable initial (key) to its phonetic symbol
# string (value).
# NOTE(review): presumably consumed by jyutping2ipa, which is defined outside
# this view — confirm against the full file.
initial = { 'b': 'p', 'p': 'pʰ', 'm': 'm', 'f': 'f', 'd': 't', 't': 'tʰ', 'n': 'n', 'l': 'l', 'g': 'k', 'k': 'kʰ',
'ng': 'ŋ', 'gw': 'kʷ', 'kw': 'kʷʰ', 'w': 'w', 'h': 'h', 'z': 't͡s', 'c': 't͡sʰ', 's': 's', 'j': 'j' }
Expand Down
163 changes: 163 additions & 0 deletions src/ToJyutping/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import re

# Folds punctuation variants onto a small canonical set: the character at
# position i of the first string maps to the character at position i of the
# second string. Characters absent from this table are not punctuation as far
# as the formatters below are concerned.
punct_dict = {
    variant: canonical
    for variant, canonical in zip(
        '''!"'(),-./:;?[]{}~·‐‑‒–—―‘’“”…⋮⋯⸱⸳⸺⸻、。〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟・︐︑︒︓︔︕︖︗︘︙︱︲︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄﹇﹈﹐﹑﹒﹔﹕﹖﹗﹘﹙﹚﹛﹜﹝﹞﹣!"'(),-./:;?[]{}~⦅⦆。「」、・''',
        '''!"'(),-./:;?[]{}~·------‘’“”………··--,.‘’“”“”‘’[][][][][]~“””·,,.:;!?[]…--(){}[][]“”‘’“”‘’[],,.;:?!-(){}[]-!"'(),-./:;?[]{}~().“”,·''',
    )
}

# Each opening bracket/quote paired with the closer that must follow it.
left_bracket_to_right = {
    '(': ')',
    '[': ']',
    '{': '}',
    '‘': '’',
    '“': '”',
}
left_bracket = set(left_bracket_to_right)
right_bracket = set(left_bracket_to_right.values())

# Punctuation grouped by which side it attaches to when spacing is decided:
# "left" marks hug the token after them, "right" marks hug the token before
# them, and "other" marks are handled case by case.
left_punct = set(left_bracket)
other_punct = set('''"'·-~''')
right_punct = set('!,.:;?…') | right_bracket
left_or_other_punct = {' '} | left_punct | other_punct
right_or_other_punct = right_punct | other_punct

# Raw (un-normalized) characters used to recognise numbers in the input.
# U+2212 MINUS SIGN is deliberately left out of minus_signs (unnecessary).
minus_signs = set('-﹣-')
decimal_seps = set('''',.·⸱⸳﹒'.''')
digits = set('00𝟎𝟘𝟢𝟬𝟶🯰11𝟏𝟙𝟣𝟭𝟷🯱22𝟐𝟚𝟤𝟮𝟸🯲33𝟑𝟛𝟥𝟯𝟹🯳44𝟒𝟜𝟦𝟰𝟺🯴55𝟓𝟝𝟧𝟱𝟻🯵66𝟔𝟞𝟨𝟲𝟼🯶77𝟕𝟟𝟩𝟳𝟽🯷88𝟖𝟠𝟪𝟴𝟾🯸99𝟗𝟡𝟫𝟵𝟿🯹')
# Normalized tokens that stand for "unknown character" ('') or a hyphen.
unknown_or_hyphen = set(('', '-'))

def format_romanization_text(s, conv):
    """Render *s* as space-separated romanization with normalized punctuation.

    Every run of characters outside the ranges \\0-\\x1f and \\x80-\\x9f is
    converted independently by ``inner``; characters in those ranges are left
    untouched by the substitution.

    Args:
        s: Text to convert.
        conv: Callable taking a string and returning an iterable of
            ``(k, v)`` pairs. A truthy ``v`` is used as the romanization;
            otherwise a non-whitespace ``k`` is looked up in ``punct_dict``.

    Returns:
        The converted string, with whitespace inside each run collapsed to
        single spaces and stripped at the run's edges.
    """
    def inner(m):
        # t: normalized tokens (romanization, canonical punctuation, or '' for
        #    anything not in punct_dict); d: the raw character for
        #    punctuation/unknown tokens, None for romanizations.
        # Both lists carry a None sentinel at each end so that neighbours can
        # be inspected without bounds checks.
        t = [None]
        d = [None]
        for k, v in conv(m[0]):
            if v:
                t += [v]
                d += [None]
            elif not k.isspace():
                t += [punct_dict.get(k, '')]
                d += [k]
        t += [None]
        d += [None]
        l = ''  # output accumulated so far
        b = ''  # stack of pending closers: expected right brackets / open quotes
        # Slide a (previous, current, next) window over the real tokens; i is
        # the index of the current token in t and d.
        for i, (p, c, n) in enumerate(zip(t, t[1:], t[2:]), 1):
            def between():
                # Truthy when the nearest token on each side — skipping closing
                # brackets to the left and opening brackets to the right — is a
                # multi-character token (i.e. a romanization).
                nonlocal t, i
                j = i - 1
                while j and t[j] in right_bracket:
                    j -= 1
                f = j and t[j] and len(t[j]) > 1
                j = i + 1
                while j < len(t) - 1 and t[j] in left_bracket:
                    j += 1
                g = j and t[j] and len(t[j]) > 1
                return f and g

            def lspace():
                # Add a separating space unless the output is empty or already
                # ends with a space, an opening mark or an "other" mark.
                nonlocal l
                if l and l[-1] not in left_or_other_punct:
                    l += ' '

            def rspace():
                # Add a trailing space. When the next raw character is a minus
                # sign, the space is added only if a digit follows it; otherwise
                # it is added unless the next token is a closing/"other" mark.
                nonlocal n, l
                if i < len(d) - 2 and d[i + 2] in digits if d[i + 1] in minus_signs else n not in right_or_other_punct:
                    l += ' '

            if len(c) > 1:
                # Multi-character token: a romanization, spaced on both sides.
                lspace()
                l += c
                rspace()
            elif not c or d[i] in minus_signs and d[i + 1] in digits and p not in unknown_or_hyphen:
                # Unknown token, or a minus sign directly before a digit whose
                # previous token is neither unknown nor a hyphen: emit a single
                # placeholder per consecutive run.
                if not l.endswith('[…]'):
                    l += '[…]'
            elif d[i] in decimal_seps and d[i + 1] in digits and d[i - 1] in digits:
                # Separator sandwiched between two digits: drop it.
                continue
            elif c in left_punct:
                lspace()
                l += c
                b += left_bracket_to_right[c]  # remember which closer is expected
            elif c in right_punct:
                l += c
                rspace()
                try:
                    # Unwind the stack down to (and including) the matching closer.
                    b = b[:b.rindex(c)]
                except ValueError:
                    # Not on the stack (e.g. plain '.', ',' or an unmatched closer).
                    pass
            elif c == '-':
                if p == '-':
                    # Second and later hyphens of a run were handled by the first.
                    continue
                if n == '-' or between():
                    l += ' – '
                else:
                    l += c
            elif c == '~':
                if p == '~' and n != '~' or between():
                    l += '~ '
                else:
                    l += c
            elif c == '·':
                l += c
            else:
                # Remaining marks are treated as symmetric quotes: look for an
                # open occurrence of the same mark above the innermost pending
                # bracket on the stack.
                j = len(b) - 1
                y = False
                while j >= 0 and b[j] not in right_bracket:
                    if b[j] == c:
                        y = True
                        break
                    j -= 1
                if y:
                    # Closing occurrence: pop it and behave like a right mark.
                    b = b[:j]
                    l += c
                    rspace()
                else:
                    # Opening occurrence: behave like a left mark and push it.
                    lspace()
                    l += c
                    b += c
        # Collapse repeated spaces and trim the ends.
        return ' '.join(l.split())

    return re.sub(r'[^\0-\x1f\x80-\x9f]+', inner, s)

# Canonical punctuation that ends a sentence (rendered as a major break)
# versus punctuation that only separates phrases (rendered as a minor break).
major_break = set('.!?…')
minor_break = set(',/:;-~()[]{}')

def format_ipa_text(s, conv):
    """Render *s* as IPA text with syllables joined by '.' and breaks marked.

    Every run of characters outside the ranges \\0-\\x1f and \\x80-\\x9f is
    converted independently; characters in those ranges are left untouched.

    Within a run:
      * pronunciations are kept as-is and adjacent ones are joined with '.';
      * characters with no pronunciation and no entry in ``punct_dict``
        collapse into a single '⸨…⸩' placeholder per consecutive run;
      * punctuation in ``major_break`` becomes '‖' and punctuation in
        ``minor_break`` becomes '|'; breaks are never emitted at the start
        or end of a run, and a major break replaces a preceding minor one;
      * everything else is separated by a single space.

    Args:
        s: Text to convert.
        conv: Callable taking a string and returning an iterable of
            ``(k, v)`` pairs. A truthy ``v`` is used as the pronunciation;
            otherwise a non-whitespace ``k`` is looked up in ``punct_dict``.

    Returns:
        The converted string. A run that yields no output (for example
        whitespace only, or punctuation only) is replaced by ''.
    """
    placeholder = '⸨…⸩'

    def inner(m):
        # tokens: pronunciation, canonical punctuation, or '' for unknowns.
        # raw: the original character for non-pronunciation tokens, else None.
        tokens = []
        raw = []
        for k, v in conv(m[0]):
            if v:
                tokens.append(v)
                raw.append(None)
            elif not k.isspace():
                tokens.append(punct_dict.get(k, ''))
                raw.append(k)
        raw.append(None)  # sentinel so raw[i + 1] is always valid

        out = []
        for i, c in enumerate(tokens):
            if len(c) > 1:
                # Multi-character token: a pronunciation.
                out.append(c)
            elif not c or (
                raw[i] in minus_signs
                and raw[i + 1] in digits
                and i
                and tokens[i - 1] not in unknown_or_hyphen
            ):
                # Unknown token, or a minus sign directly before a digit that
                # follows a known, non-hyphen token: one placeholder per run.
                if not out or out[-1] != placeholder:
                    out.append(placeholder)
            elif out:
                # Punctuation only matters once something has been emitted,
                # so leading punctuation is dropped.
                if raw[i] in decimal_seps and raw[i + 1] in digits and i and raw[i - 1] in digits:
                    continue  # separator inside a number
                if c in major_break:
                    if len(out[-1]) > 1:
                        out.append('‖')
                    else:
                        out[-1] = '‖'  # upgrade an existing break
                elif c in minor_break and len(out[-1]) > 1:
                    out.append('|')

        # Nothing pronounceable or unknown in this run (e.g. whitespace or
        # punctuation only): the original indexed out[-1] here and raised
        # IndexError.
        if not out:
            return ''
        # Drop a trailing break marker.
        if len(out[-1]) == 1:
            out.pop()

        pieces = [out[0]]
        for prev, cur in zip(out, out[1:]):
            both_syllables = (
                prev != placeholder and len(prev) > 1
                and cur != placeholder and len(cur) > 1
            )
            pieces.append('.' if both_syllables else ' ')
            pieces.append(cur)
        return ''.join(pieces)

    return re.sub(r'[^\0-\x1f\x80-\x9f]+', inner, s)
2 changes: 1 addition & 1 deletion src/ToJyutping/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.2.1'
__version__ = '0.2.2'

0 comments on commit 2e411a4

Please sign in to comment.