Skip to content

Commit dd0baf9

Browse files
futurulusStanford NLP
authored and
Stanford NLP
committed
Add new data splits and training scripts
1 parent 19a194f commit dd0baf9

File tree

20 files changed

+6003
-32
lines changed

20 files changed

+6003
-32
lines changed

scripts/arabic-segmenter/edits.py

+244
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
#!/usr/bin/env python2.7
2+
# -*- coding: utf-8 -*-
3+
4+
import re
5+
6+
from utf8utils import uprint
7+
8+
9+
# Edit label emitted when a character is unchanged and no segmentation
# boundary is added.
NOSEG = '<noseg>'
# Character that marks a segment boundary in the segmented text.
SEG_MARKER = ':'
# Edit label for an ordinary segmentation boundary (the marker padded with
# one space on each side).
SEG = ' %s ' % SEG_MARKER

# Letters treated as long vowels when detecting added or deleted vowels.
LONG_VOWELS = u'ايوى'
# Alif variants that are treated as interchangeable.
ALIFS = u'اأإٱآ'
# The two "haa" letters that are treated as interchangeable.
HAAS = u'هح'
16+
17+
18+
def get_edits(line, options):
    '''
    Turn one "raw<TAB>segmented" line into a list of edit labels.

    The raw and segmented strings are consumed in lockstep from the end:
    each pass of the loop looks at the last raw character, decides which
    edit explains the end of the segmented string, appends exactly one
    label and drops one raw character. The labels are reversed before
    returning, so the result has one label per raw character, in order.

    `options` only needs a boolean `verbose` attribute; when it is true,
    ignored lines are reported through `uprint`.

    Returns the list of labels, or None when the line is ignored (not
    exactly two tab-separated parts, empty raw side, or an edit that none
    of the rules below can explain).
    '''
    # Exactly one tab is required. A line with several tabs used to crash
    # on the two-name unpacking below; it is now skipped like any other
    # malformed line.
    if line.count('\t') != 1:
        if options.verbose:
            uprint("ignoring line that doesn't have two parts:")
            uprint(' ' + repr(line))
        return
    raw, seg = line.split('\t')

    # Special cases:
    # - an odd edit with no segmentations [e.g. ع -> على]
    if raw != seg and SEG_MARKER not in seg:
        return [u'<other>'] * len(raw)
    # - token deleted
    # NOTE(review): the check above already fires whenever seg is empty
    # and raw is not, so this branch is only reached when both sides are
    # empty (and then returns an empty list) -- confirm this is intended.
    if seg == '':
        return [u' <del> '] * len(raw)
    # - nothing on the raw side
    if raw == '':
        if options.verbose:
            uprint("ignoring line with empty raw text:")
            uprint(' ' + repr(line))
        return

    edits = []

    last_raw = ''
    while raw:
        # Possible edits, in order that they are searched for.  Each
        # branch trims from `seg` whatever it consumed *beyond* one
        # character; the shared code after the chain then drops one more
        # character from both `seg` and `raw`.  Branches that consume
        # nothing from `seg` pad it with a space to cancel that drop.

        # :+Al // li + definite article + word starting with l
        if raw.endswith(u'لل') and seg.endswith(u'ل%sالل' % SEG_MARKER):
            # NOTE(review): unlike the other labels this one has no
            # trailing space -- confirm that is intentional.
            edits.append(u' %s+ال' % SEG_MARKER)
            seg = seg[:-3]
        # +A:+A // mA + A... verbal negation spelled as just m
        elif is_ma_alif(seg, raw):
            edits.append(u' +ا%s+ا ' % SEG_MARKER)
            seg = seg[:-3]
        # x:x // shadda breaking: character duplicated on either side of
        #        segmentation
        # x>xx // shadda breaking: character duplicated, no segmentation
        elif is_shadda(seg, raw):
            if seg.endswith(SEG_MARKER + raw[-1]):
                edits.append(u' x:x ')
                seg = seg[:-2]
            else:
                assert seg.endswith(raw[-1] * 2), repr(seg + '\t' + raw)
                edits.append(u' x>xx ')
                seg = seg[:-1]
        # :+x // added a letter after segmentation (alif for
        #        li + definite article, noon for recovered first person
        #        prefix or y -> ny in dialect)
        elif is_seg_plus(seg, raw):
            edits.append(u' %s+%s ' % (SEG_MARKER, seg[-2]))
            seg = seg[:-2]
        # +x: // added a letter before segmentation (usually noon, for
        #        plurals, mim~A, Al~A, etc.)
        elif is_plus_seg(seg, raw):
            edits.append(u' +%s%s ' % (seg[-3], SEG_MARKER))
            seg = seg[:-2]
        # <del> // deleted lengthening effect (yAAAAAA -> yA)
        elif is_lengthening(seg, raw, last_raw):
            edits.append(u' <del> ')
            seg += u' '
        # : // ordinary segmentation boundary
        elif seg.endswith(SEG_MARKER + raw[-1]):
            edits.append(SEG)
            seg = seg[:-1]
        # <noseg> // character doesn't change, no segmentation added
        elif seg and seg[-1] == raw[-1]:
            edits.append(NOSEG)
        # <other> // normalized E or El to ElY
        elif is_alaa_normalization(seg, raw):
            edits.append(u'<other>')
            seg = seg[:-2]
            if raw[-1] != u'ع':
                assert raw[-2] == u'ع'
                seg = seg + ' '
        # +V: // added a long vowel (verbal or dialect -wA ending, jussive
        #        normalization)
        elif len(seg) >= 2 and seg[-2] == raw[-1] and seg[-1] in LONG_VOWELS:
            if len(seg) >= 3 and seg[-3] == SEG_MARKER:
                edits.append(u' %s+%s ' % (SEG_MARKER, seg[-1]))
                seg = seg[:-2]
            else:
                edits.append(u' +%s ' % seg[-1])
                seg = seg[:-1]
        # y:+h // recover dialectal silent haa after segmentation
        elif seg.endswith(u'ي' + SEG_MARKER + u'ه') and raw.endswith(u'ي'):
            edits.append(u' ي%s+ه ' % SEG_MARKER)
            seg = seg[:-2]
        # <del> // deleted a long vowel (dialect ending normalization: mostly
        #          -kwA -> -kw and -kY -> -k) or dialectal silent haa
        elif (len(raw) >= 2 and norm_endswith(seg, raw[-2], HAAS) and
              raw[-1] in LONG_VOWELS + u'ه'):
            edits.append(u' <del> ')
            seg += u' '
        # <del> // deleted diacritic
        elif is_diacritic(raw[-1]):
            edits.append(u' <del> ')
            seg += u' '
        # x>y: // change x to y after a segment boundary
        elif (len(seg) >= 2 and seg[-2] == SEG_MARKER and
              is_common_rewrite(seg, raw)):
            edits.append(u' %s%s>%s ' % (SEG_MARKER, raw[-1], seg[-1]))
            seg = seg[:-1]
        # x>y // change x to y without a segmentation (orthography
        #        normalization)
        elif is_common_rewrite(seg, raw):
            edits.append(u' %s>%s ' % (raw[-1], seg[-1]))
        else:
            if options.verbose:
                uprint('ignoring line with unknown edit:')
                uprint(' ' + line)
                uprint('(seg = %s; raw = %s)' % (seg, raw))
                uprint('(edits = %s)' % edits)
            return

        # Shared step: remember the raw character just handled, then drop
        # one character from each side.
        last_raw = raw[-1]
        seg = seg[:-1]
        raw = raw[:-1]

    # All raw characters are explained; anything left on the segmented
    # side is an edit the rules above could not account for.
    if seg:
        if options.verbose:
            uprint('ignoring line with unknown edit:')
            uprint(' ' + line)
            uprint('(extra seg: %s)' % seg)
            uprint('(edits = %s)' % edits)
        return

    # Labels were collected back-to-front.
    edits.reverse()
    return edits
148+
149+
150+
def is_ma_alif(seg, raw):
    '''
    Return True if the end of `seg` shows the "+A:+A" edit: `raw` has a
    meem directly before its last character, while `seg` has meem, alif,
    the segment marker and another alif directly before its last
    character. The two last characters must be equal or a common rewrite
    of each other.
    '''
    if len(seg) < 5 or len(raw) < 2:
        return False
    if not is_common_rewrite(seg[-1], raw[-1]):
        return False
    if raw[-2] != u'م':
        return False
    return seg[-5:-1] == u'ما%sا' % SEG_MARKER
155+
156+
157+
def is_seg_plus(seg, raw):
    '''
    Return True if `seg` ends with "<x> <marker> <added letter> <y>" where
    `raw` ends with just "<x'> <y'>": a letter was inserted right after a
    segment boundary. The inserted letter must be one of alif, noon or
    yaa and must differ from the raw character in that position; x/x' and
    y/y' must each be equal or a common rewrite.
    '''
    if len(seg) < 4 or len(raw) < 2:
        return False
    if not is_common_rewrite(seg[-1], raw[-1]):
        return False
    inserted = seg[-2]
    if inserted == raw[-2] or inserted not in u'اني':
        return False
    if seg[-3] != SEG_MARKER:
        return False
    return is_common_rewrite(seg[-4], raw[-2])
164+
165+
166+
def is_plus_seg(seg, raw):
    '''
    Return True if `seg` ends with "<x> <added letter> <marker> <y>" where
    `raw` ends with just "<x'> <y'>": a letter was inserted right before a
    segment boundary. The inserted letter must be alif or noon and must
    differ from the raw character in that position; x/x' and y/y' must
    each be equal or a common rewrite.
    '''
    if len(seg) < 4 or len(raw) < 2:
        return False
    if not is_common_rewrite(seg[-1], raw[-1]):
        return False
    if seg[-2] != SEG_MARKER:
        return False
    inserted = seg[-3]
    if inserted == raw[-2] or inserted not in u'ان':
        return False
    return is_common_rewrite(seg[-4], raw[-2])
173+
174+
175+
def is_shadda(seg, raw):
    '''
    Return True if the end of `seg` (segment markers ignored) repeats the
    last character of `raw` one more time than `raw` itself does, i.e.
    the character was doubled on the segmented side.
    '''
    stripped = seg.replace(SEG_MARKER, '')
    if not raw or not stripped.endswith(raw[-1]):
        return False
    doubled = stripped[-1]
    # Walk both strings backwards in parallel, starting at the
    # second-to-last character and stopping at the shorter one.
    for seg_char, raw_char in zip(stripped[-2::-1], raw[-2::-1]):
        if seg_char != doubled:
            # The run of the repeated character ended on the segmented
            # side without the two strings diverging.
            return False
        if seg_char != raw_char:
            # seg still repeats the character where raw has moved on.
            return True
    # equal through the min of the two lengths, so check if it's
    # a beginning-of-word shadda
    return stripped == raw[-1] + raw
186+
187+
188+
def is_lengthening(seg, raw, last):
    '''
    Return True if `raw` ends with the same character twice while `seg`
    (segment markers ignored) ends with that character only once, i.e.
    a repeated character was dropped on the segmented side.

    `last` is accepted for the caller's convenience but is not used.
    '''
    stripped = seg.replace(SEG_MARKER, '')
    if len(raw) < 2 or not stripped:
        return False
    tail = raw[-1]
    repeated_in_raw = raw[-2] == tail
    # The slice is empty when `stripped` has a single character, which
    # then counts as "not repeated".
    single_in_seg = stripped[-1] == tail and stripped[-2:-1] != tail
    return repeated_in_raw and single_in_seg
195+
196+
197+
# Characters that count as diacritics: '~' and '_', tatweel (U+0640),
# most diacritics (U+064B through U+065E) and dagger alif (U+0670).
# Written as a plain u'' literal rather than ur'': both spell the same
# pattern on Python 2, but ur'' is a syntax error on Python 3.
DIACRITIC = re.compile(u'[~_\u0640\u064b-\u065e\u0670]')


def is_diacritic(char):
    '''
    Return True if `char` starts with one of the characters in DIACRITIC.
    The match is anchored at the start only, so for a longer string just
    its first character is examined; an empty string gives False.
    '''
    return DIACRITIC.match(char) is not None
202+
203+
204+
# Groups of characters that may be substituted for one another between the
# raw and the segmented text. Each entry is a string used as a set of
# characters: two characters count as a common rewrite when some entry
# contains both (see is_common_rewrite).
COMMON_REWRITES = [
    u'تة', # recovered taa marbuta
    u'يىئ', # normalized Egyptian yaa
    u'وؤ', # normalized waw hamza
    u'هةو', # normalized 3sg ending
    HAAS, # normalized future particle
    ALIFS, # normalized alifs
    u'اأإئؤقءي', # normalized various hamzas (written or spoken)
    u'ىهةا', # normalized words ending in /a/ sound
    u'تثط', # normalized letters pronounced /t/
    u'دذضظ', # normalized letters pronounced /d/
    u'سص', # normalized letters pronounced /s/
    u'زذظ', # normalized letters pronounced /z/
]
218+
219+
def is_common_rewrite(seg, raw):
    '''
    Return True if `seg` and `raw` are identical, or if their last
    characters both belong to the same group in COMMON_REWRITES.
    Return False if either string is empty.
    '''
    if not seg or not raw:
        return False
    if seg == raw:
        return True
    seg_last, raw_last = seg[-1], raw[-1]
    return any(seg_last in group and raw_last in group
               for group in COMMON_REWRITES)
226+
227+
228+
def is_alaa_normalization(seg, raw):
    '''
    Return True if `seg` ends with the full word على while `raw` ends
    with one of its abbreviated spellings, ع or عل.
    '''
    if not seg.endswith(u'على'):
        return False
    return raw.endswith((u'ع', u'عل'))
231+
232+
233+
def norm_endswith(str, target_ending, norm_group):
    '''
    Return True if `str` ends with `target_ending`, ignoring differences
    between characters in `norm_group`. Otherwise return False.

    Two characters in the same position are considered equal when they
    are identical or when both occur in `norm_group`. An empty
    `target_ending` matches any `str`.

    (The first parameter shadows the builtin `str`; the name is kept so
    that existing keyword callers keep working.)
    '''
    # Without this guard the slice below would be str[-0:], i.e. the
    # whole string, and a non-empty `str` would be compared against an
    # empty ending.
    if not target_ending:
        return True
    if len(str) < len(target_ending):
        return False
    source_ending = str[-len(target_ending):]
    return all(s == t or (s in norm_group and t in norm_group)
               for s, t in zip(source_ending, target_ending))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#!/usr/bin/env bash
#
# Usage: <this script> <integrated_file> <output_file>
#
# Builds a two-column file (raw token <TAB> segmented token) from an
# "integrated" file whose fields are separated by "·":
#   - the raw column is built from field 9 of every line starting with "t";
#   - the segmented column is built from field 10 of the same lines, with
#     the pieces of one token joined by ":" and converted from Buckwalter
#     to UTF-8 by the Java class edu.stanford.nlp.international.arabic.Buckwalter
#     (which must be on the Java classpath).
# A line starting with "TREE" produces an empty line in both columns.
#
# The sed replacements use "\n", which requires GNU sed.

# Abort on the first failing command, including a failure inside a pipeline
# (for example when java is not available).
set -eo pipefail

if [ $# -ne 2 ]; then
    # A usage error goes to stderr and is reported with a non-zero status.
    echo "Usage: $(basename "$0") <integrated_file> <output_file>" >&2
    exit 1
fi

INPUT=$1
OUTPUT=$2
RAW_TMP="${OUTPUT}.__raw__"
SEG_TMP="${OUTPUT}.__segmented__"

# Clean up the intermediate files on every exit, including failures.
trap 'rm -f "${RAW_TMP}" "${SEG_TMP}"' EXIT

# ===== Raw side =====

cat "${INPUT}" | \
    # Escape TEDEval special characters
    sed 's/:/#pm#/g' | \
    sed 's/(/#lp#/g' | \
    sed 's/)/#rp#/g' | \
    # Concatenate t: UTF-8 field entries
    awk '
        BEGIN { FS = "·"; ORS = ""; }
        /^t/ { print $9 ($4 == "t" ? "" : "\n"); }
        /^TREE/ { print "\n"; }
    ' | \
    # Fix problems that really mess up evaluation:
    # Patch over two badly annotated lines
    # sed 's/^8ـ9$/89/' | \
    sed 's/3-5-\.2/3/' | \
    sed 's/^8\.$/8/' | \
    # Delete all tatweels after numbers
    sed 's/\([0-9]\)ـ/\1/g' | \
    # Split all digits from miscellaneous characters
    sed 's/\([0-9]\)\([^0-9 -]\)/\1\n\2/g' | \
    sed 's/\([^0-9 -]\)\([0-9]\)/\1\n\2/g' > "${RAW_TMP}"


# ===== Segmented side =====

cat "${INPUT}" | \
    # Escape TEDEval special characters
    sed 's/:/#pm#/g' | \
    # Concatenate t: Buckwalter field entries
    awk '
        BEGIN { FS = "·"; ORS = ":"; }
        /^t/ { print $10 ($4 == "t" ? "" : "\n"); }
        /^TREE/ { print "\n"; }
    ' | \
    # Remove empty segments
    sed 's/^://' | \
    # Remove errant annotations such as AlY_1 (turn it into AlY)
    sed 's/_[0-9]\+$//' | \
    # Delete all tatweels after numbers
    sed 's/\([0-9]\)ـ/\1/g' | \
    # Split all digits from miscellaneous characters
    sed 's/\([0-9]\)\([^0-9 -]\)/\1\n\2/g' | \
    sed 's/\([^0-9 -]\)\([0-9]\)/\1\n\2/g' | \
    # Normalize alifs
    sed 's/[<>IO{|]/A/g' | \
    # Convert Buckwalter to UTF-8
    java edu.stanford.nlp.international.arabic.Buckwalter | \
    # Undo Buckwaltering of escaped :, escape other TEDEval characters
    sed 's/#ةم#/#pm#/g' | \
    sed 's/(/#lp#/g' | \
    sed 's/)/#rp#/g' > "${SEG_TMP}"

# Join columns with tabs. The separator is passed explicitly as a tab
# character so that it cannot run into the first file name.
pr -m -t -s$'\t' "${RAW_TMP}" "${SEG_TMP}" | \
    # Make sure empty lines are actually empty
    sed 's/^\t$//' | \
    # Remove sentences with no words
    awk '
        BEGIN { empty = 1; }
        {
            if ($0 == "") {
                if (!empty) {
                    empty = 1;
                    print;
                }
            } else {
                empty = 0;
                print;
            }
        }
    ' | \
    # Remove tokens that consist entirely of tatweel or diacritics
    perl -C -ne 'print unless /^[~_\x{0640}\x{064b}-\x{065e}\x{0670}]*\t[~_\x{0640}\x{064b}-\x{065e}\x{0670}]*$/' > "${OUTPUT}"

# The intermediate files are removed by the EXIT trap set above.

0 commit comments

Comments
 (0)