|
| 1 | +#!/usr/bin/env python2.7 |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +import re |
| 5 | + |
| 6 | +from utf8utils import uprint |
| 7 | + |
| 8 | + |
| 9 | +NOSEG = '<noseg>' |
| 10 | +SEG_MARKER = ':' |
| 11 | +SEG = ' %s ' % SEG_MARKER |
| 12 | + |
| 13 | +LONG_VOWELS = u'ايوى' |
| 14 | +ALIFS = u'اأإٱآ' |
| 15 | +HAAS = u'هح' |
| 16 | + |
| 17 | + |
def get_edits(line, options):
    """Derive a sequence of character-level edit labels from one data line.

    `line` is a tab-separated pair "raw<TAB>seg", where `seg` is the
    segmented form of `raw` with boundaries marked by SEG_MARKER.  Both
    sides are consumed from right to left; each loop iteration pops one
    raw character and records one edit label (NOSEG, SEG, ' <del> ',
    ' x>y ', ...).  Returns the labels in left-to-right order, or None
    when the line cannot be analyzed (missing tab, empty raw side, or
    an edit pattern this function does not recognize).

    NOTE(review): a line containing more than one tab makes the unpack
    below raise ValueError rather than being skipped — confirm inputs
    are strictly two-column.
    """
    if '\t' not in line:
        if options.verbose:
            uprint("ignoring line that doesn't have two parts:")
            uprint(' ' + repr(line))
        return
    raw, seg = line.split('\t')

    # Special cases:
    # - an odd edit with no segmentations [e.g. ع -> على]
    if raw != seg and SEG_MARKER not in seg:
        return [u'<other>'] * len(raw)
    # - token deleted
    if seg == '':
        return [u' <del> '] * len(raw)
    # - nothing on the raw side
    if raw == '':
        if options.verbose:
            uprint("ignoring line with empty raw text:")
            uprint(' ' + repr(line))
        return

    edits = []

    last_raw = ''
    last_seg = ''
    while len(raw) != 0:
        # Each branch appends one edit label and trims `seg` so that the
        # unconditional `seg = seg[:-1]` at the bottom of the loop leaves
        # `seg` aligned with the next (leftward) raw character.  Branches
        # that consume no seg character pad with a space instead.
        # Possible edits, in order that they are searched for:
        # :+Al // li + definite article + word starting with l
        if raw.endswith(u'لل') and seg.endswith(u'ل%sالل' % SEG_MARKER):
            edits.append(u' %s+ال' % SEG_MARKER)
            seg = seg[:-3]
        # +A:+A // mA + A... verbal negation spelled as just m
        elif is_ma_alif(seg, raw):
            edits.append(u' +ا%s+ا ' % SEG_MARKER)
            seg = seg[:-3]
        # x:x // shadda breaking: character duplicated on either side of
        #   segmentation
        # x>xx // shadda breaking: character duplicated, no segmentation
        elif is_shadda(seg, raw):
            if seg.endswith(SEG_MARKER + raw[-1]):
                edits.append(u' x:x ')
                seg = seg[:-2]
            else:
                assert seg.endswith(raw[-1] * 2), repr(seg + '\t' + raw)
                edits.append(u' x>xx ')
                seg = seg[:-1]
        # :+x // added an letter after segmentation (alif for
        #   li + definite article, noon for recovered first person
        #   prefix or y -> ny in dialect)
        elif is_seg_plus(seg, raw):
            edits.append(u' %s+%s ' % (SEG_MARKER, seg[-2]))
            seg = seg[:-2]
        # +x: // added a letter before segmentation (usually noon, for
        #   plurals, mim~A, Al~A, etc.)
        elif is_plus_seg(seg, raw):
            edits.append(u' +%s%s ' % (seg[-3], SEG_MARKER))
            seg = seg[:-2]
        # <del> // deleted lengthening effect (yAAAAAA -> yA)
        elif is_lengthening(seg, raw, last_raw):
            edits.append(u' <del> ')
            seg += u' '  # pad: nothing consumed from seg for this raw char
        # : // ordinary segmentation boundary
        elif seg.endswith(SEG_MARKER + raw[-1]):
            edits.append(SEG)
            seg = seg[:-1]
        # <noseg> // character doesn't change, no segmentation added
        elif len(seg) != 0 and seg[-1] == raw[-1]:
            edits.append(NOSEG)
        # <other> // normalized E or El to ElY
        elif is_alaa_normalization(seg, raw):
            edits.append(u'<other>')
            seg = seg[:-2]
            if raw[-1] != u'ع':
                assert raw[-2] == u'ع'
                seg = seg + ' '
        # +V: // added a long vowel (verbal or dialect -wA ending, jussive
        #   normalization)
        elif len(seg) >= 2 and seg[-2] == raw[-1] and seg[-1] in LONG_VOWELS:
            if len(seg) >= 3 and seg[-3] == SEG_MARKER:
                edits.append(u' %s+%s ' % (SEG_MARKER, seg[-1]))
                seg = seg[:-2]
            else:
                edits.append(u' +%s ' % seg[-1])
                seg = seg[:-1]
        # y:+h // recover dialectal silent haa after segmentation
        elif seg.endswith(u'ي' + SEG_MARKER + u'ه') and raw.endswith(u'ي'):
            edits.append(u' ي%s+ه ' % SEG_MARKER)
            seg = seg[:-2]
        # <del> // deleted a long vowel (dialect ending normalization: mostly
        #   -kwA -> -kw and -kY -> -k) or dialectal silent haa
        elif (len(raw) >= 2 and norm_endswith(seg, raw[-2], HAAS) and
              raw[-1] in LONG_VOWELS + u'ه'):
            edits.append(u' <del> ')
            seg += u' '  # pad: nothing consumed from seg
        # <del> // deleted diacritic
        elif is_diacritic(raw[-1]):
            edits.append(u' <del> ')
            seg += u' '  # pad: nothing consumed from seg
        # x>y: // change x to y after a segment boundary
        elif (len(seg) >= 2 and seg[-2] == SEG_MARKER and
              is_common_rewrite(seg, raw)):
            edits.append(u' %s%s>%s ' % (SEG_MARKER, raw[-1], seg[-1]))
            seg = seg[:-1]
        # x>y // change x to y without a segmentation (orthography
        #   normalization)
        elif is_common_rewrite(seg, raw):
            edits.append(u' %s>%s ' % (raw[-1], seg[-1]))
        else:
            if options.verbose:
                uprint('ignoring line with unknown edit:')
                uprint(' ' + line)
                uprint('(seg = %s; raw = %s)' % (seg, raw))
                uprint('(edits = %s)' % edits)
            return
        last_raw = raw[-1]
        seg = seg[:-1]
        # NOTE(review): `last_seg` is assigned from raw[-1] (not seg[-1])
        # and never read afterwards — looks like dead/vestigial state.
        last_seg = raw[-1]
        raw = raw[:-1]

    # raw is exhausted; any remaining seg characters mean the alignment
    # failed, so the line is rejected.
    if len(seg) != 0:
        if options.verbose:
            uprint('ignoring line with unknown edit:')
            uprint(' ' + line)
            uprint('(extra seg: %s)' % seg)
            uprint('(edits = %s)' % edits)
        return

    # Labels were collected right-to-left; present them left-to-right.
    edits.reverse()
    return edits
| 148 | + |
| 149 | + |
def is_ma_alif(seg, raw):
    """True when raw ends in miim + (rewritable) char while seg shows the
    expanded negation 'ma' plus an alif on each side of the boundary."""
    if len(seg) < 5 or len(raw) < 2:
        return False
    if not is_common_rewrite(seg[-1], raw[-1]):
        return False
    return raw[-2] == u'م' and seg[-5:-1] == u'ما%sا' % SEG_MARKER
| 155 | + |
| 156 | + |
def is_seg_plus(seg, raw):
    """True when seg inserts one extra letter (alif/noon/yaa) immediately
    after a segmentation boundary, relative to raw's last two chars."""
    if len(seg) < 4 or len(raw) < 2:
        return False
    # seg ends ...W : A x  where W~raw[-2], x~raw[-1], and A is the insert.
    return (is_common_rewrite(seg[-1], raw[-1]) and
            seg[-3] == SEG_MARKER and
            seg[-2] != raw[-2] and
            seg[-2] in u'اني' and
            is_common_rewrite(seg[-4], raw[-2]))
| 164 | + |
| 165 | + |
def is_plus_seg(seg, raw):
    """True when seg inserts one extra letter (alif/noon) immediately
    before a segmentation boundary, relative to raw's last two chars."""
    if len(seg) < 4 or len(raw) < 2:
        return False
    # seg ends ...W A : x  where W~raw[-2], x~raw[-1], and A is the insert.
    return (seg[-2] == SEG_MARKER and
            seg[-3] != raw[-2] and
            seg[-3] in u'ان' and
            is_common_rewrite(seg[-1], raw[-1]) and
            is_common_rewrite(seg[-4], raw[-2]))
| 173 | + |
| 174 | + |
def is_shadda(seg, raw):
    """True when seg's tail duplicates raw's last character (a broken
    shadda), with segmentation markers ignored."""
    stripped = seg.replace(SEG_MARKER, '')
    if not raw or not stripped.endswith(raw[-1]):
        return False
    dup = stripped[-1]
    # Walk leftward while both tails still show the duplicated character.
    limit = min(len(stripped), len(raw))
    i = 2
    while i <= limit:
        if stripped[-i] != dup:
            return False
        if stripped[-i] != raw[-i]:
            return True
        i += 1
    # equal through the min of the two lengths, so check if it's
    # a beginning-of-word shadda
    return stripped == raw[-1] + raw
| 186 | + |
| 187 | + |
def is_lengthening(seg, raw, last):
    """True when raw's last character is a lengthening repeat (e.g.
    yAAAA) whose final copy is absent from seg.

    NOTE(review): `last` is accepted for interface compatibility but is
    never consulted by this implementation.
    """
    plain = seg.replace(SEG_MARKER, '')
    return (len(raw) >= 2 and len(plain) != 0 and
            raw[-1] == raw[-2] == plain[-1] and
            not (len(plain) >= 2 and raw[-1] == plain[-2]))
| 195 | + |
| 196 | + |
# Matches a single character to be treated as a deletable diacritic:
# '~' and '_' (transliteration marks), U+0640 tatweel, U+064B-U+065E
# (harakat and related marks), U+0670 dagger alif.
#
# Fix: the original used a ur'' raw-unicode literal, which is a syntax
# error on Python 3.  The only escapes here are \uXXXX, which Python 2
# processes identically in ur'' and u'' literals, so a plain u'' string
# yields the exact same pattern and is portable.
DIACRITIC = re.compile(u'[~_\u0640\u064b-\u065e\u0670]')


def is_diacritic(char):
    """Return True if `char` (a one-character string) is a diacritic,
    tatweel, or dagger alif; False otherwise."""
    return DIACRITIC.match(char) is not None
| 202 | + |
| 203 | + |
# Groups of characters that orthographic normalization freely exchanges:
# two characters count as a "common rewrite" of one another when they
# appear together in any one group (see is_common_rewrite below).
COMMON_REWRITES = [
    u'تة',  # recovered taa marbuta
    u'يىئ',  # normalized Egyptian yaa
    u'وؤ',  # normalized waw hamza
    u'هةو',  # normalized 3sg ending
    HAAS,  # normalized future particle
    ALIFS,  # normalized alifs
    u'اأإئؤقءي',  # normalized various hamzas (written or spoken)
    u'ىهةا',  # normalized words ending in /a/ sound
    u'تثط',  # normalized letters pronounced /t/
    u'دذضظ',  # normalized letters pronounced /d/
    u'سص',  # normalized letters pronounced /s/
    u'زذظ',  # normalized letters pronounced /z/
]
| 218 | + |
def is_common_rewrite(seg, raw):
    """True when seg equals raw, or when their final characters belong to
    a shared normalization group in COMMON_REWRITES."""
    if not seg or not raw:
        return False
    if seg == raw:
        return True
    s, r = seg[-1], raw[-1]
    return any(s in group and r in group for group in COMMON_REWRITES)
| 226 | + |
| 227 | + |
def is_alaa_normalization(seg, raw):
    """True when a truncated 3lY (raw ending in ع or عل) was expanded to
    the full على on the segmented side."""
    raw_truncated = raw.endswith((u'ع', u'عل'))
    return raw_truncated and seg.endswith(u'على')
| 231 | + |
| 232 | + |
def norm_endswith(string, target_ending, norm_group):
    '''
    Return True if `string` ends with `target_ending`, ignoring differences
    between characters in `norm_group`.  Otherwise return False.

    Fixes: the first parameter was named `str`, shadowing the builtin
    (renamed; the only caller passes positionally).  An empty
    `target_ending` previously tripped an internal assert on non-empty
    input; it now vacuously returns True.
    '''
    if len(string) < len(target_ending):
        return False
    source_ending = string[-len(target_ending):] if target_ending else u''
    # Characters match if identical, or if both sit in the norm group.
    return all(s == t or (s in norm_group and t in norm_group)
               for s, t in zip(source_ending, target_ending))
0 commit comments