stanfordnlp
diff --git a/‎scripts/arabic-segmenter/edits.py
+244 b/‎scripts/arabic-segmenter/edits.py
+244
diff --git a/‎scripts/arabic-segmenter/integrated_to_gold
+90 b/‎scripts/arabic-segmenter/integrated_to_gold
+90
@@ -0,0 +1,244 @@
+#!/usr/bin/env python2.7
+# -*- coding: utf-8 -*-
+
+import re
+
+from utf8utils import uprint
+
+
+# Edit label used by get_edits when a character is unchanged and no
+# segmentation is added.
+NOSEG = '<noseg>'
+# Character that marks a segment boundary in the segmented text.
+SEG_MARKER = ':'
+# Edit label used by get_edits for an ordinary segmentation boundary (the
+# marker padded with three spaces on each side).
+SEG = '   %s   ' % SEG_MARKER
+
+# Character classes used by the edit rules below.
+LONG_VOWELS = u'ايوى'
+ALIFS = u'اأإٱآ'
+HAAS = u'هح'
+
+
+def get_edits(line, options):
+    if '\t' not in line:
+        if options.verbose:
+            uprint("ignoring line that doesn't have two parts:")
+            uprint('    ' + repr(line))
+        return
+    raw, seg = line.split('\t')
+
+    # Special cases:
+    # - an odd edit with no segmentations [e.g. ع -> على]
+    if raw != seg and SEG_MARKER not in seg:
+        return [u'<other>'] * len(raw)
+    # - token deleted
+    if seg == '':
+        return [u' <del> '] * len(raw)
+    # - nothing on the raw side
+    if raw == '':
+        if options.verbose:
+            uprint("ignoring line with empty raw text:")
+            uprint('    ' + repr(line))
+        return
+
+    edits = []
+
+    last_raw = ''
+    last_seg = ''
+    while len(raw) != 0:
+        # Possible edits, in order that they are searched for:
+        #    :+Al // li + definite article + word starting with l
+        if raw.endswith(u'لل') and seg.endswith(u'ل%sالل' % SEG_MARKER):
+            edits.append(u'   %s+ال' % SEG_MARKER)
+            seg = seg[:-3]
+        #  +A:+A  // mA + A... verbal negation spelled as just m
+        elif is_ma_alif(seg, raw):
+            edits.append(u' +ا%s+ا ' % SEG_MARKER)
+            seg = seg[:-3]
+        #   x:x   // shadda breaking: character duplicated on either side of
+        #            segmentation
+        #   x>xx  // shadda breaking: character duplicated, no segmentation
+        elif is_shadda(seg, raw):
+            if seg.endswith(SEG_MARKER + raw[-1]):
+                edits.append(u'  x:x  ')
+                seg = seg[:-2]
+            else:
+                assert seg.endswith(raw[-1] * 2), repr(seg + '\t' + raw)
+                edits.append(u'  x>xx ')
+                seg = seg[:-1]
+        #    :+x  // added an letter after segmentation (alif for
+        #            li + definite article, noon for recovered first person
+        #            prefix or y -> ny in dialect)
+        elif is_seg_plus(seg, raw):
+            edits.append(u'   %s+%s ' % (SEG_MARKER, seg[-2]))
+            seg = seg[:-2]
+        #  +x:    // added a letter before segmentation (usually noon, for
+        #            plurals, mim~A, Al~A, etc.)
+        elif is_plus_seg(seg, raw):
+            edits.append(u' +%s%s   ' % (seg[-3], SEG_MARKER))
+            seg = seg[:-2]
+        #  <del>  // deleted lengthening effect (yAAAAAA -> yA)
+        elif is_lengthening(seg, raw, last_raw):
+            edits.append(u' <del> ')
+            seg += u' '
+        #    :    // ordinary segmentation boundary
+        elif seg.endswith(SEG_MARKER + raw[-1]):
+            edits.append(SEG)
+            seg = seg[:-1]
+        # <noseg> // character doesn't change, no segmentation added
+        elif len(seg) != 0 and seg[-1] == raw[-1]:
+            edits.append(NOSEG)
+        # <other> // normalized E or El to ElY
+        elif is_alaa_normalization(seg, raw):
+            edits.append(u'<other>')
+            seg = seg[:-2]
+            if raw[-1] != u'ع':
+                assert raw[-2] == u'ع'
+                seg = seg + ' '
+        #  +V:    // added a long vowel (verbal or dialect -wA ending, jussive
+        #            normalization)
+        elif len(seg) >= 2 and seg[-2] == raw[-1] and seg[-1] in LONG_VOWELS:
+            if len(seg) >= 3 and seg[-3] == SEG_MARKER:
+                edits.append(u'   %s+%s ' % (SEG_MARKER, seg[-1]))
+                seg = seg[:-2]
+            else:
+                edits.append(u'   +%s  ' % seg[-1])
+                seg = seg[:-1]
+        #   y:+h  // recover dialectal silent haa after segmentation
+        elif seg.endswith(u'ي' + SEG_MARKER + u'ه') and raw.endswith(u'ي'):
+            edits.append(u'  ي%s+ه ' % SEG_MARKER)
+            seg = seg[:-2]
+        #  <del>  // deleted a long vowel (dialect ending normalization: mostly
+        #            -kwA -> -kw and -kY -> -k) or dialectal silent haa
+        elif (len(raw) >= 2 and norm_endswith(seg, raw[-2], HAAS) and
+              raw[-1] in LONG_VOWELS + u'ه'):
+            edits.append(u' <del> ')
+            seg += u' '
+        #  <del>  // deleted diacritic
+        elif is_diacritic(raw[-1]):
+            edits.append(u' <del> ')
+            seg += u' '
+        # x>y:    // change x to y after a segment boundary
+        elif (len(seg) >= 2 and seg[-2] == SEG_MARKER and
+              is_common_rewrite(seg, raw)):
+            edits.append(u'   %s%s>%s ' % (SEG_MARKER, raw[-1], seg[-1]))
+            seg = seg[:-1]
+        #   x>y   // change x to y without a segmentation (orthography
+        #            normalization)
+        elif is_common_rewrite(seg, raw):
+            edits.append(u'  %s>%s  ' % (raw[-1], seg[-1]))
+        else:
+            if options.verbose:
+                uprint('ignoring line with unknown edit:')
+                uprint('    ' + line)
+                uprint('(seg = %s; raw = %s)' % (seg, raw))
+                uprint('(edits = %s)' % edits)
+            return
+        last_raw = raw[-1]
+        seg = seg[:-1]
+        last_seg = raw[-1]
+        raw = raw[:-1]
+
+    if len(seg) != 0:
+        if options.verbose:
+            uprint('ignoring line with unknown edit:')
+            uprint('    ' + line)
+            uprint('(extra seg: %s)' % seg)
+            uprint('(edits = %s)' % edits)
+        return
+
+    edits.reverse()
+    return edits
+
+
+def is_ma_alif(seg, raw):
+    return (len(seg) >= 5 and len(raw) >= 2 and
+            is_common_rewrite(seg[-1], raw[-1]) and
+            raw[-2] == u'م' and
+            seg[-5:-1] == u'ما%sا' % SEG_MARKER)
+
+
+def is_seg_plus(seg, raw):
+    return (len(seg) >= 4 and len(raw) >= 2 and
+            is_common_rewrite(seg[-1], raw[-1]) and
+            seg[-2] != raw[-2] and
+            seg[-2] in u'اني' and
+            seg[-3] == SEG_MARKER and
+            is_common_rewrite(seg[-4], raw[-2]))
+
+
+def is_plus_seg(seg, raw):
+    return (len(seg) >= 4 and len(raw) >= 2 and
+            is_common_rewrite(seg[-1], raw[-1]) and
+            seg[-2] == SEG_MARKER and
+            seg[-3] != raw[-2] and
+            seg[-3] in u'ان' and
+            is_common_rewrite(seg[-4], raw[-2]))
+
+
+def is_shadda(seg, raw):
+    seg = seg.replace(SEG_MARKER, '')
+    if len(raw) == 0 or not seg.endswith(raw[-1]):
+        return False
+    last = seg[-1]
+    for i in range(2, min(len(seg) + 1, len(raw) + 1)):
+        if seg[-i] != last: return False
+        if seg[-i] != raw[-i]: return True
+    # equal through the min of the two lengths, so check if it's
+    # a beginning-of-word shadda
+    return seg == raw[-1] + raw
+
+
+def is_lengthening(seg, raw, last):
+    seg = seg.replace(SEG_MARKER, '')
+    if len(raw) < 2 or len(seg) == 0: return False
+    if raw[-1] != raw[-2]: return False
+    if raw[-1] != seg[-1]: return False
+    if len(seg) >= 2 and raw[-1] == seg[-2]: return False
+    return True
+
+
+# Single-character class covering '~', '_', tatweel and the diacritic code
+# points annotated below.
+DIACRITIC = re.compile(ur'[~_\u0640\u064b-\u065e\u0670]')
+#                           tatweel            dagger alif
+#                                 most diacritics
+def is_diacritic(char):
+    '''
+    Return True if `char` starts with a character in the DIACRITIC class.
+
+    The pattern is applied with match(), so only the first character of
+    `char` is examined.
+    '''
+    return DIACRITIC.match(char) is not None
+
+
+# Groups of characters that is_common_rewrite treats as interchangeable: two
+# characters count as a common rewrite of each other when some group contains
+# both. A character may appear in several groups.
+COMMON_REWRITES = [
+    u'تة',       # recovered taa marbuta
+    u'يىئ',      # normalized Egyptian yaa
+    u'وؤ',       # normalized waw hamza
+    u'هةو',      # normalized 3sg ending
+    HAAS,        # normalized future particle
+    ALIFS,       # normalized alifs
+    u'اأإئؤقءي', # normalized various hamzas (written or spoken)
+    u'ىهةا',     # normalized words ending in /a/ sound
+    u'تثط',      # normalized letters pronounced /t/
+    u'دذضظ',     # normalized letters pronounced /d/
+    u'سص',       # normalized letters pronounced /s/
+    u'زذظ',      # normalized letters pronounced /z/
+]
+
+def is_common_rewrite(seg, raw):
+    if len(seg) == 0 or len(raw) == 0: return False
+    if seg == raw: return True
+    for group in COMMON_REWRITES:
+        if seg[-1] in group and raw[-1] in group:
+            return True
+    return False
+
+
+def is_alaa_normalization(seg, raw):
+    return ((raw.endswith(u'ع') or raw.endswith(u'عل')) and
+            seg.endswith(u'على'))
+
+
+def norm_endswith(str, target_ending, norm_group):
+    '''
+    Return True if `str` ends with `target_ending`, ignoring differences
+    between characters in `norm_group`. Otherwise return False.
+    '''
+    if len(str) < len(target_ending): return False
+    source_ending = str[-len(target_ending):]
+    assert len(source_ending) == len(target_ending)
+    for s, t in zip(source_ending, target_ending):
+        if s != t and (s not in norm_group or t not in norm_group):
+            return False
+    return True
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+
+# Abort on the first failing command. pipefail extends this to every stage of
+# a pipeline; plain `set -e` only looks at the exit status of the last stage.
+set -e
+set -o pipefail
+
+# Exactly two arguments are required; report misuse with a non-zero status.
+if [ $# -ne 2 ]; then
+    echo "Usage: $(basename "$0") <integrated_file> <output_file>" >&2
+    exit 1
+fi
+
+INPUT=$1
+OUTPUT=$2
+
+# ===== Raw side =====
+
+cat ${INPUT} | \
+# Escape TEDEval special characters
+sed 's/:/#pm#/g' | \
+sed 's/(/#lp#/g' | \
+sed 's/)/#rp#/g' | \
+# Concatenate t: UTF-8 field entries
+awk '
+BEGIN { FS = "·"; ORS = ""; }
+/^t/ { print $9 ($4 == "t" ? "" : "\n"); }
+/^TREE/ { print "\n"; }
+' | \
+# Fix problems that really mess up evaluation:
+# Patch over two badly annotated lines
+# sed 's/^8ـ9$/89/' | \
+sed 's/3-5-\.2/3/' | \
+sed 's/^8\.$/8/' | \
+# Delete all tatweels after numbers
+sed 's/\([0-9]\)ـ/\1/g' | \
+# Split all digits from miscellaneous characters
+sed 's/\([0-9]\)\([^0-9 -]\)/\1\n\2/g' | \
+sed 's/\([^0-9 -]\)\([0-9]\)/\1\n\2/g' > ${OUTPUT}.__raw__
+
+
+# =====Segmented side=====
+# Writes ${OUTPUT}.__segmented__ from field 10 of the input lines that start
+# with "t", joined with ":". Paths are quoted so that names containing spaces
+# or glob characters are passed through intact.
+
+cat "${INPUT}" | \
+# Escape TEDEval special characters
+sed 's/:/#pm#/g' | \
+# Concatenate t: Buckwalter field entries
+awk '
+BEGIN { FS = "·"; ORS = ":"; }
+/^t/ { print $10 ($4 == "t" ? "" : "\n"); }
+/^TREE/ { print "\n"; }
+' | \
+# Remove empty segments
+sed 's/^://' | \
+# Remove errant annotations such as AlY_1 (turn it into AlY)
+sed 's/_[0-9]\+$//' | \
+# Delete all tatweels after numbers
+sed 's/\([0-9]\)ـ/\1/g' | \
+# Split all digits from miscellaneous characters
+sed 's/\([0-9]\)\([^0-9 -]\)/\1\n\2/g' | \
+sed 's/\([^0-9 -]\)\([0-9]\)/\1\n\2/g' | \
+# Normalize alifs
+sed 's/[<>IO{|]/A/g' | \
+# Convert Buckwalter to UTF-8
+java edu.stanford.nlp.international.arabic.Buckwalter | \
+# Undo Buckwaltering of escaped :, escape other TEDEval characters
+sed 's/#ةم#/#pm#/g' | \
+sed 's/(/#lp#/g' | \
+sed 's/)/#rp#/g' > "${OUTPUT}.__segmented__"
+
+# Join columns with tabs
+pr -m -t -s\	 ${OUTPUT}.__raw__ ${OUTPUT}.__segmented__ | \
+# Make sure empty lines are actually empty
+sed 's/^\t$//' | \
+# Remove sentences with no words
+awk '
+BEGIN { empty = 1; }
+{
+  if ($0 == "") {
+    if (!empty) {
+      empty = 1;
+      print;
+    }
+  } else {
+    empty = 0;
+    print;
+  }
+}
+' | \
+# Remove tokens that consist entirely of tatweel or diacritics
+perl -C -ne 'print unless /^[~_\x{0640}\x{064b}-\x{065e}\x{0670}]*\t[~_\x{0640}\x{064b}-\x{065e}\x{0670}]*$/' > $OUTPUT
+
+# Clean up
+rm ${OUTPUT}.__raw__ ${OUTPUT}.__segmented__