In [1]:
import copy
import json
from aligner_word import Aligner
import re
from camel_tools.utils.normalize import (
    normalize_alef_ar,
    normalize_alef_maksura_ar,
    normalize_teh_marbuta_ar
)
aligner = Aligner()

In [2]:
def read_data(path):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one string per line of the file, each passed through
        ``str.strip()``. Blank lines are kept as empty strings.
    """
    # Decode explicitly as UTF-8: the files read by this notebook hold Arabic
    # text, and the platform default encoding is not guaranteed to be UTF-8.
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f]

In [3]:
def read_alignment(path):
    """Read an alignment file into a list of examples.

    Each non-blank line is stripped, split on tabs, and turned into one row
    whose last field is parsed as a Python literal (e.g. ``(3, None)``).
    A blank line closes the current example.

    Note that the whole line is stripped *before* splitting, so a leading
    empty field (a line starting with a tab) is dropped and that row ends up
    one element shorter than the others.

    Args:
        path: Path of the tab-separated alignment file.

    Returns:
        A list of examples; each example is a list of rows; each row is a
        list of the string fields followed by the parsed last field.

    Raises:
        ValueError, SyntaxError: If the last field of a row is not a valid
            Python literal.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import ast

    examples = []
    example = []
    with open(path, mode='r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                data = line.split('\t')
                # literal_eval instead of eval: the field comes straight from
                # a file, and eval would execute any expression found there.
                example.append(data[:-1] + [ast.literal_eval(data[-1])])
            else:
                examples.append(example)
                example = []

    # adding the last example (file without a trailing blank line)
    if example:
        examples.append(example)
    return examples

In [4]:
class Range:
    """A run of alignment rows together with the indices that delimit it.

    Attributes are stored exactly as given:
        start: index stored as the beginning of the range.
        end: index stored as the end of the range.
        ops: the rows belonging to the range.
    """

    def __init__(self, start, end, ops):
        self.start, self.end, self.ops = start, end, ops

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dict."""
        return copy.deepcopy(vars(self))

    def to_json_str(self):
        """Return the attributes as indented JSON, keeping non-ASCII text."""
        as_dict = self.to_dict()
        return json.dumps(as_dict, indent=2, ensure_ascii=False)

    def __repr__(self):
        return str(self.to_dict())
    
    
def sort_range(v):
    """Build a sort key from the span stored in the last element of a row.

    Args:
        v: A sequence whose last element is a two-item span.

    Returns:
        ``[k, k]`` where ``k`` is the second span item, or the first span
        item when the second one is ``None``.
    """
    span = v[-1]
    # Identity check: `is None` is the correct test for the None singleton.
    key = span[0] if span[1] is None else span[1]
    return [key, key]

def capture_bug(alignment):
    """Collect runs of two or more consecutive edit rows from one alignment.

    A row counts as an edit when ``'DELETE'`` or ``'REPLACE'`` is found in
    ``row[2]``, or ``'INSERT'`` is found in ``row[1]``.

    Args:
        alignment: Sequence of alignment rows for one example.

    Returns:
        A list of ``Range`` objects. For each run, ``start`` is the index just
        before the run (``-1`` when the run begins at index 0), ``end`` is the
        index just after it, and ``ops`` maps each row index to its row, in
        alignment order.
    """
    def _is_edit(row):
        return 'DELETE' in row[2] or 'REPLACE' in row[2] or 'INSERT' in row[1]

    spans = []
    total = len(alignment)
    pos = 0

    while pos < total:
        anchor = pos - 1
        run = {}

        # consume the whole run of consecutive edits starting at `pos`
        while pos < total and _is_edit(alignment[pos]):
            run[pos] = alignment[pos]
            pos += 1

        if not run:
            # current row is not an edit: step over it
            pos += 1
        elif len(run) > 1:
            # keep the anchors on both sides together with the edits
            spans.append(Range(anchor, pos, run))
        # a run of exactly one edit is ignored; `pos` has already moved past it

    return spans

def perform_align(inputs):
    """Run the module-level ``aligner`` on ``inputs``.

    Args:
        inputs: Sequence whose items are unpacked as the positional arguments
            of ``aligner.align``.

    Returns:
        A tuple ``(inputs[0], aligner.align(*inputs))``.
    """
    first_input = inputs[0]
    aligned = aligner.align(*inputs)
    return first_input, aligned

In [5]:
def normalize_str(s):
    """Apply the three imported camel_tools normalizers to ``s`` in order.

    The order is alef, then alef maksura, then teh marbuta.
    """
    normalizers = (
        normalize_alef_ar,
        normalize_alef_maksura_ar,
        normalize_teh_marbuta_ar,
    )
    for normalizer in normalizers:
        s = normalizer(s)
    return s

def clean_potential_merge(src, tgt):
    """Merge adjacent source tokens whose concatenation appears in the target.

    Both token lists are normalized with ``normalize_str`` before comparing.
    Walking left to right, whenever two neighbouring normalized source tokens
    glued together equal some normalized target token, the glued token is
    emitted and both source tokens are consumed. Otherwise the original
    source token is emitted unchanged.

    NOTE(review): a merged token is emitted in its *normalized* form, while
    unmerged tokens keep their original spelling — confirm this is intended.

    Args:
        src: List of source tokens.
        tgt: List of target tokens.

    Returns:
        The new list of source tokens.
    """
    norm_src = normalize_str(' '.join(src)).split()
    norm_tgt = normalize_str(' '.join(tgt)).split()

    # candidate k is normalized token k glued to normalized token k + 1
    glued = [left + right for left, right in zip(norm_src, norm_src[1:])]

    merged_src = []
    pos = 0
    while pos < len(glued):
        candidate = glued[pos]
        if candidate in norm_tgt:
            merged_src.append(candidate)
            pos += 2
        else:
            merged_src.append(src[pos])
            pos += 1

    # whatever was not reached through the pair scan is copied as-is
    merged_src.extend(src[pos:])
    return merged_src

def construct_src_tgt(bug):
    """Rebuild source and target strings from the rows of a captured span.

    Rows are read from ``bug.ops.values()``:
      * ``row[2] == 'DELETE'``  -> ``row[0]`` goes to the source side;
      * ``row[1] == 'INSERT'``  -> ``row[0]`` goes to the target side;
      * ``row[2] == 'REPLACE'`` -> ``row[0]`` to source, ``row[1]`` to target.
    Any other row is ignored. The source tokens are then passed through
    ``clean_potential_merge``.

    Returns:
        A ``(src, tgt)`` tuple of strings with tokens joined by ``' S '``.
    """
    src_words, tgt_words = [], []

    for row in bug.ops.values():
        if row[2] == 'DELETE':
            src_words.append(row[0])
            continue
        if row[1] == 'INSERT':
            tgt_words.append(row[0])
            continue
        if row[2] == 'REPLACE':
            src_words.append(row[0])
            tgt_words.append(row[1])

    merged_src = clean_potential_merge(src_words, tgt_words)

    separator = ' S '
    return separator.join(merged_src), separator.join(tgt_words)

In [134]:
def clean_alignment(src_align, tgt_align):
    """Strip both aligned token lists, glue adjacent target words, and print.

    Every token is stripped of surrounding whitespace. The target list is then
    scanned left to right: when a token and its right neighbour are both
    "words" (neither ``'S'`` nor the empty string) they are concatenated into
    one token and both are consumed; otherwise the token is copied unchanged.

    The function prints the stripped source list followed by the processed
    target list and returns ``None``.

    Args:
        src_align: Source-side tokens of the alignment.
        tgt_align: Target-side tokens of the alignment.
    """
    src_tokens = [token.strip() for token in src_align]
    tgt_tokens = [token.strip() for token in tgt_align]

    def _is_word(token):
        # 'S' and '' are the two values that must never be glued
        return token != 'S' and token != ''

    glued_tgt = []
    pos = 0
    last_pair_start = len(tgt_tokens) - 1

    while pos < last_pair_start:
        left, right = tgt_tokens[pos], tgt_tokens[pos + 1]
        if _is_word(left) and _is_word(right):
            glued_tgt.append(left + right)
            pos += 2
        else:
            glued_tgt.append(left)
            pos += 1

    # copy any token the pair scan did not reach
    glued_tgt.extend(tgt_tokens[pos:])

    print(src_tokens)
    print(glued_tgt)
    

In [135]:
# All three inputs live in the same directory; keep the location in one place
# so switching to another split or machine is a single edit.
ZAEBUC_DEV_DIR = '/scratch/ba63/gec/data/ZAEBUC-v1.0/data/ar/dev'

basic_alignment = read_alignment(ZAEBUC_DEV_DIR + '/dev.alignment')
# presumably the raw and corrected sentences, judging by the file names — TODO confirm
src = read_data(ZAEBUC_DEV_DIR + '/dev.sent.raw.pnx.tok')
cor = read_data(ZAEBUC_DEV_DIR + '/dev.sent.cor.pnx.tok')

In [136]:
# basic_alignment = read_alignment('/scratch/ba63/gec/data/QALB-0.9.1-Dec03-2021-SharedTasks/data'\
#                                  '/2014/tune/tune.alignment')
# src = read_data('/scratch/ba63/gec/data/QALB-0.9.1-Dec03-2021-SharedTasks/data/'\
#                 '2014/tune/QALB-2014-L1-Tune.sent.no_ids.clean')
# cor = read_data('/scratch/ba63/gec/data/QALB-0.9.1-Dec03-2021-SharedTasks/data/'\
#                 '2014/tune/QALB-2014-L1-Tune.cor.no_ids')

In [145]:
# NOTE(review): `bug_ex` is assigned by the `capture_bug(basic_alignment[3])`
# cell that appears later in this notebook; on a fresh kernel this cell fails
# with NameError unless that cell has been run first.
bug_ex

[{'start': 0, 'end': 4, 'ops': {1: ['الجتماعي', 'الاجتماعي', 'REPLACE', (2, 2)], 2: ['هي', '', 'DELETE', (3, None)], 3: ['الان', 'الآن', 'REPLACE', (4, 3)]}},
 {'start': 4, 'end': 7, 'ops': {5: ['احدى', 'أحد', 'REPLACE', (6, 5)], 6: ['اكثر', 'أكثر', 'REPLACE', (7, 6)]}},
 {'start': 14, 'end': 17, 'ops': {15: ['اأجتماعي', 'الاجتماعي', 'REPLACE', (16, 15)], 16: ['لهو', 'له', 'REPLACE', (17, 16)]}},
 {'start': 21, 'end': 24, 'ops': {22: ['و', '', 'DELETE', (23, None)], 23: ['الفرد', 'والفرد', 'REPLACE', (24, 22)]}},
 {'start': 28, 'end': 33, 'ops': {29: ['ال', 'الوحدة', 'REPLACE', (30, 28)], 30: ['حده', '', 'DELETE', (31, None)], 31: ['و', '', 'DELETE', (32, None)], 32: ['الكآبه', 'والكآبة', 'REPLACE', (33, 29)]}},
 {'start': 33, 'end': 36, 'ops': {34: ['و', '', 'DELETE', (35, None)], 35: ['في', 'وفي', 'REPLACE', (36, 31)]}},
 {'start': 46, 'end': 53, 'ops': {47: ['العائله', 'العائلة', 'REPLACE', (48, 43)], 48: ['و', '', 'DELETE', (49, None)], 49: ['الفرد', 'والفرد', 'REPLACE', (50, 44)],

In [138]:
# src = [', لوكان'.split()]
# tgt = ['. لو كان']
# inputs = list(zip(src, tgt))
# perform_align(inputs[0])

In [144]:
# Re-align every captured span of example 3 and print each intermediate stage.
# NOTE(review): `src` is rebound to a string here, replacing the list loaded
# by the data-loading cell — re-run that cell before reusing the sentences.
bug_ex = capture_bug(basic_alignment[3])
for bug in bug_ex:
    src, tgt = construct_src_tgt(bug)
    # the aligner gets the tokenized source and the target string as one pair
    inputs = [(src.split(), tgt)]
    test_x, test_y = perform_align(inputs[0])

    print(src, tgt, '========', test_x, test_y, '========', sep='\n')
    clean_alignment(test_x, test_y)
    print()

الجتماعي S هي S الان
الاجتماعي S الآن
['الجتماعي', 'S', 'هي', 'S', 'الان']
['الاجتماع', '', 'ي', ' S ', 'الآن']
['الجتماعي', 'S', 'هي', 'S', 'الان']
['الاجتماع', '', 'ي', 'S', 'الآن']

احدى S اكثر
أحد S أكثر
['احدى', 'S', 'اكثر']
['أحد', ' S ', 'أكثر']
['احدى', 'S', 'اكثر']
['أحد', 'S', 'أكثر']

اأجتماعي S لهو
الاجتماعي S له
['اأجتماعي', 'S', 'لهو']
['الاجتماعي ', 'S', ' له']
['اأجتماعي', 'S', 'لهو']
['الاجتماعي', 'S', 'له']

والفرد
والفرد
['والفرد']
['والفرد']
['والفرد']
['والفرد']

ال S حده S والكابه
الوحدة S والكآبة
['ال', 'S', 'حده', 'S', 'والكابه']
['ال', 'و', 'حدة', ' S ', 'والكآبة']
['ال', 'S', 'حده', 'S', 'والكابه']
['الو', 'حدة', 'S', 'والكآبة']

وفي
وفي
['وفي']
['وفي']
['وفي']
['وفي']

العائله S والفرد S وبين S الاشخاص
العائلة S والفرد S وبين S الأشخاص
['العائله', 'S', 'والفرد', 'S', 'وبين', 'S', 'الاشخاص']
['العائلة', ' S ', 'والفرد', ' S ', 'وبين', ' S ', 'الأشخاص']
['العائله', 'S', 'والفرد', 'S', 'وبين', 'S', 'الاشخاص']
['العائلة', 'S', 'والفرد', 'S', 'وبين', 'S', 'الأشخاص

In [142]:
# Show the source/target strings left over from the last processed span.
print(src, tgt, sep='\n')

, S ايضا
. S أيضا


In [132]:
# print(test_x)
# print(test_y)

['والمجتمع']
['والمجتمع S .']
