In [260]:
import rules
import json
from collections import Counter

In [261]:
def break_rule(rule):
    """Split a rule into one serialized single-edit rule per edit.

    Every entry of ``rule.edits`` is JSON-encoded on a single line (non-ASCII
    characters are kept verbatim, mapping keys are sorted) and embedded in the
    ``["CharEditScriptRule", {"edits": [<edit>]}, {}]`` string form.

    Returns:
        A list of strings, one per edit, in the same order as ``rule.edits``.
        The list is empty when the rule has no edits.
    """
    # Lazily encode each edit; the wrapping happens in the comprehension below.
    encoded_edits = (
        json.dumps(single_edit, ensure_ascii=False, sort_keys=True, indent=None)
        for single_edit in rule.edits
    )
    return [
        f'["CharEditScriptRule", {{"edits": [{encoded}]}}, {{}}]'
        for encoded in encoded_edits
    ]

In [270]:
def get_stats(rules):
    """Print statistics on single-edit vs. multi-edit rules and the pruned rule list.

    Rules with at most one edit are treated as "single" rules; all others are
    "multi" rules and are split into single-edit rule strings via
    ``break_rule``. The function then reports how many of those split rules
    already exist among the single rules, and prints the pruned set: every
    single rule string followed by each previously unseen split rule once,
    in first-seen order.

    Args:
        rules: A sized collection of rule objects. Each must expose ``edits``
            and ``to_str()`` and be hashable (they are placed in a ``set`` for
            the duplicate check). Note that this parameter name shadows the
            imported ``rules`` module inside the function body.

    Returns:
        None. All results are written to stdout.

    Raises:
        AssertionError: If the single or the multi rules contain duplicates.
    """
    print(f'There are {len(rules)} in total')
    single_rules = []
    single_rules_str = []
    multi_rules = []
    broken_rules = []

    # Partition into single-edit and multi-edit rules.
    for rule in rules:
        if len(rule.edits) <= 1:
            single_rules.append(rule)
            single_rules_str.append(rule.to_str())
        else:  # multi-edit rule: also record its single-edit pieces
            multi_rules.append(rule)
            broken_rules.extend(break_rule(rule))

    # The input is expected to contain no duplicate rules.
    assert len(single_rules) == len(set(single_rules))
    assert len(multi_rules) == len(set(multi_rules))

    print(f'There are {len(single_rules)} single rules')
    print(f'There are {len(multi_rules)} multi rules')
    print('----------------------')

    print(f'Breaking multi rules resulted in {len(broken_rules)} rules ({len(set(broken_rules))} types)')

    # Use a set for membership: checking against the list made this step
    # quadratic in the number of rules.
    known_single = set(single_rules_str)
    broken_oov = [rule for rule in broken_rules if rule not in known_single]
    broken_in = [rule for rule in broken_rules if rule in known_single]

    print(f'Out of the rules we broke, {len(broken_oov)} ({len(set(broken_oov))} types)'
          f' did not appear as single rules')
    print(f'Out of the rules we broke, {len(broken_in)} ({len(set(broken_in))} types)'
          f' appear as single rules')

    # Out of the rules we broke, we should discard the ones we've already seen
    # and keep the ones we haven't seen.
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # printed list is identical across runs (set iteration order for strings
    # varies with hash randomisation).
    pruned_rules = single_rules_str + list(dict.fromkeys(broken_oov))
    print(f'We have {len(pruned_rules)} single rules after pruning')
    for x in pruned_rules:
        print(x)

In [273]:
# Read the rule list file: one serialized rule per line, parsed into Rule objects.
rules_list = []
with open('/scratch/ba63/gec/rule-based-data/MIX/3/RULES.LIST', "r", encoding="utf-8") as rules_file:
    # Strip only the trailing newline so any other whitespace in a rule is preserved.
    rules_list.extend(
        rules.Rule.from_str(raw_line.rstrip("\n")) for raw_line in rules_file
    )

In [274]:
# Print the single/multi rule breakdown and the pruned rule list for the loaded rules.
get_stats(rules_list)

There are 2647 in total
There are 1169 single rules
There are 1478 multi rules
----------------------
Breaking multi rules resulted in 3055 rules (427 types)
Out of the rules we broke, 195 (104 types) did not appear as single rules
Out of the rules we broke, 2860 (323 types) appear as single rules
We have 1273 single rules after pruning
["CharEditScriptRule", {"edits": []}, {}]
["CharEditScriptRule", {"edits": [["replace_all", "[UNCORRECTABLE_ERROR]"]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -2, -1]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -3, -1]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -3, -2]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -4, -1]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -4, -2]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -4, -3]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -5, -1]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -5, -4]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -7, -6]]}, {

In [259]:
# Sanity check against the printed statistics: 1169 single rules plus the 104
# split-rule types not seen as single rules should equal the pruned total (1273).
1169 + 104

1273

In [147]:
# Split one rule into its single-edit rule strings.
# NOTE(review): `single_rules` is not defined at notebook scope in the visible
# cells (it only exists as a local inside get_stats), so this cell appears to
# rely on leftover kernel state — confirm before re-running on a fresh kernel.
x = break_rule(single_rules[0])

In [150]:
# Round-trip check: parse the first split rule string back into a rule object.
rules.Rule.from_str(x[0])

<rules.CharEditScriptRule at 0x15552a144d60>

In [146]:
# Show the serialized form of the first rule in `single_rules`.
# NOTE(review): `single_rules` is not defined at notebook scope in the visible cells — confirm its origin.
single_rules[0].to_str()

'["CharEditScriptRule", {"edits": [["delete", -3, -2], ["insert", -1, " ،"]]}, {}]'

In [16]:
# Print the serialized form of every rule in `x_rules`.
# NOTE(review): `x_rules` is not defined in any visible cell, and this loop
# rebinds `x`, which another cell uses for break_rule output — confirm intent.
for x in x_rules:
    print(x.to_str())

["CharEditScriptRule", {"edits": [["delete", -3, -2], ["insert", -1, " ،"]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -3, -2], ["insert", -1, " ؟"]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -3, -2], ["insert", -1, "أ"]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -3, -2], ["insert", -1, "ا"]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -3, -2], ["insert", -1, "ت"]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -3, -2], ["insert", -1, "ذا ."]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -3, -2], ["insert", -1, "ذا ،"]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -3, -2], ["insert", -1, "ذا"]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -4, -3], ["delete", -2, -1]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -4, -3], ["insert", -1, " ."]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -4, -3], ["insert", -1, " ،"]]}, {}]
["CharEditScriptRule", {"edits": [["delete", -4, -3], ["replace", -2, -1, "ى"]]}, {}]
["CharEditScr