In [4]:
import json
from pathlib import Path

# Read the JSON file of wrong files and report how many entries it contains.
wrong_file_path = Path("wrong_files_old.json")
data = json.loads(wrong_file_path.read_text())
print(len(data))

# Directories holding the SVG renderings with wrong ids and with right ids.
wrong_svg_path = Path("/home/ptorras/DATA/DoloresDB/Transcriptions/WrongSVG")
right_svg_path = Path("/home/ptorras/DATA/DoloresDB/Transcriptions/SVG")

817


In [9]:
from lxml import etree

# XML namespaces passed to the element queries below.
NAMESPACES = {
    "svg": "http://www.w3.org/2000/svg",
    "xlink": "http://www.w3.org/1999/xlink",
    "mei": "http://www.music-encoding.org/ns/mei",
}

# Pair every SVG in `right_svg_path` with the same-named file in
# `wrong_svg_path`, walk both trees in parallel and record, per file, which
# wrong id corresponds to which right id.
id_mappings = {}  # file-> dict[wrong id, right id]

matched = 0
unmatched = 0
different = 0
for right_svg_file in right_svg_path.glob("*.svg"):
    wrong_svg_file = wrong_svg_path / right_svg_file.name
    if not wrong_svg_file.exists():
        # No counterpart to compare against: count it and move on instead of
        # falling through and trying to parse a file that is not there.
        unmatched += 1
        continue
    matched += 1

    right_svg_root = etree.parse(right_svg_file).getroot()
    wrong_svg_root = etree.parse(wrong_svg_file).getroot()

    right_nodes = right_svg_root.findall(".//*", namespaces=NAMESPACES)
    wrong_nodes = wrong_svg_root.findall(".//*", namespaces=NAMESPACES)

    file_mapping = {}
    # NOTE(review): zip() stops at the shorter node list, so trailing nodes of
    # the longer tree are never compared -- confirm this is acceptable.
    for ii, (right_node, wrong_node) in enumerate(zip(right_nodes, wrong_nodes)):
        if right_node.tag != wrong_node.tag:
            # The trees diverge: positional pairing is no longer trustworthy,
            # so the whole file is discarded. Each side reports its own line.
            print(
                f"Node tags are different on file {right_svg_file.name} position {ii} and line: L({right_node.sourceline}) > {right_node.tag} vs L({wrong_node.sourceline}) > {wrong_node.tag}")
            different += 1
            break
        if "id" in right_node.attrib and "id" in wrong_node.attrib:
            file_mapping[wrong_node.attrib["id"]] = right_node.attrib["id"]
    else:
        # Only register the mapping when every compared node pair agreed.
        id_mappings[right_svg_file.name] = file_mapping

print(f"Matched: {matched} -- Unmatched {unmatched}")
print(f"There are {different} different SVG files.")

Node tags are different on file XAC_ACAN_SMIAu63_098.10.svg position 85 and line: L(128) > {http://www.w3.org/2000/svg}path vs L(128) > {http://www.w3.org/2000/svg}use
Node tags are different on file XAC_ACGAX_SEOAu942.08_016.05.svg position 54 and line: L(81) > {http://www.w3.org/2000/svg}g vs L(81) > {http://www.w3.org/2000/svg}path
Node tags are different on file XAC_ACAN_SMIAu83_011.09.svg position 18 and line: L(29) > {http://www.w3.org/2000/svg}style vs L(29) > {http://www.w3.org/2000/svg}symbol
Node tags are different on file XAC_ACGAX_SEOAu942.06_003.06.svg position 28 and line: L(43) > {http://www.w3.org/2000/svg}symbol vs L(43) > {http://www.w3.org/2000/svg}style
Node tags are different on file XAC_ACAN_SMIAu83_086.01.svg position 304 and line: L(454) > {http://www.w3.org/2000/svg}g vs L(454) > {http://www.w3.org/2000/svg}use
Node tags are different on file XAC_ACAN_SMIAu37_007.05.svg position 597 and line: L(930) > {http://www.w3.org/2000/svg}path vs L(930) > {http://www.w3.

In [12]:
# Save the per-file id mapping as indented JSON.
with open("./mapping.json", "w") as mapping_file:
    mapping_file.write(json.dumps(id_mappings, indent=4))

In [None]:
# Display the wrong-id -> right-id mapping collected for one SVG file.
# Raises KeyError if no mapping was kept for this file name.
id_mappings["XAC_ACGAX_SEOAu455.01_004.09.svg"]

In [18]:
import re

# File names look like "<project>.<two-digit line>.svg".
re_line = re.compile(r"(.+)\.(\d{2})\.svg")

# project json name -> {"line<N>:<wrong id>": "line<N>:<right id>"}
line_mappings = {}

for svg_name, svg_ids in id_mappings.items():
    parsed = re_line.match(svg_name)
    if parsed is None:
        raise ValueError("Malformed filename")

    project_stem, line_digits = parsed.groups()
    prefix = f"line{int(line_digits)}:"

    # Every project gets an entry, even when none of its ids changed.
    project_table = line_mappings.setdefault(f"{project_stem}_final.json", {})
    project_table.update(
        (f"{prefix}{old_id}", f"{prefix}{new_id}")
        for old_id, new_id in svg_ids.items()
        if old_id != new_id
    )

In [20]:
# Save the per-project, line-prefixed id mapping as indented JSON.
with open("./project_mapping.json", "w") as project_mapping_file:
    project_mapping_file.write(json.dumps(line_mappings, indent=4))

In [24]:
# Rewrite every alignment file that has an id translation, replacing the wrong
# annotation ids with the right ones, and store the result under
# `right_annotation_path` with the same file name.
wrong_annotation_path = Path("/home/ptorras/DATA/DoloresDB/Alignments")
right_annotation_path = Path("/home/ptorras/DATA/DoloresDB/CorrectedAlignments")

for alignment_file in wrong_annotation_path.glob("*.json"):
    # Check for a translation first so files without one are never parsed.
    if alignment_file.name not in line_mappings:
        continue

    translation = line_mappings[alignment_file.name]

    with open(alignment_file) as f_in:
        data = json.load(f_in)

    # Build fresh annotation dicts instead of a shallow copy, so `data` keeps
    # the ids exactly as they were loaded. Key order is preserved, so the
    # written JSON has the same layout as before.
    new_data = {
        **data,
        "annotations": [
            {**ann, "id": translation.get(ann["id"], ann["id"])}
            for ann in data["annotations"]
        ],
    }

    # Create the output directory on demand; a no-op when it already exists.
    right_annotation_path.mkdir(parents=True, exist_ok=True)
    with open(right_annotation_path / alignment_file.name, "w") as f_out:
        json.dump(new_data, f_out, indent=4)


In [27]:
# Load the corrected wrong-ids report and show how many top-level entries it has.
wrong_ids = json.loads(Path("./wrong_ids_corrected.json").read_text())
len(wrong_ids)

309

In [31]:
# Number of entries stored under the "errors" key for one project file.
len(wrong_ids["XAC_ACUR_TagFAu128_029_final.json"]["errors"])

183

In [26]:
# Display the line-prefixed wrong-id -> right-id table built for one project.
line_mappings["XAC_ACGAX_SEOAu942.02_004_final.json"]

{'line4:E0A3-1fpfrpq': 'line4:E0A3-ioquh7',
 'line4:E083-1fpfrpq': 'line4:E083-ioquh7',
 'line4:E084-1fpfrpq': 'line4:E084-ioquh7',
 'line4:E0A4-1fpfrpq': 'line4:E0A4-ioquh7',
 'line4:E261-1fpfrpq': 'line4:E261-ioquh7',
 'line4:E240-1fpfrpq': 'line4:E240-ioquh7',
 'line4:E4E6-1fpfrpq': 'line4:E4E6-ioquh7',
 'line4:E220-1fpfrpq': 'line4:E220-ioquh7',
 'line4:E4A3-1fpfrpq': 'line4:E4A3-ioquh7',
 'line4:s17i2tnt': 'line4:s1dnlne5',
 'line4:l1dhaef6': 'line4:l1wyxf0p',
 'line4:t1k7hdsy': 'line4:t10hiki3',
 'line4:s115fo85': 'line4:snfsmir',
 'line4:m1s31hvy_1': 'line4:time1_1',
 'line4:lru7x7h': 'line4:l1nc8ggw',
 'line4:smn6wjv': 'line4:note1.stem',
 'line4:s1mird70': 'line4:note2.stem',
 'line4:ss9sibq': 'line4:note3.stem',
 'line4:aty9ko': 'line4:note3.articulation1.staccato',
 'line4:s1ilalui': 'line4:note4.stem',
 'line4:bi4mh1c': 'line4:pP1_m1.right_barline',
 'line4:bi4mh1c.barline_tok1': 'line4:pP1_m1.right_barline.barline_tok1',
 'line4:s74r6hb': 'line4:s1okl5go',
 'line4:lia6veo'