# Move Identifier

In [1]:
import os
import collections
from glob import glob
from shutil import rmtree
import xml.etree.ElementTree as ET

In [2]:
# Input and output directories share the same collection sub-folder name.
_COLLECTION = 'de-barch'
TASK = f'../input/{_COLLECTION}'    # directory the source XML files are read from
DEST = f'../output/{_COLLECTION}'   # directory the processed XML files are written to

In [3]:
def cleanDir(path):
    """Make sure ``path`` is an existing, empty directory.

    If something already exists at ``path`` it is deleted with ``rmtree``
    first; afterwards the directory is created again, together with any
    missing parent directories.
    """
    alreadyPresent = os.path.exists(path)
    if alreadyPresent:
        # Drop the previous contents so stale files never survive a re-run.
        rmtree(path)
    os.makedirs(path, exist_ok=True)

In [11]:
# XML namespace of the input documents (the EAD 2002 schema namespace).
NS = 'urn:isbn:1-931666-22-9'
# Value of the ``label`` attribute put on every <unitid> this notebook adds.
EMI = 'ehri_main_identifier'
# Value of the ``type`` attribute that marks a <unitid> as a call number.
TP = 'call number'
# Value of the ``encodinganalog`` attribute put on every added <unitid>.
# NOTE(review): presumably ISAD(G) element 3.1.1 (reference code) - confirm.
EA = '3.1.1'
# Serialise NS as the default namespace so the output has no ``ns0:`` prefixes.
ET.register_namespace('', NS)

# Number of <c> elements seen per ``level`` value.  Module-level on purpose:
# it keeps accumulating over every file passed to moveId().
cLevels = collections.Counter()


def moveId(fileIn, fileOut):
    """Add a fallback ``<unitid>`` to every ``<did>`` that has no call number.

    Every ``<c>`` element of the document is visited.  For each ``<did>``
    that is a direct child of a ``<c>``, the ``<unitid>`` children whose
    ``type`` attribute equals ``TP`` are counted.  When there is none, a new
    ``<unitid>`` is inserted as the first child of that ``<did>``; its text
    is the ``id`` attribute of the enclosing ``<c>`` and it carries the
    ``label`` (``EMI``) and ``encodinganalog`` (``EA``) attributes.

    A three-line summary (single / no / multiple identifiers) is printed and
    the ``level`` of every ``<c>`` is tallied in the module-level ``cLevels``.

    Args:
        fileIn: path of the XML file to read.
        fileOut: path the modified document is written to.

    Raises:
        KeyError: if a ``<c>`` has no ``level`` attribute, or has no ``id``
            attribute while one of its ``<did>`` children needs an identifier.
    """
    cTag = f'{{{NS}}}c'
    didTag = f'{{{NS}}}did'
    unitidTag = f'{{{NS}}}unitid'

    tree = ET.parse(fileIn)
    root = tree.getroot()

    # Each <did> is reached exactly once (through its parent <c>), so plain
    # counters are enough to produce the summary.
    nNoId = 0
    nOneId = 0
    nMultipleId = 0

    for cElem in root.iter(cTag):
        cLevels[cElem.attrib['level']] += 1
        for didElem in cElem.findall(didTag):
            # .get() instead of .attrib[...]: a <unitid> without a ``type``
            # attribute (such as the ones inserted below) is simply not a
            # call number; it must not abort the run with a KeyError.
            ids = [u for u in didElem.findall(unitidTag) if u.get('type') == TP]
            if len(ids) == 1:
                nOneId += 1
            elif ids:
                nMultipleId += 1
            else:
                nNoId += 1
                # Create the element in the document namespace so the
                # in-memory tree stays consistent with its siblings.
                unitidElem = ET.Element(unitidTag)
                unitidElem.text = cElem.attrib['id']
                unitidElem.set('label', EMI)
                unitidElem.set('encodinganalog', EA)
                didElem.insert(0, unitidElem)

    print(f'{nOneId:<4} x <did> with single identifier')
    print(f'{nNoId:<4} x <did> without identifier')
    print(f'{nMultipleId:<4} x <did> with multiple identifiers')

    # Write explicit UTF-8.  ``encoding='unicode'`` is meant for text streams;
    # given a file name, the bytes on disk and the declared encoding then
    # depend on the interpreter version / locale instead of being fixed.
    tree.write(fileOut, encoding='utf-8', xml_declaration=True)

In [12]:
# Start from an empty output directory, then convert every XML file found
# directly under TASK into a file of the same name under DEST.
cleanDir(DEST)
# sorted(): glob() returns matches in arbitrary order; sorting keeps the
# processing log stable between runs.
for path in sorted(glob(f'{TASK}/*.xml')):
    # Only the file name is needed; the directory part is not bound to a
    # variable (the original bound it to ``dir``, hiding the builtin).
    fileName = os.path.basename(path)
    (base, ext) = os.path.splitext(fileName)
    outPath = f'{DEST}/{fileName}'
    print(f'processing {base}')
    moveId(path, outPath)

processing DE-1958_RH_26-18
132  x <did> with single identifier
39   x <did> without identifier
0    x <did> with multiple identifiers


In [10]:
# Report how many <c> elements were seen for each ``level`` value, in the
# order the levels were first encountered.
for (level, amount) in cLevels.items():
    print(f'{amount:>4} x {level}')

   1 x fonds
   7 x class
 132 x file
  13 x series
  18 x subseries
