# Move Identifier

In [1]:
import sys
import os
import collections
import ssl
from glob import glob
from shutil import rmtree
import xml.etree.ElementTree as ET
from urllib.request import urlopen

In [2]:
TEMP = '_temp'
EAD = f'{TEMP}/ead'
EAD_NORM = f'{TEMP}/ead-norm'
EAD_PLUS = f'{TEMP}/ead-plus'
SOURCE_BASE = 'https://open-data.bundesarchiv.de'
SOURCE_BIG = f'{SOURCE_BASE}/all_bestand/best_ov_barch.xml'
SOURCE_ITEM = f'{SOURCE_BASE}/apex-ead/DE-1958_'
CACHE_BIG = f'{TEMP}/de-barch-eads.xml'
IDS = 'info/ids.csv'

In [3]:
def prepare():
    if not os.path.exists(TEMP):
        os.makedirs(TEMP, exist_ok=True)
    #if os.path.exists(EAD):
    #     rmtree(EAD)
    os.makedirs(EAD, exist_ok=True)  
    os.makedirs(EAD_NORM, exist_ok=True)  
    os.makedirs(EAD_PLUS, exist_ok=True)  

In [4]:
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def fetchXML(url, xmlFile):
    result = False
    sys.stdout.write(f'\tget {url} ...')
    try:
        with urlopen(url, context=ctx) as uh:
            xml = uh.read()
        with open(xmlFile, 'wb') as wh:
            wh.write(xml)
        result = True
    except Exception:
        pass
    print('OK' if result else 'Failed')
    return result

In [5]:
def getIds():
    with open(IDS) as fh:
        ids = {line.strip() for line in fh}
    return ids

In [6]:
NS = 'urn:isbn:1-931666-22-9'
EMI = 'ehri_main_identifier'
TP = 'call number'
EA = '3.1.1'
ET.register_namespace('', NS)
ET.register_namespace('xlink', 'http://www.w3.org/1999/xlink')
ET.register_namespace('xsi', 'http://www.w3.org/2001/XMLSchema-instance')

def extractEADS(fetch=False):
    if fetch:
        sys.stdout('Downloading {SOURCE_BIG} ... ')
        fetchXML(SOURCE_BIG, CACHE_BIG)
    ids = getIds()
    found = set()
    missing = set()
    print(f'Expect {len(ids)} EADs')
    if not os.path.exists(CACHE_BIG):
        print(f'No file {CACHE_BIG}')
        return
    tree = ET.parse(CACHE_BIG)
    root = tree.getroot()
    cElems = 0
    written = 0
    for cElem in root.iter(f'{{{NS}}}c'):
        cElems += 1
        for didElem in cElem.findall(f'{{{NS}}}did'):
            unitidElems = didElem.findall(f'{{{NS}}}unitid')
            theseIds = [u.text for u in unitidElems if u.attrib['type'] == TP]
            if len(theseIds) == 1:
                theId = theseIds[0]
                if theId in ids:
                    found.add(theId)
                    fileOut = f'{EAD}/{theId.upper().replace(" ", "_")}.xml'
                    cTree = ET.ElementTree(cElem)
                    cTree.write(fileOut, encoding='unicode', xml_declaration=True)
                    written += 1
    missing = ids - found
    print(f'{written} EAD files written out of {cElems} c-elements')
    print(f'found {len(found)} EADs and missed {len(missing)} EADS')
    for theId in sorted(missing):
        print(f'{theId}')
    
def getEADs(fetch=0):
    ids = getIds()
    missing = set()
    found = set()
    print(f'Expect {len(ids)} EADs')
    for theId in sorted(ids):
        fileName = f'{theId.upper().replace(" ", "_")}.xml'
        filePath = f'{EAD}/{fileName}'
        if fetch == 1 or (fetch == -1 and not os.path.exists(filePath)):
            url = f'{SOURCE_ITEM}{fileName}'
            fetchXML(url, filePath)
        elif os.path.exists(filePath):
            print(f'\talready present {url}')
        if not os.path.exists(filePath):
            missing.add(theId)
        else:
            found.add(theId)
    print(f'found {len(found)} EADs and missed {len(missing)} EADS')
    for theId in sorted(missing):
        print(f'{theId}')

In [7]:
def moveId(fileIn, fileOut, fileNorm=None, debug=False):
    tree = ET.parse(fileIn)
    if fileNorm is not None:
        tree.write(fileNorm, encoding='unicode', xml_declaration=True)
    root = tree.getroot()
    didNoId = set()
    didMultId = set()
    didOneId = set()
    for cElem in root.iter('{{{}}}c'.format(NS)):
        for didElem in cElem.findall('{{{}}}did'.format(NS)):
            unitidElems = didElem.findall('{{{}}}unitid'.format(NS))
            ids = [u for u in unitidElems if u.attrib['type'] == TP]
            nIds = len(ids)
            if nIds == 0:
                didNoId.add(didElem)
            elif nIds == 1:
                didOneId.add(didElem)
            else:
                didMultId.add(didElem)
            if nIds == 0:
                unitidElem = ET.Element('unitid')
                unitidElem.text = cElem.attrib['id']
                unitidElem.set('label', EMI)
                unitidElem.set('encodinganalog', EA)
                didElem.insert(0, unitidElem)
    if debug:
        print('{} => {}'.format(fileIn, fileOut))
        print('\t{:<4} x <did> with single identifier'.format(len(didOneId)))
        print('\t{:<4} x <did> without identifier'.format(len(didNoId)))
        print(
            '\t{:<4} x <did> with multiple identifiers'.format(len(didMultId))
        )
    tree.write(fileOut, encoding='unicode', xml_declaration=True)

In [8]:
prepare()

In [9]:
#extractEADS()
getEADs(fetch=-1)

Expect 122 EADs
	get https://open-data.bundesarchiv.de/apex-ead/DE-1958_ALLPROZ_10.xml ...Failed
	get https://open-data.bundesarchiv.de/apex-ead/DE-1958_ALLPROZ_21.xml ...Failed
	already present https://open-data.bundesarchiv.de/apex-ead/DE-1958_ALLPROZ_21.xml
	get https://open-data.bundesarchiv.de/apex-ead/DE-1958_ALLPROZ_4.xml ...Failed
	get https://open-data.bundesarchiv.de/apex-ead/DE-1958_ALLPROZ_5.xml ...Failed
	already present https://open-data.bundesarchiv.de/apex-ead/DE-1958_ALLPROZ_5.xml
	already present https://open-data.bundesarchiv.de/apex-ead/DE-1958_ALLPROZ_5.xml
	already present https://open-data.bundesarchiv.de/apex-ead/DE-1958_ALLPROZ_5.xml
	already present https://open-data.bundesarchiv.de/apex-ead/DE-1958_ALLPROZ_5.xml
	already present https://open-data.bundesarchiv.de/apex-ead/DE-1958_ALLPROZ_5.xml
	already present https://open-data.bundesarchiv.de/apex-ead/DE-1958_ALLPROZ_5.xml
	already present https://open-data.bundesarchiv.de/apex-ead/DE-1958_ALLPROZ_5.xml
	alre

In [36]:
sorted(getIds())

['AllProz 10',
 'AllProz 21',
 'AllProz 3',
 'AllProz 4',
 'AllProz 5',
 'AllProz 6',
 'B 102',
 'B 103',
 'B 114',
 'B 115',
 'B 122',
 'B 125',
 'B 126',
 'B 129',
 'B 136',
 'B 150',
 'B 177',
 'B 178',
 'B 183',
 'B 184',
 'B 239',
 'B 240',
 'B 259',
 'B 260',
 'B 280',
 'B 323',
 'B 323-F',
 'B 326',
 'B 330',
 'B 331-BNS',
 'B 331-BW',
 'B 331-HB',
 'B 331-NRW',
 'B 331-S',
 'B 332',
 'B 373',
 'B 400',
 'B 401',
 'B 427',
 'BY 5',
 'BY 6',
 'BY 9',
 'DC 20',
 'DN 1',
 'DN 3',
 'DN 6',
 'DO 1',
 'DO 4',
 'DP 1',
 'DP 3',
 'DR 1',
 'DR 3',
 'DY 30',
 'DY 34',
 'DY 55',
 'DY 57',
 'DY 6',
 'N 1018',
 'N 1043',
 'N 1075',
 'N 1094',
 'N 1109',
 'N 1110',
 'N 1118',
 'N 1120',
 'N 1121',
 'N 1125',
 'N 1126',
 'N 1128',
 'N 1160',
 'N 1180',
 'N 1235',
 'N 1241',
 'N 1271',
 'N 1291',
 'N 1292',
 'N 1294',
 'N 1296',
 'N 1304',
 'N 1310',
 'N 1329',
 'N 1340',
 'N 1344',
 'N 1362',
 'N 1373',
 'N 1388',
 'N 1400',
 'N 1415',
 'N 1428',
 'N 1493',
 'N 1497',
 'N 1566',
 'N 1567',
 'N

In [45]:
test = {
    'N_1632'
}

for inPath in glob(f'{EAD}/*.xml'):
    inFile = os.path.basename(inPath)
    (name, ext) = os.path.splitext(inFile)
    if test and name not in test:
        continue
    normPath = f'{EAD_NORM}/{name}{ext}' if test else None
    outPath = f'{EAD_PLUS}/{name}{ext}'
    print(f'{name}')
    moveId(inPath, outPath, fileNorm=normPath)

N_1632
