In [9]:
import xml.etree.ElementTree as ET
import re

# Load MARC XML

In [10]:
# Parse the MARC-XML export from disk. The export's root element is the
# <collection> wrapper, so the root is bound to the name `collection`; the
# parsed tree itself stays available as `tree`.
tree = ET.parse(source="exportExample.xml")
collection = tree.getroot()

In [11]:
def create_creator(author, parent=None):
    '''
    Append one DataCite <creator> element built from a MARC author datafield.

    Parameters
    ----------
    author : xml.etree.ElementTree.Element or None
        Either the main author, records.find(".//datafield[@tag='100']"),
        or one of the side authors from records.findall(".//datafield[@tag='700']").
        The name is read from subfield "a".
    parent : xml.etree.ElementTree.Element, optional
        The <creators> container the new element is appended to. When omitted,
        the notebook-level variable "creators" is used, which must exist at
        call time.

    Returns
    -------
    xml.etree.ElementTree.Element or None
        The new <creator> element, or None (nothing appended) when "author"
        is None or has no non-empty subfield "a".
    '''
    if author is None:
        return None

    # findtext() returns None / "" instead of raising when the subfield is
    # absent or empty.
    full_name = (author.findtext("subfield[@code='a']") or "").strip()
    if not full_name:
        return None

    if parent is None:
        parent = creators  # fall back to the notebook-level container

    creator = ET.SubElement(parent, "creator")

    creatorName = ET.SubElement(creator, "creatorName")
    creatorName.attrib = {"nameType":"Personal"}
    creatorName.text = full_name

    # The name is expected as "Family, Given". Names without a comma cannot be
    # split, so only creatorName is written for them.
    name_parts = [part.strip() for part in full_name.split(",")]
    if len(name_parts) > 1:
        family_part, given_part = name_parts[0], name_parts[1]
        if given_part:
            givenName = ET.SubElement(creator, "givenName")
            givenName.text = given_part
        if family_part:
            familyName = ET.SubElement(creator, "familyName")
            familyName.text = family_part

    # nameIdentifier (schemeURI "https://orcid.org/", nameIdentifierScheme
    # "ORCID") and affiliation are not emitted: that information is not in
    # the MRC data.

    return creator

# MarcXml_2_DataCite

In [19]:
records = collection[0] # first record in collection # later replace with: for record in collection

# Root element of the DataCite output. The default "xmlns" declaration puts
# <resource> and all of its children into the DataCite kernel-4 namespace that
# xsi:schemaLocation points at; without it the elements are in no namespace
# and do not match the referenced schema.
dcOutput = ET.Element("resource")
dcOutput.attrib = {
    "xmlns":"http://datacite.org/schema/kernel-4",
    "xmlns:xsi":"http://www.w3.org/2001/XMLSchema-instance",
    "xsi:schemaLocation": "http://datacite.org/schema/kernel-4 https://schema.datacite.org/meta/kernel-4.4/metadata.xsd"
    }

# Identifier -------------------------------------------------------------
identifierMRC = records.findall(".//datafield[@tag='024']")

# All URNs share ONE <alternateIdentifiers> wrapper (a DataCite resource may
# contain the wrapper only once); it is attached after the loop and only if
# at least one URN was found.
altidentifiers = ET.Element("alternateIdentifiers")

for item in identifierMRC:
    # findtext() returns None / "" instead of raising when a subfield is
    # missing or empty, so incomplete 024 fields are skipped, not fatal.
    scheme = item.findtext("subfield[@code='2']")
    value = item.findtext("subfield[@code='a']")
    if not scheme or not value:
        continue

    if scheme == "doi":
        identifier = ET.Element("identifier")
        identifier.attrib = {"identifierType":"DOI"}
        identifier.text = value

        dcOutput.append(identifier)

    elif scheme == "urn":
        altidentifier = ET.SubElement(altidentifiers, "alternateIdentifier")
        altidentifier.attrib = {"alternateIdentifierType":scheme}
        altidentifier.text = value

if len(altidentifiers):
    dcOutput.append(altidentifiers)

# Creators -------------------------------------------------------------
main_authorMRC = records.find(".//datafield[@tag='100']")
side_authorMRC = records.findall(".//datafield[@tag='700']")

# create_creator() appends to this notebook-level container.
creators = ET.Element("creators")

# find() returns None when the record has no 100 field; only real datafields
# are handed to create_creator().
if main_authorMRC is not None:
    create_creator(main_authorMRC)
for author in side_authorMRC:
    create_creator(author)

dcOutput.append(creators)

# Titles -------------------------------------------------------------
titleMRC = records.find(".//datafield[@tag='245']")
titles = ET.Element("titles")

if titleMRC is not None:
    # findtext() returns None / "" for a missing or empty subfield, so a
    # record without subtitle ($b) no longer raises AttributeError.
    mainTitleText = titleMRC.findtext("subfield[@code='a']")
    subtitleText = titleMRC.findtext("subfield[@code='b']")

    # ------------------------------ Main Title
    if mainTitleText:
        title = ET.SubElement(titles, "title")
        title.text = mainTitleText

    # ------------------------------ Subtitle (only when present)
    if subtitleText:
        subtitle = ET.SubElement(titles, "title")
        subtitle.attrib = {
            "titleType":"Subtitle"
            }
        subtitle.text = subtitleText

dcOutput.append(titles)

# publisher -------------------------------------------------------------
publisherMRC = records.find(".//datafield[@tag='264']")

# Read both subfields defensively: the 264 field or either subfield may be
# missing, in which case the corresponding element is left out instead of
# aborting the conversion with an AttributeError.
publisherText = None
pubDateText = None
if publisherMRC is not None:
    publisherText = publisherMRC.findtext("subfield[@code='b']")
    pubDateText = publisherMRC.findtext("subfield[@code='c']")

if publisherText:
    publisher = ET.Element("publisher")
    publisher.text = publisherText

    dcOutput.append(publisher)

# publication Year -------------------------------------------------------------
# First run of 4 digits in the "c" subfield. The pattern is a raw string so
# "\d" is a regex token rather than an invalid string escape.
yearMatch = re.search(r"\d{4}", pubDateText or "")
if yearMatch:
    publicationYear = ET.Element("publicationYear")
    pubDate = yearMatch.group()
    publicationYear.text = pubDate

    dcOutput.append(publicationYear)

# Language -------------------------------------------------------------
languageMRC = records.find(".//datafield[@tag='041']")

# The 041 field (or its "a" subfield) may be absent; in that case no
# <language> element is written instead of raising AttributeError.
languageText = None
if languageMRC is not None:
    languageText = languageMRC.findtext("subfield[@code='a']")

if languageText:
    language = ET.Element("language")
    language.text = languageText

    dcOutput.append(language)

# Resource Type -------------------------------------------------------------
# Hard-coded general type "Text"; the element deliberately carries no text
# content (a free-text value is not set).
resourceType = ET.Element("resourceType", {"resourceTypeGeneral":"Text"})

dcOutput.append(resourceType)

# Formats -------------------------------------------------------------
# A single hard-coded <format> entry wrapped in the <formats> container.
formatDC = ET.Element("format")
formatDC.text = "PDF"

formats = ET.Element("formats")
formats.append(formatDC)

dcOutput.append(formats)

# Descriptions -------------------------------------------------------------
abstractMRC = records.findall(".//datafield[@tag='520']")
seriesInformationMRC = records.find(".//datafield[@tag='490']")

descriptions = ET.Element("descriptions")

# ------------- Abstract
for item in abstractMRC:
    oldText = item.findtext("subfield[@code='a']")
    if not oldText:
        continue  # 520 without usable text: nothing to describe

    description = ET.SubElement(descriptions, "description")
    description.attrib = {"descriptionType":"Abstract"}
    # Cut a leading "eng: " or "ger: " marker from the abstract text. re.sub
    # anchored at the start removes only that prefix (not later occurrences
    # inside the text) and leaves abstracts without a marker unchanged.
    abstractText = re.sub(r"^(?:eng|ger): ", "", oldText, count=1)
    description.text = abstractText

# ------------- Series Information
if seriesInformationMRC is not None:
    # Series title ($a) and volume ($v) joined with ", "; a missing part is
    # simply omitted.
    seriesParts = [
        seriesInformationMRC.findtext("subfield[@code='a']"),
        seriesInformationMRC.findtext("subfield[@code='v']"),
    ]
    seriesText = ", ".join(part for part in seriesParts if part)
    if seriesText:
        description = ET.SubElement(descriptions, "description")
        description.attrib = {"descriptionType":"SeriesInformation"}
        description.text = seriesText

# TODO: for "ZS" records (presumably journals - confirm): use 773 0 9 $t $g;
# note that $t and $g are to be separated by ","

# Only attach the container when it actually holds a description.
if len(descriptions):
    dcOutput.append(descriptions)

# Rights List -------------------------------------------------------------
# One hard-coded licence entry; the <rights> element has attributes only and
# no text content.
rightsList = ET.Element("rightsList")
rights = ET.SubElement(
    rightsList,
    "rights",
    {
        "rightsIdentifier":"CC BY 4.0",
        "rightsURI":"https://creativecommons.org/licenses/by/4.0/legalcode",
    },
)

dcOutput.append(rightsList)

# Size -------------------------------------------------------------
sizeMRC = records.find(".//datafield[@tag='300']")

# The 300 field or its "a" subfield may be missing; fall back to an empty
# string so the search below simply finds nothing.
extentText = None
if sizeMRC is not None:
    extentText = sizeMRC.findtext("subfield[@code='a']")

# Match the run of digits directly following an opening parenthesis. Raw
# string so "\(" and "\d" reach the regex engine unchanged.
pageMatch = re.search(r"(?<=\()\d+", extentText or "")
if pageMatch:
    sizes = ET.Element("sizes")
    size = ET.SubElement(sizes, "size")
    pageNr = pageMatch.group()
    size.text = pageNr + " pages"

    dcOutput.append(sizes)

# fundingReferences -------------------------------------------------------------
fundingMRC = records.findall(".//datafield[@tag='536']")

# MARC funder name (536 $a) -> (funderName to emit, Crossref Funder ID)
knownFunders = {
    "Fonds zur Förderung der Wissenschaftlichen Forschung": (
        "Austrian Science Fund",
        "https://doi.org/10.13039/501100002428",
    ),
    "Österreichische Forschungsförderungsgesellschaft": (
        "Österreichische Forschungsförderungsgesellschaft",
        "https://doi.org/10.13039/501100004955",
    ),
    "Europäische Kommission": (
        "European Commission",
        "https://doi.org/10.13039/501100000780",
    ),
}

fundingReferences = ET.Element("fundingReferences")

for item in fundingMRC:
    marcFunder = item.findtext("subfield[@code='a']")
    if not marcFunder:
        continue  # no funder name available: a fundingReference needs one

    # Funders missing from the table keep their MARC name and get no
    # identifier, instead of producing empty funderName/funderIdentifier tags.
    mappedName, mappedId = knownFunders.get(marcFunder, (marcFunder, None))

    fundingReference = ET.SubElement(fundingReferences, "fundingReference")
    funderName = ET.SubElement(fundingReference, "funderName")
    funderName.text = mappedName

    if mappedId:
        funderIdentifier = ET.SubElement(fundingReference, "funderIdentifier")
        funderIdentifier.attrib = {"funderIdentifierType":"Crossref Funder ID"}
        funderIdentifier.text = mappedId

    # The award number ($f) is optional; only written when present.
    awardText = item.findtext("subfield[@code='f']")
    if awardText:
        awardNumber = ET.SubElement(fundingReference, "awardNumber")
        awardNumber.text = awardText

# Only attach the container when at least one funding reference was built.
if len(fundingReferences):
    dcOutput.append(fundingReferences)

# -------------------------------------------------------------
# create Tree -------------------------------------------------------------
dcTree = ET.ElementTree(dcOutput)
# -------------------------------------------------------------
# write output--------------------------------------------------------------
# Write UTF-8 with an XML declaration. ElementTree.write() otherwise defaults
# to us-ascii, which turns every non-ASCII character (e.g. umlauts in names
# and titles) into a numeric character reference and omits the declaration.
dcTree.write("output.xml", encoding="utf-8", xml_declaration=True)

In [14]:
# dcTree = ET.ElementTree(dcOutput)
# dcTree.write("dcTest.xml")

# ToDo

* Docker