In [1]:
import xml.etree.ElementTree as ET
import re

# Load MARCXML

In [2]:
# Parse the MARCXML export from disk.
# NOTE: the "Main" cell further down re-parses "exportExample-ZS.xml" into the same
# two names, so after a full run these bindings are overwritten.
tree = ET.parse("exportExample.xml")
collection = tree.getroot() # keep the root element under the name "collection" (per the original note, the export starts with a collection element)

In [3]:
def create_creator(author):
    '''
    Append one <creator> for a MARC author field to the module-level "creators" element.

    Input is either the main author: records.find(".//datafield[@tag='100']")
    - or one of the side authors: an item of records.findall(".//datafield[@tag='700']")

    The name is read from subfield "a" and written unchanged to <creatorName>
    (nameType="Personal"). When the name contains a comma, the part before the first
    comma becomes <familyName> and the part after it becomes <givenName>, both with
    surrounding whitespace removed. A name without a comma only yields <creatorName>.

    Nothing is appended when author is None or has no subfield "a" text.
    Returns None.

    NOTE: a later cell defines create_creator(output) with a different signature;
    whichever definition was executed last is the one a call resolves to.
    '''
    if author is None:
        return

    full_name = author.findtext("subfield[@code='a']")
    if not full_name:
        return

    creator = ET.SubElement(creators, "creator")

    creatorName = ET.SubElement(creator, "creatorName", {"nameType": "Personal"})
    creatorName.text = full_name

    # "Family, Given" -> split once and reuse the parts; a name without a comma
    # cannot be split, so givenName/familyName are left out instead of raising.
    name_parts = full_name.split(",")
    if len(name_parts) > 1:
        givenName = ET.SubElement(creator, "givenName")
        givenName.text = name_parts[1].strip()

        familyName = ET.SubElement(creator, "familyName")
        familyName.text = name_parts[0].strip()

    # nameIdentifier (e.g. ORCID) and affiliation are not in the MARC data,
    # so neither element is written.

# Functions

## Identifier

In [4]:
def create_identifier(output):
    '''
    Convert the MARC 024 fields of the module-level "records" element into identifiers.

    For every 024 field, subfield "2" names the scheme and subfield "a" holds the value:
    - scheme "doi": appends <identifier identifierType="DOI"> to output
    - scheme "urn": adds an <alternateIdentifier alternateIdentifierType="urn"> to a single
      <alternateIdentifiers> wrapper, which is appended to output when the first URN is seen

    Fields with another scheme, without subfield "2", or without a value in
    subfield "a" are skipped. Returns None.
    '''
    alt_identifiers = None  # created lazily so all URNs share one wrapper

    for field in records.findall(".//datafield[@tag='024']"):
        # findtext() yields None for a missing subfield instead of raising
        scheme = field.findtext("subfield[@code='2']")
        value = field.findtext("subfield[@code='a']")
        if not value:
            continue

        if scheme == "doi":
            identifier = ET.SubElement(output, "identifier", {"identifierType": "DOI"})
            identifier.text = value

        elif scheme == "urn":
            if alt_identifiers is None:
                alt_identifiers = ET.SubElement(output, "alternateIdentifiers")
            alt_identifier = ET.SubElement(
                alt_identifiers, "alternateIdentifier", {"alternateIdentifierType": scheme}
            )
            alt_identifier.text = value

## Author

In [5]:
def helper_create_creator(author, mainElement):
    '''
    Append one <creator> built from a MARC author field to mainElement.

    author:      a MARC datafield element (100 or 700) whose subfield "a" holds the name
    mainElement: the element that receives the new <creator> child

    The subfield "a" text is written unchanged to <creatorName> (nameType="Personal").
    When it contains a comma, the part before the first comma becomes <familyName> and
    the part after it becomes <givenName>, both stripped of surrounding whitespace.
    A name without a comma only yields <creatorName>.

    Nothing is appended when author is None or has no subfield "a" text.
    Returns None.
    '''
    if author is None:
        return

    full_name = author.findtext("subfield[@code='a']")
    if not full_name:
        return

    creator = ET.SubElement(mainElement, "creator")

    creatorName = ET.SubElement(creator, "creatorName", {"nameType": "Personal"})
    creatorName.text = full_name

    # Split once; names without a comma cannot be divided, so the two
    # name-part elements are omitted rather than raising IndexError.
    name_parts = full_name.split(",")
    if len(name_parts) > 1:
        givenName = ET.SubElement(creator, "givenName")
        givenName.text = name_parts[1].strip()

        familyName = ET.SubElement(creator, "familyName")
        familyName.text = name_parts[0].strip()


def create_creator(output):
    '''
    Build <creators> from the MARC author fields of the module-level "records" element.

    The main author (field 100) comes first, followed by every side author
    (field 700) in record order; each one is handed to helper_create_creator().
    A missing 100 field is skipped, so records that only carry 700 fields still work.

    The <creators> element is appended to output only if at least one <creator>
    was created. Returns None.
    '''
    creators = ET.Element("creators")

    # create main Author
    main_author = records.find(".//datafield[@tag='100']")
    if main_author is not None:
        helper_create_creator(main_author, creators)

    # create side-authors
    # findall() always returns a list (empty when nothing matches), never None,
    # so it can be iterated directly without a presence check.
    for author in records.findall(".//datafield[@tag='700']"):
        helper_create_creator(author, creators)

    if len(creators):
        output.append(creators)

## Title

In [6]:
def create_title(output):
    '''
    Build <titles> from MARC field 245 of the module-level "records" element.

    Subfield "a" becomes a plain <title>; subfield "b", when it has text, becomes a
    second <title titleType="Subtitle">. <titles> is appended to output only when at
    least one title was found; a record without field 245 leaves output untouched.
    Returns None.
    '''
    title_field = records.find(".//datafield[@tag='245']")
    if title_field is None:
        return

    titles = ET.Element("titles")

    # ------------------------------ Main Title
    main_text = title_field.findtext("subfield[@code='a']")
    if main_text:
        ET.SubElement(titles, "title").text = main_text

    # ------------------------------ Subtitle
    subtitle_text = title_field.findtext("subfield[@code='b']")
    if subtitle_text:
        subtitle = ET.SubElement(titles, "title", {"titleType": "Subtitle"})
        subtitle.text = subtitle_text

    if len(titles):
        output.append(titles)

## Publisher/Publication Year

In [10]:
def create_publisher(output):
    '''
    Append <publisher> and <publicationYear> taken from MARC field 264 of "records".

    - <publisher> receives the text of subfield "b"
    - <publicationYear> receives the first run of four digits found in subfield "c"

    Each element is only written when its source value is available, so a missing
    field, a missing subfield, or a date without a four-digit year is skipped
    instead of raising. Returns None.
    '''
    publisher_field = records.find(".//datafield[@tag='264']")
    if publisher_field is None:
        return

    # publisher -------------------------------------------------------------
    publisher_name = publisher_field.findtext("subfield[@code='b']")
    if publisher_name:
        ET.SubElement(output, "publisher").text = publisher_name

    # publication Year -------------------------------------------------------------
    # raw string: "\d" inside a normal string literal is an invalid escape sequence
    date_text = publisher_field.findtext("subfield[@code='c']") or ""
    year_match = re.search(r"\d{4}", date_text)
    if year_match is not None:
        ET.SubElement(output, "publicationYear").text = year_match.group()

## Language

In [13]:
def create_language(output):
    '''Append a <language> element holding the text of subfield "a" of MARC field 041 (read from "records").'''
    field_041 = records.find(".//datafield[@tag='041']")
    # read the value first so output stays untouched if the field or subfield is missing
    language_text = field_041.find("subfield[@code='a']").text
    ET.SubElement(output, "language").text = language_text

## Fields with fixed Value

### Resource Type

In [15]:
def create_resourceType(output):
    '''Append the fixed <resourceType resourceTypeGeneral="Text"> element (without text content) to output.'''
    ET.SubElement(output, "resourceType", {"resourceTypeGeneral": "Text"})


### Formats

In [18]:
def create_formats(output):
    '''Append <formats> with a single <format> child whose text is the fixed value "PDF".'''
    format_list = ET.SubElement(output, "formats")
    ET.SubElement(format_list, "format").text = "PDF"

### Rights

In [26]:
def create_rights(output):
    '''Append <rightsList> with one attribute-only <rights> element carrying the fixed CC BY 4.0 licence.'''
    licence = {
        "rightsIdentifier": "CC BY 4.0",
        "rightsURI": "https://creativecommons.org/licenses/by/4.0/legalcode",
    }
    rights_wrapper = ET.SubElement(output, "rightsList")
    ET.SubElement(rights_wrapper, "rights", licence)


## Descriptions

In [22]:
def create_descriptions(output):
    '''
    Build <descriptions> from the abstract and series fields of the module-level "records".

    Abstracts: every MARC 520 field with text in subfield "a" becomes a
    <description descriptionType="Abstract">. A leading "eng: " or "ger: " marker is
    removed; the same characters later in the text are kept, and abstracts without a
    marker are copied unchanged.

    Series information: taken from field 490 (subfields "a" and "v") or, when 490 is
    absent, from field 773 (subfields "t" and "g"). The available parts are joined
    with ", " into one <description descriptionType="SeriesInformation">; the element
    is left out when neither field yields any text.

    <descriptions> is appended to output only if it has at least one child.
    Returns None.
    '''
    descriptions = ET.Element("descriptions")

    # ------------- Abstract
    for field in records.findall(".//datafield[@tag='520']"):
        raw_text = field.findtext("subfield[@code='a']")
        if not raw_text:
            continue
        description = ET.SubElement(descriptions, "description", {"descriptionType": "Abstract"})
        # cut "eng: " or "ger: " from the start of the abstract text only
        description.text = re.sub(r"^(?:eng|ger): ", "", raw_text)

    # ------------- Series Information
    series_field = records.find(".//datafield[@tag='490']")
    series_codes = ("a", "v")
    if series_field is None:
        # field 490 is not present - fall back to 773
        series_field = records.find(".//datafield[@tag='773']")
        series_codes = ("t", "g")

    if series_field is not None:
        parts = [
            series_field.findtext("subfield[@code='{}']".format(code))
            for code in series_codes
        ]
        # a missing subfield is skipped instead of raising
        series_text = ", ".join(part for part in parts if part)
        if series_text:
            description = ET.SubElement(
                descriptions, "description", {"descriptionType": "SeriesInformation"}
            )
            description.text = series_text

    if len(descriptions):
        output.append(descriptions)


## Size

In [42]:
def create_size(output):
    '''
    Append <sizes><size> derived from subfield "a" of MARC field 300 of "records".

    When the text contains digits directly after an opening parenthesis, those digits
    are written as "<digits> pages"; otherwise the subfield text is copied unchanged.
    Nothing is appended when field 300 or its subfield "a" text is missing.
    Returns None.
    '''
    size_field = records.find(".//datafield[@tag='300']")
    if size_field is None:
        return

    extent = size_field.findtext("subfield[@code='a']")
    if not extent:
        return

    sizes = ET.Element("sizes")
    size = ET.SubElement(sizes, "size")

    # match 1-n digits right after "(" ; raw string avoids the invalid "\(" / "\d" escapes
    page_match = re.search(r"(?<=\()\d+", extent)
    if page_match is None:
        size.text = extent
    else:
        size.text = page_match.group() + " pages"

    output.append(sizes)

## Funding Reference

In [45]:
def create_fundingReference(output):
    '''
    Build <fundingReferences> from the MARC 536 fields of the module-level "records".

    For each 536 field, subfield "a" names the funder and subfield "f" holds the award
    number. Funder names listed in the table below are replaced by the mapped name and
    get a <funderIdentifier funderIdentifierType="Crossref Funder ID">. Any other funder
    keeps its name from the record and gets no identifier.

    Fields without a funder name are skipped, and <awardNumber> is only written when
    subfield "f" has text. <fundingReferences> is appended to output only when at
    least one <fundingReference> was created. Returns None.
    '''
    # name as it appears in the record -> (funderName to write, Crossref Funder ID)
    known_funders = {
        "Fonds zur Förderung der Wissenschaftlichen Forschung": (
            "Austrian Science Fund",
            "https://doi.org/10.13039/501100002428",
        ),
        "Österreichische Forschungsförderungsgesellschaft": (
            "Österreichische Forschungsförderungsgesellschaft",
            "https://doi.org/10.13039/501100004955",
        ),
        "Europäische Kommission": (
            "European Commission",
            "https://doi.org/10.13039/501100000780",
        ),
    }

    # fundingReferences -------------------------------------------------------------
    fundingReferences = ET.Element("fundingReferences")

    for field in records.findall(".//datafield[@tag='536']"):
        marc_name = field.findtext("subfield[@code='a']")
        award = field.findtext("subfield[@code='f']")

        # unknown funders fall back to the name from the record, without an identifier
        funder_name, funder_id = known_funders.get(marc_name, (marc_name, None))
        if not funder_name:
            continue

        fundingReference = ET.SubElement(fundingReferences, "fundingReference")
        ET.SubElement(fundingReference, "funderName").text = funder_name

        if funder_id is not None:
            funderIdentifier = ET.SubElement(
                fundingReference, "funderIdentifier", {"funderIdentifierType": "Crossref Funder ID"}
            )
            funderIdentifier.text = funder_id

        if award:
            ET.SubElement(fundingReference, "awardNumber").text = award

    if len(fundingReferences):
        output.append(fundingReferences)


# Main

In [47]:
def create_output():
    '''
    Create and return the empty DataCite <resource> root element.

    The root declares the DataCite kernel-4 namespace as default namespace, the
    XMLSchema-instance prefix "xsi", and the xsi:schemaLocation that pairs the
    kernel-4 namespace with the 4.4 schema file. The default namespace declaration
    is needed so the elements are in the namespace named by schemaLocation.

    The declarations are stored as plain attributes, so child elements are still
    created and looked up by their unqualified tag names.
    '''
    output = ET.Element("resource", {
        "xmlns": "http://datacite.org/schema/kernel-4",
        "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
        "xsi:schemaLocation": "http://datacite.org/schema/kernel-4 https://schema.datacite.org/meta/kernel-4.4/metadata.xsd",
    })
    return output


# Parse the MARCXML export and select the record to convert.
tree = ET.parse("exportExample-ZS.xml")
collection = tree.getroot() # keep the root under the name "collection" (export starts with collection)
records = collection[0] # only the first record of the collection is converted

# Kept from the original planning note; the dispatch table below supersedes it.
tagsList = ["024", "100", "245"]

# (required MARC tag, builder) pairs in the order the elements are written.
# A tag of None marks a fixed-value element that is always written.
buildSteps = [
    ("024", create_identifier),
    ("100", create_creator),
    ("245", create_title),
    ("264", create_publisher),
    ("041", create_language),
    (None, create_formats),
    (None, create_resourceType),
    ("520", create_descriptions),
    (None, create_rights),
    ("300", create_size),
    ("536", create_fundingReference),
]

output = create_output()
for tag, build in buildSteps:
    # find() returns None for a missing field. findall() returns a list and is
    # never None, so it cannot serve as a presence test.
    if tag is None or records.find(".//datafield[@tag='{}']".format(tag)) is not None:
        build(output)

tree = ET.ElementTree(output)
# -------------------------------------------------------------
# write output--------------------------------------------------------------
# UTF-8 with an XML declaration, so non-ASCII text is stored as-is instead of
# as numeric character references (the default encoding is us-ascii).
tree.write("{}.xml".format("output-ZS"), encoding="utf-8", xml_declaration=True)


# MarcXml_2_DataCite

In [None]:
records = collection[0] # first record in collection # later replace with: for record in collection

# This cell used to repeat the body of every create_* function inline. It also called
# create_creator(author), but create_creator is redefined further up as
# create_creator(output); on a top-to-bottom run the author field was therefore used
# as the output element and <creators> stayed empty. The cell now delegates to the
# builder functions, which read the module-level "records" set above.
dcOutput = create_output()

# (MARC tags, builder) pairs in the order the elements are written. A builder runs
# when at least one of its tags is present in the record; an empty tuple marks a
# fixed-value element that is always written.
dcSteps = [
    (("024",), create_identifier),
    (("100", "700"), create_creator),
    (("245",), create_title),
    (("264",), create_publisher),
    (("041",), create_language),
    ((), create_resourceType),
    ((), create_formats),
    (("520", "490", "773"), create_descriptions),
    ((), create_rights),
    (("300",), create_size),
    (("536",), create_fundingReference),
]

for tags, build in dcSteps:
    # find() returns None when the record has no field with that tag
    if not tags or any(
        records.find(".//datafield[@tag='{}']".format(tag)) is not None for tag in tags
    ):
        build(dcOutput)

# -------------------------------------------------------------
# create Tree -------------------------------------------------------------
dcTree = ET.ElementTree(dcOutput)
# -------------------------------------------------------------
# write output--------------------------------------------------------------
# UTF-8 with an XML declaration, so non-ASCII text is stored as-is instead of
# as numeric character references (the default encoding is us-ascii).
dcTree.write("output.xml", encoding="utf-8", xml_declaration=True)

In [None]:
# dcTree = ET.ElementTree(dcOutput)
# dcTree.write("dcTest.xml")

# ToDo

* Docker