In [2]:
import xml.etree.ElementTree as ET
import re

# Load Marc xml

In [3]:
# Parse the MARCXML export file from the working directory.
tree = ET.parse("exportExample.xml")
collection = tree.getroot() # bind the root element to the name "collection" (the export starts with a <collection> element)

In [4]:
def create_creator(author, parent=None):
    '''
    Append one DataCite <creator> element built from a MARC author datafield.

    Parameters
    ----------
    author : xml.etree.ElementTree.Element or None
        Either the main author, records.find(".//datafield[@tag='100']"),
        or one of the side authors from records.findall(".//datafield[@tag='700']").
        The name is read from subfield "a". If the datafield is None or has
        no non-empty subfield "a", nothing is appended.
    parent : xml.etree.ElementTree.Element, optional
        Element the new <creator> is appended to. When omitted, the
        module-level "creators" element is used, so existing calls with a
        single argument keep working unchanged.

    Returns
    -------
    None
        The function works by side effect on the parent element.

    The name is split at the first comma into familyName (before) and
    givenName (after), both stripped of surrounding whitespace. Names
    without a comma only get a creatorName, because the two parts cannot be
    told apart. nameIdentifier and affiliation are written as blank
    placeholders (" ").
    '''
    if author is None:
        # e.g. a record without a 100 field: find() returned None
        return

    name_field = author.find("subfield[@code='a']")
    if name_field is None or not (name_field.text or "").strip():
        return
    full_name = name_field.text.strip()

    if parent is None:
        parent = creators  # module-level <creators> element built by the conversion cell

    creator = ET.SubElement(parent, "creator")

    creatorName = ET.SubElement(creator, "creatorName", {"nameType": "Personal"})
    creatorName.text = full_name

    # "Family, Given" -> split once so additional commas stay in the given part
    family_part, comma, given_part = full_name.partition(",")
    family_part = family_part.strip()
    given_part = given_part.strip()
    if comma:
        if given_part:
            givenName = ET.SubElement(creator, "givenName")
            givenName.text = given_part
        if family_part:
            familyName = ET.SubElement(creator, "familyName")
            familyName.text = family_part

    # Placeholder: no MARC subfield is mapped to the identifier yet
    nameIdentifier = ET.SubElement(
        creator,
        "nameIdentifier",
        {
            "schemeURI": "https://orcid.org/",
            "nameIdentifierScheme": "ORCID",
        },
    )
    nameIdentifier.text = " "

    # Placeholder: no MARC subfield is mapped to the affiliation yet
    affiliation = ET.SubElement(creator, "affiliation")
    affiliation.text = " "

# MarcXml_2_DataCite

In [8]:
# Convert the first MARC record of the collection into a DataCite <resource>
# tree and write it to "output.xml".

def _subfield_text(datafield, code):
    '''
    Return the text of the first subfield with the given code, or None when
    the datafield is None, has no such subfield, or the subfield is empty.
    '''
    if datafield is None:
        return None
    subfield = datafield.find("subfield[@code='" + code + "']")
    if subfield is None:
        return None
    return subfield.text or None


records = collection[0] # first record in collection # later replace with: for record in collection

dcOutput = ET.Element("resource")
dcOutput.attrib = {
    # default namespace of the DataCite kernel-4 schema; xsi:schemaLocation
    # below refers to it, so the elements have to live in this namespace
    "xmlns":"http://datacite.org/schema/kernel-4",
    "xmlns:xsi":"http://www.w3.org/2001/XMLSchema-instance",
    "xsi:schemaLocation": "http://datacite.org/schema/kernel-4 https://schema.datacite.org/meta/kernel-4.4/metadata.xsd"
    }

# Identifier -------------------------------------------------------------
# one <identifier> per 024 field: subfield 2 gives the type, subfield a the value
identifierMRC = records.findall(".//datafield[@tag='024']")

for item in identifierMRC:
    identifier = ET.Element("identifier")
    identifier.attrib = {"identifierType":item.find("subfield[@code='2']").text}
    identifier.text = item.find("subfield[@code='a']").text
    dcOutput.append(identifier)

# Creators -------------------------------------------------------------
main_authorMRC = records.find(".//datafield[@tag='100']")
side_authorMRC = records.findall(".//datafield[@tag='700']")
    # subfield a = creatorName
    # subfield a - split "," split[0] = familyName split[1] = givenName
    # subfield??? = nameIdentifier -- e.g. ORCID
    # subfield??? = affiliation

# create_creator() appends to this module-level element, so it has to exist
# before the first call
creators = ET.Element("creators")
if main_authorMRC is not None:
    # records without a 100 field only contribute their 700 authors
    create_creator(main_authorMRC)
for author in side_authorMRC:
    create_creator(author)

dcOutput.append(creators)

# Titles -------------------------------------------------------------
titleMRC = records.find(".//datafield[@tag='245']")
titles = ET.Element("titles")
# ------------------------------ Main Title
title = ET.SubElement(titles, "title")
title.text = titleMRC.find("subfield[@code='a']").text
# ------------------------------ Subtitle
# subfield b is not present in every record; only emit a Subtitle when it is
subtitleText = _subfield_text(titleMRC, "b")
if subtitleText is not None:
    subtitle = ET.SubElement(titles, "title")
    subtitle.attrib = {
        "titleType":"Subtitle"
        }
    subtitle.text = subtitleText

dcOutput.append(titles)

# publisher -------------------------------------------------------------
publisherMRC = records.find(".//datafield[@tag='264']")

publisher = ET.Element("publisher")
publisher.text = publisherMRC.find("subfield[@code='b']").text

dcOutput.append(publisher)

# publication Year -------------------------------------------------------------
publicationYear = ET.Element("publicationYear")
# raw string: "\d" in a normal string literal is an invalid escape sequence
pubDate = re.search(r"\d{4}", publisherMRC.find("subfield[@code='c']").text).group() # search for 4 digits in the "c" Subfield
publicationYear.text = pubDate

dcOutput.append(publicationYear)

# Language -------------------------------------------------------------
languageMRC = records.find(".//datafield[@tag='041']")

# skip the element when the record carries no 041 $a
languageText = _subfield_text(languageMRC, "a")
if languageText is not None:
    language = ET.Element("language")
    language.text = languageText

    dcOutput.append(language)

# Resource Type -------------------------------------------------------------
resourceType = ET.Element("resourceType")
resourceType.attrib = {"resourceTypeGeneral":"Text"}
resourceType.text = " "

dcOutput.append(resourceType)

# Formats -------------------------------------------------------------
formats = ET.Element("formats")
formatDC = ET.SubElement(formats, "format")
formatDC.text = "PDF"

dcOutput.append(formats)

# Descriptions -------------------------------------------------------------
abstractMRC = records.find(".//datafield[@tag='520']")

descriptions = ET.Element("descriptions")

# ------------- Abstract
abstractText = _subfield_text(abstractMRC, "a")
if abstractText is not None:
    description = ET.SubElement(descriptions, "description")
    description.attrib = {"descriptionType":"Abstract"}
    description.text = abstractText
# ------------- Series Information

# attach the wrapper only when it has content; previously it was built but
# never added to the output at all
if len(descriptions):
    dcOutput.append(descriptions)

# -------------------------------------------------------------
# create Tree -------------------------------------------------------------
dcTree = ET.ElementTree(dcOutput)
# -------------------------------------------------------------
# write output--------------------------------------------------------------
# explicit UTF-8 plus XML declaration: non-ASCII characters are written as
# themselves instead of numeric character references
dcTree.write("output.xml", encoding="UTF-8", xml_declaration=True)

In [6]:
# Ad-hoc inspection: look up subfield "b" of the 264 datafield held in publisherMRC.
# NOTE(review): publisherMRC is assigned in the conversion cell, so this cell only works after that cell has run.
publisherMRC.find("subfield[@code='b']")

<Element 'subfield' at 0x0000024B58D28040>

In [7]:
# dcTree = ET.ElementTree(dcOutput)
# dcTree.write("dcTest.xml")

# ToDo

* `<resourceType resourceTypeGeneral="Software">XML</resourceType>`
    * resourceTypeGeneral soll "Text" sein
    * kein Text
* `<formats><format>application/xml</format></formats>`
    * befüllen mit "PDF" (wir haben nichts anderes)
    * alternativ auch aus `<datafield tag="347" ind1=" " ind2=" "><subfield code="b">PDF</subfield>`
* `<descriptions><description xml:lang="en-US" descriptionType="Abstract">XML example of all DataCite Metadata Schema v4.4 properties.</description></descriptions>`
    * NB! 520 kann mehrmals vorkommen.
        * descriptionType="Abstract"
            * befüllen mit `<datafield tag="520" ind1=" " ind2=" "><subfield code="a">`
            * Präfix-Text "eng: " entfernen (ev. auch "ger: ")
        * descriptionType="SeriesInformation"
            * `<datafield tag="490" ind1="0" ind2=" "><subfield code="a">RISC Report Series</subfield><subfield code="v">22-04</subfield>`
                * NB! Subfelder mit "," getrennt
            * Bei ZS: 773 0 9 $t $g NB! $t und $g mit "," getrennt
                * z.B. AC16411195
* `<alternateIdentifiers><alternateIdentifier alternateIdentifierType="URL"></alternateIdentifier></alternateIdentifiers>`
    * befüllen mit URN
    * alternateIdentifierType="URN"
    * NB! Identifiers nur mit DOI
* `<rightsList><rights xml:lang="en-US" schemeURI="https://spdx.org/licenses/" rightsIdentifierScheme="SPDX" rightsIdentifier="CC0 1.0" rightsURI="https://creativecommons.org/publicdomain/zero/1.0/"/></rightsList>`
    * rightsIdentifier="CC BY 4.0" standardmäßig einfügen
    * Rights URI: https://creativecommons.org/licenses/by/4.0/legalcode
    * Felder im `<datafield tag="540">`: `<subfield code="f">` und `<subfield code="u">` (hat leider eine andere URL)
* `<sizes><size>4 kB</size></sizes>`
    * befüllen mit `<datafield tag="300" ind1=" " ind2=" "><subfield code="a">1 Online-Ressource (22 Seiten)</subfield></datafield>`
    * NB! nur die Seitenzahl (22) + "pages" anhängen
* `<fundingReferences><fundingReference>`
      `<funderName>National Science Foundation</funderName>`
      `<funderIdentifier funderIdentifierType="Crossref Funder ID">https://doi.org/10.13039/100000001</funderIdentifier>`
      `<awardNumber>CBET-106</awardNumber>`
      `<awardTitle>Full DataCite XML Example</awardTitle>`
    `</fundingReference></fundingReferences>`
    * `<datafield tag="536" ind1=" " ind2=" "><subfield code="a">Fonds zur Förderung der Wissenschaftlichen Forschung</subfield><subfield code="f">P 35530</subfield>`
        * `<funderName>` --> `<subfield code="a">`
        * `<awardNumber>` --> `<subfield code="f">`
        * `<funderIdentifier>` --> DOI des Funders (ev. wenn FWF: xxx, else: yyy)
        
