In [58]:
import xml.etree.ElementTree as ET
import re

In [96]:
# Parse a sample export and bind its first child element to the module-level
# name `record`, which the create_* functions in later cells read directly.
tree = ET.parse("inputFile.xml")
collection = tree.getroot() # the root element is referred to as `collection` (the export starts with collection)
record= collection[0]

In [98]:
# Sample message passed to the write_log demo call in this cell.
text = "error Message"
def write_log(record, text):
    """Append one line of the form ``<AC number> - <text>`` to log.txt.

    Parameters
    ----------
    record : xml.etree.ElementTree.Element
        Record element; the text of its controlfield 009 is used as identifier.
    text : str
        Message to write.

    A record without a non-empty controlfield 009 is logged under the
    placeholder "unknown" instead of raising AttributeError, so that reporting
    a problem cannot itself fail because of the problematic record.
    """
    acField = record.find(".//controlfield[@tag='009']")
    acNr = acField.text if acField is not None and acField.text else "unknown"
    # Explicit encoding: messages may contain non-ASCII characters and the
    # platform default encoding is not UTF-8 everywhere.
    with open("log.txt", "a", encoding="utf-8") as logFile:
        logFile.write("{} - {}\n".format(acNr, text))
# Demo call: appends one line for the sample record to log.txt.
write_log(record, text)

# Functions

## Identifier

In [59]:
def create_identifier(output):
    """Map the 024 fields of the module-level `record` to DataCite identifiers.

    * subfield 2 == "doi": an ``<identifier identifierType="DOI">`` element
      carrying subfield a is appended to `output`.
    * subfield 2 == "urn": an ``<alternateIdentifier>`` is added to ONE shared
      ``<alternateIdentifiers>`` container (created on the first URN), so that
      several URNs do not produce several containers.

    024 fields lacking subfield 2 or subfield a are skipped instead of raising
    AttributeError.

    Parameters
    ----------
    output : xml.etree.ElementTree.Element
        Element that receives the generated children.
    """
    altidentifiers = None  # created lazily, on the first URN

    for item in record.findall(".//datafield[@tag='024']"):
        source = item.find("subfield[@code='2']")
        value = item.find("subfield[@code='a']")
        if source is None or value is None:
            continue

        if source.text == "doi":
            identifier = ET.SubElement(output, "identifier", {"identifierType": "DOI"})
            identifier.text = value.text

        elif source.text == "urn":
            if altidentifiers is None:
                altidentifiers = ET.SubElement(output, "alternateIdentifiers")
            altidentifier = ET.SubElement(
                altidentifiers,
                "alternateIdentifier",
                {"alternateIdentifierType": source.text},
            )
            altidentifier.text = value.text

## Author

In [60]:
def helper_create_creator(author, mainElement):
    """Append a ``<creator>`` built from one author field to `mainElement`.

    Parameters
    ----------
    author : xml.etree.ElementTree.Element or None
        Author datafield; the name is read from its subfield a.
    mainElement : xml.etree.ElementTree.Element
        Container that receives the new ``<creator>``.

    The full subfield text becomes ``<creatorName nameType="Personal">``.
    When the name contains a comma ("Family, Given"), the parts before and
    after the first comma are additionally written, stripped of surrounding
    whitespace, as ``<familyName>`` and ``<givenName>``. Names without a comma
    get only ``<creatorName>`` (previously an IndexError). Nothing is appended
    when `author` is None or has no non-empty subfield a.
    """
    nameField = author.find("subfield[@code='a']") if author is not None else None
    if nameField is None or not nameField.text:
        return  # no name available to build a creator from

    fullName = nameField.text
    creator = ET.SubElement(mainElement, "creator")

    creatorName = ET.SubElement(creator, "creatorName", {"nameType": "Personal"})
    creatorName.text = fullName

    nameParts = fullName.split(",")
    if len(nameParts) > 1:
        givenName = ET.SubElement(creator, "givenName")
        givenName.text = nameParts[1].strip()  # drop the blank following the comma

        familyName = ET.SubElement(creator, "familyName")
        familyName.text = nameParts[0].strip()


def create_creator(output):
    """Append a ``<creators>`` element built from fields 100 and 700.

    The fields are read from the module-level `record`. The main author (first
    100 field) comes first, followed by every 700 field in document order;
    each one is converted by ``helper_create_creator``.

    Parameters
    ----------
    output : xml.etree.ElementTree.Element
        Element that receives the ``<creators>`` container. Nothing is
        appended when no creator could be built.
    """
    creators = ET.Element("creators")

    # main author: guard against records that only carry 700 fields
    main_authorMRC = record.find(".//datafield[@tag='100']")
    if main_authorMRC is not None:
        helper_create_creator(main_authorMRC, creators)

    # side-authors: findall returns a (possibly empty) list, never None,
    # so no presence check is needed
    for author in record.findall(".//datafield[@tag='700']"):
        helper_create_creator(author, creators)

    if len(creators):
        output.append(creators)

## Title

In [61]:
def create_title(output):
    """Append a ``<titles>`` element derived from field 245 of the module-level `record`.

    Subfield a becomes the main ``<title>`` after every "<<" / ">>" marker
    found in it has been removed. Subfield b, when present, becomes a second
    ``<title titleType="Subtitle">``.
    """
    field245 = record.find(".//datafield[@tag='245']")
    titles = ET.Element("titles")

    # main title: remove each marker that the pattern finds in subfield a
    mainText = field245.find("subfield[@code='a']").text
    for marker in re.findall("<<|>>", mainText):
        mainText = mainText.replace(marker, "")
    mainTitle = ET.SubElement(titles, "title")
    mainTitle.text = mainText

    # optional subtitle
    subfieldB = field245.find("subfield[@code='b']")
    if subfieldB is not None:
        subtitle = ET.SubElement(titles, "title", {"titleType": "Subtitle"})
        subtitle.text = subfieldB.text

    output.append(titles)

## Publisher/Publication Year

In [62]:
def create_publisher(output):
    """Append ``<publisher>`` and ``<publicationYear>`` built from field 264.

    The first 264 field of the module-level `record` is used:

    * subfield b -> ``<publisher>``
    * first run of four digits in subfield c -> ``<publicationYear>``

    Each element is only emitted when its source data exists; a missing
    subfield or a date without a four-digit year (e.g. "o.J.") no longer
    raises AttributeError.

    Parameters
    ----------
    output : xml.etree.ElementTree.Element
        Element that receives the generated children.
    """
    publisherMRC = record.find(".//datafield[@tag='264']")
    if publisherMRC is None:
        return

    # publisher -------------------------------------------------------------
    nameField = publisherMRC.find("subfield[@code='b']")
    if nameField is not None and nameField.text:
        publisher = ET.SubElement(output, "publisher")
        publisher.text = nameField.text

    # publication Year ------------------------------------------------------
    dateField = publisherMRC.find("subfield[@code='c']")
    yearMatch = None
    if dateField is not None and dateField.text:
        # raw string: "\d" in a normal string literal is an invalid escape sequence
        yearMatch = re.search(r"\d{4}", dateField.text)
    if yearMatch is not None:
        publicationYear = ET.SubElement(output, "publicationYear")
        publicationYear.text = yearMatch.group()

## Language

In [63]:
def create_language(output):
    """Append a ``<language>`` element to `output`.

    Its text is subfield a of the first 041 field of the module-level `record`.
    """
    codeField = record.find(".//datafield[@tag='041']").find("subfield[@code='a']")
    languageText = codeField.text

    language = ET.SubElement(output, "language")
    language.text = languageText

## Fields with fixed Value

### Resource Type

In [64]:
def create_resourceType(output):
    """Append the fixed ``<resourceType resourceTypeGeneral="Text">`` element to `output`.

    No text content is set on the element.
    """
    ET.SubElement(output, "resourceType", {"resourceTypeGeneral": "Text"})

### Formats

In [65]:
def create_formats(output):
    """Append the fixed ``<formats><format>PDF</format></formats>`` structure to `output`."""
    container = ET.SubElement(output, "formats")
    entry = ET.SubElement(container, "format")
    entry.text = "PDF"

### Rights

In [66]:
def create_rights(output):
    """Append the fixed ``<rightsList>`` with one CC BY 4.0 ``<rights>`` entry to `output`.

    The ``<rights>`` element carries only attributes; no text content is set.
    """
    licenceAttributes = {
        "rightsIdentifier": "CC BY 4.0",
        "rightsURI": "https://creativecommons.org/licenses/by/4.0/legalcode",
    }
    container = ET.SubElement(output, "rightsList")
    ET.SubElement(container, "rights", licenceAttributes)

## Descriptions

In [67]:
def create_descriptions(output):
    """Append a ``<descriptions>`` element with abstracts and series information.

    All fields are read from the module-level `record`.

    * Every 520 field with a non-empty subfield a yields a
      ``<description descriptionType="Abstract">``. A leading "eng: " or
      "ger: " marker is removed; abstracts without such a marker are kept
      unchanged (previously an AttributeError), and later occurrences of the
      marker inside the text are no longer removed.
    * Series information is taken from field 490 (subfields a and v) or, when
      490 is absent, from field 773 (subfields t and g). The subfields that
      exist are joined with ", "; when none exists, no SeriesInformation
      description is written (previously a crash).

    Parameters
    ----------
    output : xml.etree.ElementTree.Element
        Element that receives the container. Nothing is appended when no
        description could be built.
    """
    descriptions = ET.Element("descriptions")

    # ------------- Abstract
    for item in record.findall(".//datafield[@tag='520']"):
        abstractField = item.find("subfield[@code='a']")
        if abstractField is None or not abstractField.text:
            continue
        description = ET.SubElement(descriptions, "description", {"descriptionType": "Abstract"})
        # "^" anchors the pattern, so only a language marker at the very start is cut
        description.text = re.sub(r"^(?:eng|ger): ", "", abstractField.text)

    # ------------- Series Information
    seriesInformationMRC = record.find(".//datafield[@tag='490']")
    subfieldCodes = ("a", "v")
    if seriesInformationMRC is None:
        # field 490 is not present - fall back to 773
        seriesInformationMRC = record.find(".//datafield[@tag='773']")
        subfieldCodes = ("t", "g")

    if seriesInformationMRC is not None:
        parts = []
        for code in subfieldCodes:
            subfield = seriesInformationMRC.find("subfield[@code='{}']".format(code))
            if subfield is not None and subfield.text:
                parts.append(subfield.text)
        if parts:
            description = ET.SubElement(
                descriptions, "description", {"descriptionType": "SeriesInformation"}
            )
            description.text = ", ".join(parts)

    if len(descriptions):
        output.append(descriptions)

## Size

In [68]:
def create_size(output):
    """Append ``<sizes><size>…</size></sizes>`` built from field 300, subfield a.

    The field is read from the module-level `record`. When the subfield text
    contains digits directly after an opening parenthesis, e.g. "(123 Seiten)",
    the size is written as "<digits> pages"; otherwise the subfield text is
    copied unchanged. Nothing is appended when field 300 or a non-empty
    subfield a is missing (previously an AttributeError/TypeError).

    Parameters
    ----------
    output : xml.etree.ElementTree.Element
        Element that receives the ``<sizes>`` container.
    """
    sizeMRC = record.find(".//datafield[@tag='300']")
    extentField = sizeMRC.find("subfield[@code='a']") if sizeMRC is not None else None
    if extentField is None or not extentField.text:
        return

    sizes = ET.Element("sizes")
    size = ET.SubElement(sizes, "size")

    # Raw string avoids the invalid "\(" / "\d" escapes; the pattern is
    # evaluated once and matches 1-n digits that directly follow "(".
    pageMatch = re.search(r"(?<=\()\d+", extentField.text)
    if pageMatch is None:
        size.text = extentField.text
    else:
        size.text = pageMatch.group() + " pages"

    output.append(sizes)

## Funding Reference

In [69]:
def create_fundingReference(output):
    """Append a ``<fundingReferences>`` element built from the 536 fields.

    The fields are read from the module-level `record`. For each 536 field
    with a non-empty subfield a one ``<fundingReference>`` is written with, in
    this order:

    * ``<funderName>``: the mapped name for a known funder, otherwise the
      subfield a text itself (previously the element was left empty).
    * ``<funderIdentifier funderIdentifierType="Crossref Funder ID">``: only
      for known funders (previously an empty element for unknown ones).
    * ``<awardNumber>``: subfield f, only when present (previously a missing
      subfield f raised AttributeError).

    536 fields without a funder name are skipped.

    Parameters
    ----------
    output : xml.etree.ElementTree.Element
        Element that receives the container. Nothing is appended when no
        funding reference could be built.
    """
    # subfield a text -> (funderName, Crossref Funder ID)
    knownFunders = {
        "Fonds zur Förderung der Wissenschaftlichen Forschung": (
            "Austrian Science Fund",
            "https://doi.org/10.13039/501100002428",
        ),
        "Österreichische Forschungsförderungsgesellschaft": (
            "Österreichische Forschungsförderungsgesellschaft",
            "https://doi.org/10.13039/501100004955",
        ),
        "Europäische Kommission": (
            "European Commission",
            "https://doi.org/10.13039/501100000780",
        ),
    }

    fundingReferences = ET.Element("fundingReferences")

    for item in record.findall(".//datafield[@tag='536']"):
        nameField = item.find("subfield[@code='a']")
        if nameField is None or not nameField.text:
            continue
        name, funderId = knownFunders.get(nameField.text, (nameField.text, None))

        fundingReference = ET.SubElement(fundingReferences, "fundingReference")
        funderName = ET.SubElement(fundingReference, "funderName")
        funderName.text = name

        if funderId is not None:
            funderIdentifier = ET.SubElement(
                fundingReference,
                "funderIdentifier",
                {"funderIdentifierType": "Crossref Funder ID"},
            )
            funderIdentifier.text = funderId

        awardField = item.find("subfield[@code='f']")
        if awardField is not None and awardField.text:
            awardNumber = ET.SubElement(fundingReference, "awardNumber")
            awardNumber.text = awardField.text

    if len(fundingReferences):
        output.append(fundingReferences)

# Create DCxml

In [70]:
def create_DCxml(record):
    """Build the DataCite ``<resource>`` element for one record.

    Parameters
    ----------
    record : xml.etree.ElementTree.Element
        Record element; used here to test which datafields are present.

    Returns
    -------
    xml.etree.ElementTree.Element
        The ``<resource>`` root with all generated children.

    NOTE: the create_* builders receive only `output` and look their fields up
    on the module-level name `record`, so the element passed in must be the
    same object that name is bound to (the Main loop iterates with a loop
    variable of that name).
    """
    # create output root
    output = ET.Element("resource")
    output.attrib = {
        # Default namespace: xsi:schemaLocation below pairs this namespace with
        # the schema file, so the elements have to be declared in it as well.
        "xmlns": "http://datacite.org/schema/kernel-4",
        "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
        "xsi:schemaLocation": "http://datacite.org/schema/kernel-4 https://schema.datacite.org/meta/kernel-4.4/metadata.xsd",
    }

    # Dictionary to map Alma datafield tags to their builder function
    tagsDict = {
        "024": create_identifier,
        "041": create_language,
        "100": create_creator,
        "245": create_title,
        "264": create_publisher,
        "300": create_size,
        "520": create_descriptions,
        "536": create_fundingReference,
    }
    # if the datafield is in the Alma xml - run its builder (dict order = output order)
    for key, builder in tagsDict.items():
        if record.find(".//datafield[@tag='{}']".format(key)) is not None:
            builder(output)

    # Functions with set values (fields that are created with a hard-coded value)
    create_formats(output)
    create_resourceType(output)
    create_rights(output)

    return output

# Main

In [71]:
# Load Alma xml
tree = ET.parse("AC16588536.xml")
collection = tree.getroot() # the root element is referred to as `collection` (the export starts with collection)

# Convert every child of the collection and write one output file per record.
# The loop variable has to be the module-level name `record`: the create_*
# functions called by create_DCxml read that name instead of a parameter.
for record in collection:
    output = create_DCxml(record)
    # create tree ---------------------------------------------------
    outputTree = ET.ElementTree(output)
    # -------------------------------------------------------------
    # write output--------------------------------------------------------------
    # The file is named output_<text of controlfield 009>.xml; a record without
    # controlfield 009 raises AttributeError on the next line.
    acNr = record.find(".//controlfield[@tag='009']").text
    outputTree.write("{}_{}.xml".format("output", acNr))