In [1]:
# https://support.datacite.org/docs/fields-file-upload
# https://support.datacite.org/docs/doi-content-negotiation#section-supported-content-types


In [2]:
import xml.etree.ElementTree as ET

In [3]:
# https://docs.python.org/3/library/xml.etree.elementtree.html#tutorial

# https://www.youtube.com/watch?v=j0xr0-IAqyk&ab_channel=Socratica

# Basics

## Load xml

In [4]:
# Parse the XML file into an ElementTree and keep a handle on its root element;
# the cells below read from and modify `tree` and `root`.
tree = ET.parse("bibliographicData.xml")
root = tree.getroot()
# ET.tostring(root)  # would serialize the XML back into a string

## Tagname and Attributes

In [5]:
# Show the tag name of the root element
root.tag

'record'

In [6]:
# Read a single attribute of the root element by its key
root.get("name")  # value of the attribute called "name"

'Tagname'

In [7]:
root.attrib  # all attributes of the root element as a dictionary

{'name': 'Tagname'}

In [8]:
# Set a new attribute on the root element, then display the updated attribute dictionary
root.set("newTagName", "bla")
root.attrib

{'name': 'Tagname', 'newTagName': 'bla'}

## Save Tree into xml File

In [9]:
# Write the tree, including the attribute newly set on the root, to a separate XML file
tree.write("bibliographicData-updated.xml")

# Get Children

The most important attributes are:
* .text - the text content of the tag
* .tag - the name of the tag
* .attrib - the attributes as a dictionary

In [10]:
# Children are nested, so specific child nodes can be reached by index:
# root[19] is the 20th child of the root, and [0] is that child's first child.
# In this record root[19] is the datafield with tag '100' (see the child listing
# printed by the next cell), so the index only holds for this exact file layout.
root[19][0].text

'Grashoff, Dietrich'

In [11]:
# List every child of the root with its tag name and attributes
for child in root:
    print(child.tag, child.attrib)

leader {}
controlfield {'tag': '001'}
controlfield {'tag': '005'}
controlfield {'tag': '007'}
controlfield {'tag': '008'}
controlfield {'tag': '009'}
datafield {'tag': '020', 'ind1': ' ', 'ind2': ' '}
datafield {'tag': '035', 'ind1': ' ', 'ind2': ' '}
datafield {'tag': '035', 'ind1': ' ', 'ind2': ' '}
datafield {'tag': '035', 'ind1': ' ', 'ind2': ' '}
datafield {'tag': '035', 'ind1': ' ', 'ind2': ' '}
datafield {'tag': '035', 'ind1': ' ', 'ind2': ' '}
datafield {'tag': '035', 'ind1': ' ', 'ind2': ' '}
datafield {'tag': '040', 'ind1': ' ', 'ind2': ' '}
datafield {'tag': '041', 'ind1': ' ', 'ind2': ' '}
datafield {'tag': '044', 'ind1': ' ', 'ind2': ' '}
datafield {'tag': '082', 'ind1': '0', 'ind2': '4'}
datafield {'tag': '084', 'ind1': ' ', 'ind2': ' '}
datafield {'tag': '084', 'ind1': ' ', 'ind2': ' '}
datafield {'tag': '100', 'ind1': '1', 'ind2': ' '}
datafield {'tag': '240', 'ind1': '1', 'ind2': '0'}
datafield {'tag': '245', 'ind1': '1', 'ind2': '0'}
datafield {'tag': '250', 'ind1': '

## find and findall Function

In [12]:
# Find all children of the root named "controlfield"
tree.findall("controlfield")

[<Element 'controlfield' at 0x0000014F24C3D300>,
 <Element 'controlfield' at 0x0000014F24C3D350>,
 <Element 'controlfield' at 0x0000014F24C3D3F0>,
 <Element 'controlfield' at 0x0000014F24C3D440>,
 <Element 'controlfield' at 0x0000014F24C3D490>]

In [13]:
# Inspect each controlfield; the properties available on an element are
# tag, attrib and text
for field in tree.findall("controlfield"):
    print(field.tag, field.attrib, field.text)

controlfield {'tag': '001'} 997020655103340
controlfield {'tag': '005'} 20220704101451.0
controlfield {'tag': '007'} cr#|||||||||||
controlfield {'tag': '008'} 190219s2018    ||||||||o|||| 00||||ger
controlfield {'tag': '009'} AC15290932


### Find specific Datafield

In [14]:
# Look up the first datafield whose "tag" attribute is '100' using an XPath expression.
# XPath abbreviations: https://www.w3.org/TR/1999/REC-xpath-19991116/#path-abbrev
#   .//datafield  selects the datafield descendants of the context node (here: root)
#   [@tag='100']  keeps only elements whose tag attribute equals '100'
author = root.find(".//datafield[@tag='100']")

# Text of the first subfield of that datafield
author[0].text

'Grashoff, Dietrich'

In [15]:
# Print tag, attributes and text of every subfield of the author datafield
for child in author:
    print(child.tag, child.attrib, child.text)

subfield {'code': 'a'} Grashoff, Dietrich
subfield {'code': 'd'} 1966-
subfield {'code': '0'} (DE-588)113632029
subfield {'code': '4'} aut


# Modify XML

## Add / Delete Attribute

In [16]:
# Add an incrementing "id" attribute (1, 2, 3, ...) to every controlfield and
# save the result. enumerate() supplies the counter, so no variable named `id`
# is created that would shadow the builtin id() for the rest of the session.
for position, field in enumerate(tree.findall("controlfield"), start=1):
    field.set("id", str(position))  # the counter is an int; the attribute value has to be a str
tree.write("bibliographicData-updated.xml")

In [17]:
# Delete the "id" attribute from every controlfield and save the result.
# pop() with a default makes the cell safe to re-run: a controlfield that has
# no "id" attribute (any more) is skipped instead of raising KeyError.
for field in tree.findall("controlfield"):
    field.attrib.pop("id", None)
tree.write("bibliographicData-updated.xml")

## Add Element

### Method 1 (fromstring)

In [18]:
# Add Controlfield (example left disabled; uncomment both lines below to run it)
# controlfield1 = ET.fromstring("<controlfield>NewField</controlfield>")  # create an element from an XML string
# root.append(controlfield1)  # append the element to the root node

### Method 2 (Element Constructor)

In [19]:
# Add a controlfield built with the Element constructor.
# Note: every run of this cell appends one more controlfield to the root.
controlfield2 = ET.Element("controlfield")  # Element constructor: pass in the tag name
controlfield2.text = "Element Field"  # set the text content of the element
root.append(controlfield2)  # append the element to the root node

In [20]:
# Save the modified tree to the updated XML file
tree.write("bibliographicData-updated.xml")

### Create Datafield
With Indicator and Subfield

In [21]:
# Create a new title datafield (tag 240, indicators 1 and 0) with one subfield
# and append it to the root.
# https://docs.python.org/3/library/xml.etree.elementtree.html#building-xml-documents
# The attributes are handed to the Element / SubElement constructors instead of
# replacing the `.attrib` dictionary after construction.
# Note: every run of this cell appends one more datafield to the root.
datafield1 = ET.Element("datafield", {"tag": "240", "ind1": "1", "ind2": "0"})  # datafield with tag and indicators
subfield1 = ET.SubElement(datafield1, "subfield", {"code": "a"})  # child subfield carrying its code
subfield1.text = "New Title"  # subfield text
root.append(datafield1)  # append to root

<datafield tag="240" ind1="1" ind2="0"><subfield code="a">New Title</subfield></datafield>


In [22]:
# Save the tree to the updated XML file
tree.write("bibliographicData-updated.xml")