# Examples of using the pymdwizard library to batch-update XML metadata

* Change all instances of an author's name
* Update project webpage & publication date
* Reset title to match that stored in "Citation" field



In [1]:
import glob
import pandas as pd
from lxml import etree
import sys
import os
from pymdwizard.core.xml_utils import XMLRecord, XMLNode #import in this manner requires adding a path file to your site-packages directory

**List the existing XML files you wish to edit**

In [3]:
# Folder with the XML metadata, one level above the current working directory.
# Built with os.path.join so the separator is right on every platform
# (on Windows this yields the same "..\_XMLMetadata" string as before).
xml_fldr = os.path.join(os.pardir, "_XMLMetadata")
# Work from inside that folder: the glob pattern below, and any relative path
# used after this point, is resolved against it.
# NOTE: because the path is relative, running this cell a second time without
# restarting the kernel changes directory again from the new location.
os.chdir(xml_fldr)

# Pattern of the XML files you wish to select
xml_pattern = "*RawGPR.xml"
# List RawGPR xmls (file names relative to xml_fldr)
xmls = glob.glob(xml_pattern)

**Update Project Webpage**

In [5]:
# Web address to store in the online-linkage field of each record's larger
# work citation. It is the same for every file, so it is set once, before the
# loop, rather than on every iteration.
# NOTE(review): the host is "staging-www.usgs.gov" - confirm this is the
# address that should end up in the saved metadata.
new_website = "https://staging-www.usgs.gov/climate_landuse/clu_rd/glacierstudies/data.asp"
for fl in xmls:
    metd = XMLRecord(fl)
    # Overwrite the existing onlink text of the larger work citation.
    metd.metadata.idinfo.citation.citeinfo.lworkcit.citeinfo.onlink.text = new_website
    metd.save()

**Change Person's Name (e.g. add middle initial)**

In [6]:
# Swap one exact spelling of a person's name for another in every selected file.
replace_name = "Erin Whorton"
new_name = "Erin N. Whorton"
for fl in xmls:
    metd = XMLRecord(fl)
    # 'origin' entries of the record's main citation
    originators = metd.metadata.idinfo.citation.citeinfo.origin
    for originator in originators:
        if originator.text != replace_name:
            continue  # a different name; leave it untouched
        originator.text = new_name
    metd.save()

**Update Publication Date**

In [7]:
# Date text exactly as it should appear in the publication-date node.
pubdate = "201707"
for fl in xmls:
    metd = XMLRecord(fl)
    citeinfo = metd.metadata.idinfo.citation.citeinfo
    citeinfo.pubdate.text = pubdate  # replace whatever date was there
    metd.save()

**Change List of Authors in Larger Work Citation**

The author list can either be read in from an XML fragment:

In [8]:
all_author_path = r"../people.xml"
# XMLNode is given the fragment as a string, so the file is read in full first.
# The with-block closes the file handle as soon as the text has been read,
# instead of leaving it open for the life of the kernel.
with open(all_author_path, 'r') as people_file:
    all_author = XMLNode(people_file.read())

Or, created from text entered directly in the notebook:

In [9]:
# Author list entered directly as an XML fragment: a single <people> root with
# one <origin> element per author, in the order they should be listed.
# The string ends at the closing </people> tag; a stray ")" that used to
# follow it inside the literal has been removed so the text is only XML.
# NOTE(review): the entry for Emily H. Baker spells the identifier label
# "OrcID" while the others use "ORCID" - confirm which is intended.
all_author = XMLNode(r'''
<people>
<origin>Shad O'Neel (ORCID 0000-0002-9185-0144)</origin>
<origin>Daniel McGrath (ORCID 0000-0002-9462-6842)</origin>
<origin>Gabriel J. Wolken</origin>
<origin>Salvatore G. Candela</origin>
<origin>Louis C. Sass (ORCID 0000-0003-4677-029X)</origin>
<origin>Christopher J. McNeil (ORCID 0000-0003-4170-0428)</origin>
<origin>Emily H. Baker (OrcID 0000-0002-0938-3496)</origin>
<origin>Esther L. Babcock (ORCID 0000-0001-7665-7795)</origin>
<origin>Michael G. Loso</origin>
<origin>Anthony A. Arendt</origin>
<origin>Erin N. Whorton</origin>
<origin>Evan W. Burgess</origin>
<origin>Alessio Gusmeroli</origin>
</people>''')

Then, this list of authors can be added to each record, after the initial contents are deleted:

In [10]:
for fl in xmls:
    metd = XMLRecord(fl)  # read xml
    # Citation info of the larger work; both steps below act on this node.
    lwork_citeinfo = metd.metadata.idinfo.citation.citeinfo.lworkcit.citeinfo
    # Clear the current author list ...
    lwork_citeinfo.clear_children('origin')
    # ... then add the authors from the xml list back in, one 'origin' at a time.
    for author in all_author.people.origin:
        lwork_citeinfo.add_child(author)
    metd.save()

**Reset title of dataset to match that in the Citation section**

The two should match, but this may not always be the case (e.g. date ranges)

In [11]:
for fl in xmls:
    metd = XMLRecord(fl)
    citeinfo = metd.metadata.idinfo.citation.citeinfo
    # The citation is expected in the format "Author. et. al, 2017, Title."
    # Rewrite the two splits below if your "Title" has to be separated from
    # the larger citation in some other way.
    text_after_year = citeinfo.othercit.text.split("2017, ")[1]  # what follows "2017, "
    title_from_citation = text_after_year.split(".")[0]  # up to the first period
    citeinfo.title.text = title_from_citation
    metd.save()