## Initialisation
Before running the initialisation cell below, you must upload your completed DRI Batch Metadata Template file in Excel format to storage available to the Notebook. You should also create an output folder to store the generated XML files.

When you have completed this you should click on the run icon for the cell below which will read in your metadta file and set up some data that will be used in the rest of the Notebook.

Bear in mind that it is important to run the cells in Jupyter Notebooks in order. If you try to run the checking or processing cells before you have run this initialisation cell, you will get errors or unexpected results.

In [47]:
import pandas as pd
from os import path
import re

# Setup lists of valid fields which we will use for checking our metadata later
dc_fields = [
    'dc:identifier',
    'dc:title',
    'dcterms:alternative',
    'dc:creator',
    'dc:date',
    'dcterms:created',
    'dcterms:issued',
    'dc:description',
    'dc:rights',
    'dc:type',
    'dcterms:accessRights',
    'dc:language',
    'dc:contributor',
    'dc:source',
    'dc:coverage',
    'dcterms:spatial',
    'dcterms:temporal',
    'dc:subject',
    'dcterms:depicted',
    'dc:relation',
    'determs:isVersionOf',
    'dcterms:hasVersion',
    'dcterms:isPartOf',
    'dcterms:hasPart',
    'dcterms:isReferencedBy',
    'dcterms:references',
    'dcterms:isFormatOf',
    'dcterms:hasFormat']

marcrel_fields = ["abr","act","adp","rcp","anl","anm","ann","anc","apl","ape",
                  "app","arc","arr","acp","adi","art","ill","ard","asg","asn",
                  "fmo","att","auc","aue","aup","aut","aqt","aud","ato","ant",
                  "bnd","bdd","blw","bka","bkd","bkp","bjd","bpd","bsl","brl",
                  "brd","cll","cop","ctg","cas","cad","cns","chr","cng","cli",
                  "cor","col","clt","clr","cmm","cwt","com","cpl","cpt","cpe",
                  "cmp","cmt","ccp","cnd","con","csl","csp","cos","cot","coe",
                  "cts","ctt","cte","ctr","ctb","cpc","cph","crr","crp","cst",
                  "cou","crt","cov","cre","cur","dnc","dtc","dtm","dte","dto",
                  "dfd","dft","dfe","dgc","dgg","dgs","dln","dpc","dpt","dsr",
                  "drt","dis","dbp","dst","djo","dnr","drm","dbd","dub","edt",
                  "edc","edm","edd","elg","elt","enj","eng","egr","etr","evp",
                  "exp","fac","fld","fmd","fds","flm","fmp","fmk","fpy","frg",
                  "fmo","fon","fnd","gdv","gis","hnr","hst","his","ilu","ill",
                  "ink","ins","itr","ive","ivr","inv","isb","jud","jug","lbr",
                  "ldr","lsa","led","len","ltr","lil","lit","lie","lel","let",
                  "lee","lbt","lse","lso","lgd","ltg","lyr","mka","mfp","mfr",
                  "mrb","mrk","med","mdc","mte","mtk","mxe","mod","mon","mcp",
                  "mup","msd","mus","nrt","nan","onp","osp","opn","orm","org",
                  "oth","own","pan","ppm","pta","pth","pat","pnc","prf","prf",
                  "pma","pht","pad","ptf","ptt","pte","plt","pra","pre","prt",
                  "pop","prm","prc","pro","prn","prs","pmn","prd","prp","prg",
                  "pdr","pfr","crr","prv","pbl","pup","pbl","pbd","ppt","rdd",
                  "rpc","rap","rce","rcd","red","rxa","ren","rpt","rps","rth",
                  "rtm","res","rsp","rst","rse","rpy","rsg","rsr","rev","rbr",
                  "sce","sad","aus","scr","fac","scl","spy","sec","sll","std",
                  "stg","sgn","ins","sng","swd","sds","sde","spk","sfx","spn",
                  "sgd","stm","stn","str","stl","sht","srv","tch","tad","tcd",
                  "tld","tlg","tlh","tlp","tau","ths","trc","fac","trl","tyd",
                  "tyg","bkd","uvp","vdg","vfx","vac","wit","wde","wdc","wam",
                  "wac","wal","wat","waw","wfs","wfw","wft","win","wpr","wst",
                  "wts"]

xml_header = '<qualifieddc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:marcrel="http://www.loc.gov/marc.relators/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/marc.relators/ http://imlsdcc2.grainger.illinois.edu/registry/marcrel.xsd" xsi:noNamespaceSchemaLocation="http://dublincore.org/schemas/xmls/qdc/2008/02/11/qualifieddc.xsd">'
xml_footer = '</qualifieddc>'

dri_mandatory_fields = ['dc:title',
    'dc:creator',
    'dc:date',
    'dcterms:created',
    'dcterms:issued',
    'dc:description',
    'dc:rights',
    'dc:type']

# Get the filename to read and the output folder
filename = ""
while True:
  if not path.isfile(filename):
    print('You must enter a valid filename for the file containing your metadata.')
  else:
    print(f'Successfully found the input file "{filename}"')
    break
  filename = input('Please enter the name of the file you have uploaded containing the cleaned metadata: ')

tab = input(f'Please enter the name of the tab in the spreadsheet {filename} which contains the metadata')

outputdir = ""
while True:
  if not path.isdir(outputdir):
    print('You must enter a valid folder name for the output folder, make sure you have created the folder first.')
  else:
    print(f'Successfully found the output folder "{outputdir}"')
    break
  outputdir = input('Please enter the name of the output folder: ')

print('If no errors are displayed here, you may now progress to execute the next code step.')
print('You may run the proceeding code steps as many times as you want without having to re-run this Initialisation step.')
print('You only need to re-run the Initialisation step if you see an error here, or if you want to change the input file or output folder.')
    
def lookup_case_insensitive(value, choices):
    """Return the matching value from choices with original case, or None if not found."""
    value_cf = value.casefold()
    lookup = {c.casefold(): c for c in choices}
    return lookup.get(value_cf)

class CaseInsensitiveLookupManager:
    def __init__(self):
        self.lookups = {}

    def register(self, name, choices):
        """Register a lookup set by name."""
        self.lookups[name] = {c.casefold(): c for c in choices}

    def get(self, name, value):
        """Case-insensitive lookup in the named set."""
        return self.lookups.get(name, {}).get(value.casefold())

    def find_in(self, names, value):
        """
        Search for value in multiple sets in order.
        Returns (set_name, match) or (None, None) if not found.
        """
        value_cf = value.casefold()
        for name in names:
            match = self.lookups.get(name, {}).get(value_cf)
            if match is not None:
                return name, match
        return None, None

You must enter a valid filename for the file containing your metadata.


Please enter the name of the file you have uploaded containing the cleaned metadata:  DRI_Metadata Template_withdups.xlsx


Successfully found the input file "DRI_Metadata Template_withdups.xlsx"


Please enter the name of the tab in the spreadsheet DRI_Metadata Template_withdups.xlsx which contains the metadata Template


You must enter a valid folder name for the output folder, make sure you have created the folder first.


Please enter the name of the output folder:  output


Successfully found the output folder "output"
If no errors are displayed here, you may now progress to execute the next code step.
You may run the proceeding code steps as many times as you want without having to re-run this Initialisation step.
You only need to re-run the Initialisation step if you see an error here, or if you want to change the input file or output folder.


## Check and Clean the Metadata
The following cell will perform some checks on your metadata to make sure that it can be converted to Dublin Core XML files.

1. First it will discard any empty columns, necessary as there will likely be columns in the metadata template that are not relevant for your data. We don't want to include these as empty Dublin Core elements in our xml files.

2. Next it will check that all of the column headers match a valid Dublin Core element. You may have added additional columns to handle multiple values, e.g. for multiple Subjects you may have dc:Subject, dc:Subject2, dc:Subject3, etc. The Notebook will ask you to clarify the metadata mapping for any columns where this is unclear.

In [48]:
# Read in the metadata file
try:
    df = pd.read_excel(filename, tab)
except Exception as e:
    print('Failed to read the metadata file. '
          'This may be because the first step has not been run or encountered an error. '
          'Alternatively, you may have given the wrong name for the tab containing your metadata. '
          'Please run the first step again.')
    print(e)

# drop any unused columns
df = df.dropna(axis=1, how='all')

drop_fields = []
mappings = {}

marcrel_pattern = re.compile('^marcrel:.*', re.IGNORECASE)

lookup_manager = CaseInsensitiveLookupManager()
lookup_manager.register("dc", dc_fields)
lookup_manager.register("marcrel", marcrel_fields)

for field in df.columns:
    field_cf = field.casefold()

    if field_cf == "filename":
        continue

    if marcrel_pattern.match(field):
        ns1, code1 = field.split(":")
        _, match1 = lookup_manager.find_in(["marcrel"], code1)
        if not match1:
            print(f'Field "{field}" is not a valid MARC Relator code.')
            while True:
              tmp = input(
                f'Please specify the field (in the format marcrel:xxx) for mapping. '
                f'Hit enter to ignore this column: '
              )
              if tmp == "":
                print(f'Dropping column {field}')
                drop_fields.append(field)
                break
              else:
                ns2, code2 = tmp.split(":")
                _, match2 = lookup_manager.find_in(["marcrel"], code2)
                if not match2:
                  print(f'The input "{tmp}" is not a valid MARC Relator code.')
                else:
                  print(f'Valid MARC Relator code input. The field {field} will be mapped to {tmp} in the Dublin Core output file')
                  mappings[field] = tmp
                  break
        else:
            mappings[field] = "marcrel:"+match1

    else:
        _, match1 = lookup_manager.find_in(["dc"], field)
        if not match1:
            print(f'Field "{field}" is not a valid Dublin Core field.')
            while True:
                tmp = input(
                    f'Please specify the dc field (in the format dc:xxxxxx or dcterms:xxxxxx) for mapping. '
                    f'Hit enter to ignore this column: '
                )
                if tmp == "":
                    print(f'Dropping column {field}')
                    drop_fields.append(field)
                    break
                else:
                    _, match2 = lookup_manager.find_in(["dc"], tmp)
                    if not match2:
                        print(f'The input "{tmp}" is not a valid Dublin Core field.')
                    else:
                        print(f'Valid Dublin Core input. The field {field} will be mapped to {tmp} in the Dublin Core output file')
                        mappings[field] = tmp
                        break
        else:
            mappings[field] = match1

df = df.drop(drop_fields, axis=1)

print('Finished running the second step.')
print('If you do not see any errors here, you can safely proceed to the third step.')

Field "dc:Subject.1" is not a valid Dublin Core field.


Please specify the dc field (in the format dc:xxxxxx or dcterms:xxxxxx) for mapping. Hit enter to ignore this column:  dc:subject


Valid Dublin Core input. The field dc:Subject.1 will be mapped to dc:subject in the Dublin Core output file
Field "potato" is not a valid Dublin Core field.


Please specify the dc field (in the format dc:xxxxxx or dcterms:xxxxxx) for mapping. Hit enter to ignore this column:  dc:subject


Valid Dublin Core input. The field potato will be mapped to dc:subject in the Dublin Core output file
Field "potato.1" is not a valid Dublin Core field.


Please specify the dc field (in the format dc:xxxxxx or dcterms:xxxxxx) for mapping. Hit enter to ignore this column:  dc:subject


Valid Dublin Core input. The field potato.1 will be mapped to dc:subject in the Dublin Core output file
Field "dc:Subject 2" is not a valid Dublin Core field.


Please specify the dc field (in the format dc:xxxxxx or dcterms:xxxxxx) for mapping. Hit enter to ignore this column:  dc:subject


Valid Dublin Core input. The field dc:Subject 2 will be mapped to dc:subject in the Dublin Core output file
Field "dc:Subject 3" is not a valid Dublin Core field.


Please specify the dc field (in the format dc:xxxxxx or dcterms:xxxxxx) for mapping. Hit enter to ignore this column:  dc:subject


Valid Dublin Core input. The field dc:Subject 3 will be mapped to dc:subject in the Dublin Core output file
Field "marcrel:***" is not a valid MARC Relator code.


Please specify the field (in the format marcrel:xxx) for mapping. Hit enter to ignore this column:  


Dropping column marcrel:***
Finished running the second step.
If you do not see any errors here, you can safely proceed to the third step.


## Process Metadata and create XML files
This next cell will iterate through all rows in your metadata spreadsheet.

It will perform some additional checks as follows:

1. It will look for cell values containing an ampersand (&) character. These may not be valid XML and need to be escaped (converted to the string "&amp;"). It will print a warning, but will not automatically convert these.
2. It will check to see if all of the DRI Required fields are present. If not it will print a warning.

Please note that both of the above checks will only result in a warning. They will not be corrected, or stop execution. Read the output from this step carefully to determine whether you need to make further changes to your metadata spreadsheet or to the output XML files.

Finally it will convert the cell values to xml and write these out to the output file which will be named for your Filename cell. A zip file metadata.zip, and a tar archive metadata.tar.gz, containing of your output folder will also be created. These can then be downloaded to your local machine and extracted, before ingesting into DRI or another Repository or application that supports Dublin Core metadata.


In [49]:
amp_pattern = re.compile(".*&.*")

for index, row in df.iterrows():
  if pd.isna(row["Filename"]):
      print('WARNING: A row without a "Filename" has been found. This row cannot be written to an XML file. '
            'Please check your metadata Spreadsheet and correct if required.')
  xmlfile = open(f'{outputdir}/{row["Filename"]}.xml', 'w')
  xmlfile.write(xml_header)
        
  for field in df.columns:
    if field == "Filename":
      continue
    elif pd.isna(row[field]):
      if (mappings[field] in dri_mandatory_fields) and (mappings[field] in ['dc:date','dcterms:created','dcterms:issued']):
        print(f'WARNING: The file {row["Filename"]} does not have a {field} field. '
             'This is a mandatory field for DRI ingest, but this may not be a problem if you have one of the fields '
             'dc:date, dcterms:created or dcterms:issued')
      elif mappings[field] in dri_mandatory_fields:
          print(f'WARNING: The file {row["Filename"]} does not have a {field} field. '
                'This is a mandatory field for DRI ingest.')
      continue
    else:
      if amp_pattern.match(str(row[field])):
        print(f'WARNING: The following metadata entry in file {row["Filename"]} contains an & character and may not produce valid XML:')
        print(row[field])
      tmp = f'<{mappings[field]}>{row[field]}</{mappings[field]}>\n'
      xmlfile.write(tmp)
  xmlfile.write(xml_footer)
  xmlfile.close()

!zip -r metadata.zip $outputdir
!tar -czf metadata.tar.gz $outputdir

print('Finished running the final step.')
print(f'If you did not see any errors for any of the steps, then your output files should now be available in the {outputdir} folder')
print('You should also see a zip file and/or tar.gz archive containing the output folder')
print('Please do not forget to download the zip/archive file before exiting your Jupyter Notebooks environment.')


potato2&
/bin/bash: line 1: zip: command not found
Finished running the final step.
If you did not see any errors for any of the steps, then your output files should now be available in the output folder
You should also see a zip file and/or tar.gz archive containing the output folder
Please do not forget to download the zip/archive file before exiting your Jupyter Notebooks environment.
