## This notebook creates a report on all of the metadata files in a directory and its subdirectories.
1. finds all of the xml files in a directory and its subdirectories.
1. opens each xml file
1. extracts the FGDC title and abstract
1. validates the FGDC record against the FGDC schema, and lists errors
1. writes each filename, title, abstract and list of errors to a csv.

In [1]:
import glob
import csv

import pandas as pd

# This needs to be run using the Python install that contains pymdwizard and its dependencies
from pymdwizard.core import fgdc_utils, xml_utils

### Change this to the directory we're going to be searching through

In [2]:
dname = r"..\Spatial_Data"

md_fnames = list(glob.iglob('{}/**/*.xml'.format(dname), recursive=True))

### Change this to a filename to save the output to

In [3]:
out_fname = r"spatial_fgdc_reportx.csv"

In [7]:
# Write one report row per metadata file: filename (with the dname prefix
# removed), FGDC title, FGDC abstract, and the stringified result of
# fgdc_utils.validate_xml.
#
# The encoding is set explicitly so the file does not depend on the platform's
# default encoding and can be read back by pd.read_csv, which defaults to utf-8.
with open(out_fname, 'w', newline='', encoding='utf-8') as csvfile:
    report_writer = csv.writer(csvfile, delimiter=',',
                               quotechar='"', quoting=csv.QUOTE_MINIMAL)

    report_writer.writerow(['filename', 'title', 'abstract', 'errors'])
    for fname in md_fnames:
        xml = xml_utils.fname_to_node(fname)

        # Strip dname only when it is the leading part of the path;
        # str.replace would also remove any later occurrence of the same text.
        rel_fname = fname[len(dname):] if fname.startswith(dname) else fname

        # Look up title and abstract independently so that a missing abstract
        # does not also discard a title that was found (and vice versa).
        # `except Exception` keeps the best-effort behaviour but still lets
        # KeyboardInterrupt / SystemExit stop a long run.
        try:
            title = xml.xpath('idinfo/citation/citeinfo/title')[0].text
        except Exception:
            title = "<<<could not get the title>>>"
        try:
            abstract = xml.xpath('idinfo/descript/abstract')[0].text
        except Exception:
            abstract = "<<<could not get the abstract>>>"

        errors = fgdc_utils.validate_xml(xml, xsl_fname='fgdc')
        report_writer.writerow([rel_fname, title, abstract, str(errors)])

In [8]:
# Read the report back and show its first five rows (head() defaults to 5).
pd.read_csv(out_fname).head()

Unnamed: 0,filename,title,abstract,errors
0,\Analysis_Structures\Bottomland_Polygons\BLPol...,<<<could not get the title>>>,<<<could not get the abstract>>>,"[('metadata', ""Element 'metadata', attribute '..."
1,\Analysis_Structures\Bottomland_Polygons\Botto...,<<<could not get the title>>>,<<<could not get the abstract>>>,"[('metadata', ""Element 'metadata', attribute '..."
2,\Associated_Models\Cost_of_Restoration\Cost_of...,"Ease of Access to Bottomland Areas (Sept, 2010...",This dataset represents ease of access to bott...,"[('metadata/eainfo/detailed', ""Element 'detail..."
3,\Associated_Models\Cost_of_Restoration\Cost_of...,"Relative Abundance of Herbaceous, Non-native S...",This dataset represents the relative abundance...,[('metadata/dataqual/attracc/qattracc/attracce...
4,\Associated_Models\Cost_of_Restoration\Cost_of...,"Relative Abundance of Woody, Non-native Specie...",This dataset represents the relative abundance...,[('metadata/dataqual/attracc/qattracc/attracce...
