## This notebook creates a report on all of the metadata files in a directory and subdirectory.
1. finds all of the xml files in a directory and it's sub directories.
1. opens each xml file
1. extracts out the FGDC title and abstract
1. validates the FGDC record against the FGDC schema, and lists errors
1. writes each filename, title, abstract and list of errors to a csv.

In [1]:
import glob
import csv

import pandas as pd

#This needs to be run using the python install that contains the pymdwizard and it's dependencies
from pymdwizard.core import fgdc_utils, xml_utils

### Change this to the directory we're going to be searching through

In [2]:
dname = r"c:\temp"

md_fnames = list(glob.iglob('{}/**/*.xml'.format(dname), recursive=True))
md_fnames = [f for f in md_fnames if '~' not in f]

### Change this to a filename to save the output to

In [3]:
out_fname = r"spatial_fgdc_reportx.csv"

In [4]:
with open(out_fname, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"',quoting=csv.QUOTE_MINIMAL)
    
    csvwriter.writerow(['filename', 'title', 'abstract', 'errors'])
    for fname in md_fnames:
        try:
            
            xml = xml_utils.fname_to_node(fname)
            fname = fname.replace(dname, '')

            title = xml.xpath('idinfo/citation/citeinfo/title')[0].text
            abstract = xml.xpath('idinfo/descript/abstract')[0].text
            errors = fgdc_utils.validate_xml(xml, xsl_fname='fgdc')
        except:
            title = "<<<could not get the title>>>"
            abstract = "<<<could not get the abstract>>>"
            errors = "<<<could not open file>>>"
            
        csvwriter.writerow([fname, title, abstract, str(errors)])

In [5]:
pd.read_csv(out_fname, encoding='latin-1').head(15)

Unnamed: 0,filename,title,abstract,errors
0,\42012_FGDC.xml,<<<could not get the title>>>,<<<could not get the abstract>>>,<<<could not open file>>>
1,\42012_FGDC_FGDC.xml,<<<could not get the title>>>,<<<could not get the abstract>>>,<<<could not open file>>>
2,\42012_FGDC_Original.xml,<<<could not get the title>>>,<<<could not get the abstract>>>,<<<could not open file>>>
3,\42012_line_FGDC.xml,<<<could not get the title>>>,<<<could not get the abstract>>>,<<<could not open file>>>
4,\42012_line_Original.xml,<<<could not get the title>>>,<<<could not get the abstract>>>,<<<could not open file>>>
5,\42012_Original.xml,<<<could not get the title>>>,<<<could not get the abstract>>>,<<<could not open file>>>
6,\az_FWSRefugeBdys_may05_FGDC.xml,az_FWSRefugeBdys_may05,This data set depicts refuge boundary informat...,[('metadata/idinfo/timeperd/timeinfo/rngdates/...
7,\az_FWSRefugeBdys_may05_Original.xml,az_FWSRefugeBdys_may05,This data set depicts refuge boundary informat...,"[('metadata/idinfo/native', ""Element 'begdate'..."
8,\bio_01_2007_4km_FGDC.xml,<<<could not get the title>>>,<<<could not get the abstract>>>,<<<could not open file>>>
9,\bio_01_2007_4km_Original.xml,<<<could not get the title>>>,<<<could not get the abstract>>>,<<<could not open file>>>
