# How to validate an FGDC XML record in python with only the schema

In [None]:
import requests
from lxml import etree
from lxml import html

In [None]:
url = "https://www2.usgs.gov/datamanagement/documents/USGS_ASC_PolarBears_FGDC.xml"
fname= r"N:\Metadata\MetadataWizard\pymdwizard\tests\data\USGS_ASC_PolarBears_FGDC_savedas2.xml"

In [3]:
res = requests.get(url)
# doc = lxml.html.parse(res.content[3:])
doc = etree.fromstring(res.content[3:])

In [4]:
doc = etree.parse(fname)

schema_fname = r"C:\temp\junk\BDPSchema\BDPfgdc-std-001-1998.xsd"
schema_fname2 = r"X:\FORT-wide Resources\DataManagement\Metadata\Tools\XMLNotepad\BDPfgdc-std-001-1998-annotated.xsd"
xmlschema_doc = etree.parse(schema_fname)
xmlschema = etree.XMLSchema(xmlschema_doc)

## This record has no errors, of course. 

In [5]:
xmlschema.validate(doc)

False

## Let's introduce some errors.

In [6]:
# Bad date format
doc.getchildren()[0].getchildren()[2].getchildren()[0].getchildren()[0].getchildren()[0].text = 'Bad date format'

AttributeError: 'lxml.etree._ElementTree' object has no attribute 'getchildren'

In [7]:
#missing keywords
idinfo = doc.getchildren()[0]
idinfo.remove(doc.getchildren()[0].getchildren()[5])

AttributeError: 'lxml.etree._ElementTree' object has no attribute 'getchildren'

In [8]:
#duplicate logical consistency
dataqual = doc.getchildren()[1]
logical2 = etree.Element('logic')
logical2.text = 'This is illogical'
dataqual.insert(1, logical2)

AttributeError: 'lxml.etree._ElementTree' object has no attribute 'getchildren'

## Does it pass? 

In [9]:
xmlschema.validate(doc)

False

## What's wrong with it?

In [10]:
for error in xmlschema.error_log:
    print(error.message, " at line number ", error.line)

Element 'citation': Missing child element(s). Expected is ( citeinfo ).  at line number  3
Element 'descript': Missing child element(s). Expected is ( abstract ).  at line number  4
Element 'timeperd': Missing child element(s). Expected is ( timeinfo ).  at line number  5
Element 'status': Missing child element(s). Expected is ( progress ).  at line number  6
Element 'accconst': [facet 'pattern'] The value '' is not accepted by the pattern '\s*\S(.|\n|\r)*'.  at line number  15
Element 'accconst': '' is not a valid value of the atomic type 'accconstType'.  at line number  15
Element 'useconst': [facet 'pattern'] The value '' is not accepted by the pattern '\s*\S(.|\n|\r)*'.  at line number  16
Element 'useconst': '' is not a valid value of the atomic type 'useconstType'.  at line number  16
Element 'cntorg': [facet 'pattern'] The value '' is not accepted by the pattern '\s*\S(.|\n|\r)*'.  at line number  20
Element 'cntorg': '' is not a valid value of the atomic type 'cntorgType'.  at 

## But we really want the xpath to the error and not the line numbers

In [11]:
import lxml.etree

def get_xpath_from_line_nums(doc, line_nums):
#     tree = lxml.etree.ElementTree(doc)
    line_lookup = dict([(e.sourceline, tree.getpath(e)) for e in tree.xpath('.//*')])
    results = {}
    for i in line_nums:
        print(i)
        results[i] = line_lookup[i]
    return results
#     return dict([(i, line_lookup[i]) for i in line_nums])

In [12]:
doc.getroot().sourceline

1

In [14]:
type(doc)

lxml.etree._ElementTree

In [35]:
dict([(e.sourceline, tree.getpath(e)) for e in tree.xpath('.//*')])


{3: '/metadata/idinfo',
 4: '/metadata/idinfo/citation',
 5: '/metadata/idinfo/citation/citeinfo',
 6: '/metadata/idinfo/citation/citeinfo/origin',
 7: '/metadata/idinfo/citation/citeinfo/pubdate',
 8: '/metadata/idinfo/citation/citeinfo/title',
 9: '/metadata/idinfo/citation/citeinfo/geoform',
 10: '/metadata/idinfo/citation/citeinfo/serinfo',
 11: '/metadata/idinfo/citation/citeinfo/serinfo/sername',
 12: '/metadata/idinfo/citation/citeinfo/serinfo/issue',
 16: '/metadata/idinfo/descript',
 17: '/metadata/idinfo/descript/abstract',
 18: '/metadata/idinfo/descript/purpose',
 20: '/metadata/idinfo/timeperd',
 21: '/metadata/idinfo/timeperd/timeinfo',
 22: '/metadata/idinfo/timeperd/timeinfo/rngdates',
 23: '/metadata/idinfo/timeperd/timeinfo/rngdates/begdate',
 24: '/metadata/idinfo/timeperd/timeinfo/rngdates/enddate',
 27: '/metadata/idinfo/timeperd/current',
 29: '/metadata/idinfo/status',
 30: '/metadata/idinfo/status/progress',
 31: '/metadata/idinfo/status/update',
 33: '/metadata

In [36]:
error_lines = [error.line for error in xmlschema.error_log]
# tree = lxml.etree.ElementTree(doc)
get_xpath_from_line_nums(doc, error_lines)

3
4
5
6
15


KeyError: 15

In [52]:
# tree = lxml.etree.ElementTree(doc)
line_lookup = dict([(e.sourceline, doc.getpath(e)) for e in doc.xpath('.//*')])
line_lookup[doc.getroot().sourceline] = doc.getpath(doc.getroot())
for error in xmlschema.error_log:
    print(error.line)
    print("     ", line_lookup[error.line], error.message, " at line number ", error.line)

3
      /metadata/idinfo/citation Element 'citation': Missing child element(s). Expected is ( citeinfo ).  at line number  3
4
      /metadata/idinfo/descript Element 'descript': Missing child element(s). Expected is ( abstract ).  at line number  4
5
      /metadata/idinfo/timeperd Element 'timeperd': Missing child element(s). Expected is ( timeinfo ).  at line number  5
6
      /metadata/idinfo/status Element 'status': Missing child element(s). Expected is ( progress ).  at line number  6
15
      /metadata/idinfo/accconst Element 'accconst': [facet 'pattern'] The value '' is not accepted by the pattern '\s*\S(.|\n|\r)*'.  at line number  15
15
      /metadata/idinfo/accconst Element 'accconst': '' is not a valid value of the atomic type 'accconstType'.  at line number  15
16
      /metadata/idinfo/useconst Element 'useconst': [facet 'pattern'] The value '' is not accepted by the pattern '\s*\S(.|\n|\r)*'.  at line number  16
16
      /metadata/idinfo/useconst Element 'useconst': '' 

In [47]:
line_lookup

{2: '/metadata/idinfo',
 3: '/metadata/idinfo/citation',
 4: '/metadata/idinfo/descript',
 5: '/metadata/idinfo/timeperd',
 6: '/metadata/idinfo/status',
 7: '/metadata/idinfo/keywords',
 8: '/metadata/idinfo/keywords/theme',
 9: '/metadata/idinfo/keywords/theme/themekt',
 10: '/metadata/idinfo/keywords/theme/themekey[1]',
 11: '/metadata/idinfo/keywords/theme/themekey[2]',
 12: '/metadata/idinfo/keywords/theme/themekey[3]',
 15: '/metadata/idinfo/accconst',
 16: '/metadata/idinfo/useconst',
 17: '/metadata/idinfo/ptcontac',
 18: '/metadata/idinfo/ptcontac/cntinfo',
 19: '/metadata/idinfo/ptcontac/cntinfo/cntorgp',
 20: '/metadata/idinfo/ptcontac/cntinfo/cntorgp/cntorg',
 21: '/metadata/idinfo/ptcontac/cntinfo/cntorgp/cntper',
 23: '/metadata/idinfo/ptcontac/cntinfo/cntpos',
 24: '/metadata/idinfo/ptcontac/cntinfo/cntaddr',
 25: '/metadata/idinfo/ptcontac/cntinfo/cntaddr/addrtype',
 26: '/metadata/idinfo/ptcontac/cntinfo/cntaddr/address',
 27: '/metadata/idinfo/ptcontac/cntinfo/cntaddr

In [44]:
tree = lxml.etree.ElementTree(doc)

TypeError: Argument 'element' has incorrect type (expected lxml.etree._Element, got lxml.etree._ElementTree)

In [None]:
xmlschema_doc.xpath('//a/@ref')