# How to validate an FGDC XML record in Python with only the schema

In [8]:
import os

import requests
from lxml import etree
from lxml import html

In [9]:
# Address of the sample FGDC XML record (hosted on usgs.gov) that a later cell downloads.
url = "https://www2.usgs.gov/datamanagement/documents/USGS_ASC_PolarBears_FGDC.xml"

In [19]:
# Download the record. Fail fast on an HTTP error or a stalled connection
# instead of handing an error page to the XML parser.
res = requests.get(url, timeout=30)
res.raise_for_status()

# Drop a leading UTF-8 byte-order mark only when one is actually present,
# rather than unconditionally discarding the first three bytes of the payload.
raw_xml = res.content
if raw_xml.startswith(b"\xef\xbb\xbf"):
    raw_xml = raw_xml[3:]

# etree.fromstring returns the root element of the parsed record.
doc = etree.fromstring(raw_xml)

In [28]:
# Location of the XSD used for validation. Set the FGDC_SCHEMA_XSD environment
# variable to point at a local copy of the schema; when it is unset the
# original hard-coded path is used.
schema_fname = os.environ.get(
    "FGDC_SCHEMA_XSD",
    r"C:\temp\junk\BDPSchema\BDPfgdc-std-001-1998.xsd",
)
# Parse the XSD file and compile it into a reusable validator object.
xmlschema_doc = etree.parse(schema_fname)
xmlschema = etree.XMLSchema(xmlschema_doc)

## This record has no errors, of course. 

In [30]:
# Check the freshly parsed record against the compiled schema (True means it conforms).
xmlschema.validate(doc)

True

## Let's introduce some errors.

In [49]:
# Bad date format: overwrite the range begin date with text that is not a date.
# The element is addressed by path rather than by a chain of positional
# getchildren() calls (deprecated in lxml), which silently targets a different
# element as soon as a sibling is added or removed.
doc.find('idinfo/timeperd/timeinfo/rngdates/begdate').text = 'Bad date format'

In [60]:
# Missing keywords: drop the <keywords> section from <idinfo>.
# Looking the element up by name keeps this cell safe to re-run; removing by
# position would delete a different sibling on every additional execution.
idinfo = doc.find('idinfo')
keywords = idinfo.find('keywords')
if keywords is not None:  # explicit None test: childless lxml elements are falsy
    idinfo.remove(keywords)

In [72]:
# Duplicate logical consistency: give <dataqual> a second <logic> element.
# <dataqual> is looked up by name instead of the deprecated getchildren()[1].
dataqual = doc.find('dataqual')
logical2 = etree.Element('logic')
logical2.text = 'This is illogical'
# Only insert while there is at most one <logic>, so re-running the cell does
# not keep piling up extra copies.
if len(dataqual.findall('logic')) < 2:
    dataqual.insert(1, logical2)

## Does it pass? 

In [75]:
# Validate again now that the record has been modified by the preceding cells.
xmlschema.validate(doc)

False

## What's wrong with it?

In [78]:
# Walk the validator's error log (filled by the most recent validate() call)
# and print each entry's message together with its reported line number.
for error in xmlschema.error_log:
    print(error.message, " at line number ", error.line)

Element 'begdate': 'Bad date format' is not a valid value of the union type 'begdateType'.  at line number  23
Element 'taxonomy': This element is not expected. Expected is one of ( spdom, keywords ).  at line number  56
Element 'logic': This element is not expected. Expected is ( complete ).  at line number  133


## But we really want the XPath to each error, not just the line numbers

In [81]:
import lxml.etree

def get_xpath_from_line_nums(doc, line_nums):
    """Map source line numbers to the XPath of the element found on each line.

    Parameters
    ----------
    doc : lxml element
        Root element of the parsed document; it is wrapped in an ElementTree
        so that absolute paths can be generated.
    line_nums : iterable of int
        Line numbers to resolve, e.g. the ``line`` attribute of schema
        validation errors.

    Returns
    -------
    dict
        ``{line_number: xpath}``. A line number on which no element starts
        maps to ``None`` instead of raising ``KeyError``. When several
        elements share a source line, the last one visited wins.
    """
    tree = lxml.etree.ElementTree(doc)
    # 'descendant-or-self::*' (rather than './/*') keeps the root element in
    # the lookup, so errors reported against the root itself can be resolved.
    line_lookup = {
        element.sourceline: tree.getpath(element)
        for element in tree.xpath('descendant-or-self::*')
    }
    # .get() tolerates lines that have no matching element.
    return {line: line_lookup.get(line) for line in line_nums}

In [82]:
# Gather the line number of every logged validation error ...
error_lines = [entry.line for entry in xmlschema.error_log]
# ... and resolve each of those lines to the XPath of its element.
get_xpath_from_line_nums(doc, error_lines)

{23: '/metadata/idinfo/timeperd/timeinfo/rngdates/begdate',
 56: '/metadata/idinfo/taxonomy',
 133: '/metadata/dataqual/logic[2]'}