Skip to content
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
92 lines (74 sloc) 3.04 KB
Extract SetId, Id, NDC and other fields from SPL documents.
TODO(mattmo): Log to a file when fields are not present.
from lxml import etree
import StringIO
import re
# OID compared against code/@codeSystem to select NDC codes in the
# extract_*_ndcs functions.
NDC_OID = '2.16.840.1.113883.6.69'
# Presumably the code-system OID for UNII codes; it is not referenced in the
# visible part of this file -- TODO confirm against its users.
UNII_OID = '2.16.840.1.113883.4.9'
# NOTE(review): ends with a trailing '.', so this looks like an OID prefix
# rather than a complete OID -- confirm how it is matched where it is used.
UNII_OTHER_OID = '2.16.840.1.113883.'
def remove_namespace(xml_file):
"""SPL files have namespaces that don't play nice with python's libxml.
For now we just remove namespaces to work around this."""
# TODO(mattmo): Figure out a better solution to this.
# via-xpath-in-python-in-a-namespace-agnostic-way
# NOTE(review): the right-hand side of the next assignment is missing in this
# copy of the file; presumably it reads the full text of xml_file -- restore
# it from upstream before relying on this function.
lines =
# Replace the opening <document ...> tag, attributes and all, with a bare
# <document>, which drops whatever declarations were carried on that tag.
lines = re.sub(r'<document [^>]+>', '<document>', lines)
# Remove every prefix:type="..." attribute (any word-character prefix
# followed by ":type").
lines = re.sub(r'\w+:type="[^"]+"', '', lines)
# Hand the cleaned text back wrapped in an in-memory file-like object.
return StringIO.StringIO(lines)
def parse_xml(xml_file):
    """Parse the SPL file at path `xml_file` and return the lxml tree.

    The file is passed through remove_namespace() before parsing, so the
    returned tree can be queried with un-prefixed XPath expressions.

    Args:
        xml_file: path of the SPL XML file to read.

    Returns:
        The tree object produced by etree.parse().
    """
    # huge_tree=True is kept from the original; per the lxml docs it lifts the
    # parser's limits on very deep trees and very long text content.
    parser = etree.XMLParser(huge_tree=True)
    # The original passed a bare open() result and never closed it, leaking
    # one file handle per parsed document. The with-block closes the handle
    # even when remove_namespace() or the parser raises.
    with open(xml_file) as spl_file:
        return etree.parse(remove_namespace(spl_file), parser=parser)
def first_match_or_empty_string(matches):
    """Return the first item of `matches`, or '' when `matches` is empty."""
    return matches[0] if len(matches) > 0 else ''
def extract_title(tree):
    """Return every text node under /document/title joined by single spaces."""
    title_fragments = tree.getroot().xpath(
        '/document/title/descendant::text()')
    return ' '.join(title_fragments)
# Returns first_match_or_empty_string() applied to a lookup on `tree`.
# NOTE(review): the call's argument is cut off in this copy of the file, so
# the exact field queried cannot be confirmed here -- restore from upstream.
def extract_id(tree):
return first_match_or_empty_string(
# Returns first_match_or_empty_string() applied to a lookup on `tree`.
# NOTE(review): the call's argument is cut off in this copy of the file, so
# the exact field queried cannot be confirmed here -- restore from upstream.
def extract_set_id(tree):
return first_match_or_empty_string(
# Returns the looked-up date re-formatted as 'YYYY-MM-DD', or '' when the
# lookup yields nothing.
# NOTE(review): the lookup that produces raw_date is cut off in this copy of
# the file -- restore its argument from upstream.
def extract_effective_time(tree):
raw_date = first_match_or_empty_string(
if not raw_date:
return ''
# Characters 0-3 are taken as the year, 4-5 as the month and everything from
# index 6 onward as the day, so any characters beyond an 8-character
# YYYYMMDD value end up in the day part.
year, month, day = raw_date[:4], raw_date[4:6], raw_date[6:]
return year + "-" + month + "-" + day
# Returns first_match_or_empty_string() applied to a lookup on `tree`.
# NOTE(review): the call's argument is cut off in this copy of the file, so
# the exact field queried cannot be confirmed here -- restore from upstream.
def extract_version_number(tree):
return first_match_or_empty_string(
# Returns first_match_or_empty_string() applied to a lookup on `tree`.
# NOTE(review): the call's argument is cut off in this copy of the file, so
# the exact field queried cannot be confirmed here -- restore from upstream.
def extract_display_name(tree):
return first_match_or_empty_string(
def extract_duns(tree):
    """Return the first `extension` attribute found on an <id> element whose
    `root` attribute equals DUNS_OID, or '' when there is none."""
    query = '//id[@root="%s"]/@extension' % DUNS_OID
    extensions = tree.getroot().xpath(query)
    return first_match_or_empty_string(extensions)
def is_original_packager(tree):
    """Return True if this SPL's manufacturer is the original packager.

    That is taken to be the case when the SPL lists no original-packager
    product NDCs."""
    original_packager_ndcs = extract_original_packager_product_ndcs(tree)
    return len(original_packager_ndcs) == 0
def extract_product_ndcs(tree):
    """Return the partial (labelercode-productcode) NDCs: the `code`
    attribute of every manufacturedProduct/code element whose codeSystem
    is NDC_OID."""
    query = '//manufacturedProduct/code[@codeSystem="%s"]/@code' % NDC_OID
    root = tree.getroot()
    return root.xpath(query)
def extract_original_packager_product_ndcs(tree):
    """Return the partial (labelercode-productcode) NDCs of the original
    packager: the `code` attribute of every definingMaterialKind/code
    element whose codeSystem is NDC_OID.

    Only populated if this SPL is for a repackager."""
    query = '//definingMaterialKind/code[@codeSystem="%s"]/@code' % NDC_OID
    root = tree.getroot()
    return root.xpath(query)
def extract_package_ndcs(tree):
    """Return the full labelercode-productcode-packagecode NDCs: the `code`
    attribute of every containerPackagedProduct/code element whose
    codeSystem is NDC_OID."""
    query = '//containerPackagedProduct/code[@codeSystem="%s"]/@code' % NDC_OID
    root = tree.getroot()
    return root.xpath(query)
You can’t perform that action at this time.