Skip to content

Commit

Permalink
Add inline html base processing
Browse files Browse the repository at this point in the history
  • Loading branch information
hefischer committed Dec 16, 2018
1 parent 8a9f50c commit 4b7e74d
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 14 deletions.
2 changes: 1 addition & 1 deletion arelle/HashUtil.py
Expand Up @@ -71,7 +71,7 @@ def md5hash(argList):
elif isinstance(_arg, ModelObject):
# use inner text list
_md5.update('\x1F'.join(text.strip()
for text in XmlUtil.innerTextNodes(_arg, True, False, True))
for text in XmlUtil.innerTextNodes(_arg, True, False, True, False))
.encode('utf-8','replace'))
if firstMd5arg:
md5sum = MD5SUM0
Expand Down
8 changes: 8 additions & 0 deletions arelle/ModelDocument.py
Expand Up @@ -70,6 +70,7 @@ def load(modelXbrl, uri, base=None, referringElement=None, isEntry=False, isDisc
# HMRC note, HMRC.blockedFile should be in this list if hmrc-taxonomies.xml is maintained and up to date
modelXbrl.error(("EFM.6.22.02", "GFM.1.1.3", "SBR.NL.2.1.0.06" if normalizedUri.startswith("http") else "SBR.NL.2.2.0.17"),
_("Prohibited file for filings %(blockedIndicator)s: %(url)s"),
edgarCode="cp-2202-Prohibited-Href-Or-Schema-Location",
modelObject=referringElement, url=normalizedUri,
blockedIndicator=_(" blocked") if blocked else "",
messageCodes=("EFM.6.22.02", "GFM.1.1.3", "SBR.NL.2.1.0.06", "SBR.NL.2.2.0.17"))
Expand Down Expand Up @@ -905,6 +906,7 @@ def baseForElement(self, element):
if self.modelXbrl.modelManager.validateDisclosureSystem:
self.modelXbrl.error(("EFM.6.03.11", "GFM.1.1.7", "EBA.2.1", "EIOPA.2.1"),
_("Prohibited base attribute: %(attribute)s"),
edgarCode="du-0311-Xml-Base-Used",
modelObject=element, attribute=baseAttr, element=element.qname)
else:
if baseAttr.startswith("/"):
Expand Down Expand Up @@ -1193,6 +1195,7 @@ def unitDiscover(self, unitElement):

def inlineXbrlDiscover(self, htmlElement):
ixNS = None
htmlBase = None
conflictingNSelts = []
# find namespace, only 1 namespace
for inlineElement in htmlElement.iterdescendants():
Expand All @@ -1201,17 +1204,22 @@ def inlineXbrlDiscover(self, htmlElement):
ixNS = inlineElement.namespaceURI
elif ixNS != inlineElement.namespaceURI:
conflictingNSelts.append(inlineElement)
elif inlineElement.tag == "{http://www.w3.org/1999/xhtml}base":
htmlBase = inlineElement.get("href")
if ixNS is None: # no inline element, look for xmlns namespaces on htmlElement:
for _ns in htmlElement.nsmap.values():
if _ns in XbrlConst.ixbrlAll:
ixNS = _ns
break
if htmlBase is None:
htmlBase = os.path.dirname(self.uri) + "/"
if conflictingNSelts:
self.modelXbrl.error("ix:multipleIxNamespaces",
_("Multiple ix namespaces were found"),
modelObject=conflictingNSelts)
self.ixNS = ixNS
self.ixNStag = ixNStag = "{" + ixNS + "}" if ixNS else ""
self.htmlBase = htmlBase
ixdsTarget = getattr(self.modelXbrl, "ixdsTarget", None)
# load referenced schemas and linkbases (before validating inline HTML)
for inlineElement in htmlElement.iterdescendants(tag=ixNStag + "references"):
Expand Down
5 changes: 4 additions & 1 deletion arelle/ModelInstanceObject.py
Expand Up @@ -604,10 +604,12 @@ def value(self):
self.xValid = UNVALIDATED # may not be initialized otherwise
self.xValue = None
f = self.format
ixEscape = self.get("escape") in ("true","1")
v = XmlUtil.innerText(self,
ixExclude="tuple" if self.elementQname == XbrlConst.qnIXbrl11Tuple else "html",
ixEscape=(self.get("escape") in ("true","1")),
ixEscape=ixEscape,
ixContinuation=(self.elementQname == XbrlConst.qnIXbrl11NonNumeric),
ixResolveUris=ixEscape,
strip=(f is not None)) # transforms are whitespace-collapse, otherwise it is preserved.
if self.isNil:
self._ixValue = v
Expand Down Expand Up @@ -1514,6 +1516,7 @@ def value(self):
ixExclude=True,
ixEscape="html",
ixContinuation=(self.namespaceURI != XbrlConst.ixbrl),
ixResolveUris=True,
strip=True) # include HTML constructs

return self._ixValue
Expand Down
1 change: 1 addition & 0 deletions arelle/Version.py
Expand Up @@ -7,3 +7,4 @@
'''
__version__ = '1.2018.01.06' # number version of code base and date compiled
version = '2018-01-06 20:15 UTC' # string version of date compiled
copyrightLatestYear = '2018' # string version of year compiled
33 changes: 31 additions & 2 deletions arelle/XhtmlValidate.py
Expand Up @@ -6,7 +6,7 @@
(originally part of XmlValidate, moved to separate module)
'''
from arelle import XbrlConst, XmlUtil, XmlValidate, ValidateFilingText
from arelle import XbrlConst, XmlUtil, XmlValidate, ValidateFilingText, UrlUtil
from arelle.ModelValue import qname
from arelle.ModelObject import ModelObject
from arelle.PythonUtil import normalizeSpace
Expand Down Expand Up @@ -106,6 +106,18 @@
"datetime": "dateTime",
"hfreflang": "language"
}
# Maps an HTML element's local name to the set of its attribute names whose
# values are URIs.  Attributes listed here are the ones handed to
# resolveHtmlUri so that relative references can be corrected and spaces
# canonicalized to %20.
htmlEltUriAttrs = { # attributes with URI content (for relative correction and %20 canonicalization)
"a": {"href"},
"area": {"href"},
"blockquote": {"cite"},
"del": {"cite"},
"form": {"action"},
"input": {"src", "usemap"},
"ins": {"cite"},
"img": {"src", "longdesc", "usemap"},
# "archive" holds a list of URIs rather than a single URI; resolveHtmlUri
# special-cases that attribute name.
"object": {"classid", "codebase", "data", "archive", "usemap"},
"q": {"cite"},
}
ixAttrRequired = {
XbrlConst.ixbrl: {
"footnote": ("footnoteID",),
Expand Down Expand Up @@ -345,7 +357,7 @@ def checkHierarchyConstraints(elt):
if relations is None: relations = []
else: relations = [relations]
if rel == "child-or-text":
relations += XmlUtil.innerTextNodes(elt, ixExclude=True, ixEscape=False, ixContinuation=False)
relations += XmlUtil.innerTextNodes(elt, ixExclude=True, ixEscape=False, ixContinuation=False, ixResolveUris=False)
issue = ''
if reqt in ('^',):
if not any(r.localName in names and r.namespaceURI == elt.namespaceURI
Expand Down Expand Up @@ -543,3 +555,20 @@ def copyNonIxChildren(fromElt, toElt, excludeSubtree=False):
_("%(element)s error %(error)s"),
modelObject=elt, element=elt.localName.title(), error=dtd.error_log.filter_from_errors())

def resolveHtmlUri(elt, name, value):
    """Resolve and canonicalize the value of a URI-valued HTML attribute.

    elt   -- element owning the attribute; its ``modelDocument.htmlBase`` is
             the prefix used for relative references.
    name  -- attribute name; ``"archive"`` is treated as a whitespace-separated
             list of URIs, each of which is resolved on its own (recursive
             calls pass ``None`` as the name).
    value -- raw attribute value.

    Returns the resolved string.  A value that ``UrlUtil.isAbsolute`` does not
    report as absolute, and that does not start with ``/``, is prefixed with
    the document's ``htmlBase`` when that is not ``None``.  The part after the
    last ``://`` is then canonicalized: ``.`` segments are removed, ``..``
    removes the preceding segment, empty segments other than a leading one are
    dropped (so doubled and trailing slashes disappear), and spaces become
    ``%20``.
    """
    if name == "archive":  # URILIST
        # split() with no argument ignores repeated, leading and trailing
        # whitespace; split(" ") produced empty items there, each of which was
        # then resolved to a bogus URI consisting of the base alone.
        return " ".join(resolveHtmlUri(elt, None, v) for v in value.split())
    if not UrlUtil.isAbsolute(value) and not value.startswith("/"):
        htmlBase = elt.modelDocument.htmlBase
        if htmlBase is not None:
            value = htmlBase + value
    # canonicalize ../ and ./
    authority, sep, path = value.rpartition("://")
    outpaths = []
    for segment in path.split("/"):
        if segment == "..":
            # never pop the first segment (the host of a URL with a scheme, or
            # the empty segment that marks a rooted path)
            if len(outpaths) > 1:
                outpaths.pop()
        elif segment != "." and (segment != "" or len(outpaths) == 0):
            outpaths.append(segment.replace(" ", "%20"))
    return authority + sep + "/".join(outpaths)
30 changes: 20 additions & 10 deletions arelle/XmlUtil.py
Expand Up @@ -13,6 +13,7 @@
from arelle.XbrlConst import ixbrlAll, qnLinkFootnote, xhtml, xml, xsd, xhtml
from arelle.ModelObject import ModelObject, ModelComment
from arelle.ModelValue import qname, QName
htmlEltUriAttrs = resolveHtmlUri = None

datetimePattern = re.compile(r"\s*([0-9]{4})-([0-9]{2})-([0-9]{2})T([0-9]{2}):([0-9]{2}):([0-9]{2})\s*|"
r"\s*([0-9]{4})-([0-9]{2})-([0-9]{2})\s*")
Expand Down Expand Up @@ -150,47 +151,54 @@ def textNotStripped(element):
return element.textValue # allows embedded comment nodes, returns '' if None

# ixEscape can be None, "html" (xhtml namespace becomes default), "xhtml", or "xml"
def innerText(element, ixExclude=False, ixEscape=None, ixContinuation=False, strip=True):
def innerText(element, ixExclude=False, ixEscape=None, ixContinuation=False, ixResolveUris=False, strip=True):
try:
text = "".join(text for text in innerTextNodes(element, ixExclude, ixEscape, ixContinuation))
text = "".join(text for text in innerTextNodes(element, ixExclude, ixEscape, ixContinuation, ixResolveUris))
if strip:
return text.strip()
return text
except (AttributeError, TypeError):
return ""

def innerTextList(element, ixExclude=False, ixEscape=None, ixContinuation=False):
def innerTextList(element, ixExclude=False, ixEscape=None, ixContinuation=False, ixResolveUris=False):
try:
return ", ".join(text.strip() for text in innerTextNodes(element, ixExclude, ixEscape, ixContinuation) if len(text.strip()) > 0)
return ", ".join(text.strip() for text in innerTextNodes(element, ixExclude, ixEscape, ixContinuation, ixResolveUris) if len(text.strip()) > 0)
except (AttributeError, TypeError):
return ""

def innerTextNodes(element, ixExclude, ixEscape, ixContinuation):
def innerTextNodes(element, ixExclude, ixEscape, ixContinuation, ixResolveUris):
global htmlEltUriAttrs, resolveHtmlUri
if htmlEltUriAttrs is None:
from arelle.XhtmlValidate import htmlEltUriAttrs, resolveHtmlUri
if element.text:
yield escapedText(element.text) if ixEscape else element.text
for child in element.iterchildren():
if isinstance(child,ModelObject) and (
not ixExclude or
not ((child.localName == "exclude" or ixExclude == "tuple") and child.namespaceURI in ixbrlAll)):
firstChild = True
for nestedText in innerTextNodes(child, ixExclude, ixEscape, False): # nested elements don't participate in continuation chain
for nestedText in innerTextNodes(child, ixExclude, ixEscape, False, ixResolveUris): # nested elements don't participate in continuation chain
if firstChild and ixEscape:
yield escapedNode(child, True, False, ixEscape)
yield escapedNode(child, True, False, ixEscape, ixResolveUris)
firstChild = False
yield nestedText
if ixEscape:
yield escapedNode(child, False, firstChild, ixEscape)
yield escapedNode(child, False, firstChild, ixEscape, ixResolveUris)
if child.tail:
yield escapedText(child.tail) if ixEscape else child.tail
if ixContinuation:
contAt = getattr(element, "_continuationElement", None)
if contAt is not None:
for contText in innerTextNodes(contAt, ixExclude, ixEscape, ixContinuation):
for contText in innerTextNodes(contAt, ixExclude, ixEscape, ixContinuation, ixResolveUris):
yield contText

def escapedNode(elt, start, empty, ixEscape):
def escapedNode(elt, start, empty, ixEscape, ixResolveUris):
if elt.namespaceURI in ixbrlAll:
return '' # do not yield XML for nested facts
if ixResolveUris:
uriAttrs = htmlEltUriAttrs.get(elt.qname.localName, ())
else:
uriAttrs = ()
s = ['<']
if not start and not empty:
s.append('/')
Expand All @@ -200,6 +208,8 @@ def escapedNode(elt, start, empty, ixEscape):
s.append(str(elt.qname))
if start or empty:
for n,v in sorted(elt.items(), key=lambda item: item[0]):
if n in uriAttrs:
v = resolveHtmlUri(elt, n, v)
s.append(' {0}="{1}"'.format(qname(elt,n),
v.replace("&","&amp;").replace('"','&quot;')))
if not start and empty:
Expand Down

0 comments on commit 4b7e74d

Please sign in to comment.