Add inline html base processing

Arelle · Dec 16, 2018 · 4b7e74d · 4b7e74d
1 parent 8a9f50c
commit 4b7e74d
Show file tree

Hide file tree

Showing 6 changed files with 65 additions and 14 deletions.
diff --git a/arelle/HashUtil.py b/arelle/HashUtil.py
@@ -71,7 +71,7 @@ def md5hash(argList):
             elif isinstance(_arg, ModelObject):
                 # use inner text list
                 _md5.update('\x1F'.join(text.strip() 
-                                        for text in XmlUtil.innerTextNodes(_arg, True, False, True))
+                                        for text in XmlUtil.innerTextNodes(_arg, True, False, True, False))
                             .encode('utf-8','replace'))
     if firstMd5arg:
         md5sum = MD5SUM0

diff --git a/arelle/ModelDocument.py b/arelle/ModelDocument.py
@@ -70,6 +70,7 @@ def load(modelXbrl, uri, base=None, referringElement=None, isEntry=False, isDisc
             # HMRC note, HMRC.blockedFile should be in this list if hmrc-taxonomies.xml is maintained an dup to date
             modelXbrl.error(("EFM.6.22.02", "GFM.1.1.3", "SBR.NL.2.1.0.06" if normalizedUri.startswith("http") else "SBR.NL.2.2.0.17"),
                     _("Prohibited file for filings %(blockedIndicator)s: %(url)s"),
+                    edgarCode="cp-2202-Prohibited-Href-Or-Schema-Location",
                     modelObject=referringElement, url=normalizedUri,
                     blockedIndicator=_(" blocked") if blocked else "",
                     messageCodes=("EFM.6.22.02", "GFM.1.1.3", "SBR.NL.2.1.0.06", "SBR.NL.2.2.0.17"))
@@ -905,6 +906,7 @@ def baseForElement(self, element):
                 if self.modelXbrl.modelManager.validateDisclosureSystem:
                     self.modelXbrl.error(("EFM.6.03.11", "GFM.1.1.7", "EBA.2.1", "EIOPA.2.1"),
                         _("Prohibited base attribute: %(attribute)s"),
+                        edgarCode="du-0311-Xml-Base-Used",
                         modelObject=element, attribute=baseAttr, element=element.qname)
                 else:
                     if baseAttr.startswith("/"):
@@ -1193,6 +1195,7 @@ def unitDiscover(self, unitElement):
 
     def inlineXbrlDiscover(self, htmlElement):
         ixNS = None
+        htmlBase = None
         conflictingNSelts = []
         # find namespace, only 1 namespace
         for inlineElement in htmlElement.iterdescendants():
@@ -1201,17 +1204,22 @@ def inlineXbrlDiscover(self, htmlElement):
                     ixNS = inlineElement.namespaceURI
                 elif ixNS != inlineElement.namespaceURI:
                     conflictingNSelts.append(inlineElement)
+            elif inlineElement.tag == "{http://www.w3.org/1999/xhtml}base":
+                htmlBase = inlineElement.get("href")
         if ixNS is None: # no inline element, look for xmlns namespaces on htmlElement:
             for _ns in htmlElement.nsmap.values():
                 if _ns in XbrlConst.ixbrlAll:
                     ixNS = _ns
                     break
+        if htmlBase is None:
+            htmlBase = os.path.dirname(self.uri) + "/"
         if conflictingNSelts:
             self.modelXbrl.error("ix:multipleIxNamespaces",
                     _("Multiple ix namespaces were found"),
                     modelObject=conflictingNSelts)
         self.ixNS = ixNS
         self.ixNStag = ixNStag = "{" + ixNS + "}" if ixNS else ""
+        self.htmlBase = htmlBase
         ixdsTarget = getattr(self.modelXbrl, "ixdsTarget", None)
         # load referenced schemas and linkbases (before validating inline HTML
         for inlineElement in htmlElement.iterdescendants(tag=ixNStag + "references"):

diff --git a/arelle/ModelInstanceObject.py b/arelle/ModelInstanceObject.py
@@ -604,10 +604,12 @@ def value(self):
             self.xValid = UNVALIDATED # may not be initialized otherwise
             self.xValue = None
             f = self.format
+            ixEscape = self.get("escape") in ("true","1")
             v = XmlUtil.innerText(self, 
                                   ixExclude="tuple" if self.elementQname == XbrlConst.qnIXbrl11Tuple else "html", 
-                                  ixEscape=(self.get("escape") in ("true","1")), 
+                                  ixEscape=ixEscape, 
                                   ixContinuation=(self.elementQname == XbrlConst.qnIXbrl11NonNumeric),
+                                  ixResolveUris=ixEscape,
                                   strip=(f is not None)) # transforms are whitespace-collapse, otherwise it is preserved.
             if self.isNil:
                 self._ixValue = v
@@ -1514,6 +1516,7 @@ def value(self):
                                   ixExclude=True, 
                                   ixEscape="html", 
                                   ixContinuation=(self.namespaceURI != XbrlConst.ixbrl),
+                                  ixResolveUris=True,
                                   strip=True) # include HTML constructs
 
             return self._ixValue

diff --git a/arelle/Version.py b/arelle/Version.py
@@ -7,3 +7,4 @@
 '''
 __version__ = '1.2018.01.06'  # number version of code base and date compiled
 version = '2018-01-06 20:15 UTC'  # string version of date compiled
+copyrightLatestYear = '2018'  # string version of year compiled
diff --git a/arelle/XhtmlValidate.py b/arelle/XhtmlValidate.py
@@ -6,7 +6,7 @@
 
 (originally part of XmlValidate, moved to separate module)
 '''
-from arelle import XbrlConst, XmlUtil, XmlValidate, ValidateFilingText
+from arelle import XbrlConst, XmlUtil, XmlValidate, ValidateFilingText, UrlUtil
 from arelle.ModelValue import qname
 from arelle.ModelObject import ModelObject
 from arelle.PythonUtil import normalizeSpace
@@ -106,6 +106,18 @@
     "datetime": "dateTime",
     "hfreflang": "language"
     }
+htmlEltUriAttrs = { # attributes with URI content (for relative correction and %20 canonicalization)
+    "a": {"href"},
+    "area": {"href"},
+    "blockquote": {"cite"},
+    "del": {"cite"},
+    "form": {"action"},
+    "input": {"src", "usemap"},
+    "ins": {"cite"},
+    "img": {"src", "longdesc", "usemap"},
+    "object": {"classid", "codebase", "data", "archive", "usemap"},
+    "q": {"cite"},
+    }
 ixAttrRequired = {
     XbrlConst.ixbrl: {
         "footnote": ("footnoteID",),
@@ -345,7 +357,7 @@ def checkHierarchyConstraints(elt):
                     if relations is None: relations = []
                     else: relations = [relations]
                 if rel == "child-or-text":
-                    relations += XmlUtil.innerTextNodes(elt, ixExclude=True, ixEscape=False, ixContinuation=False)
+                    relations += XmlUtil.innerTextNodes(elt, ixExclude=True, ixEscape=False, ixContinuation=False, ixResolveUris=False)
                 issue = ''
                 if reqt in ('^',):
                     if not any(r.localName in names and r.namespaceURI == elt.namespaceURI
@@ -543,3 +555,20 @@ def copyNonIxChildren(fromElt, toElt, excludeSubtree=False):
             _("%(element)s error %(error)s"),
             modelObject=elt, element=elt.localName.title(), error=dtd.error_log.filter_from_errors())
 
+def resolveHtmlUri(elt, name, value):
+    if name == "archive": # URILIST
+        return " ".join(resolveHtmlUri(elt, None, v) for v in value.split(" "))
+    if not UrlUtil.isAbsolute(value) and not value.startswith("/"):
+        if elt.modelDocument.htmlBase is not None:
+            value = elt.modelDocument.htmlBase + value
+    # canonicalize ../ and ./
+    authority, sep, path = value.rpartition("://")
+    inpaths = path.split("/")
+    outpaths = []
+    for path in inpaths:
+        if path == "..":
+            if len(outpaths) > 1:
+                outpaths.pop()
+        elif path != "." and (path != "" or len(outpaths) == 0):
+            outpaths.append(path.replace(" ", "%20"))
+    return authority + sep + "/".join(outpaths)
diff --git a/arelle/XmlUtil.py b/arelle/XmlUtil.py
@@ -13,6 +13,7 @@
 from arelle.XbrlConst import ixbrlAll, qnLinkFootnote, xhtml, xml, xsd, xhtml
 from arelle.ModelObject import ModelObject, ModelComment
 from arelle.ModelValue import qname, QName
+htmlEltUriAttrs = resolveHtmlUri = None
 
 datetimePattern = re.compile(r"\s*([0-9]{4})-([0-9]{2})-([0-9]{2})T([0-9]{2}):([0-9]{2}):([0-9]{2})\s*|"
                              r"\s*([0-9]{4})-([0-9]{2})-([0-9]{2})\s*")
@@ -150,47 +151,54 @@ def textNotStripped(element):
     return element.textValue  # allows embedded comment nodes, returns '' if None
 
 # ixEscape can be None, "html" (xhtml namespace becomes default), "xhtml", or "xml"
-def innerText(element, ixExclude=False, ixEscape=None, ixContinuation=False, strip=True):   
+def innerText(element, ixExclude=False, ixEscape=None, ixContinuation=False, ixResolveUris=False, strip=True):   
     try:
-        text = "".join(text for text in innerTextNodes(element, ixExclude, ixEscape, ixContinuation))
+        text = "".join(text for text in innerTextNodes(element, ixExclude, ixEscape, ixContinuation, ixResolveUris))
         if strip:
             return text.strip()
         return text
     except (AttributeError, TypeError):
         return ""
 
-def innerTextList(element, ixExclude=False, ixEscape=None, ixContinuation=False):   
+def innerTextList(element, ixExclude=False, ixEscape=None, ixContinuation=False, ixResolveUris=False):   
     try:
-        return ", ".join(text.strip() for text in innerTextNodes(element, ixExclude, ixEscape, ixContinuation) if len(text.strip()) > 0)
+        return ", ".join(text.strip() for text in innerTextNodes(element, ixExclude, ixEscape, ixContinuation, ixResolveUris) if len(text.strip()) > 0)
     except (AttributeError, TypeError):
         return ""
 
-def innerTextNodes(element, ixExclude, ixEscape, ixContinuation):
+def innerTextNodes(element, ixExclude, ixEscape, ixContinuation, ixResolveUris):
+    global htmlEltUriAttrs, resolveHtmlUri
+    if htmlEltUriAttrs is None:
+        from arelle.XhtmlValidate import htmlEltUriAttrs, resolveHtmlUri
     if element.text:
         yield escapedText(element.text) if ixEscape else element.text
     for child in element.iterchildren():
         if isinstance(child,ModelObject) and (
            not ixExclude or 
            not ((child.localName == "exclude" or ixExclude == "tuple") and child.namespaceURI in ixbrlAll)):
             firstChild = True
-            for nestedText in innerTextNodes(child, ixExclude, ixEscape, False): # nested elements don't participate in continuation chain
+            for nestedText in innerTextNodes(child, ixExclude, ixEscape, False, ixResolveUris): # nested elements don't participate in continuation chain
                 if firstChild and ixEscape:
-                    yield escapedNode(child, True, False, ixEscape)
+                    yield escapedNode(child, True, False, ixEscape, ixResolveUris)
                     firstChild = False
                 yield nestedText
             if ixEscape:
-                yield escapedNode(child, False, firstChild, ixEscape)
+                yield escapedNode(child, False, firstChild, ixEscape, ixResolveUris)
         if child.tail:
             yield escapedText(child.tail) if ixEscape else child.tail
     if ixContinuation:
         contAt = getattr(element, "_continuationElement", None)
         if contAt is not None:
-            for contText in innerTextNodes(contAt, ixExclude, ixEscape, ixContinuation):
+            for contText in innerTextNodes(contAt, ixExclude, ixEscape, ixContinuation, ixResolveUris):
                 yield contText
 
-def escapedNode(elt, start, empty, ixEscape):
+def escapedNode(elt, start, empty, ixEscape, ixResolveUris):
     if elt.namespaceURI in ixbrlAll:
         return ''  # do not yield XML for nested facts
+    if ixResolveUris:
+        uriAttrs = htmlEltUriAttrs.get(elt.qname.localName, ())
+    else:
+        uriAttrs = ()
     s = ['<']
     if not start and not empty:
         s.append('/')
@@ -200,6 +208,8 @@ def escapedNode(elt, start, empty, ixEscape):
         s.append(str(elt.qname))
     if start or empty:
         for n,v in sorted(elt.items(), key=lambda item: item[0]):
+            if n in uriAttrs:
+                v = resolveHtmlUri(elt, n, v)
             s.append(' {0}="{1}"'.format(qname(elt,n),
                                          v.replace("&","&amp;").replace('"','&quot;')))
     if not start and empty: