Skip to content

Commit

Permalink
Merge pull request #862 from stevenbronson-wk/jpeg-mime
Browse files Browse the repository at this point in the history
  • Loading branch information
austinmatherne-wk committed Sep 14, 2023
2 parents 49dfd56 + 0d4e35e commit cb7272c
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 64 deletions.
5 changes: 5 additions & 0 deletions arelle/UrlUtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from urllib.parse import urldefrag, unquote, quote, urljoin
from email.utils import parsedate
from datetime import datetime
from typing import overload

def authority(url: str, includeScheme: bool=True) -> str:
if url:
Expand Down Expand Up @@ -388,6 +389,10 @@ def relativeUri(baseUri: str, relativeUri: str) -> str: # return uri relative to
return os.path.relpath(relativeUri, os.path.dirname(baseUri)).replace('\\','/')


@overload
def decodeBase64DataImage(imageData: None) -> None: ...
@overload
def decodeBase64DataImage(imageData: str) -> bytes: ...
def decodeBase64DataImage(imageData: str | None) -> bytes | None:
if imageData is None:
return None
Expand Down
39 changes: 30 additions & 9 deletions arelle/ValidateFilingText.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
'''
See COPYRIGHT.md for copyright information.
'''
from __future__ import annotations
#import xml.sax, xml.sax.handler
from lxml.etree import XML, DTD, SubElement, _ElementTree, _Comment, _ProcessingInstruction, XMLSyntaxError, XMLParser
from dataclasses import dataclass
import os, io, base64
import regex as re
from arelle.XbrlConst import ixbrlAll, xhtml
Expand All @@ -26,7 +28,9 @@

inlinePattern = re.compile(r"xmlns:[\w.-]+=['\"]http://www.xbrl.org/2013/inlineXBRL['\"]")
inlineSelfClosedElementPattern = re.compile(r"<(([\w.-]+:)?(\w+))([^\w/][^<]*)?/>")
imgDataMediaBase64Pattern = re.compile(r"data:image([^,;]*)(;base64)?,(.*)$", re.S)
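# The "/<subtype>" part after "data:image" is optional in this pattern because
# the ESEF 2022 conformance suite test G2-5-1_2 TC3_invalid uses a data URL
# that has no MIME subtype, e.g.: <img src="data:image;base64,iVBOR..."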
imgDataMediaBase64Pattern = re.compile(r"data:image(?:/(?P<mimeSubtype>[^,;]*))?(?P<base64>;base64)?,(?P<data>.*)$", re.S)

edbodyDTD = None
isInlineDTD = None
Expand Down Expand Up @@ -623,11 +627,11 @@ def validateTextBlockFacts(modelXbrl):
if attrTag == "src" and allowedImageTypes and attrValue not in checkedGraphicsFiles:
if scheme(attrValue) == "data":
try: # allow embedded newlines
m = imgDataMediaBase64Pattern.match(attrValue)
dataURLParts = parseImageDataURL(attrValue)
if (not allowedImageTypes["data-scheme"] or
not m or not m.group(1) or not m.group(2)
or m.group(1)[1:] not in allowedImageTypes["mime-types"]
or m.group(1)[1:] != validateGraphicHeaderType(decodeBase64DataImage(m.group(3)))):
not dataURLParts or not dataURLParts.mimeSubtype or not dataURLParts.isBase64
or dataURLParts.mimeSubtype not in allowedImageTypes["mime-types"]
or not dataURLParts.base64GraphicHeaderTypeMatchesMimeSubtype()):
modelXbrl.error(("EFM.6.05.16.graphicDataUrl", "FERC.6.05.16.graphicDataUrl"),
_("Fact %(fact)s of context %(contextID)s references a graphics data URL which isn't accepted or valid '%(attribute)s' for <%(element)s>"),
modelObject=f1, fact=f1.qname, contextID=f1.contextID,
Expand Down Expand Up @@ -765,11 +769,11 @@ def validateHtmlContent(modelXbrl, referenceElt, htmlEltTree, validatedObjectLab
if attrTag == "src" and allowedImageTypes and attrValue not in checkedGraphicsFiles:
if scheme(attrValue) == "data":
try: # allow embedded newlines
m = imgDataMediaBase64Pattern.match(attrValue)
dataURLParts = parseImageDataURL(attrValue)
if (not allowedImageTypes["data-scheme"] or
not m or not m.group(1) or not m.group(2)
or m.group(1)[1:] not in allowedImageTypes["mime-types"]
or m.group(1)[1:] != validateGraphicHeaderType(decodeBase64DataImage(m.group(3)))):
not dataURLParts or not dataURLParts.mimeSubtype or not dataURLParts.isBase64
or dataURLParts.mimeSubtype not in allowedImageTypes["mime-types"]
or not dataURLParts.base64GraphicHeaderTypeMatchesMimeSubtype()):
modelXbrl.error(messageCodePrefix + "graphicDataUrl",
_("%(validatedObjectLabel)s references a graphics data URL which isn't accepted '%(attribute)s' for <%(element)s>"),
modelObject=elt, validatedObjectLabel=validatedObjectLabel,
Expand Down Expand Up @@ -924,6 +928,23 @@ def warning(self, err):
error=err.getMessage(), line=err.getLineNumber(), column=err.getColumnNumber())
'''

@dataclass
class ImageDataURLParts:
    """Components of an image ``data:`` URL, as populated by parseImageDataURL."""

    # MIME subtype taken from the URL (e.g. "png"); None when the URL has none.
    mimeSubtype: str | None
    # True when the URL carries the ";base64" marker.
    isBase64: bool
    # Everything after the first comma of the URL (the still-encoded payload).
    data: str

    def base64GraphicHeaderTypeMatchesMimeSubtype(self) -> bool:
        """Report whether the decoded payload's header agrees with ``mimeSubtype``.

        The payload is decoded with decodeBase64DataImage and sniffed with
        validateGraphicHeaderType; any exception raised by those calls
        propagates to the caller.
        """
        detectedType = validateGraphicHeaderType(decodeBase64DataImage(self.data))
        if detectedType == self.mimeSubtype:
            return True
        # The header sniffer reports JPEG data as "jpg", while the MIME
        # subtype in the URL may be spelled "jpeg"; treat that pair as a match.
        return detectedType == 'jpg' and self.mimeSubtype == 'jpeg'

def parseImageDataURL(uri: str) -> ImageDataURLParts | None:
    """Split an image ``data:`` URL into its parts.

    Returns an ImageDataURLParts built from the named groups of
    imgDataMediaBase64Pattern, or None when ``uri`` does not match it.
    """
    match = imgDataMediaBase64Pattern.match(uri)
    if not match:
        return None
    subtype = match.group('mimeSubtype')
    # The "base64" group is None when the ";base64" marker is absent.
    hasBase64Marker = bool(match.group('base64'))
    payload = match.group('data')
    return ImageDataURLParts(
        mimeSubtype=subtype,
        isBase64=hasBase64Marker,
        data=payload,
    )

def validateGraphicHeaderType(data: bytes) -> str:
if data[:2] == b"\xff\xd8":
return "jpg"
Expand Down
20 changes: 2 additions & 18 deletions arelle/plugin/validate/ESEF/Util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from arelle.UrlUtil import scheme
from arelle.ModelManager import ModelManager
from arelle.ModelXbrl import ModelXbrl
from arelle.ValidateFilingText import validateGraphicHeaderType
from arelle.ValidateXbrl import ValidateXbrl
from typing import Any, Dict, List, Union, cast
from arelle.ModelDocument import ModelDocument
Expand Down Expand Up @@ -67,24 +68,7 @@ def checkImageContents(modelXbrl: ModelXbrl, imgElt: ModelObject, imgType: str,
_("Image SVG has XML error %(error)s"),
modelObject=imgElt, error=err)
else:
if data[:3] == b"GIF" and data[3:6] in (b'89a', b'89b', b'87a'):
headerType = "gif"
elif data[:2] == b"\xff\xd8":
headerType = "jpg"
elif data[:8] == b"\x89PNG\r\n\x1a\n":
headerType = "png"
elif data[:2] in (b"MM", b"II"):
headerType = "tiff"
elif data[:2] in (b"BM", b"BA"):
headerType = "bmp"
elif data[:4] == b"\x00\x00\x01\x00":
headerType = "ico"
elif data[:4] == b"\x00\x00\x02\x00":
headerType = "cur"
elif len(data) == 0:
headerType = "none"
else:
headerType = "unrecognized"
headerType = validateGraphicHeaderType(data)
if (("gif" not in imgType and headerType == "gif") or
("jpeg" not in imgType and "jpg" not in imgType and headerType == "jpg") or
("png" not in imgType and headerType == "png")):
Expand Down
16 changes: 8 additions & 8 deletions arelle/plugin/validate/ESEF/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
from arelle.DisclosureSystem import DisclosureSystem
from arelle.ModelDtsObject import ModelConcept
from arelle.ModelXbrl import ModelXbrl
from arelle.ValidateFilingText import parseImageDataURL
from arelle.ValidateXbrl import ValidateXbrl
from arelle.formula.XPathContext import XPathContext
from arelle.ModelRelationshipSet import ModelRelationshipSet
Expand All @@ -88,7 +89,6 @@
styleCssHiddenPattern = re.compile(r"(.*[^\w]|^)display\s*:\s*none([^\w].*|$)")
ifrsNsPattern = re.compile(r"http://xbrl.ifrs.org/taxonomy/[0-9-]{10}/ifrs-full")
datetimePattern = lexicalPatterns["XBRLI_DATEUNION"]
imgDataMediaBase64Pattern = re.compile(r"data:image([^,;]*)(;base64)?,(.*)$", re.S)
ixErrorPattern = re.compile(r"ix11[.]|xmlSchema[:]|(?!xbrl.5.2.5.2|xbrl.5.2.6.2)xbrl[.]|xbrld[ti]e[:]|utre[:]")
docTypeXhtmlPattern = re.compile(r"^<!(?:DOCTYPE\s+)\s*html(?:PUBLIC\s+)?(?:.*-//W3C//DTD\s+(X?HTML)\s)?.*>$", re.IGNORECASE)

Expand Down Expand Up @@ -457,25 +457,25 @@ def checkFootnote(elt: ModelInlineFootnote | ModelResource, text: str) -> None:
_("Image file which isn't openable '%(src)s', error: %(error)s"),
modelObject=elt, src=src, error=err)
else:
m = imgDataMediaBase64Pattern.match(src)
if not m or not m.group(2):
dataURLParts = parseImageDataURL(src)
if not dataURLParts or not dataURLParts.isBase64:
modelXbrl.warning(f"{contentOtherThanXHTMLGuidance}.embeddedImageNotUsingBase64Encoding",
_("Images included in the XHTML document SHOULD be base64 encoded: %(src)s."),
modelObject=elt, src=src[:128])
if m and m.group(1) and m.group(3):
checkImageContents(modelXbrl, elt, m.group(1), False, m.group(3), val.consolidated)
if dataURLParts and dataURLParts.mimeSubtype and dataURLParts.data:
checkImageContents(modelXbrl, elt, dataURLParts.mimeSubtype, False, dataURLParts.data, val.consolidated)
else:
if not m.group(1):
if not dataURLParts.mimeSubtype:
modelXbrl.error(f"{contentOtherThanXHTMLGuidance}.MIMETypeNotSpecified",
_("Images included in the XHTML document MUST be saved with MIME type specifying PNG, GIF, SVG or JPG/JPEG formats: %(src)s."),
modelObject=elt, src=src[:128])
elif m.group(1) not in ("/gif", "/jpeg", "/jpg", "/png", "/svg+xml"):
elif dataURLParts.mimeSubtype not in ("gif", "jpeg", "jpg", "png", "svg+xml"):
modelXbrl.error(f"{contentOtherThanXHTMLGuidance}.imageFormatNotSupported",
_("Images included in the XHTML document MUST be saved in PNG, GIF, SVG or JPG/JPEG formats: %(src)s."),
modelObject=elt, src=src[:128])
# check for malicious image contents
try: # allow embedded newlines
checkImageContents(modelXbrl, elt, m.group(1), False, decodeBase64DataImage(m.group(3)), val.consolidated)
checkImageContents(modelXbrl, elt, dataURLParts.mimeSubtype, False, decodeBase64DataImage(dataURLParts.data), val.consolidated)
imgContents = None # deref, may be very large
except binascii.Error as err:
modelXbrl.error(f"{contentOtherThanXHTMLGuidance}.embeddedImageNotUsingBase64Encoding",
Expand Down
38 changes: 10 additions & 28 deletions arelle/plugin/validate/ESEF_2022/Util.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,14 @@
from arelle.UrlUtil import scheme, decodeBase64DataImage
from arelle.ModelManager import ModelManager
from arelle.ModelXbrl import ModelXbrl
from arelle.ValidateFilingText import parseImageDataURL, validateGraphicHeaderType
from arelle.ValidateXbrl import ValidateXbrl
from typing import Any, Dict, List, Optional, Union, cast
from arelle.ModelDocument import ModelDocument
from arelle.typing import TypeGetText
from collections import defaultdict

_: TypeGetText # Handle gettext
imgDataMediaBase64Pattern = re.compile(r"data:image([^,;]*)(;base64)?,(.*)$", re.S)

# check if a modelDocument URI is an extension URI (document URI)
# also works on a uri passed in as well as modelObject
Expand Down Expand Up @@ -77,27 +77,26 @@ def validateImage(baseUrl:Optional[str], image: str, modelXbrl: ModelXbrl, val:V
messageCodes=("ESEF.3.5.1.inlineXbrlDocumentContainsExternalReferences",
"ESEF.4.1.6.xHTMLDocumentContainsExternalReferences"))
elif image.startswith("data:image"):
m = imgDataMediaBase64Pattern.match(image)
(imgMimeType, isBase64, imgData) = m.group(1, 2, 3) if m is not None else (None, None, None)
if not m or not isBase64:
dataURLParts = parseImageDataURL(image)
if not dataURLParts or not dataURLParts.isBase64:
modelXbrl.warning(f"{contentOtherThanXHTMLGuidance}.embeddedImageNotUsingBase64Encoding",
_("Images included in the XHTML document SHOULD be base64 encoded: %(src)s."),
modelObject=elt, src=image[:128], evaluatedMsg=evaluatedMsg)
if m and imgMimeType and imgData:
checkImageContents(None, modelXbrl, elt, imgMimeType, False, unquote(imgData), val.consolidated, val)
if dataURLParts and dataURLParts.mimeSubtype and dataURLParts.data:
checkImageContents(None, modelXbrl, elt, dataURLParts.mimeSubtype, False, unquote(dataURLParts.data), val.consolidated, val)
else:
if not imgMimeType:
if not dataURLParts.mimeSubtype:
modelXbrl.error(f"{contentOtherThanXHTMLGuidance}.MIMETypeNotSpecified",
_("Images included in the XHTML document MUST be saved with MIME type specifying PNG, GIF, SVG or JPG/JPEG formats: %(src)s."),
modelObject=elt, src=image[:128], evaluatedMsg=evaluatedMsg)
elif imgMimeType not in ("/gif", "/jpeg", "/png", "/svg+xml"):
elif dataURLParts.mimeSubtype not in ("gif", "jpeg", "png", "svg+xml"):
modelXbrl.error(f"{contentOtherThanXHTMLGuidance}.imageFormatNotSupported",
_("Images included in the XHTML document MUST be saved in PNG, GIF, SVG or JPG/JPEG formats: %(src)s."),
modelObject=elt, src=image[:128], evaluatedMsg=evaluatedMsg)
# check for malicious image contents
try: # allow embedded newlines
imgContents:Union[bytes, Any, str] = decodeBase64DataImage(imgData)
checkImageContents(None, modelXbrl, elt, str(imgMimeType), False, imgContents, val.consolidated, val)
imgContents:Union[bytes, Any, str] = decodeBase64DataImage(dataURLParts.data)
checkImageContents(None, modelXbrl, elt, str(dataURLParts.mimeSubtype), False, imgContents, val.consolidated, val)
imgContents = b"" # deref, may be very large

except binascii.Error as err:
Expand Down Expand Up @@ -145,24 +144,7 @@ def checkImageContents(baseURI: Optional[str], modelXbrl: ModelXbrl, imgElt: _El
_("Image SVG has XML error %(error)s"),
modelObject=imgElt, error=err)
else:
if data[:3] == b"GIF" and data[3:6] in (b'89a', b'89b', b'87a'):
headerType = "gif"
elif data[:2] == b'\xff\xd8':
headerType = "jpg"
elif data[:8] == b"\x89PNG\r\n\x1a\n":
headerType = "png"
elif data[:2] in (b"MM", b"II"):
headerType = "tiff"
elif data[:2] in (b"BM", b"BA"):
headerType = "bmp"
elif data[:4] == b"\x00\x00\x01\x00":
headerType = "ico"
elif data[:4] == b"\x00\x00\x02\x00":
headerType = "cur"
elif len(data) == 0:
headerType = "none"
else:
headerType = "unrecognized"
headerType = validateGraphicHeaderType(data) # type: ignore[arg-type]
if (("gif" not in imgType and headerType == "gif") or
("jpeg" not in imgType and "jpg" not in imgType and headerType == "jpg") or
("png" not in imgType and headerType == "png")):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
'inline_xbrl/RTS_Annex_IV_Par_12_G2-2-4/index.xml:TC5_valid'
]),
file='esef_conformance_suite_2022/index_inline_xbrl.xml',
info_url='https://www.esma.europa.eu/document/conformance-suite-2022',
info_url='https://www.esma.europa.eu/document/esef-conformance-suite-2022',
local_filepath='esef_conformance_suite_2022.zip',
name=PurePath(__file__).stem,
plugins=frozenset({'validate/ESEF_2022'}),
Expand Down

0 comments on commit cb7272c

Please sign in to comment.