From 2b5d66217bd4ecf5e7271f1a4b2b339d7681e91c Mon Sep 17 00:00:00 2001
From: Damien Regad
Date: Wed, 26 Sep 2012 18:20:41 +0200
Subject: [PATCH] Remove invalid chars from displayed string per XML specification

Strict XHTML requires that data comply with XML 1.0 specification [1],
which only allows a subset of the Unicode character set.

Function string_html_specialchars() has been modified to remove from the
string to print, any character which is not in the defined range

Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]

Every code point in the pattern is written as \x{H...}: without braces,
PCRE's \x escape consumes at most two hex digits, so "\xD7FF" would mean
\xD7 followed by the literal letters "FF" and the character class would
wrongly strip all characters in the U+0100-U+FFFF range.

Fixes #14744

[1] http://www.w3.org/TR/xml/
---
 core/string_api.php | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/core/string_api.php b/core/string_api.php
index 1621f3b643..9fd81b504f 100644
--- a/core/string_api.php
+++ b/core/string_api.php
@@ -910,6 +910,10 @@ function string_html_entities( $p_string ) {
  * @return string
  */
 function string_html_specialchars( $p_string ) {
+	# Remove any invalid character from the string per XML 1.0 specification
+	# http://www.w3.org/TR/2008/REC-xml-20081126/#NT-Char
+	$p_string = preg_replace( '/[^\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u', '', $p_string );
+
 	# achumakov: @ added to avoid warning output in unsupported codepages
 	# e.g. 8859-2, windows-1257, Korean, which are treated as 8859-1.
 	# This is VERY important for Eastern European, Baltic and Korean languages