Skip to content

Commit

Permalink
MID-7959: Use Java XML Transformer in order to handle Unicode correctly
Browse files Browse the repository at this point in the history
Xalan 2.7.2 is not actively maintained and does not handle surrogate
characters well, which results in unparseable XML.

Switched back to original JVM XML transformer, which behaves correctly.
Added logic to optionally add xml:space=preserve to embedded XSD schemas
  • Loading branch information
tonydamage committed Aug 25, 2022
1 parent 7cdfa63 commit a9e9be1
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ public Element getSchemaElement() {

/**
 * Stores the given schema element and passes it to
 * {@link DOMUtil#preserveFormattingIfPresent(Element)}.
 *
 * @param schemaElement DOM element to store as the schema element
 */
public void setSchemaElement(Element schemaElement) {
this.schemaElement = schemaElement;
// The field is assigned first, so it is already set if the helper below throws.
DOMUtil.preserveFormattingIfPresent(schemaElement);
}

@Override
Expand Down
4 changes: 0 additions & 4 deletions infra/util/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,6 @@
<groupId>xerces</groupId>
<artifactId>xercesImpl</artifactId>
</dependency>
<dependency>
<groupId>xalan</groupId>
<artifactId>xalan</artifactId>
</dependency>
<dependency>
<groupId>org.reflections</groupId>
<artifactId>reflections</artifactId>
Expand Down
31 changes: 24 additions & 7 deletions infra/util/src/main/java/com/evolveum/midpoint/util/DOMUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -186,10 +186,10 @@ public class DOMUtil {
}

/**
 * Requests the JDK built-in XSLTC transformer factory (if that class is present)
 * and returns a new {@link TransformerFactory} instance.
 *
 * <p>MID-7959: the external Xalan implementation does not handle surrogate characters
 * well, so it is no longer requested here.
 *
 * @return a newly created transformer factory
 */
public static TransformerFactory setupTransformerFactory() {
    // MID-7959: Use the JDK-native transformer; Xalan has problems with surrogates.
    // The whitespace issue for schema elements is solved by adding the xml:space=preserve
    // attribute; xnodes do not have a problem with the additional spaces.
    setTransformerFactoryIfPresent("com.sun.org.apache.xalan.internal.xsltc.trax.TransformerFactoryImpl");
    return TransformerFactory.newInstance();
}

Expand Down Expand Up @@ -473,6 +473,21 @@ public static QName resolveQName(Node domNode, String qnameStringRepresentation)
return qname;
}

/**
 * Adds {@code xml:space="preserve"} to the given element when at least one of its
 * direct children is a text node whose content is blank (whitespace-only or empty).
 *
 * <p>Only direct children are inspected. If no such text node exists, the element
 * is left unchanged.
 *
 * @param xsdSchema element to inspect and possibly mark; {@code null} is ignored
 */
public static void preserveFormattingIfPresent(Element xsdSchema) {
    // Callers such as setters may legitimately pass null to clear the schema element.
    if (xsdSchema == null) {
        return;
    }
    for (Node child = xsdSchema.getFirstChild(); child != null; child = child.getNextSibling()) {
        if (child.getNodeType() == Node.TEXT_NODE && StringUtils.isBlank(child.getTextContent())) {
            // One blank text node is enough evidence of formatting; mark and stop.
            xsdSchema.setAttributeNS(DOMUtil.W3C_XML_XML_URI, "xml:space", "preserve");
            return;
        }
    }
}


public static String findNamespace(Node domNode, String prefix) {
String ns = null;
Expand Down Expand Up @@ -1370,11 +1385,13 @@ public static void checkValidXmlChars(String stringValue) {
return;
}
int codepointCount = stringValue.codePointCount(0, stringValue.length());

for (int i = 0; i < codepointCount; i++) {
if (!XMLChar.isValid(stringValue.codePointAt(i))) {
int i = 0;
while (i < codepointCount) {
int codePoint = stringValue.codePointAt(i);
if (!XMLChar.isValid(codePoint)) {
throw new IllegalStateException("Invalid character with regards to XML (code " + ((int) stringValue.charAt(i)) + ") in '" + makeSafelyPrintable(stringValue, 200) + "'");
}
i += Character.charCount(codePoint);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public class DOMUtilTest {
private static final String WHITESPACES_FILENAME = "src/test/resources/domutil/whitespaces.xml";
private static final String QNAMES_FILENAME = "src/test/resources/domutil/qnames.xml";
private static final String FIX_NAMESPACE_FILENAME = "src/test/resources/domutil/fix-namespace.xml";
private static final String SURROGATES_FILENAME = "src/test/resources/domutil/surrogates.xml";

public static final String NS_W3C_XML_SCHEMA_PREFIX = "xsd";
public static final QName XSD_SCHEMA_ELEMENT = new QName(W3C_XML_SCHEMA_NS_URI, "schema",
Expand Down Expand Up @@ -349,4 +350,18 @@ public void testSupportStringWithSurrogates() {
assertTrue("Not support string with surrogates in xml",support);
}

/**
 * Verifies that text containing surrogate pairs survives a
 * parse → serialize → parse round trip (MID-7959).
 */
@Test
public void testSerializationDeserializationWithSurrogates() {
    String surrogateText = "𠀋 are composed of surrogate pairs. Emoji like 😁";
    DOMUtil.checkValidXmlChars(surrogateText);

    Document original = DOMUtil.parseFile(SURROGATES_FILENAME);
    String kanjiText = DOMUtil.findElementRecursive(original.getDocumentElement(), new QName("kanji")).getTextContent();
    System.out.println("kanji: " + kanjiText);
    assertTrue("Kanji text was not parsed correctly from the source file", "𠀋".equals(kanjiText));

    String asString = DOMUtil.serializeDOMToString(original);
    System.out.println(asString);

    // Re-parsing must succeed and must yield the same supplementary character.
    Document fromString = DOMUtil.parseDocument(asString);
    String kanjiAfterRoundTrip = DOMUtil.findElementRecursive(fromString.getDocumentElement(), new QName("kanji")).getTextContent();
    assertTrue("Surrogate pair was not preserved by serialization round trip", kanjiText.equals(kanjiAfterRoundTrip));

    String asString2 = DOMUtil.serializeDOMToString(fromString);
    System.out.println(asString2);
}

}
15 changes: 15 additions & 0 deletions infra/util/src/test/resources/domutil/surrogates.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright (c) 2010-2019 Evolveum and contributors
~
~ This work is dual-licensed under the Apache License 2.0
~ and European Union Public License. See LICENSE file for details.
-->
<root>
<no>Normal Text</no>
<kanji>𠀋</kanji>
<emoji>😁</emoji>
<preserved xml:space="preserve">
Here should spaces be preserved.
</preserved>
</root>

0 comments on commit a9e9be1

Please sign in to comment.