Skip to content

Commit

Permalink
Add support to optionally allow surrogate pair entities (#165) (#174)
Browse files Browse the repository at this point in the history
  • Loading branch information
Magmaruss committed Jan 16, 2024
1 parent 37232fc commit 172371f
Show file tree
Hide file tree
Showing 7 changed files with 262 additions and 62 deletions.
5 changes: 5 additions & 0 deletions release-notes/CREDITS
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,8 @@ Tim Martin (@Orbisman)

* Contributed fix for #67: Wrong line for XML event location in elements following DTD
(6.6.0)

Kamil Gołębiewski (@Magmaruss)

* Contributed #165: Add support to optionally allow surrogate pair entities
(6.6.0)
2 changes: 2 additions & 0 deletions release-notes/VERSION
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ Project: woodstox
#67: Wrong line for XML event location in elements following DTD
(reported by @m-g-sonar)
(fix contributed by Tim M)
#165: Add support to optionally allow surrogate pair entities
(contributed by Kamil G)
#176: Fix parser when not replacing entities and treating char references
as entities
(contributed by Guillaume N)
Expand Down
29 changes: 29 additions & 0 deletions src/main/java/com/ctc/wstx/api/ReaderConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,11 @@ public final class ReaderConfig

final static int PROP_MAX_DTD_DEPTH = 69;

/**
* @since 6.6
*/
final static int PROP_ALLOW_SURROGATE_PAIR_ENTITIES = 70;

/*
////////////////////////////////////////////////
// Limits for numeric properties
Expand Down Expand Up @@ -361,6 +366,8 @@ public final class ReaderConfig
PROP_UNDECLARED_ENTITY_RESOLVER);
sProperties.put(WstxInputProperties.P_BASE_URL,
PROP_BASE_URL);
sProperties.put(WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES,
PROP_ALLOW_SURROGATE_PAIR_ENTITIES);
sProperties.put(WstxInputProperties.P_INPUT_PARSING_MODE,
PROP_INPUT_PARSING_MODE);
}
Expand Down Expand Up @@ -419,6 +426,13 @@ public final class ReaderConfig
*/
protected URL mBaseURL;

/**
* Whether to allow a surrogate pair, written as two consecutive character entities
* (high surrogate followed by low surrogate), to be resolved into one target character.
*
* @since 6.6
*/
protected boolean mAllowSurrogatePairEntities = false;

/**
* Parsing mode can be changed from the default xml compliant
* behavior to one of alternate modes (fragment processing,
Expand Down Expand Up @@ -583,6 +597,7 @@ public ReaderConfig createNonShared(SymbolTable sym)
rc.mMaxEntityDepth = mMaxEntityDepth;
rc.mMaxEntityCount = mMaxEntityCount;
rc.mMaxDtdDepth = mMaxDtdDepth;
rc.mAllowSurrogatePairEntities = mAllowSurrogatePairEntities;
if (mSpecialProperties != null) {
int len = mSpecialProperties.length;
Object[] specProps = new Object[len];
Expand Down Expand Up @@ -792,6 +807,10 @@ public XMLResolver getUndeclaredEntityResolver() {

/**
 * Accessor for the base URL setting of this configuration.
 *
 * @return current value of {@code mBaseURL}
 */
public URL getBaseURL() {
    return this.mBaseURL;
}

/**
 * Tells whether surrogate pair entities are allowed by this configuration.
 *
 * @return current value of {@code mAllowSurrogatePairEntities}
 *
 * @since 6.6
 */
public boolean allowsSurrogatePairEntities()
{
    return this.mAllowSurrogatePairEntities;
}

/**
 * Accessor for the input parsing mode setting of this configuration.
 *
 * @return current value of {@code mParsingMode}
 */
public WstxInputProperties.ParsingMode getInputParsingMode()
{
    return this.mParsingMode;
}
Expand Down Expand Up @@ -1074,6 +1093,10 @@ public void setUndeclaredEntityResolver(XMLResolver r) {
}

/**
 * Replaces the base URL setting of this configuration.
 *
 * @param baseURL new value to store in {@code mBaseURL}; stored as-is, without any checks
 */
public void setBaseURL(URL baseURL) {
    this.mBaseURL = baseURL;
}

/**
 * Enables or disables support for surrogate pair entities.
 *
 * @param state {@code true} to allow surrogate pair entities, {@code false} to disallow them
 *
 * @since 6.6
 */
public void doAllowSurrogatePairEntities(boolean state)
{
    this.mAllowSurrogatePairEntities = state;
}

public void setInputParsingMode(WstxInputProperties.ParsingMode mode) {
mParsingMode = mode;
Expand Down Expand Up @@ -1533,6 +1556,8 @@ public Object getProperty(int id)
return getUndeclaredEntityResolver();
case PROP_BASE_URL:
return getBaseURL();
case PROP_ALLOW_SURROGATE_PAIR_ENTITIES:
return allowsSurrogatePairEntities();
case PROP_INPUT_PARSING_MODE:
return getInputParsingMode();

Expand Down Expand Up @@ -1757,6 +1782,10 @@ public boolean setProperty(String propName, int id, Object value)
setBaseURL(u);
}
break;

case PROP_ALLOW_SURROGATE_PAIR_ENTITIES:
doAllowSurrogatePairEntities(ArgUtil.convertToBoolean(propName, value));
break;

case PROP_INPUT_PARSING_MODE:
setInputParsingMode((WstxInputProperties.ParsingMode) value);
Expand Down
9 changes: 9 additions & 0 deletions src/main/java/com/ctc/wstx/api/WstxInputProperties.java
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,15 @@ public final class WstxInputProperties
* DTD subset).
*/
public final static String P_BASE_URL = "com.ctc.wstx.baseURL";

/**
* Property of type {@link java.lang.Boolean} that, when enabled, allows parsing
* of high Unicode characters written as surrogate pairs (two consecutive
* character references, high surrogate followed by low surrogate).
* Default is Boolean.FALSE, because this is not standard behavior.
*
* @since 6.6
*/
public final static String P_ALLOW_SURROGATE_PAIR_ENTITIES = "com.ctc.wstx.allowSurrogatePairEntities";

// // // Alternate parsing modes

Expand Down
103 changes: 60 additions & 43 deletions src/main/java/com/ctc/wstx/sr/StreamScanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -1183,59 +1183,62 @@ protected int resolveSimpleEntity(boolean checkStd)
char[] buf = mInputBuffer;
int ptr = mInputPtr;
char c = buf[ptr++];
final boolean allowSurrogatePairs = mConfig.allowsSurrogatePairEntities();

// Numeric reference?
if (c == '#') {
c = buf[ptr++];
int value = 0;
int pairValue = 0;
int inputLen = mInputEnd;
if (c == 'x') { // hex
while (ptr < inputLen) {

mInputPtr = ptr;
value = resolveCharEnt(null, false);
ptr = mInputPtr;
c = buf[ptr - 1];

// If resolving entity surrogate pairs enabled and if current entity
// is in range of high surrogate value, try to find surrogate pair
if (allowSurrogatePairs && value >= 0xD800 && value <= 0xDBFF) {
if (c == ';' && ptr + 1 < inputLen) {
c = buf[ptr++];
if (c == ';') {
break;
}
value = value << 4;
if (c <= '9' && c >= '0') {
value += (c - '0');
} else if (c >= 'a' && c <= 'f') {
value += (10 + (c - 'a'));
} else if (c >= 'A' && c <= 'F') {
value += (10 + (c - 'A'));
} else {
mInputPtr = ptr; // so error points to correct char
throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F).");
}
/* Need to check for overflow; easiest to do right as
* it happens...
*/
if (value > MAX_UNICODE_CHAR) {
reportUnicodeOverflow();
}
}
} else { // numeric (decimal)
while (c != ';') {
if (c <= '9' && c >= '0') {
value = (value * 10) + (c - '0');
// Overflow?
if (value > MAX_UNICODE_CHAR) {
reportUnicodeOverflow();
if (c == '&' && ptr + 1 < inputLen) {
c = buf[ptr++];
if (c == '#' && ptr + 1 < inputLen) {
try {
mInputPtr = ptr;
pairValue = resolveCharEnt(null, false);
ptr = mInputPtr;
c = buf[ptr -1];
} catch (WstxUnexpectedCharException wuce) {
reportNoSurrogatePair(value);
}
} else {
reportNoSurrogatePair(value);
}
} else {
mInputPtr = ptr; // so error points to correct char
throwUnexpectedChar(c, "; expected a decimal number.");
reportNoSurrogatePair(value);
}
if (ptr >= inputLen) {
break;
}
c = buf[ptr++];
} else {
reportNoSurrogatePair(value);
}
}

// We get here either if we got it all, OR if we ran out of
// input in current buffer.
if (c == ';') { // got the full thing
mInputPtr = ptr;
validateChar(value);

if (allowSurrogatePairs && pairValue > 0) {
// [woodstox-core#165]
// If pair value is not in range of low surrogate values, then throw an error
if (pairValue < 0xDC00 || pairValue > 0xDFFF) {
reportInvalidSurrogatePair(value, pairValue);
}
value = 0x10000 + (value - 0xD800) * 0x400 + (pairValue - 0xDC00);
} else {
validateChar(value);
}

return value;
}

Expand Down Expand Up @@ -1352,7 +1355,7 @@ protected int resolveCharOnlyEntity(boolean checkStd)
// A char reference?
if (c == '#') { // yup
++mInputPtr;
return resolveCharEnt(null);
return resolveCharEnt(null, true);
}

// nope... except may be a pre-def?
Expand Down Expand Up @@ -1518,7 +1521,7 @@ protected int fullyResolveEntity(boolean allowExt)
// Do we have a (numeric) character entity reference?
if (c == '#') { // numeric
final StringBuffer originalSurface = new StringBuffer("#");
int ch = resolveCharEnt(originalSurface);
int ch = resolveCharEnt(originalSurface, true);
if (mCfgTreatCharRefsAsEntities) {
final char[] originalChars = new char[originalSurface.length()];
originalSurface.getChars(0, originalSurface.length(), originalChars, 0);
Expand Down Expand Up @@ -2314,7 +2317,7 @@ protected final void parseUntil(TextBuffer tb, char endChar, boolean convertLFs,
///////////////////////////////////////////////////////////////////////
*/

private int resolveCharEnt(StringBuffer originalCharacters)
private int resolveCharEnt(StringBuffer originalCharacters, boolean validateChar)
throws XMLStreamException
{
int value = 0;
Expand Down Expand Up @@ -2369,7 +2372,9 @@ private int resolveCharEnt(StringBuffer originalCharacters)
}
}
}
validateChar(value);
if (validateChar) {
validateChar(value);
}
return value;
}

Expand Down Expand Up @@ -2455,7 +2460,19 @@ private void reportUnicodeOverflow()
private void reportIllegalChar(int value)
throws XMLStreamException
{
throwParseError("Illegal character entity: expansion character (code 0x{0}", Integer.toHexString(value), null);
throwParseError("Illegal character entity: expansion character (code 0x{0})", Integer.toHexString(value), null);
}

/**
 * Reports a parse error for a high surrogate character reference that is
 * not followed by a second character reference to pair it with.
 *
 * @param highSurrogate code of the unpaired high surrogate; included in the
 *   error message in hexadecimal form
 *
 * @throws XMLStreamException as reported via {@code throwParseError}
 */
private void reportNoSurrogatePair(int highSurrogate)
    throws XMLStreamException
{
    final String hexCode = Integer.toHexString(highSurrogate);
    throwParseError("Cannot find surrogate pair: high surrogate character (code 0x{0})", hexCode, null);
}

/**
 * Reports a parse error for two consecutive character references that do
 * not form a valid surrogate pair.
 *
 * @param firstSurrogate code of the first (high surrogate) reference
 * @param secondSurrogate code of the second reference, which was expected
 *   to be a low surrogate
 *
 * @throws XMLStreamException as reported via {@code throwParseError}
 */
private void reportInvalidSurrogatePair(int firstSurrogate, int secondSurrogate)
    throws XMLStreamException
{
    final String firstHex = Integer.toHexString(firstSurrogate);
    final String secondHex = Integer.toHexString(secondSurrogate);
    throwParseError("Invalid surrogate pair: first surrogate character (code 0x{0}), second surrogate character (code 0x{1})", firstHex, secondHex);
}

protected void verifyLimit(String type, long maxValue, long currentValue)
Expand Down
10 changes: 10 additions & 0 deletions src/test/java/org/codehaus/stax/test/BaseStaxTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import javax.xml.stream.*;
import javax.xml.stream.events.XMLEvent;

import com.ctc.wstx.api.WstxInputProperties;

/* Latest updates:
*
* - 07-Sep-2007, TSa: Updating based on latest understanding of
Expand Down Expand Up @@ -275,6 +277,14 @@ protected static boolean setSupportExternalEntities(XMLInputFactory f, boolean s
return false;
}
}

/**
 * Test helper that sets the Woodstox-specific
 * {@link WstxInputProperties#P_ALLOW_SURROGATE_PAIR_ENTITIES} property on
 * the given factory and verifies that the factory reports the same value back.
 *
 * @param f input factory to configure
 * @param state value to set the property to
 */
protected static void setResolveEntitySurrogatePairs(XMLInputFactory f, boolean state)
    throws XMLStreamException
{
    final String propName = WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES;
    final Boolean expected = Boolean.valueOf(state);
    f.setProperty(propName, expected);
    // Read the value back to make sure the factory actually accepted it
    final Object actual = f.getProperty(propName);
    assertEquals(expected, actual);
}

protected static void setResolver(XMLInputFactory f, XMLResolver resolver)
throws XMLStreamException
Expand Down
Loading

0 comments on commit 172371f

Please sign in to comment.