Skip to content

Commit

Permalink
parser: Don't produce names with invalid UTF-8 in recovery mode
Browse files Browse the repository at this point in the history
  • Loading branch information
nwellnhof committed Jul 6, 2024
1 parent c45c15f commit 38195cf
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 28 deletions.
2 changes: 2 additions & 0 deletions include/private/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include <libxml/parser.h>
#include <libxml/xmlversion.h>

/*
 * Sentinel code point for input that could not be decoded. The value lies
 * above U+10FFFF, the highest Unicode code point, so it cannot be confused
 * with a real character.
 */
#define XML_INVALID_CHAR 0x200000

#define XML_MAX_URI_LENGTH 2000

/**
Expand Down
61 changes: 34 additions & 27 deletions parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -2268,8 +2268,6 @@ static int spacePop(xmlParserCtxtPtr ctxt) {
* NEXT Skip to the next character, this does the proper decoding
* in UTF-8 mode. It also pops up unfinished entities on the fly.
* NEXTL(l) Skip the current unicode character of l xmlChars long.
* CUR_CHAR(l) returns the current unicode character (int), set l
* to the number of xmlChars used for the encoding [0-5].
* CUR_SCHAR same but operate on a string instead of the context
* COPY_BUF copy the current unicode char to the target buffer, increment
* the index
Expand Down Expand Up @@ -2349,13 +2347,22 @@ static int spacePop(xmlParserCtxtPtr ctxt) {
ctxt->input->cur += l; \
} while (0)

/*
 * CUR_CHAR(l): shorthand for xmlCurrentChar() on the parser context.
 * Relies on a variable named `ctxt` being in scope at the expansion site
 * and passes the address of l as the length output argument.
 */
#define CUR_CHAR(l) xmlCurrentChar(ctxt, &l)
/*
 * CUR_SCHAR(s, l): same idea for a string; forwards s and the address of l
 * to xmlStringCurrentChar(), again using the caller's `ctxt`.
 */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * COPY_BUF(b, i, v): append the character value v to buffer b at index i
 * and advance i.
 *
 * Values below 0x80 are stored directly as one byte. Any other value is
 * handed to xmlCopyCharMultiByte(), whose return value is added to i.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement, including as the unbraced body of a caller's if/else. The
 * arguments are parenthesized so that expression arguments expand as
 * intended. Arguments may be evaluated more than once, so they must be
 * free of side effects.
 */
#define COPY_BUF(b, i, v) \
    do { \
        if ((v) < 0x80) \
            (b)[(i)++] = (v); \
        else \
            (i) += xmlCopyCharMultiByte(&(b)[(i)], (v)); \
    } while (0)

/**
 * xmlCurrentCharRecover:
 * @ctxt: the XML parser context
 * @len: length output argument, forwarded unchanged to xmlCurrentChar()
 *
 * Variant of xmlCurrentChar() for callers that keep going past undecodable
 * input: the XML_INVALID_CHAR sentinel is replaced with U+FFFD, the
 * Unicode replacement character. Every other value is passed through.
 *
 * Returns the value from xmlCurrentChar(), or 0xFFFD in place of
 * XML_INVALID_CHAR.
 */
static int
xmlCurrentCharRecover(xmlParserCtxtPtr ctxt, int *len) {
    const int decoded = xmlCurrentChar(ctxt, len);

    return((decoded == XML_INVALID_CHAR) ? 0xFFFD : decoded);
}

/**
* xmlSkipBlankChars:
* @ctxt: the XML parser context
Expand Down Expand Up @@ -3241,7 +3248,7 @@ xmlParseNameComplex(xmlParserCtxtPtr ctxt) {
/*
* Handler for more complex cases
*/
c = CUR_CHAR(l);
c = xmlCurrentChar(ctxt, &l);
if ((ctxt->options & XML_PARSE_OLD10) == 0) {
/*
* Use the new checks of production [4] [4a] and [5] of the
Expand All @@ -3267,7 +3274,7 @@ xmlParseNameComplex(xmlParserCtxtPtr ctxt) {
}
len += l;
NEXTL(l);
c = CUR_CHAR(l);
c = xmlCurrentChar(ctxt, &l);
while ((c != ' ') && (c != '>') && (c != '/') && /* accelerators */
(((c >= 'a') && (c <= 'z')) ||
((c >= 'A') && (c <= 'Z')) ||
Expand All @@ -3292,7 +3299,7 @@ xmlParseNameComplex(xmlParserCtxtPtr ctxt) {
if (len <= INT_MAX - l)
len += l;
NEXTL(l);
c = CUR_CHAR(l);
c = xmlCurrentChar(ctxt, &l);
}
} else {
if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
Expand All @@ -3302,7 +3309,7 @@ xmlParseNameComplex(xmlParserCtxtPtr ctxt) {
}
len += l;
NEXTL(l);
c = CUR_CHAR(l);
c = xmlCurrentChar(ctxt, &l);

while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
((IS_LETTER(c)) || (IS_DIGIT(c)) ||
Expand All @@ -3313,7 +3320,7 @@ xmlParseNameComplex(xmlParserCtxtPtr ctxt) {
if (len <= INT_MAX - l)
len += l;
NEXTL(l);
c = CUR_CHAR(l);
c = xmlCurrentChar(ctxt, &l);
}
}
if (len > maxLength) {
Expand Down Expand Up @@ -3417,7 +3424,7 @@ xmlParseNCNameComplex(xmlParserCtxtPtr ctxt) {
* Handler for more complex cases
*/
startPosition = CUR_PTR - BASE_PTR;
c = CUR_CHAR(l);
c = xmlCurrentChar(ctxt, &l);
if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
(!xmlIsNameStartChar(ctxt, c) || (c == ':'))) {
return(ret);
Expand All @@ -3428,7 +3435,7 @@ xmlParseNCNameComplex(xmlParserCtxtPtr ctxt) {
if (len <= INT_MAX - l)
len += l;
NEXTL(l);
c = CUR_CHAR(l);
c = xmlCurrentChar(ctxt, &l);
}
if (len > maxLength) {
xmlFatalErr(ctxt, XML_ERR_NAME_TOO_LONG, "NCName");
Expand Down Expand Up @@ -3659,12 +3666,12 @@ xmlParseNmtoken(xmlParserCtxtPtr ctxt) {
XML_MAX_TEXT_LENGTH :
XML_MAX_NAME_LENGTH;

c = CUR_CHAR(l);
c = xmlCurrentChar(ctxt, &l);

while (xmlIsNameChar(ctxt, c)) {
COPY_BUF(buf, len, c);
NEXTL(l);
c = CUR_CHAR(l);
c = xmlCurrentChar(ctxt, &l);
if (len >= XML_MAX_NAMELEN) {
/*
* Okay someone managed to make a huge token, so he's ready to pay
Expand Down Expand Up @@ -3699,7 +3706,7 @@ xmlParseNmtoken(xmlParserCtxtPtr ctxt) {
return(NULL);
}
NEXTL(l);
c = CUR_CHAR(l);
c = xmlCurrentChar(ctxt, &l);
}
buffer[len] = 0;
return(buffer);
Expand Down Expand Up @@ -4597,7 +4604,7 @@ xmlParseSystemLiteral(xmlParserCtxtPtr ctxt) {
xmlErrMemory(ctxt);
return(NULL);
}
cur = CUR_CHAR(l);
cur = xmlCurrentCharRecover(ctxt, &l);
while ((IS_CHAR(cur)) && (cur != stop)) { /* checked */
if (len + 5 >= size) {
xmlChar *tmp;
Expand All @@ -4618,7 +4625,7 @@ xmlParseSystemLiteral(xmlParserCtxtPtr ctxt) {
return(NULL);
}
NEXTL(l);
cur = CUR_CHAR(l);
cur = xmlCurrentCharRecover(ctxt, &l);
}
buf[len] = 0;
if (!IS_CHAR(cur)) {
Expand Down Expand Up @@ -4911,7 +4918,7 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) {
int nbchar = 0;
int cur, l;

cur = CUR_CHAR(l);
cur = xmlCurrentCharRecover(ctxt, &l);
while ((cur != '<') && /* checked */
(cur != '&') &&
(IS_CHAR(cur))) {
Expand Down Expand Up @@ -4944,7 +4951,7 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) {
nbchar = 0;
SHRINK;
}
cur = CUR_CHAR(l);
cur = xmlCurrentCharRecover(ctxt, &l);
}
if (nbchar != 0) {
buf[nbchar] = 0;
Expand Down Expand Up @@ -5107,7 +5114,7 @@ xmlParseCommentComplex(xmlParserCtxtPtr ctxt, xmlChar *buf,
return;
}
}
q = CUR_CHAR(ql);
q = xmlCurrentCharRecover(ctxt, &ql);
if (q == 0)
goto not_terminated;
if (!IS_CHAR(q)) {
Expand All @@ -5118,7 +5125,7 @@ xmlParseCommentComplex(xmlParserCtxtPtr ctxt, xmlChar *buf,
return;
}
NEXTL(ql);
r = CUR_CHAR(rl);
r = xmlCurrentCharRecover(ctxt, &rl);
if (r == 0)
goto not_terminated;
if (!IS_CHAR(r)) {
Expand All @@ -5129,7 +5136,7 @@ xmlParseCommentComplex(xmlParserCtxtPtr ctxt, xmlChar *buf,
return;
}
NEXTL(rl);
cur = CUR_CHAR(l);
cur = xmlCurrentCharRecover(ctxt, &l);
if (cur == 0)
goto not_terminated;
while (IS_CHAR(cur) && /* checked */
Expand Down Expand Up @@ -5166,7 +5173,7 @@ xmlParseCommentComplex(xmlParserCtxtPtr ctxt, xmlChar *buf,
rl = l;

NEXTL(l);
cur = CUR_CHAR(l);
cur = xmlCurrentCharRecover(ctxt, &l);

}
buf[len] = 0;
Expand Down Expand Up @@ -5518,7 +5525,7 @@ xmlParsePI(xmlParserCtxtPtr ctxt) {
xmlFatalErrMsgStr(ctxt, XML_ERR_SPACE_REQUIRED,
"ParsePI: PI %s space expected\n", target);
}
cur = CUR_CHAR(l);
cur = xmlCurrentCharRecover(ctxt, &l);
while (IS_CHAR(cur) && /* checked */
((cur != '?') || (NXT(1) != '>'))) {
if (len + 5 >= size) {
Expand All @@ -5541,7 +5548,7 @@ xmlParsePI(xmlParserCtxtPtr ctxt) {
return;
}
NEXTL(l);
cur = CUR_CHAR(l);
cur = xmlCurrentCharRecover(ctxt, &l);
}
buf[len] = 0;
if (cur != '?') {
Expand Down Expand Up @@ -9586,19 +9593,19 @@ xmlParseCDSect(xmlParserCtxtPtr ctxt) {
return;
SKIP(6);

r = CUR_CHAR(rl);
r = xmlCurrentCharRecover(ctxt, &rl);
if (!IS_CHAR(r)) {
xmlFatalErr(ctxt, XML_ERR_CDATA_NOT_FINISHED, NULL);
goto out;
}
NEXTL(rl);
s = CUR_CHAR(sl);
s = xmlCurrentCharRecover(ctxt, &sl);
if (!IS_CHAR(s)) {
xmlFatalErr(ctxt, XML_ERR_CDATA_NOT_FINISHED, NULL);
goto out;
}
NEXTL(sl);
cur = CUR_CHAR(l);
cur = xmlCurrentCharRecover(ctxt, &l);
buf = (xmlChar *) xmlMallocAtomic(size);
if (buf == NULL) {
xmlErrMemory(ctxt);
Expand Down Expand Up @@ -9628,7 +9635,7 @@ xmlParseCDSect(xmlParserCtxtPtr ctxt) {
s = cur;
sl = l;
NEXTL(l);
cur = CUR_CHAR(l);
cur = xmlCurrentCharRecover(ctxt, &l);
}
buf[len] = 0;
if (cur != '>') {
Expand Down
2 changes: 1 addition & 1 deletion parserInternals.c
Original file line number Diff line number Diff line change
Expand Up @@ -967,7 +967,7 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
ctxt->input->flags |= XML_INPUT_ENCODING_ERROR;
}
*len = 1;
return(0xFFFD); /* U+FFFD Replacement Character */
return(XML_INVALID_CHAR);

incomplete_sequence:
/*
Expand Down

0 comments on commit 38195cf

Please sign in to comment.