From 38195cf5961c7f9fb7bb960906193f126cfed7b1 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Sat, 6 Jul 2024 14:58:16 +0200 Subject: [PATCH] parser: Don't produce names with invalid UTF-8 in recovery mode --- include/private/parser.h | 2 ++ parser.c | 61 ++++++++++++++++++++++------------------ parserInternals.c | 2 +- 3 files changed, 37 insertions(+), 28 deletions(-) diff --git a/include/private/parser.h b/include/private/parser.h index 6cd01b27c..054bf79f0 100644 --- a/include/private/parser.h +++ b/include/private/parser.h @@ -4,6 +4,8 @@ #include #include +#define XML_INVALID_CHAR 0x200000 + #define XML_MAX_URI_LENGTH 2000 /** diff --git a/parser.c b/parser.c index 8b63a9dbd..a0bd8225b 100644 --- a/parser.c +++ b/parser.c @@ -2268,8 +2268,6 @@ static int spacePop(xmlParserCtxtPtr ctxt) { * NEXT Skip to the next character, this does the proper decoding * in UTF-8 mode. It also pop-up unfinished entities on the fly. * NEXTL(l) Skip the current unicode character of l xmlChars long. - * CUR_CHAR(l) returns the current unicode character (int), set l - * to the number of xmlChars used for the encoding [0-5]. * CUR_SCHAR same but operate on a string instead of the context * COPY_BUF copy the current unicode char to the target buffer, increment * the index @@ -2349,13 +2347,22 @@ static int spacePop(xmlParserCtxtPtr ctxt) { ctxt->input->cur += l; \ } while (0) -#define CUR_CHAR(l) xmlCurrentChar(ctxt, &l) #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) #define COPY_BUF(b, i, v) \ if (v < 0x80) b[i++] = v; \ else i += xmlCopyCharMultiByte(&b[i],v) +static int +xmlCurrentCharRecover(xmlParserCtxtPtr ctxt, int *len) { + int c = xmlCurrentChar(ctxt, len); + + if (c == XML_INVALID_CHAR) + c = 0xFFFD; /* replacement character */ + + return(c); +} + /** * xmlSkipBlankChars: * @ctxt: the XML parser context @@ -3241,7 +3248,7 @@ xmlParseNameComplex(xmlParserCtxtPtr ctxt) { /* * Handler for more complex cases */ - c = CUR_CHAR(l); + c = xmlCurrentChar(ctxt, &l); if ((ctxt->options & XML_PARSE_OLD10) == 0) { /* * Use the new checks of production [4] [4a] amd [5] of the @@ -3267,7 +3274,7 @@ xmlParseNameComplex(xmlParserCtxtPtr ctxt) { } len += l; NEXTL(l); - c = CUR_CHAR(l); + c = xmlCurrentChar(ctxt, &l); while ((c != ' ') && (c != '>') && (c != '/') && /* accelerators */ (((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) || @@ -3292,7 +3299,7 @@ xmlParseNameComplex(xmlParserCtxtPtr ctxt) { if (len <= INT_MAX - l) len += l; NEXTL(l); - c = CUR_CHAR(l); + c = xmlCurrentChar(ctxt, &l); } } else { if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ @@ -3302,7 +3309,7 @@ xmlParseNameComplex(xmlParserCtxtPtr ctxt) { } len += l; NEXTL(l); - c = CUR_CHAR(l); + c = xmlCurrentChar(ctxt, &l); while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ ((IS_LETTER(c)) || (IS_DIGIT(c)) || @@ -3313,7 +3320,7 @@ xmlParseNameComplex(xmlParserCtxtPtr ctxt) { if (len <= INT_MAX - l) len += l; NEXTL(l); - c = CUR_CHAR(l); + c = xmlCurrentChar(ctxt, &l); } } if (len > maxLength) { @@ -3417,7 +3424,7 @@ xmlParseNCNameComplex(xmlParserCtxtPtr ctxt) { * Handler for more complex cases */ startPosition = CUR_PTR - BASE_PTR; - c = CUR_CHAR(l); + c = xmlCurrentChar(ctxt, &l); if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ (!xmlIsNameStartChar(ctxt, c) || (c == ':'))) { return(ret); @@ -3428,7 +3435,7 @@ xmlParseNCNameComplex(xmlParserCtxtPtr ctxt) { if (len <= INT_MAX - l) len += l; NEXTL(l); - c = CUR_CHAR(l); + c = xmlCurrentChar(ctxt, &l); } if (len > maxLength) { xmlFatalErr(ctxt, XML_ERR_NAME_TOO_LONG, "NCName"); @@ -3659,12 +3666,12 @@ xmlParseNmtoken(xmlParserCtxtPtr ctxt) { XML_MAX_TEXT_LENGTH : XML_MAX_NAME_LENGTH; - c = CUR_CHAR(l); + c = xmlCurrentChar(ctxt, &l); while (xmlIsNameChar(ctxt, c)) { COPY_BUF(buf, len, c); NEXTL(l); - c = CUR_CHAR(l); + c = xmlCurrentChar(ctxt, &l); if (len >= XML_MAX_NAMELEN) { /* * Okay someone managed to make a huge token, so he's ready to pay @@ -3699,7 +3706,7 @@ xmlParseNmtoken(xmlParserCtxtPtr ctxt) { return(NULL); } NEXTL(l); - c = CUR_CHAR(l); + c = xmlCurrentChar(ctxt, &l); } buffer[len] = 0; return(buffer); @@ -4597,7 +4604,7 @@ xmlParseSystemLiteral(xmlParserCtxtPtr ctxt) { xmlErrMemory(ctxt); return(NULL); } - cur = CUR_CHAR(l); + cur = xmlCurrentCharRecover(ctxt, &l); while ((IS_CHAR(cur)) && (cur != stop)) { /* checked */ if (len + 5 >= size) { xmlChar *tmp; @@ -4618,7 +4625,7 @@ xmlParseSystemLiteral(xmlParserCtxtPtr ctxt) { return(NULL); } NEXTL(l); - cur = CUR_CHAR(l); + cur = xmlCurrentCharRecover(ctxt, &l); } buf[len] = 0; if (!IS_CHAR(cur)) { @@ -4911,7 +4918,7 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) { int nbchar = 0; int cur, l; - cur = CUR_CHAR(l); + cur = xmlCurrentCharRecover(ctxt, &l); while ((cur != '<') && /* checked */ (cur != '&') && (IS_CHAR(cur))) { @@ -4944,7 +4951,7 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) { nbchar = 0; SHRINK; } - cur = CUR_CHAR(l); + cur = xmlCurrentCharRecover(ctxt, &l); } if (nbchar != 0) { buf[nbchar] = 0; @@ -5107,7 +5114,7 @@ xmlParseCommentComplex(xmlParserCtxtPtr ctxt, xmlChar *buf, return; } } - q = CUR_CHAR(ql); + q = xmlCurrentCharRecover(ctxt, &ql); if (q == 0) goto not_terminated; if (!IS_CHAR(q)) { @@ -5118,7 +5125,7 @@ xmlParseCommentComplex(xmlParserCtxtPtr ctxt, xmlChar *buf, return; } NEXTL(ql); - r = CUR_CHAR(rl); + r = xmlCurrentCharRecover(ctxt, &rl); if (r == 0) goto not_terminated; if (!IS_CHAR(r)) { @@ -5129,7 +5136,7 @@ xmlParseCommentComplex(xmlParserCtxtPtr ctxt, xmlChar *buf, return; } NEXTL(rl); - cur = CUR_CHAR(l); + cur = xmlCurrentCharRecover(ctxt, &l); if (cur == 0) goto not_terminated; while (IS_CHAR(cur) && /* checked */ @@ -5166,7 +5173,7 @@ xmlParseCommentComplex(xmlParserCtxtPtr ctxt, xmlChar *buf, rl = l; NEXTL(l); - cur = CUR_CHAR(l); + cur = xmlCurrentCharRecover(ctxt, &l); } buf[len] = 0; @@ -5518,7 +5525,7 @@ xmlParsePI(xmlParserCtxtPtr ctxt) { xmlFatalErrMsgStr(ctxt, XML_ERR_SPACE_REQUIRED, "ParsePI: PI %s space expected\n", target); } - cur = CUR_CHAR(l); + cur = xmlCurrentCharRecover(ctxt, &l); while (IS_CHAR(cur) && /* checked */ ((cur != '?') || (NXT(1) != '>'))) { if (len + 5 >= size) { @@ -5541,7 +5548,7 @@ xmlParsePI(xmlParserCtxtPtr ctxt) { return; } NEXTL(l); - cur = CUR_CHAR(l); + cur = xmlCurrentCharRecover(ctxt, &l); } buf[len] = 0; if (cur != '?') { @@ -9586,19 +9593,19 @@ xmlParseCDSect(xmlParserCtxtPtr ctxt) { return; SKIP(6); - r = CUR_CHAR(rl); + r = xmlCurrentCharRecover(ctxt, &rl); if (!IS_CHAR(r)) { xmlFatalErr(ctxt, XML_ERR_CDATA_NOT_FINISHED, NULL); goto out; } NEXTL(rl); - s = CUR_CHAR(sl); + s = xmlCurrentCharRecover(ctxt, &sl); if (!IS_CHAR(s)) { xmlFatalErr(ctxt, XML_ERR_CDATA_NOT_FINISHED, NULL); goto out; } NEXTL(sl); - cur = CUR_CHAR(l); + cur = xmlCurrentCharRecover(ctxt, &l); buf = (xmlChar *) xmlMallocAtomic(size); if (buf == NULL) { xmlErrMemory(ctxt); @@ -9628,7 +9635,7 @@ xmlParseCDSect(xmlParserCtxtPtr ctxt) { s = cur; sl = l; NEXTL(l); - cur = CUR_CHAR(l); + cur = xmlCurrentCharRecover(ctxt, &l); } buf[len] = 0; if (cur != '>') { diff --git a/parserInternals.c b/parserInternals.c index 2e3119ab8..cc505ea05 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -967,7 +967,7 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { ctxt->input->flags |= XML_INPUT_ENCODING_ERROR; } *len = 1; - return(0xFFFD); /* U+FFFD Replacement Character */ + return(XML_INVALID_CHAR); incomplete_sequence: /*