Skip to content

Commit

Permalink
Add options to ignore the internal encoding
Browse files Browse the repository at this point in the history
For both XML and HTML, the document can provide an encoding
either in XMLDecl in XML, or as a meta element in HTML head.
This adds options to ignore those encodings if the encoding
is known in advance, for example if the content has been converted
before being passed to the parser.

* parser.c include/libxml/parser.h: add XML_PARSE_IGNORE_ENC option
  for XML parsing
* include/libxml/HTMLparser.h HTMLparser.c: add the
  HTML_PARSE_IGNORE_ENC option for HTML parsing
* HTMLtree.c: fix the handling of saving when an unknown encoding is
  defined in meta document header
* xmllint.c: add a --noenc option to activate the new parser options
  • Loading branch information
veillard committed May 26, 2011
1 parent 0329a14 commit c62efc8
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 15 deletions.
11 changes: 9 additions & 2 deletions HTMLparser.c
Original file line number Diff line number Diff line change
Expand Up @@ -3448,7 +3448,8 @@ static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
const xmlChar *encoding;

if ((ctxt == NULL) || (attvalue == NULL))
if ((ctxt == NULL) || (attvalue == NULL) ||
(ctxt->options & HTML_PARSE_IGNORE_ENC))
return;

/* do not change encoding */
Expand Down Expand Up @@ -3500,7 +3501,9 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
xmlSwitchToEncoding(ctxt, handler);
ctxt->charset = XML_CHAR_ENCODING_UTF8;
} else {
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
"htmlCheckEncoding: unknown encoding %s\n",
encoding, NULL);
}
}

Expand Down Expand Up @@ -6537,6 +6540,10 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
ctxt->options |= HTML_PARSE_NODEFDTD;
options -= HTML_PARSE_NODEFDTD;
}
if (options & HTML_PARSE_IGNORE_ENC) {
ctxt->options |= HTML_PARSE_IGNORE_ENC;
options -= HTML_PARSE_IGNORE_ENC;
}
ctxt->dictNames = 0;
return (options);
}
Expand Down
18 changes: 8 additions & 10 deletions HTMLtree.c
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
if (enc != XML_CHAR_ENCODING_UTF8) {
handler = xmlFindCharEncodingHandler(encoding);
if (handler == NULL)
return(-1);
htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
}
}

Expand Down Expand Up @@ -562,11 +562,9 @@ htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
}

handler = xmlFindCharEncodingHandler(encoding);
if (handler == NULL) {
*mem = NULL;
*size = 0;
return;
}
if (handler == NULL)
htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);

} else {
handler = xmlFindCharEncodingHandler(encoding);
}
Expand All @@ -587,7 +585,7 @@ htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
return;
}

htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
htmlDocContentDumpFormatOutput(buf, cur, NULL, format);

xmlOutputBufferFlush(buf);
if (buf->conv != NULL) {
Expand Down Expand Up @@ -1061,7 +1059,7 @@ htmlDocDump(FILE *f, xmlDocPtr cur) {

handler = xmlFindCharEncodingHandler(encoding);
if (handler == NULL)
return(-1);
htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
} else {
handler = xmlFindCharEncodingHandler(encoding);
}
Expand Down Expand Up @@ -1120,7 +1118,7 @@ htmlSaveFile(const char *filename, xmlDocPtr cur) {

handler = xmlFindCharEncodingHandler(encoding);
if (handler == NULL)
return(-1);
htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
}
}

Expand Down Expand Up @@ -1181,7 +1179,7 @@ htmlSaveFileFormat(const char *filename, xmlDocPtr cur,

handler = xmlFindCharEncodingHandler(encoding);
if (handler == NULL)
return(-1);
htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
}
htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
} else {
Expand Down
3 changes: 2 additions & 1 deletion include/libxml/HTMLparser.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,8 @@ typedef enum {
HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */
HTML_PARSE_NONET = 1<<11,/* Forbid network access */
HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */
HTML_PARSE_COMPACT = 1<<16 /* compact small text nodes */
HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */
HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */
} htmlParserOption;

XMLPUBFUN void XMLCALL
Expand Down
5 changes: 3 additions & 2 deletions include/libxml/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -1105,8 +1105,9 @@ typedef enum {
crash if you try to modify the tree) */
XML_PARSE_OLD10 = 1<<17,/* parse using XML-1.0 before update 5 */
XML_PARSE_NOBASEFIX = 1<<18,/* do not fixup XINCLUDE xml:base uris */
XML_PARSE_HUGE = 1<<19, /* relax any hardcoded limit from the parser */
XML_PARSE_OLDSAX = 1<<20 /* parse using SAX2 interface from before 2.7.0 */
XML_PARSE_HUGE = 1<<19,/* relax any hardcoded limit from the parser */
XML_PARSE_OLDSAX = 1<<20,/* parse using SAX2 interface before 2.7.0 */
XML_PARSE_IGNORE_ENC= 1<<21 /* ignore internal document encoding hint */
} xmlParserOption;

XMLPUBFUN void XMLCALL
Expand Down
11 changes: 11 additions & 0 deletions parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -9922,6 +9922,13 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) {
} else {
xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL);
}

/*
* Non standard parsing, allowing the user to ignore encoding
*/
if (ctxt->options & XML_PARSE_IGNORE_ENC)
return(encoding);

/*
* UTF-16 encoding switch has already taken place at this stage,
* moreover the little-endian/big-endian selection is already done
Expand Down Expand Up @@ -14561,6 +14568,10 @@ xmlCtxtUseOptionsInternal(xmlParserCtxtPtr ctxt, int options, const char *encodi
ctxt->options |= XML_PARSE_OLDSAX;
options -= XML_PARSE_OLDSAX;
}
if (options & XML_PARSE_IGNORE_ENC) {
ctxt->options |= XML_PARSE_IGNORE_ENC;
options -= XML_PARSE_IGNORE_ENC;
}
ctxt->linenumbers = 1;
return (options);
}
Expand Down
6 changes: 6 additions & 0 deletions xmllint.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ static int copy = 0;
#endif /* LIBXML_TREE_ENABLED */
static int recovery = 0;
static int noent = 0;
static int noenc = 0;
static int noblanks = 0;
static int noout = 0;
static int nowrap = 0;
Expand Down Expand Up @@ -2983,6 +2984,7 @@ static void usage(const char *name) {
printf("\t--recover : output what was parsable on broken XML documents\n");
printf("\t--huge : remove any internal arbitrary parser limits\n");
printf("\t--noent : substitute entity references by their value\n");
printf("\t--noenc : ignore any encoding specified inside the document\n");
printf("\t--noout : don't output the result tree\n");
printf("\t--path 'paths': provide a set of paths for resources\n");
printf("\t--load-trace : print trace of all external entites loaded\n");
Expand Down Expand Up @@ -3137,6 +3139,10 @@ main(int argc, char **argv) {
(!strcmp(argv[i], "--noent"))) {
noent++;
options |= XML_PARSE_NOENT;
} else if ((!strcmp(argv[i], "-noenc")) ||
(!strcmp(argv[i], "--noenc"))) {
noenc++;
options |= XML_PARSE_IGNORE_ENC;
} else if ((!strcmp(argv[i], "-nsclean")) ||
(!strcmp(argv[i], "--nsclean"))) {
options |= XML_PARSE_NSCLEAN;
Expand Down

0 comments on commit c62efc8

Please sign in to comment.