Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

add a new regression test program for testing character ranges and UTF8

* Makefile.am testchar.c Makefile.tests README.tests: add a
  new regression test program for testing character ranges and
  UTF8 encoding/decoding
Daniel

svn path=/trunk/; revision=3754
  • Loading branch information...
commit abade013348670c6a03fb9574c7979ba65a1b081 1 parent bf0324f
Daniel Veillard authored
Showing with 633 additions and 5 deletions.
  1. +6 −0 ChangeLog
  2. +8 −2 Makefile.tests
  3. +4 −3 README.tests
  4. +615 −0 testchar.c
6 ChangeLog
View
@@ -1,3 +1,9 @@
+Thu Jul 24 16:57:20 CEST 2008 Daniel Veillard <daniel@veillard.com>
+
+ * Makefile.am testchar.c Makefile.tests README.tests: add a
+ new regression test program for testing character ranges and
+ UTF8 encoding/decoding
+
Wed Jul 23 15:32:39 CEST 2008 Daniel Veillard <daniel@veillard.com>
* libxml.spec.in: fix the build root
10 Makefile.tests
View
@@ -8,12 +8,12 @@ LIBS=`xml2-config --libs`
THREADLIB= -lpthread
EXEEXT=
-all: runtest$(EXEEXT) runsuite$(EXEEXT) testapi$(EXEEXT)
+all: runtest$(EXEEXT) runsuite$(EXEEXT) testapi$(EXEEXT) testchar$(EXEEXT)
clean:
$(RM) runtest$(EXEEXT) runsuite$(EXEEXT) testapi$(EXEEXT)
-check: do_runtest do_runsuite do_testapi
+check: do_runtest do_testchar do_testapi do_runsuite
runtest$(EXEEXT): runtest.c
$(CC) -o runtest$(EXEEXT) $(CFLAGS) runtest.c $(LIBS) $(THREADLIB)
@@ -33,3 +33,9 @@ testapi$(EXEEXT): testapi.c
do_testapi: testapi$(EXEEXT)
./testapi
+testchar$(EXEEXT): testchar.c
+ $(CC) -o testchar$(EXEEXT) $(CFLAGS) testchar.c $(LIBS)
+
+do_testchar: testchar$(EXEEXT)
+ ./testchar
+
7 README.tests
View
@@ -10,13 +10,14 @@ program has a different testing purpose:
runtest.c : runs libxml2 basic internal regression tests
runsuite.c: runs libxml2 against external regression tests
testapi.c : exercises the library public entry points
+ testchar.c: exercise the check of character ranges and UTF-8 validation
The command:
- make -f Makefile.tests
+ make -f Makefile.tests check
should be sufficient on a Unix system to build and exercise the tests
-for the version of the library installed on the system. Note however
+for the version of the library installed on the system. Note however
that there isn't backward compatibility provided so if the installed
version is older than the testsuite one, failing to compile or run the tests
is likely. In any event this won't work with an installed libxml2 older
@@ -26,4 +27,4 @@ like any other program using libxml2, running the test should be done
simply by launching the resulting executables.
Daniel Veillard
-Sun Jul 10 2005
+Thu Jul 24 2008
615 testchar.c
View
@@ -0,0 +1,615 @@
+/**
+ * Test the UTF-8 decoding routines
+ *
+ * author: Daniel Veillard
+ * copy: see Copyright for the status of this software.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <libxml/parser.h>
+#include <libxml/parserInternals.h>
+
+int lastError;
+
+static void errorHandler(void *unused, xmlErrorPtr err) {
+ if ((unused == NULL) && (err != NULL) && (lastError == 0)) {
+ lastError = err->code;
+ }
+}
+
+char document1[100] = "<doc>XXXX</doc>"; /* XXXX (offset 5): 4 byte test area inside element content */
+char document2[100] = "<doc foo='XXXX'/>"; /* XXXX (offset 10): 4 byte test area inside an attribute value */
+
+/*
+ * Write each of the 256 byte values at data[0], which points inside
+ * document, parse the first len bytes of document and report on stderr
+ * every case where the parser outcome differs from the expected one.
+ * forbid1 and forbid2 are additional byte values which must be rejected
+ * at that position (the callers pass -1 when there is none).
+ */
+static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
+                  int len, char *data, int forbid1, int forbid2) {
+    int byte;
+
+    for (byte = 0; byte <= 0xFF; byte++) {
+        xmlDocPtr doc;
+        int rejected;
+
+        lastError = 0;
+        xmlCtxtReset(ctxt);
+
+        data[0] = byte;
+
+        doc = xmlReadMemory(document, len, "test", NULL, 0);
+
+        /* rejected: an error was raised and no document was returned */
+        rejected = (lastError != 0) && (doc == NULL);
+
+        if ((byte == forbid1) || (byte == forbid2)) {
+            /* byte explicitly forbidden by the caller at this position */
+            if (!rejected)
+                fprintf(stderr,
+                    "Failed to detect invalid char for Byte 0x%02X: %c\n",
+                    byte, byte);
+        } else if ((byte == '<') || (byte == '&')) {
+            /* markup characters cannot appear unescaped */
+            if (!rejected)
+                fprintf(stderr,
+                    "Failed to detect illegal char %c for Byte 0x%02X\n",
+                    byte, byte);
+        } else if ((byte != 0x9) && (byte != 0xA) && (byte != 0xD) &&
+                   ((byte < 0x20) || (byte >= 0x80))) {
+            /* control chars other than TAB, LF, CR and lone high bytes */
+            if ((lastError != XML_ERR_INVALID_CHAR) && (doc != NULL))
+                fprintf(stderr,
+                    "Failed to detect invalid char for Byte 0x%02X\n", byte);
+        } else if (doc == NULL) {
+            /* everything else is a valid char and must parse */
+            fprintf(stderr,
+                "Failed to parse valid char for Byte 0x%02X : %c\n",
+                byte, byte);
+        }
+
+        if (doc != NULL)
+            xmlFreeDoc(doc);
+    }
+}
+
+/*
+ * Write every pair (lead, trail) with lead in 0x80..0xFF and trail in
+ * 0x00..0xFF at data[0] and data[1], which point inside document, parse
+ * the first len bytes of document and report on stderr every case where
+ * the parser outcome differs from what UTF-8 2 byte encoding rules expect.
+ */
+static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
+                  int len, char *data) {
+    int lead, trail;
+
+    for (lead = 0x80; lead <= 0xFF; lead++) {
+        for (trail = 0; trail <= 0xFF; trail++) {
+            xmlDocPtr doc;
+            int rejected;
+            int malformed;
+
+            lastError = 0;
+            xmlCtxtReset(ctxt);
+
+            data[0] = lead;
+            data[1] = trail;
+
+            doc = xmlReadMemory(document, len, "test", NULL, 0);
+
+            /* rejected: an error was raised and no document was returned */
+            rejected = (lastError != 0) && (doc == NULL);
+
+            /*
+             * The high bit of lead is always set in this loop, so the
+             * sequence is malformed when any of the following holds:
+             *  - the second bit of lead is not set too
+             *  - trail does not start with bits 10
+             *  - none of bits 4 to 1 of lead is set, i.e. the value
+             *    would have fit in a single byte
+             */
+            malformed = ((lead & 0x40) == 0) ||
+                        ((trail & 0xC0) != 0x80) ||
+                        ((lead & 0x1E) == 0);
+
+            if (malformed) {
+                if (!rejected)
+                    fprintf(stderr,
+                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
+                            lead, trail);
+            } else if ((lead & 0xE0) == 0xE0) {
+                /*
+                 * the third bit of lead announces a sequence of at least
+                 * 3 bytes, but only 2 are provided
+                 */
+                if (!rejected)
+                    fprintf(stderr,
+                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
+                            lead, trail);
+            } else if ((lastError != 0) || (doc == NULL)) {
+                /* no error is expected in the remaining cases */
+                fprintf(stderr,
+                    "Failed to parse document for Bytes 0x%02X 0x%02X\n",
+                        lead, trail);
+            }
+
+            if (doc != NULL)
+                xmlFreeDoc(doc);
+        }
+    }
+}
+
+/**
+ * testDocumentRanges:
+ *
+ * Check UTF-8 character handling when parsing complete XML documents.
+ * Byte sequences are injected at the beginning and at the end of a
+ * 4 byte area located in element content (document1) and in an
+ * attribute value (document2), first 1 byte then 2 byte sequences.
+ * Progress is printed on stdout, failures are reported by the helpers.
+ */
+
+static void testDocumentRanges(void) {
+    xmlParserCtxtPtr ctxt;
+    char *area;
+    int len1, len2;
+
+    /* parsing context handed to the injection helpers */
+    ctxt = xmlNewParserCtxt();
+    if (ctxt == NULL) {
+        fprintf(stderr, "Failed to allocate parser context\n");
+        return;
+    }
+
+    /*
+     * The helpers only ever leave non zero bytes in the test areas, so
+     * the document lengths do not change between the runs below.
+     */
+    len1 = strlen(document1);
+    len2 = strlen(document2);
+
+    /* 1 byte injection, element content, beginning of area */
+    printf("testing 1 byte char in document: 1");
+    fflush(stdout);
+    area = &document1[5];
+    memset(area, ' ', 4);
+    testDocumentRangeByte1(ctxt, &document1[0], len1, area, -1, -1);
+
+    /* 1 byte injection, element content, end of area */
+    printf(" 2");
+    fflush(stdout);
+    memset(area, ' ', 4);
+    testDocumentRangeByte1(ctxt, &document1[0], len1, area + 3, -1, -1);
+
+    /* 1 byte injection, attribute value, beginning of area */
+    printf(" 3");
+    fflush(stdout);
+    area = &document2[10];
+    memset(area, ' ', 4);
+    testDocumentRangeByte1(ctxt, &document2[0], len2, area, '\'', -1);
+
+    /* 1 byte injection, attribute value, end of area */
+    printf(" 4");
+    fflush(stdout);
+    memset(area, ' ', 4);
+    testDocumentRangeByte1(ctxt, &document2[0], len2, area + 3, '\'', -1);
+    printf(" done\n");
+
+    /* 2 byte injection, element content, beginning of area */
+    printf("testing 2 byte char in document: 1");
+    fflush(stdout);
+    area = &document1[5];
+    memset(area, ' ', 4);
+    testDocumentRangeByte2(ctxt, &document1[0], len1, area);
+
+    /* 2 byte injection, element content, end of area */
+    printf(" 2");
+    fflush(stdout);
+    memset(area, ' ', 4);
+    testDocumentRangeByte2(ctxt, &document1[0], len1, area + 2);
+
+    /* 2 byte injection, attribute value, beginning of area */
+    printf(" 3");
+    fflush(stdout);
+    area = &document2[10];
+    memset(area, ' ', 4);
+    testDocumentRangeByte2(ctxt, &document2[0], len2, area);
+
+    /* 2 byte injection, attribute value, end of area */
+    printf(" 4");
+    fflush(stdout);
+    memset(area, ' ', 4);
+    testDocumentRangeByte2(ctxt, &document2[0], len2, area + 2);
+    printf(" done\n");
+
+    xmlFreeParserCtxt(ctxt);
+}
+
+/*
+ * Run xmlCurrentChar() on each of the 256 byte values stored at data[0],
+ * followed by zero bytes, and report unexpected results on stderr.
+ * data is expected to be the buffer backing the current input of ctxt.
+ */
+static void testCharRangeByte1(xmlParserCtxtPtr ctxt, char *data) {
+    int byte;
+    int len, c;
+
+    /* only the first byte is under test, clear the 3 following ones */
+    memset(data + 1, 0, 3);
+
+    for (byte = 0; byte <= 0xFF; byte++) {
+        data[0] = byte;
+        ctxt->charset = XML_CHAR_ENCODING_UTF8;
+
+        lastError = 0;
+        c = xmlCurrentChar(ctxt, &len);
+
+        /* NUL and lone bytes with the high bit set must raise an error */
+        if ((byte == 0) || (byte >= 0x80)) {
+            if (lastError != XML_ERR_INVALID_CHAR)
+                fprintf(stderr,
+                    "Failed to detect invalid char for Byte 0x%02X\n", byte);
+            continue;
+        }
+
+        /* a carriage return is expected to be reported as a line feed */
+        if (byte == 0xD) {
+            if ((c != 0xA) || (len != 1))
+                fprintf(stderr,
+                    "Failed to convert char for Byte 0x%02X\n", byte);
+            continue;
+        }
+
+        /* any other byte is returned unchanged with a length of 1 */
+        if ((c != byte) || (len != 1))
+            fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", byte);
+    }
+}
+
+/*
+ * Run xmlCurrentChar() on every pair (lead, trail) with lead in 0x80..0xFF
+ * and trail in 0x00..0xFF stored at data[0] and data[1], followed by zero
+ * bytes, and report unexpected results on stderr.
+ * data is expected to be the buffer backing the current input of ctxt.
+ */
+static void testCharRangeByte2(xmlParserCtxtPtr ctxt, char *data) {
+    int lead, trail;
+    int len, c;
+    int expected;
+
+    data[2] = 0;
+    data[3] = 0;
+    for (lead = 0x80; lead <= 0xFF; lead++) {
+        for (trail = 0; trail <= 0xFF; trail++) {
+            data[0] = lead;
+            data[1] = trail;
+            ctxt->charset = XML_CHAR_ENCODING_UTF8;
+
+            lastError = 0;
+            c = xmlCurrentChar(ctxt, &len);
+
+            /*
+             * The high bit of lead is always set in this loop, the second
+             * bit must then be set too.
+             */
+            if ((lead & 0x40) == 0) {
+                if (lastError != XML_ERR_INVALID_CHAR)
+                    fprintf(stderr,
+                    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
+                            lead, trail);
+                continue;
+            }
+
+            /*
+             * trail must start with bits 10, and a 2 byte encoding must
+             * carry a value which does not fit in one byte, i.e. one of
+             * bits 4 to 1 of lead must be set.
+             */
+            if (((trail & 0xC0) != 0x80) || ((lead & 0x1E) == 0)) {
+                if (lastError != XML_ERR_INVALID_CHAR)
+                    fprintf(stderr,
+                "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
+                            lead, trail, c);
+                continue;
+            }
+
+            /*
+             * the third bit of lead announces a sequence of at least
+             * 3 bytes, but only 2 are provided
+             */
+            if ((lead & 0xE0) == 0xE0) {
+                if (lastError != XML_ERR_INVALID_CHAR)
+                    fprintf(stderr,
+                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
+                            lead, trail);
+                continue;
+            }
+
+            /* no error is expected in the remaining cases */
+            if ((lastError != 0) || (len != 2)) {
+                fprintf(stderr,
+                    "Failed to parse char for Bytes 0x%02X 0x%02X\n",
+                        lead, trail);
+                continue;
+            }
+
+            /* finally the decoded value must be the expected one */
+            expected = (trail & 0x3F) + ((lead & 0x1F) << 6);
+            if (c != expected) {
+                fprintf(stderr,
+            "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
+                        lead, trail, expected, c);
+            }
+        }
+    }
+}
+
+/*
+ * Run xmlCurrentChar() on 3 byte sequences stored at data[0..2] followed
+ * by a zero byte, and report unexpected results on stderr.
+ * The first byte covers 0xE0..0xFF, the second one 0x00..0xFF and the
+ * third one is picked from a small set of edge values (lows).
+ * data is expected to be the buffer backing the current input of ctxt.
+ */
+static void testCharRangeByte3(xmlParserCtxtPtr ctxt, char *data) {
+    int i, j, k, K;
+    int len, c;
+    static const unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
+    int value;
+
+    data[3] = 0;
+    for (i = 0xE0;i <= 0xFF;i++) {
+    for (j = 0;j <= 0xFF;j++) {
+    for (k = 0;k < 6;k++) {
+        data[0] = i;
+        data[1] = j;
+        K = lows[k];
+        data[2] = (char) K;
+        /* code point the sequence would encode if it were well formed */
+        value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
+        ctxt->charset = XML_CHAR_ENCODING_UTF8;
+
+        lastError = 0;
+        c = xmlCurrentChar(ctxt, &len);
+
+        /*
+         * if fourth bit of first char is set, then the sequence would need
+         * at least 4 bytes, but we give only 3 !
+         */
+        if ((i & 0xF0) == 0xF0) {
+            if (lastError != XML_ERR_INVALID_CHAR)
+                fprintf(stderr,
+                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
+                        i, j, K, (unsigned char) data[3]);
+        }
+
+        /*
+         * The second and the third bytes must start with 10
+         */
+        else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
+            if (lastError != XML_ERR_INVALID_CHAR)
+                fprintf(stderr,
+                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
+                        i, j, K);
+        }
+
+        /*
+         * if using a 3 byte encoding then the value must be at least
+         * 0x800, i.e. one of bits 3 to 0 of i must be set or
+         * the 6th bit of data[1] must be set
+         */
+        else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
+            if (lastError != XML_ERR_INVALID_CHAR)
+                fprintf(stderr,
+                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
+                        i, j, K);
+        }
+
+        /*
+         * There are values in that range that are not allowed in XML-1.0
+         */
+        else if (((value > 0xD7FF) && (value <0xE000)) ||
+                 ((value > 0xFFFD) && (value <0x10000))) {
+            if (lastError != XML_ERR_INVALID_CHAR)
+                fprintf(stderr,
+        "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
+                        value, i, j, K);
+        }
+
+        /*
+         * We should see no error in remaining cases
+         */
+        else if ((lastError != 0) || (len != 3)) {
+            fprintf(stderr,
+                "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
+                    i, j, K);
+        }
+
+        /*
+         * Finally check the value is right. The third byte is printed
+         * from K and not from data[2]: a plain char may be signed and
+         * would then be sign extended by the %02X conversion.
+         */
+        else if (c != value) {
+            fprintf(stderr,
+    "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
+                    i, j, K, value, c);
+        }
+    }
+    }
+    }
+}
+
+/*
+ * Run xmlCurrentChar() on 4 byte sequences stored at data[0..3] followed
+ * by a zero byte, and report unexpected results on stderr.
+ * The first byte covers 0xF0..0xFF, the second one 0x00..0xFF, the third
+ * and fourth ones are picked from a small set of edge values (lows).
+ * data is expected to be the buffer backing the current input of ctxt
+ * and must be at least 5 bytes long.
+ */
+static void testCharRangeByte4(xmlParserCtxtPtr ctxt, char *data) {
+    int i, j, k, K, l, L;
+    int len, c;
+    static const unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
+    int value;
+
+    data[4] = 0;
+    for (i = 0xF0;i <= 0xFF;i++) {
+    for (j = 0;j <= 0xFF;j++) {
+    for (k = 0;k < 6;k++) {
+    for (l = 0;l < 6;l++) {
+        data[0] = i;
+        data[1] = j;
+        K = lows[k];
+        data[2] = (char) K;
+        L = lows[l];
+        data[3] = (char) L;
+        /* code point the sequence would encode if it were well formed */
+        value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) +
+                ((i & 0x7) << 18);
+        ctxt->charset = XML_CHAR_ENCODING_UTF8;
+
+        lastError = 0;
+        c = xmlCurrentChar(ctxt, &len);
+
+        /*
+         * if fifth bit of first char is set, then the sequence would need
+         * at least 5 bytes, but we give only 4 !
+         * The bytes are printed from K and L and not from data[]: a plain
+         * char may be signed and would then be sign extended by %02X.
+         */
+        if ((i & 0xF8) == 0xF8) {
+            if (lastError != XML_ERR_INVALID_CHAR)
+                fprintf(stderr,
+                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
+                        i, j, K, L);
+        }
+
+        /*
+         * The second, third and fourth bytes must start with 10
+         */
+        else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
+                 ((L & 0xC0) != 0x80)) {
+            if (lastError != XML_ERR_INVALID_CHAR)
+                fprintf(stderr,
+                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
+                        i, j, K, L);
+        }
+
+        /*
+         * if using a 4 byte encoding then the value must be at least
+         * 0x10000, i.e. one of bits 2 to 0 of i must be set or
+         * the 6th or 5th bit of j must be set
+         */
+        else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
+            if (lastError != XML_ERR_INVALID_CHAR)
+                fprintf(stderr,
+                "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
+                        i, j, K, L);
+        }
+
+        /*
+         * There are values in that range that are not allowed in XML-1.0
+         */
+        else if (((value > 0xD7FF) && (value <0xE000)) ||
+                 ((value > 0xFFFD) && (value <0x10000)) ||
+                 (value > 0x10FFFF)) {
+            if (lastError != XML_ERR_INVALID_CHAR)
+                fprintf(stderr,
+"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
+                        value, i, j, K, L);
+        }
+
+        /*
+         * We should see no error in remaining cases
+         */
+        else if ((lastError != 0) || (len != 4)) {
+            fprintf(stderr,
+                "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
+                    i, j, K, L);
+        }
+
+        /*
+         * Finally check the value is right
+         */
+        else if (c != value) {
+            fprintf(stderr,
+"Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
+                    i, j, K, L, value, c);
+        }
+    }
+    }
+    }
+    }
+}
+
+/**
+ * testCharRanges:
+ *
+ * Test the correct UTF8 character parsing in isolation i.e.
+ * not when parsing a full document, this is less expensive and we can
+ * cover the full range of UTF-8 chars accepted by XML-1.0
+ *
+ * A 5 byte local buffer is installed as the current input of a fresh
+ * parser context, then the 1 to 4 byte sequence tests are run on it.
+ * Progress is printed on stdout, every setup failure is reported on
+ * stderr before giving up.
+ */
+
+static void testCharRanges(void) {
+    char data[5];
+    xmlParserCtxtPtr ctxt;
+    xmlParserInputBufferPtr buf;
+    xmlParserInputPtr input;
+
+    memset(data, 0, 5);
+
+    /*
+     * Set up a parsing context using the above data buffer as
+     * the current input source.
+     */
+    ctxt = xmlNewParserCtxt();
+    if (ctxt == NULL) {
+        fprintf(stderr, "Failed to allocate parser context\n");
+        return;
+    }
+    buf = xmlParserInputBufferCreateStatic(data, sizeof(data),
+                                           XML_CHAR_ENCODING_NONE);
+    if (buf == NULL) {
+        fprintf(stderr, "Failed to allocate input buffer\n");
+        goto error;
+    }
+    input = xmlNewInputStream(ctxt);
+    if (input == NULL) {
+        /* report it like the other failures, the tests are all skipped */
+        fprintf(stderr, "Failed to allocate input stream\n");
+        xmlFreeParserInputBuffer(buf);
+        goto error;
+    }
+    input->filename = NULL;
+    input->buf = buf;
+    input->base = input->buf->buffer->content;
+    input->cur = input->buf->buffer->content;
+    /* the input window covers the 4 data bytes, not the final zero byte */
+    input->end = &input->buf->buffer->content[4];
+    inputPush(ctxt, input);
+
+    printf("testing char range: 1");
+    fflush(stdout);
+    testCharRangeByte1(ctxt, data);
+    printf(" 2");
+    fflush(stdout);
+    testCharRangeByte2(ctxt, data);
+    printf(" 3");
+    fflush(stdout);
+    testCharRangeByte3(ctxt, data);
+    printf(" 4");
+    fflush(stdout);
+    testCharRangeByte4(ctxt, data);
+    printf(" done\n");
+    fflush(stdout);
+
+error:
+    xmlFreeParserCtxt(ctxt);
+}
+
+/*
+ * Entry point: runs the isolated character tests, then the in-document
+ * tests. Failures are only reported as messages on stderr.
+ */
+int main(void) {
+    /*
+     * Initialize the library and check for ABI mismatches between the
+     * headers used at compile time and the shared library in use.
+     */
+    LIBXML_TEST_VERSION
+
+    /* errors are collected by errorHandler instead of being printed */
+    xmlSetStructuredErrorFunc(NULL, errorHandler);
+
+    /* run the two test series */
+    testCharRanges();
+    testDocumentRanges();
+
+    /* release the library global state */
+    xmlCleanupParser();
+
+    /* memory debugging dump used by the regression tests */
+    xmlMemoryDump();
+
+    return 0;
+}
Please sign in to comment.
Something went wrong with that request. Please try again.