Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
From 0cd2add6c46400b808329442f81451b369863983 Mon Sep 17 00:00:00 2001
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
Date: Sat, 26 Aug 2023 15:08:59 +0200
Subject: [PATCH 1/6] Expose line and column information for use in PHP

---
source/lexbor/dom/interfaces/node.h | 2 ++
source/lexbor/html/token.h | 2 ++
source/lexbor/html/tokenizer.c | 24 +++++++++++++++++++++++-
source/lexbor/html/tokenizer.h | 2 ++
source/lexbor/html/tokenizer/state.h | 2 ++
source/lexbor/html/tree.c | 11 +++++++++++
source/lexbor/html/tree/error.c | 5 +++--
source/lexbor/html/tree/error.h | 5 +++--
8 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/source/lexbor/dom/interfaces/node.h b/source/lexbor/dom/interfaces/node.h
index 6c74ac5..b95373c 100644
--- a/source/lexbor/dom/interfaces/node.h
+++ b/source/lexbor/dom/interfaces/node.h
@@ -86,6 +86,8 @@ struct lxb_dom_node {

lxb_dom_node_type_t type;

+ size_t line;
+
#ifdef LXB_DOM_NODE_USER_VARIABLES
LXB_DOM_NODE_USER_VARIABLES
#endif /* LXB_DOM_NODE_USER_VARIABLES */
diff --git a/source/lexbor/html/token.h b/source/lexbor/html/token.h
index 79accd0..0b7f4fd 100644
--- a/source/lexbor/html/token.h
+++ b/source/lexbor/html/token.h
@@ -33,6 +33,8 @@ enum lxb_html_token_type {
typedef struct {
const lxb_char_t *begin;
const lxb_char_t *end;
+ size_t line;
+ size_t column;

const lxb_char_t *text_start;
const lxb_char_t *text_end;
diff --git a/source/lexbor/html/tokenizer.c b/source/lexbor/html/tokenizer.c
index 22b88ed..1d9f378 100644
--- a/source/lexbor/html/tokenizer.c
+++ b/source/lexbor/html/tokenizer.c
@@ -92,6 +92,7 @@ lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)

tkz->pos = tkz->start;
tkz->end = tkz->start + LXB_HTML_TKZ_TEMP_SIZE;
+ /* current_line & current_column already initialized by calloc (zero-based) */

tkz->tree = NULL;
tkz->tags = NULL;
@@ -153,6 +154,8 @@ lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to,
tkz_to->start = tkz_from->start;
tkz_to->end = tkz_from->end;
tkz_to->pos = tkz_to->start;
+ tkz_to->current_line = tkz_from->current_line;
+ tkz_to->current_column = tkz_from->current_column;

return LXB_STATUS_OK;
}
@@ -571,7 +574,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
tkz->last = end;

while (data < end) {
- data = tkz->state(tkz, data, end);
+ size_t current_column = tkz->current_column;
+ const lxb_char_t *new_data = tkz->state(tkz, data, end);
+ while (data < new_data) {
+ /* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
+ if (*data == '\n') {
+ tkz->current_line++;
+ current_column = 0;
+ } else {
+ /* Other characters can be mapped back to the unicode codepoint offset because UTF-8 is a prefix code.
+ * Continuation bytes start with 0b10XXXXXX so we can skip those to only get the start of an encoded code point. */
+ if ((*data & 0b11000000) == 0b10000000) {
+ /* Continuation byte, do nothing */
+ } else {
+ /* First byte for a codepoint */
+ current_column++;
+ }
+ }
+ data++;
+ }
+ tkz->current_column = current_column;
}

return tkz->status;
diff --git a/source/lexbor/html/tokenizer.h b/source/lexbor/html/tokenizer.h
index 12b7c81..aa1ac37 100644
--- a/source/lexbor/html/tokenizer.h
+++ b/source/lexbor/html/tokenizer.h
@@ -79,6 +79,8 @@ struct lxb_html_tokenizer {
const lxb_char_t *end;
const lxb_char_t *begin;
const lxb_char_t *last;
+ size_t current_line;
+ size_t current_column;

/* Entities */
const lexbor_sbst_entry_static_t *entity;
diff --git a/source/lexbor/html/tokenizer/state.h b/source/lexbor/html/tokenizer/state.h
index 5e91444..52eaa9a 100644
--- a/source/lexbor/html/tokenizer/state.h
+++ b/source/lexbor/html/tokenizer/state.h
@@ -90,6 +90,8 @@ extern "C" {
do { \
tkz->pos = tkz->start; \
tkz->token->begin = v_begin; \
+ tkz->token->line = tkz->current_line; \
+ tkz->token->column = tkz->current_column; \
} \
while (0)

diff --git a/source/lexbor/html/tree.c b/source/lexbor/html/tree.c
index 062ea56..3f4c18d 100644
--- a/source/lexbor/html/tree.c
+++ b/source/lexbor/html/tree.c
@@ -431,6 +431,9 @@ lxb_html_tree_create_element_for_token(lxb_html_tree_t *tree,
return NULL;
}

+ node->line = token->line;
+ /* We only expose line number in PHP DOM */
+
lxb_status_t status;
lxb_dom_element_t *element = lxb_dom_interface_element(node);

@@ -767,6 +770,11 @@ lxb_html_tree_insert_character_for_data(lxb_html_tree_t *tree,

lxb_dom_interface_text(text)->char_data.data = *str;

+ if (tree->tkz_ref) {
+ text->line = tree->tkz_ref->token->line;
+ /* We only expose line number in PHP DOM */
+ }
+
if (ret_node != NULL) {
*ret_node = text;
}
@@ -806,6 +814,9 @@ lxb_html_tree_insert_comment(lxb_html_tree_t *tree,
return NULL;
}

+ node->line = token->line;
+ /* We only expose line number in PHP DOM */
+
tree->status = lxb_html_token_make_text(token, &comment->char_data.data,
tree->document->dom_document.text);
if (tree->status != LXB_STATUS_OK) {
diff --git a/source/lexbor/html/tree/error.c b/source/lexbor/html/tree/error.c
index ffdc55c..ef36eab 100644
--- a/source/lexbor/html/tree/error.c
+++ b/source/lexbor/html/tree/error.c
@@ -22,8 +22,9 @@ lxb_html_tree_error_add(lexbor_array_obj_t *parse_errors,
}

entry->id = id;
- entry->begin = token->begin;
- entry->end = token->end;
+ entry->line = token->line;
+ entry->column = token->column;
+ entry->length = token->end - token->begin;

return entry;
}
diff --git a/source/lexbor/html/tree/error.h b/source/lexbor/html/tree/error.h
index 7a212af..b186772 100644
--- a/source/lexbor/html/tree/error.h
+++ b/source/lexbor/html/tree/error.h
@@ -109,8 +109,9 @@ lxb_html_tree_error_id_t;

typedef struct {
lxb_html_tree_error_id_t id;
- const lxb_char_t *begin;
- const lxb_char_t *end;
+ size_t line;
+ size_t column;
+ size_t length;
}
lxb_html_tree_error_t;

--
2.51.2

Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
From a4c29ba8d1ea1065ce6bd4a34382d53140cf1924 Mon Sep 17 00:00:00 2001
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
Date: Mon, 14 Aug 2023 20:18:51 +0200
Subject: [PATCH 2/6] Track implied added nodes for options use in PHP

---
source/lexbor/html/tree.h | 3 +++
source/lexbor/html/tree/insertion_mode/after_head.c | 1 +
source/lexbor/html/tree/insertion_mode/before_head.c | 2 ++
source/lexbor/html/tree/insertion_mode/before_html.c | 2 ++
4 files changed, 8 insertions(+)

diff --git a/source/lexbor/html/tree.h b/source/lexbor/html/tree.h
index 4912efb..7b2c620 100644
--- a/source/lexbor/html/tree.h
+++ b/source/lexbor/html/tree.h
@@ -55,6 +55,9 @@ struct lxb_html_tree {
bool foster_parenting;
bool frameset_ok;
bool scripting;
+ bool has_explicit_html_tag;
+ bool has_explicit_head_tag;
+ bool has_explicit_body_tag;

lxb_html_tree_insertion_mode_f mode;
lxb_html_tree_insertion_mode_f original_mode;
diff --git a/source/lexbor/html/tree/insertion_mode/after_head.c b/source/lexbor/html/tree/insertion_mode/after_head.c
index ad551b5..1448654 100644
--- a/source/lexbor/html/tree/insertion_mode/after_head.c
+++ b/source/lexbor/html/tree/insertion_mode/after_head.c
@@ -71,6 +71,7 @@ lxb_html_tree_insertion_mode_after_head_open(lxb_html_tree_t *tree,
return lxb_html_tree_process_abort(tree);
}

+ tree->has_explicit_body_tag = true;
tree->frameset_ok = false;
tree->mode = lxb_html_tree_insertion_mode_in_body;

diff --git a/source/lexbor/html/tree/insertion_mode/before_head.c b/source/lexbor/html/tree/insertion_mode/before_head.c
index 14621f2..cd2ac2a 100644
--- a/source/lexbor/html/tree/insertion_mode/before_head.c
+++ b/source/lexbor/html/tree/insertion_mode/before_head.c
@@ -67,6 +67,8 @@ lxb_html_tree_insertion_mode_before_head_open(lxb_html_tree_t *tree,
return lxb_html_tree_process_abort(tree);
}

+ tree->has_explicit_head_tag = true;
+
tree->mode = lxb_html_tree_insertion_mode_in_head;

break;
diff --git a/source/lexbor/html/tree/insertion_mode/before_html.c b/source/lexbor/html/tree/insertion_mode/before_html.c
index 05fe738..1e09cda 100644
--- a/source/lexbor/html/tree/insertion_mode/before_html.c
+++ b/source/lexbor/html/tree/insertion_mode/before_html.c
@@ -78,6 +78,8 @@ lxb_html_tree_insertion_mode_before_html_open(lxb_html_tree_t *tree,
return lxb_html_tree_process_abort(tree);
}

+ tree->has_explicit_html_tag = true;
+
tree->mode = lxb_html_tree_insertion_mode_before_head;

break;
--
2.51.2

Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
From 46fc776449252e74795569759a19d13857a59069 Mon Sep 17 00:00:00 2001
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
Date: Thu, 24 Aug 2023 22:57:48 +0200
Subject: [PATCH 3/6] Patch utilities and data structure to be able to generate
smaller lookup tables

Change the generation script to check that everything fits in 32 bits,
and change the actual field types to 32 bits. This decreases the size
of the hash tables.
---
source/lexbor/core/shs.h | 4 ++--
utils/lexbor/encoding/single-byte.py | 4 ++--
utils/lexbor/lexbor/LXB.py | 12 +++++++++---
3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/source/lexbor/core/shs.h b/source/lexbor/core/shs.h
index 7a63a07..c84dfaa 100644
--- a/source/lexbor/core/shs.h
+++ b/source/lexbor/core/shs.h
@@ -27,9 +27,9 @@ lexbor_shs_entry_t;

typedef struct {
uint32_t key;
- void *value;
+ uint32_t value;

- size_t next;
+ uint32_t next;
}
lexbor_shs_hash_t;

diff --git a/utils/lexbor/encoding/single-byte.py b/utils/lexbor/encoding/single-byte.py
index d7d1bb2..5420c16 100755
--- a/utils/lexbor/encoding/single-byte.py
+++ b/utils/lexbor/encoding/single-byte.py
@@ -128,7 +128,7 @@ class SingleByte:
entries = values[idx]
key_id = entries[1].decode('utf-8')

- hash_key.append(key_id, '(void *) {}'.format(idx + 0x80))
+ hash_key.append(key_id, idx + 0x80)

return hash_key.create(rate = 1)

@@ -161,7 +161,7 @@ def toHex(s):
lst = []

for ch in bytes(s, 'utf-8'):
- hv = hex(ch).replace('0x', '\\\\x')
+ hv = hex(ch).replace('0x', '\\x')
lst.append("'{}'".format(hv))

return ', '.join(lst)
diff --git a/utils/lexbor/lexbor/LXB.py b/utils/lexbor/lexbor/LXB.py
index 3e75812..2370c66 100755
--- a/utils/lexbor/lexbor/LXB.py
+++ b/utils/lexbor/lexbor/LXB.py
@@ -94,7 +94,7 @@ class HashKey:
def append(self, key_id, value):
self.buffer.append([self.hash_id(int(key_id, 0)), value])

- def create(self, terminate_value = '{0, NULL, 0}', rate = 2, is_const = True, data_before = None):
+ def create(self, terminate_value = '{0, 0, 0}', rate = 2, is_const = True, data_before = None):
test = self.test(int(self.max_table_size / 1.2), int(self.max_table_size * 1.2))

rate_dn = rate - 1
@@ -142,9 +142,12 @@ class HashKey:
entry = table[idx]

if entry:
+ assert entry[0] < 2**32
+ assert entry[1] < 2**32
+ assert entry[2] < 2**32
result.append("{{{}, {}, {}}},".format(entry[0], entry[1], entry[2]))
else:
- result.append("{0, NULL, 0},")
+ result.append("{0, 0, 0},")

if int(idx) % rate == rate_dn:
result.append("\n ")
@@ -154,9 +157,12 @@ class HashKey:
if len(table):
entry = table[-1]
if entry:
+ assert entry[0] < 2**32
+ assert entry[1] < 2**32
+ assert entry[2] < 2**32
result.append("{{{}, {}, {}}}\n".format(entry[0], entry[1], entry[2]))
else:
- result.append("{0, NULL, 0}\n")
+ result.append("{0, 0, 0}\n")

result.append("};")

--
2.51.2

Loading