From 7e3ada37aced3314741722de50b3a5371a2d06f7 Mon Sep 17 00:00:00 2001 From: Dongyu Zhao Date: Wed, 23 Jul 2025 11:14:36 +0800 Subject: [PATCH] Add MarkdownEOFBuilder and simplify EOF handling --- .../SwiftParser/Core/CodeConstructor.swift | 10 ++++---- Sources/SwiftParser/Core/CodeLanguage.swift | 8 +++++++ Sources/SwiftParser/Core/CodeParser.swift | 11 +++++++-- Sources/SwiftParser/Core/CodeTokenizer.swift | 15 ++++++++---- .../Markdown/MarkdownLanguage.swift | 7 +++++- .../Markdown/Nodes/MarkdownEOFBuilder.swift | 14 +++++++++++ .../MarkdownCodeTokenizerBasicTests.swift | 24 +++++++++++++++---- .../MarkdownCodeTokenizerCodeTests.swift | 6 ++++- ...ownCodeTokenizerCustomContainerTests.swift | 6 ++++- .../MarkdownCodeTokenizerFormulaTests.swift | 6 ++++- .../MarkdownCodeTokenizerHTMLTests.swift | 6 ++++- .../MarkdownTokenizerBasicTests.swift | 6 ++++- .../MarkdownTokenizerComplexTests.swift | 6 ++++- .../MarkdownTokenizerFormulaTests.swift | 6 ++++- .../MarkdownTokenizerHTMLTests.swift | 6 ++++- 15 files changed, 111 insertions(+), 26 deletions(-) create mode 100644 Sources/SwiftParser/Markdown/Nodes/MarkdownEOFBuilder.swift diff --git a/Sources/SwiftParser/Core/CodeConstructor.swift b/Sources/SwiftParser/Core/CodeConstructor.swift index 6c0c97c..6ce270d 100644 --- a/Sources/SwiftParser/Core/CodeConstructor.swift +++ b/Sources/SwiftParser/Core/CodeConstructor.swift @@ -16,7 +16,10 @@ public class CodeConstructor where Node: CodeNodeElement, Token: Co /// - Parameters: /// - builders: The node builders responsible for producing AST nodes. /// - state: Factory returning the initial parsing state object. - public init(builders: [any CodeNodeBuilder], state: @escaping () -> (any CodeConstructState)?) { + public init( + builders: [any CodeNodeBuilder], + state: @escaping () -> (any CodeConstructState)? + ) { self.builders = builders self.state = state } @@ -30,11 +33,6 @@ public class CodeConstructor where Node: CodeNodeElement, Token: Co var context = CodeConstructContext(current: root, tokens: tokens, state: state()) while context.consuming < context.tokens.count { - // Stop at EOF without recording an error - if let token = context.tokens[context.consuming] as? MarkdownToken, - token.element == .eof { - break - } var matched = false for node in builders { diff --git a/Sources/SwiftParser/Core/CodeLanguage.swift b/Sources/SwiftParser/Core/CodeLanguage.swift index 5adf08a..6bf519c 100644 --- a/Sources/SwiftParser/Core/CodeLanguage.swift +++ b/Sources/SwiftParser/Core/CodeLanguage.swift @@ -18,4 +18,12 @@ public protocol CodeLanguage where Node: CodeNodeElement, Token: Co /// The function that creates the initial context for tokenization. func state() -> (any CodeTokenState)? + + /// Provide an EOF token if the language requires one. + /// - Parameter range: The range where the EOF token should be inserted. + func eofToken(at range: Range) -> (any CodeToken)? +} + +extension CodeLanguage { + public func eofToken(at range: Range) -> (any CodeToken)? { nil } } diff --git a/Sources/SwiftParser/Core/CodeParser.swift b/Sources/SwiftParser/Core/CodeParser.swift index 462c16f..0be6dad 100644 --- a/Sources/SwiftParser/Core/CodeParser.swift +++ b/Sources/SwiftParser/Core/CodeParser.swift @@ -30,8 +30,15 @@ public class CodeParser where No public init(language: any CodeLanguage) { self.language = language - self.tokenizer = CodeTokenizer(builders: language.tokens, state: language.state) - self.constructor = CodeConstructor(builders: language.nodes, state: language.state) + self.tokenizer = CodeTokenizer( + builders: language.tokens, + state: language.state, + eofTokenFactory: { language.eofToken(at: $0) } + ) + self.constructor = CodeConstructor( + builders: language.nodes, + state: language.state + ) } /// Parse a source string using the supplied language. diff --git a/Sources/SwiftParser/Core/CodeTokenizer.swift b/Sources/SwiftParser/Core/CodeTokenizer.swift index f11e295..dffe0f1 100644 --- a/Sources/SwiftParser/Core/CodeTokenizer.swift +++ b/Sources/SwiftParser/Core/CodeTokenizer.swift @@ -8,10 +8,16 @@ public class CodeTokenizer where Token: CodeTokenElement { private let builders: [any CodeTokenBuilder] private var state: () -> (any CodeTokenState)? + private let eofTokenFactory: ((Range) -> (any CodeToken)?)? - public init(builders: [any CodeTokenBuilder], state: @escaping () -> (any CodeTokenState)?) { + public init( + builders: [any CodeTokenBuilder], + state: @escaping () -> (any CodeTokenState)?, + eofTokenFactory: ((Range) -> (any CodeToken)?)? = nil + ) { self.builders = builders self.state = state + self.eofTokenFactory = eofTokenFactory } public func tokenize(_ input: String) -> ([any CodeToken], [CodeError]) { @@ -42,10 +48,9 @@ public class CodeTokenizer where Token: CodeTokenElement { } } - // Automatically append EOF token for Markdown - if Token.self == MarkdownTokenElement.self, - let eof = MarkdownToken.eof(at: input.endIndex.. { - context.tokens.append(eof) + // Append EOF token if provided by the language + if let token = eofTokenFactory?(input.endIndex.. (any CodeTokenState)? { nil } + + public func eofToken(at range: Range) -> (any CodeToken)? { + return MarkdownToken.eof(at: range) + } } // MARK: - Language Configuration diff --git a/Sources/SwiftParser/Markdown/Nodes/MarkdownEOFBuilder.swift b/Sources/SwiftParser/Markdown/Nodes/MarkdownEOFBuilder.swift new file mode 100644 index 0000000..b258156 --- /dev/null +++ b/Sources/SwiftParser/Markdown/Nodes/MarkdownEOFBuilder.swift @@ -0,0 +1,14 @@ +import Foundation + +/// Consumes trailing EOF tokens without modifying the AST. +public class MarkdownEOFBuilder: CodeNodeBuilder { + public init() {} + + public func build(from context: inout CodeConstructContext) -> Bool { + guard context.consuming < context.tokens.count, + let token = context.tokens[context.consuming] as? MarkdownToken, + token.element == .eof else { return false } + context.consuming += 1 + return true + } +} diff --git a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerBasicTests.swift b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerBasicTests.swift index 87024b6..a2826cd 100644 --- a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerBasicTests.swift +++ b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerBasicTests.swift @@ -4,7 +4,11 @@ import XCTest final class MarkdownCodeTokenizerBasicTests: XCTestCase { func testHeadingTokenization() { let language = MarkdownLanguage() - let tokenizer = CodeTokenizer(builders: language.tokens, state: language.state) + let tokenizer = CodeTokenizer( + builders: language.tokens, + state: language.state, + eofTokenFactory: { language.eofToken(at: $0) } + ) let (tokens, _) = tokenizer.tokenize("# Title") XCTAssertEqual(tokens.count, 4) XCTAssertEqual(tokens[0].element, .hash) @@ -15,7 +19,11 @@ final class MarkdownCodeTokenizerBasicTests: XCTestCase { func testAutolinkTokenization() { let language = MarkdownLanguage() - let tokenizer = CodeTokenizer(builders: language.tokens, state: language.state) + let tokenizer = CodeTokenizer( + builders: language.tokens, + state: language.state, + eofTokenFactory: { language.eofToken(at: $0) } + ) let (tokens, _) = tokenizer.tokenize("") XCTAssertEqual(tokens.count, 2) XCTAssertEqual(tokens[0].element, .autolink) @@ -25,7 +33,11 @@ final class MarkdownCodeTokenizerBasicTests: XCTestCase { func testBareURLTokenization() { let language = MarkdownLanguage() - let tokenizer = CodeTokenizer(builders: language.tokens, state: language.state) + let tokenizer = CodeTokenizer( + builders: language.tokens, + state: language.state, + eofTokenFactory: { language.eofToken(at: $0) } + ) let (tokens, _) = tokenizer.tokenize("https://example.com") XCTAssertEqual(tokens.count, 2) XCTAssertEqual(tokens[0].element, .url) @@ -34,7 +46,11 @@ final class MarkdownCodeTokenizerBasicTests: XCTestCase { func testBareEmailTokenization() { let language = MarkdownLanguage() - let tokenizer = CodeTokenizer(builders: language.tokens, state: language.state) + let tokenizer = CodeTokenizer( + builders: language.tokens, + state: language.state, + eofTokenFactory: { language.eofToken(at: $0) } + ) let (tokens, _) = tokenizer.tokenize("user@example.com") XCTAssertEqual(tokens.count, 2) XCTAssertEqual(tokens[0].element, .email) diff --git a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerCodeTests.swift b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerCodeTests.swift index 201cd69..1fdd735 100644 --- a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerCodeTests.swift +++ b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerCodeTests.swift @@ -4,7 +4,11 @@ import XCTest final class MarkdownCodeTokenizerCodeTests: XCTestCase { private func tokenize(_ input: String) -> [any CodeToken] { let language = MarkdownLanguage() - let tokenizer = CodeTokenizer(builders: language.tokens, state: language.state) + let tokenizer = CodeTokenizer( + builders: language.tokens, + state: language.state, + eofTokenFactory: { language.eofToken(at: $0) } + ) let (tokens, _) = tokenizer.tokenize(input) return tokens } diff --git a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerCustomContainerTests.swift b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerCustomContainerTests.swift index da17de1..1d372dd 100644 --- a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerCustomContainerTests.swift +++ b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerCustomContainerTests.swift @@ -4,7 +4,11 @@ import XCTest final class MarkdownCodeTokenizerCustomContainerTests: XCTestCase { private func tokenize(_ input: String) -> [any CodeToken] { let language = MarkdownLanguage() - let tokenizer = CodeTokenizer(builders: language.tokens, state: language.state) + let tokenizer = CodeTokenizer( + builders: language.tokens, + state: language.state, + eofTokenFactory: { language.eofToken(at: $0) } + ) let (tokens, _) = tokenizer.tokenize(input) return tokens } diff --git a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerFormulaTests.swift b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerFormulaTests.swift index 8da9cf8..cd1b2f4 100644 --- a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerFormulaTests.swift +++ b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerFormulaTests.swift @@ -4,7 +4,11 @@ import XCTest final class MarkdownCodeTokenizerFormulaTests: XCTestCase { private func tokenize(_ input: String) -> [any CodeToken] { let language = MarkdownLanguage() - let tokenizer = CodeTokenizer(builders: language.tokens, state: language.state) + let tokenizer = CodeTokenizer( + builders: language.tokens, + state: language.state, + eofTokenFactory: { language.eofToken(at: $0) } + ) let (tokens, _) = tokenizer.tokenize(input) return tokens } diff --git a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerHTMLTests.swift b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerHTMLTests.swift index e8f4566..eff0fc6 100644 --- a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerHTMLTests.swift +++ b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownCodeTokenizerHTMLTests.swift @@ -4,7 +4,11 @@ import XCTest final class MarkdownCodeTokenizerHTMLTests: XCTestCase { private func tokenize(_ input: String) -> [any CodeToken] { let language = MarkdownLanguage() - let tokenizer = CodeTokenizer(builders: language.tokens, state: language.state) + let tokenizer = CodeTokenizer( + builders: language.tokens, + state: language.state, + eofTokenFactory: { language.eofToken(at: $0) } + ) let (tokens, _) = tokenizer.tokenize(input) return tokens } diff --git a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerBasicTests.swift b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerBasicTests.swift index 000ffa8..e08eb7c 100644 --- a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerBasicTests.swift +++ b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerBasicTests.swift @@ -8,7 +8,11 @@ final class MarkdownTokenizerBasicTests: XCTestCase { override func setUp() { super.setUp() let language = MarkdownLanguage() - tokenizer = CodeTokenizer(builders: language.tokens, state: language.state) + tokenizer = CodeTokenizer( + builders: language.tokens, + state: language.state, + eofTokenFactory: { language.eofToken(at: $0) } + ) } override func tearDown() { diff --git a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerComplexTests.swift b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerComplexTests.swift index 3409aa5..9c49e21 100644 --- a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerComplexTests.swift +++ b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerComplexTests.swift @@ -8,7 +8,11 @@ final class MarkdownTokenizerComplexTests: XCTestCase { override func setUp() { super.setUp() let language = MarkdownLanguage() - tokenizer = CodeTokenizer(builders: language.tokens, state: language.state) + tokenizer = CodeTokenizer( + builders: language.tokens, + state: language.state, + eofTokenFactory: { language.eofToken(at: $0) } + ) } override func tearDown() { diff --git a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerFormulaTests.swift b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerFormulaTests.swift index 97ab3c6..6a45362 100644 --- a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerFormulaTests.swift +++ b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerFormulaTests.swift @@ -8,7 +8,11 @@ final class MarkdownTokenizerFormulaTests: XCTestCase { override func setUp() { super.setUp() let language = MarkdownLanguage() - tokenizer = CodeTokenizer(builders: language.tokens, state: language.state) + tokenizer = CodeTokenizer( + builders: language.tokens, + state: language.state, + eofTokenFactory: { language.eofToken(at: $0) } + ) } override func tearDown() { diff --git a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerHTMLTests.swift b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerHTMLTests.swift index 99a293b..1299b18 100644 --- a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerHTMLTests.swift +++ b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerHTMLTests.swift @@ -8,7 +8,11 @@ final class MarkdownTokenizerHTMLTests: XCTestCase { override func setUp() { super.setUp() let language = MarkdownLanguage() - tokenizer = CodeTokenizer(builders: language.tokens, state: language.state) + tokenizer = CodeTokenizer( + builders: language.tokens, + state: language.state, + eofTokenFactory: { language.eofToken(at: $0) } + ) } override func tearDown() {