From 4791254195f12f9aa0d1c85a8c8b905c4dba3388 Mon Sep 17 00:00:00 2001 From: Dongyu Zhao Date: Mon, 21 Jul 2025 19:09:21 +0800 Subject: [PATCH] Fix token builder and adjust tests --- .../Markdown/MarkdownLanguage.swift | 2 +- .../TokenBuilders/BacktickTokenBuilder.swift | 54 +++++++++++++++++++ .../TokenBuilders/MarkdownTokenBuilders.swift | 31 +++++++++++ .../TokenBuilders/NumberTokenBuilder.swift | 20 +++++++ .../SingleCharacterTokenBuilder.swift | 24 +++++++++ .../TokenBuilders/TextTokenBuilder.swift | 32 +++++++++++ .../WhitespaceTokenBuilder.swift | 51 ++++++++++++++++++ .../Tokenizer/CodeTokenizerBasicTests.swift | 24 +++++++++ 8 files changed, 237 insertions(+), 1 deletion(-) create mode 100644 Sources/SwiftParser/Markdown/TokenBuilders/BacktickTokenBuilder.swift create mode 100644 Sources/SwiftParser/Markdown/TokenBuilders/MarkdownTokenBuilders.swift create mode 100644 Sources/SwiftParser/Markdown/TokenBuilders/NumberTokenBuilder.swift create mode 100644 Sources/SwiftParser/Markdown/TokenBuilders/SingleCharacterTokenBuilder.swift create mode 100644 Sources/SwiftParser/Markdown/TokenBuilders/TextTokenBuilder.swift create mode 100644 Sources/SwiftParser/Markdown/TokenBuilders/WhitespaceTokenBuilder.swift create mode 100644 Tests/SwiftParserTests/Markdown/Tokenizer/CodeTokenizerBasicTests.swift diff --git a/Sources/SwiftParser/Markdown/MarkdownLanguage.swift b/Sources/SwiftParser/Markdown/MarkdownLanguage.swift index 4a8a465..7ebad79 100644 --- a/Sources/SwiftParser/Markdown/MarkdownLanguage.swift +++ b/Sources/SwiftParser/Markdown/MarkdownLanguage.swift @@ -43,7 +43,7 @@ public class MarkdownLanguage: CodeLanguage { ) { self.outdatedTokenizer = outdatedTokenizer self.nodes = consumers - self.tokens = [] + self.tokens = MarkdownTokenBuilders.commonMarkBase() } // MARK: - Language Protocol Implementation diff --git a/Sources/SwiftParser/Markdown/TokenBuilders/BacktickTokenBuilder.swift b/Sources/SwiftParser/Markdown/TokenBuilders/BacktickTokenBuilder.swift new file mode 100644 index 0000000..6b01bcb --- /dev/null +++ b/Sources/SwiftParser/Markdown/TokenBuilders/BacktickTokenBuilder.swift @@ -0,0 +1,54 @@ +import Foundation + +struct BacktickTokenBuilder: CodeTokenBuilder { + typealias Element = MarkdownTokenElement + + func build(from context: CodeTokenContext) -> Bool { + guard context.consuming < context.source.endIndex else { return false } + guard context.source[context.consuming] == "`" else { return false } + + // Count consecutive backticks + var idx = context.consuming + var tickCount = 0 + while idx < context.source.endIndex && context.source[idx] == "`" { + tickCount += 1 + idx = context.source.index(after: idx) + } + let start = context.consuming + var end = idx + var foundClosing = false + + // Search for closing sequence of same length + while end < context.source.endIndex { + if context.source[end] == "`" { + var check = end + var count = 0 + while check < context.source.endIndex && context.source[check] == "`" && count < tickCount { + count += 1 + check = context.source.index(after: check) + } + if count == tickCount { + end = check + foundClosing = true + break + } + } + end = context.source.index(after: end) + } + + if !foundClosing { + // No closing delimiter - treat first backtick as text + return false + } + + context.consuming = end + let range = start..= 3 { + context.tokens.append(MarkdownToken.fencedCodeBlock(text, at: range)) + } else { + context.tokens.append(MarkdownToken.inlineCode(text, at: range)) + } + return true + } +} diff --git a/Sources/SwiftParser/Markdown/TokenBuilders/MarkdownTokenBuilders.swift b/Sources/SwiftParser/Markdown/TokenBuilders/MarkdownTokenBuilders.swift new file mode 100644 index 0000000..bb165c5 --- /dev/null +++ b/Sources/SwiftParser/Markdown/TokenBuilders/MarkdownTokenBuilders.swift @@ -0,0 +1,31 @@ +import Foundation + +enum MarkdownTokenBuilders { + static func commonMarkBase() -> [any CodeTokenBuilder] { + var builders: [any CodeTokenBuilder] = [] + // Special structures first + builders.append(BacktickTokenBuilder()) + // Whitespace + builders.append(WhitespaceTokenBuilder(character: " ", element: .space)) + builders.append(WhitespaceTokenBuilder(character: "\t", element: .tab)) + builders.append(WhitespaceTokenBuilder(character: "\n", element: .newline)) + builders.append(WhitespaceTokenBuilder(character: "\r", element: .carriageReturn)) + // Single character tokens + let singles: [(Character, MarkdownTokenElement)] = [ + ("#", .hash), ("*", .asterisk), ("_", .underscore), ("-", .dash), + ("+", .plus), ("=", .equals), ("~", .tilde), ("^", .caret), + ("|", .pipe), (":", .colon), (";", .semicolon), ("!", .exclamation), + ("?", .question), (".", .dot), (",", .comma), (">", .gt), ("<", .lt), + ("&", .ampersand), ("\\", .backslash), ("/", .forwardSlash), + ("\"", .quote), ("'", .singleQuote), ("[", .leftBracket), ("]", .rightBracket), + ("(", .leftParen), (")", .rightParen), ("{", .leftBrace), ("}", .rightBrace) + ] + for (char, element) in singles { + builders.append(SingleCharacterTokenBuilder(character: char, element: element)) + } + // Numbers and text + builders.append(NumberTokenBuilder()) + builders.append(TextTokenBuilder()) + return builders + } +} diff --git a/Sources/SwiftParser/Markdown/TokenBuilders/NumberTokenBuilder.swift b/Sources/SwiftParser/Markdown/TokenBuilders/NumberTokenBuilder.swift new file mode 100644 index 0000000..6dd0609 --- /dev/null +++ b/Sources/SwiftParser/Markdown/TokenBuilders/NumberTokenBuilder.swift @@ -0,0 +1,20 @@ +import Foundation + +struct NumberTokenBuilder: CodeTokenBuilder { + typealias Element = MarkdownTokenElement + func build(from context: CodeTokenContext) -> Bool { + guard context.consuming < context.source.endIndex else { return false } + var idx = context.consuming + var hasDigit = false + while idx < context.source.endIndex && context.source[idx].isNumber { + idx = context.source.index(after: idx) + hasDigit = true + } + guard hasDigit else { return false } + let range = context.consuming..) -> Bool { + guard context.consuming < context.source.endIndex else { return false } + if context.source[context.consuming] == character { + let start = context.consuming + context.consuming = context.source.index(after: start) + let token = MarkdownToken(element: element, text: String(character), range: start..) -> Bool { + guard context.consuming < context.source.endIndex else { return false } + var idx = context.consuming + while idx < context.source.endIndex { + let c = context.source[idx] + if isSpecial(c) { break } + idx = context.source.index(after: idx) + } + guard idx > context.consuming else { return false } + let range = context.consuming.. Bool { + switch char { + case "#", "*", "_", "`", "-", "+", "=", "~", "^", "|", ":", ";", "!", "?", ".", ",", ">", "<", "&", "\\", "/", "\"", "'", "[", "]", "(", ")", "{", "}", "$": + return true + case " ", "\t", "\n", "\r": + return true + default: + return false + } + } +} diff --git a/Sources/SwiftParser/Markdown/TokenBuilders/WhitespaceTokenBuilder.swift b/Sources/SwiftParser/Markdown/TokenBuilders/WhitespaceTokenBuilder.swift new file mode 100644 index 0000000..1015496 --- /dev/null +++ b/Sources/SwiftParser/Markdown/TokenBuilders/WhitespaceTokenBuilder.swift @@ -0,0 +1,51 @@ +import Foundation + +struct WhitespaceTokenBuilder: CodeTokenBuilder { + typealias Element = MarkdownTokenElement + private let character: Character + private let element: MarkdownTokenElement + + init(character: Character, element: MarkdownTokenElement) { + self.character = character + self.element = element + } + + func build(from context: CodeTokenContext) -> Bool { + guard context.consuming < context.source.endIndex else { return false } + if element == .newline || element == .carriageReturn { + return buildNewline(from: context) + } + if context.source[context.consuming] == character { + let start = context.consuming + context.consuming = context.source.index(after: start) + let token = MarkdownToken(element: element, text: String(character), range: start..) -> Bool { + let index = context.consuming + let char = context.source[index] + if char == "\n" { + context.consuming = context.source.index(after: index) + let token = MarkdownToken.newline(at: index..! + + override func setUp() { + let language = MarkdownLanguage() + tokenizer = CodeTokenizer(language: language) + } + + func testSingleCharacterToken() { + let tokens = tokenizer.tokenize("#") + XCTAssertEqual(tokens.count, 1) + XCTAssertEqual(tokens[0].element, .hash) + } + + func testInlineCode() { + let tokens = tokenizer.tokenize("`code`") + XCTAssertFalse(tokens.isEmpty) + XCTAssertEqual(tokens[0].element, .inlineCode) + XCTAssertEqual(tokens[0].text, "`code`") + } +}