From c2ce1269d376918ebe0da18b5fe898ed8a5cd022 Mon Sep 17 00:00:00 2001 From: Dongyu Zhao Date: Mon, 21 Jul 2025 11:07:10 +0800 Subject: [PATCH] Tokenize custom container block --- .../MarkdownCustomContainerBuilder.swift | 73 ++++++++----------- .../SwiftParser/Markdown/MarkdownNodes.swift | 5 +- .../Markdown/MarkdownTokenizer.swift | 54 +++++++++++++- .../SwiftParser/Markdown/MarkdownTokens.swift | 7 +- .../MarkdownTokenizerBasicTests.swift | 10 +++ 5 files changed, 103 insertions(+), 46 deletions(-) diff --git a/Sources/SwiftParser/Markdown/Builders/MarkdownCustomContainerBuilder.swift b/Sources/SwiftParser/Markdown/Builders/MarkdownCustomContainerBuilder.swift index b7e9dba..0564c73 100644 --- a/Sources/SwiftParser/Markdown/Builders/MarkdownCustomContainerBuilder.swift +++ b/Sources/SwiftParser/Markdown/Builders/MarkdownCustomContainerBuilder.swift @@ -4,54 +4,41 @@ public class MarkdownCustomContainerBuilder: CodeNodeBuilder { public init() {} public func build(from context: inout CodeContext) -> Bool { - guard context.consuming + 2 < context.tokens.count, + guard context.consuming < context.tokens.count, isStartOfLine(context), - let c1 = context.tokens[context.consuming] as? MarkdownToken, - let c2 = context.tokens[context.consuming + 1] as? MarkdownToken, - let c3 = context.tokens[context.consuming + 2] as? MarkdownToken, - c1.element == .colon, c2.element == .colon, c3.element == .colon else { return false } - var idx = context.consuming + 3 - var name = "" - while idx < context.tokens.count, - let t = context.tokens[idx] as? MarkdownToken, - t.element != .newline { - name += t.text - idx += 1 - } - name = name.trimmingCharacters(in: .whitespaces) - guard idx < context.tokens.count, - let nl = context.tokens[idx] as? MarkdownToken, - nl.element == .newline else { return false } - idx += 1 - var innerTokens: [any CodeToken] = [] - while idx < context.tokens.count { - if isStartOfLine(index: idx, tokens: context.tokens), - idx + 2 < context.tokens.count, - let e1 = context.tokens[idx] as? MarkdownToken, - let e2 = context.tokens[idx + 1] as? MarkdownToken, - let e3 = context.tokens[idx + 2] as? MarkdownToken, - e1.element == .colon, e2.element == .colon, e3.element == .colon { - idx += 3 - while idx < context.tokens.count, - let t = context.tokens[idx] as? MarkdownToken, - t.element != .newline { idx += 1 } - if idx < context.tokens.count, - let nl2 = context.tokens[idx] as? MarkdownToken, - nl2.element == .newline { idx += 1 } - break - } - innerTokens.append(context.tokens[idx]) - idx += 1 + let token = context.tokens[context.consuming] as? MarkdownToken, + token.element == .customContainer else { return false } + + context.consuming += 1 + + let (name, content) = parseContainer(token.text) + let node = CustomContainerNode(name: name, content: content) + context.current.append(node) + + if context.consuming < context.tokens.count, + let nl = context.tokens[context.consuming] as? MarkdownToken, + nl.element == .newline { + context.consuming += 1 } - context.consuming = idx - var subContext = CodeContext(current: DocumentNode(), tokens: innerTokens) - let children = MarkdownInlineParser.parseInline(&subContext) - let container = CustomContainerNode(name: name) - for c in children { container.append(c) } - context.current.append(container) + return true } + private func parseContainer(_ text: String) -> (String, String) { + var lines = text.split(omittingEmptySubsequences: false, whereSeparator: { $0.isNewline }) + guard !lines.isEmpty else { return ("", "") } + var first = String(lines.removeFirst()) + if let range = first.range(of: ":::") { + first.removeSubrange(range) + } + let name = first.trimmingCharacters(in: CharacterSet.whitespaces) + if let last = lines.last, last.trimmingCharacters(in: CharacterSet.whitespaces).hasPrefix(":::") { + lines.removeLast() + } + let content = lines.joined(separator: "\n") + return (name, content) + } + private func isStartOfLine(_ context: CodeContext) -> Bool { if context.consuming == 0 { return true } if let prev = context.tokens[context.consuming - 1] as? MarkdownToken { diff --git a/Sources/SwiftParser/Markdown/MarkdownNodes.swift b/Sources/SwiftParser/Markdown/MarkdownNodes.swift index 77debfc..c1ec79a 100644 --- a/Sources/SwiftParser/Markdown/MarkdownNodes.swift +++ b/Sources/SwiftParser/Markdown/MarkdownNodes.swift @@ -233,15 +233,18 @@ public class AdmonitionNode: MarkdownNodeBase { public class CustomContainerNode: MarkdownNodeBase { public var name: String + public var content: String - public init(name: String) { + public init(name: String, content: String) { self.name = name + self.content = content super.init(element: .customContainer) } public override func hash(into hasher: inout Hasher) { super.hash(into: &hasher) hasher.combine(name) + hasher.combine(content) } } diff --git a/Sources/SwiftParser/Markdown/MarkdownTokenizer.swift b/Sources/SwiftParser/Markdown/MarkdownTokenizer.swift index fd9d4d1..343a1c1 100644 --- a/Sources/SwiftParser/Markdown/MarkdownTokenizer.swift +++ b/Sources/SwiftParser/Markdown/MarkdownTokenizer.swift @@ -69,8 +69,11 @@ public class MarkdownTokenizer: CodeTokenizer { case "|": addToken(.pipe, text: "|", from: startIndex) - + case ":": + if tokenizeCustomContainer(from: startIndex) { + return + } addToken(.colon, text: ":", from: startIndex) case ";": @@ -1378,6 +1381,55 @@ extension MarkdownTokenizer { return false } + /// Tokenize custom containers starting with ':::' at line start + private func tokenizeCustomContainer(from startIndex: String.Index) -> Bool { + guard isAtLineStart(index: startIndex), match(":::") else { return false } + + var tempIndex = input.index(startIndex, offsetBy: 3) + + // Scan for the closing ':::' at line start + while tempIndex < input.endIndex { + if isAtLineStart(index: tempIndex) && input[tempIndex...].hasPrefix(":::") { + // Move to end of closing line + var end = input.index(tempIndex, offsetBy: 3) + while end < input.endIndex && input[end] != "\n" && input[end] != "\r" { + end = input.index(after: end) + } + if end < input.endIndex { + if input[end] == "\r" { + let next = input.index(after: end) + if next < input.endIndex && input[next] == "\n" { + end = input.index(after: next) + } else { + end = next + } + } else { + end = input.index(after: end) + } + } + let range = startIndex.. Bool { + if index == input.startIndex { return true } + let prev = input[input.index(before: index)] + return prev == "\n" || prev == "\r" + } + // ...existing code... } diff --git a/Sources/SwiftParser/Markdown/MarkdownTokens.swift b/Sources/SwiftParser/Markdown/MarkdownTokens.swift index 5d18d35..64c3066 100644 --- a/Sources/SwiftParser/Markdown/MarkdownTokens.swift +++ b/Sources/SwiftParser/Markdown/MarkdownTokens.swift @@ -65,6 +65,7 @@ public enum MarkdownTokenElement: String, CaseIterable, CodeTokenElement { case htmlEntity = "html_entity" case htmlBlock = "html_block" // Closed HTML block case htmlUnclosedBlock = "html_unclosed_block" // Unclosed HTML block + case customContainer = "custom_container" // ::: custom container block } @@ -234,6 +235,10 @@ public class MarkdownToken: CodeToken { public static func email(_ email: String, at range: Range) -> MarkdownToken { return MarkdownToken(element: .email, text: email, range: range) } + + public static func customContainer(_ text: String, at range: Range) -> MarkdownToken { + return MarkdownToken(element: .customContainer, text: text, range: range) + } } // MARK: - Token Utilities @@ -266,7 +271,7 @@ extension MarkdownToken { /// Check if this token can start a block element public var canStartBlock: Bool { switch element { - case .hash, .gt, .dash, .plus, .asterisk, .tilde, .number, .inlineCode, .fencedCodeBlock, .indentedCodeBlock, .autolink: + case .hash, .gt, .dash, .plus, .asterisk, .tilde, .number, .inlineCode, .fencedCodeBlock, .indentedCodeBlock, .autolink, .customContainer: return true default: return false diff --git a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerBasicTests.swift b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerBasicTests.swift index 5c16b5d..62c1b00 100644 --- a/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerBasicTests.swift +++ b/Tests/SwiftParserTests/Markdown/Tokenizer/MarkdownTokenizerBasicTests.swift @@ -412,6 +412,16 @@ final class MarkdownTokenizerBasicTests: XCTestCase { XCTAssertEqual(firstToken.text, "`", "Should be just the backtick") } + func testCustomContainerTokenization() { + let input = "::: custom\ncontent\n:::" + let tokens = tokenizer.tokenize(input) + + XCTAssertEqual(tokens.count, 2) + XCTAssertEqual(tokens[0].element, .customContainer) + XCTAssertEqual(tokens[0].text, input) + XCTAssertEqual(tokens[1].element, .eof) + } + // MARK: - Edge Cases and Special Scenarios func testEmptyAndWhitespaceInputs() {