Skip to content
309 changes: 136 additions & 173 deletions MARKDOWN_PARSER.md

Large diffs are not rendered by default.

28 changes: 23 additions & 5 deletions Sources/SwiftParser/Core/CodeContext.swift
Original file line number Diff line number Diff line change
@@ -1,13 +1,31 @@
import Foundation

public struct CodeContext {
public var tokens: [any CodeToken]
public var currentNode: CodeNode
/// Language-specific state that can be attached to a `CodeContext` through its `state` property.
///
/// The protocol declares no requirements beyond its two associated types; it only ties a
/// state object to the node and token element types it is used with.
public protocol CodeContextState<Node, Token> where Node: CodeNodeElement, Token: CodeTokenElement {
/// The node element type this state is associated with.
associatedtype Node: CodeNodeElement
/// The token element type this state is associated with.
associatedtype Token: CodeTokenElement
}

public class CodeContext<Node, Token> where Node: CodeNodeElement, Token: CodeTokenElement {
/// The current node being processed in the context
public var current: CodeNode<Node>

/// The tokens that need to be processed in this context
public var tokens: [any CodeToken<Token>]

/// The index of the next token to consume
public var consuming: Int

/// Any errors encountered during processing
public var errors: [CodeError]

public init(tokens: [any CodeToken], currentNode: CodeNode, errors: [CodeError]) {
/// The state of the processing, which can hold additional information
public var state: (any CodeContextState<Node, Token>)?

public init(current: CodeNode<Node>, tokens: [any CodeToken<Token>], consuming: Int = 0, state: (any CodeContextState<Node, Token>)? = nil, errors: [CodeError] = []) {
self.current = current
self.tokens = tokens
self.currentNode = currentNode
self.consuming = consuming
self.state = state
self.errors = errors
}
}
3 changes: 0 additions & 3 deletions Sources/SwiftParser/Core/CodeElement.swift

This file was deleted.

13 changes: 9 additions & 4 deletions Sources/SwiftParser/Core/CodeLanguage.swift
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import Foundation

public protocol CodeLanguage {
var tokenizer: CodeTokenizer { get }
var consumers: [CodeTokenConsumer] { get }
var rootElement: any CodeElement { get }
/// Describes a language to `CodeParser`: how its input is tokenized and which builders
/// turn the resulting tokens into nodes.
public protocol CodeLanguage<Node, Token> where Node: CodeNodeElement, Token: CodeTokenElement {
/// The element type of the nodes built for this language.
associatedtype Node: CodeNodeElement
/// The element type of the tokens produced for this language.
associatedtype Token: CodeTokenElement

/// The tokenizer `CodeParser` uses to split its (line-ending-normalized) input into tokens.
var tokenizer: any CodeTokenizer<Token> { get }
/// The builders `CodeParser` tries, in array order, at each parse position.
/// The first builder that returns `true` wins and the remaining ones are not consulted.
var builders: [any CodeNodeBuilder<Node, Token>] { get }

/// Returns a root node for `content`.
/// NOTE(review): the visible `CodeParser.parse` receives its root as an argument and does not
/// call this method — presumably callers use it to create that root; confirm.
func root(of content: String) -> CodeNode<Node>
/// Returns the optional state that `CodeParser` stores in the `CodeContext` it creates for `content`.
func state(of content: String) -> (any CodeContextState<Node, Token>)?
}
72 changes: 41 additions & 31 deletions Sources/SwiftParser/Core/CodeNode.swift
Original file line number Diff line number Diff line change
@@ -1,71 +1,81 @@
import Foundation

public class CodeNode {
public let type: any CodeElement
public var value: String
public weak var parent: CodeNode?
public var children: [CodeNode] = []
public var range: Range<String.Index>?
/// The set of node kinds for a language: a `CaseIterable` type with `String` raw values.
/// `CodeNode.hash(into:)` hashes a node's element through its `rawValue`.
public protocol CodeNodeElement: CaseIterable, RawRepresentable where RawValue == String {}

public class CodeNode<Node> where Node: CodeNodeElement {
public let element: Node
public weak var parent: CodeNode<Node>?
public var children: [CodeNode<Node>] = []

/// The node's id relies on its element and children
public var id: Int {
var hasher = Hasher()
hasher.combine(String(describing: type))
hasher.combine(value)
hash(into: &hasher)
for child in children {
hasher.combine(child.id)
}
return hasher.finalize()
}

public init(type: any CodeElement, value: String, range: Range<String.Index>? = nil) {
self.type = type
self.value = value
self.range = range
/// Creates a node of the given element kind; it starts with no parent and no children.
/// - Parameter element: The kind of node to create.
public init(element: Node) {
self.element = element
}

/// Feeds this node's own content into `hasher`; `id` calls this before mixing in the ids of the children.
///
/// The base implementation hashes only the element's raw value. Some structural nodes carry no
/// further hashable content, so the method is left open for subclasses to add their own fields.
/// - Parameter hasher: The hasher to feed.
open func hash(into hasher: inout Hasher) {
hasher.combine(element.rawValue)
}

public func addChild(_ node: CodeNode) {
// MARK: - Child management

/// Appends `node` as the last child and makes this node its parent.
public func append(_ node: CodeNode<Node>) {
    children.append(node)
    node.parent = self
}

/// Insert a child node at the specified index
public func insertChild(_ node: CodeNode, at index: Int) {
/// Places `node` into `children` at `index` and makes this node its parent.
public func insert(_ node: CodeNode<Node>, at index: Int) {
    children.insert(node, at: index)
    node.parent = self
}

/// Remove and return the child node at the given index
@discardableResult
public func removeChild(at index: Int) -> CodeNode {
public func remove(at index: Int) -> CodeNode<Node> {
    // Take the child out of the list first, then clear its back-reference.
    let detached = children[index]
    children.remove(at: index)
    detached.parent = nil
    return detached
}

/// Removes this node from its parent's children and clears the parent link.
/// Does nothing when the node has no parent.
public func remove() {
    guard let owner = parent else { return }
    owner.children.removeAll(where: { $0 === self })
    parent = nil
}

/// Replace the child node at the given index with another node
public func replaceChild(at index: Int, with node: CodeNode) {
public func replace(at index: Int, with node: CodeNode<Node>) {
    // Unlink the outgoing child before the incoming node takes its slot.
    let outgoing = children[index]
    outgoing.parent = nil
    node.parent = self
    children[index] = node
}

/// Detach this node from its parent
public func removeFromParent() {
parent?.children.removeAll { $0 === self }
parent = nil
}
// MARK: - Traversal and Searching

/// Depth-first traversal of this node and all descendants
public func traverseDepthFirst(_ visit: (CodeNode) -> Void) {
public func dfs(_ visit: (CodeNode<Node>) -> Void) {
visit(self)
for child in children {
child.traverseDepthFirst(visit)
child.dfs(visit)
}
}

/// Breadth-first traversal of this node and all descendants
public func traverseBreadthFirst(_ visit: (CodeNode) -> Void) {
var queue: [CodeNode] = [self]
public func bfs(_ visit: (CodeNode<Node>) -> Void) {
var queue: [CodeNode<Node>] = [self]
while !queue.isEmpty {
let node = queue.removeFirst()
visit(node)
Expand All @@ -74,7 +84,7 @@ public class CodeNode {
}

/// Return the first node in the subtree satisfying the predicate
public func first(where predicate: (CodeNode) -> Bool) -> CodeNode? {
public func first(where predicate: (CodeNode<Node>) -> Bool) -> CodeNode<Node>? {
if predicate(self) { return self }
for child in children {
if let result = child.first(where: predicate) {
Expand All @@ -85,17 +95,17 @@ public class CodeNode {
}

/// Return all nodes in the subtree satisfying the predicate
public func findAll(where predicate: (CodeNode) -> Bool) -> [CodeNode] {
var results: [CodeNode] = []
traverseDepthFirst { node in
public func nodes(where predicate: (CodeNode<Node>) -> Bool) -> [CodeNode<Node>] {
    // Matches are collected in the order `dfs` visits them (this node first, then each subtree).
    var matches: [CodeNode<Node>] = []
    dfs { candidate in
        guard predicate(candidate) else { return }
        matches.append(candidate)
    }
    return matches
}

/// Number of nodes in this subtree including this node
public var subtreeCount: Int {
1 + children.reduce(0) { $0 + $1.subtreeCount }
public var count: Int {
    // Start at one for this node, then add the size of every child subtree.
    var total = 1
    for child in children {
        total += child.count
    }
    return total
}

/// Depth of this node from the root node
Expand Down
11 changes: 11 additions & 0 deletions Sources/SwiftParser/Core/CodeNodeBuilder.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import Foundation

/// Consumes tokens from a `CodeContext` to build a tree of nodes.
///
/// `CodeParser` tries a language's builders in order at each position and stops at the first
/// one whose `build(from:)` returns `true`.
public protocol CodeNodeBuilder<Node, Token> where Node: CodeNodeElement, Token: CodeTokenElement {
/// The element type of the nodes this builder creates.
associatedtype Node: CodeNodeElement
/// The element type of the tokens this builder reads.
associatedtype Token: CodeTokenElement

/// Attempt to build part of the AST from the context.
///
/// - Parameter context: The parse context holding the tokens, the index of the next token to
///   consume, the current node and the collected errors.
/// - Returns: `true` if the builder successfully consumed tokens and updated the context.
///   The parser loop does not advance `context.consuming` itself after a `true` result, so a
///   builder that returns `true` is expected to have moved it forward. When every builder
///   returns `false`, the parser records an error and skips one token.
func build(from context: inout CodeContext<Node, Token>) -> Bool
}
59 changes: 30 additions & 29 deletions Sources/SwiftParser/Core/CodeParser.swift
Original file line number Diff line number Diff line change
@@ -1,50 +1,51 @@
import Foundation

public final class CodeParser {
private var consumers: [CodeTokenConsumer]
private let tokenizer: CodeTokenizer
public final class CodeParser<Node, Token> where Node: CodeNodeElement, Token: CodeTokenElement {
private let language: any CodeLanguage<Node, Token>

// Registered state is now reset for each parse run

public init(language: CodeLanguage) {
self.tokenizer = language.tokenizer
self.consumers = language.consumers
public init(language: any CodeLanguage<Node, Token>) {
self.language = language
}

public func parse(_ input: String, root: CodeNode<Node>) -> (node: CodeNode<Node>, context: CodeContext<Node, Token>) {
let normalized = normalize(input)
let tokens = language.tokenizer.tokenize(normalized)
var context = CodeContext(current: root, tokens: tokens, state: language.state(of: normalized))


public func parse(_ input: String, rootNode: CodeNode) -> (node: CodeNode, context: CodeContext) {
let tokens = tokenizer.tokenize(input)
var context = CodeContext(tokens: tokens, currentNode: rootNode, errors: [])

// Infinite loop protection: track token count progression
var lastCount = context.tokens.count + 1

while let token = context.tokens.first {
// Infinite loop detection - if token count hasn't decreased, terminate parsing immediately
if context.tokens.count == lastCount {
context.errors.append(CodeError("Infinite loop detected: parser stuck at token \(token.kindDescription). Terminating parse to prevent hang.", range: token.range))
while context.consuming < context.tokens.count {
// Stop at EOF without recording an error
if let token = context.tokens[context.consuming] as? MarkdownToken,
token.element == .eof {
break
}
lastCount = context.tokens.count

if token.kindDescription == "eof" {
break
}
var matched = false
for consumer in consumers {
if consumer.consume(context: &context, token: token) {
for builder in language.builders {
if builder.build(from: &context) {
matched = true
break
}
}

if !matched {
context.errors.append(CodeError("Unrecognized token \(token.kindDescription)", range: token.range))
context.tokens.removeFirst()
// If no builder matched, record an error and skip the token
let token = context.tokens[context.consuming]
let error = CodeError("Unrecognized token: \(token.element)", range: token.range)
context.errors.append(error)
context.consuming += 1
}
}

return (rootNode, context)
return (root, context)
}

/// Rewrites every line ending in `raw` to a single LF (`\n`).
/// Tokenization then sees the same text regardless of the platform the input came from.
private func normalize(_ raw: String) -> String {
    // Order matters: Windows CRLF pairs are rewritten before lone classic-Mac CRs,
    // so each pair collapses to exactly one LF.
    let lineEndings = ["\r\n", "\r"]
    return lineEndings.reduce(raw) { text, ending in
        text.replacingOccurrences(of: ending, with: "\n")
    }
}
}
7 changes: 5 additions & 2 deletions Sources/SwiftParser/Core/CodeToken.swift
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import Foundation

public protocol CodeToken {
var kindDescription: String { get }
/// The set of token kinds for a language: a `CaseIterable` type with `String` raw values.
public protocol CodeTokenElement: CaseIterable, RawRepresentable where RawValue == String {}

/// A single token, tagged with an element drawn from the language's token kinds.
public protocol CodeToken<Element> where Element: CodeTokenElement {
/// The type enumerating the possible token kinds.
associatedtype Element: CodeTokenElement
/// The kind of this token.
var element: Element { get }
/// The text carried by this token (presumably the source text it covers — confirm against the tokenizers).
var text: String { get }
/// The token's range as `String.Index` bounds.
/// NOTE(review): `CodeParser` tokenizes a line-ending-normalized copy of its input, so these
/// indices presumably refer to that normalized string rather than the caller's original — confirm.
var range: Range<String.Index> { get }
}
7 changes: 0 additions & 7 deletions Sources/SwiftParser/Core/CodeTokenConsumer.swift

This file was deleted.

6 changes: 3 additions & 3 deletions Sources/SwiftParser/Core/CodeTokenizer.swift
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import Foundation

public protocol CodeTokenizer {
func tokenize(_ input: String) -> [any CodeToken]
/// Splits an input string into tokens whose kinds are drawn from `Element`.
public protocol CodeTokenizer<Element> where Element: CodeTokenElement {
/// The token element type produced by this tokenizer.
associatedtype Element: CodeTokenElement
/// Tokenizes `input`.
/// - Parameter input: The string to tokenize; `CodeParser` passes its line-ending-normalized input.
/// - Returns: The tokens for `input`; `CodeParser` consumes them by index from the start of the array.
func tokenize(_ input: String) -> [any CodeToken<Element>]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import Foundation

/// Builds an `AdmonitionNode` from a two-line quoted block of the form
///
///     > [!KIND]
///     > content
///
/// The header line must be `>`, an optional space, then exactly the tokens `[`, `!`, a text
/// token and `]`, followed by a newline. The next line must start with `>` and an optional
/// space; whatever follows is handed to `MarkdownInlineParser.parseInline`.
public class MarkdownAdmonitionBuilder: CodeNodeBuilder {
    public init() {}

    /// Tries to build an admonition starting at `context.consuming`.
    ///
    /// Nothing in `context` is modified unless the whole header and the start of the content
    /// line have matched.
    /// - Parameter context: The parse context to read tokens from and to update on success.
    /// - Returns: `true` when an `AdmonitionNode` was appended to `context.current`, otherwise `false`.
    public func build(from context: inout CodeContext<MarkdownNodeElement, MarkdownTokenElement>) -> Bool {
        let start = context.consuming

        // The header line opens with `>` at the start of a line ...
        guard start < context.tokens.count,
              startsLine(start, in: context.tokens),
              token(.gt, at: start, in: context.tokens) != nil else { return false }

        // ... optionally followed by a single space ...
        var cursor = start + 1
        if token(.space, at: cursor, in: context.tokens) != nil { cursor += 1 }

        // ... then `[`, `!`, the kind label and `]`.
        guard token(.leftBracket, at: cursor, in: context.tokens) != nil,
              token(.exclamation, at: cursor + 1, in: context.tokens) != nil,
              let label = token(.text, at: cursor + 2, in: context.tokens),
              token(.rightBracket, at: cursor + 3, in: context.tokens) != nil else { return false }
        let kind = label.text.lowercased()
        cursor += 4

        // The header must be terminated by a newline.
        guard token(.newline, at: cursor, in: context.tokens) != nil else { return false }
        cursor += 1

        // The content line must be quoted as well: `>` plus an optional space.
        guard startsLine(cursor, in: context.tokens),
              token(.gt, at: cursor, in: context.tokens) != nil else { return false }
        cursor += 1
        if token(.space, at: cursor, in: context.tokens) != nil { cursor += 1 }

        // Everything matched: commit the position and let the inline parser take over.
        context.consuming = cursor
        let inlineNodes = MarkdownInlineParser.parseInline(&context)
        let admonition = AdmonitionNode(kind: kind)
        for inlineNode in inlineNodes {
            admonition.append(inlineNode)
        }
        context.current.append(admonition)

        // Swallow the newline that follows the inline content, if there is one.
        if token(.newline, at: context.consuming, in: context.tokens) != nil {
            context.consuming += 1
        }
        return true
    }

    /// Returns the `MarkdownToken` at `index` when it exists and has the `expected` element, otherwise `nil`.
    private func token(
        _ expected: MarkdownTokenElement,
        at index: Int,
        in tokens: [any CodeToken<MarkdownTokenElement>]
    ) -> MarkdownToken? {
        guard index < tokens.count,
              let candidate = tokens[index] as? MarkdownToken,
              candidate.element == expected else { return nil }
        return candidate
    }

    /// Returns whether `index` is the first token overall or directly follows a newline token.
    private func startsLine(_ index: Int, in tokens: [any CodeToken<MarkdownTokenElement>]) -> Bool {
        if index == 0 { return true }
        return token(.newline, at: index - 1, in: tokens) != nil
    }
}
Loading