From 6fdad4df317de310a001944a7418bdea75cca6bf Mon Sep 17 00:00:00 2001 From: Dongyu Zhao Date: Mon, 14 Jul 2025 17:19:27 +0800 Subject: [PATCH 01/10] Add Pratt parsing support --- Sources/SwiftParser/CodeParser.swift | 45 +++ Sources/SwiftParser/Core.swift | 72 +++++ .../Languages/MarkdownLanguage.swift | 128 ++++++++ .../Languages/PythonLanguage.swift | 300 ++++++++++++++++++ Sources/SwiftParser/SwiftParser.swift | 26 +- Sources/SwiftParserShowCase/ContentView.swift | 52 +-- Tests/SwiftParserTests/SwiftParserTests.swift | 53 ++-- 7 files changed, 614 insertions(+), 62 deletions(-) create mode 100644 Sources/SwiftParser/CodeParser.swift create mode 100644 Sources/SwiftParser/Core.swift create mode 100644 Sources/SwiftParser/Languages/MarkdownLanguage.swift create mode 100644 Sources/SwiftParser/Languages/PythonLanguage.swift diff --git a/Sources/SwiftParser/CodeParser.swift b/Sources/SwiftParser/CodeParser.swift new file mode 100644 index 0000000..84970fe --- /dev/null +++ b/Sources/SwiftParser/CodeParser.swift @@ -0,0 +1,45 @@ +import Foundation + +public final class CodeParser { + private var builders: [CodeElementBuilder] + private let tokenizer: CodeTokenizer + + public init(tokenizer: CodeTokenizer, builders: [CodeElementBuilder] = []) { + self.tokenizer = tokenizer + self.builders = builders + } + + public func register(builder: CodeElementBuilder) { + builders.append(builder) + } + + public func clearBuilders() { + builders.removeAll() + } + + public func parse(_ input: String, rootNode: CodeNode) -> (node: CodeNode, context: CodeContext) { + let tokens = tokenizer.tokenize(input) + var context = CodeContext(tokens: tokens, index: 0, currentNode: rootNode, errors: [], input: input) + while context.index < context.tokens.count { + let token = context.tokens[context.index] + var matched = false + for builder in builders { + if builder.accept(context: context, token: token) { + builder.build(context: &context) + matched = true + break + } + } + if !matched { + context.errors.append(CodeError("Unrecognized token \(token.kindDescription)", range: token.range)) + context.index += 1 + } + } + return (rootNode, context) + } + + public func update(_ input: String, rootNode: CodeNode) -> (node: CodeNode, context: CodeContext) { + // Simple implementation: reparse everything + return parse(input, rootNode: rootNode) + } +} diff --git a/Sources/SwiftParser/Core.swift b/Sources/SwiftParser/Core.swift new file mode 100644 index 0000000..414b123 --- /dev/null +++ b/Sources/SwiftParser/Core.swift @@ -0,0 +1,72 @@ +import Foundation + +public protocol CodeElement {} + +public protocol CodeToken { + var kindDescription: String { get } + var text: String { get } + var range: Range { get } +} + +public protocol CodeTokenizer { + func tokenize(_ input: String) -> [any CodeToken] +} + +public protocol CodeElementBuilder { + func accept(context: CodeContext, token: any CodeToken) -> Bool + func build(context: inout CodeContext) +} + +public final class CodeNode { + public let type: any CodeElement + public var value: String + public weak var parent: CodeNode? + public var children: [CodeNode] = [] + public var range: Range? + + public var id: Int { + return String(describing: type).hashValue ^ value.hashValue + } + + public init(type: any CodeElement, value: String, range: Range? = nil) { + self.type = type + self.value = value + self.range = range + } + + public func addChild(_ node: CodeNode) { + node.parent = self + children.append(node) + } +} + +public struct CodeError: Error { + public let message: String + public let range: Range? + public init(_ message: String, range: Range? = nil) { + self.message = message + self.range = range + } +} + +public struct CodeContext { + public var tokens: [any CodeToken] + public var index: Int + public var currentNode: CodeNode + public var errors: [CodeError] + public let input: String + + public init(tokens: [any CodeToken], index: Int, currentNode: CodeNode, errors: [CodeError], input: String) { + self.tokens = tokens + self.index = index + self.currentNode = currentNode + self.errors = errors + self.input = input + } +} + +public protocol CodeLanguage { + var tokenizer: CodeTokenizer { get } + var builders: [CodeElementBuilder] { get } + var rootElement: any CodeElement { get } +} diff --git a/Sources/SwiftParser/Languages/MarkdownLanguage.swift b/Sources/SwiftParser/Languages/MarkdownLanguage.swift new file mode 100644 index 0000000..eb41e87 --- /dev/null +++ b/Sources/SwiftParser/Languages/MarkdownLanguage.swift @@ -0,0 +1,128 @@ +import Foundation + +public struct MarkdownLanguage: CodeLanguage { + public enum Element: String, CodeElement { + case root + case paragraph + case heading + case text + } + + public enum Token: CodeToken { + case text(String, Range) + case hash(Range) + case newline(Range) + case eof(Range) + + public var kindDescription: String { + switch self { + case .text: return "text" + case .hash: return "#" + case .newline: return "newline" + case .eof: return "eof" + } + } + + public var text: String { + switch self { + case .text(let s, _): return s + case .hash: return "#" + case .newline: return "\n" + case .eof: return "" + } + } + + public var range: Range { + switch self { + case .text(_, let r), .hash(let r), .newline(let r), .eof(let r): + return r + } + } + } + + public class Tokenizer: CodeTokenizer { + public init() {} + + public func tokenize(_ input: String) -> [any CodeToken] { + var tokens: [Token] = [] + var index = input.startIndex + func advance() { index = input.index(after: index) } + func add(_ t: Token) { tokens.append(t) } + while index < input.endIndex { + let ch = input[index] + if ch == "#" { + let start = index + advance() + add(.hash(start.. Bool { + guard let tok = token as? Token else { return false } + if case .hash = tok { return true } + return false + } + public func build(context: inout CodeContext) { + context.index += 1 + guard context.index < context.tokens.count else { return } + if let textTok = context.tokens[context.index] as? Token { + let node = CodeNode(type: Element.heading, value: textTok.text) + context.currentNode.addChild(node) + context.index += 1 + } + // consume newline if exists + if let nl = context.tokens[context.index] as? Token, case .newline = nl { context.index += 1 } + } + } + + public class ParagraphBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + if token is Token { return true } else { return false } + } + public func build(context: inout CodeContext) { + var text = "" + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + switch tok { + case .text(let t, _): + text += t + context.index += 1 + case .newline: + context.index += 1 + let node = CodeNode(type: Element.paragraph, value: text) + context.currentNode.addChild(node) + return + case .hash, .eof: + let node = CodeNode(type: Element.paragraph, value: text) + context.currentNode.addChild(node) + return + } + } else { context.index += 1 } + } + } + } + + public var tokenizer: CodeTokenizer { Tokenizer() } + public var builders: [CodeElementBuilder] { [HeadingBuilder(), ParagraphBuilder()] } + public var rootElement: any CodeElement { Element.root } + public init() {} +} diff --git a/Sources/SwiftParser/Languages/PythonLanguage.swift b/Sources/SwiftParser/Languages/PythonLanguage.swift new file mode 100644 index 0000000..9447a58 --- /dev/null +++ b/Sources/SwiftParser/Languages/PythonLanguage.swift @@ -0,0 +1,300 @@ +import Foundation + +public struct PythonLanguage: CodeLanguage { + public enum Element: String, CodeElement { + case root + case statement + case identifier + case number + case string + case assignment + case function + case parameters + case body + case expression + } + + public enum Token: CodeToken { + case identifier(String, Range) + case number(String, Range) + case string(String, Range) + case keyword(String, Range) + case equal(Range) + case colon(Range) + case comma(Range) + case plus(Range) + case minus(Range) + case star(Range) + case slash(Range) + case lparen(Range) + case rparen(Range) + case newline(Range) + case eof(Range) + + public var kindDescription: String { + switch self { + case .identifier: return "identifier" + case .number: return "number" + case .string: return "string" + case .keyword(let k, _): return "keyword(\(k))" + case .equal: return "=" + case .colon: return ":" + case .comma: return "," + case .plus: return "+" + case .minus: return "-" + case .star: return "*" + case .slash: return "/" + case .lparen: return "(" + case .rparen: return ")" + case .newline: return "newline" + case .eof: return "eof" + } + } + + public var text: String { + switch self { + case let .identifier(s, _), let .number(s, _), let .string(s, _), let .keyword(s, _): + return s + case .equal: return "=" + case .colon: return ":" + case .comma: return "," + case .plus: return "+" + case .minus: return "-" + case .star: return "*" + case .slash: return "/" + case .lparen: return "(" + case .rparen: return ")" + case .newline: return "\n" + case .eof: return "" + } + } + + public var range: Range { + switch self { + case .identifier(_, let r), .number(_, let r), .string(_, let r), .keyword(_, let r), .equal(let r), + .colon(let r), .comma(let r), .plus(let r), .minus(let r), .star(let r), .slash(let r), + .lparen(let r), .rparen(let r), .newline(let r), .eof(let r): + return r + } + } + } + + public class Tokenizer: CodeTokenizer { + public init() {} + + public func tokenize(_ input: String) -> [any CodeToken] { + var tokens: [Token] = [] + var index = input.startIndex + func advance() { index = input.index(after: index) } + func add(_ token: Token) { tokens.append(token) } + + while index < input.endIndex { + let ch = input[index] + if ch.isWhitespace { + if ch == "\n" { + let start = index + advance() + add(.newline(start.. CodeNode? { + guard index < tokens.count, let first = tokens[index] as? Token else { return nil } + index += 1 + var left: CodeNode? + switch first { + case .number(let text, let range): + left = CodeNode(type: Element.number, value: text, range: range) + case .identifier(let text, let range): + left = CodeNode(type: Element.identifier, value: text, range: range) + case .lparen: + left = parse(0) + if index < tokens.count, let t = tokens[index] as? Token, case .rparen = t { index += 1 } + default: + return nil + } + guard var l = left else { return nil } + while index < tokens.count, let op = tokens[index] as? Token, let bp = infixBindingPower(op), bp.left >= minBP { + index += 1 + let rhs = parse(bp.right) ?? CodeNode(type: Element.number, value: "", range: op.range) + let opNode = CodeNode(type: Element.expression, value: op.text, range: op.range) + opNode.addChild(l) + opNode.addChild(rhs) + l = opNode + } + return l + } + + private func infixBindingPower(_ token: Token) -> (left: Int, right: Int)? { + switch token { + case .plus, .minus: + return (left: 10, right: 11) + case .star, .slash: + return (left: 20, right: 21) + default: + return nil + } + } + } + + public class AssignmentBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard context.index + 2 < context.tokens.count else { return false } + if let tok = context.tokens[context.index] as? Token, + case .identifier = tok, + let eq = context.tokens[context.index + 1] as? Token, + case .equal = eq { + return true + } + return false + } + + public func build(context: inout CodeContext) { + guard let identifierTok = context.tokens[context.index] as? Token else { return } + let node = CodeNode(type: Element.assignment, value: identifierTok.text) + context.currentNode.addChild(node) + context.index += 2 // skip identifier and '=' + + var parser = ExpressionParser(tokens: context.tokens, startIndex: context.index) + if let exprNode = parser.parse() { + node.addChild(exprNode) + context.index = parser.index + } + if context.index < context.tokens.count, + let nl = context.tokens[context.index] as? Token, + case .newline = nl { + context.index += 1 + } + } + } + + public class FunctionBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard let tok = token as? Token else { return false } + if case .keyword("def", _) = tok { return true } + return false + } + + public func build(context: inout CodeContext) { + // def name():\n + context.index += 1 // skip 'def' + guard let nameTok = context.tokens[context.index] as? Token else { return } + let funcNode = CodeNode(type: Element.function, value: nameTok.text) + context.currentNode.addChild(funcNode) + context.index += 1 // skip name + // skip params + if let lparen = context.tokens[context.index] as? Token, case .lparen = lparen { + context.index += 1 + let paramsNode = CodeNode(type: Element.parameters, value: "") + funcNode.addChild(paramsNode) + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + switch tok { + case .identifier: + paramsNode.addChild(CodeNode(type: Element.identifier, value: tok.text)) + context.index += 1 + if let comma = context.tokens[context.index] as? Token, case .comma = comma { + context.index += 1 + } + case .rparen: + context.index += 1 + break + default: + context.index += 1 + } + if case .rparen = tok { break } + } + } + } + if let colon = context.tokens[context.index] as? Token, case .colon = colon { + context.index += 1 + } + let bodyNode = CodeNode(type: Element.body, value: "") + funcNode.addChild(bodyNode) + // consume until newline or eof + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token, case .newline = tok { context.index += 1; break } + context.index += 1 + } + } + } + + public var tokenizer: CodeTokenizer { Tokenizer() } + + public var builders: [CodeElementBuilder] { [FunctionBuilder(), AssignmentBuilder()] } + + public var rootElement: any CodeElement { Element.root } + + public init() {} +} diff --git a/Sources/SwiftParser/SwiftParser.swift b/Sources/SwiftParser/SwiftParser.swift index b1d357d..31456fb 100644 --- a/Sources/SwiftParser/SwiftParser.swift +++ b/Sources/SwiftParser/SwiftParser.swift @@ -3,21 +3,29 @@ import Foundation /// SwiftParser - A Swift parsing framework public struct SwiftParser { public init() {} - - /// Parse a Swift source code string - /// - Parameter source: The Swift source code to parse - /// - Returns: A parsed representation of the source code + + public func parse(_ source: String, language: CodeLanguage) -> ParsedSource { + let root = CodeNode(type: language.rootElement, value: "") + let parser = CodeParser(tokenizer: language.tokenizer, builders: language.builders) + let result = parser.parse(source, rootNode: root) + return ParsedSource(content: source, root: result.node, errors: result.context.errors) + } + + /// Convenience method using Python language by default public func parse(_ source: String) -> ParsedSource { - // TODO: Implement parsing logic - return ParsedSource(content: source) + return parse(source, language: PythonLanguage()) } } -/// Represents a parsed Swift source file +/// Represents a parsed source file public struct ParsedSource { public let content: String - - public init(content: String) { + public let root: CodeNode + public let errors: [CodeError] + + public init(content: String, root: CodeNode, errors: [CodeError] = []) { self.content = content + self.root = root + self.errors = errors } } diff --git a/Sources/SwiftParserShowCase/ContentView.swift b/Sources/SwiftParserShowCase/ContentView.swift index 4df9718..01bed24 100644 --- a/Sources/SwiftParserShowCase/ContentView.swift +++ b/Sources/SwiftParserShowCase/ContentView.swift @@ -2,21 +2,26 @@ import SwiftUI import SwiftParser struct ContentView: View { - @State private var sourceCode: String = """ - import Foundation - - struct Example { - let name: String - - func greet() { - print("Hello, \\(name)!") + enum DemoLanguage: String, CaseIterable, Identifiable { + case python + case markdown + var id: String { rawValue } + + var language: CodeLanguage { + switch self { + case .python: return PythonLanguage() + case .markdown: return MarkdownLanguage() + } } } - """ - + + @State private var language: DemoLanguage = .python + @State private var sourceCode: String = """ +print("Hello") +""" @State private var parsedResult: String = "" private let parser = SwiftParser() - + var body: some View { NavigationView { VStack(spacing: 20) { @@ -24,11 +29,18 @@ struct ContentView: View { .font(.largeTitle) .fontWeight(.bold) .padding() - + + Picker("Language", selection: $language) { + ForEach(DemoLanguage.allCases) { lang in + Text(lang.rawValue.capitalized).tag(lang) + } + }.pickerStyle(.segmented) + .padding(.horizontal) + VStack(alignment: .leading, spacing: 10) { - Text("Swift Source Code:") + Text("Source Code:") .font(.headline) - + TextEditor(text: $sourceCode) .font(.system(.body, design: .monospaced)) .padding(8) @@ -36,19 +48,19 @@ struct ContentView: View { .cornerRadius(8) .frame(minHeight: 200) } - + Button("Parse Code") { - let result = parser.parse(sourceCode) - parsedResult = "Parsed content: \\(result.content.count) characters" + let result = parser.parse(sourceCode, language: language.language) + parsedResult = "Errors: \(result.errors.count), children: \(result.root.children.count)" } .buttonStyle(.borderedProminent) .padding() - + if !parsedResult.isEmpty { VStack(alignment: .leading, spacing: 10) { Text("Parse Result:") .font(.headline) - + Text(parsedResult) .font(.system(.body, design: .monospaced)) .padding(8) @@ -57,7 +69,7 @@ struct ContentView: View { .frame(maxWidth: .infinity, alignment: .leading) } } - + Spacer() } .padding() diff --git a/Tests/SwiftParserTests/SwiftParserTests.swift b/Tests/SwiftParserTests/SwiftParserTests.swift index 39551fc..c9f09a9 100644 --- a/Tests/SwiftParserTests/SwiftParserTests.swift +++ b/Tests/SwiftParserTests/SwiftParserTests.swift @@ -2,47 +2,34 @@ import XCTest @testable import SwiftParser final class SwiftParserTests: XCTestCase { - + func testParserInitialization() { let parser = SwiftParser() XCTAssertNotNil(parser) } - - func testBasicParsing() { + + func testPythonAssignment() { let parser = SwiftParser() - let sourceCode = "let x = 42" - - let result = parser.parse(sourceCode) - - XCTAssertEqual(result.content, sourceCode) + let source = "x = 1" + let result = parser.parse(source, language: PythonLanguage()) + XCTAssertEqual(result.errors.count, 0) + XCTAssertEqual(result.root.children.first?.type as? PythonLanguage.Element, PythonLanguage.Element.assignment) } - - func testEmptySourceParsing() { + + func testMarkdownHeading() { let parser = SwiftParser() - let sourceCode = "" - - let result = parser.parse(sourceCode) - - XCTAssertEqual(result.content, sourceCode) + let source = "# Title\nHello" + let result = parser.parse(source, language: MarkdownLanguage()) + XCTAssertEqual(result.errors.count, 0) + XCTAssertEqual(result.root.children.count, 2) } - - func testComplexSourceParsing() { + + func testPrattExpression() { let parser = SwiftParser() - let sourceCode = """ - import Foundation - - struct Example { - let name: String - - func greet() { - print("Hello, \\(name)!") - } - } - """ - - let result = parser.parse(sourceCode) - - XCTAssertEqual(result.content, sourceCode) - XCTAssertTrue(result.content.contains("struct Example")) + let source = "x = 1 + 2 * 3" + let result = parser.parse(source, language: PythonLanguage()) + XCTAssertEqual(result.errors.count, 0) + let assign = result.root.children.first + XCTAssertEqual(assign?.children.first?.type as? PythonLanguage.Element, PythonLanguage.Element.expression) } } From 95437e8a101a0b41b2c72aa24b21b5f985caacbd Mon Sep 17 00:00:00 2001 From: Dongyu Zhao Date: Mon, 14 Jul 2025 17:37:45 +0800 Subject: [PATCH 02/10] Fix parser loops and errors --- Sources/SwiftParser/CodeParser.swift | 3 +++ .../SwiftParser/Languages/MarkdownLanguage.swift | 7 ++++++- Sources/SwiftParser/Languages/PythonLanguage.swift | 14 +++++++++++++- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/Sources/SwiftParser/CodeParser.swift b/Sources/SwiftParser/CodeParser.swift index 84970fe..5f013ce 100644 --- a/Sources/SwiftParser/CodeParser.swift +++ b/Sources/SwiftParser/CodeParser.swift @@ -22,6 +22,9 @@ public final class CodeParser { var context = CodeContext(tokens: tokens, index: 0, currentNode: rootNode, errors: [], input: input) while context.index < context.tokens.count { let token = context.tokens[context.index] + if token.kindDescription == "eof" { + break + } var matched = false for builder in builders { if builder.accept(context: context, token: token) { diff --git a/Sources/SwiftParser/Languages/MarkdownLanguage.swift b/Sources/SwiftParser/Languages/MarkdownLanguage.swift index eb41e87..8727894 100644 --- a/Sources/SwiftParser/Languages/MarkdownLanguage.swift +++ b/Sources/SwiftParser/Languages/MarkdownLanguage.swift @@ -111,10 +111,15 @@ public struct MarkdownLanguage: CodeLanguage { let node = CodeNode(type: Element.paragraph, value: text) context.currentNode.addChild(node) return - case .hash, .eof: + case .hash: let node = CodeNode(type: Element.paragraph, value: text) context.currentNode.addChild(node) return + case .eof: + let node = CodeNode(type: Element.paragraph, value: text) + context.currentNode.addChild(node) + context.index += 1 + return } } else { context.index += 1 } } diff --git a/Sources/SwiftParser/Languages/PythonLanguage.swift b/Sources/SwiftParser/Languages/PythonLanguage.swift index 9447a58..2a1feb2 100644 --- a/Sources/SwiftParser/Languages/PythonLanguage.swift +++ b/Sources/SwiftParser/Languages/PythonLanguage.swift @@ -238,6 +238,18 @@ public struct PythonLanguage: CodeLanguage { } } + public class NewlineBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard let tok = token as? Token else { return false } + if case .newline = tok { return true } + return false + } + public func build(context: inout CodeContext) { + context.index += 1 + } + } + public class FunctionBuilder: CodeElementBuilder { public init() {} public func accept(context: CodeContext, token: any CodeToken) -> Bool { @@ -292,7 +304,7 @@ public struct PythonLanguage: CodeLanguage { public var tokenizer: CodeTokenizer { Tokenizer() } - public var builders: [CodeElementBuilder] { [FunctionBuilder(), AssignmentBuilder()] } + public var builders: [CodeElementBuilder] { [NewlineBuilder(), FunctionBuilder(), AssignmentBuilder()] } public var rootElement: any CodeElement { Element.root } From b0b1f13ac98c97aa943d960417bd868eb569c18c Mon Sep 17 00:00:00 2001 From: Dongyu Zhao Date: Mon, 14 Jul 2025 17:48:22 +0800 Subject: [PATCH 03/10] Ensure stable CodeNode ids --- Sources/SwiftParser/Core.swift | 8 +++++++- Tests/SwiftParserTests/SwiftParserTests.swift | 10 ++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/Sources/SwiftParser/Core.swift b/Sources/SwiftParser/Core.swift index 414b123..e3e730f 100644 --- a/Sources/SwiftParser/Core.swift +++ b/Sources/SwiftParser/Core.swift @@ -25,7 +25,13 @@ public final class CodeNode { public var range: Range? public var id: Int { - return String(describing: type).hashValue ^ value.hashValue + var hasher = Hasher() + hasher.combine(String(describing: type)) + hasher.combine(value) + for child in children { + hasher.combine(child.id) + } + return hasher.finalize() } public init(type: any CodeElement, value: String, range: Range? = nil) { diff --git a/Tests/SwiftParserTests/SwiftParserTests.swift b/Tests/SwiftParserTests/SwiftParserTests.swift index c9f09a9..28cca71 100644 --- a/Tests/SwiftParserTests/SwiftParserTests.swift +++ b/Tests/SwiftParserTests/SwiftParserTests.swift @@ -32,4 +32,14 @@ final class SwiftParserTests: XCTestCase { let assign = result.root.children.first XCTAssertEqual(assign?.children.first?.type as? PythonLanguage.Element, PythonLanguage.Element.expression) } + + func testStableNodeID() { + let n1 = CodeNode(type: PythonLanguage.Element.identifier, value: "x") + n1.addChild(CodeNode(type: PythonLanguage.Element.number, value: "1")) + + let n2 = CodeNode(type: PythonLanguage.Element.identifier, value: "x") + n2.addChild(CodeNode(type: PythonLanguage.Element.number, value: "1")) + + XCTAssertEqual(n1.id, n2.id) + } } From 60d73651924465ff11b3837f4b4e53a1f7177676 Mon Sep 17 00:00:00 2001 From: Dongyu Zhao Date: Mon, 14 Jul 2025 17:57:57 +0800 Subject: [PATCH 04/10] Add Pratt parsing support --- Sources/SwiftParser/CodeParser.swift | 19 ++++- Sources/SwiftParser/Core.swift | 1 + Sources/SwiftParser/ExpressionBuilder.swift | 36 +++++++++ .../Languages/MarkdownLanguage.swift | 1 + .../Languages/PythonLanguage.swift | 79 +++++++++++-------- Sources/SwiftParser/SwiftParser.swift | 2 +- 6 files changed, 101 insertions(+), 37 deletions(-) create mode 100644 Sources/SwiftParser/ExpressionBuilder.swift diff --git a/Sources/SwiftParser/CodeParser.swift b/Sources/SwiftParser/CodeParser.swift index 5f013ce..051f733 100644 --- a/Sources/SwiftParser/CodeParser.swift +++ b/Sources/SwiftParser/CodeParser.swift @@ -3,10 +3,12 @@ import Foundation public final class CodeParser { private var builders: [CodeElementBuilder] private let tokenizer: CodeTokenizer + private var expressionBuilder: CodeExpressionBuilder? - public init(tokenizer: CodeTokenizer, builders: [CodeElementBuilder] = []) { + public init(tokenizer: CodeTokenizer, builders: [CodeElementBuilder] = [], expressionBuilder: CodeExpressionBuilder? = nil) { self.tokenizer = tokenizer self.builders = builders + self.expressionBuilder = expressionBuilder } public func register(builder: CodeElementBuilder) { @@ -17,6 +19,10 @@ public final class CodeParser { builders.removeAll() } + public func register(expressionBuilder: CodeExpressionBuilder) { + self.expressionBuilder = expressionBuilder + } + public func parse(_ input: String, rootNode: CodeNode) -> (node: CodeNode, context: CodeContext) { let tokens = tokenizer.tokenize(input) var context = CodeContext(tokens: tokens, index: 0, currentNode: rootNode, errors: [], input: input) @@ -33,6 +39,12 @@ public final class CodeParser { break } } + if !matched, let expr = expressionBuilder, expr.accept(context: context, token: token) { + if let node = expr.parse(context: &context) { + context.currentNode.addChild(node) + } + matched = true + } if !matched { context.errors.append(CodeError("Unrecognized token \(token.kindDescription)", range: token.range)) context.index += 1 @@ -45,4 +57,9 @@ public final class CodeParser { // Simple implementation: reparse everything return parse(input, rootNode: rootNode) } + + public func parseExpression(context: inout CodeContext, minBP: Int = 0) -> CodeNode? { + guard let expr = expressionBuilder else { return nil } + return expr.parse(context: &context, minBP: minBP) + } } diff --git a/Sources/SwiftParser/Core.swift b/Sources/SwiftParser/Core.swift index e3e730f..13fa0b5 100644 --- a/Sources/SwiftParser/Core.swift +++ b/Sources/SwiftParser/Core.swift @@ -75,4 +75,5 @@ public protocol CodeLanguage { var tokenizer: CodeTokenizer { get } var builders: [CodeElementBuilder] { get } var rootElement: any CodeElement { get } + var expressionBuilder: CodeExpressionBuilder? { get } } diff --git a/Sources/SwiftParser/ExpressionBuilder.swift b/Sources/SwiftParser/ExpressionBuilder.swift new file mode 100644 index 0000000..7014380 --- /dev/null +++ b/Sources/SwiftParser/ExpressionBuilder.swift @@ -0,0 +1,36 @@ +import Foundation + +public protocol CodeExpressionBuilder: CodeElementBuilder { + func isPrefix(token: any CodeToken) -> Bool + func prefix(context: inout CodeContext, token: any CodeToken) -> CodeNode? + func infixBindingPower(of token: any CodeToken) -> (left: Int, right: Int)? + func infix(context: inout CodeContext, left: CodeNode, token: any CodeToken, right: CodeNode) -> CodeNode +} + +public extension CodeExpressionBuilder { + func accept(context: CodeContext, token: any CodeToken) -> Bool { + return isPrefix(token: token) + } + + func build(context: inout CodeContext) { + if let node = parse(context: &context) { + context.currentNode.addChild(node) + } + } + + func parse(context: inout CodeContext, minBP: Int = 0) -> CodeNode? { + guard context.index < context.tokens.count else { return nil } + let first = context.tokens[context.index] + guard isPrefix(token: first) else { return nil } + context.index += 1 + guard var left = prefix(context: &context, token: first) else { return nil } + while context.index < context.tokens.count { + let opToken = context.tokens[context.index] + guard let bp = infixBindingPower(of: opToken), bp.left >= minBP else { break } + context.index += 1 + let right = parse(context: &context, minBP: bp.right) ?? CodeNode(type: left.type, value: "") + left = infix(context: &context, left: left, token: opToken, right: right) + } + return left + } +} diff --git a/Sources/SwiftParser/Languages/MarkdownLanguage.swift b/Sources/SwiftParser/Languages/MarkdownLanguage.swift index 8727894..c91e133 100644 --- a/Sources/SwiftParser/Languages/MarkdownLanguage.swift +++ b/Sources/SwiftParser/Languages/MarkdownLanguage.swift @@ -128,6 +128,7 @@ public struct MarkdownLanguage: CodeLanguage { public var tokenizer: CodeTokenizer { Tokenizer() } public var builders: [CodeElementBuilder] { [HeadingBuilder(), ParagraphBuilder()] } + public var expressionBuilder: CodeExpressionBuilder? { nil } public var rootElement: any CodeElement { Element.root } public init() {} } diff --git a/Sources/SwiftParser/Languages/PythonLanguage.swift b/Sources/SwiftParser/Languages/PythonLanguage.swift index 2a1feb2..c4326d4 100644 --- a/Sources/SwiftParser/Languages/PythonLanguage.swift +++ b/Sources/SwiftParser/Languages/PythonLanguage.swift @@ -158,56 +158,62 @@ public struct PythonLanguage: CodeLanguage { } } - struct ExpressionParser { - private let tokens: [any CodeToken] - private(set) var index: Int - - init(tokens: [any CodeToken], startIndex: Int) { - self.tokens = tokens - self.index = startIndex + public final class ExpressionBuilder: CodeExpressionBuilder { + public func isPrefix(token: any CodeToken) -> Bool { + guard let t = token as? Token else { return false } + switch t { + case .number, .identifier, .lparen: + return true + default: + return false + } } - mutating func parse(_ minBP: Int = 0) -> CodeNode? { - guard index < tokens.count, let first = tokens[index] as? Token else { return nil } - index += 1 - var left: CodeNode? - switch first { + public func prefix(context: inout CodeContext, token: any CodeToken) -> CodeNode? { + guard let t = token as? Token else { return nil } + switch t { case .number(let text, let range): - left = CodeNode(type: Element.number, value: text, range: range) + return CodeNode(type: Element.number, value: text, range: range) case .identifier(let text, let range): - left = CodeNode(type: Element.identifier, value: text, range: range) + return CodeNode(type: Element.identifier, value: text, range: range) case .lparen: - left = parse(0) - if index < tokens.count, let t = tokens[index] as? Token, case .rparen = t { index += 1 } + let node = parse(context: &context, minBP: 0) + if context.index < context.tokens.count, let r = context.tokens[context.index] as? Token, case .rparen = r { + context.index += 1 + } + return node default: return nil } - guard var l = left else { return nil } - while index < tokens.count, let op = tokens[index] as? Token, let bp = infixBindingPower(op), bp.left >= minBP { - index += 1 - let rhs = parse(bp.right) ?? CodeNode(type: Element.number, value: "", range: op.range) - let opNode = CodeNode(type: Element.expression, value: op.text, range: op.range) - opNode.addChild(l) - opNode.addChild(rhs) - l = opNode - } - return l } - private func infixBindingPower(_ token: Token) -> (left: Int, right: Int)? { - switch token { + public func infixBindingPower(of token: any CodeToken) -> (left: Int, right: Int)? { + guard let t = token as? Token else { return nil } + switch t { case .plus, .minus: - return (left: 10, right: 11) + return (10, 11) case .star, .slash: - return (left: 20, right: 21) + return (20, 21) default: return nil } } + + public func infix(context: inout CodeContext, left: CodeNode, token: any CodeToken, right: CodeNode) -> CodeNode { + let text = token.text + let node = CodeNode(type: Element.expression, value: text, range: token.range) + node.addChild(left) + node.addChild(right) + return node + } } public class AssignmentBuilder: CodeElementBuilder { - public init() {} + private let expr: ExpressionBuilder + + public init(expressionBuilder: ExpressionBuilder) { + self.expr = expressionBuilder + } public func accept(context: CodeContext, token: any CodeToken) -> Bool { guard context.index + 2 < context.tokens.count else { return false } if let tok = context.tokens[context.index] as? Token, @@ -225,10 +231,8 @@ public struct PythonLanguage: CodeLanguage { context.currentNode.addChild(node) context.index += 2 // skip identifier and '=' - var parser = ExpressionParser(tokens: context.tokens, startIndex: context.index) - if let exprNode = parser.parse() { + if let exprNode = expr.parse(context: &context) { node.addChild(exprNode) - context.index = parser.index } if context.index < context.tokens.count, let nl = context.tokens[context.index] as? Token, @@ -304,7 +308,12 @@ public struct PythonLanguage: CodeLanguage { public var tokenizer: CodeTokenizer { Tokenizer() } - public var builders: [CodeElementBuilder] { [NewlineBuilder(), FunctionBuilder(), AssignmentBuilder()] } + public var builders: [CodeElementBuilder] { + let expr = ExpressionBuilder() + return [NewlineBuilder(), FunctionBuilder(), AssignmentBuilder(expressionBuilder: expr)] + } + + public var expressionBuilder: CodeExpressionBuilder? { ExpressionBuilder() } public var rootElement: any CodeElement { Element.root } diff --git a/Sources/SwiftParser/SwiftParser.swift b/Sources/SwiftParser/SwiftParser.swift index 31456fb..2086b37 100644 --- a/Sources/SwiftParser/SwiftParser.swift +++ b/Sources/SwiftParser/SwiftParser.swift @@ -6,7 +6,7 @@ public struct SwiftParser { public func parse(_ source: String, language: CodeLanguage) -> ParsedSource { let root = CodeNode(type: language.rootElement, value: "") - let parser = CodeParser(tokenizer: language.tokenizer, builders: language.builders) + let parser = CodeParser(tokenizer: language.tokenizer, builders: language.builders, expressionBuilder: language.expressionBuilder) let result = parser.parse(source, rootNode: root) return ParsedSource(content: source, root: result.node, errors: result.context.errors) } From 30f875d50cfe74b7af6a30833ba787776e4ae2c5 Mon Sep 17 00:00:00 2001 From: Dongyu Zhao Date: Mon, 14 Jul 2025 18:17:19 +0800 Subject: [PATCH 05/10] Allow multiple expression builders --- Sources/SwiftParser/CodeParser.swift | 31 +++++++++++++------ Sources/SwiftParser/Core.swift | 2 +- .../Languages/MarkdownLanguage.swift | 2 +- .../Languages/PythonLanguage.swift | 2 +- Sources/SwiftParser/SwiftParser.swift | 2 +- 5 files changed, 25 insertions(+), 14 deletions(-) diff --git a/Sources/SwiftParser/CodeParser.swift b/Sources/SwiftParser/CodeParser.swift index 051f733..eb8d17e 100644 --- a/Sources/SwiftParser/CodeParser.swift +++ b/Sources/SwiftParser/CodeParser.swift @@ -3,12 +3,12 @@ import Foundation public final class CodeParser { private var builders: [CodeElementBuilder] private let tokenizer: CodeTokenizer - private var expressionBuilder: CodeExpressionBuilder? + private var expressionBuilders: [CodeExpressionBuilder] - public init(tokenizer: CodeTokenizer, builders: [CodeElementBuilder] = [], expressionBuilder: CodeExpressionBuilder? = nil) { + public init(tokenizer: CodeTokenizer, builders: [CodeElementBuilder] = [], expressionBuilders: [CodeExpressionBuilder] = []) { self.tokenizer = tokenizer self.builders = builders - self.expressionBuilder = expressionBuilder + self.expressionBuilders = expressionBuilders } public func register(builder: CodeElementBuilder) { @@ -20,7 +20,7 @@ public final class CodeParser { } public func register(expressionBuilder: CodeExpressionBuilder) { - self.expressionBuilder = expressionBuilder + expressionBuilders.append(expressionBuilder) } public func parse(_ input: String, rootNode: CodeNode) -> (node: CodeNode, context: CodeContext) { @@ -39,11 +39,16 @@ public final class CodeParser { break } } - if !matched, let expr = expressionBuilder, expr.accept(context: context, token: token) { - if let node = expr.parse(context: &context) { - context.currentNode.addChild(node) + if !matched { + for expr in expressionBuilders { + if expr.accept(context: context, token: token) { + if let node = expr.parse(context: &context) { + context.currentNode.addChild(node) + } + matched = true + break + } } - matched = true } if !matched { context.errors.append(CodeError("Unrecognized token \(token.kindDescription)", range: token.range)) @@ -59,7 +64,13 @@ public final class CodeParser { } public func parseExpression(context: inout CodeContext, minBP: Int = 0) -> CodeNode? { - guard let expr = expressionBuilder else { return nil } - return expr.parse(context: &context, minBP: minBP) + guard context.index < context.tokens.count else { return nil } + let token = context.tokens[context.index] + for expr in expressionBuilders { + if expr.accept(context: context, token: token) { + return expr.parse(context: &context, minBP: minBP) + } + } + return nil } } diff --git a/Sources/SwiftParser/Core.swift b/Sources/SwiftParser/Core.swift index 13fa0b5..ad4954e 100644 --- a/Sources/SwiftParser/Core.swift +++ b/Sources/SwiftParser/Core.swift @@ -75,5 +75,5 @@ public protocol CodeLanguage { var tokenizer: CodeTokenizer { get } var builders: [CodeElementBuilder] { get } var rootElement: any CodeElement { get } - var expressionBuilder: CodeExpressionBuilder? { get } + var expressionBuilders: [CodeExpressionBuilder] { get } } diff --git a/Sources/SwiftParser/Languages/MarkdownLanguage.swift b/Sources/SwiftParser/Languages/MarkdownLanguage.swift index c91e133..54da274 100644 --- a/Sources/SwiftParser/Languages/MarkdownLanguage.swift +++ b/Sources/SwiftParser/Languages/MarkdownLanguage.swift @@ -128,7 +128,7 @@ public struct MarkdownLanguage: CodeLanguage { public var tokenizer: CodeTokenizer { Tokenizer() } public var builders: [CodeElementBuilder] { [HeadingBuilder(), ParagraphBuilder()] } - public var expressionBuilder: CodeExpressionBuilder? { nil } + public var expressionBuilders: [CodeExpressionBuilder] { [] } public var rootElement: any CodeElement { Element.root } public init() {} } diff --git a/Sources/SwiftParser/Languages/PythonLanguage.swift b/Sources/SwiftParser/Languages/PythonLanguage.swift index c4326d4..0f0bf90 100644 --- a/Sources/SwiftParser/Languages/PythonLanguage.swift +++ b/Sources/SwiftParser/Languages/PythonLanguage.swift @@ -313,7 +313,7 @@ public struct PythonLanguage: CodeLanguage { return [NewlineBuilder(), FunctionBuilder(), AssignmentBuilder(expressionBuilder: expr)] } - public var expressionBuilder: CodeExpressionBuilder? { ExpressionBuilder() } + public var expressionBuilders: [CodeExpressionBuilder] { [ExpressionBuilder()] } public var rootElement: any CodeElement { Element.root } diff --git a/Sources/SwiftParser/SwiftParser.swift b/Sources/SwiftParser/SwiftParser.swift index 2086b37..0efcb89 100644 --- a/Sources/SwiftParser/SwiftParser.swift +++ b/Sources/SwiftParser/SwiftParser.swift @@ -6,7 +6,7 @@ public struct SwiftParser { public func parse(_ source: String, language: CodeLanguage) -> ParsedSource { let root = CodeNode(type: language.rootElement, value: "") - let parser = CodeParser(tokenizer: language.tokenizer, builders: language.builders, expressionBuilder: language.expressionBuilder) + let parser = CodeParser(tokenizer: language.tokenizer, builders: language.builders, expressionBuilders: language.expressionBuilders) let result = parser.parse(source, rootNode: root) return ParsedSource(content: source, root: result.node, errors: result.context.errors) } From f12da1cec9d761eafc1cc1d35c59d35a754b6aa1 Mon Sep 17 00:00:00 2001 From: Dongyu Zhao Date: Mon, 14 Jul 2025 18:44:52 +0800 Subject: [PATCH 06/10] Add backtracking snapshot and unterminated string support --- Sources/SwiftParser/Core.swift | 25 +++++++++++++++++++ .../Languages/PythonLanguage.swift | 20 ++++++++++++--- Tests/SwiftParserTests/SwiftParserTests.swift | 22 ++++++++++++++++ 3 files changed, 63 insertions(+), 4 deletions(-) diff --git a/Sources/SwiftParser/Core.swift b/Sources/SwiftParser/Core.swift index ad4954e..ae67663 100644 --- a/Sources/SwiftParser/Core.swift +++ b/Sources/SwiftParser/Core.swift @@ -69,6 +69,31 @@ public struct CodeContext { self.errors = errors self.input = input } + + /// Snapshot represents a parser state that can be restored later. + public struct Snapshot { + fileprivate let index: Int + fileprivate let node: CodeNode + fileprivate let childCount: Int + fileprivate let errorCount: Int + } + + /// Capture the current parser state so it can be restored on demand. + public func snapshot() -> Snapshot { + Snapshot(index: index, node: currentNode, childCount: currentNode.children.count, errorCount: errors.count) + } + + /// Restore the parser to a previously captured state, discarding any new nodes or errors. + public mutating func restore(_ snapshot: Snapshot) { + index = snapshot.index + currentNode = snapshot.node + if currentNode.children.count > snapshot.childCount { + currentNode.children.removeLast(currentNode.children.count - snapshot.childCount) + } + if errors.count > snapshot.errorCount { + errors.removeLast(errors.count - snapshot.errorCount) + } + } } public protocol CodeLanguage { diff --git a/Sources/SwiftParser/Languages/PythonLanguage.swift b/Sources/SwiftParser/Languages/PythonLanguage.swift index 0f0bf90..31e7033 100644 --- a/Sources/SwiftParser/Languages/PythonLanguage.swift +++ b/Sources/SwiftParser/Languages/PythonLanguage.swift @@ -18,6 +18,7 @@ public struct PythonLanguage: CodeLanguage { case identifier(String, Range) case number(String, Range) case string(String, Range) + case unterminatedString(String, Range) case keyword(String, Range) case equal(Range) case colon(Range) @@ -36,6 +37,7 @@ public struct PythonLanguage: CodeLanguage { case .identifier: return "identifier" case .number: return "number" case .string: return "string" + case .unterminatedString: return "unterminatedString" case .keyword(let k, _): return "keyword(\(k))" case .equal: return "=" case .colon: return ":" @@ -55,6 +57,8 @@ public struct PythonLanguage: CodeLanguage { switch self { case let .identifier(s, _), let .number(s, _), let .string(s, _), let .keyword(s, _): return s + case let .unterminatedString(s, _): + return s case .equal: return "=" case .colon: return ":" case .comma: return "," @@ -71,7 +75,7 @@ public struct PythonLanguage: CodeLanguage { public var range: Range { switch self { - case .identifier(_, let r), .number(_, let r), .string(_, let r), .keyword(_, let r), .equal(let r), + case .identifier(_, let r), .number(_, let r), .string(_, let r), .unterminatedString(_, let r), .keyword(_, let r), .equal(let r), .colon(let r), .comma(let r), .plus(let r), .minus(let r), .star(let r), .slash(let r), .lparen(let r), .rparen(let r), .newline(let r), .eof(let r): return r @@ -112,9 +116,14 @@ public struct PythonLanguage: CodeLanguage { while index < input.endIndex && input[index] != quote { advance() } - if index < input.endIndex { advance() } - let text = String(input[start.. Date: Mon, 14 Jul 2025 19:07:52 +0800 Subject: [PATCH 07/10] Implement incremental update with rollback --- Sources/SwiftParser/CodeParser.swift | 77 ++++++++++++++++++- Sources/SwiftParser/Core.swift | 25 ++++++ .../Languages/PythonLanguage.swift | 20 ++++- Tests/SwiftParserTests/SwiftParserTests.swift | 32 ++++++++ 4 files changed, 148 insertions(+), 6 deletions(-) diff --git a/Sources/SwiftParser/CodeParser.swift b/Sources/SwiftParser/CodeParser.swift index eb8d17e..0b17707 100644 --- a/Sources/SwiftParser/CodeParser.swift +++ b/Sources/SwiftParser/CodeParser.swift @@ -5,6 +5,11 @@ public final class CodeParser { private let tokenizer: CodeTokenizer private var expressionBuilders: [CodeExpressionBuilder] + // State for incremental parsing + private var lastContext: CodeContext? + private var snapshots: [Int: CodeContext.Snapshot] = [:] + private var lastTokens: [any CodeToken] = [] + public init(tokenizer: CodeTokenizer, builders: [CodeElementBuilder] = [], expressionBuilders: [CodeExpressionBuilder] = []) { self.tokenizer = tokenizer self.builders = builders @@ -26,7 +31,12 @@ public final class CodeParser { public func parse(_ input: String, rootNode: CodeNode) -> (node: CodeNode, context: CodeContext) { let tokens = tokenizer.tokenize(input) var context = CodeContext(tokens: tokens, index: 0, currentNode: rootNode, errors: [], input: input) + + snapshots = [:] + lastTokens = tokens + while context.index < context.tokens.count { + snapshots[context.index] = context.snapshot() let token = context.tokens[context.index] if token.kindDescription == "eof" { break @@ -55,12 +65,75 @@ public final class CodeParser { context.index += 1 } } + snapshots[context.index] = context.snapshot() + lastContext = context return (rootNode, context) } public func update(_ input: String, rootNode: CodeNode) -> (node: CodeNode, context: CodeContext) { - // Simple implementation: reparse everything - return parse(input, rootNode: rootNode) + guard var context = lastContext else { + return parse(input, rootNode: rootNode) + } + + let newTokens = tokenizer.tokenize(input) + + var diffIndex = 0 + while diffIndex < min(lastTokens.count, newTokens.count) { + if !tokenEqual(lastTokens[diffIndex], newTokens[diffIndex]) { + break + } + diffIndex += 1 + } + + var restoreIndex = diffIndex + while restoreIndex >= 0 && snapshots[restoreIndex] == nil { + restoreIndex -= 1 + } + if let snap = snapshots[restoreIndex] { + context.restore(snap) + } + + context.tokens = newTokens + context.index = restoreIndex + + snapshots = snapshots.filter { $0.key <= restoreIndex } + lastTokens = newTokens + + while context.index < context.tokens.count { + snapshots[context.index] = context.snapshot() + let token = context.tokens[context.index] + if token.kindDescription == "eof" { break } + var matched = false + for builder in builders { + if builder.accept(context: context, token: token) { + builder.build(context: &context) + matched = true + break + } + } + if !matched { + for expr in expressionBuilders { + if expr.accept(context: context, token: token) { + if let node = expr.parse(context: &context) { + context.currentNode.addChild(node) + } + matched = true + break + } + } + } + if !matched { + context.errors.append(CodeError("Unrecognized token \(token.kindDescription)", range: token.range)) + context.index += 1 + } + } + snapshots[context.index] = context.snapshot() + lastContext = context + return (rootNode, context) + } + + private func tokenEqual(_ a: any CodeToken, _ b: any CodeToken) -> Bool { + return a.kindDescription == b.kindDescription && a.text == b.text } public func parseExpression(context: inout CodeContext, minBP: Int = 0) -> CodeNode? { diff --git a/Sources/SwiftParser/Core.swift b/Sources/SwiftParser/Core.swift index ad4954e..ae67663 100644 --- a/Sources/SwiftParser/Core.swift +++ b/Sources/SwiftParser/Core.swift @@ -69,6 +69,31 @@ public struct CodeContext { self.errors = errors self.input = input } + + /// Snapshot represents a parser state that can be restored later. + public struct Snapshot { + fileprivate let index: Int + fileprivate let node: CodeNode + fileprivate let childCount: Int + fileprivate let errorCount: Int + } + + /// Capture the current parser state so it can be restored on demand. + public func snapshot() -> Snapshot { + Snapshot(index: index, node: currentNode, childCount: currentNode.children.count, errorCount: errors.count) + } + + /// Restore the parser to a previously captured state, discarding any new nodes or errors. + public mutating func restore(_ snapshot: Snapshot) { + index = snapshot.index + currentNode = snapshot.node + if currentNode.children.count > snapshot.childCount { + currentNode.children.removeLast(currentNode.children.count - snapshot.childCount) + } + if errors.count > snapshot.errorCount { + errors.removeLast(errors.count - snapshot.errorCount) + } + } } public protocol CodeLanguage { diff --git a/Sources/SwiftParser/Languages/PythonLanguage.swift b/Sources/SwiftParser/Languages/PythonLanguage.swift index 0f0bf90..31e7033 100644 --- a/Sources/SwiftParser/Languages/PythonLanguage.swift +++ b/Sources/SwiftParser/Languages/PythonLanguage.swift @@ -18,6 +18,7 @@ public struct PythonLanguage: CodeLanguage { case identifier(String, Range) case number(String, Range) case string(String, Range) + case unterminatedString(String, Range) case keyword(String, Range) case equal(Range) case colon(Range) @@ -36,6 +37,7 @@ public struct PythonLanguage: CodeLanguage { case .identifier: return "identifier" case .number: return "number" case .string: return "string" + case .unterminatedString: return "unterminatedString" case .keyword(let k, _): return "keyword(\(k))" case .equal: return "=" case .colon: return ":" @@ -55,6 +57,8 @@ public struct PythonLanguage: CodeLanguage { switch self { case let .identifier(s, _), let .number(s, _), let .string(s, _), let .keyword(s, _): return s + case let .unterminatedString(s, _): + return s case .equal: return "=" case .colon: return ":" case .comma: return "," @@ -71,7 +75,7 @@ public struct PythonLanguage: CodeLanguage { public var range: Range { switch self { - case .identifier(_, let r), .number(_, let r), .string(_, let r), .keyword(_, let r), .equal(let r), + case .identifier(_, let r), .number(_, let r), .string(_, let r), .unterminatedString(_, let r), .keyword(_, let r), .equal(let r), .colon(let r), .comma(let r), .plus(let r), .minus(let r), .star(let r), .slash(let r), .lparen(let r), .rparen(let r), .newline(let r), .eof(let r): return r @@ -112,9 +116,14 @@ public struct PythonLanguage: CodeLanguage { while index < input.endIndex && input[index] != quote { advance() } - if index < input.endIndex { advance() } - let text = String(input[start.. Date: Mon, 14 Jul 2025 21:38:19 +0800 Subject: [PATCH 08/10] Add builder removal APIs and tests --- Sources/SwiftParser/CodeParser.swift | 20 +++++++++++ Tests/SwiftParserTests/SwiftParserTests.swift | 36 +++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/Sources/SwiftParser/CodeParser.swift b/Sources/SwiftParser/CodeParser.swift index 0b17707..95fb9c0 100644 --- a/Sources/SwiftParser/CodeParser.swift +++ b/Sources/SwiftParser/CodeParser.swift @@ -20,6 +20,14 @@ public final class CodeParser { builders.append(builder) } + public func unregister(builder: CodeElementBuilder) { + if let target = builder as? AnyObject { + if let index = builders.firstIndex(where: { ($0 as? AnyObject) === target }) { + builders.remove(at: index) + } + } + } + public func clearBuilders() { builders.removeAll() } @@ -28,6 +36,18 @@ public final class CodeParser { expressionBuilders.append(expressionBuilder) } + public func unregister(expressionBuilder: CodeExpressionBuilder) { + if let target = expressionBuilder as? AnyObject { + if let index = expressionBuilders.firstIndex(where: { ($0 as? AnyObject) === target }) { + expressionBuilders.remove(at: index) + } + } + } + + public func clearExpressionBuilders() { + expressionBuilders.removeAll() + } + public func parse(_ input: String, rootNode: CodeNode) -> (node: CodeNode, context: CodeContext) { let tokens = tokenizer.tokenize(input) var context = CodeContext(tokens: tokens, index: 0, currentNode: rootNode, errors: [], input: input) diff --git a/Tests/SwiftParserTests/SwiftParserTests.swift b/Tests/SwiftParserTests/SwiftParserTests.swift index 8cdb01a..d72b1f3 100644 --- a/Tests/SwiftParserTests/SwiftParserTests.swift +++ b/Tests/SwiftParserTests/SwiftParserTests.swift @@ -74,4 +74,40 @@ final class SwiftParserTests: XCTestCase { _ = parser.update("x = 2", rootNode: root) XCTAssertEqual(root.children.first?.children.first?.value, "2") } + + func testUnregisterElementBuilder() { + let tokenizer = PythonLanguage.Tokenizer() + let expr = PythonLanguage.ExpressionBuilder() + let assign = PythonLanguage.AssignmentBuilder(expressionBuilder: expr) + let parser = CodeParser(tokenizer: tokenizer) + parser.register(builder: assign) + parser.register(expressionBuilder: expr) + + let root1 = CodeNode(type: PythonLanguage.Element.root, value: "") + _ = parser.parse("x = 1", rootNode: root1) + XCTAssertEqual(root1.children.first?.type as? PythonLanguage.Element, .assignment) + + parser.unregister(builder: assign) + + let root2 = CodeNode(type: PythonLanguage.Element.root, value: "") + _ = parser.parse("x = 1", rootNode: root2) + XCTAssertEqual(root2.children.first?.type as? PythonLanguage.Element, .identifier) + } + + func testUnregisterExpressionBuilder() { + let tokenizer = PythonLanguage.Tokenizer() + let expr = PythonLanguage.ExpressionBuilder() + let parser = CodeParser(tokenizer: tokenizer) + parser.register(expressionBuilder: expr) + + let root1 = CodeNode(type: PythonLanguage.Element.root, value: "") + _ = parser.parse("1 + 2", rootNode: root1) + XCTAssertEqual(root1.children.count, 1) + + parser.unregister(expressionBuilder: expr) + + let root2 = CodeNode(type: PythonLanguage.Element.root, value: "") + _ = parser.parse("1 + 2", rootNode: root2) + XCTAssertEqual(root2.children.count, 0) + } } From 09c5f0bcca36b57067ba9d6ec0fdca5ebc608d5a Mon Sep 17 00:00:00 2001 From: Dongyu Zhao Date: Tue, 15 Jul 2025 00:31:36 +0800 Subject: [PATCH 09/10] Expand Markdown support --- .../Languages/MarkdownLanguage.swift | 372 +++++++++++++++++- Tests/SwiftParserTests/SwiftParserTests.swift | 44 +++ 2 files changed, 412 insertions(+), 4 deletions(-) diff --git a/Sources/SwiftParser/Languages/MarkdownLanguage.swift b/Sources/SwiftParser/Languages/MarkdownLanguage.swift index 54da274..8a14f80 100644 --- a/Sources/SwiftParser/Languages/MarkdownLanguage.swift +++ b/Sources/SwiftParser/Languages/MarkdownLanguage.swift @@ -6,11 +6,29 @@ public struct MarkdownLanguage: CodeLanguage { case paragraph case heading case text + case listItem + case orderedListItem + case emphasis + case strong + case codeBlock + case inlineCode + case link } public enum Token: CodeToken { case text(String, Range) case hash(Range) + case dash(Range) + case star(Range) + case underscore(Range) + case plus(Range) + case backtick(Range) + case lbracket(Range) + case rbracket(Range) + case lparen(Range) + case rparen(Range) + case dot(Range) + case number(String, Range) case newline(Range) case eof(Range) @@ -18,6 +36,17 @@ public struct MarkdownLanguage: CodeLanguage { switch self { case .text: return "text" case .hash: return "#" + case .dash: return "-" + case .star: return "*" + case .underscore: return "_" + case .plus: return "+" + case .backtick: return "`" + case .lbracket: return "[" + case .rbracket: return "]" + case .lparen: return "(" + case .rparen: return ")" + case .dot: return "." + case .number: return "number" case .newline: return "newline" case .eof: return "eof" } @@ -27,6 +56,17 @@ public struct MarkdownLanguage: CodeLanguage { switch self { case .text(let s, _): return s case .hash: return "#" + case .dash: return "-" + case .star: return "*" + case .underscore: return "_" + case .plus: return "+" + case .backtick: return "`" + case .lbracket: return "[" + case .rbracket: return "]" + case .lparen: return "(" + case .rparen: return ")" + case .dot: return "." + case .number(let s, _): return s case .newline: return "\n" case .eof: return "" } @@ -34,7 +74,9 @@ public struct MarkdownLanguage: CodeLanguage { public var range: Range { switch self { - case .text(_, let r), .hash(let r), .newline(let r), .eof(let r): + case .text(_, let r), .hash(let r), .dash(let r), .star(let r), .underscore(let r), + .plus(let r), .backtick(let r), .lbracket(let r), .rbracket(let r), + .lparen(let r), .rparen(let r), .dot(let r), .number(_, let r), .newline(let r), .eof(let r): return r } } @@ -54,13 +96,61 @@ public struct MarkdownLanguage: CodeLanguage { let start = index advance() add(.hash(start.. Bool { + guard let tok = token as? Token else { return false } + switch tok { + case .dash, .star, .plus: + if context.index + 1 < context.tokens.count, + let next = context.tokens[context.index + 1] as? Token, + case .text(let s, _) = next, + s.first?.isWhitespace == true { + if context.index == 0 { return true } + if let prev = context.tokens[context.index - 1] as? Token, case .newline = prev { + return true + } + } + default: + break + } + return false + } + public func build(context: inout CodeContext) { + context.index += 1 // skip bullet + var text = "" + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + switch tok { + case .newline: + context.index += 1 + let node = CodeNode(type: Element.listItem, value: text.trimmingCharacters(in: .whitespaces)) + context.currentNode.addChild(node) + return + case .eof: + let node = CodeNode(type: Element.listItem, value: text.trimmingCharacters(in: .whitespaces)) + context.currentNode.addChild(node) + context.index += 1 + return + default: + text += tok.text + context.index += 1 + } + } else { context.index += 1 } + } + } + } + + public class OrderedListItemBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard let tok = token as? Token else { return false } + if case .number = tok { + if context.index + 1 < context.tokens.count, + let dot = context.tokens[context.index + 1] as? Token, + case .dot = dot { + if context.index == 0 { return true } + if let prev = context.tokens[context.index - 1] as? Token, case .newline = prev { + return true + } + } + } + return false + } + public func build(context: inout CodeContext) { + context.index += 2 // skip number and '.' + var text = "" + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + switch tok { + case .newline: + context.index += 1 + let node = CodeNode(type: Element.orderedListItem, value: text.trimmingCharacters(in: .whitespaces)) + context.currentNode.addChild(node) + return + case .eof: + let node = CodeNode(type: Element.orderedListItem, value: text.trimmingCharacters(in: .whitespaces)) + context.currentNode.addChild(node) + context.index += 1 + return + default: + text += tok.text + context.index += 1 + } + } else { context.index += 1 } + } + } + } + + public class CodeBlockBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard context.index + 2 < context.tokens.count else { return false } + guard let t1 = token as? Token, + let t2 = context.tokens[context.index + 1] as? Token, + let t3 = context.tokens[context.index + 2] as? Token else { return false } + if case .backtick = t1, case .backtick = t2, case .backtick = t3 { + if context.index == 0 { return true } + if let prev = context.tokens[context.index - 1] as? Token, case .newline = prev { + return true + } + } + return false + } + public func build(context: inout CodeContext) { + context.index += 3 // skip opening ``` + var text = "" + while context.index + 2 < context.tokens.count { + if let t1 = context.tokens[context.index] as? Token, + let t2 = context.tokens[context.index + 1] as? Token, + let t3 = context.tokens[context.index + 2] as? Token, + case .backtick = t1, case .backtick = t2, case .backtick = t3 { + context.index += 3 + if let nl = context.tokens[context.index] as? Token, case .newline = nl { + context.index += 1 + } + let node = CodeNode(type: Element.codeBlock, value: text) + context.currentNode.addChild(node) + return + } else if let tok = context.tokens[context.index] as? Token { + text += tok.text + context.index += 1 + } else { context.index += 1 } + } + let node = CodeNode(type: Element.codeBlock, value: text) + context.currentNode.addChild(node) + } + } + + public class StrongBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard context.index + 1 < context.tokens.count else { return false } + guard let t1 = token as? Token, + let t2 = context.tokens[context.index + 1] as? Token else { return false } + switch (t1, t2) { + case (.star, .star), (.underscore, .underscore): + return true + default: + return false + } + } + public func build(context: inout CodeContext) { + guard let open = context.tokens[context.index] as? Token else { return } + context.index += 2 + var text = "" + while context.index + 1 < context.tokens.count { + if let t1 = context.tokens[context.index] as? Token, + let t2 = context.tokens[context.index + 1] as? Token, + (t1.kindDescription == open.kindDescription && t2.kindDescription == open.kindDescription) { + context.index += 2 + let node = CodeNode(type: Element.strong, value: text) + context.currentNode.addChild(node) + return + } else if let tok = context.tokens[context.index] as? Token { + text += tok.text + context.index += 1 + } else { context.index += 1 } + } + let node = CodeNode(type: Element.strong, value: text) + context.currentNode.addChild(node) + } + } + + public class EmphasisBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard let tok = token as? Token else { return false } + if case .star = tok { return true } + if case .underscore = tok { return true } + return false + } + public func build(context: inout CodeContext) { + guard let open = context.tokens[context.index] as? Token else { return } + context.index += 1 + var text = "" + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token, + tok.kindDescription == open.kindDescription { + context.index += 1 + let node = CodeNode(type: Element.emphasis, value: text) + context.currentNode.addChild(node) + return + } else if let tok = context.tokens[context.index] as? Token { + text += tok.text + context.index += 1 + } else { context.index += 1 } + } + let node = CodeNode(type: Element.emphasis, value: text) + context.currentNode.addChild(node) + } + } + + public class InlineCodeBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard let tok = token as? Token else { return false } + if case .backtick = tok { return true } + return false + } + public func build(context: inout CodeContext) { + context.index += 1 + var text = "" + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token, case .backtick = tok { + context.index += 1 + let node = CodeNode(type: Element.inlineCode, value: text) + context.currentNode.addChild(node) + return + } else if let tok = context.tokens[context.index] as? Token { + text += tok.text + context.index += 1 + } else { context.index += 1 } + } + let node = CodeNode(type: Element.inlineCode, value: text) + context.currentNode.addChild(node) + } + } + + public class LinkBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard let tok = token as? Token else { return false } + if case .lbracket = tok { return true } + return false + } + public func build(context: inout CodeContext) { + context.index += 1 + var text = "" + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + if case .rbracket = tok { + context.index += 1 + break + } else { + text += tok.text + context.index += 1 + } + } else { context.index += 1 } + } + var url = "" + if context.index < context.tokens.count, let lparen = context.tokens[context.index] as? Token, case .lparen = lparen { + context.index += 1 + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + if case .rparen = tok { + context.index += 1 + break + } else { + url += tok.text + context.index += 1 + } + } else { context.index += 1 } + } + } + let node = CodeNode(type: Element.link, value: text + "|" + url) + context.currentNode.addChild(node) + } + } + public class ParagraphBuilder: CodeElementBuilder { public init() {} public func accept(context: CodeContext, token: any CodeToken) -> Bool { @@ -111,15 +458,30 @@ public struct MarkdownLanguage: CodeLanguage { let node = CodeNode(type: Element.paragraph, value: text) context.currentNode.addChild(node) return - case .hash: + case .dash, .hash, .star, .underscore, .plus, .backtick, .lbracket: let node = CodeNode(type: Element.paragraph, value: text) context.currentNode.addChild(node) return + case .number: + if context.index + 1 < context.tokens.count, + let dot = context.tokens[context.index + 1] as? Token, + case .dot = dot { + let node = CodeNode(type: Element.paragraph, value: text) + context.currentNode.addChild(node) + return + } else { + text += tok.text + context.index += 1 + } case .eof: let node = CodeNode(type: Element.paragraph, value: text) context.currentNode.addChild(node) context.index += 1 return + case .dot, .rbracket, .lparen, .rparen: + // treat as text for now + text += tok.text + context.index += 1 } } else { context.index += 1 } } @@ -127,7 +489,9 @@ public struct MarkdownLanguage: CodeLanguage { } public var tokenizer: CodeTokenizer { Tokenizer() } - public var builders: [CodeElementBuilder] { [HeadingBuilder(), ParagraphBuilder()] } + public var builders: [CodeElementBuilder] { + [HeadingBuilder(), CodeBlockBuilder(), OrderedListItemBuilder(), ListItemBuilder(), LinkBuilder(), StrongBuilder(), EmphasisBuilder(), InlineCodeBuilder(), ParagraphBuilder()] + } public var expressionBuilders: [CodeExpressionBuilder] { [] } public var rootElement: any CodeElement { Element.root } public init() {} diff --git a/Tests/SwiftParserTests/SwiftParserTests.swift b/Tests/SwiftParserTests/SwiftParserTests.swift index d72b1f3..98b999d 100644 --- a/Tests/SwiftParserTests/SwiftParserTests.swift +++ b/Tests/SwiftParserTests/SwiftParserTests.swift @@ -24,6 +24,50 @@ final class SwiftParserTests: XCTestCase { XCTAssertEqual(result.root.children.count, 2) } + func testMarkdownListItem() { + let parser = SwiftParser() + let source = "- item1\n- item2" + let result = parser.parse(source, language: MarkdownLanguage()) + XCTAssertEqual(result.errors.count, 0) + XCTAssertEqual(result.root.children.count, 2) + XCTAssertEqual(result.root.children.first?.type as? MarkdownLanguage.Element, .listItem) + } + + func testMarkdownOrderedList() { + let parser = SwiftParser() + let source = "1. first\n2. second" + let result = parser.parse(source, language: MarkdownLanguage()) + XCTAssertEqual(result.errors.count, 0) + XCTAssertEqual(result.root.children.first?.type as? MarkdownLanguage.Element, .orderedListItem) + } + + func testMarkdownEmphasisAndStrong() { + let parser = SwiftParser() + let source = "*em* **strong**" + let result = parser.parse(source, language: MarkdownLanguage()) + XCTAssertEqual(result.errors.count, 0) + XCTAssertEqual(result.root.children.count, 3) + XCTAssertEqual(result.root.children[0].type as? MarkdownLanguage.Element, .emphasis) + XCTAssertEqual(result.root.children[2].type as? MarkdownLanguage.Element, .strong) + } + + func testMarkdownCodeBlockAndInline() { + let parser = SwiftParser() + let source = "```\ncode\n```\ninline `code`" + let result = parser.parse(source, language: MarkdownLanguage()) + XCTAssertEqual(result.errors.count, 0) + XCTAssertEqual(result.root.children.first?.type as? MarkdownLanguage.Element, .codeBlock) + XCTAssertEqual(result.root.children.last?.type as? MarkdownLanguage.Element, .inlineCode) + } + + func testMarkdownLink() { + let parser = SwiftParser() + let source = "[title](url)" + let result = parser.parse(source, language: MarkdownLanguage()) + XCTAssertEqual(result.errors.count, 0) + XCTAssertEqual(result.root.children.first?.type as? MarkdownLanguage.Element, .link) + } + func testPrattExpression() { let parser = SwiftParser() let source = "x = 1 + 2 * 3" From 30d596a70be2ab5fdb25e15853229b9e4ec05591 Mon Sep 17 00:00:00 2001 From: Dongyu Zhao Date: Tue, 15 Jul 2025 00:53:02 +0800 Subject: [PATCH 10/10] Add additional Markdown features --- .../Languages/MarkdownLanguage.swift | 375 +++++++++++++++++- Tests/SwiftParserTests/SwiftParserTests.swift | 16 + 2 files changed, 386 insertions(+), 5 deletions(-) diff --git a/Sources/SwiftParser/Languages/MarkdownLanguage.swift b/Sources/SwiftParser/Languages/MarkdownLanguage.swift index 8a14f80..f259250 100644 --- a/Sources/SwiftParser/Languages/MarkdownLanguage.swift +++ b/Sources/SwiftParser/Languages/MarkdownLanguage.swift @@ -13,6 +13,14 @@ public struct MarkdownLanguage: CodeLanguage { case codeBlock case inlineCode case link + case blockQuote + case thematicBreak + case image + case html + case entity + case strikethrough + case table + case autoLink } public enum Token: CodeToken { @@ -23,6 +31,14 @@ public struct MarkdownLanguage: CodeLanguage { case underscore(Range) case plus(Range) case backtick(Range) + case greaterThan(Range) + case exclamation(Range) + case tilde(Range) + case equal(Range) + case lessThan(Range) + case ampersand(Range) + case semicolon(Range) + case pipe(Range) case lbracket(Range) case rbracket(Range) case lparen(Range) @@ -41,6 +57,14 @@ public struct MarkdownLanguage: CodeLanguage { case .underscore: return "_" case .plus: return "+" case .backtick: return "`" + case .greaterThan: return ">" + case .exclamation: return "!" + case .tilde: return "~" + case .equal: return "=" + case .lessThan: return "<" + case .ampersand: return "&" + case .semicolon: return ";" + case .pipe: return "|" case .lbracket: return "[" case .rbracket: return "]" case .lparen: return "(" @@ -61,6 +85,14 @@ public struct MarkdownLanguage: CodeLanguage { case .underscore: return "_" case .plus: return "+" case .backtick: return "`" + case .greaterThan: return ">" + case .exclamation: return "!" + case .tilde: return "~" + case .equal: return "=" + case .lessThan: return "<" + case .ampersand: return "&" + case .semicolon: return ";" + case .pipe: return "|" case .lbracket: return "[" case .rbracket: return "]" case .lparen: return "(" @@ -75,8 +107,10 @@ public struct MarkdownLanguage: CodeLanguage { public var range: Range { switch self { case .text(_, let r), .hash(let r), .dash(let r), .star(let r), .underscore(let r), - .plus(let r), .backtick(let r), .lbracket(let r), .rbracket(let r), - .lparen(let r), .rparen(let r), .dot(let r), .number(_, let r), .newline(let r), .eof(let r): + .plus(let r), .backtick(let r), .greaterThan(let r), .exclamation(let r), .tilde(let r), + .equal(let r), .lessThan(let r), .ampersand(let r), .semicolon(let r), .pipe(let r), + .lbracket(let r), .rbracket(let r), .lparen(let r), .rparen(let r), .dot(let r), + .number(_, let r), .newline(let r), .eof(let r): return r } } @@ -116,6 +150,38 @@ public struct MarkdownLanguage: CodeLanguage { let start = index advance() add(.backtick(start.." { + let start = index + advance() + add(.greaterThan(start..!~|;&=".contains(input[index]) && !input[index].isNumber { advance() } @@ -309,6 +375,304 @@ public struct MarkdownLanguage: CodeLanguage { } } + public class BlockQuoteBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard let tok = token as? Token else { return false } + if case .greaterThan = tok { + if context.index == 0 { return true } + if let prev = context.tokens[context.index - 1] as? Token, case .newline = prev { return true } + } + return false + } + public func build(context: inout CodeContext) { + context.index += 1 // skip '>' + var text = "" + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + switch tok { + case .newline: + context.index += 1 + let node = CodeNode(type: Element.blockQuote, value: text.trimmingCharacters(in: .whitespaces)) + context.currentNode.addChild(node) + return + case .eof: + let node = CodeNode(type: Element.blockQuote, value: text.trimmingCharacters(in: .whitespaces)) + context.currentNode.addChild(node) + context.index += 1 + return + default: + text += tok.text + context.index += 1 + } + } else { context.index += 1 } + } + } + } + + public class IndentedCodeBlockBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard let tok = token as? Token else { return false } + if case .text(let s, _) = tok { + if (context.index == 0 || (context.tokens[context.index - 1] as? Token)?.kindDescription == "newline") && s.hasPrefix(" ") { + return true + } + } + return false + } + public func build(context: inout CodeContext) { + var text = "" + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + switch tok { + case .newline: + context.index += 1 + if context.index < context.tokens.count, let next = context.tokens[context.index] as? Token, case .text(let s, _) = next, s.hasPrefix(" ") { + text += "\n" + String(s.dropFirst(4)) + context.index += 1 + } else { + context.currentNode.addChild(CodeNode(type: Element.codeBlock, value: text)) + return + } + case .text(let s, _): + text += String(s.dropFirst(4)) + context.index += 1 + default: + text += tok.text + context.index += 1 + } + } else { context.index += 1 } + } + context.currentNode.addChild(CodeNode(type: Element.codeBlock, value: text)) + } + } + + public class ThematicBreakBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard let tok = token as? Token else { return false } + switch tok { + case .dash, .star, .underscore: + if context.index == 0 || (context.index > 0 && (context.tokens[context.index - 1] as? Token) is Token && (context.tokens[context.index - 1] as? Token)?.kindDescription == "newline") { + var count = 0 + var idx = context.index + while idx < context.tokens.count, let t = context.tokens[idx] as? Token, t.kindDescription == tok.kindDescription { + count += 1; idx += 1 + } + if count >= 3 { + return true + } + } + default: + break + } + return false + } + public func build(context: inout CodeContext) { + if let tok = context.tokens[context.index] as? Token { + let kind = tok.kindDescription + while context.index < context.tokens.count { + if let t = context.tokens[context.index] as? Token, t.kindDescription == kind { + context.index += 1 + } else { + break + } + } + } + if let nl = context.tokens[context.index] as? Token, case .newline = nl { context.index += 1 } + context.currentNode.addChild(CodeNode(type: Element.thematicBreak, value: "")) + } + } + + public class ImageBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard let tok = token as? Token else { return false } + if case .exclamation = tok, + context.index + 1 < context.tokens.count, + let next = context.tokens[context.index + 1] as? Token, + case .lbracket = next { return true } + return false + } + public func build(context: inout CodeContext) { + context.index += 2 // skip ![ + var alt = "" + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + if case .rbracket = tok { context.index += 1; break } + else { alt += tok.text; context.index += 1 } + } else { context.index += 1 } + } + var url = "" + if context.index < context.tokens.count, let lp = context.tokens[context.index] as? Token, case .lparen = lp { + context.index += 1 + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + if case .rparen = tok { context.index += 1; break } + else { url += tok.text; context.index += 1 } + } else { context.index += 1 } + } + } + context.currentNode.addChild(CodeNode(type: Element.image, value: alt + "|" + url)) + } + } + + public class HTMLBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard let tok = token as? Token else { return false } + return tok.kindDescription == "<" + } + public func build(context: inout CodeContext) { + context.index += 1 // skip < + var text = "" + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + if case .greaterThan = tok { context.index += 1; break } + else { text += tok.text; context.index += 1 } + } else { context.index += 1 } + } + context.currentNode.addChild(CodeNode(type: Element.html, value: text)) + } + } + + public class EntityBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard let tok = token as? Token else { return false } + if case .ampersand = tok { return true } + return false + } + public func build(context: inout CodeContext) { + context.index += 1 + var text = "" + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + if case .semicolon = tok { context.index += 1; break } + else { text += tok.text; context.index += 1 } + } else { context.index += 1 } + } + context.currentNode.addChild(CodeNode(type: Element.entity, value: text)) + } + } + + public class StrikethroughBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard context.index + 1 < context.tokens.count else { return false } + guard let t1 = token as? Token, let t2 = context.tokens[context.index + 1] as? Token else { return false } + return t1.kindDescription == "~" && t2.kindDescription == "~" + } + public func build(context: inout CodeContext) { + context.index += 2 + var text = "" + while context.index + 1 < context.tokens.count { + if let t1 = context.tokens[context.index] as? Token, + let t2 = context.tokens[context.index + 1] as? Token, + t1.kindDescription == "~" && t2.kindDescription == "~" { + context.index += 2 + context.currentNode.addChild(CodeNode(type: Element.strikethrough, value: text)) + return + } else if let tok = context.tokens[context.index] as? Token { + text += tok.text + context.index += 1 + } else { context.index += 1 } + } + context.currentNode.addChild(CodeNode(type: Element.strikethrough, value: text)) + } + } + + public class AutoLinkBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard let tok = token as? Token else { return false } + if case .lessThan = tok { return true } + return false + } + public func build(context: inout CodeContext) { + context.index += 1 + var text = "" + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + if case .greaterThan = tok { context.index += 1; break } + else { text += tok.text; context.index += 1 } + } else { context.index += 1 } + } + context.currentNode.addChild(CodeNode(type: Element.autoLink, value: text)) + } + } + + public class TableBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard let tok = token as? Token else { return false } + if case .pipe = tok { + if context.index == 0 { return true } + if let prev = context.tokens[context.index - 1] as? Token, case .newline = prev { return true } + } + return false + } + public func build(context: inout CodeContext) { + var cells: [String] = [] + var cell = "" + context.index += 1 // skip first pipe + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + switch tok { + case .pipe: + cells.append(cell.trimmingCharacters(in: .whitespaces)) + cell = "" + context.index += 1 + case .newline: + cells.append(cell.trimmingCharacters(in: .whitespaces)) + context.index += 1 + context.currentNode.addChild(CodeNode(type: Element.table, value: cells.joined(separator: "|"))) + return + case .eof: + cells.append(cell.trimmingCharacters(in: .whitespaces)) + context.index += 1 + context.currentNode.addChild(CodeNode(type: Element.table, value: cells.joined(separator: "|"))) + return + default: + cell += tok.text + context.index += 1 + } + } else { context.index += 1 } + } + } + } + + public class FootnoteBuilder: CodeElementBuilder { + public init() {} + public func accept(context: CodeContext, token: any CodeToken) -> Bool { + guard context.index + 3 < context.tokens.count else { return false } + guard let lb = token as? Token, + let txt = context.tokens[context.index + 1] as? Token, + let rb = context.tokens[context.index + 2] as? Token else { return false } + if case .lbracket = lb, + case .text(let s, _) = txt, s.starts(with: "^") , + case .rbracket = rb { + return true + } + return false + } + public func build(context: inout CodeContext) { + context.index += 3 // skip [^x] + if context.index < context.tokens.count, let colon = context.tokens[context.index] as? Token, case .text(let s, _) = colon, s.trimmingCharacters(in: .whitespaces).hasPrefix(":") { + var text = s + context.index += 1 + while context.index < context.tokens.count { + if let tok = context.tokens[context.index] as? Token { + if case .newline = tok { context.index += 1; break } + else { text += tok.text; context.index += 1 } + } else { context.index += 1 } + } + context.currentNode.addChild(CodeNode(type: Element.text, value: text.trimmingCharacters(in: .whitespaces))) + } + } + } + public class StrongBuilder: CodeElementBuilder { public init() {} public func accept(context: CodeContext, token: any CodeToken) -> Bool { @@ -458,7 +822,8 @@ public struct MarkdownLanguage: CodeLanguage { let node = CodeNode(type: Element.paragraph, value: text) context.currentNode.addChild(node) return - case .dash, .hash, .star, .underscore, .plus, .backtick, .lbracket: + case .dash, .hash, .star, .underscore, .plus, .backtick, .lbracket, + .greaterThan, .exclamation, .tilde, .equal, .lessThan, .ampersand, .semicolon, .pipe: let node = CodeNode(type: Element.paragraph, value: text) context.currentNode.addChild(node) return @@ -490,7 +855,7 @@ public struct MarkdownLanguage: CodeLanguage { public var tokenizer: CodeTokenizer { Tokenizer() } public var builders: [CodeElementBuilder] { - [HeadingBuilder(), CodeBlockBuilder(), OrderedListItemBuilder(), ListItemBuilder(), LinkBuilder(), StrongBuilder(), EmphasisBuilder(), InlineCodeBuilder(), ParagraphBuilder()] + [HeadingBuilder(), CodeBlockBuilder(), IndentedCodeBlockBuilder(), BlockQuoteBuilder(), ThematicBreakBuilder(), OrderedListItemBuilder(), ListItemBuilder(), ImageBuilder(), HTMLBuilder(), EntityBuilder(), StrikethroughBuilder(), AutoLinkBuilder(), TableBuilder(), FootnoteBuilder(), LinkBuilder(), StrongBuilder(), EmphasisBuilder(), InlineCodeBuilder(), ParagraphBuilder()] } public var expressionBuilders: [CodeExpressionBuilder] { [] } public var rootElement: any CodeElement { Element.root } diff --git a/Tests/SwiftParserTests/SwiftParserTests.swift b/Tests/SwiftParserTests/SwiftParserTests.swift index 98b999d..0640b74 100644 --- a/Tests/SwiftParserTests/SwiftParserTests.swift +++ b/Tests/SwiftParserTests/SwiftParserTests.swift @@ -68,6 +68,22 @@ final class SwiftParserTests: XCTestCase { XCTAssertEqual(result.root.children.first?.type as? MarkdownLanguage.Element, .link) } + func testMarkdownBlockQuote() { + let parser = SwiftParser() + let source = "> quote" + let result = parser.parse(source, language: MarkdownLanguage()) + XCTAssertEqual(result.errors.count, 0) + XCTAssertEqual(result.root.children.first?.type as? MarkdownLanguage.Element, .blockQuote) + } + + func testMarkdownImage() { + let parser = SwiftParser() + let source = "![alt](url)" + let result = parser.parse(source, language: MarkdownLanguage()) + XCTAssertEqual(result.errors.count, 0) + XCTAssertEqual(result.root.children.first?.type as? MarkdownLanguage.Element, .image) + } + func testPrattExpression() { let parser = SwiftParser() let source = "x = 1 + 2 * 3"