Skip to content

Commit 509864c

Browse files
authored
Merge pull request #101 from DongyuZhao/copilot/fix-da3a1cd3-7bcb-496f-ad74-3ceee87548b1
Fix soft line break processing in Markdown paragraph and blockquote parsing
2 parents fa91739 + 006a478 commit 509864c

26 files changed

+2641
-4334
lines changed
Lines changed: 12 additions & 120 deletions
Original file line number | Diff line number | Diff line change
@@ -1,51 +1,25 @@
11
import CodeParserCore
22
import Foundation
33

4-
/// Main construction state for Markdown language with line-based processing
4+
/// Minimal construction state for Markdown language
5+
/// Only contains state that cannot be derived from the AST (context.current)
56
public class MarkdownConstructState: CodeConstructState {
67
public typealias Node = MarkdownNodeElement
78
public typealias Token = MarkdownTokenElement
89

9-
// Current token index in the line
10-
public var position: Int = 0
11-
// Flag indicates if the block builders should run another round on the same line.
12-
public var refreshed: Bool = false
13-
// Flag indicates if the current line is being reprocessed after partial consumption
14-
public var isPartialLine: Bool = false
15-
16-
// Fenced code block state
17-
public var openFence: OpenFenceInfo?
18-
19-
// HTML block state
20-
public var openHTMLBlock: OpenHTMLBlockInfo?
21-
22-
/// Stack for nested list processing
23-
public var listStack: [ListNode] = []
24-
public var currentDefinitionList: DefinitionListNode?
25-
26-
/// Enhanced list context tracking for better indentation and nesting management
27-
public var listContextStack: [ListContextInfo] = []
28-
29-
/// Indicates the last consumed line break formed a blank line (two or more consecutive newlines)
30-
public var lastWasBlankLine: Bool = false
31-
32-
/// When a quoted blank line (`>\\n`) is seen inside a blockquote, the next quoted
33-
/// content should start a new paragraph inside the same blockquote instead of
34-
/// merging into the previous one.
35-
public var pendingBlockquoteParagraphSplit: Bool = false
36-
37-
/// True when the previous quoted line (inside a blockquote) began with a token
38-
/// that could start a block (e.g., `#`, `-`, `*`, `+`, number.). We use this to
39-
/// prevent merging the next quoted line into the same paragraph, matching CommonMark
40-
/// semantics where block-starting constructs introduce a new block.
41-
public var prevBlockquoteLineWasBlockStart: Bool = false
42-
4310
/// Reference link definitions storage for resolving reference links
4411
/// Key is normalized reference identifier (case-insensitive, whitespace collapsed)
12+
/// Note: This cannot be derived from AST since reference definitions may appear
13+
/// anywhere in the document and need to be available for link resolution
4514
public var referenceDefinitions: [String: (url: String, title: String)] = [:]
46-
47-
/// Pending reference link definition being parsed across multiple lines
48-
public var pendingReference: PendingReferenceDefinition?
15+
16+
/// Current line tokens being processed - builders can modify these
17+
/// This allows builders to consume their part and leave remaining tokens for further processing
18+
public var tokens: [any CodeToken<MarkdownTokenElement>] = []
19+
20+
/// Flag indicating if current line has been fully processed by a builder
21+
/// When false, MarkdownBlockBuilder should continue processing the remaining tokens
22+
public var currentLineProcessed: Bool = true
4923

5024
public init() {}
5125

@@ -72,85 +46,3 @@ public class MarkdownConstructState: CodeConstructState {
7246
.trimmingCharacters(in: .whitespacesAndNewlines)
7347
}
7448
}
75-
76-
/// Information about a pending reference link definition being parsed across multiple lines
77-
public struct PendingReferenceDefinition {
78-
public let identifier: String
79-
public let referenceNode: ReferenceNode
80-
public var hasDestination: Bool
81-
public var hasTitle: Bool
82-
public let originalLineTokens: [any CodeToken<MarkdownTokenElement>] // For fallback to paragraph
83-
84-
public init(identifier: String, referenceNode: ReferenceNode, originalLineTokens: [any CodeToken<MarkdownTokenElement>]) {
85-
self.identifier = identifier
86-
self.referenceNode = referenceNode
87-
self.hasDestination = false
88-
self.hasTitle = false
89-
self.originalLineTokens = originalLineTokens
90-
}
91-
}
92-
93-
/// Information about an open fenced code block
94-
public struct OpenFenceInfo {
95-
public let character: String
96-
public let length: Int
97-
public let indentation: Int
98-
public let codeBlock: CodeBlockNode
99-
100-
public init(character: String, length: Int, indentation: Int, codeBlock: CodeBlockNode) {
101-
self.character = character
102-
self.length = length
103-
self.indentation = indentation
104-
self.codeBlock = codeBlock
105-
}
106-
}
107-
108-
/// Information about an open HTML block
109-
public struct OpenHTMLBlockInfo {
110-
public let type: Int // HTML block type (1-7)
111-
public let endCondition: String? // What string ends this block
112-
public let htmlBlock: HTMLBlockNode
113-
114-
public init(type: Int, endCondition: String?, htmlBlock: HTMLBlockNode) {
115-
self.type = type
116-
self.endCondition = endCondition
117-
self.htmlBlock = htmlBlock
118-
}
119-
}
120-
121-
/// Information about detected HTML block type
122-
public struct HTMLBlockTypeInfo {
123-
public let type: Int
124-
public let name: String
125-
public let closedOnSameLine: Bool
126-
public let endCondition: String?
127-
128-
public init(type: Int, name: String, closedOnSameLine: Bool, endCondition: String? = nil) {
129-
self.type = type
130-
self.name = name
131-
self.closedOnSameLine = closedOnSameLine
132-
self.endCondition = endCondition
133-
}
134-
}
135-
136-
/// Enhanced list context information for better nesting and indentation management
137-
public struct ListContextInfo {
138-
/// The list node itself
139-
public let list: ListNode
140-
/// The parent list item that contains this list (nil for top-level lists)
141-
public let parentListItem: ListItemNode?
142-
/// The calculated indentation level for content in this list context
143-
public let contentIndent: Int
144-
/// The nesting level (1 for top-level, 2 for first nested, etc.)
145-
public let level: Int
146-
/// The marker type for compatibility checking
147-
public let markerType: String
148-
149-
public init(list: ListNode, parentListItem: ListItemNode?, contentIndent: Int, level: Int, markerType: String) {
150-
self.list = list
151-
self.parentListItem = parentListItem
152-
self.contentIndent = contentIndent
153-
self.level = level
154-
self.markerType = markerType
155-
}
156-
}

Sources/CodeParserCollection/Markdown/MarkdownNodes.swift

Lines changed: 35 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -110,13 +110,16 @@ public class DocumentNode: MarkdownNodeBase {
110110
}
111111

112112
// MARK: - Block Elements
113-
public class ParagraphNode: MarkdownNodeBase {
113+
public class ParagraphNode: MarkdownNodeBase, MarkdownBlockNode {
114+
public var blockType: String { "paragraph" }
115+
114116
public init(range: Range<String.Index>) {
115117
super.init(element: .paragraph)
116118
}
117119
}
118120

119-
public class HeaderNode: MarkdownNodeBase {
121+
public class HeaderNode: MarkdownNodeBase, MarkdownBlockNode {
122+
public var blockType: String { "heading" }
120123
public var level: Int
121124

122125
public init(level: Int) {
@@ -130,7 +133,8 @@ public class HeaderNode: MarkdownNodeBase {
130133
}
131134
}
132135

133-
public class ThematicBreakNode: MarkdownNodeBase {
136+
public class ThematicBreakNode: MarkdownNodeBase, MarkdownBlockNode {
137+
public var blockType: String { "thematic_break" }
134138
public var marker: String
135139

136140
public init(marker: String = "---") {
@@ -144,8 +148,14 @@ public class ThematicBreakNode: MarkdownNodeBase {
144148
}
145149
}
146150

147-
public class BlockquoteNode: MarkdownNodeBase {
151+
public class BlockquoteNode: MarkdownNodeBase, MarkdownBlockNode {
152+
public var blockType: String { "blockquote" }
148153
public var level: Int
154+
155+
// Package-level indentation properties for nested block parsing
156+
package var indent: Int = 0 // Number of spaces before the '>' marker
157+
package var markerColumn: Int = 0 // Column position of the '>' marker
158+
package var contentColumn: Int = 0 // Column position where content starts after '> '
149159

150160
public init(level: Int = 1) {
151161
self.level = level
@@ -198,11 +208,17 @@ public class UnorderedListNode: ListNode {
198208
}
199209
}
200210

201-
public class ListItemNode: MarkdownNodeBase {
211+
public class ListItemNode: MarkdownNodeBase, MarkdownBlockNode {
212+
public var blockType: String { "list_item" }
202213
public var marker: String
203214
// indentation before marker and content indent column for continuation
204215
public var markerIndent: Int = 0
205216
public var contentIndent: Int = 0
217+
218+
// Package-level properties for enhanced nested block parsing
219+
package var markerColumn: Int = 0 // Exact column position of the marker
220+
package var contentColumn: Int = 0 // Exact column position where content starts
221+
package var markerLength: Int = 0 // Length of the marker (e.g., "1." = 2, "-" = 1)
206222

207223
public init(marker: String) {
208224
self.marker = marker
@@ -215,9 +231,13 @@ public class ListItemNode: MarkdownNodeBase {
215231
}
216232
}
217233

218-
public class CodeBlockNode: MarkdownNodeBase {
234+
public class CodeBlockNode: MarkdownNodeBase, MarkdownBlockNode {
235+
public var blockType: String { "code_block" }
219236
public var language: String?
220237
public var source: String
238+
239+
// Package-level indentation properties for nested block parsing
240+
package var indent: Int = 0 // Number of spaces before the code block
221241

222242
public init(source: String, language: String? = nil) {
223243
self.language = language
@@ -635,3 +655,12 @@ public class ContentNode: MarkdownNodeBase {
635655
super.init(element: .content)
636656
}
637657
}
658+
659+
// MARK: - Type Aliases for Block Builders
660+
public typealias MarkdownHeading = HeaderNode
661+
public typealias MarkdownThematicBreak = ThematicBreakNode
662+
public typealias MarkdownText = TextNode
663+
public typealias MarkdownParagraph = ParagraphNode
664+
public typealias MarkdownBlockquote = BlockquoteNode
665+
public typealias MarkdownLineBreak = LineBreakNode
666+
public typealias MarkdownListItem = ListItemNode

0 commit comments

Comments
 (0)