Skip to content

Draft: #2298/change yaml parser to a maintained one #2392

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmd/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,5 @@ var completedSuccessfully = false
var forceExpression = ""

var expressionFile = ""

var yamlParser = ""
15 changes: 15 additions & 0 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,16 @@ yq -P -oy sample.json
logging.SetBackend(backend)
yqlib.InitExpressionParser()

// Handle YAML parser selection with validation
switch yamlParser {
case "goccy", "":
yqlib.ConfiguredYamlPreferences.UseGoccyParser = true
case "legacy-v3":
yqlib.ConfiguredYamlPreferences.UseGoccyParser = false
default:
return fmt.Errorf("invalid yaml-parser value '%s'. Valid options are: 'goccy', 'legacy-v3'", yamlParser)
}

return nil
},
}
Expand Down Expand Up @@ -197,6 +207,11 @@ yq -P -oy sample.json
}
rootCmd.PersistentFlags().BoolVarP(&yqlib.ConfiguredYamlPreferences.LeadingContentPreProcessing, "header-preprocess", "", true, "Slurp any header comments and separators before processing expression.")

rootCmd.PersistentFlags().StringVar(&yamlParser, "yaml-parser", "legacy-v3", "YAML parser to use: 'goccy' (actively maintained) or 'legacy-v3' (default, legacy gopkg.in/yaml.v3)")
if err = rootCmd.RegisterFlagCompletionFunc("yaml-parser", cobra.FixedCompletions([]string{"goccy", "legacy-v3"}, cobra.ShellCompDirectiveNoFileComp)); err != nil {
panic(err)
}

rootCmd.PersistentFlags().StringVarP(&splitFileExp, "split-exp", "s", "", "print each result (or doc) into a file named (exp). [exp] argument must return a string. You can use $index in the expression as the result counter. The necessary directories will be created.")
if err = rootCmd.RegisterFlagCompletionFunc("split-exp", cobra.NoFileCompletions); err != nil {
panic(err)
Expand Down
94 changes: 94 additions & 0 deletions pkg/yqlib/candidate_node_goccy_yaml.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ func (o *CandidateNode) UnmarshalGoccyYAML(node ast.Node, cm yaml.CommentMap, an
// to solve the multiline > problem
o.Value = astLiteral.Value.Value
case ast.TagType:
// Recursively unmarshal the tagged value, then apply the tag to the CandidateNode.
if err := o.UnmarshalGoccyYAML(node.(*ast.TagNode).Value, cm, anchorMap); err != nil {
return err
}
Expand Down Expand Up @@ -208,3 +209,96 @@ func (o *CandidateNode) goccyProcessMappingValueNode(mappingEntry *ast.MappingVa

return nil
}

func (o *CandidateNode) MarshalGoccyYAML() (interface{}, error) {
log.Debug("MarshalGoccyYAML to goccy: %v", o.Tag)

switch o.Kind {
case AliasNode:
log.Debug("MarshalGoccyYAML - alias to goccy: %v", o.Tag)
// For goccy, we'll return the referenced value directly
// The goccy encoder will handle alias creation
if o.Alias != nil {
return o.Alias.MarshalGoccyYAML()
}
return o.Value, nil

case ScalarNode:
// Handle different scalar types based on tag for correct marshalling.
switch o.Tag {
case "!!int":
val, err := parseInt(o.Value)
if err == nil {
return val, nil
}

return nil, fmt.Errorf("cannot marshal node %s as int: %w", NodeToString(o), err)
case "!!float":
val, err := parseFloat(o.Value)
if err == nil {
return val, nil
}

return nil, fmt.Errorf("cannot marshal node %s as float: %w", NodeToString(o), err)
case "!!bool":
val, err := parseBool(o.Value)
if err == nil {
return val, nil
}

return nil, fmt.Errorf("cannot marshal node %s as bool: %w", NodeToString(o), err)
case "!!null":
// goccy/go-yaml expects a nil interface{} for null values.
return nil, nil
default:
// For standard strings (!!str) or unknown/custom tags, marshal as a string.
// The goccy encoder will handle quoting and style if it's a plain string.
// For custom tags, goccy prepends the tag if the value is a string.
return o.Value, nil
}

case MappingNode:
log.Debug("MarshalGoccyYAML - mapping: %v", NodeToString(o))
// Ensure even number of children for key-value pairs
if len(o.Content)%2 != 0 {
return nil, fmt.Errorf("mapping node at %s has an odd number of children (%d), malformed key-value pairs", NodeToString(o), len(o.Content))
}
result := make(map[string]interface{})

for i := 0; i < len(o.Content); i += 2 {
// No need to check i+1 >= len(o.Content) here due to the check above

keyNode := o.Content[i]
valueNode := o.Content[i+1]

key := keyNode.Value
if key == "" {
key = NodeToString(keyNode)
}

value, err := valueNode.MarshalGoccyYAML()
if err != nil {
return nil, err
}

result[key] = value
}
return result, nil

case SequenceNode:
log.Debug("MarshalGoccyYAML - sequence: %v", NodeToString(o))
result := make([]interface{}, len(o.Content))

for i, childNode := range o.Content {
value, err := childNode.MarshalGoccyYAML()
if err != nil {
return nil, err
}
result[i] = value
}
return result, nil
}

// Default case
return o.Value, nil
}
191 changes: 191 additions & 0 deletions pkg/yqlib/datetime_preprocessor.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
package yqlib

import (
"regexp"
"strings"
"time"
)

// DateTimePreprocessor handles automatic timestamp tagging for Goccy parser compatibility
type DateTimePreprocessor struct {
enabled bool
}

// NewDateTimePreprocessor creates a new datetime preprocessor
func NewDateTimePreprocessor(enabled bool) *DateTimePreprocessor {
return &DateTimePreprocessor{enabled: enabled}
}

// ISO8601 date/time patterns that should be automatically tagged as timestamps
var dateTimePatterns = []*regexp.Regexp{
// RFC3339 / ISO8601 with timezone: 2006-01-02T15:04:05Z or 2006-01-02T15:04:05+07:00
regexp.MustCompile(`^\s*([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(?:\.[0-9]+)?(?:Z|[+-][0-9]{2}:[0-9]{2}))\s*$`),
// Date only: 2006-01-02
regexp.MustCompile(`^\s*([0-9]{4}-[0-9]{2}-[0-9]{2})\s*$`),
// RFC3339 without timezone: 2006-01-02T15:04:05
regexp.MustCompile(`^\s*([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(?:\.[0-9]+)?)\s*$`),
}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would move this aside the function that use it

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed


// isValidDateTime checks if a string represents a valid ISO8601/RFC3339 datetime
func isValidDateTime(value string) bool {
// Try to parse with RFC3339 first
if _, err := time.Parse(time.RFC3339, value); err == nil {
return true
}

// Try to parse date-only format
if _, err := time.Parse("2006-01-02", value); err == nil {
return true
}

// Try RFC3339 without timezone
if _, err := time.Parse("2006-01-02T15:04:05", value); err == nil {
return true
}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are constants in time package for this.

You can use usestdlibvars linter to identify them easily

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Surprisingly usestdlibvars did not show any errors, maybe my setup has a problem...

Anyway fixed (next commit)


return false
}

// PreprocessDocument automatically adds !!timestamp tags to ISO8601 datetime strings
// in YAML documents when using Goccy parser for consistent datetime arithmetic behaviour
func (dtp *DateTimePreprocessor) PreprocessDocument(yamlContent string) string {
if !dtp.enabled {
return yamlContent
}

lines := strings.Split(yamlContent, "\n")
var result strings.Builder

for i, line := range lines {
processed := dtp.processLine(line)
result.WriteString(processed)

// Add newline except for the last line (preserve original ending)
if i < len(lines)-1 {
result.WriteString("\n")
}
}

return result.String()
}

// processLine processes a single line of YAML, adding timestamp tags where appropriate
func (dtp *DateTimePreprocessor) processLine(line string) string {
// Skip lines that are comments, already have tags, or are part of multi-line constructs
if isSkippableLine(line) {
return line
}

// Look for key-value pairs: "key: value" or "- value"
trimLeft := strings.TrimLeft(line, " \t")
if strings.HasPrefix(trimLeft, "- ") {
// Handle array items first (before key-value pairs)
return dtp.processArrayItemLine(line)
} else if colonIndex := strings.Index(line, ":"); colonIndex != -1 {
// Handle map key-value pairs
return dtp.processKeyValueLine(line, colonIndex)
}

return line
}

// isSkippableLine checks if a line should be skipped from datetime preprocessing
func isSkippableLine(line string) bool {
trimmed := strings.TrimSpace(line)

// Skip empty lines, comments, directives, document separators
if trimmed == "" || strings.HasPrefix(trimmed, "#") ||
strings.HasPrefix(trimmed, "%") || strings.HasPrefix(trimmed, "---") ||
strings.HasPrefix(trimmed, "...") {
return true
}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest a switch with one case (or multiple) it would be more readable

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed

// Skip lines that already have explicit tags
if strings.Contains(line, "!!") {
return true
}

// Skip multi-line scalar indicators
if strings.HasSuffix(trimmed, "|") || strings.HasSuffix(trimmed, ">") ||
strings.HasSuffix(trimmed, "|-") || strings.HasSuffix(trimmed, ">-") ||
strings.HasSuffix(trimmed, "|+") || strings.HasSuffix(trimmed, ">+") {
return true
}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed


return false
}

// processKeyValueLine processes a line containing a key-value pair
func (dtp *DateTimePreprocessor) processKeyValueLine(line string, colonIndex int) string {
beforeColon := line[:colonIndex]
afterColon := line[colonIndex+1:]

// Check if the value part looks like a datetime
trimmedValue := strings.TrimSpace(afterColon)

// Skip if value is empty, quoted, or already complex
if trimmedValue == "" ||
strings.HasPrefix(trimmedValue, "\"") ||
strings.HasPrefix(trimmedValue, "'") ||
strings.HasPrefix(trimmedValue, "{") ||
strings.HasPrefix(trimmedValue, "[") ||
strings.HasPrefix(trimmedValue, "&") ||
strings.HasPrefix(trimmedValue, "*") {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using a switch on trimmedValue[0] would be more readable

return line
}

// Check if it matches datetime patterns and is valid
if dtp.matchesDateTimePattern(trimmedValue) && isValidDateTime(trimmedValue) {
// Insert !!timestamp tag before the value
return beforeColon + ": !!timestamp " + trimmedValue
}

return line
}

// processArrayItemLine processes a line containing an array item
func (dtp *DateTimePreprocessor) processArrayItemLine(line string) string {
// Find the position of "- " after any leading whitespace
trimLeft := strings.TrimLeft(line, " \t")
if !strings.HasPrefix(trimLeft, "- ") {
return line
}

// Find the actual position of "- " in the original line
leadingWhitespace := line[:len(line)-len(trimLeft)]
dashIndex := len(leadingWhitespace)

beforeDash := line[:dashIndex+2] // Include leading whitespace and "- "
afterDash := line[dashIndex+2:]

trimmedValue := strings.TrimSpace(afterDash)

// Skip if value is empty, quoted, or already complex
if trimmedValue == "" ||
strings.HasPrefix(trimmedValue, "\"") ||
strings.HasPrefix(trimmedValue, "'") ||
strings.HasPrefix(trimmedValue, "{") ||
strings.HasPrefix(trimmedValue, "[") ||
strings.HasPrefix(trimmedValue, "&") ||
strings.HasPrefix(trimmedValue, "*") {
return line
}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed


// Check if it matches datetime patterns and is valid
if dtp.matchesDateTimePattern(trimmedValue) && isValidDateTime(trimmedValue) {
// Insert !!timestamp tag before the value
return beforeDash + "!!timestamp " + trimmedValue
}

return line
}

// matchesDateTimePattern checks if a value matches any of the datetime regex patterns
func (dtp *DateTimePreprocessor) matchesDateTimePattern(value string) bool {
for _, pattern := range dateTimePatterns {
if pattern.MatchString(value) {
return true
}
}
return false
}
Loading