forked from grindlemire/go-lucene
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.go
234 lines (197 loc) · 5.75 KB
/
parse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
package lucene
import (
"fmt"
"reflect"
"strconv"
"strings"
"github.com/AlxBystrov/go-lucene/internal/lex"
"github.com/AlxBystrov/go-lucene/pkg/lucene/expr"
"github.com/AlxBystrov/go-lucene/pkg/lucene/reduce"
)
// Parse will parse the input using a buffer and the shift reduce algorithm.
// It scales rather well since it is a one pass algorithm with no backtracking.
func Parse(input string) (e *expr.Expression, err error) {
	p := &parser{
		lex:          lex.Lex(input),
		stack:        []any{},
		nonTerminals: []lex.Token{{Typ: lex.TStart}},
	}

	parsed, err := p.parse()
	if err != nil {
		return e, err
	}

	// make sure the resulting expression is well formed before handing it back
	if err = expr.Validate(parsed); err != nil {
		return e, err
	}
	return parsed, nil
}
// parser holds the mutable state for a single shift-reduce parse.
type parser struct {
	lex          *lex.Lexer  // token source for the input being parsed
	stack        []any       // mixed stack of pending lex.Token values and parsed expressions
	nonTerminals []lex.Token // operator/bracket tokens not yet consumed by a reduction
}
// parse runs the shift-reduce loop until the input is accepted or an error
// occurs. Each iteration does exactly one of: accept the finished parse,
// shift the next token onto the stack, or reduce the top of the stack.
func (p *parser) parse() (e *expr.Expression, err error) {
	for {
		next := p.lex.Peek()

		// accept: input is exhausted and the stack has collapsed to one item.
		if p.shouldAccept(next) {
			if len(p.stack) != 1 {
				return e, fmt.Errorf("multiple expression left after parsing: %v", p.stack)
			}
			final, ok := p.stack[0].(*expr.Expression)
			if !ok {
				// Report the type of the actual stack item. Using
				// reflect.TypeOf(final) here was a bug: after a failed type
				// assertion `final` is a typed nil *expr.Expression, so the
				// message always claimed the type was *expr.Expression.
				return e, fmt.Errorf(
					"final parse didn't return an expression: %s [type: %s]",
					p.stack[0],
					reflect.TypeOf(p.stack[0]),
				)
			}
			return final, nil
		}

		if p.shouldShift(next) {
			tok := p.shift()
			if lex.IsTerminal(tok) {
				// if we have a terminal parse it and put it on the stack
				lit, err := parseLiteral(tok)
				if err != nil {
					return e, err
				}

				// we should always check if the current top of the stack is another token
				// if it isn't then we have an implicit AND we need to inject.
				if len(p.stack) > 0 {
					_, isTopToken := p.stack[len(p.stack)-1].(lex.Token)
					if !isTopToken {
						implAnd := lex.Token{Typ: lex.TAnd, Val: "AND"}
						// act as if we just saw an AND and check if we need to reduce the
						// current token stack first.
						if !p.shouldShift(implAnd) {
							err = p.reduce()
							if err != nil {
								return e, err
							}
						}
						// if we have a literal as the previous parsed thing then
						// we must be in an implicit AND and should reduce
						p.stack = append(p.stack, implAnd)
						p.nonTerminals = append(p.nonTerminals, implAnd)
					}
				}
				p.stack = append(p.stack, lit)
				continue
			}

			// otherwise just push the token on the stack
			p.stack = append(p.stack, tok)
			p.nonTerminals = append(p.nonTerminals, tok)
			continue
		}

		// neither accept nor shift applies, so reduce the stack
		err = p.reduce()
		if err != nil {
			return e, err
		}
	}
}
// shift consumes the next token from the lexer and returns it.
func (p *parser) shift() lex.Token {
	return p.lex.Next()
}
// shouldShift reports whether the parser should shift the upcoming token rather
// than reduce the current stack. This might end up in the grammar specific
// packages and implemented per grammar, but for now it lives at the top level.
func (p *parser) shouldShift(next lex.Token) bool {
	// never shift past the end of input or a lexing error
	switch next.Typ {
	case lex.TEOF, lex.TErr:
		return false
	}

	curr := p.nonTerminals[len(p.nonTerminals)-1]

	switch {
	case lex.IsTerminal(next):
		// terminal symbols are never matched by a rule, so always shift them
		return true
	case anyOpenBracket(curr, next):
		// an open grouping (current or next) always shifts
		return true
	case endingRangeSubExpr(next):
		// the closing bracket must be on the stack to reduce a range
		// subexpression, so shift it when we see it
		return true
	case anyClosingBracket(curr):
		// a completed subexpression must be reduced before moving on
		return false
	default:
		// shift if our current token has less precedence than the next token
		return lex.HasLessPrecedence(curr, next)
	}
}
// anyOpenBracket reports whether either token opens a grouping
// (parenthesis, square bracket, or curly brace).
func anyOpenBracket(curr, next lex.Token) bool {
	for _, tok := range [...]lex.Token{curr, next} {
		switch tok.Typ {
		case lex.TLParen, lex.TLSquare, lex.TLCurly:
			return true
		}
	}
	return false
}
// anyClosingBracket reports whether the token closes a grouping.
func anyClosingBracket(curr lex.Token) bool {
	switch curr.Typ {
	case lex.TRParen, lex.TRSquare, lex.TRCurly:
		return true
	default:
		return false
	}
}
// endingRangeSubExpr reports whether the token terminates a range
// subexpression (an inclusive `]` or exclusive `}` bound).
func endingRangeSubExpr(next lex.Token) bool {
	switch next.Typ {
	case lex.TRSquare, lex.TRCurly:
		return true
	}
	return false
}
// shouldAccept reports whether the parse is complete: the lexer has reached
// EOF and the stack has collapsed to a single item.
func (p *parser) shouldAccept(next lex.Token) bool {
	if next.Typ != lex.TEOF {
		return false
	}
	return len(p.stack) == 1
}
// reduce pops items off the stack one at a time into a working window
// (preserving their original order) and tries the grammar's reducers against
// it after each pop. On a successful reduction the reduced window is pushed
// back onto the stack.
func (p *parser) reduce() error {
	window := []any{}
	for {
		if len(p.stack) == 0 {
			return fmt.Errorf("error parsing, no items left to reduce, current state: %v", window)
		}

		// move the top of the stack to the front of the window so the
		// window keeps the original left-to-right ordering
		last := len(p.stack) - 1
		item := p.stack[last]
		p.stack = p.stack[:last]
		window = append([]any{item}, window...)

		// try to reduce with all our reducers
		var reduced bool
		window, p.nonTerminals, reduced = reduce.Reduce(window, p.nonTerminals)

		// consuming non terminals during the reduce means it succeeded;
		// push the result back onto the stack and stop
		if reduced {
			p.stack = append(p.stack, window...)
			return nil
		}
	}
}
// parseLiteral converts a terminal token into its literal expression form:
// quoted string, regexp, integer, float, wildcard, or plain string.
func parseLiteral(token lex.Token) (any, error) {
	switch token.Typ {
	case lex.TQuoted:
		// strip the double quotes from a quoted literal
		return expr.Lit(strings.ReplaceAll(token.Val, "\"", "")), nil
	case lex.TRegexp:
		return expr.REGEXP(token.Val), nil
	}

	// numeric literals: try integer first, then float
	if n, convErr := strconv.Atoi(token.Val); convErr == nil {
		return expr.Lit(n), nil
	}
	if f, convErr := strconv.ParseFloat(token.Val, 64); convErr == nil {
		return expr.Lit(f), nil
	}

	// anything containing * or ? is treated as a wildcard string
	// NOTE(review): this matches escaped wildcards too (e.g. `\*`) — confirm
	// that is intended by the lexer's escaping rules
	if strings.ContainsAny(token.Val, "*?") {
		return expr.WILD(token.Val), nil
	}

	// strip escape backslashes out of plain strings
	if strings.Contains(token.Val, `\`) {
		return expr.Lit(strings.ReplaceAll(token.Val, `\`, "")), nil
	}
	return expr.Lit(token.Val), nil
}