forked from stephens2424/php
-
Notifications
You must be signed in to change notification settings - Fork 1
/
lexer.go
211 lines (179 loc) · 4.37 KB
/
lexer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
package php
import (
"fmt"
"regexp"
"strings"
"unicode"
"unicode/utf8"
"github.com/stephens2424/php/token"
)
// Lexer represents the state of "lexing" items from a source string.
// The idea is derived from a Rob Pike talk:
// http://www.youtube.com/watch?v=HxaD_trXwRE
type lexer struct {
	// start stores the start position of the currently lexing token.
	start int
	// lastStart stores the start position of the previously lexed token.
	lastStart int
	// lastPos stores the position of the previous lexed element.
	lastPos int
	// pos is the current position of the lexer in the input, as an index
	// of the input string.
	pos int
	// line is the current line number; starts at 1 and is advanced by
	// incrementLines as newlines are consumed.
	line int
	// width is the length in bytes of the current rune
	width int
	items chan Item // channel of scanned items.
	// input is the full input string.
	input string
	// file is the filename of the input, used to print errors.
	file string
}
// newLexer constructs a lexer over input and starts its run loop in a
// separate goroutine; scanned items arrive on the lexer's items channel.
func newLexer(input string) *lexer {
	lx := &lexer{
		input: input,
		items: make(chan Item),
		line:  1,
	}
	go lx.run()
	return lx
}
// stateFn represents the state of the scanner
// as a function that returns the next state. A nil stateFn terminates
// the lexing loop (see run).
type stateFn func(*lexer) stateFn
// run drives the state machine: beginning with lexHTML, each state
// function is invoked and returns the next state until one returns nil,
// after which the items channel is closed to signal completion.
func (l *lexer) run() {
	state := stateFn(lexHTML)
	for state != nil {
		state = state(l)
	}
	// No more tokens will be delivered.
	close(l.items)
}
// emit sends the text between start and pos as an item of type t on the
// items channel, then moves start forward so lexing of the next token
// can begin.
func (l *lexer) emit(t token.Token) {
	item := Item{
		typ: t,
		pos: l.currentLocation(),
		val: l.input[l.start:l.pos],
	}
	l.incrementLines()
	l.lastPos = item.pos.Pos
	l.items <- item
	l.start = l.pos
}
// currentLocation reports the location (offset, line, file) of the token
// currently being lexed, anchored at its start position.
func (l *lexer) currentLocation() Location {
	loc := Location{
		Pos:  l.start,
		Line: l.line,
		File: l.file,
	}
	return loc
}
// nextItem returns the next item lexed from the input, blocking until
// one is available (or the zero Item once the items channel is closed).
func (l *lexer) nextItem() Item {
	// Receive directly: the previous version assigned the result to a
	// local variable named Item, shadowing the Item type inside this
	// function for no benefit.
	return <-l.items
}
// peek returns but does not consume the next rune in the input by
// reading it and immediately stepping back.
func (l *lexer) peek() rune {
	next := l.next()
	l.backup()
	return next
}
// backup steps back one rune. Can only be called once per call of next,
// since only the width of the most recently read rune is recorded.
func (l *lexer) backup() {
	l.pos -= l.width
}
// previous decodes and returns the rune at lastPos (the position recorded
// for the previously emitted item) without modifying lexer state.
// NOTE(review): despite the name, this yields the first rune of the
// previous item, not the rune immediately before pos — confirm intent
// against callers before renaming or changing it.
func (l *lexer) previous() rune {
	r, _ := utf8.DecodeRuneInString(l.input[l.lastPos:])
	return r
}
// accept consumes the next rune if it belongs to the valid set and
// reports whether it did; otherwise the rune is pushed back.
func (l *lexer) accept(valid string) bool {
	if !strings.ContainsRune(valid, l.next()) {
		l.backup()
		return false
	}
	return true
}
// acceptRun consumes a run of runes from the valid set, stopping at (and
// pushing back) the first rune outside it.
func (l *lexer) acceptRun(valid string) {
	for strings.ContainsRune(valid, l.next()) {
		// keep consuming
	}
	l.backup()
}
// next consumes and returns the next rune of the input, recording its
// byte width so backup can undo the read. It returns eof once the end of
// the input is reached.
func (l *lexer) next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return eof
	}
	var r rune
	r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += l.width
	return r
}
// ignore skips over the pending input before this point, keeping the
// line counter up to date for any newlines in the discarded text.
func (l *lexer) ignore() {
	l.start = l.pos
	l.incrementLines()
}
// skipSpace consumes a run of whitespace runes and discards them,
// leaving the lexer positioned at the first non-space rune.
func (l *lexer) skipSpace() {
	var r rune
	for r = l.next(); isSpace(r); r = l.next() {
	}
	l.backup()
	l.ignore()
}
// errorf emits an error item carrying the formatted message and returns
// a nil state function, which terminates the lexer's run loop.
func (l *lexer) errorf(format string, args ...interface{}) stateFn {
	// Location is captured before incrementLines so the error points at
	// the line where the offending token started.
	item := Item{token.Error, l.currentLocation(), fmt.Sprintf(format, args...)}
	l.incrementLines()
	l.items <- item
	return nil
}
// incrementLines advances the line counter by the number of newlines
// scanned since the last call, then records the new scan position.
func (l *lexer) incrementLines() {
	newlines := strings.Count(l.input[l.lastStart:l.pos], "\n")
	l.line += newlines
	l.lastStart = l.pos
}
// isSpace reports whether r is a space character as defined by Unicode's
// White_Space property (the same definition unicode.IsSpace uses).
func isSpace(r rune) bool {
	return unicode.Is(unicode.White_Space, r)
}
// isKeyword reports whether token i is registered in keywordMap and its
// textual form is not a purely symbolic operator.
func isKeyword(i token.Token, tokenString string) bool {
	if _, isKw := keywordMap[i]; !isKw {
		return false
	}
	return !isNonAlphaOperator(tokenString)
}
// nonalpha matches strings containing no ASCII letters or digits, i.e.
// purely symbolic operator spellings (it also matches the empty string).
// Compiled once at package initialization; assigning MustCompile directly
// to the package-level var removes the needless init function.
var nonalpha = regexp.MustCompile(`^[^a-zA-Z0-9]*$`)
// isNonAlphaOperator reports whether s consists entirely of
// non-alphanumeric characters, i.e. it spells a symbolic operator rather
// than a word.
func isNonAlphaOperator(s string) bool {
	matched := nonalpha.MatchString(s)
	return matched
}
// keywordMap lists all keywords that should be ignored as a prefix to a
// longer identifier.
var keywordMap = map[token.Token]bool{}

// Populate keywordMap with every token whose spelling starts with an
// ASCII letter.
func init() {
	alpha := regexp.MustCompile("^[a-zA-Z]+")
	for keyword, t := range token.TokenMap {
		if !alpha.MatchString(keyword) {
			continue
		}
		keywordMap[t] = true
	}
}
// Item represents a lexed item.
type Item struct {
	// typ identifies the kind of token this item holds.
	typ token.Token
	// pos is the location in the source where the item begins.
	pos Location
	// val is the literal text of the item.
	val string
}
// Location represents a position within a PHP file.
type Location struct {
	// Pos is the byte offset into the input string.
	Pos int
	// Line is the 1-based line number.
	Line int
	// File is the name of the source file.
	File string
}
// String renders a string representation of the item. EOF and error
// items receive special renderings; values longer than ten characters
// are truncated with an ellipsis.
func (i Item) String() string {
	if i.typ == token.EOF {
		return "EOF"
	}
	if i.typ == token.Error {
		return i.val
	}
	if len(i.val) > 10 {
		return fmt.Sprintf("%v:%.10q...", i.typ, i.val)
	}
	return fmt.Sprintf("%v:%q", i.typ, i.val)
}