/
parser_utils.clj
243 lines (199 loc) · 8.34 KB
/
parser_utils.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
(ns drake.parser_utils
(:require [name.choi.joshua.fnparse :as p]
[flatland.useful.datatypes :refer [assoc-record]]
[clojure.tools.logging :refer [warn debug trace]]
[slingshot.slingshot :refer [throw+]]))
;; The parsing state data structure. The remaining tokens are stored
;; in :remainder, and the current column and line are stored in their
;; respective fields. :vars is a map of additional key/value pairs that
;; can help keep state. As FnParse applies rules to this state and consumes
;; tokens, the state gets modified accordingly.
;;
;; This is a record for performance reasons: this map gets updated very often,
;; and copying the array-map is otherwise needlessly expensive
(defrecord State [remainder vars methods column line value-ignroed])
(defn make-state [remainder vars methods column line]
(->State remainder vars methods column line false))
(defn remainder-accessor [^State s]
(.remainder s))
(defn remainder-setter [^State s new-remainder]
(assoc-record s :remainder new-remainder))
;; rep+ and rep* return a vector of the products of the rules being repeated.
;; Even if a rule's product is nil, the nil would show up in the vector.
;; The two following rules are helpful for ignoring products from certain
;; rules.
(defn nil-semantics [subrule]
(p/constant-semantics subrule nil))
(defn semantic-rm-nil [subrule]
(p/semantics subrule
(fn [product] (remove #(nil? %) product))))
(def apply-str
(partial apply str))
(defn flatten-apply-str
[& args]
(apply str (flatten args)))
;; These functions are given a rule and make it so that it
;; increments the current column (or the current line).
;;non-line-breaks
(defn nb-char [subrule]
(p/invisi-conc subrule (p/update-info :column inc)))
(def nb-char-lit
(comp nb-char p/lit)) ; lit is a FnParse function that creates a literal
; rule.
;;line-breaks
(defn b-char [subrule]
(p/invisi-conc subrule (p/update-info :line inc)
(p/set-info :column 1))) ;; column is 1-based
;; parse errors
(defn throw-parse-error [state message & message-args]
(throw+ {:msg
(str (if (:file-path state) (str "In " (:file-path state) ", ") "")
(format "parse error at line %s, column %s: "
(:line state) (:column state))
(apply format message message-args))}))
(defn first-word [lit-array]
"Input is array of literals, usually the remaining tokens. It
identifies the first word, which can be used to present a more helpful
error message."
(if (or (nil? (first lit-array))
(re-matches #"\s" (str (first lit-array))))
nil
(str (first lit-array) (first-word (rest lit-array)))))
(defn expectation-error-fn [expectation]
(fn [remainder state]
(throw-parse-error state "%s expected where \"%s\" is"
expectation (or (first-word remainder) "EOF"))))
(defn illegal-syntax-error-fn [var-type]
(fn [remainder state]
(throw-parse-error state "illegal syntax starting with \"%s\" for %s"
(or (first-word remainder) "EOF") var-type)))
;; And here are where this parser's rules are defined.
(def string-delimiter
(nb-char-lit \"))
(def escape-indicator
(nb-char-lit \\))
(def false-lit
(p/constant-semantics (p/lit-conc-seq "false" nb-char-lit)
false))
(def true-lit
(p/constant-semantics (p/lit-conc-seq "true" nb-char-lit)
true))
(def null-lit
(p/constant-semantics (p/alt (p/lit-conc-seq "null" nb-char-lit)
(p/lit-conc-seq "nil" nb-char-lit))
nil))
(def keyword-lit (p/alt false-lit true-lit null-lit))
(def space (nb-char-lit \space))
(def tab (nb-char-lit \tab))
(def newline-lit (p/lit \newline))
(def return-lit (p/lit \return))
(def windows-newline-lit (p/conc return-lit newline-lit))
(def line-break (b-char (p/alt windows-newline-lit newline-lit return-lit)))
(def continuation (p/conc (p/lit \\) line-break))
(def ws (p/constant-semantics (p/rep+ (p/alt space tab line-break)) :ws))
(def inline-ws (p/constant-semantics (p/rep+ (p/alt space tab continuation)) :inline-ws))
(defn opt-inline-ws-wrap [rule]
(p/complex [_ (p/opt inline-ws)
prod rule
_ (p/opt inline-ws)]
prod))
(def non-line-break (p/except (nb-char p/anything) line-break))
(def zero-digit (nb-char-lit \0))
(def nonzero-decimal-digit (p/lit-alt-seq "123456789" nb-char-lit))
(def decimal-digit (p/alt zero-digit nonzero-decimal-digit))
(def letter
(p/lit-alt-seq (map char (concat (range (int \A) (inc (int \Z)))
(range (int \a) (inc (int \z)))))
nb-char-lit))
(def alphanumeric
(p/alt letter decimal-digit))
(def period (nb-char-lit \.))
(def comma (nb-char-lit \,))
(def underscore (nb-char-lit \_))
(def hyphen (nb-char-lit \-))
(def forward-slash (nb-char-lit \/))
(def colon (nb-char-lit \:))
(def semicolon (nb-char-lit \;))
(def exclamation-mark (nb-char-lit \!))
(def question-mark (nb-char-lit \?))
(def open-bracket (nb-char-lit \[))
(def close-bracket (nb-char-lit \]))
(def open-paren (nb-char-lit \())
(def close-paren (nb-char-lit \)))
(def minus-sign (nb-char-lit \-))
(def plus-sign (nb-char-lit \+))
(def decimal-point (nb-char-lit \.))
(def exponential-sign (p/lit-alt-seq "eE" nb-char-lit))
(def equal-sign (nb-char-lit \=))
(def lt-sign (nb-char-lit \<))
(def caret (nb-char-lit \^))
(def dollar-sign (nb-char-lit \$))
(def hashtag-sign (nb-char-lit \#))
(def percent-sign (nb-char-lit \%))
(def tilde (nb-char-lit \~))
(def single-quote (nb-char-lit \'))
(def non-single-quote (p/except p/anything single-quote))
(def double-quote (nb-char-lit \"))
(def backslash (nb-char-lit \\))
(def backquote (nb-char-lit \`))
(def double-quote-escaped-chars (p/conc backslash
(p/alt dollar-sign
backquote
double-quote
backslash
line-break)))
(def non-double-quote-or-backslash (p/except p/anything
(p/alt double-quote backslash)))
(defn delimited [delim body]
(p/conc delim body delim))
(defn ignore-delimiter [parser]
(p/semantics parser #(apply str (second %))))
(defn include-delimiter [parser]
(p/semantics parser #(apply str `(~(first %) ~@(second %) ~(last %)))))
(def single-quote-shell-string
(include-delimiter (delimited single-quote (p/rep* non-single-quote))))
(def fractional-part (p/conc decimal-point (p/rep* decimal-digit)))
(def exponential-part
(p/conc exponential-sign (p/opt (p/alt plus-sign minus-sign))
(p/failpoint (p/rep+ decimal-digit)
(expectation-error-fn
(str "in number literal, after an exponent sign, decimal"
"digit")))))
(def number-lit
(p/complex [minus (p/opt minus-sign)
above-one (p/alt zero-digit (p/rep+ nonzero-decimal-digit))
below-one (p/opt fractional-part)
power (p/opt exponential-part)]
(-> [minus above-one below-one power] flatten apply-str
Double/parseDouble
((if (or below-one power) identity int)))))
(def hexadecimal-digit
(p/alt decimal-digit (p/lit-alt-seq "ABCDEF" nb-char-lit)))
(def any-char
(p/alt line-break (nb-char p/anything)))
(def unescaped-char
(p/except any-char (p/alt escape-indicator string-delimiter)))
(def unicode-char-sequence
(p/complex [_ (nb-char-lit \u)
digits (p/factor= 4
(p/failpoint hexadecimal-digit
(expectation-error-fn "hexadecimal digit")))]
(-> digits apply-str (Integer/parseInt 16) char)))
(def escaped-characters
{\\ \\, \/ \/, \b \backspace, \f \formfeed, \n \newline, \r \return,
\t \tab, \" \", \' \', \$ \$, \) \)})
(def normal-escape-sequence
(p/semantics (p/lit-alt-seq (keys escaped-characters) nb-char-lit)
escaped-characters))
(def escape-sequence
(p/complex [_ escape-indicator
character (p/alt unicode-char-sequence
normal-escape-sequence)]
character))
(def string-char
(p/alt escape-sequence unescaped-char))
(def strong-quote
(p/complex [_ single-quote
chars (p/rep* (p/except p/anything single-quote))
_ single-quote]
(apply str chars)))