haberman / gazelle

A system for creating fast, reusable parsers

This URL has Read+Write access

haberman (author)
Sat Feb 21 18:43:54 -0800 2009
commit  e3d2566a0f805cc67b44f1f065d57eb99b9ed141
tree    3886bc67dd20482d058aa425eebdb90772ee9b0e
parent  338a003e541f73d9c22741174362447f56654d02
gazelle / compiler / grammar.lua
100644 623 lines (524 sloc) 20.203 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
--[[--------------------------------------------------------------------
 
Gazelle: a system for building fast, reusable parsers
 
grammar.lua
 
This is the top-level data structure representing a Gazelle grammar.
 
It contains all DFAs for all levels of the grammar, as well as all
metadata about the grammar, like what the start symbol is.
 
It is created and initially populated by the grammar parser. The
parser adds RTNs for each rule and IntFAs for each terminal, as well
as metadata explicitly or implicitly contained in the source file.
 
It is annotated with lookahead information in the form of GLAs by
the LL lookahead calculation.
 
The IntFAs for each terminal are combined into the grammar's master
IntFAs by the IntFA-combining step.
 
Finally, it is written out to bytecode by the bytecode emitting step.
 
Copyright (c) 2008 Joshua Haberman. See LICENSE for details.
 
--------------------------------------------------------------------]]--
 
require "data_structures"
require "misc"
require "fa_algorithms"
require "intfa_combine"
 
define_class("TextOffset")
  function TextOffset:initialize(line, column, offset)
    self.line = line
    self.column = column
    self.offset = offset
  end
 
  function TextOffset:tostring()
    return string.format("line %d, column %d", self.line, self.column)
  end
-- class TextOffset
 
--[[--------------------------------------------------------------------
 
GrammarObj: a class for representing grammar objects (rules,
terminals, etc) that are seen or even just referenced in a Gazelle
grammar file. The symbol table maps key (names) to objects of this
type.
 
--------------------------------------------------------------------]]--
 
define_class("GrammarObj")
 
-- What types of grammar objects can exist in a grammar?
GrammarObj.RULE = "rule"
GrammarObj.TERMINAL = "terminal"
-- (more to come...)
 
  function GrammarObj:initialize(name)
    self.name = name
    self.definition = nil -- starts out undefined
    self.explicit = false
    self.type = nil
    self.expected_types = nil
    self.explicit_offset = nil
    self.implicit_offsets = {}
  end
 
  function GrammarObj:set_expected(types)
    types = Set:new(types)
    if not self.expected_types then
      self.expected_types = types
    else
      local intersection = self.expected_types:intersection(types)
      if intersection:isempty() then
        if self.definition then
          error(string.format("Symbol '%s' was previously defined as %s, but its " ..
                              "use here expects it to be %s.",
                              self.name, self:type_str(), self:expected_types_str()))
        else
          error(string.format("Symbol '%s' was previously referenced as a(n) %s, but " ..
                              "its use here expects it to be %s.",
                              self.name, self:expected_types_str(),
                              self:expected_types_str(types)))
        end
      end
    end
  end
 
  function GrammarObj:define(definition, _type, offset, implicit)
    local explicit = not implicit
    if self.definition then
      if self.explicit == false and self.definition == definition and self.type == _type then
        -- redefinition is ok here.
      else
        error(string.format("Redefinition of symbol '%s' at %s (previous definition was at %s).",
                            self.name, offset:tostring(),
                            (self.explicit_offset or self.implicit_offsets[1]):tostring()))
      end
    elseif self.expected_types and not self.expected_types:contains(_type) then
      error(string.format("Symbol '%s' defined as %s, but previously used as if it were %s.",
            self.name, self:type_str(_type), self:expected_types_str()))
    else
      self.definition = definition
      self.type = _type
      self.expected_types = Set:new({_type})
    end
    self.explicit = self.explicit or explicit
    if explicit then
      self.explicit_offset = offset
    else
      table.insert(self.implicit_offsets, offset)
    end
  end
 
  function GrammarObj:type_str(_type)
    _type = _type or self.type
    return "a " .. _type
  end
 
  function GrammarObj:expected_types_str(types)
    types = types or self.expected_types
    types = types:to_array()
    local str = "a " .. types[1]
    if #types > 1 then
      str = str .. " or " .. types[2]
    end
    return str
  end
-- class GrammarObj
 
--[[--------------------------------------------------------------------
 
Grammar: the Grammar class itself, which represents a single Gazelle
grammar that is parsed from one or more input files, analzed and
translated, and finally emitted to output.
 
--------------------------------------------------------------------]]--
 
define_class("Grammar")
function Grammar:initialize()
  -- The symbol table of objects that have been defined or referenced.
  self.objects = MemoizedObject:new(GrammarObj)
 
  -- The master IntFAs we build once all IntFAs are combined.
  self.master_intfas = OrderedSet:new()
 
  -- What rule the entire grammar starts on.
  self.start = nil
 
  -- @allow definitions.
  self.allow = {}
 
  -- Once we start processing, we assign all the rtns and terminals
  -- to separate lists for processing.
  self.rtns = nil
  self.terminals = nil
end
 
function Grammar:parse_source_string(string)
  local parser = RTNParser:new()
  parser:parse(CharStream:new(string), self)
end
 
--[[--------------------------------------------------------------------
 
The next block of functions are intended for the Gazelle grammar
parser to call as they are parsing a Gazelle grammar file.
 
--------------------------------------------------------------------]]--
 
function Grammar:get_object(name, expected)
  local obj = self.objects:get(name)
  obj:set_expected(expected)
  return obj
end
 
function Grammar:add_nonterm(name, rtn, slot_count, text, offset)
  rtn.name = name
  rtn.slot_count = slot_count
  rtn.text = text
  for state in each(rtn:states()) do
    state.rtn = rtn
  end
  self.objects:get(name):define(rtn, GrammarObj.RULE, offset)
end
 
function Grammar:add_terminal(name, string_or_intfa, text, offset)
  assert(type(name) == "string")
  assert(string_or_intfa)
  local obj = self.objects:get(name)
  obj:define(string_or_intfa, GrammarObj.TERMINAL, offset)
  return obj
end
 
function Grammar:add_implicit_terminal(name, string, offset)
  assert(type(name) == "string")
  local obj = self.objects:get(name)
  obj:define(string, GrammarObj.TERMINAL, offset, true)
  return obj
end
 
--[[--------------------------------------------------------------------
 
Methods for analyzing/translating the grammar once it has been
fully read from source files.
 
--------------------------------------------------------------------]]--
 
function Grammar:process()
  self:bind_symbols()
  -- start symbol defaults to the first rule.
  self.start = self.start or self.rtns:get_key_at_offset(1)
 
  -- assign priorities to RTN transitions
  --print_verbose("Assigning RTN transition priorities...")
  self:assign_priorities()
 
  -- make the RTNs in the grammar determistic and minimal
  --print_verbose("Convering RTN NFAs to DFAs...")
  self:determinize_rtns()
  self:process_allow()
  self:canonicalize_properties()
end
 
function Grammar:compute_lookahead(max_k)
  -- Generate GLAs by doing lookahead calculations.
  -- This annotates every nontrivial state in the grammar with a GLA.
  --print_verbose("Doing LL(*) lookahead calculations...")
  compute_lookahead(self, max_k)
end
 
-- we now have everything figured out at the RTN and GLA levels. Now we just
-- need to figure out how many IntFAs to generate, which terminals each one
-- should handle, and generate/determinize/minimize those IntFAs.
 
function Grammar:bind_symbols()
  -- Ensure that all referenced symbols were defined, and replace the GrammarObj
  -- objects with the raw values.
  local objects = {}
  self.rtns = OrderedMap:new()
  self.terminals = {}
  for name, obj in self.objects:get_objects():each() do
    local definition = obj.definition
    if not definition then
      error(string.format("Symbol '%s' was referenced but never defined.", name))
    end
    assert(name == obj.name)
    objects[name] = definition
    if obj.type == GrammarObj.RULE then
      self.rtns:add(name, definition)
    else
      self.terminals[name] = definition
    end
  end
 
  -- Replace all references to GrammarObj objects with the actual value.
  for name, rtn in each(self.rtns) do
    for rtn_state in each(rtn:states()) do
      local new_transitions = {}
      for edge_val, dest_state, properties in rtn_state:transitions() do
        -- This is embarrassingly hacky. Need to have a more polymorphic way
        -- of dealing with multiple kinds of edge values.
        if edge_val == fa.eof or edge_val == fa.e then
          -- do nothing
        elseif edge_val.type == GrammarObj.TERMINAL then
          edge_val = edge_val.name
        else
          edge_val = objects[edge_val.name]
        end
        table.insert(new_transitions, {edge_val, dest_state, properties})
      end
      rtn_state:clear_transitions()
      for tuple in each(new_transitions) do
        local edge_val, dest_state, properties = unpack(tuple)
        rtn_state:add_transition(edge_val, dest_state, properties)
      end
    end
  end
end
 
 
function Grammar:add_allow(what_to_allow, start_nonterm, end_nonterms)
  table.insert(self.allow, {what_to_allow, start_nonterm, end_nonterms})
end
 
function Grammar:process_allow()
  for allow in each(self.allow) do
    local what_to_allow, start_nonterm, end_nonterms = unpack(allow)
    local function children_func(rule_name)
      if not end_nonterms:contains(rule_name) then
        local rtn = self.rtns:get(rule_name)
        if not rtn then
          error(string.format("Error computing ignore: rule %s does not exist", rule_name))
        end
 
        -- get sub-rules
        local subrules = Set:new()
        for state in each(rtn:states()) do
          for edge_val, dest_state, properties in state:transitions() do
            if fa.is_nonterm(edge_val) and properties.slotnum ~= -1 then
              subrules:add(edge_val.name)
            end
          end
        end
 
        -- add self-transitions for every state
        for state in each(rtn:states()) do
          local allow_rtn = self.rtns:get(what_to_allow)
          state:add_transition(allow_rtn, state, {name=allow_rtn.name, slotnum=-1})
        end
 
        return subrules
      end
    end
 
    depth_first_traversal(start_nonterm, children_func)
  end
end
 
function Grammar:canonicalize_properties()
  for name, rtn in each(self.rtns) do
    for state in each(rtn:states()) do
      state:canonicalize_properties()
    end
  end
end
 
function Grammar:get_rtn_states_needing_intfa()
  local states = Set:new()
  for name, rtn in each(self.rtns) do
    for state in each(rtn:states()) do
      if state:needs_intfa() then
        states:add(state)
      end
    end
  end
  return states
end
 
function Grammar:get_rtn_states_needing_gla()
  local states = Set:new()
  for name, rtn in each(self.rtns) do
    for state in each(rtn:states()) do
      if state:needs_gla() then
        states:add(state)
      end
    end
  end
  return states
end
 
function Grammar:assign_priorities()
  -- For each non-epsilon transition in the grammar, we want to find the epsilon
  -- closure (within the rule -- no following nonterminal or final transitions)
  -- of all reverse transitions and assign any priorities the epsilon
  -- transitions have to the non-epsilon transition.
 
  -- Begin by building a list of reverse epsilon transitions. For each state
  -- that has at least one epsilon transition going into it, we build a 2-tuple
  -- of {set of states that are the source of an epsilon transitions,
  -- map of priority_class -> priority in that class}
  for name, rtn in each(self.rtns) do
    local reverse_epsilon_transitions = {}
    for state in each(rtn:states()) do
      for edge_val, dest_state, properties in state:transitions() do
        if edge_val == fa.e then
          reverse_epsilon_transitions[dest_state] = reverse_epsilon_transitions[dest_state] or {Set:new(), {}}
          reverse_epsilon_transitions[dest_state][1]:add(state)
          if properties and properties.priority_class then
            if reverse_epsilon_transitions[dest_state][2][properties.priority_class] then
              error("Unexpected.")
            end
            reverse_epsilon_transitions[dest_state][2][properties.priority_class] = properties.priority
          end
        end
      end
    end
 
    local priorities = {}
    local function children(state, stack)
      local reverse_transitions = reverse_epsilon_transitions[state]
      if reverse_transitions then
        local child_states, priority_classes = unpack(reverse_transitions)
        for priority_class, priority in pairs(priority_classes) do
          if priorities[priority_class] then
            error("Unexpected please report the grammar that triggered this error!")
          end
          priorities[priority_class] = priority
        end
        return child_states
      end
    end
    for state in each(rtn:states()) do
      priorities = {}
      depth_first_traversal(state, children)
      priorities = get_unique_table_for_table(priorities)
      for edge_val, dest_state, properties in state:transitions() do
        if edge_val ~= fa.e then
          -- non-epsilon transitions should always have properties assigned,
          -- because they always have slotnums and slotnames.
          properties.priorities = priorities
        end
      end
      if state.final then
        state.final = {priorities = priorities}
      end
    end
  end
end
 
function Grammar:determinize_rtns()
  local new_rtns = OrderedMap:new()
  for name, rtn in each(self.rtns) do
    local new_rtn = nfa_to_dfa(rtn)
    copy_attributes(rtn, new_rtn)
    new_rtns:add(name, new_rtn)
  end
  self.rtns = new_rtns
end
 
function Grammar:minimize_rtns()
  local new_rtns = OrderedMap:new()
  for name, rtn in each(self.rtns) do
    local new_rtn = hopcroft_minimize(rtn)
    copy_attributes(rtn, new_rtn)
    new_rtns:add(name, new_rtn)
  end
  self.rtns = new_rtns
end
 
function Grammar:generate_intfas()
  --print_verbose("Generating lexer DFAs...")
  -- first generate the set of states that need an IntFA: some RTN
  -- states and all nonfinal GLA states.
  local states = self:get_rtn_states_needing_intfa()
  for rtn_state in each(self:get_rtn_states_needing_gla()) do
    for gla_state in each(rtn_state.gla:states()) do
      if not gla_state.final then
        states:add(gla_state)
      end
    end
  end
 
  -- All states in the "states" set are nonfinal and have only
  -- terminals as transitions. Create a list of:
  -- {state, set of outgoing terms}
  -- pairs for the states.
  local state_term_pairs = {}
  for state in each(states) do
    local terms = Set:new()
    for edge_val in state:transitions() do
      if fa.is_nonterm(edge_val) or terms:contains(edge_val) then
        error(string.format("Internal error with state %s, edge %s", serialize(state, 6, " "), serialize(edge_val)))
      end
      if edge_val ~= fa.eof then -- EOF doesn't need to be lexed in the IntFAs
        terms:add(edge_val)
      end
    end
    assert(terms:count() > 0)
    table.insert(state_term_pairs, {state, terms})
  end
 
  self.master_intfas = intfa_combine(self.terminals, state_term_pairs)
end
 
--[[--------------------------------------------------------------------
 
The remaining methods are for linearizing all the graph-like data
structures into an ordered form, in preparation for emitting them to
an output format like bytecode.
 
--------------------------------------------------------------------]]--
 
 
--[[--------------------------------------------------------------------
 
Grammar:get_strings(): Returns an ordered set that contains all strings
needed by any part of the grammar as it currently stands.
 
--------------------------------------------------------------------]]--
 
function Grammar:get_strings()
  local strings = Set:new()
 
  -- add the names of all the terminals
  for term, _ in pairs(self.terminals) do
    strings:add(term)
  end
 
  -- add the names of the rtns, and of named edges with the rtns.
  for name, rtn in each(grammar.rtns) do
    strings:add(name)
 
    for rtn_state in each(rtn:states()) do
      for edge_val, target_state, properties in rtn_state:transitions() do
        if properties then
          strings:add(properties.name)
        end
      end
    end
  end
 
  -- sort the strings for deterministic output
  strings = strings:to_array()
  table.sort(strings)
  local strings_ordered_set = OrderedSet:new()
  for string in each(strings) do
    strings_ordered_set:add(string)
  end
 
  return strings_ordered_set
end
 
 
--[[--------------------------------------------------------------------
 
Grammar:get_flattened_rtn_list(): Creates and returns a list of all
the RTNs, states, and transitions in a particular and stable order,
ready for emitting to the outside world. The RTNs themselves are
returned in the order they were defined, except that the start RTN
is always emitted first.
 
Returns:
OrderedMap: {rtn_name -> {
states=OrderedSet {RTNState},
transitions={RTNState -> {list of {edge_val, target_state, properties}}
slot_count=slot_count
}
 
--------------------------------------------------------------------]]--
 
function Grammar:get_flattened_rtn_list()
  local rtns = OrderedMap:new()
 
  -- create each RTN with a list of states
  for name, rtn in self.rtns:each() do
 
    -- order states such that the start state is emitted first.
    -- TODO (maybe): make this completely stable.
    local states = rtn:states()
    states:remove(rtn.start)
    states = states:to_array()
    table.insert(states, 1, rtn.start)
    states = OrderedSet:new(states)
 
    -- ensure that start RTN is emitted first
    if name == self.start then
      rtns:insert_front(name, {states=states, transitions={}, slot_count=rtn.slot_count})
    else
      rtns:add(name, {states=states, transitions={}, slot_count=rtn.slot_count})
    end
  end
 
  -- create a list of transitions for every state
  for name, rtn in each(rtns) do
    for state in each(rtn.states) do
      local transitions = {}
      for edge_val, target_state, properties in state:transitions() do
        table.insert(transitions, {edge_val, target_state, properties})
      end
 
      -- sort RTN transitions thusly:
      -- 1. terminal transitions come before nonterminal transitions
      -- 2. terminal transitions are sorted by the low integer of the range
      -- 3. nonterminal transitions are sorted by the name of the nonterminal
      -- 4. transitions with the same edge value are sorted by their order
      -- in the list of states (which is stable) (TODO: no it's not, yet)
      local function sort_func(a, b)
        if not fa.is_nonterm(a[1]) and fa.is_nonterm(b[1]) then
          return true
        elseif fa.is_nonterm(a[1]) and not fa.is_nonterm(b[1]) then
          return false
        elseif fa.is_nonterm(a[1]) then
          if a[1] == b[1] then
            return rtn.states:offset_of(a[2]) < rtn.states:offset_of(b[2])
          else
            return a[1].name < b[1].name
          end
        else
          if a[1].low == b[1].low then
            return rtn.states:offset_of(a[2]) < rtn.states:offset_of(b[2])
          else
            return a[1].low < b[1].low
          end
        end
      end
      table.sort(transitions, sort_func)
 
      rtn.transitions[state] = transitions
    end
  end
 
  return rtns
end
 
function Grammar:get_flattened_gla_list()
  local glas = OrderedSet:new()
 
  for name, rtn in each(self.rtns) do
    for state in each(rtn:states()) do
      if state.gla then
        glas:add(state.gla)
      end
    end
  end
 
  return glas
end
 
function copy_attributes(rtn, new_rtn)
  new_rtn.name = rtn.name
  new_rtn.slot_count = rtn.slot_count
  new_rtn.text = rtn.text
  for state in each(new_rtn:states()) do
    state.rtn = new_rtn
  end
end
 
-- vim:et:sts=2:sw=2