haberman / gazelle

A system for creating fast, reusable parsers

This URL has Read+Write access

haberman (author)
Sat Feb 21 18:43:54 -0800 2009
commit  e3d2566a0f805cc67b44f1f065d57eb99b9ed141
tree    3886bc67dd20482d058aa425eebdb90772ee9b0e
parent  338a003e541f73d9c22741174362447f56654d02
gazelle / compiler / intfa_combine.lua
100644 141 lines (119 sloc) 4.465 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
--[[--------------------------------------------------------------------
 
Gazelle: a system for building fast, reusable parsers
 
intfa_combine.lua
 
Once the lookahead has been calculated, we know what terminal(s)
each RTN/GLA state is expecting to see. We use this information to
build IntFAs that recognize any possible valid token that could
occur at this point in the input. We combine and reuse DFAs as much
as possible -- only when two terminals conflict is it necessary to
use different DFAs.
 
Copyright (c) 2007 Joshua Haberman. See LICENSE for details.
 
--------------------------------------------------------------------]]--
 
-- Determine what terminals (if any) conflict with each other.
-- In this context, "conflict" means that a string of characters can
-- be interpreted as one or more terminals.
function analyze_conflicts(terminals)
  -- We detect conflicts by combining all the NFAs into a single DFA.
  -- We then observe what states are final to more than one terminal.
  local conflicts = {}
  local nfas = {}
  for name, terminal in pairs(terminals) do
    if type(terminal) == "string" then
      terminal = fa.intfa_for_string(terminal)
    end
    table.insert(nfas, {terminal, name})
  end
 
  local uber_dfa = nfas_to_dfa(nfas, true)
  for state in each(uber_dfa:states()) do
    if type(state.final) == "table" then -- more than one terminal ended in this state
      for term1 in each(state.final) do
        for term2 in each(state.final) do
          if term1 ~= term2 then
            conflicts[term1] = conflicts[term1] or Set:new()
            conflicts[term1]:add(term2)
          end
        end
      end
    end
  end
 
  return conflicts
end
 
function has_conflicts(conflicts, term_set1, term_set2)
  for term1 in each(term_set1) do
    if conflicts[term1] then
      for conflict in each(conflicts[term1]) do
        if term_set2:contains(conflict) then
          return true, term1, conflict
        end
      end
    end
  end
end
 
function create_or_reuse_termset_for(terminals, conflicts, termsets, nonterm)
  if has_conflicts(conflicts, terminals, terminals) then
    local has_conflict, c1, c2 = has_conflicts(conflicts, terminals, terminals)
    error(string.format("Can't build DFA inside %s, because terminals %s and %s conflict",
                        nonterm, c1, c2))
  end
 
  local found_termset = false
  for i, termset in ipairs(termsets) do
    -- will this termset do? it will if none of our terminals conflict with any of the
    -- existing terminals in this set.
    -- (we can probably compute this faster by pre-computing equivalence classes)
    if not has_conflicts(conflicts, termset, terminals) then
      found_termset = i
      break
    end
  end
 
  if found_termset == false then
    local new_termset = Set:new()
    table.insert(termsets, new_termset)
    found_termset = #termsets
  end
 
  -- add all the terminals for this phase of lookahead to the termset we found
  local termset = termsets[found_termset]
  for term in each(terminals) do
    termset:add(term)
  end
  assert(termset:count() > 0)
 
  return found_termset
end
 
function intfa_combine(all_terminals, state_term_pairs)
  local conflicts = analyze_conflicts(all_terminals)
 
  -- For each state in the grammar, create (or reuse) a DFA to run
  -- when we hit that state.
  local termsets = {}
  local intfa_nums = {}
  for state_term_pair in each(state_term_pairs) do
    local state, terms = unpack(state_term_pair)
    assert(terms:count() > 0)
    local nonterm
    if state.rtn == nil then
      nonterm = state.gla.rtn_state.rtn.name
    else
      nonterm = state.rtn.name
    end
    intfa_nums[state] = create_or_reuse_termset_for(terms, conflicts, termsets, nonterm)
  end
 
  local dfas = OrderedSet:new()
  for termset in each(termsets) do
    local nfas = {}
    for term in each(termset) do
      local target = all_terminals[term]
      assert(target, string.format("No terminal for terminal '%s'", tostring(serialize(term))))
      if type(target) == "string" then
        target = fa.intfa_for_string(target)
      end
      table.insert(nfas, {target, term})
    end
    local dfa = hopcroft_minimize(nfas_to_dfa(nfas))
    dfa.termset = termset
    dfas:add(dfa)
  end
 
  for state, intfa_num in pairs(intfa_nums) do
    state.intfa = dfas:element_at(intfa_num)
  end
 
  dfas:sort(function (a, b) return b.termset:count() < a.termset:count() end)
 
  return dfas
end
 
-- vim:et:sts=2:sw=2