Permalink
Browse files

The Trie type now stores values with each leaf node, of type interface{}; ValueTrie no longer exists.

The file hyphen_trie.go contains a hyphenation-table-specific function for reading string/value from a TeX hyphenation table entry.
  • Loading branch information...
Jim Dovey
Jim Dovey committed Jul 20, 2010
1 parent 9cada5e commit 6d6d043c12434a0549da789e55a603706fc8ea8f
Showing with 319 additions and 462 deletions.
  1. +1 −2 Makefile
  2. +60 −21 defs.go → hyphen_trie.go
  3. +99 −11 trie.go
  4. +159 −85 trie_test.go
  5. +0 −343 value_trie.go
View
@@ -3,8 +3,7 @@ include $(GOROOT)/src/Make.$(GOARCH)
TARG=trie
GOFILES=\
- defs.go\
trie.go\
- value_trie.go\
+ hyphen_trie.go\
include $(GOROOT)/src/Make.pkg
View
@@ -1,5 +1,5 @@
/*
- * defs.go
+ * hyphen_trie.go
* Trie
*
* Created by Jim Dovey on 16/07/2010.
@@ -36,27 +36,66 @@
*
*/
-/*
- The trie package implements a basic character trie type. Instead of using bytes however, it uses
- integer-sized runes as traversal keys. In Go, this means that each node refers to exactly one Unicode
- character, so the implementation doesn't depend on the particular semantics of UTF-8 byte streams.
-
- There is an additional specialization, which stores an integer value along with the Unicode character
- on each node. This is to implement TeX-style hyphenation pattern storage.
-*/
package trie
-// The basic form of a Trie uses runes rather than characters, therefore it works on integer types.
-type Trie struct {
- leaf bool // whether the node is a leaf (the end of an input string).
- children map[int]*Trie // a map of sub-tries for each child rune value.
-}
+import (
+ "unicode"
+ "utf8"
+ "container/vector"
+ "strings"
+)
+
+
+// AddPatternString adds a TeX-style hyphenation pattern such as '.hy2p' to the trie. The digits are
+// stripped to form the key ('.hyp'), and the node at which that key ends stores a *vector.IntVector
+// holding one hyphenation value per non-digit character: the digit immediately following that
+// character, or an implied zero. A digit at the very start of the pattern is pushed first, as a
+// prefix value.
+func (p *Trie) AddPatternString(s string) {
+	v := new(vector.IntVector)
+
+	// a digit's numeric value is its distance from the character '0'
+	const rune0 = '0'
+
+	strLen := len(s)
+
+	// Using the range keyword will give us each Unicode rune; pos is the byte offset where it starts.
+	for pos, rune := range s {
+		if unicode.IsDigit(rune) {
+			if pos == 0 {
+				// This is a prefix number
+				v.Push(rune - rune0)
+			}
+
+			// any other number refers to the previous character, and has
+			// already been handled by that character's look-ahead
+			continue
+		}
+
+		// Look ahead to the byte that follows this rune's complete encoding. Indexing s[pos+1]
+		// would land inside a multi-byte character and miss the digit that follows it, so the
+		// width actually consumed at pos is used instead.
+		_, width := utf8.DecodeRuneInString(s[pos:])
+		nextPos := pos + width
+		if nextPos < strLen && unicode.IsDigit(int(s[nextPos])) {
+			// next char is the hyphenation value for this char
+			v.Push(int(s[nextPos]) - rune0)
+		} else {
+			// last character, or no digit follows: implied zero
+			v.Push(0)
+		}
+	}
+
+	// the key stored in the trie is the pattern with every digit removed
+	pure := strings.Map(func(rune int) int {
+		if unicode.IsDigit(rune) {
+			return -1
+		}
+		return rune
+	},
+		s)
+	leaf := p.addRunes(strings.NewReader(pure))
+	if leaf == nil {
+		return
+	}
-// The second form stores a rune:integer pair. This is used in the implementation of TeX hyphenation
-// pattern tries.
-type ValueTrie struct {
- value int // the value for the letter which indexed this node.
- prefixValue int // some hyphenation strings *begin* with a numeric value. Le sigh.
- leaf bool // whether the node is a leaf (where an input string ended).
- children map[int]*ValueTrie // a map of sub-tries for each child rune value.
+	leaf.value = v
}
View
110 trie.go
@@ -53,20 +53,29 @@ import (
"sort"
)
+// A Trie is one node of a prefix tree. It uses runes rather than bytes for indexing, therefore its child key values are integers.
+type Trie struct {
+ leaf bool // true when an added string ends at this node; such a node may still have children.
+ value interface{} // the value associated with the string ending at this node; nil when none has been assigned.
+ children map[int]*Trie // sub-tries keyed by the rune that leads to them.
+}
+
// NewTrie creates and returns an empty Trie node: not a leaf, no value, and an allocated (empty) child map.
func NewTrie() *Trie {
t := new(Trie)
t.leaf = false // explicit for clarity; new(Trie) already zeroes the field
+ t.value = nil
t.children = make(map[int]*Trie) // the map must be allocated before children can be inserted
return t
}
-// Internal function: adds items to the trie, reading runes from a strings.Reader
-func (p *Trie) addRunes(r *strings.Reader) {
+// Internal function: adds items to the trie, reading runes from a strings.Reader. It returns
+// the leaf node at which the addition ends.
+func (p *Trie) addRunes(r *strings.Reader) *Trie {
rune, _, err := r.ReadRune()
if err != nil {
p.leaf = true
- return
+ return p
}
n := p.children[rune]
@@ -76,23 +85,37 @@ func (p *Trie) addRunes(r *strings.Reader) {
}
// recurse to store sub-runes below the new node
- n.addRunes(r)
+ return n.addRunes(r)
}
// AddString adds the string s to the trie without an associated value. Empty strings are ignored. If the string is already present, no additional storage happens.
-func (p *Trie) Add(s string) {
+func (p *Trie) AddString(s string) {
if len(s) == 0 {
return
}
- // append the runes to the trie
+ // append the runes to the trie -- the node returned by addRunes is discarded because no value is stored in this invocation
p.addRunes(strings.NewReader(s))
}
-// Internal string removal function. Returns trie if this node is empty following the removal.
+// AddValue adds the string s to the trie and stores v on the node at which s ends. If the string is
+// already present, only the value is updated. Empty strings are ignored.
+func (p *Trie) AddValue(s string, v interface{}) {
+ if len(s) == 0 {
+ return
+ }
+
+ // append the runes to the trie; addRunes returns the node at which the string ends
+ leaf := p.addRunes(strings.NewReader(s))
+ leaf.value = v
+}
+
+// Internal string removal function. Returns true if this node is empty following the removal.
func (p *Trie) removeRunes(r *strings.Reader) bool {
rune, _, err := r.ReadRune()
if err != nil {
+ // remove value, remove leaf flag
+ p.value = nil
p.leaf = false
return len(p.children) == 0
}
@@ -117,15 +140,18 @@ func (p *Trie) Remove(s string) bool {
}
// Internal string inclusion function.
-func (p *Trie) includes(r *strings.Reader) bool {
+func (p *Trie) includes(r *strings.Reader) *Trie {
rune, _, err := r.ReadRune()
if err != nil {
- return p.leaf // no more runes + leaf node == the string was present
+ if p.leaf {
+ return p
+ }
+ return nil
}
child, ok := p.children[rune]
if !ok {
- return false // no node for this rune was in the trie
+ return nil // no node for this rune was in the trie
}
// recurse down to the next node with the remainder of the string
@@ -137,7 +163,21 @@ func (p *Trie) Contains(s string) bool {
if len(s) == 0 {
return false // empty strings can't be included (how could we add them?)
}
- return p.includes(strings.NewReader(s))
+ return p.includes(strings.NewReader(s)) != nil
+}
+
+// GetValue returns the value associated with the string s. The second result is false if s is empty or
+// was not present, and true if it was present; in that case the returned value may still legitimately be nil.
+func (p *Trie) GetValue(s string) (interface{}, bool) {
+ if len(s) == 0 {
+ return nil, false
+ }
+
+ leaf := p.includes(strings.NewReader(s)) // nil unless s ends on a node flagged as a leaf
+ if leaf == nil {
+ return nil, false
+ }
+ return leaf.value, true
}
// Internal output-building function used by Members()
@@ -175,3 +215,51 @@ func (p *Trie) Size() (sz int) {
return
}
+
+// AllSubstrings returns every anchored substring of s (that is, every prefix of s) which is stored
+// in the Trie, shortest first. Matching stops at the first rune of s with no corresponding child node.
+func (p *Trie) AllSubstrings(s string) *vector.StringVector {
+	v := new(vector.StringVector)
+
+	for pos, rune := range s {
+		child, ok := p.children[rune]
+		if !ok {
+			// return whatever we have so far
+			break
+		}
+
+		// if this is a leaf node, add the string so far to the output vector. pos is the byte
+		// offset at which the current rune starts, so the slice has to extend past the bytes
+		// consumed for that rune; slicing to pos alone would drop the matching character.
+		if child.leaf {
+			_, width := utf8.DecodeRuneInString(s[pos:])
+			v.Push(s[0 : pos+width])
+		}
+
+		p = child
+	}
+
+	return v
+}
+
+// AllSubstringsAndValues returns every anchored substring of s (that is, every prefix of s) which is
+// stored in the Trie, together with a second vector holding the value stored for each of them. The two
+// vectors are filled in step, so entry i of one belongs to entry i of the other. Matching stops at the
+// first rune of s with no corresponding child node.
+func (p *Trie) AllSubstringsAndValues(s string) (*vector.StringVector, *vector.Vector) {
+	sv := new(vector.StringVector)
+	vv := new(vector.Vector)
+
+	for pos, rune := range s {
+		child, ok := p.children[rune]
+		if !ok {
+			// return whatever we have so far
+			break
+		}
+
+		// if this is a leaf node, add the string so far and its value. The end of the slice is
+		// computed from the number of bytes actually consumed at pos: for an invalid byte the
+		// range loop yields U+FFFD but advances one byte only, so the canonical length of the
+		// rune could reach past the end of s.
+		if child.leaf {
+			_, width := utf8.DecodeRuneInString(s[pos:])
+			sv.Push(s[0 : pos+width])
+			vv.Push(child.value)
+		}
+
+		p = child
+	}
+
+	return sv, vv
+}
+
Oops, something went wrong.

0 comments on commit 6d6d043

Please sign in to comment.