Skip to content

Commit

Permalink
Making exact matches explicit
Browse files Browse the repository at this point in the history
  • Loading branch information
Dynom committed Jun 20, 2018
1 parent 67a5de0 commit ae65ea8
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 35 deletions.
26 changes: 12 additions & 14 deletions algorithm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"testing"

"fmt"
"math"
"strings"

"github.com/Dynom/TySug/finder"
Expand All @@ -14,15 +13,14 @@ import (

const (
defaultTestAlgorithm = `JaroWinkler .7/4`
floatTolerance = 0.000001
)

// Several algorithms to test with.
var algorithms = map[string]finder.Algorithm{
"Ukkonen 1/1/1": func(a, b string) float64 {
return -1 * float64(smetrics.Ukkonen(a, b, 1, 1, 1))
},
"JaroWinkler .7/4": func(a, b string) float64 {
defaultTestAlgorithm: func(a, b string) float64 {
return smetrics.JaroWinkler(a, b, .7, 4)
},
"WagnerFischer 1/1/1": func(a, b string) float64 {
Expand Down Expand Up @@ -82,7 +80,7 @@ func TestAlgorithms(t *testing.T) {
for expectedDomain, emailsToTest := range testData {
for _, domain := range emailsToTest {

bestMatch, score := sug.Find(domain)
bestMatch, score, _ := sug.Find(domain)
if bestMatch != expectedDomain {
t.Logf("Related score: %f", score)
t.Logf("Expected '%s' to result in '%s'. Instead I got: '%s'.", domain, expectedDomain, bestMatch)
Expand All @@ -96,7 +94,7 @@ func TestAlgorithms(t *testing.T) {
func TestNew(t *testing.T) {
expect := "example"
sug, _ := finder.New([]string{expect, "ample"}, finder.WithAlgorithm(algorithms[defaultTestAlgorithm]))
alt, _ := sug.Find("exampel")
alt, _, _ := sug.Find("exampel")

if alt != expect {
t.Errorf("Expected '%s' to be '%s'.", alt, expect)
Expand All @@ -114,14 +112,14 @@ func TestTestExactMatch(t *testing.T) {

for _, td := range cases {
sug, _ := finder.New([]string{"foo", "example", "CaseSensitive", "cASEsENSITIVE"}, finder.WithAlgorithm(algorithms[defaultTestAlgorithm]))
match, score := sug.Find(td.Input)
match, _, exact := sug.Find(td.Input)

if match != td.Expect {
t.Errorf("Expected the input '%s' to result in '%s', however the best match is '%s'", td.Input, td.Expect, match)
}

if math.Abs(1-score) > floatTolerance {
t.Errorf("Expected a score of ~1.0, instead it is: %f", score)
if !exact {
t.Errorf("Expected an exact match, instead I got %t", exact)
}
}
}
Expand All @@ -137,7 +135,7 @@ func TestApproximateMatch(t *testing.T) {

for _, td := range cases {
sug, _ := finder.New([]string{td.Reference}, finder.WithAlgorithm(algorithms[defaultTestAlgorithm]))
match, _ := sug.Find(td.Input)
match, _, _ := sug.Find(td.Input)

if match != td.Reference {
t.Errorf("Expected the input '%s' to result in '%s', however the best match '%s'", td.Input, td.Reference, match)
Expand All @@ -150,19 +148,19 @@ func BenchmarkBasicUsage(b *testing.B) {

b.Run("Direct match", func(b *testing.B) {
for i := 0; i < b.N; i++ {
_, _ = sug.Find("foo")
_, _, _ = sug.Find("foo")
}
})

b.Run("Non direct match, low score", func(b *testing.B) {
for i := 0; i < b.N; i++ {
_, _ = sug.Find("juice")
_, _, _ = sug.Find("juice")
}
})

b.Run("Non direct match, high score", func(b *testing.B) {
for i := 0; i < b.N; i++ {
_, _ = sug.Find("butterfyl")
_, _, _ = sug.Find("butterfyl")
}
})
}
Expand Down Expand Up @@ -190,9 +188,9 @@ func SuggestAlternative(email string, domains []string) (string, float64) {
hostname := email[i+1:]

sug, _ := finder.New(domains, finder.WithAlgorithm(algorithms[defaultTestAlgorithm]))
alternative, score := sug.Find(strings.ToLower(hostname))
alternative, score, exact := sug.Find(strings.ToLower(hostname))

if score > 0.9 {
if exact || score > 0.9 {
combined := localPart + "@" + alternative
return combined, score
}
Expand Down
42 changes: 28 additions & 14 deletions finder/find.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ var (
ErrNoAlgorithmDefined = errors.New("no algorithm defined")
)

// WorstScoreValue is the score a search starts from before any reference has
// been compared. It is the most negative finite float32 value (rather than
// negative infinity) so that it stays representable when serialised to JSON.
const WorstScoreValue = -math.MaxFloat32

// New creates a new instance of Finder. The order of the list is significant
func New(list []string, options ...Option) (*Finder, error) {
i := &Finder{
Expand All @@ -44,41 +46,53 @@ func New(list []string, options ...Option) (*Finder, error) {
return i, nil
}

// Find returns the best alternative and a score. A score of 1 means a perfect match
func (t Finder) Find(input string) (string, float64) {
// Find returns the best alternative, its score, and whether it was an exact match.
func (t Finder) Find(input string) (string, float64, bool) {
return t.FindCtx(context.Background(), input)
}

// FindCtx is the same as Find, with context support
func (t Finder) FindCtx(ctx context.Context, input string) (string, float64) {
func (t Finder) FindCtx(ctx context.Context, input string) (string, float64, bool) {
// Initial value, compatible with JSON serialisation. It's not ideal to mix presentation with business logic,
// but in this instance it was convenient and just as effective as math.Inf(-1).
var hs = WorstScoreValue

// Exact matches
if _, exists := t.referenceMap[input]; exists {
return input, 1
return input, hs, true
}

inputLen := len(input)

var hs = math.Inf(-1)
var best = input
for _, ref := range t.reference {
select {
case <-ctx.Done():
return input, hs
return input, hs, false
default:
}

// Test if the input length is much fewer, making it an unlikely typo. The result is 20% of the length or at least
// 1 (due to math.Ceil)
if t.LengthTolerance != 0 && inputLen < len(ref)-int(math.Ceil(float64(inputLen)*t.LengthTolerance)) {
// Test if the input is much shorter than the reference, making it an unlikely typo.
if !meetsLengthTolerance(t.LengthTolerance, input, ref) {
continue
}

if d := t.Alg(input, ref); d > hs {
hs = d
if score := t.Alg(input, ref); score > hs {
hs = score
best = ref
}
}

return best, hs
return best, hs, false
}

// meetsLengthTolerance reports whether input is long enough, relative to
// reference, to still be considered a plausible typo of it.
//
// t is the tolerance as a fraction of the input's length; a value of 0 turns
// the check off and every input is accepted. Lengths are measured in bytes.
// Only inputs that are shorter than the reference can fail the check.
func meetsLengthTolerance(t float64, input, reference string) bool {
	if t != 0 {
		// Allowed shortfall: t times the input length, rounded up by math.Ceil
		// so that a non-empty input with a positive tolerance may always be at
		// least one byte short.
		slack := int(math.Ceil(float64(len(input)) * t))

		return len(reference)-len(input) <= slack
	}

	return true
}
13 changes: 7 additions & 6 deletions finder/find_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,16 @@ func TestNewWithCustomAlgorithm(t *testing.T) {
sug, _ := New([]string{"b"}, WithAlgorithm(exampleAlgorithm))

var score float64
var exact bool

_, score = sug.Find("a")
if score != 0 {
t.Errorf("Expected the score to be 0, instead I got %f.", score)
_, score, exact = sug.Find("a")
if exact {
t.Errorf("Expected exact to be false, instead I got %t (the score is %f).", exact, score)
}

_, score = sug.Find("b")
if score != 1 {
t.Errorf("Expected the score to be 1, instead I got %f.", score)
_, score, exact = sug.Find("b")
if !exact {
t.Errorf("Expected exact to be true, instead I got %t (the score is %f).", exact, score)
}
}

Expand Down
3 changes: 3 additions & 0 deletions finder/option.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ func WithAlgorithm(alg Algorithm) Option {
}
}

// WithLengthTolerance defines a percentage of length above which we no longer consider a length difference a typo.
// A value of 0.2 specifies a tolerance of at most ~20% difference in size, with a minimum of 1 character.
// A value of 0 (the default) disables this feature.
func WithLengthTolerance(t float64) Option {
return func(s *Finder) {
s.LengthTolerance = t
Expand Down
2 changes: 1 addition & 1 deletion server/service/domain.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ type Service struct {

// Find returns the nearest reference
func (s Service) Find(ctx context.Context, input string) (string, float64) {
suggestion, score := s.finder.FindCtx(ctx, input)
suggestion, score, _ := s.finder.FindCtx(ctx, input)
return suggestion, score
}

Expand Down

0 comments on commit ae65ea8

Please sign in to comment.