Making exact matches explicit

Dynom · Jun 20, 2018 · ae65ea8 · ae65ea8
1 parent 67a5de0
commit ae65ea8
Show file tree

Hide file tree

Showing 5 changed files with 51 additions and 35 deletions.
diff --git a/algorithm_test.go b/algorithm_test.go
@@ -4,7 +4,6 @@ import (
 	"testing"
 
 	"fmt"
-	"math"
 	"strings"
 
 	"github.com/Dynom/TySug/finder"
@@ -14,15 +13,14 @@ import (
 
 const (
 	defaultTestAlgorithm = `JaroWinkler .7/4`
-	floatTolerance       = 0.000001
 )
 
 // Several algorithms to test with.
 var algorithms = map[string]finder.Algorithm{
 	"Ukkonen 1/1/1": func(a, b string) float64 {
 		return -1 * float64(smetrics.Ukkonen(a, b, 1, 1, 1))
 	},
-	"JaroWinkler .7/4": func(a, b string) float64 {
+	defaultTestAlgorithm: func(a, b string) float64 {
 		return smetrics.JaroWinkler(a, b, .7, 4)
 	},
 	"WagnerFischer 1/1/1": func(a, b string) float64 {
@@ -82,7 +80,7 @@ func TestAlgorithms(t *testing.T) {
 			for expectedDomain, emailsToTest := range testData {
 				for _, domain := range emailsToTest {
 
-					bestMatch, score := sug.Find(domain)
+					bestMatch, score, _ := sug.Find(domain)
 					if bestMatch != expectedDomain {
 						t.Logf("Related score: %f", score)
 						t.Logf("Expected '%s' to result in '%s'. Instead I got: '%s'.", domain, expectedDomain, bestMatch)
@@ -96,7 +94,7 @@ func TestAlgorithms(t *testing.T) {
 func TestNew(t *testing.T) {
 	expect := "example"
 	sug, _ := finder.New([]string{expect, "ample"}, finder.WithAlgorithm(algorithms[defaultTestAlgorithm]))
-	alt, _ := sug.Find("exampel")
+	alt, _, _ := sug.Find("exampel")
 
 	if alt != expect {
 		t.Errorf("Expected '%s' to be '%s'.", alt, expect)
@@ -114,14 +112,14 @@ func TestTestExactMatch(t *testing.T) {
 
 	for _, td := range cases {
 		sug, _ := finder.New([]string{"foo", "example", "CaseSensitive", "cASEsENSITIVE"}, finder.WithAlgorithm(algorithms[defaultTestAlgorithm]))
-		match, score := sug.Find(td.Input)
+		match, _, exact := sug.Find(td.Input)
 
 		if match != td.Expect {
 			t.Errorf("Expected the input '%s' to result in '%s', however the best match is '%s'", td.Input, td.Expect, match)
 		}
 
-		if math.Abs(1-score) > floatTolerance {
-			t.Errorf("Expected a score of ~1.0, instead it is: %f", score)
+		if !exact {
+			t.Errorf("Expected an exact match, instead I got %t", exact)
 		}
 	}
 }
@@ -137,7 +135,7 @@ func TestApproximateMatch(t *testing.T) {
 
 	for _, td := range cases {
 		sug, _ := finder.New([]string{td.Reference}, finder.WithAlgorithm(algorithms[defaultTestAlgorithm]))
-		match, _ := sug.Find(td.Input)
+		match, _, _ := sug.Find(td.Input)
 
 		if match != td.Reference {
 			t.Errorf("Expected the input '%s' to result in '%s', however the best match '%s'", td.Input, td.Reference, match)
@@ -150,19 +148,19 @@ func BenchmarkBasicUsage(b *testing.B) {
 
 	b.Run("Direct match", func(b *testing.B) {
 		for i := 0; i < b.N; i++ {
-			_, _ = sug.Find("foo")
+			_, _, _ = sug.Find("foo")
 		}
 	})
 
 	b.Run("Non direct match, low score", func(b *testing.B) {
 		for i := 0; i < b.N; i++ {
-			_, _ = sug.Find("juice")
+			_, _, _ = sug.Find("juice")
 		}
 	})
 
 	b.Run("Non direct match, high score", func(b *testing.B) {
 		for i := 0; i < b.N; i++ {
-			_, _ = sug.Find("butterfyl")
+			_, _, _ = sug.Find("butterfyl")
 		}
 	})
 }
@@ -190,9 +188,9 @@ func SuggestAlternative(email string, domains []string) (string, float64) {
 	hostname := email[i+1:]
 
 	sug, _ := finder.New(domains, finder.WithAlgorithm(algorithms[defaultTestAlgorithm]))
-	alternative, score := sug.Find(strings.ToLower(hostname))
+	alternative, score, exact := sug.Find(strings.ToLower(hostname))
 
-	if score > 0.9 {
+	if exact || score > 0.9 {
 		combined := localPart + "@" + alternative
 		return combined, score
 	}

diff --git a/finder/find.go b/finder/find.go
@@ -22,6 +22,8 @@ var (
 	ErrNoAlgorithmDefined = errors.New("no algorithm defined")
 )
 
+const WorstScoreValue = -1 * math.MaxFloat32
+
 // New creates a new instance of Finder. The order of the list is significant
 func New(list []string, options ...Option) (*Finder, error) {
 	i := &Finder{
@@ -44,41 +46,53 @@ func New(list []string, options ...Option) (*Finder, error) {
 	return i, nil
 }
 
-// Find returns the best alternative and a score. A score of 1 means a perfect match
-func (t Finder) Find(input string) (string, float64) {
+// Find returns the best alternative, its score, and whether it was an exact match.
+func (t Finder) Find(input string) (string, float64, bool) {
 	return t.FindCtx(context.Background(), input)
 }
 
 // FindCtx is the same as Find, with context support
-func (t Finder) FindCtx(ctx context.Context, input string) (string, float64) {
+func (t Finder) FindCtx(ctx context.Context, input string) (string, float64, bool) {
+	// Initial value, compatible with JSON serialisation. It's not ideal to mix presentation with business logic,
+	// but in this instance it was convenient and about as effective as math.Inf(-1).
+	var hs = WorstScoreValue
 
 	// Exact matches
 	if _, exists := t.referenceMap[input]; exists {
-		return input, 1
+		return input, hs, true
 	}
 
-	inputLen := len(input)
-
-	var hs = math.Inf(-1)
 	var best = input
 	for _, ref := range t.reference {
 		select {
 		case <-ctx.Done():
-			return input, hs
+			return input, hs, false
 		default:
 		}
 
-		// Test if the input length is much fewer, making it an unlikely typo. The result is 20% of the length or at least
-		// 1 (due to math.Ceil)
-		if t.LengthTolerance != 0 && inputLen < len(ref)-int(math.Ceil(float64(inputLen)*t.LengthTolerance)) {
+		// Test if the input is much shorter than the reference, making it an unlikely typo.
+		if !meetsLengthTolerance(t.LengthTolerance, input, ref) {
 			continue
 		}
 
-		if d := t.Alg(input, ref); d > hs {
-			hs = d
+		if score := t.Alg(input, ref); score > hs {
+			hs = score
 			best = ref
 		}
 	}
 
-	return best, hs
+	return best, hs, false
+}
+
+// meetsLengthTolerance checks if the input meets the length tolerance criteria
+func meetsLengthTolerance(t float64, input, reference string) bool {
+	if t == 0 {
+		return true
+	}
+
+	inputLen := len(input)
+	refLen := len(reference)
+
+	// The allowed difference is N% of the input length, rounded up by math.Ceil (so at least 1 for non-empty input)
+	return inputLen >= refLen-int(math.Ceil(float64(inputLen)*t))
 }
diff --git a/finder/find_test.go b/finder/find_test.go
@@ -42,15 +42,16 @@ func TestNewWithCustomAlgorithm(t *testing.T) {
 	sug, _ := New([]string{"b"}, WithAlgorithm(exampleAlgorithm))
 
 	var score float64
+	var exact bool
 
-	_, score = sug.Find("a")
-	if score != 0 {
-		t.Errorf("Expected the score to be 0, instead I got %f.", score)
+	_, score, exact = sug.Find("a")
+	if exact {
+		t.Errorf("Expected exact to be false, instead I got %t (the score is %f).", exact, score)
 	}
 
-	_, score = sug.Find("b")
-	if score != 1 {
-		t.Errorf("Expected the score to be 1, instead I got %f.", score)
+	_, score, exact = sug.Find("b")
+	if !exact {
+		t.Errorf("Expected exact to be true, instead I got %t (the score is %f).", exact, score)
 	}
 }
 

diff --git a/finder/option.go b/finder/option.go
@@ -10,6 +10,9 @@ func WithAlgorithm(alg Algorithm) Option {
 	}
 }
 
+// WithLengthTolerance defines a percentage of length above which we no longer consider a length difference a typo.
+// A value of 0.2 specifies a tolerance of at most ~20% difference in size, with a minimum of 1 character.
+// A value of 0 (the default) disables this feature.
 func WithLengthTolerance(t float64) Option {
 	return func(s *Finder) {
 		s.LengthTolerance = t

diff --git a/server/service/domain.go b/server/service/domain.go
@@ -34,7 +34,7 @@ type Service struct {
 
 // Find returns the nearest reference
 func (s Service) Find(ctx context.Context, input string) (string, float64) {
-	suggestion, score := s.finder.FindCtx(ctx, input)
+	suggestion, score, _ := s.finder.FindCtx(ctx, input)
 	return suggestion, score
 }