Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stats: limit the number of top-n items (#11906) #11914

Merged
merged 2 commits into from Aug 29, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
49 changes: 25 additions & 24 deletions statistics/cmsketch.go
Expand Up @@ -61,35 +61,35 @@ func NewCMSketch(d, w int32) *CMSketch {
return &CMSketch{depth: d, width: w, table: tbl}
}

// dataCnt pairs a distinct sampled value with the number of times it was seen
// in the sample. Slices of dataCnt are sorted by cnt in descending order when
// picking the top-n items.
type dataCnt struct {
// data is the raw bytes of the sampled value.
data []byte
// cnt is the number of occurrences of data in the sample.
cnt uint64
}

// topNHelper wraps some variables used when building cmsketch with top n.
type topNHelper struct {
sampleSize uint64
counter map[hack.MutableString]uint64
sorted []uint64
sorted []dataCnt
onlyOnceItems uint64
sumTopN uint64
lastVal uint64
actualNumTop uint32
}

func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper {
counter := make(map[hack.MutableString]uint64)
for i := range sample {
counter[hack.String(sample[i])]++
}
sorted, onlyOnceItems := make([]uint64, 0, len(counter)), uint64(0)
for _, cnt := range counter {
sorted = append(sorted, cnt)
sorted, onlyOnceItems := make([]dataCnt, 0, len(counter)), uint64(0)
for key, cnt := range counter {
sorted = append(sorted, dataCnt{hack.Slice(string(key)), cnt})
if cnt == 1 {
onlyOnceItems++
}
}
sort.Slice(sorted, func(i, j int) bool {
return sorted[i] > sorted[j]
})
sort.SliceStable(sorted, func(i, j int) bool { return sorted[i].cnt > sorted[j].cnt })

var (
// last is the count of the last element in the top N; an element must occur at least `last` times to be in the top N.
last uint64
sumTopN uint64
sampleNDV = uint32(len(sorted))
)
Expand All @@ -98,15 +98,15 @@ func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper {
// frequency of the n-th element are added to the TopN statistics. We chose
// 2/3 as an empirical value because the average cardinality estimation
// error is relatively small compared with 1/2.
for i := uint32(0); i < sampleNDV && i < numTop*2; i++ {
if i >= numTop && sorted[i]*3 < sorted[numTop-1]*2 && last != sorted[i] {
var actualNumTop uint32
for ; actualNumTop < sampleNDV && actualNumTop < numTop*2; actualNumTop++ {
if actualNumTop >= numTop && sorted[actualNumTop].cnt*3 < sorted[numTop-1].cnt*2 {
break
}
last = sorted[i]
sumTopN += sorted[i]
sumTopN += sorted[actualNumTop].cnt
}

return &topNHelper{uint64(len(sample)), counter, sorted, onlyOnceItems, sumTopN, last}
return &topNHelper{uint64(len(sample)), sorted, onlyOnceItems, sumTopN, actualNumTop}
}

// NewCMSketchWithTopN returns a new CM sketch with TopN elements, the estimate NDV and the scale ratio.
Expand All @@ -126,22 +126,23 @@ func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64, default
enableTopN := helper.sampleSize/topNThreshold <= helper.sumTopN
if enableTopN {
c.topN = make(map[uint64][]*TopNMeta)
for i := uint32(0); i < helper.actualNumTop; i++ {
data, cnt := helper.sorted[i].data, helper.sorted[i].cnt
h1, h2 := murmur3.Sum128(data)
c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, cnt * scaleRatio})
}
helper.sorted = helper.sorted[helper.actualNumTop:]
}
c.defaultValue = defaultVal
for counterKey, cnt := range helper.counter {
data := hack.Slice(string(counterKey))
for i := range helper.sorted {
data, cnt := helper.sorted[i].data, helper.sorted[i].cnt
// If the value only occurred once in the sample, we assume that it is no different from
// a value that did not occur in the sample.
rowCount := defaultVal
if cnt > 1 {
rowCount = cnt * scaleRatio
}
if enableTopN && cnt >= helper.lastVal {
h1, h2 := murmur3.Sum128(data)
c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, rowCount})
} else {
c.insertBytesByCount(data, rowCount)
}
c.insertBytesByCount(data, rowCount)
}
return
}
Expand Down
1 change: 1 addition & 0 deletions statistics/cmsketch_test.go
Expand Up @@ -189,6 +189,7 @@ func (s *testStatisticsSuite) TestCMSketchTopN(c *C) {
for _, t := range tests {
lSketch, lMap, err := buildCMSketchTopNAndMap(d, w, 20, 1000, 0, total, imax, t.zipfFactor)
c.Check(err, IsNil)
c.Assert(len(lSketch.TopN()), LessEqual, 40)
avg, err := averageAbsoluteError(lSketch, lMap)
c.Assert(err, IsNil)
c.Check(avg, LessEqual, t.avgError)
Expand Down