From a45c616da3c9858084ef1e83831a42a6dba0745a Mon Sep 17 00:00:00 2001
From: Shay Banon
Date: Fri, 21 Mar 2014 22:54:16 +0100
Subject: [PATCH] Freq Terms Enum

A frequency caching terms enum, that also allows to be configured with an
optional filter. To be used by both significant terms and phrase suggester.

This change extracts the frequency caching into the same code, and allow in
the future to add a filter to control/customize the background frequencies
---
 .../common/lucene/index/FreqTermsEnum.java    | 289 ++++++++++++++++++
 .../SignificantLongTermsAggregator.java       |   1 +
 .../SignificantStringTermsAggregator.java     |   1 +
 .../SignificantTermsAggregatorFactory.java    | 183 +++---------
 .../search/suggest/phrase/WordScorer.java     |  87 +-----
 .../lucene/index/FreqTermsEnumTests.java      | 216 ++++++++++++++
 6 files changed, 545 insertions(+), 232 deletions(-)
 create mode 100644 src/main/java/org/elasticsearch/common/lucene/index/FreqTermsEnum.java
 create mode 100644 src/test/java/org/elasticsearch/common/lucene/index/FreqTermsEnumTests.java

diff --git a/src/main/java/org/elasticsearch/common/lucene/index/FreqTermsEnum.java b/src/main/java/org/elasticsearch/common/lucene/index/FreqTermsEnum.java
new file mode 100644
index 0000000000000..7689c0c45541c
--- /dev/null
+++ b/src/main/java/org/elasticsearch/common/lucene/index/FreqTermsEnum.java
@@ -0,0 +1,289 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.common.lucene.index;
+
+import com.google.common.collect.Lists;
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.ElasticsearchIllegalArgumentException;
+import org.elasticsearch.common.Nullable;
+import org.elasticsearch.common.lease.Releasable;
+import org.elasticsearch.common.lease.Releasables;
+import org.elasticsearch.common.lucene.docset.DocIdSets;
+import org.elasticsearch.common.lucene.search.ApplyAcceptedDocsFilter;
+import org.elasticsearch.common.lucene.search.Queries;
+import org.elasticsearch.common.util.BigArrays;
+import org.elasticsearch.common.util.BytesRefHash;
+import org.elasticsearch.common.util.IntArray;
+import org.elasticsearch.common.util.LongArray;
+
+import java.io.IOException;
+import java.util.Comparator;
+import java.util.List;
+
+/**
+ * A terms enum that caches the docFreq, totalTermFreq, or both, of every term it is asked to seek to, so that
+ * repeated lookups of the same term are answered without going back to the index. An optional filter restricts
+ * the frequencies to the docs that match it (heavier, since the matching postings have to be counted).
+ * <p/>
+ * Only {@link #seekExact(BytesRef)}, {@link #term()}, {@link #docFreq()} and {@link #totalTermFreq()} are supported,
+ * all other {@link TermsEnum} operations throw {@link UnsupportedOperationException}.
+ */
+public class FreqTermsEnum extends TermsEnum implements Releasable {
+
+    /** Per segment state: the segment terms enum, a reusable docs enum and the docs to count (null to use the segment stats). */
+    static class Holder {
+        final TermsEnum termsEnum;
+        @Nullable
+        DocsEnum docsEnum;
+        @Nullable
+        final Bits bits;
+
+        Holder(TermsEnum termsEnum, Bits bits) {
+            this.termsEnum = termsEnum;
+            this.bits = bits;
+        }
+    }
+
+    static final int INITIAL_NUM_TERM_FREQS_CACHED = 512;
+
+    /** Value cached for a term that was looked up and not found. */
+    private static final int NOT_FOUND = -2;
+
+    private final boolean docFreq;
+    private final boolean totalTermFreq;
+    private final Holder[] enums;
+
+    private final BigArrays bigArrays;
+    private IntArray termDocFreqs;
+    private LongArray termsTotalFreqs;
+    private BytesRefHash cachedTermOrds;
+
+    private int currentDocFreq = 0;
+    private long currentTotalTermFreq = 0;
+
+    private BytesRef current;
+
+    /**
+     * @param reader        the reader whose segments are consulted
+     * @param field         the field to look terms up in
+     * @param docFreq       whether document frequencies should be computed and cached
+     * @param totalTermFreq whether total term frequencies should be computed and cached
+     * @param filter        when not null, only docs matching it (and not deleted) are counted
+     * @param bigArrays     used to allocate the cache
+     * @throws ElasticsearchIllegalArgumentException if neither docFreq nor totalTermFreq is requested
+     */
+    public FreqTermsEnum(IndexReader reader, String field, boolean docFreq, boolean totalTermFreq, @Nullable Filter filter, BigArrays bigArrays) throws IOException {
+        this.docFreq = docFreq;
+        this.totalTermFreq = totalTermFreq;
+        if (!docFreq && !totalTermFreq) {
+            throw new ElasticsearchIllegalArgumentException("either docFreq or totalTermFreq must be true");
+        }
+        Filter acceptedDocsFilter = null;
+        if (filter != null && filter != Queries.MATCH_ALL_FILTER) {
+            // we want to force apply deleted docs, wrapping once is enough for all segments
+            acceptedDocsFilter = new ApplyAcceptedDocsFilter(filter);
+        }
+        List<AtomicReaderContext> leaves = reader.leaves();
+        List<Holder> enums = Lists.newArrayListWithExpectedSize(leaves.size());
+        for (AtomicReaderContext context : leaves) {
+            Terms terms = context.reader().terms(field);
+            if (terms == null) {
+                continue;
+            }
+            TermsEnum termsEnum = terms.iterator(null);
+            if (termsEnum == null) {
+                continue;
+            }
+            Bits bits = null;
+            if (acceptedDocsFilter != null) {
+                DocIdSet docIdSet = acceptedDocsFilter.getDocIdSet(context, context.reader().getLiveDocs());
+                if (DocIdSets.isEmpty(docIdSet)) {
+                    // fully filtered, none matching, no need to iterate on this
+                    continue;
+                }
+                bits = DocIdSets.toSafeBits(context.reader(), docIdSet);
+            } else if (filter != null) {
+                // match all filter, only deleted docs need to be excluded
+                bits = context.reader().getLiveDocs();
+            }
+            enums.add(new Holder(termsEnum, bits));
+        }
+        this.bigArrays = bigArrays;
+
+        this.enums = enums.toArray(new Holder[enums.size()]);
+
+        if (docFreq) {
+            termDocFreqs = bigArrays.newIntArray(INITIAL_NUM_TERM_FREQS_CACHED, false);
+        } else {
+            termDocFreqs = null;
+        }
+        if (totalTermFreq) {
+            termsTotalFreqs = bigArrays.newLongArray(INITIAL_NUM_TERM_FREQS_CACHED, false);
+        } else {
+            termsTotalFreqs = null;
+        }
+        cachedTermOrds = new BytesRefHash(INITIAL_NUM_TERM_FREQS_CACHED, bigArrays);
+    }
+
+    /**
+     * Returns the term passed to the last successful {@link #seekExact(BytesRef)}, or null if the last seek did not find it.
+     */
+    @Override
+    public BytesRef term() throws IOException {
+        return current;
+    }
+
+    /**
+     * Seeks to the given term. Its frequencies are computed on the first lookup and served from the cache afterwards.
+     * A term that exists in the index but is not contained in any counted doc is reported as not found.
+     *
+     * @return true if the term was found, in which case the requested frequencies are available
+     */
+    @Override
+    public boolean seekExact(BytesRef text) throws IOException {
+        long currentTermOrd = cachedTermOrds.add(text);
+        if (currentTermOrd < 0) { // already seen, initialize instance data with the cached frequencies
+            currentTermOrd = -1 - currentTermOrd;
+            boolean found = true;
+            if (docFreq) {
+                currentDocFreq = termDocFreqs.get(currentTermOrd);
+                if (currentDocFreq == NOT_FOUND) {
+                    found = false;
+                }
+            }
+            if (totalTermFreq) {
+                currentTotalTermFreq = termsTotalFreqs.get(currentTermOrd);
+                if (currentTotalTermFreq == NOT_FOUND) {
+                    found = false;
+                }
+            }
+            current = found ? text : null;
+            return found;
+        }
+
+        int docFreq = 0;
+        long totalTermFreq = 0;
+        for (Holder anEnum : enums) {
+            if (!anEnum.termsEnum.seekExact(text)) {
+                continue;
+            }
+            if (anEnum.bits == null) {
+                docFreq += anEnum.termsEnum.docFreq();
+                long segmentTotalTermFreq = anEnum.termsEnum.totalTermFreq();
+                if (totalTermFreq == -1 || segmentTotalTermFreq == -1) {
+                    // no freqs really..., blast; a sum with an unknown part is unknown as well
+                    totalTermFreq = -1;
+                } else {
+                    totalTermFreq += segmentTotalTermFreq;
+                }
+            } else {
+                DocsEnum docsEnum = anEnum.docsEnum = anEnum.termsEnum.docs(anEnum.bits, anEnum.docsEnum, this.totalTermFreq ? DocsEnum.FLAG_FREQS : DocsEnum.FLAG_NONE);
+                for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
+                    docFreq++;
+                    if (this.totalTermFreq && totalTermFreq != -1) {
+                        totalTermFreq += docsEnum.freq();
+                    }
+                }
+            }
+        }
+        // a term that matches no counted doc (e.g. all its docs are filtered out) is treated as missing
+        boolean found = docFreq > 0;
+
+        current = found ? text : null;
+        if (this.docFreq) {
+            currentDocFreq = found ? docFreq : NOT_FOUND;
+            termDocFreqs = bigArrays.grow(termDocFreqs, currentTermOrd + 1);
+            termDocFreqs.set(currentTermOrd, currentDocFreq);
+        }
+        if (this.totalTermFreq) {
+            currentTotalTermFreq = found ? totalTermFreq : NOT_FOUND;
+            termsTotalFreqs = bigArrays.grow(termsTotalFreqs, currentTermOrd + 1);
+            termsTotalFreqs.set(currentTermOrd, currentTotalTermFreq);
+        }
+
+        return found;
+    }
+
+    /** Returns the doc freq of the last term sought, only meaningful if doc freqs were requested and the term was found. */
+    @Override
+    public int docFreq() throws IOException {
+        return currentDocFreq;
+    }
+
+    /** Returns the total term freq of the last term sought, or -1 if a segment containing the term reported none. */
+    @Override
+    public long totalTermFreq() throws IOException {
+        return currentTotalTermFreq;
+    }
+
+    /** Releases the cache, the enum must not be used for lookups afterwards. */
+    @Override
+    public boolean release() throws ElasticsearchException {
+        try {
+            Releasables.release(cachedTermOrds, termDocFreqs, termsTotalFreqs);
+        } finally {
+            cachedTermOrds = null;
+            termDocFreqs = null;
+            termsTotalFreqs = null;
+        }
+        return true;
+    }
+
+    @Override
+    public void seekExact(long ord) throws IOException {
+        throw new UnsupportedOperationException("freq terms enum");
+    }
+
+    @Override
+    public SeekStatus seekCeil(BytesRef text) throws IOException {
+        throw new UnsupportedOperationException("freq terms enum");
+    }
+
+    @Override
+    public long ord() throws IOException {
+        throw new UnsupportedOperationException("freq terms enum");
+    }
+
+    @Override
+    public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
+        throw new UnsupportedOperationException("freq terms enum");
+    }
+
+    @Override
+    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
+        throw new UnsupportedOperationException("freq terms enum");
+    }
+
+    @Override
+    public BytesRef next() throws IOException {
+        throw new UnsupportedOperationException("freq terms enum");
+    }
+
+    @Override
+    public Comparator<BytesRef> getComparator() {
+        throw new UnsupportedOperationException("freq terms enum");
+    }
+}
diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java
index 6cc7f27c13233..3bbcae0731d8c 100644
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java
@@ -75,6 +75,7 @@ public SignificantLongTerms buildAggregation(long owningBucketOrdinal) {
 
             if (spare == null) {
                 spare = new SignificantLongTerms.Bucket(0, 0, 0, 0, 0, null);
+                termsAggFactory.buildTermsEnum(context);
             }
             spare.term = bucketOrds.key(i);
             spare.subsetDf = bucketDocCount(ord);
diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java
index 763a16cdb5e24..d10f16ae49ef0 100644
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java
@@ -79,6 +79,7 @@ public SignificantStringTerms buildAggregation(long owningBucketOrdinal) {
         for (int i = 0; i < bucketOrds.size(); i++) {
             if (spare == null) {
                 spare = new SignificantStringTerms.Bucket(new BytesRef(), 0, 0, 0, 0, null);
+                termsAggFactory.buildTermsEnum(context);
             }
 
             bucketOrds.get(i, spare.termBytes);
diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java
index af43cdf87b64b..1b68b011b0d1c 100644
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java
@@ -18,18 +18,14 @@
  */
 package org.elasticsearch.search.aggregations.bucket.significant;
 
-import org.apache.lucene.index.*;
-import org.apache.lucene.index.FilterAtomicReader.FilterTermsEnum;
-import org.apache.lucene.util.Bits;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchIllegalArgumentException;
 import org.elasticsearch.common.lease.Releasable;
-import org.elasticsearch.common.lease.Releasables;
-import org.elasticsearch.common.util.BigArrays;
-import org.elasticsearch.common.util.BytesRefHash;
-import org.elasticsearch.common.util.IntArray;
-import org.elasticsearch.common.util.LongArray;
+import org.elasticsearch.common.lucene.index.FreqTermsEnum;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.search.aggregations.AggregationExecutionException;
 import org.elasticsearch.search.aggregations.Aggregator;
@@ -53,7 +49,6 @@ public class SignificantTermsAggregatorFactory extends ValueSourceAggregatorFact
 
     public static final String EXECUTION_HINT_VALUE_MAP = "map";
     public static final String EXECUTION_HINT_VALUE_ORDINALS = "ordinals";
-    static final int INITIAL_NUM_TERM_FREQS_CACHED = 512;
 
     private final int requiredSize;
     private final int shardSize;
@@ -62,14 +57,11 @@ public class SignificantTermsAggregatorFactory extends ValueSourceAggregatorFact
     private final String executionHint;
     private String indexedFieldName;
     private FieldMapper mapper;
-    private IntArray termDocFreqs;
-    private BytesRefHash cachedTermOrds;
-    private BigArrays bigArrays;
     private TermsEnum termsEnum;
     private int numberOfAggregatorsCreated = 0;
 
     public SignificantTermsAggregatorFactory(String name, ValuesSourceConfig valueSourceConfig, int requiredSize,
             int shardSize, long minDocCount, IncludeExclude includeExclude, String executionHint) {
 
         super(name, SignificantStringTerms.TYPE.name(), valueSourceConfig);
         this.requiredSize = requiredSize;
@@ -81,7 +73,6 @@ public SignificantTermsAggregatorFactory(String name, ValuesSourceConfig valueSo
             this.indexedFieldName = valuesSourceConfig.fieldContext().field();
             mapper = SearchContext.current().smartNameFieldMapper(indexedFieldName);
         }
-        bigArrays = SearchContext.current().bigArrays();
     }
 
     @Override
@@ -100,31 +91,8 @@ private static boolean hasParentBucketAggregator(Aggregator parent) {
 
     @Override
     protected Aggregator create(ValuesSource valuesSource, long expectedBucketsCount, AggregationContext aggregationContext, Aggregator parent) {
-
         numberOfAggregatorsCreated++;
-        if (numberOfAggregatorsCreated == 1) {
-            // Setup a termsEnum for use by first aggregator
-            try {
-                SearchContext searchContext = aggregationContext.searchContext();
-                ContextIndexSearcher searcher = searchContext.searcher();
-                Terms terms = MultiFields.getTerms(searcher.getIndexReader(), indexedFieldName);
-                // terms can be null if the choice of field is not found in this index
-                if (terms != null) {
-                    termsEnum = terms.iterator(null);
-                }
-            } catch (IOException e) {
-                throw new ElasticsearchException("IOException loading background document frequency info", e);
-            }
-        } else if (numberOfAggregatorsCreated == 2) {
-            // When we have > 1 agg we have possibility of duplicate term frequency lookups and
-            // so introduce a cache in the form of a wrapper around the plain termsEnum created
-            // for use with the first agg
-            if (termsEnum != null) {
-                SearchContext searchContext = aggregationContext.searchContext();
-                termsEnum = new FrequencyCachingTermsEnumWrapper(termsEnum, searchContext.bigArrays(), true, false);
-            }
-        }
 
         long estimatedBucketCount = valuesSource.metaData().maxAtomicUniqueValuesCount();
         if (estimatedBucketCount < 0) {
             // there isn't an estimation available.. 50 should be a good start
@@ -183,8 +151,37 @@ protected Aggregator create(ValuesSource valuesSource, long expectedBucketsCount
                 "]. It can only be applied to numeric or string fields.");
     }
 
+    /**
+     * Lazily builds the terms enum used by {@link #getBackgroundFrequency(BytesRef)}; the same instance is returned
+     * on subsequent calls. It stays null when a single aggregator was created and the field is missing from the index.
+     */
+    public TermsEnum buildTermsEnum(AggregationContext context) {
+        if (termsEnum != null) {
+            return termsEnum;
+        }
+        try {
+            SearchContext searchContext = context.searchContext();
+            ContextIndexSearcher searcher = searchContext.searcher();
+            if (numberOfAggregatorsCreated > 1) {
+                // When we have > 1 agg we have possibility of duplicate term frequency lookups and
+                // so introduce a cache in the form of a frequency caching terms enum
+                termsEnum = new FreqTermsEnum(searcher.getIndexReader(), indexedFieldName, true, false, null, searchContext.bigArrays());
+            } else {
+                // With a single agg there are no duplicate lookups to cache, a plain termsEnum will do
+                Terms terms = MultiFields.getTerms(searcher.getIndexReader(), indexedFieldName);
+                // terms can be null if the choice of field is not found in this index
+                if (terms != null) {
+                    termsEnum = terms.iterator(null);
+                }
+            }
+        } catch (IOException e) {
+            throw new ElasticsearchException("failed to build terms enumeration", e);
+        }
+        return termsEnum;
+    }
+
     public long getBackgroundFrequency(BytesRef termBytes) {
-        assert termsEnum !=null; // having failed to find a field in the index we don't expect any calls for frequencies
+        assert termsEnum != null; // having failed to find a field in the index we don't expect any calls for frequencies
         long result = 0;
         try {
             if (termsEnum.seekExact(termBytes)) {
@@ -213,116 +210,4 @@ public boolean release() throws ElasticsearchException {
         }
         return true;
     }
-
-    // A specialist TermsEnum wrapper for use in the repeated look-ups of frequency stats.
-    // TODO factor out as a utility class to replace similar org.elasticsearch.search.suggest.phrase.WordScorer.FrequencyCachingTermsEnumWrapper
-    // This implementation is likely to produce less garbage than WordScorer's impl but will need benchmarking/testing for that use case.
-    static class FrequencyCachingTermsEnumWrapper extends FilterTermsEnum implements Releasable {
-
-        int currentTermDocFreq = 0;
-        long currentTermTotalFreq = 0;
-        private IntArray termDocFreqs;
-        private LongArray termTotalFreqs;
-        private BytesRefHash cachedTermOrds;
-        protected BigArrays bigArrays;
-        private boolean cacheDocFreqs;
-        private boolean cacheTotalFreqs;
-        private long currentTermOrd;
-
-        public FrequencyCachingTermsEnumWrapper(TermsEnum delegate, BigArrays bigArrays, boolean cacheDocFreqs, boolean cacheTotalFreqs) {
-            super(delegate);
-            this.bigArrays = bigArrays;
-            this.cacheDocFreqs = cacheDocFreqs;
-            this.cacheTotalFreqs = cacheTotalFreqs;
-            if (cacheDocFreqs) {
-                termDocFreqs = bigArrays.newIntArray(INITIAL_NUM_TERM_FREQS_CACHED, false);
-            }
-            if (cacheTotalFreqs) {
-                termTotalFreqs = bigArrays.newLongArray(INITIAL_NUM_TERM_FREQS_CACHED, false);
-            }
-            cachedTermOrds = new BytesRefHash(INITIAL_NUM_TERM_FREQS_CACHED, bigArrays);
-        }
-
-        @Override
-        public boolean seekExact(BytesRef text) throws IOException {
-            currentTermDocFreq = 0;
-            currentTermTotalFreq = 0;
-            currentTermOrd = cachedTermOrds.add(text);
-            if (currentTermOrd < 0) { // already seen, initialize instance data with the cached frequencies
-                currentTermOrd = -1 - currentTermOrd;
-                if (cacheDocFreqs) {
-                    currentTermDocFreq = termDocFreqs.get(currentTermOrd);
-                }
-                if (cacheTotalFreqs) {
-                    currentTermTotalFreq = termTotalFreqs.get(currentTermOrd);
-                }
-                return true;
-            } else { // cache miss - pre-emptively read and cache the required frequency values
-                if (in.seekExact(text)) {
-                    if (cacheDocFreqs) {
-                        currentTermDocFreq = in.docFreq();
-                        termDocFreqs = bigArrays.grow(termDocFreqs, currentTermOrd + 1);
-                        termDocFreqs.set(currentTermOrd, currentTermDocFreq);
-                    }
-                    if (cacheTotalFreqs) {
-                        currentTermTotalFreq = in.totalTermFreq();
-                        termTotalFreqs = bigArrays.grow(termTotalFreqs, currentTermOrd + 1);
-                        termTotalFreqs.set(currentTermOrd, currentTermTotalFreq);
-                    }
-                    return true;
-                }
-            }
-            return false;
-        }
-
-        @Override
-        public long totalTermFreq() throws IOException {
-            assert cacheTotalFreqs;
-            return currentTermTotalFreq;
-        }
-
-        @Override
-        public int docFreq() throws IOException {
-            assert cacheDocFreqs;
-            return currentTermDocFreq;
-        }
-
-        @Override
-        public void seekExact(long ord) throws IOException {
-            throw new UnsupportedOperationException();
-        }
-
-        @Override
-        public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
-            throw new UnsupportedOperationException();
-        }
-
-        @Override
-        public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
-            throw new UnsupportedOperationException();
-        }
-
-        public SeekStatus seekCeil(BytesRef text) throws IOException {
-            throw new UnsupportedOperationException();
-        }
-
-        @Override
-        public BytesRef next() {
-            throw new UnsupportedOperationException();
-        }
-
-        @Override
-        public boolean release() throws ElasticsearchException {
-            try {
-                Releasables.release(cachedTermOrds, termDocFreqs, termTotalFreqs);
-            } finally {
-                cachedTermOrds = null;
-                termDocFreqs = null;
-                termTotalFreqs = null;
-            }
-            return true;
-        }
-
-    }
-
 }
diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java b/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java
index 35193077689dd..6b3c1e910ef61 100644
--- a/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java
+++ b/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java
@@ -25,6 +25,8 @@
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.ElasticsearchIllegalArgumentException;
+import org.elasticsearch.common.lucene.index.FreqTermsEnum;
+import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet;
 
@@ -39,7 +41,7 @@ public abstract class WordScorer {
     protected final double realWordLikelyhood;
     protected final BytesRef spare = new BytesRef();
     protected final BytesRef separator;
-    protected final TermsEnum termsEnum;
+    private final TermsEnum termsEnum;
     private final long numTerms;
     private final boolean useTotalTermFreq;
 
@@ -57,7 +59,7 @@ public WordScorer(IndexReader reader, Terms terms, String field, double realWord
         this.vocabluarySize =  vocSize == -1 ? reader.maxDoc() : vocSize;
         this.useTotalTermFreq = vocSize != -1;
         this.numTerms = terms.size();
-        this.termsEnum = new FrequencyCachingTermsEnumWrapper(terms.iterator(null));
+        this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now
         this.reader = reader;
         this.realWordLikelyhood = realWordLikelyHood;
         this.separator = separator;
@@ -103,85 +105,4 @@ public static interface WordScorerFactory {
         public WordScorer newScorer(IndexReader reader, Terms terms,
                 String field, double realWordLikelyhood, BytesRef separator) throws IOException;
     }
-
-    /**
-     * Terms enum wrapper that caches term frequencies in an effort to outright skip seeks.  Only works with seekExact(BytesRef), not next or
-     * not seekCeil.  Because of this it really only makes sense in this context.
-     */
-    private static class FrequencyCachingTermsEnumWrapper extends FilterTermsEnum {
-        private ObjectObjectMap<BytesRef, CacheEntry> cache = new ObjectObjectOpenHashMap<BytesRef, CacheEntry>();
-        /**
-         * The last term that the called attempted to seek to.
-         */
-        private CacheEntry last;
-
-        public FrequencyCachingTermsEnumWrapper(TermsEnum in) {
-            super(in);
-        }
-
-        @Override
-        public boolean seekExact(BytesRef text) throws IOException {
-            last = cache.get(text);
-            if (last != null) {
-                // This'll fail to work properly if the user seeks but doesn't check the frequency, causing us to cache it.
-                // That is OK because WordScorer only seeks to check the frequency.
-                return last.ttf != 0 || last.df != 0;
-            }
-            last = new CacheEntry();
-            cache.put(BytesRef.deepCopyOf(text), last);
-            if (in.seekExact(text)) {
-                // Found so mark the term uncached.
-                last.df = -1;
-                last.ttf = -1;
-                return true;
-            }
-            // Not found.  The cache will default to 0 for the freqs, meaning not found.
-            return false;
-        }
-
-        @Override
-        public long totalTermFreq() throws IOException {
-            if (last.ttf == -1) {
-                last.ttf = in.totalTermFreq();
-            }
-            return last.ttf;
-        }
-
-        @Override
-        public int docFreq() throws IOException {
-            if (last.df == -1) {
-                last.df = in.docFreq();
-            }
-            return last.df;
-        }
-
-        @Override
-        public void seekExact(long ord) throws IOException {
-            throw new UnsupportedOperationException();
-        }
-
-        @Override
-        public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
-            throw new UnsupportedOperationException();
-        }
-
-        @Override
-        public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
-            throw new UnsupportedOperationException();
-        }
-
-        public SeekStatus seekCeil(BytesRef text) throws IOException {
-            throw new UnsupportedOperationException();
-        }
-
-        @Override
-        public BytesRef next() {
-            throw new UnsupportedOperationException();
-        }
-
-        private static class CacheEntry {
-            private long ttf;
-            private int df;
-        }
-    }
 }
diff --git a/src/test/java/org/elasticsearch/common/lucene/index/FreqTermsEnumTests.java b/src/test/java/org/elasticsearch/common/lucene/index/FreqTermsEnumTests.java
new file mode 100644
index 0000000000000..9d9c79bd92550
--- /dev/null
+++ b/src/test/java/org/elasticsearch/common/lucene/index/FreqTermsEnumTests.java
@@ -0,0 +1,216 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.common.lucene.index;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.*;
+import org.apache.lucene.queries.TermsFilter;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.lucene.search.Queries;
+import org.elasticsearch.common.util.BigArrays;
+import org.elasticsearch.test.ElasticsearchLuceneTestCase;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
+import static org.hamcrest.Matchers.equalTo;
+
+/**
+ * Checks the frequencies reported by {@link FreqTermsEnum} against references computed from the stored fields.
+ */
+public class FreqTermsEnumTests extends ElasticsearchLuceneTestCase {
+
+    private String[] terms;
+    private IndexWriter iw;
+    private IndexReader reader;
+    private Map<String, FreqHolder> referenceAll;
+    private Map<String, FreqHolder> referenceNotDeleted;
+    private Map<String, FreqHolder> referenceFilter;
+    private Filter filter;
+
+    static class FreqHolder {
+        int docFreq;
+        long totalTermFreq;
+    }
+
+    @Before
+    @Override
+    public void setUp() throws Exception {
+        super.setUp();
+        referenceAll = Maps.newHashMap();
+        referenceNotDeleted = Maps.newHashMap();
+        referenceFilter = Maps.newHashMap();
+
+        Directory dir = newDirectory();
+        // use keyword analyzer, the references are built from the stored field which holds the exact term
+        IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new KeywordAnalyzer());
+        conf.setMergeScheduler(NoMergeScheduler.INSTANCE); // we don't want to do any merges, so we won't expunge deletes
+        iw = new IndexWriter(dir, conf);
+        terms = new String[scaledRandomIntBetween(10, 300)];
+        for (int i = 0; i < terms.length; i++) {
+            terms[i] = randomAsciiOfLength(5);
+        }
+
+        int numberOfDocs = scaledRandomIntBetween(30, 300);
+        Document[] docs = new Document[numberOfDocs];
+        for (int i = 0; i < numberOfDocs; i++) {
+            Document doc = new Document();
+            doc.add(new StringField("id", Integer.toString(i), Field.Store.YES));
+            docs[i] = doc;
+            for (String term : terms) {
+                if (randomBoolean()) {
+                    continue;
+                }
+                int freq = randomIntBetween(1, 3);
+                for (int j = 0; j < freq; j++) {
+                    doc.add(new TextField("field", term, Field.Store.YES));
+                }
+            }
+        }
+
+        // add all docs, with random commits in between
+        for (Document doc : docs) {
+            iw.addDocument(doc);
+            if (randomInt(10) == 5) {
+                iw.commit();
+            }
+        }
+
+        Set<String> deletedIds = Sets.newHashSet();
+        for (int i = 0; i < docs.length; i++) {
+            if (randomInt(5) == 2) {
+                Term idTerm = new Term("id", Integer.toString(i));
+                deletedIds.add(idTerm.text());
+                iw.deleteDocuments(idTerm);
+            }
+        }
+
+        // now go over each doc, build the relevant references and filter
+        reader = DirectoryReader.open(iw, true);
+        List<Term> filterTerms = Lists.newArrayList();
+        for (int docId = 0; docId < reader.maxDoc(); docId++) {
+            Document doc = reader.document(docId);
+            addFreqs(doc, referenceAll);
+            if (!deletedIds.contains(doc.getField("id").stringValue())) {
+                addFreqs(doc, referenceNotDeleted);
+                if (randomBoolean()) {
+                    filterTerms.add(new Term("id", doc.getField("id").stringValue()));
+                    addFreqs(doc, referenceFilter);
+                }
+            }
+        }
+        filter = new TermsFilter(filterTerms);
+    }
+
+    private void addFreqs(Document doc, Map<String, FreqHolder> reference) {
+        Set<String> addedDocFreq = Sets.newHashSet();
+        for (IndexableField field : doc.getFields("field")) {
+            String term = field.stringValue();
+            FreqHolder freqHolder = reference.get(term);
+            if (freqHolder == null) {
+                freqHolder = new FreqHolder();
+                reference.put(term, freqHolder);
+            }
+            if (!addedDocFreq.contains(term)) {
+                freqHolder.docFreq++;
+                addedDocFreq.add(term);
+            }
+            freqHolder.totalTermFreq++;
+        }
+    }
+
+    @After
+    @Override
+    public void tearDown() throws Exception {
+        if (reader != null) {
+            reader.close();
+        }
+        iw.rollback();
+        iw.getDirectory().close();
+        super.tearDown();
+    }
+
+    @Test
+    public void testAllFreqs() throws Exception {
+        assertAgainstReference(true, true, null, referenceAll);
+        assertAgainstReference(true, false, null, referenceAll);
+        assertAgainstReference(false, true, null, referenceAll);
+    }
+
+    @Test
+    public void testNonDeletedFreqs() throws Exception {
+        assertAgainstReference(true, true, Queries.MATCH_ALL_FILTER, referenceNotDeleted);
+        assertAgainstReference(true, false, Queries.MATCH_ALL_FILTER, referenceNotDeleted);
+        assertAgainstReference(false, true, Queries.MATCH_ALL_FILTER, referenceNotDeleted);
+    }
+
+    @Test
+    public void testFilterFreqs() throws Exception {
+        assertAgainstReference(true, true, filter, referenceFilter);
+        assertAgainstReference(true, false, filter, referenceFilter);
+        assertAgainstReference(false, true, filter, referenceFilter);
+    }
+
+    private void assertAgainstReference(boolean docFreq, boolean totalTermFreq, Filter filter, Map<String, FreqHolder> reference) throws Exception {
+        FreqTermsEnum freqTermsEnum = new FreqTermsEnum(reader, "field", docFreq, totalTermFreq, filter, BigArrays.NON_RECYCLING_INSTANCE);
+        try {
+            assertAgainstReference(freqTermsEnum, reference, docFreq, totalTermFreq);
+        } finally {
+            freqTermsEnum.release();
+        }
+    }
+
+    private void assertAgainstReference(FreqTermsEnum termsEnum, Map<String, FreqHolder> reference, boolean docFreq, boolean totalTermFreq) throws Exception {
+        int cycles = randomIntBetween(1, 5);
+        for (int i = 0; i < cycles; i++) {
+            // several cycles in random order, so terms are also looked up once their frequencies are cached
+            List<String> terms = Lists.newArrayList(Arrays.asList(this.terms));
+            Collections.shuffle(terms, getRandom());
+            for (String term : terms) {
+                if (!termsEnum.seekExact(new BytesRef(term))) {
+                    assertThat("cycle " + i + ", term " + term + ", not found", reference.containsKey(term), equalTo(false));
+                    continue;
+                }
+                if (docFreq) {
+                    assertThat("cycle " + i + ", term " + term + ", docFreq", termsEnum.docFreq(), equalTo(reference.get(term).docFreq));
+                }
+                if (totalTermFreq) {
+                    assertThat("cycle " + i + ", term " + term + ", totalTermFreq", termsEnum.totalTermFreq(), equalTo(reference.get(term).totalTermFreq));
+                }
+            }
+        }
+    }
+}