Skip to content

Commit

Permalink
Expose filler_token via ShingleTokenFilterFactory
Browse files Browse the repository at this point in the history
Lucene 4.7 supports a setter for the `filler_token` that is
inserted if there are gaps in the token stream. This change exposes
this setting.

Closes #4307
  • Loading branch information
s1monw committed Feb 19, 2014
1 parent a12e02a commit f7c6f36
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 16 deletions.
Expand Up @@ -32,5 +32,9 @@ no effect. Defaults to `false`.

|`token_separator` |The string to use when joining adjacent tokens to
form a shingle. Defaults to `" "`.
|`filler_token` |The string to use as a replacement for each position
at which there is no actual token in the stream. For instance, this string is
used if the position increment is greater than one when a `stop` filter is used
together with the `shingle` filter. Defaults to `"_"`.
|=======================================================================

Expand Up @@ -42,20 +42,21 @@ public ShingleTokenFilterFactory(Index index, @IndexSettings Settings indexSetti
Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);
Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator);
String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator, fillerToken);
}


@Override
public TokenStream create(TokenStream tokenStream) {
return factory.create(tokenStream);
return factory.create(tokenStream);
}


/**
 * Exposes the inner {@link Factory} that this token filter factory
 * delegates to when creating token streams.
 *
 * @return the inner shingle factory held by this instance
 */
public Factory getInnerFactory() {
    return factory;
}

public static final class Factory implements TokenFilterFactory {
private final int maxShingleSize;

Expand All @@ -64,44 +65,47 @@ public static final class Factory implements TokenFilterFactory {
private final boolean outputUnigramsIfNoShingles;

private final String tokenSeparator;
private final String fillerToken;

private int minShingleSize;

private final String name;

public Factory(String name) {
this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, ShingleFilter.DEFAULT_FILLER_TOKEN);
}
Factory(String name, int minShingleSize, int maxShingleSize, boolean outputUnigrams, boolean outputUnigramsIfNoShingles, String tokenSeparator) {

Factory(String name, int minShingleSize, int maxShingleSize, boolean outputUnigrams, boolean outputUnigramsIfNoShingles, String tokenSeparator, String fillerToken) {
this.maxShingleSize = maxShingleSize;
this.outputUnigrams = outputUnigrams;
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
this.tokenSeparator = tokenSeparator;
this.minShingleSize = minShingleSize;
this.fillerToken = fillerToken;
this.name = name;
}

/**
 * Wraps the given stream in a {@link ShingleFilter} configured with this
 * factory's shingle sizes, unigram options, token separator and filler token.
 *
 * @param tokenStream the stream to wrap
 * @return the configured shingle filter
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    ShingleFilter filter = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
    filter.setOutputUnigrams(outputUnigrams);
    filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    filter.setTokenSeparator(tokenSeparator);
    // The filler token stands in for positions that have no actual token.
    filter.setFillerToken(fillerToken);
    return filter;
}

/**
 * @return the maximum shingle size handed to each {@link ShingleFilter}
 *         this factory creates
 */
public int getMaxShingleSize() {
    return this.maxShingleSize;
}

/**
 * @return the minimum shingle size handed to each {@link ShingleFilter}
 *         this factory creates
 */
public int getMinShingleSize() {
    return this.minShingleSize;
}

/**
 * @return the flag passed to {@code ShingleFilter.setOutputUnigrams} for
 *         each filter this factory creates
 */
public boolean getOutputUnigrams() {
    return this.outputUnigrams;
}

/**
 * @return the flag passed to {@code ShingleFilter.setOutputUnigramsIfNoShingles}
 *         for each filter this factory creates
 */
public boolean getOutputUnigramsIfNoShingles() {
    return this.outputUnigramsIfNoShingles;
}
Expand Down
Expand Up @@ -75,9 +75,8 @@ public void testFillerToken() throws IOException {
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(RESOURCE);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("shingle_filler");
String source = "simon the sorcerer";
String[] expected = new String[]{"simon FILLER sorcerer"};
String[] expected = new String[]{"simon FILLER", "simon FILLER sorcerer", "FILLER sorcerer"};
TokenStream tokenizer = new StopFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(source)), StopFilter.makeStopSet(TEST_VERSION_CURRENT, "the"));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}

}

0 comments on commit f7c6f36

Please sign in to comment.