New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for char filters in the analyze API #5148
Changes from 2 commits
554a08a
e236b33
187846c
d0dc1e3
36235ac
1fe04bf
8eb4170
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
*/ | ||
package org.elasticsearch.action.admin.indices.analyze; | ||
|
||
import org.elasticsearch.Version; | ||
import org.elasticsearch.action.ActionRequestValidationException; | ||
import org.elasticsearch.action.support.single.custom.SingleCustomOperationRequest; | ||
import org.elasticsearch.common.Nullable; | ||
|
@@ -44,6 +45,8 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest> | |
|
||
private String[] tokenFilters; | ||
|
||
private String[] charFilters; | ||
|
||
private String field; | ||
|
||
AnalyzeRequest() { | ||
|
@@ -110,6 +113,15 @@ public String[] tokenFilters() { | |
return this.tokenFilters; | ||
} | ||
|
||
public AnalyzeRequest charFilters(String... charFilters) { | ||
this.charFilters = charFilters; | ||
return this; | ||
} | ||
|
||
public String[] charFilters() { | ||
return this.charFilters; | ||
} | ||
|
||
public AnalyzeRequest field(String field) { | ||
this.field = field; | ||
return this; | ||
|
@@ -142,6 +154,15 @@ public void readFrom(StreamInput in) throws IOException { | |
tokenFilters[i] = in.readString(); | ||
} | ||
} | ||
if (in.getVersion().onOrAfter(Version.V_1_1_0)) { | ||
size = in.readVInt(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can't we sue |
||
if (size > 0) { | ||
charFilters = new String[size]; | ||
for (int i = 0; i < size; i++) { | ||
charFilters[i] = in.readString(); | ||
} | ||
} | ||
} | ||
field = in.readOptionalString(); | ||
} | ||
|
||
|
@@ -160,6 +181,16 @@ public void writeTo(StreamOutput out) throws IOException { | |
out.writeString(tokenFilter); | ||
} | ||
} | ||
if (out.getVersion().onOrAfter(Version.V_1_1_0)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see above we have a |
||
if (charFilters == null) { | ||
out.writeVInt(0); | ||
} else { | ||
out.writeVInt(charFilters.length); | ||
for (String charFilter : charFilters) { | ||
out.writeString(charFilter); | ||
} | ||
} | ||
} | ||
out.writeOptionalString(field); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,6 +71,7 @@ public void handleRequest(final RestRequest request, final RestChannel channel) | |
analyzeRequest.field(request.param("field")); | ||
analyzeRequest.tokenizer(request.param("tokenizer")); | ||
analyzeRequest.tokenFilters(request.paramAsStringArray("token_filters", request.paramAsStringArray("filters", null))); | ||
analyzeRequest.charFilters(request.paramAsStringArray("char_filters", null)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we make There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd suggest to use |
||
client.admin().indices().analyze(analyzeRequest, new ActionListener<AnalyzeResponse>() { | ||
@Override | ||
public void onResponse(AnalyzeResponse response) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,12 +23,27 @@ | |
import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequestBuilder; | ||
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse; | ||
import org.elasticsearch.common.Priority; | ||
import org.elasticsearch.common.inject.Injector; | ||
import org.elasticsearch.common.inject.ModulesBuilder; | ||
import org.elasticsearch.common.settings.ImmutableSettings; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.common.settings.SettingsModule; | ||
import org.elasticsearch.common.xcontent.XContentFactory; | ||
import org.elasticsearch.env.Environment; | ||
import org.elasticsearch.env.EnvironmentModule; | ||
import org.elasticsearch.index.Index; | ||
import org.elasticsearch.index.IndexNameModule; | ||
import org.elasticsearch.index.analysis.AnalysisModule; | ||
import org.elasticsearch.index.analysis.AnalysisService; | ||
import org.elasticsearch.index.settings.IndexSettingsModule; | ||
import org.elasticsearch.indices.analysis.IndicesAnalysisModule; | ||
import org.elasticsearch.indices.analysis.IndicesAnalysisService; | ||
import org.elasticsearch.test.ElasticsearchIntegrationTest; | ||
import org.junit.Test; | ||
|
||
import java.io.IOException; | ||
|
||
import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; | ||
import static org.hamcrest.Matchers.equalTo; | ||
|
||
/** | ||
|
@@ -108,6 +123,30 @@ public void analyzeWithNoIndex() throws Exception { | |
assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("this is a test")); | ||
} | ||
|
||
@Test | ||
public void analyzeWithCharFilter() throws Exception { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd love to see a test that uses more than one char filter :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So demanding! :) Added a chained token filter test as well. The permutations are endless. |
||
|
||
ImmutableSettings.Builder settings = settingsBuilder() | ||
.put("index.analysis.char_filter.my_mapping.type", "mapping") | ||
.putArray("index.analysis.char_filter.my_mapping.mappings", "ph=>f", "qu=>q") | ||
.put("index.analysis.analyzer.custom_with_char_filter.tokenizer", "standard") | ||
.putArray("index.analysis.analyzer.custom_with_char_filter.char_filter", "my_mapping"); | ||
|
||
prepareCreate("test", 1, settings).execute().actionGet(); | ||
client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).setWaitForGreenStatus().execute().actionGet(); | ||
|
||
AnalyzeResponse analyzeResponse = client().admin().indices().prepareAnalyze("test", "<h2><b>THIS</b> IS A</h2> <a href=\"#\">TEST</a>").setTokenizer("standard").setCharFilters("html_strip").execute().actionGet(); | ||
assertThat(analyzeResponse.getTokens().size(), equalTo(4)); | ||
|
||
analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A <b>TEST</b>").setTokenizer("keyword").setTokenFilters("lowercase").setCharFilters("html_strip").execute().actionGet(); | ||
assertThat(analyzeResponse.getTokens().size(), equalTo(1)); | ||
assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("this is a test")); | ||
|
||
analyzeResponse = client().admin().indices().prepareAnalyze("test", "jeff quit phish").setTokenizer("keyword").setTokenFilters("lowercase").setCharFilters("my_mapping").execute().actionGet(); | ||
assertThat(analyzeResponse.getTokens().size(), equalTo(1)); | ||
assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("jeff qit fish")); | ||
} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe we could also test the case where we have a custom char filter registered under a specific index? |
||
@Test | ||
public void analyzerWithFieldOrTypeTests() throws Exception { | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMO this should be initialized with
Strings.EMPTY_ARRAY
(also thetokenFitlers
) and then we should also check in thevalidate
method?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do you envision with the validate method? The only required parameter is the field. TransportAnalyzeAction works fine with all null values for the analyzer/tokenizer/filters.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah so I'd initialize the
String[]
withStrings.EMPTY_ARRAY
and then make sure that it is nevernull
but maybe that is an overkill. I just don't like that we use thenull
invariant use all over the place. Rather check fornull
and throw anElasticsearchIllegalArgument
on the setters. What do you think @brusicThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I like how the tests are encapsulated in the validate method. The last commit adds an explicit null check to the filter setters. However, perhaps worse than the use of nulls is inconsistent APIs. The single valued String fields such as analyzer and tokenizer continue to accept null values (since there is no clean non-null default like Strings.EMPTY_ARRAY), which can be set in the REST action. If we enforce non nulls consistently in the setters, we will end up with code such as:
if (request.param("analyzer") != null) analyzeRequest.analyzer(request.param("analyzer"));
Certainly doable, I have written that check thousands of times in my lifetime, but is it worthwhile for fields whose values that can be null? Six of one, half dozen of the other.