New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add multi-valued text support to the analyzer API #10847
Changes from 3 commits
55c0f5b
b471d19
91c7abd
598f94c
139bd34
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -587,6 +587,21 @@ public interface IndicesAdminClient extends ElasticsearchClient<IndicesAdminClie | |
*/ | ||
AnalyzeRequestBuilder prepareAnalyze(String text); | ||
|
||
/** | ||
* Analyze texts under the provided index. | ||
* | ||
* @param index The index name | ||
* @param text The array of text to analyze | ||
*/ | ||
AnalyzeRequestBuilder prepareAnalyzeWithIndexAndMultiValued(@Nullable String index, String... text); | ||
|
||
/** | ||
* Analyze texts. | ||
* | ||
* @param text The array of text to analyze | ||
*/ | ||
AnalyzeRequestBuilder prepareAnalyzeWithMultiValued(String... text); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, sorry, and I completely agree about the naming convention. Of course, it is hard to tell the difference between prepareAnalyze(String text), which is the old version, and prepareAnalyze(String index), which is the new one. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
that is an option too. For now I'd just expose a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now, AnalyzeRequestBuilder doesn't have methods related |
||
|
||
/** | ||
* Puts an index template. | ||
*/ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,11 +23,10 @@ | |
import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest; | ||
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse; | ||
import org.elasticsearch.client.Client; | ||
import org.elasticsearch.common.bytes.BytesArray; | ||
import org.elasticsearch.common.Strings; | ||
import org.elasticsearch.common.bytes.BytesReference; | ||
import org.elasticsearch.common.inject.Inject; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.common.xcontent.XContentFactory; | ||
import org.elasticsearch.common.xcontent.XContentHelper; | ||
import org.elasticsearch.common.xcontent.XContentParser; | ||
import org.elasticsearch.common.xcontent.XContentType; | ||
|
@@ -58,10 +57,10 @@ public RestAnalyzeAction(Settings settings, RestController controller, Client cl | |
@Override | ||
public void handleRequest(final RestRequest request, final RestChannel channel, final Client client) { | ||
|
||
String text = request.param("text"); | ||
String[] texts = request.paramAsStringArrayOrEmptyIfAll("text"); | ||
|
||
AnalyzeRequest analyzeRequest = new AnalyzeRequest(request.param("index")); | ||
analyzeRequest.text(text); | ||
analyzeRequest.text(texts); | ||
analyzeRequest.listenerThreaded(false); | ||
analyzeRequest.preferLocal(request.paramAsBoolean("prefer_local", analyzeRequest.preferLocalShard())); | ||
analyzeRequest.analyzer(request.param("analyzer")); | ||
|
@@ -73,9 +72,9 @@ public void handleRequest(final RestRequest request, final RestChannel channel, | |
if (RestActions.hasBodyContent(request)) { | ||
XContentType type = RestActions.guessBodyContentType(request); | ||
if (type == null) { | ||
if (text == null) { | ||
text = RestActions.getRestContent(request).toUtf8(); | ||
analyzeRequest.text(text); | ||
if (texts == null || texts.length == 0) { | ||
texts = new String[]{ RestActions.getRestContent(request).toUtf8() }; | ||
analyzeRequest.text(texts); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. question: we seem to support a single value only here. What is the difference between this case and the below case where we check the presence of arrays? It is just me not knowing the analyze API, I guess. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good question. The Analyze API has three ways to receive text.
This case is 2. Then, I think it's OK we don't support multi valued text for this case. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks for the explanation. case 2) is same as case 1) I guess. The only way we could make it work with multiple values would be by allowing for comma separated values, but then the comma couldn't be part of the text anymore. Let's leave it as-is. |
||
} | ||
} else { | ||
// NOTE: if rest request with xcontent body has request parameters, the parameters does not override xcontent values | ||
|
@@ -99,7 +98,16 @@ public static void buildFromContent(BytesReference content, AnalyzeRequest analy | |
} else if ("prefer_local".equals(currentFieldName) && token == XContentParser.Token.VALUE_BOOLEAN) { | ||
analyzeRequest.preferLocal(parser.booleanValue()); | ||
} else if ("text".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) { | ||
analyzeRequest.text(parser.text()); | ||
analyzeRequest.text(parser.text()); | ||
} else if ("text".equals(currentFieldName) && token == XContentParser.Token.START_ARRAY) { | ||
List<String> texts = Lists.newArrayList(); | ||
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) { | ||
if (token.isValue() == false) { | ||
throw new ElasticsearchIllegalArgumentException(currentFieldName + " array element should only contain text"); | ||
} | ||
texts.add(parser.text()); | ||
} | ||
analyzeRequest.text(texts.toArray(Strings.EMPTY_ARRAY)); | ||
} else if ("analyzer".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) { | ||
analyzeRequest.analyzer(parser.text()); | ||
} else if ("field".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) { | ||
|
@@ -114,7 +122,7 @@ public static void buildFromContent(BytesReference content, AnalyzeRequest analy | |
} | ||
filters.add(parser.text()); | ||
} | ||
analyzeRequest.tokenFilters(filters.toArray(new String[0])); | ||
analyzeRequest.tokenFilters(filters.toArray(Strings.EMPTY_ARRAY)); | ||
} else if ("char_filters".equals(currentFieldName) && token == XContentParser.Token.START_ARRAY) { | ||
List<String> charFilters = Lists.newArrayList(); | ||
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) { | ||
|
@@ -123,7 +131,7 @@ public static void buildFromContent(BytesReference content, AnalyzeRequest analy | |
} | ||
charFilters.add(parser.text()); | ||
} | ||
analyzeRequest.tokenFilters(charFilters.toArray(new String[0])); | ||
analyzeRequest.tokenFilters(charFilters.toArray(Strings.EMPTY_ARRAY)); | ||
} else { | ||
throw new ElasticsearchIllegalArgumentException("Unknown parameter [" + currentFieldName + "] in request body or parameter is of the wrong type[" + token + "] "); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -221,7 +221,8 @@ public void testParseXContentForAnalyzeReuqest() throws Exception { | |
|
||
RestAnalyzeAction.buildFromContent(content, analyzeRequest); | ||
|
||
assertThat(analyzeRequest.text(), equalTo("THIS IS A TEST")); | ||
assertThat(analyzeRequest.text().length, equalTo(1)); | ||
assertThat(analyzeRequest.text(), equalTo(new String[]{"THIS IS A TEST"})); | ||
assertThat(analyzeRequest.tokenizer(), equalTo("keyword")); | ||
assertThat(analyzeRequest.tokenFilters(), equalTo(new String[]{"lowercase"})); | ||
} | ||
|
@@ -240,7 +241,6 @@ public void testParseXContentForAnalyzeRequestWithInvalidJsonThrowsException() t | |
} | ||
} | ||
|
||
|
||
@Test | ||
public void testParseXContentForAnalyzeRequestWithUnknownParamThrowsException() throws Exception { | ||
AnalyzeRequest analyzeRequest = new AnalyzeRequest("for test"); | ||
|
@@ -259,4 +259,46 @@ public void testParseXContentForAnalyzeRequestWithUnknownParamThrowsException() | |
} | ||
} | ||
|
||
@Test | ||
public void analyzerWithMultiValues() throws Exception { | ||
|
||
assertAcked(prepareCreate("test").addAlias(new Alias("alias"))); | ||
ensureGreen(); | ||
|
||
client().admin().indices().preparePutMapping("test") | ||
.setType("document").setSource( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. .setSource("simple", "type=string,analyzer=simple,position_offset_gap=100") ? so we can remove json provided as a string? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, I didn't know that. I'll change it to the string form. |
||
"{\n" + | ||
" \"document\":{\n" + | ||
" \"properties\":{\n" + | ||
" \"simple\":{\n" + | ||
" \"type\":\"string\",\n" + | ||
" \"analyzer\": \"simple\",\n" + | ||
" \"position_offset_gap\": 100\n" + | ||
" }\n" + | ||
" }\n" + | ||
" }\n" + | ||
"}" | ||
).get(); | ||
|
||
String[] texts = new String[]{"THIS IS A TEST", "THE SECOND TEXT"}; | ||
|
||
final AnalyzeRequestBuilder requestBuilder = client().admin().indices().prepareAnalyzeWithMultiValued(texts); | ||
requestBuilder.setIndex(indexOrAlias()); | ||
requestBuilder.setField("simple"); | ||
AnalyzeResponse analyzeResponse = requestBuilder.get(); | ||
assertThat(analyzeResponse.getTokens().size(), equalTo(7)); | ||
AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(3); | ||
assertThat(token.getTerm(), equalTo("test")); | ||
assertThat(token.getPosition(), equalTo(3)); | ||
assertThat(token.getStartOffset(), equalTo(10)); | ||
assertThat(token.getEndOffset(), equalTo(14)); | ||
|
||
token = analyzeResponse.getTokens().get(5); | ||
assertThat(token.getTerm(), equalTo("second")); | ||
assertThat(token.getPosition(), equalTo(105)); | ||
assertThat(token.getStartOffset(), equalTo(19)); | ||
assertThat(token.getEndOffset(), equalTo(25)); | ||
|
||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this breaks the Java API :) We have to mark it as breaking then and push only to master (which was the case already, I think).