Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add multi-valued text support to the analyzer API #10847

Closed
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 13 additions & 0 deletions docs/reference/indices/analyze.asciidoc
Expand Up @@ -18,6 +18,19 @@ curl -XGET 'localhost:9200/_analyze' -d '

coming[2.0.0, body based parameters were added in 2.0.0]

If the `text` parameter is provided as an array of strings, it is analyzed as a multi-valued field.

[source,js]
--------------------------------------------------
curl -XGET 'localhost:9200/_analyze' -d '
{
"analyzer" : "standard",
"text" : ["this is a test", "the second text"]
}'
--------------------------------------------------

coming[2.0.0, body based parameters were added in 2.0.0]

Or by building a custom transient analyzer out of tokenizers,
token filters and char filters. Token filters can use the shorter 'filters'
parameter name:
Expand Down
2 changes: 1 addition & 1 deletion rest-api-spec/api/indices.analyze.json
Expand Up @@ -41,7 +41,7 @@
"description" : "With `true`, specify that a local shard should be used if available, with `false`, use a random shard (default: true)"
},
"text": {
"type" : "string",
"type" : "list",
"description" : "The text on which the analysis should be performed (when request body is not used)"
},
"tokenizer": {
Expand Down
8 changes: 8 additions & 0 deletions rest-api-spec/test/indices.analyze/10_analyze.yaml
Expand Up @@ -63,3 +63,11 @@ setup:
body: { "text": "Bar Foo", "filters": ["lowercase"], "tokenizer": keyword }
- length: {tokens: 1 }
- match: { tokens.0.token: bar foo }
---
"Array text":
- do:
indices.analyze:
body: { "text": ["Foo Bar", "Baz"], "filters": ["lowercase"], "tokenizer": keyword }
- length: {tokens: 2 }
- match: { tokens.0.token: foo bar }
- match: { tokens.1.token: baz }
Expand Up @@ -36,7 +36,7 @@
*/
public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest> {

private String text;
private String[] text;

private String analyzer;

Expand All @@ -61,11 +61,11 @@ public AnalyzeRequest(String index) {
this.index(index);
}

public String text() {
public String[] text() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this breaks the java api :) we have to mark as breaking then and push only to master (which was the case already I think)

return this.text;
}

public AnalyzeRequest text(String text) {
public AnalyzeRequest text(String... text) {
this.text = text;
return this;
}
Expand Down Expand Up @@ -118,7 +118,7 @@ public String field() {
@Override
public ActionRequestValidationException validate() {
ActionRequestValidationException validationException = super.validate();
if (text == null) {
if (text == null || text.length == 0) {
validationException = addValidationError("text is missing", validationException);
}
if (tokenFilters == null) {
Expand All @@ -133,7 +133,7 @@ public ActionRequestValidationException validate() {
@Override
public void readFrom(StreamInput in) throws IOException {
super.readFrom(in);
text = in.readString();
text = in.readStringArray();
analyzer = in.readOptionalString();
tokenizer = in.readOptionalString();
tokenFilters = in.readStringArray();
Expand All @@ -144,7 +144,7 @@ public void readFrom(StreamInput in) throws IOException {
@Override
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
out.writeString(text);
out.writeStringArray(text);
out.writeOptionalString(analyzer);
out.writeOptionalString(tokenizer);
out.writeStringArray(tokenFilters);
Expand Down
Expand Up @@ -31,7 +31,7 @@ public AnalyzeRequestBuilder(IndicesAdminClient indicesClient) {
super(indicesClient, new AnalyzeRequest());
}

public AnalyzeRequestBuilder(IndicesAdminClient indicesClient, String index, String text) {
/**
 * Creates a builder for analyzing one or more texts.
 *
 * @param indicesClient the indices admin client the request is executed through
 * @param index the index whose analysis chain should be used (passed to {@link AnalyzeRequest})
 * @param text one or more texts to analyze; multiple values are treated as a multi-valued field
 */
public AnalyzeRequestBuilder(IndicesAdminClient indicesClient, String index, String... text) {
    super(indicesClient, new AnalyzeRequest(index).text(text));
}

Expand Down
Expand Up @@ -25,10 +25,10 @@
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.support.ActionFilters;
import org.elasticsearch.action.support.single.custom.TransportSingleCustomOperationAction;
import org.elasticsearch.cluster.ClusterService;
Expand Down Expand Up @@ -212,36 +212,41 @@ protected AnalyzeResponse shardOperation(AnalyzeRequest request, ShardId shardId

List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList();
TokenStream stream = null;
try {
stream = analyzer.tokenStream(field, request.text());
stream.reset();
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
TypeAttribute type = stream.addAttribute(TypeAttribute.class);

int position = -1;
while (stream.incrementToken()) {
int increment = posIncr.getPositionIncrement();
if (increment > 0) {
position = position + increment;
}
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(), offset.endOffset(), type.type()));
}
stream.end();
} catch (IOException e) {
throw new ElasticsearchException("failed to analyze", e);
} finally {
if (stream != null) {
try {
stream.close();
} catch (IOException e) {
// ignore
int lastPosition = -1;
int lastOffset = 0;
for (String text : request.text()) {
try {
stream = analyzer.tokenStream(field, text);
stream.reset();
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
TypeAttribute type = stream.addAttribute(TypeAttribute.class);

while (stream.incrementToken()) {
int increment = posIncr.getPositionIncrement();
if (increment > 0) {
lastPosition = lastPosition + increment;
}
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type()));

}
stream.end();
lastOffset += offset.endOffset();
lastPosition += posIncr.getPositionIncrement();

lastPosition += analyzer.getPositionIncrementGap(field);
lastOffset += analyzer.getOffsetGap(field);

} catch (IOException e) {
throw new ElasticsearchException("failed to analyze", e);
} finally {
IOUtils.closeWhileHandlingException(stream);
}
if (closeAnalyzer) {
analyzer.close();
}
}

if (closeAnalyzer) {
analyzer.close();
}

return new AnalyzeResponse(tokens);
Expand Down
15 changes: 15 additions & 0 deletions src/main/java/org/elasticsearch/client/IndicesAdminClient.java
Expand Up @@ -587,6 +587,21 @@ public interface IndicesAdminClient extends ElasticsearchClient<IndicesAdminClie
*/
AnalyzeRequestBuilder prepareAnalyze(String text);

/**
* Analyze texts under the provided index.
*
* @param index The index name
* @param text The array of text to analyze
*/
AnalyzeRequestBuilder prepareAnalyzeWithIndexAndMultiValued(@Nullable String index, String... text);

/**
* Analyze texts.
*
* @param text The array of text to analyze
*/
AnalyzeRequestBuilder prepareAnalyzeWithMultiValued(String... text);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these two methods go a bit against the naming convention that we have here for prepare* methods. I really think we should go for one of the options I mentioned here. What do you think @dakrone ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, sorry, and I completely agree about the naming convention.
Other question.
If we can change I/F about prepareAnalyze, what do you think about dropping all text parameters for I/F simplification and adding the setter of text/texts to AnalyzeRequestBuilder?

Of course, it would be hard to tell the difference between prepareAnalyze(String text), which is the old version, and prepareAnalyze(String index), which is the new one.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what do you think about dropping all text parameters for I/F simplification and adding the setter of text/texts to AnalyzeRequestBuilder?

that is an option too. For now I'd just expose a prepareAnalyze() then with no parameters, leave the existing ones for bw comp, and add the needed setters where needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now, AnalyzeRequestBuilder doesn't have methods related text parameters.
If we have prepareAnalyze(), we should add setText(String)/setText(String...) to AnalyzeRequestBuilder.java


/**
* Puts an index template.
*/
Expand Down
Expand Up @@ -480,6 +480,16 @@ public AnalyzeRequestBuilder prepareAnalyze(String text) {
return new AnalyzeRequestBuilder(this, null, text);
}

@Override
public AnalyzeRequestBuilder prepareAnalyzeWithIndexAndMultiValued(@Nullable String index, String... text) {
    // Delegate to the builder constructor that accepts an (optionally null) index and varargs texts.
    final AnalyzeRequestBuilder builder = new AnalyzeRequestBuilder(this, index, text);
    return builder;
}

@Override
public AnalyzeRequestBuilder prepareAnalyzeWithMultiValued(String... text) {
    // No index targeted: pass null so the request is not bound to a specific index.
    final String noIndex = null;
    return new AnalyzeRequestBuilder(this, noIndex, text);
}

@Override
public ActionFuture<PutIndexTemplateResponse> putTemplate(final PutIndexTemplateRequest request) {
return execute(PutIndexTemplateAction.INSTANCE, request);
Expand Down
Expand Up @@ -23,11 +23,10 @@
import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentHelper;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentType;
Expand Down Expand Up @@ -58,10 +57,10 @@ public RestAnalyzeAction(Settings settings, RestController controller, Client cl
@Override
public void handleRequest(final RestRequest request, final RestChannel channel, final Client client) {

String text = request.param("text");
String[] texts = request.paramAsStringArrayOrEmptyIfAll("text");

AnalyzeRequest analyzeRequest = new AnalyzeRequest(request.param("index"));
analyzeRequest.text(text);
analyzeRequest.text(texts);
analyzeRequest.listenerThreaded(false);
analyzeRequest.preferLocal(request.paramAsBoolean("prefer_local", analyzeRequest.preferLocalShard()));
analyzeRequest.analyzer(request.param("analyzer"));
Expand All @@ -73,9 +72,9 @@ public void handleRequest(final RestRequest request, final RestChannel channel,
if (RestActions.hasBodyContent(request)) {
XContentType type = RestActions.guessBodyContentType(request);
if (type == null) {
if (text == null) {
text = RestActions.getRestContent(request).toUtf8();
analyzeRequest.text(text);
if (texts == null || texts.length == 0) {
texts = new String[]{ RestActions.getRestContent(request).toUtf8() };
analyzeRequest.text(texts);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: we seem to support a single value only here. What is the difference between this case and the below case where we check the presence of arrays? It is just me not knowing tha analyze api I guess.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question.

Analyze API has three way to receive text.

  1. request parameter
  2. request body, raw text
  3. request body, JSON parameter

This case is 2.
In this case, I think it is difficult to decide which character we should treat as a delimiter.
And we leave this case for backward compatibility.

Then, I think it's OK that we don't support multi-valued text for this case.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for the explanation. Case 2) is the same as case 1), I guess. The only way we could make it work with multiple values would be by allowing comma-separated values, but then the comma couldn't be part of the text anymore. Let's leave it as-is.

}
} else {
// NOTE: if rest request with xcontent body has request parameters, the parameters does not override xcontent values
Expand All @@ -99,7 +98,16 @@ public static void buildFromContent(BytesReference content, AnalyzeRequest analy
} else if ("prefer_local".equals(currentFieldName) && token == XContentParser.Token.VALUE_BOOLEAN) {
analyzeRequest.preferLocal(parser.booleanValue());
} else if ("text".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
analyzeRequest.text(parser.text());
analyzeRequest.text(parser.text());
} else if ("text".equals(currentFieldName) && token == XContentParser.Token.START_ARRAY) {
List<String> texts = Lists.newArrayList();
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
if (token.isValue() == false) {
throw new ElasticsearchIllegalArgumentException(currentFieldName + " array element should only contain text");
}
texts.add(parser.text());
}
analyzeRequest.text(texts.toArray(Strings.EMPTY_ARRAY));
} else if ("analyzer".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
analyzeRequest.analyzer(parser.text());
} else if ("field".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
Expand All @@ -114,7 +122,7 @@ public static void buildFromContent(BytesReference content, AnalyzeRequest analy
}
filters.add(parser.text());
}
analyzeRequest.tokenFilters(filters.toArray(new String[0]));
analyzeRequest.tokenFilters(filters.toArray(Strings.EMPTY_ARRAY));
} else if ("char_filters".equals(currentFieldName) && token == XContentParser.Token.START_ARRAY) {
List<String> charFilters = Lists.newArrayList();
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
Expand All @@ -123,7 +131,7 @@ public static void buildFromContent(BytesReference content, AnalyzeRequest analy
}
charFilters.add(parser.text());
}
analyzeRequest.tokenFilters(charFilters.toArray(new String[0]));
analyzeRequest.tokenFilters(charFilters.toArray(Strings.EMPTY_ARRAY));
} else {
throw new ElasticsearchIllegalArgumentException("Unknown parameter [" + currentFieldName + "] in request body or parameter is of the wrong type[" + token + "] ");
}
Expand Down
Expand Up @@ -221,7 +221,8 @@ public void testParseXContentForAnalyzeReuqest() throws Exception {

RestAnalyzeAction.buildFromContent(content, analyzeRequest);

assertThat(analyzeRequest.text(), equalTo("THIS IS A TEST"));
assertThat(analyzeRequest.text().length, equalTo(1));
assertThat(analyzeRequest.text(), equalTo(new String[]{"THIS IS A TEST"}));
assertThat(analyzeRequest.tokenizer(), equalTo("keyword"));
assertThat(analyzeRequest.tokenFilters(), equalTo(new String[]{"lowercase"}));
}
Expand All @@ -240,7 +241,6 @@ public void testParseXContentForAnalyzeRequestWithInvalidJsonThrowsException() t
}
}


@Test
public void testParseXContentForAnalyzeRequestWithUnknownParamThrowsException() throws Exception {
AnalyzeRequest analyzeRequest = new AnalyzeRequest("for test");
Expand All @@ -259,4 +259,46 @@ public void testParseXContentForAnalyzeRequestWithUnknownParamThrowsException()
}
}

@Test
public void analyzerWithMultiValues() throws Exception {

assertAcked(prepareCreate("test").addAlias(new Alias("alias")));
ensureGreen();

client().admin().indices().preparePutMapping("test")
.setType("document").setSource(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.setSource("simple", "type=string,analyzer=simple,position_offset_gap=100") ? so we can remove json provided as a string?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I didn't know that. I will change it to a string.

"{\n" +
" \"document\":{\n" +
" \"properties\":{\n" +
" \"simple\":{\n" +
" \"type\":\"string\",\n" +
" \"analyzer\": \"simple\",\n" +
" \"position_offset_gap\": 100\n" +
" }\n" +
" }\n" +
" }\n" +
"}"
).get();

String[] texts = new String[]{"THIS IS A TEST", "THE SECOND TEXT"};

final AnalyzeRequestBuilder requestBuilder = client().admin().indices().prepareAnalyzeWithMultiValued(texts);
requestBuilder.setIndex(indexOrAlias());
requestBuilder.setField("simple");
AnalyzeResponse analyzeResponse = requestBuilder.get();
assertThat(analyzeResponse.getTokens().size(), equalTo(7));
AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(3);
assertThat(token.getTerm(), equalTo("test"));
assertThat(token.getPosition(), equalTo(3));
assertThat(token.getStartOffset(), equalTo(10));
assertThat(token.getEndOffset(), equalTo(14));

token = analyzeResponse.getTokens().get(5);
assertThat(token.getTerm(), equalTo("second"));
assertThat(token.getPosition(), equalTo(105));
assertThat(token.getStartOffset(), equalTo(19));
assertThat(token.getEndOffset(), equalTo(25));

}

}