Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add multi-valued text support to the analyzer API #10847

Closed
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 13 additions & 0 deletions docs/reference/indices/analyze.asciidoc
Expand Up @@ -18,6 +18,19 @@ curl -XGET 'localhost:9200/_analyze' -d '

coming[2.0.0, body based parameters were added in 2.0.0]

If the text parameter is provided as an array of strings, it is analyzed as a multi-valued field.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would change this:

If text parameter is provided as array of string, it analyze as a multi-valued.

to If text parameter is provided as array of strings, it is analyzed as a multi-valued field.


[source,js]
--------------------------------------------------
curl -XGET 'localhost:9200/_analyze' -d '
{
"analyzer" : "standard",
"text" : ["this is a test", "the second text"]
}'
--------------------------------------------------

coming[2.0.0, body based parameters were added in 2.0.0]

Or by building a custom transient analyzer out of tokenizers,
token filters and char filters. Token filters can use the shorter 'filters'
parameter name:
Expand Down
2 changes: 1 addition & 1 deletion rest-api-spec/api/indices.analyze.json
Expand Up @@ -41,7 +41,7 @@
"description" : "With `true`, specify that a local shard should be used if available, with `false`, use a random shard (default: true)"
},
"text": {
"type" : "string",
"type" : "list",
"description" : "The text on which the analysis should be performed (when request body is not used)"
},
"tokenizer": {
Expand Down
8 changes: 8 additions & 0 deletions rest-api-spec/test/indices.analyze/10_analyze.yaml
Expand Up @@ -63,3 +63,11 @@ setup:
body: { "text": "Bar Foo", "filters": ["lowercase"], "tokenizer": keyword }
- length: {tokens: 1 }
- match: { tokens.0.token: bar foo }
---
"Array text":
- do:
indices.analyze:
body: { "text": ["Foo Bar", "Baz"], "filters": ["lowercase"], "tokenizer": keyword }
- length: {tokens: 2 }
- match: { tokens.0.token: foo bar }
- match: { tokens.1.token: baz }
Expand Up @@ -36,7 +36,7 @@
*/
public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest> {

private String text;
private String[] text;

private String analyzer;

Expand All @@ -61,11 +61,11 @@ public AnalyzeRequest(String index) {
this.index(index);
}

public String text() {
public String[] text() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this breaks the java api :) we have to mark as breaking then and push only to master (which was the case already I think)

return this.text;
}

public AnalyzeRequest text(String text) {
/**
 * Sets the text value(s) to analyze. Multiple values are analyzed
 * as a multi-valued field.
 *
 * @param text one or more text values to analyze
 * @return this request, for method chaining
 */
public AnalyzeRequest text(String... text) {
    this.text = text;
    return this;
}
Expand Down Expand Up @@ -118,7 +118,7 @@ public String field() {
@Override
public ActionRequestValidationException validate() {
ActionRequestValidationException validationException = super.validate();
if (text == null) {
if (text == null || text.length == 0) {
validationException = addValidationError("text is missing", validationException);
}
if (tokenFilters == null) {
Expand All @@ -133,7 +133,7 @@ public ActionRequestValidationException validate() {
@Override
public void readFrom(StreamInput in) throws IOException {
super.readFrom(in);
text = in.readString();
text = in.readStringArray();
analyzer = in.readOptionalString();
tokenizer = in.readOptionalString();
tokenFilters = in.readStringArray();
Expand All @@ -144,7 +144,7 @@ public void readFrom(StreamInput in) throws IOException {
@Override
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
out.writeString(text);
out.writeStringArray(text);
out.writeOptionalString(analyzer);
out.writeOptionalString(tokenizer);
out.writeStringArray(tokenFilters);
Expand Down
Expand Up @@ -31,7 +31,7 @@ public AnalyzeRequestBuilder(IndicesAdminClient indicesClient) {
super(indicesClient, new AnalyzeRequest());
}

public AnalyzeRequestBuilder(IndicesAdminClient indicesClient, String index, String text) {
public AnalyzeRequestBuilder(IndicesAdminClient indicesClient, String index, String... text) {
super(indicesClient, new AnalyzeRequest(index).text(text));
}

Expand Down
Expand Up @@ -28,7 +28,6 @@
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.support.ActionFilters;
import org.elasticsearch.action.support.single.custom.TransportSingleCustomOperationAction;
import org.elasticsearch.cluster.ClusterService;
Expand Down Expand Up @@ -212,36 +211,47 @@ protected AnalyzeResponse shardOperation(AnalyzeRequest request, ShardId shardId

List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList();
TokenStream stream = null;
try {
stream = analyzer.tokenStream(field, request.text());
stream.reset();
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
TypeAttribute type = stream.addAttribute(TypeAttribute.class);

int position = -1;
while (stream.incrementToken()) {
int increment = posIncr.getPositionIncrement();
if (increment > 0) {
position = position + increment;
int lastPosition = -1;
int lastOffset = 0;
for (String text : request.text()) {
try {
stream = analyzer.tokenStream(field, text);
stream.reset();
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
TypeAttribute type = stream.addAttribute(TypeAttribute.class);

while (stream.incrementToken()) {
int increment = posIncr.getPositionIncrement();
if (increment > 0) {
lastPosition = lastPosition + increment;
}
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type()));

}
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(), offset.endOffset(), type.type()));
}
stream.end();
} catch (IOException e) {
throw new ElasticsearchException("failed to analyze", e);
} finally {
if (stream != null) {
try {
stream.close();
} catch (IOException e) {
// ignore
stream.end();
lastOffset += offset.endOffset();
lastPosition += posIncr.getPositionIncrement();

lastPosition += analyzer.getPositionIncrementGap(field);
lastOffset += analyzer.getOffsetGap(field);

} catch (IOException e) {
throw new ElasticsearchException("failed to analyze", e);
} finally {
if (stream != null) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should be able to replace this with IOUtils.closeWhileHandlingException(stream); and remove the null check and try/catch

try {
stream.close();
} catch (IOException e) {
// ignore
}
}
}
if (closeAnalyzer) {
analyzer.close();
}
}

if (closeAnalyzer) {
analyzer.close();
}

return new AnalyzeResponse(tokens);
Expand Down
15 changes: 15 additions & 0 deletions src/main/java/org/elasticsearch/client/IndicesAdminClient.java
Expand Up @@ -580,6 +580,21 @@ public interface IndicesAdminClient extends ElasticsearchClient<IndicesAdminClie
*/
AnalyzeRequestBuilder prepareAnalyze(@Nullable String index, String text);

/**
* Analyze texts under the provided index.
*
* @param index The index name
* @param text The array of text to analyze
*/
AnalyzeRequestBuilder prepareAnalyze(@Nullable String index, String... text);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am wondering if using varargs here makes sense, I think it makes these different methods ambigous?
what's the difference between client.prepareAnalyze("index", "text1"); and client.prepareAnalyze("text1", "text2"); and client.prepareAnalyze("text1");?

Maybe we should got for new methods that accept an array instead?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point

I think we should drop the prepareAnalyze(String... text) arity, and force specifying the index if you want to use the varargs analysis. What do you think?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have another option.
Drop the prepareAnalyze() with varargs that this PR added.
And add addText(String...) to AnalyzeRequestBuilder.java

Hmm, is it hard to understand?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@johtani but in that case users would have to specify the first value via prepareAnalyze and the following ones using addText ?

Along same lines as Lee's comment, how about having a single AnalyzeRequestBuilder prepareAnalyze(@Nullable String index, String... text); and drop all the others? It would be a breaking change though for the java api. Otherwise we should have the following without varargs which doesnt look great:

AnalyzeRequestBuilder prepareAnalyze(@Nullable String index, String text);
AnalyzeRequestBuilder prepareAnalyze(@Nullable String index, String[] text);
AnalyzeRequestBuilder prepareAnalyze(String text);
AnalyzeRequestBuilder prepareAnalyze(String[] text);

I think compromises between the above two solutions won't work well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@javanna Yeah, my option is not good...

I think Lee's comment is the following:

    AnalyzeRequestBuilder prepareAnalyze(@Nullable String index, String text);
    AnalyzeRequestBuilder prepareAnalyze(@Nullable String index, String... text);
    AnalyzeRequestBuilder prepareAnalyze(String text);

or

    AnalyzeRequestBuilder prepareAnalyze(@Nullable String index, String... text);
    AnalyzeRequestBuilder prepareAnalyze(String text);

however, I think it is easy to understand that we have the other name of method, like
prepareAnalyze and prepareAnalyzeForMultiValued


/**
* Analyze texts.
*
* @param text The array of text to analyze
*/
AnalyzeRequestBuilder prepareAnalyze(String... text);

/**
* Analyze text.
*
Expand Down
Expand Up @@ -470,6 +470,11 @@ public void analyze(final AnalyzeRequest request, final ActionListener<AnalyzeRe
execute(AnalyzeAction.INSTANCE, request, listener);
}

/**
 * Builds an analyze request for one or more text values against the given index.
 * Multiple values are analyzed as a multi-valued field.
 */
@Override
public AnalyzeRequestBuilder prepareAnalyze(@Nullable String index, String... text) {
    return new AnalyzeRequestBuilder(this, index, text);
}

@Override
public AnalyzeRequestBuilder prepareAnalyze(@Nullable String index, String text) {
return new AnalyzeRequestBuilder(this, index, text);
Expand All @@ -480,6 +485,11 @@ public AnalyzeRequestBuilder prepareAnalyze(String text) {
return new AnalyzeRequestBuilder(this, null, text);
}

@Override
public AnalyzeRequestBuilder prepareAnalyze(String... text) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same comment as above, around using varargs

return new AnalyzeRequestBuilder(this, null, text);
}

@Override
public ActionFuture<PutIndexTemplateResponse> putTemplate(final PutIndexTemplateRequest request) {
return execute(PutIndexTemplateAction.INSTANCE, request);
Expand Down
Expand Up @@ -58,10 +58,10 @@ public RestAnalyzeAction(Settings settings, RestController controller, Client cl
@Override
public void handleRequest(final RestRequest request, final RestChannel channel, final Client client) {

String text = request.param("text");
String[] texts = request.paramAsStringArrayOrEmptyIfAll("text");

AnalyzeRequest analyzeRequest = new AnalyzeRequest(request.param("index"));
analyzeRequest.text(text);
analyzeRequest.text(texts);
analyzeRequest.listenerThreaded(false);
analyzeRequest.preferLocal(request.paramAsBoolean("prefer_local", analyzeRequest.preferLocalShard()));
analyzeRequest.analyzer(request.param("analyzer"));
Expand All @@ -73,9 +73,9 @@ public void handleRequest(final RestRequest request, final RestChannel channel,
if (RestActions.hasBodyContent(request)) {
XContentType type = RestActions.guessBodyContentType(request);
if (type == null) {
if (text == null) {
text = RestActions.getRestContent(request).toUtf8();
analyzeRequest.text(text);
if (texts == null || texts.length == 0) {
texts = new String[]{ RestActions.getRestContent(request).toUtf8() };
analyzeRequest.text(texts);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: we seem to support a single value only here. What is the difference between this case and the below case where we check the presence of arrays? It is just me not knowing the analyze api, I guess.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question.

The Analyze API has three ways to receive text.

  1. request parameter
  2. request body, raw text
  3. request body, JSON parameter

This case is 2.
In this case, I think it is difficult to decide which character we should treat as a delimiter.
And we keep this case for backward compatibility.

So I think it's OK that we don't support multi-valued text for this case.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for the explanation. Case 2) is the same as case 1), I guess. The only way we could make it work with multiple values would be by allowing comma-separated values, but then the comma couldn't be part of the text anymore. Let's leave it as-is.

}
} else {
// NOTE: if rest request with xcontent body has request parameters, the parameters does not override xcontent values
Expand All @@ -99,7 +99,16 @@ public static void buildFromContent(BytesReference content, AnalyzeRequest analy
} else if ("prefer_local".equals(currentFieldName) && token == XContentParser.Token.VALUE_BOOLEAN) {
analyzeRequest.preferLocal(parser.booleanValue());
} else if ("text".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
analyzeRequest.text(parser.text());
analyzeRequest.text(parser.text());
} else if ("text".equals(currentFieldName) && token == XContentParser.Token.START_ARRAY) {
List<String> texts = Lists.newArrayList();
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
if (token.isValue() == false) {
throw new ElasticsearchIllegalArgumentException(currentFieldName + " array element should only contain text");
}
texts.add(parser.text());
}
analyzeRequest.text(texts.toArray(new String[0]));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use Strings.EMPTY_ARRAY here

} else if ("analyzer".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
analyzeRequest.analyzer(parser.text());
} else if ("field".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
Expand Down
Expand Up @@ -221,7 +221,8 @@ public void testParseXContentForAnalyzeReuqest() throws Exception {

RestAnalyzeAction.buildFromContent(content, analyzeRequest);

assertThat(analyzeRequest.text(), equalTo("THIS IS A TEST"));
assertThat(analyzeRequest.text().length, equalTo(1));
assertThat(analyzeRequest.text(), equalTo(new String[]{"THIS IS A TEST"}));
assertThat(analyzeRequest.tokenizer(), equalTo("keyword"));
assertThat(analyzeRequest.tokenFilters(), equalTo(new String[]{"lowercase"}));
}
Expand All @@ -240,7 +241,6 @@ public void testParseXContentForAnalyzeRequestWithInvalidJsonThrowsException() t
}
}


@Test
public void testParseXContentForAnalyzeRequestWithUnknownParamThrowsException() throws Exception {
AnalyzeRequest analyzeRequest = new AnalyzeRequest("for test");
Expand All @@ -259,4 +259,46 @@ public void testParseXContentForAnalyzeRequestWithUnknownParamThrowsException()
}
}

@Test
public void analyzerWithMultiValues() throws Exception {

assertAcked(prepareCreate("test").addAlias(new Alias("alias")));
ensureGreen();

client().admin().indices().preparePutMapping("test")
.setType("document").setSource(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.setSource("simple", "type=string,analyzer=simple,position_offset_gap=100") ? so we can remove json provided as a string?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I didn't know that. I'll change it to a string.

"{\n" +
" \"document\":{\n" +
" \"properties\":{\n" +
" \"simple\":{\n" +
" \"type\":\"string\",\n" +
" \"analyzer\": \"simple\",\n" +
" \"position_offset_gap\": 100\n" +
" }\n" +
" }\n" +
" }\n" +
"}"
).get();

String[] texts = new String[]{"THIS IS A TEST", "THE SECOND TEXT"};

final AnalyzeRequestBuilder requestBuilder = client().admin().indices().prepareAnalyze(texts);
requestBuilder.setIndex(indexOrAlias());
requestBuilder.setField("simple");
AnalyzeResponse analyzeResponse = requestBuilder.get();
assertThat(analyzeResponse.getTokens().size(), equalTo(7));
AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(3);
assertThat(token.getTerm(), equalTo("test"));
assertThat(token.getPosition(), equalTo(3));
assertThat(token.getStartOffset(), equalTo(10));
assertThat(token.getEndOffset(), equalTo(14));

token = analyzeResponse.getTokens().get(5);
assertThat(token.getTerm(), equalTo("second"));
assertThat(token.getPosition(), equalTo(105));
assertThat(token.getStartOffset(), equalTo(19));
assertThat(token.getEndOffset(), equalTo(25));

}

}