New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add multi-valued text support to the analyzer API #10847
Changes from 1 commit
55c0f5b
b471d19
91c7abd
598f94c
139bd34
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,7 +36,7 @@ | |
*/ | ||
public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest> { | ||
|
||
private String text; | ||
private String[] text; | ||
|
||
private String analyzer; | ||
|
||
|
@@ -61,11 +61,11 @@ public AnalyzeRequest(String index) { | |
this.index(index); | ||
} | ||
|
||
public String text() { | ||
public String[] text() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this breaks the java api :) we have to mark as breaking then and push only to master (which was the case already I think) |
||
return this.text; | ||
} | ||
|
||
public AnalyzeRequest text(String text) { | ||
public AnalyzeRequest text(String... text) { | ||
this.text = text; | ||
return this; | ||
} | ||
|
@@ -118,7 +118,7 @@ public String field() { | |
@Override | ||
public ActionRequestValidationException validate() { | ||
ActionRequestValidationException validationException = super.validate(); | ||
if (text == null) { | ||
if (text == null || text.length == 0) { | ||
validationException = addValidationError("text is missing", validationException); | ||
} | ||
if (tokenFilters == null) { | ||
|
@@ -133,7 +133,7 @@ public ActionRequestValidationException validate() { | |
@Override | ||
public void readFrom(StreamInput in) throws IOException { | ||
super.readFrom(in); | ||
text = in.readString(); | ||
text = in.readStringArray(); | ||
analyzer = in.readOptionalString(); | ||
tokenizer = in.readOptionalString(); | ||
tokenFilters = in.readStringArray(); | ||
|
@@ -144,7 +144,7 @@ public void readFrom(StreamInput in) throws IOException { | |
@Override | ||
public void writeTo(StreamOutput out) throws IOException { | ||
super.writeTo(out); | ||
out.writeString(text); | ||
out.writeStringArray(text); | ||
out.writeOptionalString(analyzer); | ||
out.writeOptionalString(tokenizer); | ||
out.writeStringArray(tokenFilters); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,7 +28,6 @@ | |
import org.elasticsearch.ElasticsearchException; | ||
import org.elasticsearch.ElasticsearchIllegalArgumentException; | ||
import org.elasticsearch.Version; | ||
import org.elasticsearch.action.ActionListener; | ||
import org.elasticsearch.action.support.ActionFilters; | ||
import org.elasticsearch.action.support.single.custom.TransportSingleCustomOperationAction; | ||
import org.elasticsearch.cluster.ClusterService; | ||
|
@@ -212,36 +211,47 @@ protected AnalyzeResponse shardOperation(AnalyzeRequest request, ShardId shardId | |
|
||
List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList(); | ||
TokenStream stream = null; | ||
try { | ||
stream = analyzer.tokenStream(field, request.text()); | ||
stream.reset(); | ||
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); | ||
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class); | ||
OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class); | ||
TypeAttribute type = stream.addAttribute(TypeAttribute.class); | ||
|
||
int position = -1; | ||
while (stream.incrementToken()) { | ||
int increment = posIncr.getPositionIncrement(); | ||
if (increment > 0) { | ||
position = position + increment; | ||
int lastPosition = -1; | ||
int lastOffset = 0; | ||
for (String text : request.text()) { | ||
try { | ||
stream = analyzer.tokenStream(field, text); | ||
stream.reset(); | ||
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); | ||
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class); | ||
OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class); | ||
TypeAttribute type = stream.addAttribute(TypeAttribute.class); | ||
|
||
while (stream.incrementToken()) { | ||
int increment = posIncr.getPositionIncrement(); | ||
if (increment > 0) { | ||
lastPosition = lastPosition + increment; | ||
} | ||
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type())); | ||
|
||
} | ||
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(), offset.endOffset(), type.type())); | ||
} | ||
stream.end(); | ||
} catch (IOException e) { | ||
throw new ElasticsearchException("failed to analyze", e); | ||
} finally { | ||
if (stream != null) { | ||
try { | ||
stream.close(); | ||
} catch (IOException e) { | ||
// ignore | ||
stream.end(); | ||
lastOffset += offset.endOffset(); | ||
lastPosition += posIncr.getPositionIncrement(); | ||
|
||
lastPosition += analyzer.getPositionIncrementGap(field); | ||
lastOffset += analyzer.getOffsetGap(field); | ||
|
||
} catch (IOException e) { | ||
throw new ElasticsearchException("failed to analyze", e); | ||
} finally { | ||
if (stream != null) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You should be able to replace this with |
||
try { | ||
stream.close(); | ||
} catch (IOException e) { | ||
// ignore | ||
} | ||
} | ||
} | ||
if (closeAnalyzer) { | ||
analyzer.close(); | ||
} | ||
} | ||
|
||
if (closeAnalyzer) { | ||
analyzer.close(); | ||
} | ||
|
||
return new AnalyzeResponse(tokens); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -580,6 +580,21 @@ public interface IndicesAdminClient extends ElasticsearchClient<IndicesAdminClie | |
*/ | ||
AnalyzeRequestBuilder prepareAnalyze(@Nullable String index, String text); | ||
|
||
/** | ||
* Analyze texts under the provided index. | ||
* | ||
* @param index The index name | ||
* @param text The array of text to analyze | ||
*/ | ||
AnalyzeRequestBuilder prepareAnalyze(@Nullable String index, String... text); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am wondering if using varargs here makes sense, I think it makes these different methods ambigous? Maybe we should got for new methods that accept an array instead? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point I think we should drop the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have another option. Hmm, is it hard to understand? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @johtani but in that case users would have to specify the first value via prepareAnalyze and the following ones using addText ? Along same lines as Lee's comment, how about having a single
I think compromises between the above two solutions won't work well. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @javanna Yeah, my option is not good... I think Lee's comment is the following:
or
however, I think it is easy to understand that we have the other name of method, like |
||
|
||
/** | ||
* Analyze texts. | ||
* | ||
* @param text The array of text to analyze | ||
*/ | ||
AnalyzeRequestBuilder prepareAnalyze(String... text); | ||
|
||
/** | ||
* Analyze text. | ||
* | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -470,6 +470,11 @@ public void analyze(final AnalyzeRequest request, final ActionListener<AnalyzeRe | |
execute(AnalyzeAction.INSTANCE, request, listener); | ||
} | ||
|
||
@Override | ||
public AnalyzeRequestBuilder prepareAnalyze(@Nullable String index, String... text) { | ||
return new AnalyzeRequestBuilder(this, index, text); | ||
} | ||
|
||
@Override | ||
public AnalyzeRequestBuilder prepareAnalyze(@Nullable String index, String text) { | ||
return new AnalyzeRequestBuilder(this, index, text); | ||
|
@@ -480,6 +485,11 @@ public AnalyzeRequestBuilder prepareAnalyze(String text) { | |
return new AnalyzeRequestBuilder(this, null, text); | ||
} | ||
|
||
@Override | ||
public AnalyzeRequestBuilder prepareAnalyze(String... text) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same comment as above, around using varargs |
||
return new AnalyzeRequestBuilder(this, null, text); | ||
} | ||
|
||
@Override | ||
public ActionFuture<PutIndexTemplateResponse> putTemplate(final PutIndexTemplateRequest request) { | ||
return execute(PutIndexTemplateAction.INSTANCE, request); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,10 +58,10 @@ public RestAnalyzeAction(Settings settings, RestController controller, Client cl | |
@Override | ||
public void handleRequest(final RestRequest request, final RestChannel channel, final Client client) { | ||
|
||
String text = request.param("text"); | ||
String[] texts = request.paramAsStringArrayOrEmptyIfAll("text"); | ||
|
||
AnalyzeRequest analyzeRequest = new AnalyzeRequest(request.param("index")); | ||
analyzeRequest.text(text); | ||
analyzeRequest.text(texts); | ||
analyzeRequest.listenerThreaded(false); | ||
analyzeRequest.preferLocal(request.paramAsBoolean("prefer_local", analyzeRequest.preferLocalShard())); | ||
analyzeRequest.analyzer(request.param("analyzer")); | ||
|
@@ -73,9 +73,9 @@ public void handleRequest(final RestRequest request, final RestChannel channel, | |
if (RestActions.hasBodyContent(request)) { | ||
XContentType type = RestActions.guessBodyContentType(request); | ||
if (type == null) { | ||
if (text == null) { | ||
text = RestActions.getRestContent(request).toUtf8(); | ||
analyzeRequest.text(text); | ||
if (texts == null || texts.length == 0) { | ||
texts = new String[]{ RestActions.getRestContent(request).toUtf8() }; | ||
analyzeRequest.text(texts); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. question: we seem to support a single value only here. What is the difference between this case and the below case where we check the presence of arrays? It is just me not knowing tha analyze api I guess. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good question. Analyze API has three way to receive text.
This case is 2. Then, I think it's OK we don't support multi valued text for this case. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks for the explanation. case 2) is same as case 1) I guess. The only way we could make it work with multiple values would be by allowing for comma separated values, but then the comma coyuldn't be part of the text anymore. Let's leave it as-is. |
||
} | ||
} else { | ||
// NOTE: if rest request with xcontent body has request parameters, the parameters does not override xcontent values | ||
|
@@ -99,7 +99,16 @@ public static void buildFromContent(BytesReference content, AnalyzeRequest analy | |
} else if ("prefer_local".equals(currentFieldName) && token == XContentParser.Token.VALUE_BOOLEAN) { | ||
analyzeRequest.preferLocal(parser.booleanValue()); | ||
} else if ("text".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) { | ||
analyzeRequest.text(parser.text()); | ||
analyzeRequest.text(parser.text()); | ||
} else if ("text".equals(currentFieldName) && token == XContentParser.Token.START_ARRAY) { | ||
List<String> texts = Lists.newArrayList(); | ||
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) { | ||
if (token.isValue() == false) { | ||
throw new ElasticsearchIllegalArgumentException(currentFieldName + " array element should only contain text"); | ||
} | ||
texts.add(parser.text()); | ||
} | ||
analyzeRequest.text(texts.toArray(new String[0])); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you use |
||
} else if ("analyzer".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) { | ||
analyzeRequest.analyzer(parser.text()); | ||
} else if ("field".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -221,7 +221,8 @@ public void testParseXContentForAnalyzeReuqest() throws Exception { | |
|
||
RestAnalyzeAction.buildFromContent(content, analyzeRequest); | ||
|
||
assertThat(analyzeRequest.text(), equalTo("THIS IS A TEST")); | ||
assertThat(analyzeRequest.text().length, equalTo(1)); | ||
assertThat(analyzeRequest.text(), equalTo(new String[]{"THIS IS A TEST"})); | ||
assertThat(analyzeRequest.tokenizer(), equalTo("keyword")); | ||
assertThat(analyzeRequest.tokenFilters(), equalTo(new String[]{"lowercase"})); | ||
} | ||
|
@@ -240,7 +241,6 @@ public void testParseXContentForAnalyzeRequestWithInvalidJsonThrowsException() t | |
} | ||
} | ||
|
||
|
||
@Test | ||
public void testParseXContentForAnalyzeRequestWithUnknownParamThrowsException() throws Exception { | ||
AnalyzeRequest analyzeRequest = new AnalyzeRequest("for test"); | ||
|
@@ -259,4 +259,46 @@ public void testParseXContentForAnalyzeRequestWithUnknownParamThrowsException() | |
} | ||
} | ||
|
||
@Test | ||
public void analyzerWithMultiValues() throws Exception { | ||
|
||
assertAcked(prepareCreate("test").addAlias(new Alias("alias"))); | ||
ensureGreen(); | ||
|
||
client().admin().indices().preparePutMapping("test") | ||
.setType("document").setSource( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. .setSource("simple", "type=string,analyzer=simple,position_offset_gap=100") ? so we can remove json provided as a string? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, I don't know that. I change as string. |
||
"{\n" + | ||
" \"document\":{\n" + | ||
" \"properties\":{\n" + | ||
" \"simple\":{\n" + | ||
" \"type\":\"string\",\n" + | ||
" \"analyzer\": \"simple\",\n" + | ||
" \"position_offset_gap\": 100\n" + | ||
" }\n" + | ||
" }\n" + | ||
" }\n" + | ||
"}" | ||
).get(); | ||
|
||
String[] texts = new String[]{"THIS IS A TEST", "THE SECOND TEXT"}; | ||
|
||
final AnalyzeRequestBuilder requestBuilder = client().admin().indices().prepareAnalyze(texts); | ||
requestBuilder.setIndex(indexOrAlias()); | ||
requestBuilder.setField("simple"); | ||
AnalyzeResponse analyzeResponse = requestBuilder.get(); | ||
assertThat(analyzeResponse.getTokens().size(), equalTo(7)); | ||
AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(3); | ||
assertThat(token.getTerm(), equalTo("test")); | ||
assertThat(token.getPosition(), equalTo(3)); | ||
assertThat(token.getStartOffset(), equalTo(10)); | ||
assertThat(token.getEndOffset(), equalTo(14)); | ||
|
||
token = analyzeResponse.getTokens().get(5); | ||
assertThat(token.getTerm(), equalTo("second")); | ||
assertThat(token.getPosition(), equalTo(105)); | ||
assertThat(token.getStartOffset(), equalTo(19)); | ||
assertThat(token.getEndOffset(), equalTo(25)); | ||
|
||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would change this:
If text parameter is provided as array of string, it analyze as a multi-valued.
to
If text parameter is provided as array of strings, it is analyzed as a multi-valued field.