Nested multi_field type wrapped by attachment type doesn't get values passed #57
Comments
Thanks @thomasm82. Indeed you are right. With the #37 fix, the mapping is not overwritten anymore, but it is still not usable. May I ask how you fixed it in the mapper attachment plugin? Let me know. |
Hi there, Well, I described a bit of it at the end of my post in the elasticsearch GH: elastic/elasticsearch#5402 Part 1 - My own ParseContext. In detail, I implemented my own ParseContext, to which I passed the original ParseContext as a delegate to be used: /*
* Created: Mar 12, 2014 11:40:41 AM
*/
package org.elasticsearch.index.mapper.attachment;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Field;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.lucene.all.AllEntries;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.analysis.AnalysisService;
import org.elasticsearch.index.mapper.ContentPath;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.DocumentMapper.ParseListener;
import org.elasticsearch.index.mapper.DocumentMapperParser;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.SourceToParse;
import org.elasticsearch.index.mapper.object.RootObjectMapper;
/**
 * A {@link ParseContext} wrapper used by the attachment mapper.
 * <p>
 * Almost every call is delegated unchanged to the original context; the one
 * behavioral difference is {@link #externalValueSet()}, which also reports
 * {@code true} when an external value is present even though the "set" flag
 * has already been consumed. This is what allows multi fields nested under an
 * attachment field to receive the extracted value.
 */
public class AttachmentParseContext extends ParseContext {

    /** The wrapped original context; all parse state lives there. */
    private final ParseContext delegate;

    /**
     * Wraps the given context.
     *
     * @param context the original context to delegate to; must not be
     *                {@code null}
     */
    public AttachmentParseContext(ParseContext context) {
        /*
         * The superclass offers no default constructor, so this one must be
         * invoked; virtually every call is delegated to the original context
         * anyway, so the superclass state is effectively unused.
         */
        super(context.index(), context.indexSettings(), context
                .docMapperParser(), context.docMapper(), context.path());
        this.delegate = context;
    }

    /*
     * ................... START: OVERRIDING CODE ...................
     */

    /**
     * Unlike {@link ParseContext#externalValueSet()} this method also returns
     * <code>true</code> if the external value is set without the boolean flag
     * indicating it. This is needed in order to support multi fields within an
     * attachment's field.
     *
     * @return <code>true</code> in case either the delegate's
     *         {@code externalValueSet()} returns <code>true</code> or its
     *         {@code externalValue()} is not <code>null</code>
     */
    @Override
    public boolean externalValueSet() {
        return delegate.externalValueSet() || delegate.externalValue() != null;
    }

    /*
     * ................... END: OVERRIDING CODE ...................
     */

    /*
     * ................... START: DELEGATE METHODS ...................
     */

    @Override
    public void addDoc(Document doc) {
        delegate.addDoc(doc);
    }

    @Override
    public AllEntries allEntries() {
        return delegate.allEntries();
    }

    @Override
    public AnalysisService analysisService() {
        return delegate.analysisService();
    }

    @Override
    public Analyzer analyzer() {
        return delegate.analyzer();
    }

    @Override
    public void analyzer(Analyzer analyzer) {
        delegate.analyzer(analyzer);
    }

    @Override
    public void clearWithinCopyTo() {
        delegate.clearWithinCopyTo();
    }

    @Override
    public void clearWithinNewMapper() {
        delegate.clearWithinNewMapper();
    }

    @Override
    public Document doc() {
        return delegate.doc();
    }

    @Override
    public float docBoost() {
        return delegate.docBoost();
    }

    @Override
    public void docBoost(float docBoost) {
        delegate.docBoost(docBoost);
    }

    @Override
    public DocumentMapper docMapper() {
        return delegate.docMapper();
    }

    @Override
    public DocumentMapperParser docMapperParser() {
        return delegate.docMapperParser();
    }

    @Override
    public List<Document> docs() {
        return delegate.docs();
    }

    @Override
    public boolean equals(Object obj) {
        return delegate.equals(obj);
    }

    @Override
    public Object externalValue() {
        return delegate.externalValue();
    }

    @Override
    public void externalValue(Object externalValue) {
        delegate.externalValue(externalValue);
    }

    @Override
    public boolean flyweight() {
        return delegate.flyweight();
    }

    @Override
    public int hashCode() {
        return delegate.hashCode();
    }

    @Override
    public String id() {
        return delegate.id();
    }

    @Override
    public void id(String id) {
        delegate.id(id);
    }

    @Override
    public String ignoredValue(String indexName) {
        return delegate.ignoredValue(indexName);
    }

    @Override
    public void ignoredValue(String indexName, String value) {
        delegate.ignoredValue(indexName, value);
    }

    @Override
    public boolean includeInAll(Boolean includeInAll, FieldMapper mapper) {
        return delegate.includeInAll(includeInAll, mapper);
    }

    @Override
    public String index() {
        return delegate.index();
    }

    @Override
    public Settings indexSettings() {
        return delegate.indexSettings();
    }

    @Override
    public boolean isWithinCopyTo() {
        return delegate.isWithinCopyTo();
    }

    @Override
    public boolean isWithinNewMapper() {
        return delegate.isWithinNewMapper();
    }

    @Override
    public ParseListener listener() {
        return delegate.listener();
    }

    @Override
    public boolean mappingsModified() {
        return delegate.mappingsModified();
    }

    @Override
    public XContentParser parser() {
        return delegate.parser();
    }

    @Override
    public ContentPath path() {
        return delegate.path();
    }

    @Override
    public void reset(XContentParser parser, Document document,
            SourceToParse source, ParseListener listener) {
        delegate.reset(parser, document, source, listener);
    }

    @Override
    public RootObjectMapper root() {
        return delegate.root();
    }

    @Override
    public Document rootDoc() {
        return delegate.rootDoc();
    }

    @Override
    public void setMappingsModified() {
        delegate.setMappingsModified();
    }

    @Override
    public void setWithinCopyTo() {
        delegate.setWithinCopyTo();
    }

    @Override
    public void setWithinNewMapper() {
        delegate.setWithinNewMapper();
    }

    @Override
    public BytesReference source() {
        return delegate.source();
    }

    @Override
    public void source(BytesReference source) {
        delegate.source(source);
    }

    @Override
    public SourceToParse sourceToParse() {
        return delegate.sourceToParse();
    }

    @Override
    public StringBuilder stringBuilder() {
        return delegate.stringBuilder();
    }

    @Override
    public Document switchDoc(Document doc) {
        return delegate.switchDoc(doc);
    }

    @Override
    public String toString() {
        return delegate.toString();
    }

    @Override
    public String type() {
        return delegate.type();
    }

    @Override
    public Field uid() {
        return delegate.uid();
    }

    @Override
    public void uid(Field uid) {
        delegate.uid(uid);
    }

    @Override
    public Field version() {
        return delegate.version();
    }

    @Override
    public void version(Field version) {
        delegate.version(version);
    }

    /*
     * ................... END: DELEGATE METHODS ...................
     */
} Part 2 - Integration with the plugin. In order to get my context being used, I simply changed one line (as far as I can remember) within the @Override
/**
 * Parses an attachment field value. The value is either a bare base64
 * string (the raw document bytes) or an object carrying the bytes plus
 * optional per-document settings (content type, resource name, number of
 * characters to index). The bytes are run through Tika, and each extracted
 * value (language, content, name, date, title, author, keywords, content
 * type, content length) is handed to its dedicated sub-mapper via
 * {@code context.externalValue(...)}.
 *
 * @param context the current parse context; re-wrapped in an
 *        AttachmentParseContext before the content/metadata sub-mappers
 *        run, so that multi fields nested under this field also see the
 *        external value
 * @throws IOException if reading from the underlying parser fails
 */
public void parse(ParseContext context) throws IOException {
byte[] content = null;
String contentType = null;
int indexedChars = defaultIndexedChars;
String name = null;
XContentParser parser = context.parser();
XContentParser.Token token = parser.currentToken();
// Bare string form: the whole field value is the base64-encoded document.
if (token == XContentParser.Token.VALUE_STRING) {
content = parser.binaryValue();
} else {
// Object form: walk the sub-fields and pick out the bytes plus options.
String currentFieldName = null;
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
currentFieldName = parser.currentName();
} else if (token == XContentParser.Token.VALUE_STRING) {
// NOTE(review): the key checked here is "content" while the option
// keys carry a leading underscore — confirm this matches the field
// name documented for this plugin version (often "_content").
if ("content".equals(currentFieldName)) {
content = parser.binaryValue();
} else if ("_content_type".equals(currentFieldName)) {
contentType = parser.text();
} else if ("_name".equals(currentFieldName)) {
name = parser.text();
}
} else if (token == XContentParser.Token.VALUE_NUMBER) {
// Per-document override of how many characters Tika may extract.
if ("_indexed_chars".equals(currentFieldName) || "_indexedChars".equals(currentFieldName)) {
indexedChars = parser.intValue();
}
}
}
}
// Throw clean exception when no content is provided Fix #23
if (content == null) {
throw new MapperParsingException("No content is provided.");
}
// Hints passed to Tika so it can select the right parser for the bytes.
Metadata metadata = new Metadata();
if (contentType != null) {
metadata.add(Metadata.CONTENT_TYPE, contentType);
}
if (name != null) {
metadata.add(Metadata.RESOURCE_NAME_KEY, name);
}
String parsedContent;
try {
// Set the maximum length of strings returned by the parseToString method, -1 sets no limit
parsedContent = tika().parseToString(new BytesStreamInput(content, false), metadata, indexedChars);
// Detect the document language from the extracted text; default to
// English when the identifier is not reasonably certain.
LanguageIdentifier languageIdentifier = new LanguageIdentifier(parsedContent);
String language = "en";
if (languageIdentifier.isReasonablyCertain()) {
language = languageIdentifier.getLanguage();
}
context.externalValue(language);
languageMapper.parse(context);
} catch (Throwable e) {
// #18: we could ignore errors when Tika does not parse data
// NOTE(review): when errors are ignored we return here, so none of the
// sub-fields below get indexed for this document — confirm this
// all-or-nothing best-effort behavior is intended.
if (!ignoreErrors) throw new MapperParsingException("Failed to extract [" + indexedChars + "] characters of text for [" + name + "]", e);
return;
}
// Wrap the context so externalValueSet() keeps reporting true for the
// sub-mappers below even after the flag is consumed (multi-field support,
// see issue #57).
context = new AttachmentParseContext(context);
context.externalValue(parsedContent);
contentMapper.parse(context);
// Each metadata sub-field below is parsed independently; a failure in one
// is swallowed (when ignoreErrors is set) without affecting the others.
try {
context.externalValue(name);
nameMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing name: {}", e.getMessage());
}
try {
context.externalValue(metadata.get(Metadata.DATE));
dateMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing date: {}: {}", e.getMessage(), context.externalValue());
}
try {
context.externalValue(metadata.get(Metadata.TITLE));
titleMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing title: {}: {}", e.getMessage(), context.externalValue());
}
try {
context.externalValue(metadata.get(Metadata.AUTHOR));
authorMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing author: {}: {}", e.getMessage(), context.externalValue());
}
try {
context.externalValue(metadata.get(Metadata.KEYWORDS));
keywordsMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing keywords: {}: {}", e.getMessage(), context.externalValue());
}
try {
context.externalValue(metadata.get(Metadata.CONTENT_TYPE));
contentTypeMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue());
}
try {
if (metadata.get(Metadata.CONTENT_LENGTH) != null) {
// We try to get CONTENT_LENGTH from Tika first
context.externalValue(metadata.get(Metadata.CONTENT_LENGTH));
} else {
// Otherwise, we use our byte[] length
context.externalValue(content.length);
}
contentLengthMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_length: {}: {}", e.getMessage(), context.externalValue());
}
} This way you get back an external value whenever it is set, even if it was consumed - i.e. read - already. Hope this helps, |
Thanks for sharing this! I really appreciate. We'll see if we need or not to patch the plugin in the meantime... Keeping this open for the moment. |
In the context of the mapper attachment and other mapper plugins, when dealing with multi fields, sub fields never get the `externalValue` although it was set. This patch considers an external value to be set when `externalValue` is different from `null`. Here is a full script which reproduces the issue when used with the mapper attachment plugin: ``` DELETE /test PUT /test { "mappings": { "test": { "properties": { "f": { "type": "attachment", "fields": { "f": { "analyzer": "english", "fields": { "no_stemming": { "type": "string", "store": "yes", "analyzer": "standard" } } } } } } } } } PUT /test/test/1 { "f": "VGhlIHF1aWNrIGJyb3duIGZveGVz" } GET /test/_search { "query": { "match": { "f": "quick" } } } GET /test/_search { "query": { "match": { "f.no_stemming": "quick" } } } GET /test/test/1?fields=f.no_stemming ``` Related to elastic/elasticsearch-mapper-attachments#57 Closes elastic#5402.
In context of mapper attachment and other mapper plugins, when dealing with multi fields, sub fields never get the `externalValue` although it was set. Here is a full script which reproduce the issue when used with mapper attachment plugin: ``` DELETE /test PUT /test { "mappings": { "test": { "properties": { "f": { "type": "attachment", "fields": { "f": { "analyzer": "english", "fields": { "no_stemming": { "type": "string", "store": "yes", "analyzer": "standard" } } } } } } } } } PUT /test/test/1 { "f": "VGhlIHF1aWNrIGJyb3duIGZveGVz" } GET /test/_search { "query": { "match": { "f": "quick" } } } GET /test/_search { "query": { "match": { "f.no_stemming": "quick" } } } GET /test/test/1?fields=f.no_stemming ``` Related to elastic/elasticsearch-mapper-attachments#57 Closes elastic#5402.
In context of mapper attachment and other mapper plugins, when dealing with multi fields, sub fields never get the `externalValue` although it was set. Here is a full script which reproduce the issue when used with mapper attachment plugin: ``` DELETE /test PUT /test { "mappings": { "test": { "properties": { "f": { "type": "attachment", "fields": { "f": { "analyzer": "english", "fields": { "no_stemming": { "type": "string", "store": "yes", "analyzer": "standard" } } } } } } } } } PUT /test/test/1 { "f": "VGhlIHF1aWNrIGJyb3duIGZveGVz" } GET /test/_search { "query": { "match": { "f": "quick" } } } GET /test/_search { "query": { "match": { "f.no_stemming": "quick" } } } GET /test/test/1?fields=f.no_stemming ``` Related to elastic/elasticsearch-mapper-attachments#57 Closes #5402. (cherry picked from commit 11eced0)
Now elastic/elasticsearch#6867 is merged in elasticsearch core code (branch 1.x - es 1.4), we can support multi fields in mapper attachment plugin. ``` DELETE /test PUT /test { "settings": { "number_of_shards": 1 } } PUT /test/person/_mapping { "person": { "properties": { "file": { "type": "attachment", "path": "full", "fields": { "file": { "type": "string", "fields": { "store": { "type": "string", "store": true } } }, "content_type": { "type": "string", "fields": { "store": { "type": "string", "store": true }, "untouched": { "type": "string", "index": "not_analyzed", "store": true } } } } } } } } PUT /test/person/1?refresh=true { "file": "IkdvZCBTYXZlIHRoZSBRdWVlbiIgKGFsdGVybmF0aXZlbHkgIkdvZCBTYXZlIHRoZSBLaW5nIg==" } GET /test/person/_search { "fields": [ "file.store", "file.content_type.store" ], "aggs": { "store": { "terms": { "field": "file.content_type.store" } }, "untouched": { "terms": { "field": "file.content_type.untouched" } } } } ``` It gives: ```js { "took": 3, "timed_out": false, "_shards": { "total": 1, "successful": 1, "failed": 0 }, "hits": { "total": 1, "max_score": 1, "hits": [ { "_index": "test", "_type": "person", "_id": "1", "_score": 1, "fields": { "file.store": [ "\"God Save the Queen\" (alternatively \"God Save the King\"\n" ], "file.content_type.store": [ "text/plain; charset=ISO-8859-1" ] } } ] }, "aggregations": { "store": { "doc_count_error_upper_bound": 0, "buckets": [ { "key": "1", "doc_count": 1 }, { "key": "8859", "doc_count": 1 }, { "key": "charset", "doc_count": 1 }, { "key": "iso", "doc_count": 1 }, { "key": "plain", "doc_count": 1 }, { "key": "text", "doc_count": 1 } ] }, "untouched": { "doc_count_error_upper_bound": 0, "buckets": [ { "key": "text/plain; charset=ISO-8859-1", "doc_count": 1 } ] } } } ``` Note that using shorter definition works as well: ``` DELETE /test PUT /test { "settings": { "number_of_shards": 1 } } PUT /test/person/_mapping { "person": { "properties": { "file": { "type": "attachment" } } } } PUT 
/test/person/1?refresh=true { "file": "IkdvZCBTYXZlIHRoZSBRdWVlbiIgKGFsdGVybmF0aXZlbHkgIkdvZCBTYXZlIHRoZSBLaW5nIg==" } GET /test/person/_search { "query": { "match": { "file": "king" } } } ``` gives: ```js { "took": 53, "timed_out": false, "_shards": { "total": 1, "successful": 1, "failed": 0 }, "hits": { "total": 1, "max_score": 0.095891505, "hits": [ { "_index": "test", "_type": "person", "_id": "1", "_score": 0.095891505, "_source": { "file": "IkdvZCBTYXZlIHRoZSBRdWVlbiIgKGFsdGVybmF0aXZlbHkgIkdvZCBTYXZlIHRoZSBLaW5nIg==" } } ] } } ``` Closes #57.
Also referring to #37, where the initial problem was an overwritten mapping: my problem remains that a multi_field type inside an attachment type does not work properly. I have just created an issue about this in the elasticsearch GitHub repository, as I think it is a core problem which might also affect other plugins.
Please see elastic/elasticsearch#5402 where I exactly described the problem.
The text was updated successfully, but these errors were encountered: