Skip to content

Commit

Permalink
Don't reject full document in case of invalid metadata
Browse files Browse the repository at this point in the history
From original PR elastic#17 from @fcamblor

If you try to index a document with an invalid metadata, the full document is rejected.

For example:

```html
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html lang="fr">
<head>
<title>Hello</title>
<meta name="date" content="">
<meta name="Author" content="kimchy">
<meta name="Keywords" content="elasticsearch,cool,bonsai">
</head>
<body>World</body>
</html>
```

has a non parseable date.

This fix add a new option that ignore parsing errors `"index.mapping.attachment.ignore_errors":true` (default to `true`).

Closes elastic#17, elastic#38.
  • Loading branch information
fcamblor authored and dadoonet committed Aug 20, 2013
1 parent d7a2e7e commit 019d0f9
Show file tree
Hide file tree
Showing 7 changed files with 194 additions and 24 deletions.
8 changes: 8 additions & 0 deletions README.md
Expand Up @@ -90,6 +90,14 @@ Note, this feature is support since `1.3.0` version.

The plugin uses [Apache Tika](http://lucene.apache.org/tika/) to parse attachments, so many formats are supported, listed [here](http://lucene.apache.org/tika/0.10/formats.html).

Metadata parsing error handling
-------------------------------

While extracting metadata content, errors could happen for example when parsing dates.
Since version `1.9.0`, parsing errors are ignored so your document is indexed.

You can disable this feature by setting the `index.mapping.attachment.ignore_errors` setting to `false`.

License
-------

Expand Down
Expand Up @@ -22,6 +22,8 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.elasticsearch.common.io.stream.BytesStreamInput;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.ESLoggerFactory;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.mapper.*;
Expand Down Expand Up @@ -58,6 +60,8 @@
*/
public class AttachmentMapper implements Mapper {

private static ESLogger logger = ESLoggerFactory.getLogger(AttachmentMapper.class.getName());

public static final String CONTENT_TYPE = "attachment";

public static class Defaults {
Expand All @@ -70,6 +74,8 @@ public static class Builder extends Mapper.Builder<Builder, AttachmentMapper> {

private Integer defaultIndexedChars = null;

private Boolean ignoreErrors = null;

private Mapper.Builder contentBuilder;

private Mapper.Builder titleBuilder = stringField("title");
Expand All @@ -95,11 +101,6 @@ public Builder pathType(ContentPath.Type pathType) {
return this;
}

public Builder defaultIndexedChars(int defaultIndexedChars) {
this.defaultIndexedChars = defaultIndexedChars;
return this;
}

public Builder content(Mapper.Builder content) {
this.contentBuilder = content;
return this;
Expand Down Expand Up @@ -155,14 +156,21 @@ public AttachmentMapper build(BuilderContext context) {

context.path().pathType(origPathType);

if (defaultIndexedChars != null && context.indexSettings() != null) {
if (defaultIndexedChars == null && context.indexSettings() != null) {
defaultIndexedChars = context.indexSettings().getAsInt("index.mapping.attachment.indexed_chars", 100000);
}
if (defaultIndexedChars == null) {
defaultIndexedChars = 100000;
}

return new AttachmentMapper(name, pathType, defaultIndexedChars, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper);
if (ignoreErrors == null && context.indexSettings() != null) {
ignoreErrors = context.indexSettings().getAsBoolean("index.mapping.attachment.ignore_errors", Boolean.TRUE);
}
if (ignoreErrors == null) {
ignoreErrors = Boolean.TRUE;
}

return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper);
}
}

Expand Down Expand Up @@ -239,6 +247,8 @@ public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext

private final int defaultIndexedChars;

private final boolean ignoreErrors;

private final Mapper contentMapper;

private final Mapper dateMapper;
Expand All @@ -253,12 +263,13 @@ public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext

private final Mapper contentTypeMapper;

public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Mapper contentMapper,
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Mapper contentMapper,
Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper,
Mapper keywordsMapper, Mapper contentTypeMapper) {
this.name = name;
this.pathType = pathType;
this.defaultIndexedChars = defaultIndexedChars;
this.ignoreErrors = ignoreErrors;
this.contentMapper = contentMapper;
this.dateMapper = dateMapper;
this.titleMapper = titleMapper;
Expand Down Expand Up @@ -330,23 +341,53 @@ public void parse(ParseContext context) throws IOException {
contentMapper.parse(context);


context.externalValue(name);
nameMapper.parse(context);
try {
context.externalValue(name);
nameMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing name: {}", e.getMessage());
}

context.externalValue(metadata.get(Metadata.DATE));
dateMapper.parse(context);
try {
context.externalValue(metadata.get(Metadata.DATE));
dateMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing date: {}: {}", e.getMessage(), context.externalValue());
}

context.externalValue(metadata.get(Metadata.TITLE));
titleMapper.parse(context);
try {
context.externalValue(metadata.get(Metadata.TITLE));
titleMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing title: {}: {}", e.getMessage(), context.externalValue());
}

context.externalValue(metadata.get(Metadata.AUTHOR));
authorMapper.parse(context);
try {
context.externalValue(metadata.get(Metadata.AUTHOR));
authorMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing author: {}: {}", e.getMessage(), context.externalValue());
}

context.externalValue(metadata.get(Metadata.KEYWORDS));
keywordsMapper.parse(context);
try {
context.externalValue(metadata.get(Metadata.KEYWORDS));
keywordsMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing keywords: {}: {}", e.getMessage(), context.externalValue());
}

context.externalValue(metadata.get(Metadata.CONTENT_TYPE));
contentTypeMapper.parse(context);
try {
context.externalValue(metadata.get(Metadata.CONTENT_TYPE));
contentTypeMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue());
}
}

@Override
Expand Down
@@ -0,0 +1,89 @@
package org.elasticsearch.index.mapper.xcontent;

import org.apache.lucene.document.Document;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.AnalysisService;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.DocumentMapperParser;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.attachment.AttachmentMapper;
import org.testng.annotations.Test;

import java.io.IOException;

import static org.elasticsearch.common.io.Streams.copyToBytesFromClasspath;
import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.*;

/**
* Test for https://github.com/elasticsearch/elasticsearch-mapper-attachments/issues/38
*/
public class MetadataMapperTest {

protected void checkDate(String filename, Settings settings, Long expected) throws IOException {
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), settings, new AnalysisService(new Index("test")), null, null);
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());

String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json");
DocumentMapper docMapper = mapperParser.parse(mapping);
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/" + filename);

BytesReference json = jsonBuilder()
.startObject()
.field("_id", 1)
.startObject("file")
.field("_name", "htmlWithoutDateMeta.html")
.field("content", html)
.endObject()
.endObject().bytes();

Document doc = docMapper.parse(json).rootDoc();
assertThat(doc.get(docMapper.mappers().smartName("file").mapper().names().indexName()), containsString("World"));
assertThat(doc.get(docMapper.mappers().smartName("file.name").mapper().names().indexName()), equalTo("htmlWithoutDateMeta.html"));
if (expected == null) {
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()), nullValue());
} else {
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()).numericValue().longValue(), is(expected));
}
assertThat(doc.get(docMapper.mappers().smartName("file.title").mapper().names().indexName()), equalTo("Hello"));
assertThat(doc.get(docMapper.mappers().smartName("file.author").mapper().names().indexName()), equalTo("kimchy"));
assertThat(doc.get(docMapper.mappers().smartName("file.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai"));
assertThat(doc.get(docMapper.mappers().smartName("file.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1"));
}

@Test
public void testIgnoreWithoutDate() throws Exception {
checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null);
}

@Test
public void testIgnoreWithEmptyDate() throws Exception {
checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null);
}

@Test
public void testIgnoreWithCorrectDate() throws Exception {
checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L);
}

@Test
public void testWithoutDate() throws Exception {
checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null);
}

@Test(expectedExceptions = MapperParsingException.class)
public void testWithEmptyDate() throws Exception {
checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null);
}

@Test
public void testWithCorrectDate() throws Exception {
checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L);
}

}
@@ -0,0 +1,11 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html lang="fr">
<head>
<title>Hello</title>
<meta name="date" content="">
<meta name="Author" content="kimchy">
<meta name="Keywords" content="elasticsearch,cool,bonsai">
</head>
<body>World</body>
</html>
@@ -0,0 +1,11 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html lang="fr">
<head>
<title>Hello</title>
<meta name="date" content="2012-11-30">
<meta name="Author" content="kimchy">
<meta name="Keywords" content="elasticsearch,cool,bonsai">
</head>
<body>World</body>
</html>
@@ -0,0 +1,10 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html lang="fr">
<head>
<title>Hello</title>
<meta name="Author" content="kimchy">
<meta name="Keywords" content="elasticsearch,cool,bonsai">
</head>
<body>World</body>
</html>
@@ -1,9 +1,9 @@
{
person:{
properties:{
"person":{
"properties":{
"file":{
type:"attachment"
"type":"attachment"
}
}
}
}
}

0 comments on commit 019d0f9

Please sign in to comment.