Skip to content

Commit

Permalink
MNT-15219 - Excel (.xlsx) containing xmls (shapes/drawings) with mult…
Browse files Browse the repository at this point in the history
…i byte characters may cause OutOfMemory in Tika

- Make processing of shapes in XLSX files configurable
  • Loading branch information
skopf committed Aug 2, 2016
1 parent 6a9f5a5 commit 32aca3f
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 3 deletions.
Expand Up @@ -26,4 +26,7 @@ public interface TikaMetadataKeys {
String PROTECTED = "protected";

String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId";

String TIKA_PARSER_PARSE_SHAPES_KEY = "TIKA_PARSER_PARSE_SHAPES";

}
Expand Up @@ -39,16 +39,15 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
Expand Down Expand Up @@ -314,4 +313,19 @@ protected abstract void buildXHTML(XHTMLContentHandler xhtml)
*/
protected abstract List<PackagePart> getMainDocumentParts()
throws TikaException;

/** Value returned when the metadata supplies no usable shape-processing setting. */
private static final boolean SHOULD_PROCESS_SHAPES_DEFAULT_VALUE = false;

/**
 * Decides whether shapes should be processed, based on the
 * {@link TikaMetadataKeys#TIKA_PARSER_PARSE_SHAPES_KEY} metadata entry.
 *
 * @param metadata metadata to consult; may be {@code null}
 * @return the entry parsed as a boolean when it is present and non-empty,
 *         otherwise {@code SHOULD_PROCESS_SHAPES_DEFAULT_VALUE}
 */
public boolean shouldProcessShapes(Metadata metadata)
{
    // No metadata at all: nothing to read, fall back to the default.
    if (metadata == null)
    {
        return SHOULD_PROCESS_SHAPES_DEFAULT_VALUE;
    }

    final String configured = metadata.get(TikaMetadataKeys.TIKA_PARSER_PARSE_SHAPES_KEY);

    // A missing or empty entry is treated the same as no configuration.
    if (configured == null || configured.isEmpty())
    {
        return SHOULD_PROCESS_SHAPES_DEFAULT_VALUE;
    }

    // Only a case-insensitive "true" yields true; any other text yields false.
    return Boolean.parseBoolean(configured);
}
}
Expand Up @@ -21,6 +21,8 @@

import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
Expand All @@ -34,6 +36,14 @@ public POIXMLTextExtractorDecorator(ParseContext context, POIXMLTextExtractor ex
/**
 * Emits the wrapped extractor's text as a single, unstructured {@code <p>} element.
 *
 * <p>When the extractor is one of the two XSSF Excel extractor types, text-box
 * inclusion is switched off on it before the text is requested.
 *
 * @param xhtml handler that receives the paragraph
 * @throws SAXException declared by the overridden method
 */
@Override
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException {
    // NOTE(review): text boxes are disabled unconditionally here (hard-coded false);
    // this is not driven by any metadata setting - confirm that is intended.
    if (extractor instanceof XSSFEventBasedExcelExtractor) {
        final XSSFEventBasedExcelExtractor eventBased = (XSSFEventBasedExcelExtractor) extractor;
        eventBased.setIncludeTextBoxes(false);
    } else if (extractor instanceof XSSFExcelExtractor) {
        final XSSFExcelExtractor userModel = (XSSFExcelExtractor) extractor;
        userModel.setIncludeTextBoxes(false);
    }

    // Whole document content is written as one string, not structured markup.
    final String documentText = extractor.getText();
    xhtml.element("p", documentText);
}

Expand Down
Expand Up @@ -145,7 +145,9 @@ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
for(String footer : sheetExtractor.footers) {
extractHeaderFooter(footer, xhtml);
}
processShapes(iter.getShapes(), xhtml);
if (shouldProcessShapes(metadata)){
processShapes(iter.getShapes(), xhtml);
}
// All done with this sheet
xhtml.endElement("div");
}
Expand Down

1 comment on commit 32aca3f

@Gagravarr
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In Tika 1.15, this can be done as standard by passing an OfficeParserConfig object on the ParseContext, with setIncludeShapeBasedContent(boolean) called on it; see TIKA-2346.

Please sign in to comment.