From 32aca3fd96816ad49b869a82c9ba0f02265f8744 Mon Sep 17 00:00:00 2001 From: Stefan Kopf Date: Tue, 2 Aug 2016 17:09:33 +0200 Subject: [PATCH] MNT-15219 - Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may cause OutOfMemory in Tika - Make processing of shapes in XMLS files configurable --- .../apache/tika/metadata/TikaMetadataKeys.java | 3 +++ .../ooxml/AbstractOOXMLExtractor.java | 18 ++++++++++++++++-- .../ooxml/POIXMLTextExtractorDecorator.java | 10 ++++++++++ .../ooxml/XSSFExcelExtractorDecorator.java | 4 +++- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java index 0846e3230d..725fa4cff1 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java @@ -26,4 +26,7 @@ public interface TikaMetadataKeys { String PROTECTED = "protected"; String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId"; + + String TIKA_PARSER_PARSE_SHAPES_KEY = "TIKA_PARSER_PARSE_SHAPES"; + } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index 282ffe05a2..c3fccc0c1c 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -39,16 +39,15 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; -import org.apache.tika.io.FilenameUtils; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.TikaMetadataKeys; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.xmlbeans.XmlException; -import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -314,4 +313,19 @@ protected abstract void buildXHTML(XHTMLContentHandler xhtml) */ protected abstract List getMainDocumentParts() throws TikaException; + + private static final boolean SHOULD_PROCESS_SHAPES_DEFAULT_VALUE = false; + + public boolean shouldProcessShapes(Metadata metadata) + { + if (metadata != null) + { + String shouldProcessShapesValue = metadata.get(TikaMetadataKeys.TIKA_PARSER_PARSE_SHAPES_KEY); + if (shouldProcessShapesValue != null && !shouldProcessShapesValue.isEmpty()) + { + return Boolean.valueOf(shouldProcessShapesValue); + } + } + return SHOULD_PROCESS_SHAPES_DEFAULT_VALUE; + } } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java index 375adf5d41..d6928cb9c5 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java @@ -21,6 +21,8 @@ import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; +import org.apache.poi.xssf.extractor.XSSFExcelExtractor; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; @@ -34,6 +36,14 @@ public POIXMLTextExtractorDecorator(ParseContext context, POIXMLTextExtractor ex @Override protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException { // extract document content as a single string (not structured) + if(extractor instanceof XSSFEventBasedExcelExtractor) + { + ((XSSFEventBasedExcelExtractor)extractor).setIncludeTextBoxes(false); + } + else if(extractor instanceof XSSFExcelExtractor) + { + ((XSSFExcelExtractor)extractor).setIncludeTextBoxes(false); + } xhtml.element("p", extractor.getText()); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java index 985e413294..9e191ccdc5 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java @@ -145,7 +145,9 @@ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, for(String footer : sheetExtractor.footers) { extractHeaderFooter(footer, xhtml); } - processShapes(iter.getShapes(), xhtml); + if (shouldProcessShapes(metadata)){ + processShapes(iter.getShapes(), xhtml); + } // All done with this sheet xhtml.endElement("div"); }