Added configurable TextExtractor to JSoupParserBolt + commented out ContentFilter in archetype and configure TextExtractor instead (#678)
jnioche committed Jan 2, 2019
1 parent 9c70cbc commit 25b6f30
Showing 6 changed files with 278 additions and 13 deletions.
10 changes: 10 additions & 0 deletions archetype/src/main/resources/archetype-resources/crawler-conf.yaml
@@ -64,6 +64,16 @@ config:
# never revisit a page with an error (or set a value in minutes)
fetchInterval.error: -1

# text extraction for JSoupParserBolt
textextractor.include.pattern:
- DIV[id="maincontent"]
- DIV[itemprop="articleBody"]
- ARTICLE

textextractor.exclude.tags:
- STYLE
- SCRIPT

# custom fetch interval to be used when a document has the key/value in its metadata
# and has been fetched successfully (value in minutes)
# fetchInterval.FETCH_ERROR.isFeed=true: 30
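The same settings can also be built programmatically. Below is a minimal, hypothetical sketch of passing these keys to the new TextExtractor through an org.apache.storm.Config map, as the unit tests further down do; feeding list values this way is an assumption based on the YAML form above.

import java.util.Arrays;

import org.apache.storm.Config;

import com.digitalpebble.stormcrawler.parse.TextExtractor;

public class TextExtractorConfigSketch {
    public static void main(String[] args) {
        Config conf = new Config();
        // same keys and values as in the YAML section above
        conf.put(TextExtractor.INCLUDE_PARAM_NAME, Arrays.asList(
                "DIV[id=\"maincontent\"]", "DIV[itemprop=\"articleBody\"]",
                "ARTICLE"));
        conf.put(TextExtractor.EXCLUDE_PARAM_NAME,
                Arrays.asList("STYLE", "SCRIPT"));
        TextExtractor extractor = new TextExtractor(conf);
    }
}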
@@ -16,15 +16,6 @@
"parse.keywords": "//META[@name=\"keywords\"]/@content"
}
},
{
"class": "com.digitalpebble.stormcrawler.parse.filter.ContentFilter",
"name": "ContentFilter",
"params": {
"pattern": "//DIV[@id=\"maincontent\"]",
"pattern2": "//DIV[@itemprop=\"articleBody\"]",
"pattern3": "//ARTICLE"
}
},
{
"class": "com.digitalpebble.stormcrawler.parse.filter.DomainParseFilter",
"name": "DomainParseFilter",
@@ -57,6 +57,7 @@
import com.digitalpebble.stormcrawler.parse.ParseFilter;
import com.digitalpebble.stormcrawler.parse.ParseFilters;
import com.digitalpebble.stormcrawler.parse.ParseResult;
import com.digitalpebble.stormcrawler.parse.TextExtractor;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.protocol.HttpHeaders;
import com.digitalpebble.stormcrawler.util.CharsetIdentification;
@@ -106,6 +107,8 @@ public class JSoupParserBolt extends StatusEmitterBolt {
**/
private int maxLengthCharsetDetection = -1;

private TextExtractor textExtractor;

@SuppressWarnings({ "rawtypes", "unchecked" })
@Override
public void prepare(Map conf, TopologyContext context,
@@ -135,6 +138,8 @@ public void prepare(Map conf, TopologyContext context,

maxOutlinksPerPage = ConfUtils.getInt(conf,
"parser.emitOutlinks.max.per.page", -1);

textExtractor = new TextExtractor(conf);
}

@Override
@@ -266,7 +271,7 @@ public void execute(Tuple tuple) {

Element body = jsoupDoc.body();
if (body != null) {
text = body.text();
text = textExtractor.text(body);
}

} catch (Throwable e) {
@@ -0,0 +1,180 @@
/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.parse;

import java.util.HashSet;
import java.util.List;
import java.util.Map;

import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

import com.digitalpebble.stormcrawler.util.ConfUtils;

/**
* Filters the text extracted from HTML documents, used by JSoupParserBolt.
* Configured with optional inclusion patterns based on <a
* href="https://jsoup.org/cookbook/extracting-data/selector-syntax"> JSoup
* selectors</a>, as well as a list of tags to be excluded.
*
* Replaces {@link ContentFilter}.
*
 * The first matching inclusion pattern is used; if no patterns are configured
 * or none match, the whole document is used.
 *
 * The TextExtractor can be configured as follows:
*
* <pre>
* {@code
* textextractor.include.pattern:
* - DIV[id="maincontent"]
* - DIV[itemprop="articleBody"]
* - ARTICLE
*
* textextractor.exclude.tags:
* - STYLE
* - SCRIPT
* }
* </pre>
*
* @since 1.13
**/
public class TextExtractor {

public final static String INCLUDE_PARAM_NAME = "textextractor.include.pattern";
public final static String EXCLUDE_PARAM_NAME = "textextractor.exclude.tags";

private List<String> inclusionPatterns;
private HashSet<String> excludedTags;

public TextExtractor(Map stormConf) {
inclusionPatterns = ConfUtils.loadListFromConf(INCLUDE_PARAM_NAME,
stormConf);
excludedTags = new HashSet<String>();
ConfUtils.loadListFromConf(EXCLUDE_PARAM_NAME, stormConf)
.forEach((s) -> excludedTags.add(s.toLowerCase()));
}

public String text(Element element) {
// no patterns at all - return the text from the whole document
if (inclusionPatterns.size() == 0 && excludedTags.size() == 0) {
return _text(element);
}

Elements matches = new Elements();

for (String pattern : inclusionPatterns) {
matches = element.select(pattern);
if (!matches.isEmpty())
break;
}

// if nothing matches or no patterns were defined use the whole doc
if (matches.isEmpty()) {
matches.add(element);
}

final StringBuilder accum = new StringBuilder();

for (Element node : matches) {
accum.append(_text(node)).append("\n");
}

return accum.toString().trim();
}

private String _text(Node node) {
final StringBuilder accum = new StringBuilder();
NodeTraversor.traverse(new NodeVisitor() {

private Node excluded = null;

public void head(Node node, int depth) {
if (excluded == null && node instanceof TextNode) {
TextNode textNode = (TextNode) node;
appendNormalisedText(accum, textNode);
} else if (node instanceof Element) {
Element element = (Element) node;
if (excludedTags.contains(element.tagName())) {
excluded = element;
}
if (accum.length() > 0
&& (element.isBlock() || element.tag().getName()
.equals("br"))
&& !lastCharIsWhitespace(accum))
accum.append(' ');
}
}

public void tail(Node node, int depth) {
// make sure there is a space between block tags and immediately
// following text nodes, e.g. <div>One</div>Two should be "One Two".
if (node instanceof Element) {
Element element = (Element) node;
if (element == excluded) {
excluded = null;
}
if (element.isBlock()
&& (node.nextSibling() instanceof TextNode)
&& !lastCharIsWhitespace(accum))
accum.append(' ');
}

}
}, node);
return accum.toString().trim();
}

private static void appendNormalisedText(StringBuilder accum,
TextNode textNode) {
String text = textNode.getWholeText();

if (preserveWhitespace(textNode.parent())
|| textNode instanceof CDataNode)
accum.append(text);
else
StringUtil.appendNormalisedWhitespace(accum, text,
lastCharIsWhitespace(accum));
}

static boolean preserveWhitespace(Node node) {
// looks only at this element and five levels up, to prevent recursion &
// needless stack searches
if (node != null && node instanceof Element) {
Element el = (Element) node;
int i = 0;
do {
if (el.tag().preserveWhitespace())
return true;
el = el.parent();
i++;
} while (i < 6 && el != null);
}
return false;
}

static boolean lastCharIsWhitespace(StringBuilder sb) {
return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' ';
}

}
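For context, here is a short usage sketch of the new class, mirroring what JSoupParserBolt now does in execute(); the HTML string and URL are made-up placeholders.

import org.apache.storm.Config;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;

import com.digitalpebble.stormcrawler.parse.TextExtractor;

public class TextExtractorUsageSketch {
    public static void main(String[] args) {
        Config conf = new Config();
        conf.put(TextExtractor.INCLUDE_PARAM_NAME, "ARTICLE");
        conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "SCRIPT");
        TextExtractor extractor = new TextExtractor(conf);

        // placeholder document; in JSoupParserBolt the HTML comes from the
        // fetched page content
        Document doc = Parser.htmlParser().parseInput(
                "<html><article>Hello <script>skip();</script>world</article></html>",
                "http://stormcrawler.net");

        // prints "Hello world": only the ARTICLE element is kept and the
        // SCRIPT content is dropped
        System.out.println(extractor.text(doc.body()));
    }
}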
@@ -45,6 +45,9 @@
* Restricts the text of the main document based on the text value of an XPath
* expression (e.g. &lt;div id='maincontent'&gt;). This is useful when dealing
* with a known format to get rid of the boilerplate HTML code.
*
* @deprecated use {@link TextExtractor} to exclude tags and get spaces between
* elements.
**/
public class ContentFilter extends ParseFilter {

@@ -62,9 +65,6 @@ public void filter(String URL, byte[] content, DocumentFragment doc,

ParseData pd = parse.get(URL);

// TODO determine how to restrict the expressions e.g. regexp on URL
// or value in metadata

// iterates on the expressions - stops at the first that matches
for (LabelledExpression expression : expressions) {
try {
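To illustrate the "spaces between elements" point in the deprecation note above, a small hypothetical sketch: with no patterns or exclusions configured, TextExtractor falls back to the whole document and still inserts a space at block boundaries.

import org.apache.storm.Config;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;

import com.digitalpebble.stormcrawler.parse.TextExtractor;

public class BlockSpacingSketch {
    public static void main(String[] args) {
        // no include patterns and no excluded tags: the whole document is used
        TextExtractor extractor = new TextExtractor(new Config());
        Document doc = Parser.htmlParser().parseInput(
                "<html><div>One</div>Two</html>", "http://stormcrawler.net");
        // prints "One Two" rather than "OneTwo": a space is added after the
        // block-level DIV
        System.out.println(extractor.text(doc.body()));
    }
}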
@@ -0,0 +1,79 @@
/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.digitalpebble.stormcrawler.parse;

import static org.junit.Assert.assertEquals;

import java.io.IOException;

import org.apache.storm.Config;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.junit.Test;

public class TextExtractorTest {

@Test
public void testMainContent() throws IOException {
Config conf = new Config();
conf.put(TextExtractor.INCLUDE_PARAM_NAME, "DIV[id=\"maincontent\"]");

TextExtractor extractor = new TextExtractor(conf);

String content = "<html>the<div id='maincontent'>main<div>content</div></div>of the page</html>";

Document jsoupDoc = Parser.htmlParser().parseInput(content,
"http://stormcrawler.net");
String text = extractor.text(jsoupDoc.body());

assertEquals("main content", text);
}

@Test
public void testExclusion() throws IOException {
Config conf = new Config();
conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "STYLE");

TextExtractor extractor = new TextExtractor(conf);

String content = "<html>the<style>main</style>content of the page</html>";

Document jsoupDoc = Parser.htmlParser().parseInput(content,
"http://stormcrawler.net");
String text = extractor.text(jsoupDoc.body());

assertEquals("the content of the page", text);
}

@Test
public void testExclusionCase() throws IOException {
Config conf = new Config();
conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "style");

TextExtractor extractor = new TextExtractor(conf);

String content = "<html>the<STYLE>main</STYLE>content of the page</html>";

Document jsoupDoc = Parser.htmlParser().parseInput(content,
"http://stormcrawler.net");
String text = extractor.text(jsoupDoc.body());

assertEquals("the content of the page", text);
}

}
