Added configurable TextExtractor to JSoupParserBolt + commented out ContentFilter in archetype and configure TextExtractor instead (#678)
jnioche committed Jan 2, 2019
1 parent 9c70cbc commit 25b6f30
Showing 6 changed files with 278 additions and 13 deletions.
10 changes: 10 additions & 0 deletions archetype/src/main/resources/archetype-resources/crawler-conf.yaml
@@ -64,6 +64,16 @@ config:
# never revisit a page with an error (or set a value in minutes)
fetchInterval.error: -1

# text extraction for JSoupParserBolt
textextractor.include.pattern:
- DIV[id="maincontent"]
- DIV[itemprop="articleBody"]
- ARTICLE

textextractor.exclude.tags:
- STYLE
- SCRIPT

# custom fetch interval to be used when a document has the key/value in its metadata
# and has been fetched successfully (value in minutes)
# fetchInterval.FETCH_ERROR.isFeed=true: 30
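The same settings can also be built programmatically. Below is a minimal, hypothetical sketch of passing these keys to the new TextExtractor through an org.apache.storm.Config map, as the unit tests further down do; feeding list values this way is an assumption based on the YAML form above.

import java.util.Arrays;

import org.apache.storm.Config;

import com.digitalpebble.stormcrawler.parse.TextExtractor;

public class TextExtractorConfigSketch {
    public static void main(String[] args) {
        Config conf = new Config();
        // same keys and values as in the YAML section above
        conf.put(TextExtractor.INCLUDE_PARAM_NAME, Arrays.asList(
                "DIV[id=\"maincontent\"]", "DIV[itemprop=\"articleBody\"]",
                "ARTICLE"));
        conf.put(TextExtractor.EXCLUDE_PARAM_NAME,
                Arrays.asList("STYLE", "SCRIPT"));
        TextExtractor extractor = new TextExtractor(conf);
    }
}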
@@ -16,15 +16,6 @@
"parse.keywords": "//META[@name=\"keywords\"]/@content"
}
},
{
"class": "com.digitalpebble.stormcrawler.parse.filter.ContentFilter",
"name": "ContentFilter",
"params": {
"pattern": "//DIV[@id=\"maincontent\"]",
"pattern2": "//DIV[@itemprop=\"articleBody\"]",
"pattern3": "//ARTICLE"
}
},
{
"class": "com.digitalpebble.stormcrawler.parse.filter.DomainParseFilter",
"name": "DomainParseFilter",
@@ -57,6 +57,7 @@
import com.digitalpebble.stormcrawler.parse.ParseFilter;
import com.digitalpebble.stormcrawler.parse.ParseFilters;
import com.digitalpebble.stormcrawler.parse.ParseResult;
import com.digitalpebble.stormcrawler.parse.TextExtractor;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.protocol.HttpHeaders;
import com.digitalpebble.stormcrawler.util.CharsetIdentification;
@@ -106,6 +107,8 @@ public class JSoupParserBolt extends StatusEmitterBolt {
**/
private int maxLengthCharsetDetection = -1;

private TextExtractor textExtractor;

@SuppressWarnings({ "rawtypes", "unchecked" })
@Override
public void prepare(Map conf, TopologyContext context,
@@ -135,6 +138,8 @@ public void prepare(Map conf, TopologyContext context,

maxOutlinksPerPage = ConfUtils.getInt(conf,
"parser.emitOutlinks.max.per.page", -1);

textExtractor = new TextExtractor(conf);
}

@Override
@@ -266,7 +271,7 @@ public void execute(Tuple tuple) {

Element body = jsoupDoc.body();
if (body != null) {
text = body.text();
text = textExtractor.text(body);
}

} catch (Throwable e) {
@@ -0,0 +1,180 @@
/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.parse;

import java.util.HashSet;
import java.util.List;
import java.util.Map;

import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

import com.digitalpebble.stormcrawler.util.ConfUtils;

/**
* Filters the text extracted from HTML documents, used by JSoupParserBolt.
* Configured with optional inclusion patterns based on <a
* href="https://jsoup.org/cookbook/extracting-data/selector-syntax"> JSoup
* selectors</a>, as well as a list of tags to be excluded.
*
* Replaces {@link ContentFilter}.
*
 * The first matching inclusion pattern is used; if no patterns are configured
 * or none match, the whole document is used.
 *
 * The TextExtractor can be configured as follows:
*
* <pre>
* {@code
* textextractor.include.pattern:
* - DIV[id="maincontent"]
* - DIV[itemprop="articleBody"]
* - ARTICLE
*
* textextractor.exclude.tags:
* - STYLE
* - SCRIPT
* }
* </pre>
*
* @since 1.13
**/
public class TextExtractor {

public final static String INCLUDE_PARAM_NAME = "textextractor.include.pattern";
public final static String EXCLUDE_PARAM_NAME = "textextractor.exclude.tags";

private List<String> inclusionPatterns;
private HashSet<String> excludedTags;

public TextExtractor(Map stormConf) {
inclusionPatterns = ConfUtils.loadListFromConf(INCLUDE_PARAM_NAME,
stormConf);
excludedTags = new HashSet<String>();
ConfUtils.loadListFromConf(EXCLUDE_PARAM_NAME, stormConf)
.forEach((s) -> excludedTags.add(s.toLowerCase()));
}

public String text(Element element) {
// no patterns at all - return the text from the whole document
if (inclusionPatterns.size() == 0 && excludedTags.size() == 0) {
return _text(element);
}

Elements matches = new Elements();

for (String pattern : inclusionPatterns) {
matches = element.select(pattern);
if (!matches.isEmpty())
break;
}

// if nothing matches or no patterns were defined use the whole doc
if (matches.isEmpty()) {
matches.add(element);
}

final StringBuilder accum = new StringBuilder();

for (Element node : matches) {
accum.append(_text(node)).append("\n");
}

return accum.toString().trim();
}

private String _text(Node node) {
final StringBuilder accum = new StringBuilder();
NodeTraversor.traverse(new NodeVisitor() {

private Node excluded = null;

public void head(Node node, int depth) {
if (excluded == null && node instanceof TextNode) {
TextNode textNode = (TextNode) node;
appendNormalisedText(accum, textNode);
} else if (node instanceof Element) {
Element element = (Element) node;
if (excludedTags.contains(element.tagName())) {
excluded = element;
}
if (accum.length() > 0
&& (element.isBlock() || element.tag().getName()
.equals("br"))
&& !lastCharIsWhitespace(accum))
accum.append(' ');
}
}

public void tail(Node node, int depth) {
// make sure there is a space between block tags and immediately
// following text nodes, e.g. <div>One</div>Two should be "One Two".
if (node instanceof Element) {
Element element = (Element) node;
if (element == excluded) {
excluded = null;
}
if (element.isBlock()
&& (node.nextSibling() instanceof TextNode)
&& !lastCharIsWhitespace(accum))
accum.append(' ');
}

}
}, node);
return accum.toString().trim();
}

private static void appendNormalisedText(StringBuilder accum,
TextNode textNode) {
String text = textNode.getWholeText();

if (preserveWhitespace(textNode.parent())
|| textNode instanceof CDataNode)
accum.append(text);
else
StringUtil.appendNormalisedWhitespace(accum, text,
lastCharIsWhitespace(accum));
}

static boolean preserveWhitespace(Node node) {
// looks only at this element and five levels up, to prevent recursion &
// needless stack searches
if (node != null && node instanceof Element) {
Element el = (Element) node;
int i = 0;
do {
if (el.tag().preserveWhitespace())
return true;
el = el.parent();
i++;
} while (i < 6 && el != null);
}
return false;
}

static boolean lastCharIsWhitespace(StringBuilder sb) {
return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' ';
}

}
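For context, here is a short usage sketch of the new class, mirroring what JSoupParserBolt now does in execute(); the HTML string and URL are made-up placeholders.

import org.apache.storm.Config;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;

import com.digitalpebble.stormcrawler.parse.TextExtractor;

public class TextExtractorUsageSketch {
    public static void main(String[] args) {
        Config conf = new Config();
        conf.put(TextExtractor.INCLUDE_PARAM_NAME, "ARTICLE");
        conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "SCRIPT");
        TextExtractor extractor = new TextExtractor(conf);

        // placeholder document; in JSoupParserBolt the HTML comes from the
        // fetched page content
        Document doc = Parser.htmlParser().parseInput(
                "<html><article>Hello <script>skip();</script>world</article></html>",
                "http://stormcrawler.net");

        // prints "Hello world": only the ARTICLE element is kept and the
        // SCRIPT content is dropped
        System.out.println(extractor.text(doc.body()));
    }
}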
@@ -45,6 +45,9 @@
* Restricts the text of the main document based on the text value of an XPath
* expression (e.g. &lt;div id='maincontent'&gt;). This is useful when dealing
* with a known format to get rid of the boilerplate HTML code.
*
* @deprecated use {@link TextExtractor} to exclude tags and get spaces between
* elements.
**/
public class ContentFilter extends ParseFilter {

@@ -62,9 +65,6 @@ public void filter(String URL, byte[] content, DocumentFragment doc,

ParseData pd = parse.get(URL);

// TODO determine how to restrict the expressions e.g. regexp on URL
// or value in metadata

// iterates on the expressions - stops at the first that matches
for (LabelledExpression expression : expressions) {
try {
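To illustrate the "spaces between elements" point in the deprecation note above, a small hypothetical sketch: with no patterns or exclusions configured, TextExtractor falls back to the whole document and still inserts a space at block boundaries.

import org.apache.storm.Config;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;

import com.digitalpebble.stormcrawler.parse.TextExtractor;

public class BlockSpacingSketch {
    public static void main(String[] args) {
        // no include patterns and no excluded tags: the whole document is used
        TextExtractor extractor = new TextExtractor(new Config());
        Document doc = Parser.htmlParser().parseInput(
                "<html><div>One</div>Two</html>", "http://stormcrawler.net");
        // prints "One Two" rather than "OneTwo": a space is added after the
        // block-level DIV
        System.out.println(extractor.text(doc.body()));
    }
}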
@@ -0,0 +1,79 @@
/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.digitalpebble.stormcrawler.parse;

import static org.junit.Assert.assertEquals;

import java.io.IOException;

import org.apache.storm.Config;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.junit.Test;

public class TextExtractorTest {

@Test
public void testMainContent() throws IOException {
Config conf = new Config();
conf.put(TextExtractor.INCLUDE_PARAM_NAME, "DIV[id=\"maincontent\"]");

TextExtractor extractor = new TextExtractor(conf);

String content = "<html>the<div id='maincontent'>main<div>content</div></div>of the page</html>";

Document jsoupDoc = Parser.htmlParser().parseInput(content,
"http://stormcrawler.net");
String text = extractor.text(jsoupDoc.body());

assertEquals("main content", text);
}

@Test
public void testExclusion() throws IOException {
Config conf = new Config();
conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "STYLE");

TextExtractor extractor = new TextExtractor(conf);

String content = "<html>the<style>main</style>content of the page</html>";

Document jsoupDoc = Parser.htmlParser().parseInput(content,
"http://stormcrawler.net");
String text = extractor.text(jsoupDoc.body());

assertEquals("the content of the page", text);
}

@Test
public void testExclusionCase() throws IOException {
Config conf = new Config();
conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "style");

TextExtractor extractor = new TextExtractor(conf);

String content = "<html>the<STYLE>main</STYLE>content of the page</html>";

Document jsoupDoc = Parser.htmlParser().parseInput(content,
"http://stormcrawler.net");
String text = extractor.text(jsoupDoc.body());

assertEquals("the content of the page", text);
}

}
