Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added configurable TextExtractor to JSoupParserBolt + commented out ContentFilter in archetype and configure TextExtractor instead (#678)
- Loading branch information
Showing
6 changed files
with
278 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
180 changes: 180 additions & 0 deletions
180
core/src/main/java/com/digitalpebble/stormcrawler/parse/TextExtractor.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
/** | ||
* Licensed to DigitalPebble Ltd under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* DigitalPebble licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package com.digitalpebble.stormcrawler.parse; | ||
|
||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
import org.jsoup.helper.StringUtil; | ||
import org.jsoup.nodes.CDataNode; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.nodes.Node; | ||
import org.jsoup.nodes.TextNode; | ||
import org.jsoup.select.Elements; | ||
import org.jsoup.select.NodeTraversor; | ||
import org.jsoup.select.NodeVisitor; | ||
|
||
import com.digitalpebble.stormcrawler.util.ConfUtils; | ||
|
||
/** | ||
* Filters the text extracted from HTML documents, used by JSoupParserBolt. | ||
* Configured with optional inclusion patterns based on <a | ||
* href="https://jsoup.org/cookbook/extracting-data/selector-syntax"> JSoup | ||
* selectors</a>, as well as a list of tags to be excluded. | ||
* | ||
* Replaces {@link ContentFilter}. | ||
* | ||
* The first matching inclusion pattern is used or the whole document if no | ||
* expressions are configured or no match has been found. | ||
* | ||
* The TextExtractor can be configured as follows: | ||
* | ||
* <pre> | ||
* {@code | ||
* textextractor.include.pattern: | ||
* - DIV[id="maincontent"] | ||
* - DIV[itemprop="articleBody"] | ||
* - ARTICLE | ||
* | ||
* textextractor.exclude.tags: | ||
* - STYLE | ||
* - SCRIPT | ||
* } | ||
* </pre> | ||
* | ||
* @since 1.13 | ||
**/ | ||
public class TextExtractor { | ||
|
||
public final static String INCLUDE_PARAM_NAME = "textextractor.include.pattern"; | ||
public final static String EXCLUDE_PARAM_NAME = "textextractor.exclude.tags"; | ||
|
||
private List<String> inclusionPatterns; | ||
private HashSet<String> excludedTags; | ||
|
||
public TextExtractor(Map stormConf) { | ||
inclusionPatterns = ConfUtils.loadListFromConf(INCLUDE_PARAM_NAME, | ||
stormConf); | ||
excludedTags = new HashSet<String>(); | ||
ConfUtils.loadListFromConf(EXCLUDE_PARAM_NAME, stormConf) | ||
.forEach((s) -> excludedTags.add(s.toLowerCase())); | ||
} | ||
|
||
public String text(Element element) { | ||
// no patterns at all - return the text from the whole document | ||
if (inclusionPatterns.size() == 0 && excludedTags.size() == 0) { | ||
return _text(element); | ||
} | ||
|
||
Elements matches = new Elements(); | ||
|
||
for (String pattern : inclusionPatterns) { | ||
matches = element.select(pattern); | ||
if (!matches.isEmpty()) | ||
break; | ||
} | ||
|
||
// if nothing matches or no patterns were defined use the whole doc | ||
if (matches.isEmpty()) { | ||
matches.add(element); | ||
} | ||
|
||
final StringBuilder accum = new StringBuilder(); | ||
|
||
for (Element node : matches) { | ||
accum.append(_text(node)).append("\n"); | ||
} | ||
|
||
return accum.toString().trim(); | ||
} | ||
|
||
private String _text(Node node) { | ||
final StringBuilder accum = new StringBuilder(); | ||
NodeTraversor.traverse(new NodeVisitor() { | ||
|
||
private Node excluded = null; | ||
|
||
public void head(Node node, int depth) { | ||
if (excluded == null && node instanceof TextNode) { | ||
TextNode textNode = (TextNode) node; | ||
appendNormalisedText(accum, textNode); | ||
} else if (node instanceof Element) { | ||
Element element = (Element) node; | ||
if (excludedTags.contains(element.tagName())) { | ||
excluded = element; | ||
} | ||
if (accum.length() > 0 | ||
&& (element.isBlock() || element.tag().getName() | ||
.equals("br")) | ||
&& !lastCharIsWhitespace(accum)) | ||
accum.append(' '); | ||
} | ||
} | ||
|
||
public void tail(Node node, int depth) { | ||
// make sure there is a space between block tags and immediately | ||
// following text nodes <div>One</div>Two should be "One Two". | ||
if (node instanceof Element) { | ||
Element element = (Element) node; | ||
if (element == excluded) { | ||
excluded = null; | ||
} | ||
if (element.isBlock() | ||
&& (node.nextSibling() instanceof TextNode) | ||
&& !lastCharIsWhitespace(accum)) | ||
accum.append(' '); | ||
} | ||
|
||
} | ||
}, node); | ||
return accum.toString().trim(); | ||
} | ||
|
||
private static void appendNormalisedText(StringBuilder accum, | ||
TextNode textNode) { | ||
String text = textNode.getWholeText(); | ||
|
||
if (preserveWhitespace(textNode.parent()) | ||
|| textNode instanceof CDataNode) | ||
accum.append(text); | ||
else | ||
StringUtil.appendNormalisedWhitespace(accum, text, | ||
lastCharIsWhitespace(accum)); | ||
} | ||
|
||
static boolean preserveWhitespace(Node node) { | ||
// looks only at this element and five levels up, to prevent recursion & | ||
// needless stack searches | ||
if (node != null && node instanceof Element) { | ||
Element el = (Element) node; | ||
int i = 0; | ||
do { | ||
if (el.tag().preserveWhitespace()) | ||
return true; | ||
el = el.parent(); | ||
i++; | ||
} while (i < 6 && el != null); | ||
} | ||
return false; | ||
} | ||
|
||
static boolean lastCharIsWhitespace(StringBuilder sb) { | ||
return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' '; | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
79 changes: 79 additions & 0 deletions
79
core/src/test/java/com/digitalpebble/stormcrawler/parse/TextExtractorTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
/** | ||
* Licensed to DigitalPebble Ltd under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* DigitalPebble licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.digitalpebble.stormcrawler.parse; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
|
||
import java.io.IOException; | ||
|
||
import org.apache.storm.Config; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.parser.Parser; | ||
import org.junit.Test; | ||
|
||
public class TextExtractorTest { | ||
|
||
@Test | ||
public void testMainContent() throws IOException { | ||
Config conf = new Config(); | ||
conf.put(TextExtractor.INCLUDE_PARAM_NAME, "DIV[id=\"maincontent\"]"); | ||
|
||
TextExtractor extractor = new TextExtractor(conf); | ||
|
||
String content = "<html>the<div id='maincontent'>main<div>content</div></div>of the page</html>"; | ||
|
||
Document jsoupDoc = Parser.htmlParser().parseInput(content, | ||
"http://stormcrawler.net"); | ||
String text = extractor.text(jsoupDoc.body()); | ||
|
||
assertEquals("main content", text); | ||
} | ||
|
||
@Test | ||
public void testExclusion() throws IOException { | ||
Config conf = new Config(); | ||
conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "STYLE"); | ||
|
||
TextExtractor extractor = new TextExtractor(conf); | ||
|
||
String content = "<html>the<style>main</style>content of the page</html>"; | ||
|
||
Document jsoupDoc = Parser.htmlParser().parseInput(content, | ||
"http://stormcrawler.net"); | ||
String text = extractor.text(jsoupDoc.body()); | ||
|
||
assertEquals("the content of the page", text); | ||
} | ||
|
||
@Test | ||
public void testExclusionCase() throws IOException { | ||
Config conf = new Config(); | ||
conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "style"); | ||
|
||
TextExtractor extractor = new TextExtractor(conf); | ||
|
||
String content = "<html>the<STYLE>main</STYLE>content of the page</html>"; | ||
|
||
Document jsoupDoc = Parser.htmlParser().parseInput(content, | ||
"http://stormcrawler.net"); | ||
String text = extractor.text(jsoupDoc.body()); | ||
|
||
assertEquals("the content of the page", text); | ||
} | ||
|
||
} |