Norconex#498 option for link extractor to quit at depth
simonwibberley committed Nov 3, 2020
1 parent 5f52e48 commit 9bcbb6c
Showing 4 changed files with 34 additions and 3 deletions.
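
The change, in short: when linkExtractorQuitAtDepth is enabled, LinkExtractorStage skips link extraction for documents that are already at the crawler's maximum depth, since links found there would fall beyond the depth limit anyway. A minimal sketch of enabling the option programmatically (setLinkExtractorQuitAtDepth is added by this commit; setMaxDepth is assumed from the existing crawler config API):

    HttpCrawlerConfig cfg = new HttpCrawlerConfig();
    cfg.setMaxDepth(2);                    // follow links at most 2 hops from a start URL
    cfg.setLinkExtractorQuitAtDepth(true); // do not extract links from depth-2 pages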
@@ -358,7 +358,9 @@ protected ImporterResponse executeImporterPipeline(
        HttpImporterPipelineContext httpContext =
                new HttpImporterPipelineContext(importerContext);
        new HttpImporterPipeline(
                getCrawlerConfig().isKeepDownloads(),
-               importerContext.isOrphan()).execute(httpContext);
+               importerContext.isOrphan(),
+               getCrawlerConfig().isLinkExtractorQuitAtDepth()
+       ).execute(httpContext);
        return httpContext.getImporterResponse();
    }

@@ -86,6 +86,7 @@ public class HttpCrawlerConfig extends AbstractCrawlerConfig {
    private boolean ignoreCanonicalLinks;
    private boolean keepOutOfScopeLinks;
    private boolean skipMetaFetcherOnBadStatus;
+   private boolean linkExtractorQuitAtDepth;

    private String userAgent;

@@ -295,6 +296,12 @@ public boolean isKeepDownloads() {
    public void setKeepDownloads(boolean keepDownloads) {
        this.keepDownloads = keepDownloads;
    }
+   public boolean isLinkExtractorQuitAtDepth() {
+       return linkExtractorQuitAtDepth;
+   }
+   public void setLinkExtractorQuitAtDepth(boolean linkExtractorQuitAtDepth) {
+       this.linkExtractorQuitAtDepth = linkExtractorQuitAtDepth;
+   }
    /**
     * Whether links not in scope should be stored as metadata
     * under {@link HttpMetadata#COLLECTOR_REFERENCED_URLS_OUT_OF_SCOPE}
@@ -483,6 +490,8 @@ protected void saveCrawlerConfigToXML(Writer out) throws IOException {

        writer.writeElementBoolean(
                "keepOutOfScopeLinks", isKeepOutOfScopeLinks());
+       writer.writeElementBoolean(
+               "linkExtractorQuitAtDepth", isLinkExtractorQuitAtDepth());
        writer.writeStartElement("startURLs");
        writer.writeAttributeBoolean("stayOnProtocol",
                urlCrawlScopeStrategy.isStayOnProtocol());
@@ -666,6 +675,8 @@ private void loadSimpleSettings(XMLConfiguration xml) {
        setKeepDownloads(xml.getBoolean("keepDownloads", isKeepDownloads()));
        setKeepOutOfScopeLinks(
                xml.getBoolean("keepOutOfScopeLinks", isKeepOutOfScopeLinks()));
+       setLinkExtractorQuitAtDepth(
+               xml.getBoolean("linkExtractorQuitAtDepth", isLinkExtractorQuitAtDepth()));
        setIgnoreCanonicalLinks(xml.getBoolean(
                "ignoreCanonicalLinks", isIgnoreCanonicalLinks()));
        urlCrawlScopeStrategy.setStayOnProtocol(xml.getBoolean(
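
As a point of reference, a self-contained sketch of the lookup loadSimpleSettings performs, assuming the Commons Configuration 1.x XMLConfiguration the surrounding code appears to use (class name and inline XML are illustrative only):

    import java.io.StringReader;
    import org.apache.commons.configuration.XMLConfiguration;

    public class QuitAtDepthConfigDemo {
        public static void main(String[] args) throws Exception {
            XMLConfiguration xml = new XMLConfiguration();
            xml.load(new StringReader(
                    "<crawler>"
                    + "<linkExtractorQuitAtDepth>true</linkExtractorQuitAtDepth>"
                    + "</crawler>"));
            // Same call shape as loadSimpleSettings above; the second argument
            // is the fallback used when the element is absent.
            System.out.println(xml.getBoolean("linkExtractorQuitAtDepth", false)); // true
        }
    }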
@@ -756,6 +767,7 @@ public boolean equals(final Object other) {
                .append(ignoreSitemap, castOther.ignoreSitemap)
                .append(keepDownloads, castOther.keepDownloads)
                .append(keepOutOfScopeLinks, castOther.keepOutOfScopeLinks)
+               .append(linkExtractorQuitAtDepth, castOther.linkExtractorQuitAtDepth)
                .append(ignoreCanonicalLinks, castOther.ignoreCanonicalLinks)
                .append(skipMetaFetcherOnBadStatus,
                        castOther.skipMetaFetcherOnBadStatus)
@@ -794,6 +806,7 @@ public int hashCode() {
                .append(ignoreSitemap)
                .append(keepDownloads)
                .append(keepOutOfScopeLinks)
+               .append(linkExtractorQuitAtDepth)
                .append(ignoreCanonicalLinks)
                .append(skipMetaFetcherOnBadStatus)
                .append(userAgent)
@@ -830,6 +843,7 @@ public String toString() {
                .append("ignoreSitemap", ignoreSitemap)
                .append("keepDownloads", keepDownloads)
                .append("keepOutOfScopeLinks", keepOutOfScopeLinks)
+               .append("linkExtractorQuitAtDepth", linkExtractorQuitAtDepth)
                .append("ignoreCanonicalLinks", ignoreCanonicalLinks)
                .append("skipMetaFetcherOnBadStatus",
                        skipMetaFetcherOnBadStatus)

@@ -47,7 +47,7 @@ public class HttpImporterPipeline
    //sharing all thread safe/common information,
    //just changing what is url/doc specific.

-   public HttpImporterPipeline(boolean isKeepDownloads, boolean isOrphan) {
+   public HttpImporterPipeline(boolean isKeepDownloads, boolean isOrphan, boolean isLinkExtractorQuitAtDepth) {

        // if an orphan is reprocessed, it could be that it is no longer
        // referenced because of deletion. Because of that, we need
@@ -73,7 +73,7 @@ public HttpImporterPipeline(boolean isKeepDownloads, boolean isOrphan) {
        addStage(new MetadataCanonicalGETStage());
        addStage(new DocumentCanonicalStage());
        addStage(new RobotsMetaCreateStage());
-       addStage(new LinkExtractorStage());
+       addStage(new LinkExtractorStage(isLinkExtractorQuitAtDepth));
        addStage(new RobotsMetaNoIndexStage());
        addStage(new MetadataFiltersGETStage());
        addStage(new MetadataChecksumStage(false));

@@ -44,6 +44,12 @@
    private static final Logger LOG =
            LogManager.getLogger(LinkExtractorStage.class);

+   private final boolean quitAtDepth;
+
+   public LinkExtractorStage(boolean quitAtDepth) {
+       this.quitAtDepth = quitAtDepth;
+   }
+
    @Override
    public boolean executeStage(HttpImporterPipelineContext ctx) {

@@ -52,6 +58,15 @@ public boolean executeStage(HttpImporterPipelineContext ctx) {
            return true;
        }

+       if (quitAtDepth) {
+           int depth = ctx.getCrawlData().getDepth();
+           int maxDepth = ctx.getConfig().getMaxDepth();
+
+           if (depth == maxDepth) {
+               LOG.debug("Max depth reached; skipping link extraction.");
+               return true;
+           }
+       }

        String reference = ctx.getCrawlData().getReference();

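To pin down the new guard's behaviour, a stand-alone sketch (class and method are hypothetical; the predicate mirrors the stage above):

    public class QuitAtDepthDemo {
        // Extract links unless the option is on and the document already
        // sits at the configured maximum depth.
        static boolean shouldExtractLinks(boolean quitAtDepth, int depth, int maxDepth) {
            return !(quitAtDepth && depth == maxDepth);
        }

        public static void main(String[] args) {
            System.out.println(shouldExtractLinks(false, 2, 2)); // true  (option off)
            System.out.println(shouldExtractLinks(true, 1, 2));  // true  (still below max depth)
            System.out.println(shouldExtractLinks(true, 2, 2));  // false (at max depth: quit)
        }
    }

The equality test (rather than >=) appears sufficient on the assumption that the crawler never queues references deeper than maxDepth in the first place.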
