diff --git a/norconex-collector-http/src/main/java/com/norconex/collector/http/crawler/HttpCrawler.java b/norconex-collector-http/src/main/java/com/norconex/collector/http/crawler/HttpCrawler.java
index b1f348018..1b4be2d04 100644
--- a/norconex-collector-http/src/main/java/com/norconex/collector/http/crawler/HttpCrawler.java
+++ b/norconex-collector-http/src/main/java/com/norconex/collector/http/crawler/HttpCrawler.java
@@ -358,7 +358,9 @@ protected ImporterResponse executeImporterPipeline(
         HttpImporterPipelineContext httpContext =
                 new HttpImporterPipelineContext(importerContext);
         new HttpImporterPipeline(
                 getCrawlerConfig().isKeepDownloads(),
-                importerContext.isOrphan()).execute(httpContext);
+                importerContext.isOrphan(),
+                getCrawlerConfig().isLinkExtractorQuitAtDepth()
+                ).execute(httpContext);
         return httpContext.getImporterResponse();
     }
diff --git a/norconex-collector-http/src/main/java/com/norconex/collector/http/crawler/HttpCrawlerConfig.java b/norconex-collector-http/src/main/java/com/norconex/collector/http/crawler/HttpCrawlerConfig.java
index f9735324f..db319ca21 100644
--- a/norconex-collector-http/src/main/java/com/norconex/collector/http/crawler/HttpCrawlerConfig.java
+++ b/norconex-collector-http/src/main/java/com/norconex/collector/http/crawler/HttpCrawlerConfig.java
@@ -86,6 +86,7 @@ public class HttpCrawlerConfig extends AbstractCrawlerConfig {
     private boolean ignoreCanonicalLinks;
     private boolean keepOutOfScopeLinks;
     private boolean skipMetaFetcherOnBadStatus;
+    private boolean linkExtractorQuitAtDepth;

     private String userAgent;

@@ -295,6 +296,12 @@ public boolean isKeepDownloads() {
     public void setKeepDownloads(boolean keepDownloads) {
         this.keepDownloads = keepDownloads;
     }
+    public boolean isLinkExtractorQuitAtDepth() {
+        return linkExtractorQuitAtDepth;
+    }
+    public void setLinkExtractorQuitAtDepth(boolean linkExtractorQuitAtDepth) {
+        this.linkExtractorQuitAtDepth = linkExtractorQuitAtDepth;
+    }
     /**
      * Whether links not in scope should be stored as metadata
      * under {@link HttpMetadata#COLLECTOR_REFERENCED_URLS_OUT_OF_SCOPE}
@@ -483,6 +490,8 @@ protected void saveCrawlerConfigToXML(Writer out) throws IOException {
             writer.writeElementBoolean(
                     "keepOutOfScopeLinks", isKeepOutOfScopeLinks());
+            writer.writeElementBoolean(
+                    "linkExtractorQuitAtDepth", isLinkExtractorQuitAtDepth());
             writer.writeStartElement("startURLs");
             writer.writeAttributeBoolean("stayOnProtocol",
                     urlCrawlScopeStrategy.isStayOnProtocol());
@@ -666,6 +675,8 @@ private void loadSimpleSettings(XMLConfiguration xml) {
         setKeepDownloads(xml.getBoolean("keepDownloads", isKeepDownloads()));
         setKeepOutOfScopeLinks(
                 xml.getBoolean("keepOutOfScopeLinks", isKeepOutOfScopeLinks()));
+        setLinkExtractorQuitAtDepth(xml.getBoolean(
+                "linkExtractorQuitAtDepth", isLinkExtractorQuitAtDepth()));
         setIgnoreCanonicalLinks(xml.getBoolean(
                 "ignoreCanonicalLinks", isIgnoreCanonicalLinks()));
         urlCrawlScopeStrategy.setStayOnProtocol(xml.getBoolean(
@@ -756,6 +767,7 @@ public boolean equals(final Object other) {
                 .append(ignoreSitemap, castOther.ignoreSitemap)
                 .append(keepDownloads, castOther.keepDownloads)
                 .append(keepOutOfScopeLinks, castOther.keepOutOfScopeLinks)
+                .append(linkExtractorQuitAtDepth, castOther.linkExtractorQuitAtDepth)
                 .append(ignoreCanonicalLinks, castOther.ignoreCanonicalLinks)
                 .append(skipMetaFetcherOnBadStatus,
                         castOther.skipMetaFetcherOnBadStatus)
@@ -794,6 +806,7 @@ public int hashCode() {
                 .append(ignoreSitemap)
                 .append(keepDownloads)
                 .append(keepOutOfScopeLinks)
+                .append(linkExtractorQuitAtDepth)
                 .append(ignoreCanonicalLinks)
                 .append(skipMetaFetcherOnBadStatus)
                 .append(userAgent)
@@ -830,6 +843,7 @@ public String toString() {
                 .append("ignoreSitemap", ignoreSitemap)
                 .append("keepDownloads", keepDownloads)
                 .append("keepOutOfScopeLinks", keepOutOfScopeLinks)
+                .append("linkExtractorQuitAtDepth", linkExtractorQuitAtDepth)
                 .append("ignoreCanonicalLinks", ignoreCanonicalLinks)
                 .append("skipMetaFetcherOnBadStatus",
                         skipMetaFetcherOnBadStatus)
diff --git a/norconex-collector-http/src/main/java/com/norconex/collector/http/pipeline/importer/HttpImporterPipeline.java b/norconex-collector-http/src/main/java/com/norconex/collector/http/pipeline/importer/HttpImporterPipeline.java
index c257db5be..1eee196ec 100644
--- a/norconex-collector-http/src/main/java/com/norconex/collector/http/pipeline/importer/HttpImporterPipeline.java
+++ b/norconex-collector-http/src/main/java/com/norconex/collector/http/pipeline/importer/HttpImporterPipeline.java
@@ -47,7 +47,7 @@ public class HttpImporterPipeline

     //sharing all thread safe/common information,
     //just changing what is url/doc specific.
-    public HttpImporterPipeline(boolean isKeepDownloads, boolean isOrphan) {
+    public HttpImporterPipeline(boolean isKeepDownloads, boolean isOrphan, boolean isLinkExtractorQuitAtDepth) {

         // if an orphan is reprocessed, it could be that it is no longer
         // referenced because of deletion. Because of that, we need
@@ -73,7 +73,7 @@ public HttpImporterPipeline(boolean isKeepDownloads, boolean isOrphan) {
         addStage(new MetadataCanonicalGETStage());
         addStage(new DocumentCanonicalStage());
         addStage(new RobotsMetaCreateStage());
-        addStage(new LinkExtractorStage());
+        addStage(new LinkExtractorStage(isLinkExtractorQuitAtDepth));
         addStage(new RobotsMetaNoIndexStage());
         addStage(new MetadataFiltersGETStage());
         addStage(new MetadataChecksumStage(false));
diff --git a/norconex-collector-http/src/main/java/com/norconex/collector/http/pipeline/importer/LinkExtractorStage.java b/norconex-collector-http/src/main/java/com/norconex/collector/http/pipeline/importer/LinkExtractorStage.java
index 351fda3fb..02f0ca2f3 100644
--- a/norconex-collector-http/src/main/java/com/norconex/collector/http/pipeline/importer/LinkExtractorStage.java
+++ b/norconex-collector-http/src/main/java/com/norconex/collector/http/pipeline/importer/LinkExtractorStage.java
@@ -44,6 +44,12 @@
     private static final Logger LOG =
             LogManager.getLogger(LinkExtractorStage.class);

+    private final boolean quitAtDepth;
+
+    public LinkExtractorStage(boolean quitAtDepth) {
+        this.quitAtDepth = quitAtDepth;
+    }
+
     @Override
     public boolean executeStage(HttpImporterPipelineContext ctx) {
@@ -52,6 +58,15 @@ public boolean executeStage(HttpImporterPipelineContext ctx) {
             return true;
         }

+        if (quitAtDepth) {
+            int depth = ctx.getCrawlData().getDepth();
+            int maxDepth = ctx.getConfig().getMaxDepth();
+            // Links found at maxDepth would only be rejected as too deep.
+            if (depth == maxDepth) {
+                LOG.debug("Max depth reached; skipping link extraction.");
+                return true;
+            }
+        }
         String reference = ctx.getCrawlData().getReference();
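
Usage sketch (not part of the patch): the snippet below shows how the option introduced by this change could be enabled programmatically. Only setLinkExtractorQuitAtDepth(boolean) comes from this patch; setMaxDepth(int) is the existing depth setting, and the class name and values are illustrative.

import com.norconex.collector.http.crawler.HttpCrawlerConfig;

public class QuitAtDepthExample {
    public static void main(String[] args) {
        HttpCrawlerConfig config = new HttpCrawlerConfig();

        // Existing behavior: URLs more than two hops from the start URLs
        // are rejected as too deep before they are processed.
        config.setMaxDepth(2);

        // New flag from this patch: documents sitting exactly at maxDepth are
        // still fetched and imported, but LinkExtractorStage returns early,
        // so their outgoing links are never extracted and queued only to be
        // rejected later.
        config.setLinkExtractorQuitAtDepth(true);
    }
}

In an XML crawler configuration, the equivalent setting would be a <linkExtractorQuitAtDepth>true</linkExtractorQuitAtDepth> element, matching the element name read in loadSimpleSettings() and written in saveCrawlerConfigToXML() above.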