Norconex#498 option for link extractor to quit at depth
simonwibberley committed Nov 3, 2020
1 parent 5f52e48 commit 9bcbb6c
Showing 4 changed files with 34 additions and 3 deletions.
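
The change, in short: when linkExtractorQuitAtDepth is enabled, LinkExtractorStage skips link extraction for documents that are already at the crawler's maximum depth, since links found there would fall beyond the depth limit anyway. A minimal sketch of enabling the option programmatically (setLinkExtractorQuitAtDepth is added by this commit; setMaxDepth is assumed from the existing crawler config API):

    HttpCrawlerConfig cfg = new HttpCrawlerConfig();
    cfg.setMaxDepth(2);                    // follow links at most 2 hops from a start URL
    cfg.setLinkExtractorQuitAtDepth(true); // do not extract links from depth-2 pages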
@@ -358,7 +358,9 @@ protected ImporterResponse executeImporterPipeline(
        HttpImporterPipelineContext httpContext =
                new HttpImporterPipelineContext(importerContext);
        new HttpImporterPipeline(
                getCrawlerConfig().isKeepDownloads(),
-               importerContext.isOrphan()).execute(httpContext);
+               importerContext.isOrphan(),
+               getCrawlerConfig().isLinkExtractorQuitAtDepth()
+       ).execute(httpContext);
        return httpContext.getImporterResponse();
    }

@@ -86,6 +86,7 @@ public class HttpCrawlerConfig extends AbstractCrawlerConfig {
    private boolean ignoreCanonicalLinks;
    private boolean keepOutOfScopeLinks;
    private boolean skipMetaFetcherOnBadStatus;
+   private boolean linkExtractorQuitAtDepth;

    private String userAgent;

@@ -295,6 +296,12 @@ public boolean isKeepDownloads() {
    public void setKeepDownloads(boolean keepDownloads) {
        this.keepDownloads = keepDownloads;
    }
+   public boolean isLinkExtractorQuitAtDepth() {
+       return linkExtractorQuitAtDepth;
+   }
+   public void setLinkExtractorQuitAtDepth(boolean linkExtractorQuitAtDepth) {
+       this.linkExtractorQuitAtDepth = linkExtractorQuitAtDepth;
+   }
    /**
     * Whether links not in scope should be stored as metadata
     * under {@link HttpMetadata#COLLECTOR_REFERENCED_URLS_OUT_OF_SCOPE}
@@ -483,6 +490,8 @@ protected void saveCrawlerConfigToXML(Writer out) throws IOException {

        writer.writeElementBoolean(
                "keepOutOfScopeLinks", isKeepOutOfScopeLinks());
+       writer.writeElementBoolean(
+               "linkExtractorQuitAtDepth", isLinkExtractorQuitAtDepth());
        writer.writeStartElement("startURLs");
        writer.writeAttributeBoolean("stayOnProtocol",
                urlCrawlScopeStrategy.isStayOnProtocol());
@@ -666,6 +675,8 @@ private void loadSimpleSettings(XMLConfiguration xml) {
        setKeepDownloads(xml.getBoolean("keepDownloads", isKeepDownloads()));
        setKeepOutOfScopeLinks(
                xml.getBoolean("keepOutOfScopeLinks", isKeepOutOfScopeLinks()));
+       setLinkExtractorQuitAtDepth(
+               xml.getBoolean("linkExtractorQuitAtDepth", isLinkExtractorQuitAtDepth()));
        setIgnoreCanonicalLinks(xml.getBoolean(
                "ignoreCanonicalLinks", isIgnoreCanonicalLinks()));
        urlCrawlScopeStrategy.setStayOnProtocol(xml.getBoolean(
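
As a point of reference, a self-contained sketch of the lookup loadSimpleSettings performs, assuming the Commons Configuration 1.x XMLConfiguration the surrounding code appears to use (class name and inline XML are illustrative only):

    import java.io.StringReader;
    import org.apache.commons.configuration.XMLConfiguration;

    public class QuitAtDepthConfigDemo {
        public static void main(String[] args) throws Exception {
            XMLConfiguration xml = new XMLConfiguration();
            xml.load(new StringReader(
                    "<crawler>"
                    + "<linkExtractorQuitAtDepth>true</linkExtractorQuitAtDepth>"
                    + "</crawler>"));
            // Same call shape as loadSimpleSettings above; the second argument
            // is the fallback used when the element is absent.
            System.out.println(xml.getBoolean("linkExtractorQuitAtDepth", false)); // true
        }
    }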
@@ -756,6 +767,7 @@ public boolean equals(final Object other) {
                .append(ignoreSitemap, castOther.ignoreSitemap)
                .append(keepDownloads, castOther.keepDownloads)
                .append(keepOutOfScopeLinks, castOther.keepOutOfScopeLinks)
+               .append(linkExtractorQuitAtDepth, castOther.linkExtractorQuitAtDepth)
                .append(ignoreCanonicalLinks, castOther.ignoreCanonicalLinks)
                .append(skipMetaFetcherOnBadStatus,
                        castOther.skipMetaFetcherOnBadStatus)
@@ -794,6 +806,7 @@ public int hashCode() {
                .append(ignoreSitemap)
                .append(keepDownloads)
                .append(keepOutOfScopeLinks)
+               .append(linkExtractorQuitAtDepth)
                .append(ignoreCanonicalLinks)
                .append(skipMetaFetcherOnBadStatus)
                .append(userAgent)
@@ -830,6 +843,7 @@ public String toString() {
                .append("ignoreSitemap", ignoreSitemap)
                .append("keepDownloads", keepDownloads)
                .append("keepOutOfScopeLinks", keepOutOfScopeLinks)
+               .append("linkExtractorQuitAtDepth", linkExtractorQuitAtDepth)
                .append("ignoreCanonicalLinks", ignoreCanonicalLinks)
                .append("skipMetaFetcherOnBadStatus",
                        skipMetaFetcherOnBadStatus)

@@ -47,7 +47,7 @@ public class HttpImporterPipeline
    //sharing all thread safe/common information,
    //just changing what is url/doc specific.

-   public HttpImporterPipeline(boolean isKeepDownloads, boolean isOrphan) {
+   public HttpImporterPipeline(boolean isKeepDownloads, boolean isOrphan, boolean isLinkExtractorQuitAtDepth) {

        // if an orphan is reprocessed, it could be that it is no longer
        // referenced because of deletion. Because of that, we need
@@ -73,7 +73,7 @@ public HttpImporterPipeline(boolean isKeepDownloads, boolean isOrphan) {
        addStage(new MetadataCanonicalGETStage());
        addStage(new DocumentCanonicalStage());
        addStage(new RobotsMetaCreateStage());
-       addStage(new LinkExtractorStage());
+       addStage(new LinkExtractorStage(isLinkExtractorQuitAtDepth));
        addStage(new RobotsMetaNoIndexStage());
        addStage(new MetadataFiltersGETStage());
        addStage(new MetadataChecksumStage(false));

@@ -44,6 +44,12 @@
    private static final Logger LOG =
            LogManager.getLogger(LinkExtractorStage.class);

+   private final boolean quitAtDepth;
+
+   public LinkExtractorStage(boolean quitAtDepth) {
+       this.quitAtDepth = quitAtDepth;
+   }
+
    @Override
    public boolean executeStage(HttpImporterPipelineContext ctx) {

@@ -52,6 +58,15 @@ public boolean executeStage(HttpImporterPipelineContext ctx) {
            return true;
        }

+       if (quitAtDepth) {
+           int depth = ctx.getCrawlData().getDepth();
+           int maxDepth = ctx.getConfig().getMaxDepth();
+
+           if (depth == maxDepth) {
+               LOG.debug("Max depth reached; skipping link extraction.");
+               return true;
+           }
+       }

        String reference = ctx.getCrawlData().getReference();

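To pin down the new guard's behaviour, a stand-alone sketch (class and method are hypothetical; the predicate mirrors the stage above):

    public class QuitAtDepthDemo {
        // Extract links unless the option is on and the document already
        // sits at the configured maximum depth.
        static boolean shouldExtractLinks(boolean quitAtDepth, int depth, int maxDepth) {
            return !(quitAtDepth && depth == maxDepth);
        }

        public static void main(String[] args) {
            System.out.println(shouldExtractLinks(false, 2, 2)); // true  (option off)
            System.out.println(shouldExtractLinks(true, 1, 2));  // true  (still below max depth)
            System.out.println(shouldExtractLinks(true, 2, 2));  // false (at max depth: quit)
        }
    }

The equality test (rather than >=) appears sufficient on the assumption that the crawler never queues references deeper than maxDepth in the first place.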
