Merge pull request #153 from 18F/www-subdomain-gathering

Add a --ignore-www option to gathering that ignores "www." prefixes
18F · Nov 5, 2017 · 1f789de · 1f789de
2 parents ca1e48b + d7e0086
commit 1f789de
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -174,6 +174,7 @@ General options:
 * `--suffix`: **Required.** suffix to filter on (e.g. `.gov`)
 * `--parents`: A path or URL to a CSV whose first column is second-level domains. Any subdomain not contained within these second-level domains will be excluded.
 * `--include-parents`: Include second-level domains. (Defaults to false.)
+* `--ignore-www`: Strip a leading `www.` from each gathered hostname, so that `www.[host]` and `[host]` collapse into a single record. For example, if `www.staging.example.com` is found, it will be treated as `staging.example.com`.
 * `--debug`: display extra output
 
 ### `censys`: the Censys.io API

diff --git a/gather b/gather
@@ -2,6 +2,7 @@
 
 import os
 import sys
+import re
 import csv
 import requests
 import logging
@@ -34,6 +35,11 @@ def run(options=None):
     # Opt in to include parent (second-level) domains.
     include_parents = options.get("include-parents", False)
 
+    # Opt into stripping www. prefixes from hostnames, effectively
+    # collapsing www.[host] and [host] into one record.
+    ignore_www = options.get("ignore-www", False)
+    strip_www = re.compile("^www\.")
+
     # --parents should be a CSV whose first column is parent domains
     # that will act as a whitelist for which subdomains to gather.
     parents = get_parent_domains(options)
@@ -72,14 +78,20 @@ def run(options=None):
         # Iterate over each hostname.
         for domain in gatherer.gather(suffix, options, extra):
 
-            # Always apply the suffix to returned names.
+            # Always apply the suffix filter to returned names.
             if not suffix_pattern.search(domain):
                 continue
 
+            # Strip www. prefixes from hostnames, effectively
+            # collapsing www.[host] and [host] into one record.
+            if ignore_www:
+                domain = strip_www.sub("", domain)
+
             base = utils.base_domain_for(domain)
 
             # Unless --include-parents is specified, exclude them.
             if not include_parents:
+                # Treat www.[base] as the base domain too, even without --ignore-www.
                 if (domain == base) or (domain == "www.%s" % base):
                     continue