feat: cache entire public suffix list. Select at runtime.
Addresses john-kurkowski#66 but is not backwards compatible.
brycedrennan committed Mar 19, 2019
1 parent d40fc51 commit 4c29a8b
Showing 11 changed files with 12,945 additions and 7,481 deletions.
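
In rough terms, the commit's approach works like this (a conceptual sketch only, with hypothetical names — not the library's actual internals): instead of caching just the suffix set chosen at construction time, the whole Public Suffix List is cached, and the public/private split is applied per lookup.

```python
# Conceptual sketch of "cache entire list, select at runtime" (hypothetical
# names; not tldextract's real internals). Both PSL sections are kept, and
# each lookup decides whether private domains count as suffixes.
ICANN_SUFFIXES = {"com", "co.uk"}        # parsed from the PSL's ICANN section
PRIVATE_SUFFIXES = {"blogspot.com"}      # parsed from the PSL's PRIVATE section

def active_suffixes(include_psl_private_domains=False):
    """Return the suffix set for one lookup, chosen at call time."""
    if include_psl_private_domains:
        return ICANN_SUFFIXES | PRIVATE_SUFFIXES
    return ICANN_SUFFIXES
```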
2 changes: 2 additions & 0 deletions .gitignore
@@ -7,3 +7,5 @@ tldextract_app/tldextract
 tldextract_app/web
 tldextract.egg-info
 .tox
+tldextract/.suffix_cache/*
+.pytest_cache
46 changes: 13 additions & 33 deletions README.md
@@ -22,13 +22,13 @@ from its domain, and its domain from its country code.
>>> import tldextract

>>> tldextract.extract('http://forums.news.cnn.com/')
-ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
+ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', source='publicsuffix_icann')

>>> tldextract.extract('http://forums.bbc.co.uk/') # United Kingdom
-ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
+ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', source='publicsuffix_icann')

>>> tldextract.extract('http://www.worldbank.org.kg/') # Kyrgyzstan
-ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg')
+ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg', source='publicsuffix_icann')
```

`ExtractResult` is a namedtuple, so it's simple to access the parts you want.
@@ -50,26 +50,13 @@ subdomain or a valid suffix.

```python
>>> tldextract.extract('google.com')
-ExtractResult(subdomain='', domain='google', suffix='com')
+ExtractResult(subdomain='', domain='google', suffix='com', source='publicsuffix_icann')

>>> tldextract.extract('google.notavalidsuffix')
-ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='')
+ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='', source='')

>>> tldextract.extract('http://127.0.0.1:8080/deployed/')
-ExtractResult(subdomain='', domain='127.0.0.1', suffix='')
-```
-
-If you want to rejoin the whole namedtuple, regardless of whether a subdomain
-or suffix were found:
-
-```python
->>> ext = tldextract.extract('http://127.0.0.1:8080/deployed/')
->>> # this has unwanted dots
->>> '.'.join(ext)
-'.127.0.0.1.'
->>> # join each part only if it's truthy
->>> '.'.join(part for part in ext if part)
-'127.0.0.1'
+ExtractResult(subdomain='', domain='127.0.0.1', suffix='', source='ip_address')
```

By default, this package supports the public ICANN TLDs and their exceptions.
@@ -106,7 +93,7 @@ tldextract http://forums.bbc.co.uk

Beware when first running the module, it updates its TLD list with a live HTTP
request. This updated TLD set is cached indefinitely in
-`/path/to/tldextract/.tld_set`.
+`/path/to/tldextract/.suffix_cache`.

(Arguably runtime bootstrapping like that shouldn't be the default behavior,
like for production systems. But I want you to have the latest TLDs, especially
@@ -122,11 +109,11 @@ no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
no_fetch_extract('http://www.google.com')

# extract callable that reads/writes the updated TLD set to a different path
-custom_cache_extract = tldextract.TLDExtract(cache_file='/path/to/your/cache/file')
+custom_cache_extract = tldextract.TLDExtract(cache_dir='/path/to/your/cache/dir')
custom_cache_extract('http://www.google.com')

# extract callable that doesn't use caching
-no_cache_extract = tldextract.TLDExtract(cache_file=False)
+no_cache_extract = tldextract.TLDExtract(cache_dir=False)
no_cache_extract('http://www.google.com')
```

@@ -169,9 +156,8 @@ ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com')
The following overrides this.

```python
->>> extract = tldextract.TLDExtract(include_psl_private_domains=True)
->>> extract.update() # necessary until #66 is fixed
->>> extract('waiterrant.blogspot.com')
+>>> extract = tldextract.TLDExtract()
+>>> extract('waiterrant.blogspot.com', include_psl_private_domains=True)
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
```

Expand All @@ -185,11 +171,7 @@ behavior of other, PSL-based libraries.
You can specify your own input data in place of the default Mozilla Public Suffix List:

```python
extract = tldextract.TLDExtract(
suffix_list_urls=["http://foo.bar.baz"],
# Recommended: Specify your own cache file, to minimize ambiguities about where
# tldextract is getting its data, or cached data, from.
cache_file='/path/to/your/cache/file')
extract = tldextract.TLDExtract(suffix_list_urls=["http://foo.bar.baz"])
```

The above snippet will fetch from the URL *you* specified, upon first need to download the
@@ -198,9 +180,7 @@ suffix list (i.e. if the cache_file doesn't exist).

If you want to use input data from your local filesystem, just use the `file://` protocol:

```python
-extract = tldextract.TLDExtract(
-    suffix_list_urls=["file://absolute/path/to/your/local/suffix/list/file"],
-    cache_file='/path/to/your/cache/file')
+extract = tldextract.TLDExtract(suffix_list_urls=["file://absolute/path/to/your/local/suffix/list/file"])
```

Use an absolute path when specifying the `suffix_list_urls` keyword argument.
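Taken together, the README changes above describe the new call pattern. A minimal sketch of the post-commit API, assuming only the kwargs shown in this diff (`cache_dir`, and call-time `include_psl_private_domains`):

```python
import tldextract

# The cache now lives in a directory rather than a single pickled file.
extract = tldextract.TLDExtract(cache_dir='/path/to/your/cache/dir')

# Private-domain handling is chosen per call, not at construction time.
extract('waiterrant.blogspot.com')
# ICANN view: suffix='com'
extract('waiterrant.blogspot.com', include_psl_private_domains=True)
# Private view: suffix='blogspot.com'
```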
8 changes: 4 additions & 4 deletions tests/custom_suffix_test.py
@@ -4,7 +4,7 @@

import tldextract

-from .helpers import temporary_file
+from .helpers import temporary_dir


FAKE_SUFFIX_LIST_URL = "file://" + os.path.join(
@@ -15,15 +15,15 @@

# pylint: disable=invalid-name
extract_using_fake_suffix_list = tldextract.TLDExtract(
-    cache_file=temporary_file(),
+    cache_dir=temporary_dir(),
    suffix_list_urls=[FAKE_SUFFIX_LIST_URL]
)
extract_using_fake_suffix_list_no_cache = tldextract.TLDExtract(
-    cache_file=None,
+    cache_dir=None,
    suffix_list_urls=[FAKE_SUFFIX_LIST_URL]
)
extract_using_extra_suffixes = tldextract.TLDExtract(
-    cache_file=None,
+    cache_dir=None,
    suffix_list_urls=[FAKE_SUFFIX_LIST_URL],
    extra_suffixes=EXTRA_SUFFIXES
)
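For readers unfamiliar with the third fixture above: `extra_suffixes` layers caller-supplied suffixes on top of whatever list the extractor loads. A sketch, using a made-up suffix:

```python
import tldextract

# `extra_suffixes` appends caller-supplied suffixes to the loaded list.
# 'foo.example' is a made-up suffix for illustration only.
extract = tldextract.TLDExtract(
    cache_dir=None,           # no on-disk cache, as in the fixtures above
    suffix_list_urls=None,    # no live fetch; fall back to the bundled snapshot
    extra_suffixes=["foo.example"],
)
result = extract('somedomain.foo.example')
# With the extra suffix registered, 'foo.example' is treated as the suffix.
```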
4 changes: 2 additions & 2 deletions tests/helpers.py
@@ -19,7 +19,7 @@ def check_output(*popenargs, **kwargs):
    return output


-def temporary_file():
+def temporary_dir():
    """ Make a writable temporary file and return its absolute path.
    """
-    return tempfile.mkstemp()[1]
+    return tempfile.mkdtemp()
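Context for the swap above, from the standard library: `mkstemp` creates and opens a file and returns a tuple, while `mkdtemp` creates a directory and returns just its path — which is why the `[1]` indexing disappears.

```python
import tempfile

fd, file_path = tempfile.mkstemp()  # creates an open file; returns (OS-level fd, path)
dir_path = tempfile.mkdtemp()       # creates a directory; returns only its path
```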
23 changes: 1 addition & 22 deletions tests/integration_test.py
@@ -1,33 +1,12 @@
'''tldextract integration tests.'''

-import logging
-import os
-import traceback
-
import pytest

import tldextract


-def test_log_snapshot_diff(mocker):
-    mocker.patch.object(logging.getLogger(), 'level', logging.DEBUG)
-    debug_mock = mocker.patch.object(logging.getLogger('tldextract'), 'debug')
-
-    extractor = tldextract.TLDExtract()
-    try:
-        os.remove(extractor.cache_file)
-    except (IOError, OSError):
-        logging.warning(traceback.format_exc())
-
-    extractor('ignore.com')
-
-    assert debug_mock.call_count == 1
-    log_str = debug_mock.call_args[0][0]
-    assert log_str.startswith('computed TLD diff')
-
-
def test_bad_kwargs():
    with pytest.raises(ValueError):
        tldextract.TLDExtract(
-            cache_file=False, suffix_list_urls=False, fallback_to_snapshot=False
+            cache_dir=False, suffix_list_urls=False, fallback_to_snapshot=False
        )
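The surviving test pins down the constructor's failure mode: with caching, remote fetching, and the snapshot fallback all disabled, the extractor has no possible source of suffix data. A sketch of both sides of that boundary, based only on the kwargs exercised in these tests:

```python
import tldextract

# All three data sources disabled -> nothing to extract with; should raise.
try:
    tldextract.TLDExtract(
        cache_dir=False, suffix_list_urls=False, fallback_to_snapshot=False
    )
except ValueError:
    pass  # expected, per test_bad_kwargs

# Leaving the snapshot fallback enabled is the supported offline setup
# (as in main_test's extract_using_fallback_to_snapshot_no_cache fixture).
offline = tldextract.TLDExtract(cache_dir=None, suffix_list_urls=None)
```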
21 changes: 12 additions & 9 deletions tests/main_test.py
@@ -4,21 +4,24 @@
import sys

import responses
+
import tldextract
-from .helpers import temporary_file
+from .helpers import temporary_dir

if sys.version_info >= (3,):  # pragma: no cover
    unicode = str  # pylint: disable=invalid-name,redefined-builtin


# pylint: disable=invalid-name
-extract = tldextract.TLDExtract(cache_file=temporary_file())
-extract_no_cache = tldextract.TLDExtract(cache_file=False)
-extract_using_real_local_suffix_list = tldextract.TLDExtract(cache_file=temporary_file())
-extract_using_real_local_suffix_list_no_cache = tldextract.TLDExtract(cache_file=False)
+extract = tldextract.TLDExtract(cache_dir=temporary_dir())
+extract_no_cache = tldextract.TLDExtract(cache_dir=False)
+extract_using_real_local_suffix_list = tldextract.TLDExtract(cache_dir=temporary_dir())
+extract_using_real_local_suffix_list_no_cache = tldextract.TLDExtract(cache_dir=False)
extract_using_fallback_to_snapshot_no_cache = tldextract.TLDExtract(
-    cache_file=None,
+    cache_dir=None,
    suffix_list_urls=None
)

+
+# pylint: enable=invalid-name
@@ -90,7 +93,7 @@ def test_qualified_local_host():
def test_ip():
    assert_extract('http://216.22.0.192/',
                   ('', '', '216.22.0.192', ''),
-                   expected_ip_data='216.22.0.192',)
+                   expected_ip_data='216.22.0.192', )
    assert_extract('http://216.22.project.coop/',
                   ('216.22.project.coop', '216.22', 'project', 'coop'))

@@ -223,7 +226,7 @@ def test_result_as_dict():
    )
    expected_dict = {'subdomain': 'www',
                     'domain': 'google',
-                     'suffix': 'com'}
+                     'suffix': 'com', }
    assert result._asdict() == expected_dict


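The `test_ip` data above corresponds to this observable behavior (a sketch; the four-tuple in the test is the fully-qualified domain followed by the three parts):

```python
import tldextract

ext = tldextract.extract('http://216.22.0.192/')
# An IP address yields an empty suffix; the whole address lands in `domain`.
assert (ext.subdomain, ext.domain, ext.suffix) == ('', '216.22.0.192', '')

ext = tldextract.extract('http://216.22.project.coop/')
# A dotted hostname with a real suffix parses normally.
assert (ext.subdomain, ext.domain, ext.suffix) == ('216.22', 'project', 'coop')
```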