feat: cache entire public suffix list. Select at runtime.
Addresses john-kurkowski#66 but is not backwards compatible.
brycedrennan committed Mar 19, 2019
1 parent d40fc51 commit 4c29a8b
Showing 11 changed files with 12,945 additions and 7,481 deletions.
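
In rough terms, the commit's approach works like this (a conceptual sketch only, with hypothetical names — not the library's actual internals): instead of caching just the suffix set chosen at construction time, the whole Public Suffix List is cached, and the public/private split is applied per lookup.

```python
# Conceptual sketch of "cache entire list, select at runtime" (hypothetical
# names; not tldextract's real internals). Both PSL sections are kept, and
# each lookup decides whether private domains count as suffixes.
ICANN_SUFFIXES = {"com", "co.uk"}        # parsed from the PSL's ICANN section
PRIVATE_SUFFIXES = {"blogspot.com"}      # parsed from the PSL's PRIVATE section

def active_suffixes(include_psl_private_domains=False):
    """Return the suffix set for one lookup, chosen at call time."""
    if include_psl_private_domains:
        return ICANN_SUFFIXES | PRIVATE_SUFFIXES
    return ICANN_SUFFIXES
```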
2 changes: 2 additions & 0 deletions .gitignore
@@ -7,3 +7,5 @@ tldextract_app/tldextract
 tldextract_app/web
 tldextract.egg-info
 .tox
+tldextract/.suffix_cache/*
+.pytest_cache
46 changes: 13 additions & 33 deletions README.md
@@ -22,13 +22,13 @@ from its domain, and its domain from its country code.
>>> import tldextract

>>> tldextract.extract('http://forums.news.cnn.com/')
-ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
+ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', source='publicsuffix_icann')

>>> tldextract.extract('http://forums.bbc.co.uk/') # United Kingdom
-ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
+ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', source='publicsuffix_icann')

>>> tldextract.extract('http://www.worldbank.org.kg/') # Kyrgyzstan
-ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg')
+ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg', source='publicsuffix_icann')
```

`ExtractResult` is a namedtuple, so it's simple to access the parts you want.
@@ -50,26 +50,13 @@ subdomain or a valid suffix.

```python
>>> tldextract.extract('google.com')
-ExtractResult(subdomain='', domain='google', suffix='com')
+ExtractResult(subdomain='', domain='google', suffix='com', source='publicsuffix_icann')

>>> tldextract.extract('google.notavalidsuffix')
-ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='')
+ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='', source='')

>>> tldextract.extract('http://127.0.0.1:8080/deployed/')
-ExtractResult(subdomain='', domain='127.0.0.1', suffix='')
-```
-
-If you want to rejoin the whole namedtuple, regardless of whether a subdomain
-or suffix were found:
-
-```python
->>> ext = tldextract.extract('http://127.0.0.1:8080/deployed/')
->>> # this has unwanted dots
->>> '.'.join(ext)
-'.127.0.0.1.'
->>> # join each part only if it's truthy
->>> '.'.join(part for part in ext if part)
-'127.0.0.1'
+ExtractResult(subdomain='', domain='127.0.0.1', suffix='', source='ip_address')
```

By default, this package supports the public ICANN TLDs and their exceptions.
@@ -106,7 +93,7 @@ tldextract http://forums.bbc.co.uk

Beware when first running the module, it updates its TLD list with a live HTTP
request. This updated TLD set is cached indefinitely in
-`/path/to/tldextract/.tld_set`.
+`/path/to/tldextract/.suffix_cache`.

(Arguably runtime bootstrapping like that shouldn't be the default behavior,
like for production systems. But I want you to have the latest TLDs, especially
@@ -122,11 +109,11 @@ no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
no_fetch_extract('http://www.google.com')

# extract callable that reads/writes the updated TLD set to a different path
-custom_cache_extract = tldextract.TLDExtract(cache_file='/path/to/your/cache/file')
+custom_cache_extract = tldextract.TLDExtract(cache_dir='/path/to/your/cache/dir')
custom_cache_extract('http://www.google.com')

# extract callable that doesn't use caching
-no_cache_extract = tldextract.TLDExtract(cache_file=False)
+no_cache_extract = tldextract.TLDExtract(cache_dir=False)
no_cache_extract('http://www.google.com')
```

@@ -169,9 +156,8 @@ ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com')
The following overrides this.

```python
->>> extract = tldextract.TLDExtract(include_psl_private_domains=True)
->>> extract.update() # necessary until #66 is fixed
->>> extract('waiterrant.blogspot.com')
+>>> extract = tldextract.TLDExtract()
+>>> extract('waiterrant.blogspot.com', include_psl_private_domains=True)
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
```

Expand All @@ -185,11 +171,7 @@ behavior of other, PSL-based libraries.
You can specify your own input data in place of the default Mozilla Public Suffix List:

```python
extract = tldextract.TLDExtract(
suffix_list_urls=["http://foo.bar.baz"],
# Recommended: Specify your own cache file, to minimize ambiguities about where
# tldextract is getting its data, or cached data, from.
cache_file='/path/to/your/cache/file')
extract = tldextract.TLDExtract(suffix_list_urls=["http://foo.bar.baz"])
```

The above snippet will fetch from the URL *you* specified, upon first need to download the
@@ -198,9 +180,7 @@ suffix list (i.e. if the cache_file doesn't exist).

If you want to use input data from your local filesystem, just use the `file://` protocol:

```python
-extract = tldextract.TLDExtract(
-    suffix_list_urls=["file://absolute/path/to/your/local/suffix/list/file"],
-    cache_file='/path/to/your/cache/file')
+extract = tldextract.TLDExtract(suffix_list_urls=["file://absolute/path/to/your/local/suffix/list/file"])
```

Use an absolute path when specifying the `suffix_list_urls` keyword argument.
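Taken together, the README changes above describe the new call pattern. A minimal sketch of the post-commit API, assuming only the kwargs shown in this diff (`cache_dir`, and call-time `include_psl_private_domains`):

```python
import tldextract

# The cache now lives in a directory rather than a single pickled file.
extract = tldextract.TLDExtract(cache_dir='/path/to/your/cache/dir')

# Private-domain handling is chosen per call, not at construction time.
extract('waiterrant.blogspot.com')
# ICANN view: suffix='com'
extract('waiterrant.blogspot.com', include_psl_private_domains=True)
# Private view: suffix='blogspot.com'
```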
8 changes: 4 additions & 4 deletions tests/custom_suffix_test.py
@@ -4,7 +4,7 @@

import tldextract

-from .helpers import temporary_file
+from .helpers import temporary_dir


FAKE_SUFFIX_LIST_URL = "file://" + os.path.join(
@@ -15,15 +15,15 @@

# pylint: disable=invalid-name
extract_using_fake_suffix_list = tldextract.TLDExtract(
-    cache_file=temporary_file(),
+    cache_dir=temporary_dir(),
    suffix_list_urls=[FAKE_SUFFIX_LIST_URL]
)
extract_using_fake_suffix_list_no_cache = tldextract.TLDExtract(
-    cache_file=None,
+    cache_dir=None,
    suffix_list_urls=[FAKE_SUFFIX_LIST_URL]
)
extract_using_extra_suffixes = tldextract.TLDExtract(
-    cache_file=None,
+    cache_dir=None,
    suffix_list_urls=[FAKE_SUFFIX_LIST_URL],
    extra_suffixes=EXTRA_SUFFIXES
)
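For readers unfamiliar with the third fixture above: `extra_suffixes` layers caller-supplied suffixes on top of whatever list the extractor loads. A sketch, using a made-up suffix:

```python
import tldextract

# `extra_suffixes` appends caller-supplied suffixes to the loaded list.
# 'foo.example' is a made-up suffix for illustration only.
extract = tldextract.TLDExtract(
    cache_dir=None,           # no on-disk cache, as in the fixtures above
    suffix_list_urls=None,    # no live fetch; fall back to the bundled snapshot
    extra_suffixes=["foo.example"],
)
result = extract('somedomain.foo.example')
# With the extra suffix registered, 'foo.example' is treated as the suffix.
```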
4 changes: 2 additions & 2 deletions tests/helpers.py
@@ -19,7 +19,7 @@ def check_output(*popenargs, **kwargs):
    return output


-def temporary_file():
+def temporary_dir():
    """ Make a writable temporary file and return its absolute path.
    """
-    return tempfile.mkstemp()[1]
+    return tempfile.mkdtemp()
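Context for the swap above, from the standard library: `mkstemp` creates and opens a file and returns a tuple, while `mkdtemp` creates a directory and returns just its path — which is why the `[1]` indexing disappears.

```python
import tempfile

fd, file_path = tempfile.mkstemp()  # creates an open file; returns (OS-level fd, path)
dir_path = tempfile.mkdtemp()       # creates a directory; returns only its path
```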
23 changes: 1 addition & 22 deletions tests/integration_test.py
@@ -1,33 +1,12 @@
'''tldextract integration tests.'''

-import logging
-import os
-import traceback
-
import pytest

import tldextract


-def test_log_snapshot_diff(mocker):
-    mocker.patch.object(logging.getLogger(), 'level', logging.DEBUG)
-    debug_mock = mocker.patch.object(logging.getLogger('tldextract'), 'debug')
-
-    extractor = tldextract.TLDExtract()
-    try:
-        os.remove(extractor.cache_file)
-    except (IOError, OSError):
-        logging.warning(traceback.format_exc())
-
-    extractor('ignore.com')
-
-    assert debug_mock.call_count == 1
-    log_str = debug_mock.call_args[0][0]
-    assert log_str.startswith('computed TLD diff')
-
-
def test_bad_kwargs():
    with pytest.raises(ValueError):
        tldextract.TLDExtract(
-            cache_file=False, suffix_list_urls=False, fallback_to_snapshot=False
+            cache_dir=False, suffix_list_urls=False, fallback_to_snapshot=False
        )
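The surviving test pins down the constructor's failure mode: with caching, remote fetching, and the snapshot fallback all disabled, the extractor has no possible source of suffix data. A sketch of both sides of that boundary, based only on the kwargs exercised in these tests:

```python
import tldextract

# All three data sources disabled -> nothing to extract with; should raise.
try:
    tldextract.TLDExtract(
        cache_dir=False, suffix_list_urls=False, fallback_to_snapshot=False
    )
except ValueError:
    pass  # expected, per test_bad_kwargs

# Leaving the snapshot fallback enabled is the supported offline setup
# (as in main_test's extract_using_fallback_to_snapshot_no_cache fixture).
offline = tldextract.TLDExtract(cache_dir=None, suffix_list_urls=None)
```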
21 changes: 12 additions & 9 deletions tests/main_test.py
@@ -4,21 +4,24 @@
import sys

import responses
+
import tldextract
-from .helpers import temporary_file
+from .helpers import temporary_dir

if sys.version_info >= (3,):  # pragma: no cover
    unicode = str  # pylint: disable=invalid-name,redefined-builtin


# pylint: disable=invalid-name
-extract = tldextract.TLDExtract(cache_file=temporary_file())
-extract_no_cache = tldextract.TLDExtract(cache_file=False)
-extract_using_real_local_suffix_list = tldextract.TLDExtract(cache_file=temporary_file())
-extract_using_real_local_suffix_list_no_cache = tldextract.TLDExtract(cache_file=False)
+extract = tldextract.TLDExtract(cache_dir=temporary_dir())
+extract_no_cache = tldextract.TLDExtract(cache_dir=False)
+extract_using_real_local_suffix_list = tldextract.TLDExtract(cache_dir=temporary_dir())
+extract_using_real_local_suffix_list_no_cache = tldextract.TLDExtract(cache_dir=False)
extract_using_fallback_to_snapshot_no_cache = tldextract.TLDExtract(
-    cache_file=None,
+    cache_dir=None,
    suffix_list_urls=None
)

+
+# pylint: enable=invalid-name
@@ -90,7 +93,7 @@ def test_qualified_local_host():
def test_ip():
    assert_extract('http://216.22.0.192/',
                   ('', '', '216.22.0.192', ''),
-                   expected_ip_data='216.22.0.192',)
+                   expected_ip_data='216.22.0.192', )
    assert_extract('http://216.22.project.coop/',
                   ('216.22.project.coop', '216.22', 'project', 'coop'))

@@ -223,7 +226,7 @@ def test_result_as_dict():
    )
    expected_dict = {'subdomain': 'www',
                     'domain': 'google',
-                     'suffix': 'com'}
+                     'suffix': 'com', }
    assert result._asdict() == expected_dict


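The `test_ip` data above corresponds to this observable behavior (a sketch; the four-tuple in the test is the fully-qualified domain followed by the three parts):

```python
import tldextract

ext = tldextract.extract('http://216.22.0.192/')
# An IP address yields an empty suffix; the whole address lands in `domain`.
assert (ext.subdomain, ext.domain, ext.suffix) == ('', '216.22.0.192', '')

ext = tldextract.extract('http://216.22.project.coop/')
# A dotted hostname with a real suffix parses normally.
assert (ext.subdomain, ext.domain, ext.suffix) == ('216.22', 'project', 'coop')
```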