#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Start at the bottom (`if __name__ == '__main__':`) and follow the
# execution from there.
import os
from datetime import datetime, timedelta
from urllib2 import urlopen


def should_update(filename, update_interval):
"""Is `filename`'s mtime more than `update_interval` ago?"""
if not os.path.isfile(filename):
return True
    # Working with datetime/timedelta objects usually results in code
# that is easier to read.
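    # For comparison, a sketch (not used here) of the same check written
    # with raw epoch seconds might look like:
    #     return (os.stat(filename).st_mtime
    #             + update_interval.total_seconds()) <= time.time()
    # which needs an extra `import time` and buries the intent in unit
    # conversions.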
file_mtime = datetime.fromtimestamp(os.stat(filename).st_mtime)
return file_mtime + update_interval <= datetime.now()


def download_tlds(url):
"""
Returns the (stripped) lines from the given URL that are not
comments.
    The URL is assumed to be that of a text file (no HTML parsing is
performed).
Comment lines start with `#`.
"""
# This is a separate function in order to isolate any future changes
# that may be needed when the format of the TLDs list changes. For
# example, if you want to use the Public Suffix list
# (https://publicsuffix.org/) format, you only need to change this
# function:
# return [line.strip() for line in urlopen(url)
# if line and not line.startswith('//')]
# The Separation of Concerns principle is at work here.
# DO use list comprehensions
return [line.strip() for line in urlopen(url)
if not line.startswith('#')]


def update_tlds(filename, url, update_interval=None):
"""
    Update the TLDs list in `filename` with the TLDs from `url`, but
    only if at least `update_interval` has passed since the last update.
:param url: The URL to download the text list of TLDs from.
    :param update_interval: The minimum time between updates (defaults to
        three days).
    :type update_interval: ``datetime.timedelta``
:returns: ``None`` if no update was attempted (`update_interval` not
yet reached), or a list of new TLDs.
"""
if update_interval is None:
update_interval = timedelta(days=3)
# This function contains your main algorithm and should be as
# readable as possible. That implies factoring out any logic that is
# not strictly part of the main algorithm, into separate functions.
    # A good way to achieve this is the outside-in approach: write this
# function first, as if all other functions that it uses already
# exist. In that way you will only need to "fill in the gaps"
# (implement the missing functions), and you already know what the
# APIs (function signatures in this case) should look like.
if not should_update(filename, update_interval):
return
# The functionality to load the current TLDs from the file was
    # deemed too simple, and too unlikely to change, to justify
# factoring it out into a separate function. In the end, I believe
# that the source is easier to understand this way.
current_tlds = []
if os.path.isfile(filename):
current_tlds = [line.strip() for line in open(filename)]
# The downloading of TLDs, while similarly simple (a single list
# comprehension), is much more likely to change in implementation.
# The implementation is, however, not the concern of this algorithm,
# and is therefore factored out into the `download_tlds()` function.
url_tlds = download_tlds(url)
new_tlds = sorted(set(url_tlds) - set(current_tlds))
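    # For illustration, with made-up values:
    #     sorted(set(['COM', 'NET', 'XYZ']) - set(['COM', 'NET'])) == ['XYZ']
    # i.e. `new_tlds` contains only the entries not already in the file.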
if new_tlds:
# Since the total number of TLDs is not very big and the
# resulting file is also quite small, we can simply recreate the
        # whole file. As with the argument below, it would be better
        # to append only `new_tlds` to the file if the total size of
        # the dataset were sufficiently large.
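        # A sketch of that append-only alternative (hypothetical, and only
        # worthwhile for a much larger dataset; the leading newline assumes
        # the existing file has no trailing newline, as written below):
        #     with open(filename, 'a') as tldfile:
        #         tldfile.write('\n' + '\n'.join(new_tlds))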
with open(filename, 'w') as tldfile:
# Combine all TLDs into a single (newline separated) string
            # and write it at once, instead of writing each TLD/line
# separately.
            # If the list of TLDs were VERY big (millions/billions of
            # lines), and/or each TLD were a bigger data structure, it
            # would have been better to serialise and write each one
            # separately. But at this small scale it is faster to
# pass all the data to the operating system, and let it
# figure out how to perform the low level writing.
tldfile.write('\n'.join(url_tlds))
# We only return the new TLDs, since those are the ones most likely
# to be required outside of this function. The complete updated list
# is still accessible via the filename that was passed into this
# function.
return new_tlds


if __name__ == '__main__':
# Always put "top-level" code in an `if __name__ == '__main__':`
# guard. Otherwise it will be executed when you import this file.
# I used your source of TLDs in this example, but would like you to
# consider using the Public Suffix list (https://publicsuffix.org/).
# In most cases that would be the more appropriate list to use.
# The update interval of 1 second below is only for testing
# purposes.
new_tlds = update_tlds(
'tlds.txt', 'http://data.iana.org/TLD/tlds-alpha-by-domain.txt',
update_interval=timedelta(seconds=1))
# User feedback is not part of the `update_tlds()` function's
# responsibilities, so we give feedback here.
if new_tlds is None:
print 'Update not performed'
elif new_tlds:
        # Use `str.join(iterable)` instead of `str(list(...))`
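        # e.g. ', '.join(['com', 'org']) gives 'com, org', whereas
        # str(['com', 'org']) would give "['com', 'org']".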
print 'New TLDs: %s' % (', '.join(new_tlds),)
else:
# new_tlds must be []
print 'No new TLDs :('
# Note: There is no error handling in this code. That is not an
# oversight but deliberately done, because any error that occurs in this
# code *should* bubble up, and fail immediately. It is up to the client
# code (the code that uses this module) to check for errors.
# Since the code is separated by concern, the most likely errors (file
# I/O, and downloading of TLDs/network I/O) will not lead to data loss.
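# As a purely illustrative sketch (the `TLD_URL` constant and `log` object
# are hypothetical, not part of this module), client code might do:
#     try:
#         new_tlds = update_tlds('tlds.txt', TLD_URL)
#     except (IOError, OSError) as exc:
#         log.warning('TLD update failed: %s', exc)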