-
Notifications
You must be signed in to change notification settings - Fork 0
/
linkchecker.py
65 lines (53 loc) · 2.21 KB
/
linkchecker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from ftw.linkchecker import LOGGER_NAME
from ftw.linkchecker.pool_with_logging import PoolWithLogging
from functools import partial
from multiprocessing import cpu_count
import logging
import requests
import time
def millis():
    """Return the current wall-clock time as a whole number of milliseconds."""
    now_in_seconds = time.time()
    # time.time() yields fractional seconds; scale to ms and round to nearest.
    return int(round(now_in_seconds * 1000))
def get_uri_response(external_link_obj, timeout):
    """Send a HEAD request to the link's target and record the outcome.

    The outcome is written onto ``external_link_obj`` and the same object is
    returned:

    * ``is_broken`` -- ``False`` when the response status is exactly 200 or
      when the target contains ``'resolveuid'``; ``True`` otherwise.
    * ``status_code`` -- status code of the response, ``None`` if the request
      raised.
    * ``content_type`` -- ``Content-Type`` header of the response, ``None``
      if missing or if the request raised.
    * ``response_time`` -- elapsed time of the request in milliseconds.
    * ``error_message`` -- short description of the failure, ``None`` if the
      request did not raise.

    :param external_link_obj: object exposing a ``link_target`` attribute
        with the URL to check (presumably a unicode string, since it is
        encoded to UTF-8 before use -- verify against caller).
    :param timeout: timeout passed through to ``requests.head``.
    :returns: ``external_link_obj`` with the attributes above set.
    """
    logger = logging.getLogger(LOGGER_NAME)
    target = external_link_obj.link_target.encode('utf-8')
    logger.info('Head request to {}'.format(target))

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }

    error = None
    response = None
    start_time = millis()
    try:
        # Redirects are not followed, so a 3xx answer is reported as-is.
        # NOTE(review): verify=False disables TLS certificate verification;
        # confirm this is intentional for the link check.
        response = requests.head(target,
                                 timeout=timeout,
                                 headers=headers,
                                 allow_redirects=False,
                                 verify=False)
    except requests.exceptions.Timeout:
        error = 'Timeout'
    except requests.exceptions.TooManyRedirects:
        error = 'Too many redirects'
    except requests.exceptions.ConnectionError:
        error = 'Connection Error'
    except Exception as e:
        # Best effort: a single failing link must not abort the whole run.
        # ``message`` is empty or missing on many exceptions, so fall back
        # to the repr to always record something meaningful.
        error = getattr(e, 'message', None) or repr(e)
    # Named ``elapsed`` so the ``time`` module is not shadowed.
    elapsed = millis() - start_time

    got_ok = response is not None and response.status_code == 200
    is_internal = 'resolveuid' in external_link_obj.link_target
    external_link_obj.is_broken = not (got_ok or is_internal)

    external_link_obj.status_code = getattr(response, 'status_code', None)
    # Read the content type from the *response* headers; the request headers
    # above only carry the User-Agent.
    response_headers = getattr(response, 'headers', None) or {}
    external_link_obj.content_type = response_headers.get('Content-Type')
    external_link_obj.response_time = elapsed
    external_link_obj.error_message = error

    return external_link_obj
def work_through_urls(external_link_objs, timeout_config):
    """Check all given links in a worker pool and time the whole run.

    :param external_link_objs: iterable of link objects, each handed to
        ``get_uri_response``.
    :param timeout_config: timeout forwarded to ``get_uri_response`` for
        every link.
    :returns: tuple ``(checked_link_objs, total_time)`` where
        ``checked_link_objs`` is the list returned by ``pool.map`` and
        ``total_time`` is the elapsed time in milliseconds.
    """
    # Bind the timeout so the pool can call the worker with one argument.
    check_link = partial(get_uri_response, timeout=timeout_config)
    pool = PoolWithLogging(processes=cpu_count(), logger_name=LOGGER_NAME)

    start_time = millis()
    try:
        checked_link_objs = pool.map(check_link, external_link_objs)
    finally:
        # Close the pool even if a worker raised, so it is not left open.
        pool.close()
    total_time = millis() - start_time

    return checked_link_objs, total_time