"""
Provides essential utilites for the rest of TorBot app
"""
from queue import Queue
from threading import Thread
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError
import requests
import modules.getweblinks
# ALGORITHM UTILITY FUNCTIONS
def bfs_urls(urls, add_exts, rec_depth=0, stop_depth=None, target_url=None):
    """
    Traverses urls using Breadth First Search. You can specify a stop
    depth or a target url to look for. The rec_depth argument is used
    internally for recursion.

    *NOTE: This function issues a GET request for each url found, which
    can be very expensive. If possible, acquire the urls to be traversed
    up front and use the bfs function instead.

    Args:
        urls (list): urls to traverse
        add_exts (str): additional extensions to use
        rec_depth (int): used for recursion
        stop_depth (int): stops traversing at this depth if specified
        target_url (str): stops at this url if specified

    Returns:
        rec_depth (int): depth stopped at
    """
    if rec_depth == stop_depth or not urls:
        return rec_depth

    urls_to_visit = list()
    for url in urls:
        if target_url and url == target_url:
            return rec_depth
        try:
            resp = requests.get(url)
        except (HTTPError, ConnectionError):
            continue
        soup = BeautifulSoup(resp.text, 'html.parser')
        page_urls = modules.getweblinks.get_urls_from_page(soup, extension=add_exts)
        urls_to_visit.extend(page_urls)

    # stop_depth and target_url default to None, so a single recursive call
    # covers every combination of options.
    return bfs_urls(urls_to_visit, add_exts, rec_depth + 1, stop_depth, target_url)
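
# Example (illustrative sketch, not part of TorBot's API): crawl at most two
# levels deep from a seed page, or stop as soon as a target url is reached.
# The urls and the add_exts value below are placeholders.
#
#   seed = ['http://example.onion/index.html']
#   depth = bfs_urls(seed, add_exts='.html', stop_depth=2)
#   depth = bfs_urls(seed, add_exts='.html',
#                    target_url='http://example.onion/contact')
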
def bfs(nodes, target_node=None, rec_depth=0, stop_depth=None):
    """
    Traverses nodes using Breadth First Search. You can specify a stop
    depth or a target node to look for. The rec_depth argument is used
    internally for recursion.

    Args:
        nodes (list): objects to traverse
        target_node (object): object being searched for
        rec_depth (int): used for recursion
        stop_depth (int): stops traversing at this depth if specified

    Returns:
        rec_depth (int): depth stopped at
    """
    if rec_depth == stop_depth or not nodes:
        return rec_depth

    # Check that nodes is a list or at least exposes a Visit method.
    if not isinstance(nodes, list) and not hasattr(nodes, 'Visit'):
        raise TypeError('nodes must be a list or expose a Visit method')

    adjacent_nodes = list()
    for node in nodes:
        if target_node and node == target_node:
            return rec_depth
        node.Visit()
        adjacent_nodes.append(node)

    # target_node and stop_depth default to None, so a single recursive call
    # covers every combination of options.
    return bfs(adjacent_nodes, target_node, rec_depth + 1, stop_depth)
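
# Example (illustrative sketch): bfs only requires that each element expose a
# Visit method. The GraphNode class below is hypothetical, purely for
# demonstration; TorBot's real node type may differ.
#
#   class GraphNode:
#       def __init__(self, name):
#           self.name = name
#       def Visit(self):
#           print('visiting', self.name)
#
#   depth = bfs([GraphNode('a'), GraphNode('b')], stop_depth=1)
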
def exec_tasks(que, task_func, tasks_args=tuple()):
    """
    Executes tasks inside of queue using function and arguments passed
    inside of threads.

    Args:
        que (queue.Queue): contains tasks
        task_func (function): function to be executed on tasks and args
        tasks_args (tuple): contains arguments for function

    Returns:
        None
    """
    while True:
        task = que.get()
        # If a shared argument tuple was supplied, it is passed along as a
        # single second argument; task_func must accept it as such.
        if tasks_args:
            task_func(task, tasks_args)
        else:
            task_func(task)
        que.task_done()
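
# Example (illustrative sketch): exec_tasks loops forever, so it is meant to
# run as a daemon-thread target (as queue_tasks does below) rather than be
# called directly.
#
#   que = Queue()
#   Thread(target=exec_tasks, args=(que, print), daemon=True).start()
#   que.put('hello')
#   que.join()
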
def queue_tasks(tasks, task_func, tasks_args=tuple()):
    """
    Starts threads with tasks and queue, then queues tasks and the spawned
    threads begin to pull tasks off the queue to execute.

    Args:
        tasks (list): list of values that you'd like to operate on
        task_func (function): function that you would like to use
        tasks_args (tuple): arguments for function

    Returns:
        None
    """
    if tasks_args and not isinstance(tasks_args, tuple):
        raise TypeError('Arguments must be in the form of a tuple.')

    que = Queue(len(tasks)*2)
    # Spawn one daemon worker thread per task; each worker pulls tasks off
    # the queue until the main thread exits.
    for _ in tasks:
        if tasks_args:
            thd = Thread(target=exec_tasks, args=(que, task_func, tasks_args))
        else:
            thd = Thread(target=exec_tasks, args=(que, task_func))
        thd.daemon = True
        thd.start()

    for task in tasks:
        que.put(task)
    que.join()
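
# Example (illustrative sketch): check a batch of urls concurrently by
# pairing queue_tasks with get_url_status (defined below). The urls are
# placeholders, not real services.
#
#   urls = ['http://example.onion', 'http://example2.onion']
#   queue_tasks(urls, get_url_status)
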
# NETWORKING FUNCTIONS
def get_url_status(url, headers=False):
    """
    Uses a GET request to check whether a website exists.

    *NOTE: May look into changing this to HEAD requests to improve perf.

    Args:
        url (str): url to be tested
        headers (dict): optional headers to send with the request

    Returns:
        resp (Response object/int): the Response object from the GET
        request if successful, and zero upon failure
    """
    try:
        if headers:
            resp = requests.get(url, headers=headers)
        else:
            resp = requests.get(url)
        resp.raise_for_status()
        return resp
    except (ConnectionError, HTTPError):
        return 0
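
# Example (illustrative sketch): the url and the User-Agent header below are
# placeholders, not values mandated by TorBot.
#
#   resp = get_url_status('http://example.onion',
#                         headers={'User-Agent': 'TorBot'})
#   if resp != 0:
#       print(resp.status_code)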