diff --git a/crawler.py b/crawler.py index b5e8b3a..c61ba96 100644 --- a/crawler.py +++ b/crawler.py @@ -1,14 +1,17 @@ import urllib.request +from urllib.parse import urlsplit, urlunsplit, urljoin, urlparse import re class Crawler: def __init__(self, url, exclude=None, no_verbose=False): - self.url = url + + self.url = self.normalize(url) + self.host = urlparse(self.url).netloc self.exclude = exclude self.no_verbose = no_verbose self.found_links = [] - self.visited_links = [] + self.visited_links = [self.url] def start(self): self.crawl(self.url) @@ -18,7 +21,7 @@ def start(self): def crawl(self, url): if not self.no_verbose: - print(url) + print("Parsing " + url) response = urllib.request.urlopen(url) page = str(response.read()) @@ -29,29 +32,49 @@ def crawl(self, url): links = [] for link in found_links: - self.add_url(link, links, self.exclude) - self.add_url(link, self.found_links) + is_url = self.is_url(link) + + if is_url: + is_internal = self.is_internal(link) + + if is_internal: + self.add_url(link, links, self.exclude) + self.add_url(link, self.found_links, self.exclude) for link in links: if link not in self.visited_links: + link = self.normalize(link) + self.visited_links.append(link) - self.crawl("{0}{1}".format(self.url, link)) + self.crawl(urljoin(self.url, link)) def add_url(self, link, link_list, exclude_pattern=None): - link = link.rstrip("/") - - if link: - url_parts = link.split("://") + link = self.normalize(link) + if link: not_in_list = link not in link_list - is_internal_link = link[0] is "/" or link.split("/")[0] is self.url.split("/")[0] + excluded = False if exclude_pattern: excluded = re.search(exclude_pattern, link) - if not_in_list and is_internal_link and not excluded: + if not_in_list and not excluded: link_list.append(link) - + def normalize(self, url): + scheme, netloc, path, qs, anchor = urlsplit(url) + return urlunsplit((scheme, netloc, path, qs, anchor)) + + def is_internal(self, url): + host = urlparse(url).netloc + return host == self.host or host == '' + + def is_url(self, url): + scheme, netloc, path, qs, anchor = urlsplit(url) + + if url != '' and scheme in ['http', 'https', '']: + return True + else: + return False \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 0000000..ddbfda9 --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file