Skip to content

Commit

Permalink
Fixed issue #1
Browse files Browse the repository at this point in the history
  • Loading branch information
Cartman720 committed Sep 16, 2017
1 parent 2c2ae69 commit 478d399
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 13 deletions.
49 changes: 36 additions & 13 deletions crawler.py
@@ -1,14 +1,17 @@
import urllib.request
from urllib.parse import urlsplit, urlunsplit, urljoin, urlparse
import re

class Crawler:

def __init__(self, url, exclude=None, no_verbose=False):
    """Initialise crawl state for the site rooted at ``url``.

    Args:
        url: Start URL. It is stored after being passed through
            ``self.normalize``.
        exclude: Optional pattern kept as ``self.exclude``; ``crawl``
            forwards it to ``add_url`` as the exclusion pattern.
        no_verbose: When true, ``crawl`` does not print progress lines.
    """
    # Assign each attribute exactly once; the normalised form is the one
    # every other method reads.
    self.url = self.normalize(url)
    # Network location of the start URL, compared against in is_internal().
    self.host = urlparse(self.url).netloc
    self.exclude = exclude
    self.no_verbose = no_verbose
    self.found_links = []
    # Seed with the start URL so a link equal to it is already "visited".
    self.visited_links = [self.url]

def start(self):
self.crawl(self.url)
Expand All @@ -18,7 +21,7 @@ def start(self):

def crawl(self, url):
if not self.no_verbose:
print(url)
print("Parsing " + url)

response = urllib.request.urlopen(url)
page = str(response.read())
Expand All @@ -29,29 +32,49 @@ def crawl(self, url):
links = []

for link in found_links:
self.add_url(link, links, self.exclude)
self.add_url(link, self.found_links)
is_url = self.is_url(link)

if is_url:
is_internal = self.is_internal(link)

if is_internal:
self.add_url(link, links, self.exclude)
self.add_url(link, self.found_links, self.exclude)

for link in links:
if link not in self.visited_links:
link = self.normalize(link)

self.visited_links.append(link)
self.crawl("{0}{1}".format(self.url, link))
self.crawl(urljoin(self.url, link))

# Append `link` to `link_list` when it is non-empty, not already in the list
# and not matched by `exclude_pattern` (a pattern handed to re.search).
# NOTE(review): this span comes from a rendered diff whose +/- markers and
# indentation were stripped, so removed and added lines appear side by side
# (two `link = ...` lines, two `if link:` guards, two final conditions).
# Confirm the real nesting against the repository before relying on it.
def add_url(self, link, link_list, exclude_pattern=None):
link = link.rstrip("/")

if link:
# NOTE(review): url_parts is not read anywhere in the visible lines.
url_parts = link.split("://")
link = self.normalize(link)

if link:
not_in_list = link not in link_list
# NOTE(review): `is` tests object identity, not string equality; a value
# comparison needs `==`. Presumably one of the lines the commit removed.
is_internal_link = link[0] is "/" or link.split("/")[0] is self.url.split("/")[0]

excluded = False

if exclude_pattern:
# re.search returns a match object or None, used here only for truthiness.
excluded = re.search(exclude_pattern, link)

if not_in_list and is_internal_link and not excluded:
if not_in_list and not excluded:
link_list.append(link)



def normalize(self, url):
    """Return ``url`` rebuilt from its parsed components.

    The URL is split with ``urlsplit`` and reassembled with ``urlunsplit``,
    so the result is whatever canonical spelling that round trip yields
    (for example, the scheme comes back lower-cased).
    """
    # urlsplit's result is a 5-tuple, which urlunsplit accepts directly.
    return urlunsplit(urlsplit(url))

def is_internal(self, url):
    """Return True when ``url`` belongs to the crawled site.

    A URL counts as internal when it has no network location at all
    (a relative link such as ``/about``) or when its network location
    equals ``self.host``.

    Args:
        url: Absolute or relative URL string.

    Returns:
        bool: True for relative links and links on ``self.host``.
    """
    host = urlparse(url).netloc
    # Host names are case-insensitive, so compare them case-folded;
    # otherwise "http://EXAMPLE.com/" would be treated as a foreign site.
    return host == '' or host.lower() == self.host.lower()

def is_url(self, url):
    """Return True when ``url`` is non-empty and is http(s) or scheme-less.

    Links with any other scheme (``mailto:``, ``javascript:``, ``ftp:`` ...)
    and the empty string yield False.
    """
    # Guard clause: an empty link is never a URL.
    if url == '':
        return False
    # An empty scheme means a relative link, which is accepted as well.
    return urlsplit(url).scheme in ('http', 'https', '')
2 changes: 2 additions & 0 deletions sitemap.xml
@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>

0 comments on commit 478d399

Please sign in to comment.