Fixed issue #1

Cartman720 · Sep 16, 2017 · 478d399 · 478d399
1 parent 2c2ae69
commit 478d399
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 13 deletions.
diff --git a/crawler.py b/crawler.py
@@ -1,14 +1,17 @@
 import urllib.request
+from urllib.parse import urlsplit, urlunsplit, urljoin, urlparse
 import re
 
 class Crawler:
 
 	def __init__(self, url, exclude=None, no_verbose=False):
-		self.url = url
+
+		self.url = self.normalize(url)
+		self.host = urlparse(self.url).netloc
 		self.exclude = exclude
 		self.no_verbose = no_verbose
 		self.found_links = []
-		self.visited_links = []
+		self.visited_links = [self.url]
 
 	def start(self):
 		self.crawl(self.url)
@@ -18,7 +21,7 @@ def start(self):
 
 	def crawl(self, url):
 		if not self.no_verbose:
-			print(url)
+			print("Parsing " + url)
 
 		response = urllib.request.urlopen(url)
 		page = str(response.read())
@@ -29,29 +32,49 @@ def crawl(self, url):
 		links = []
 
 		for link in found_links:
-			self.add_url(link, links, self.exclude)
-			self.add_url(link, self.found_links)
+			is_url = self.is_url(link)
+
+			if is_url:
+				is_internal = self.is_internal(link)
+
+				if is_internal:
+					self.add_url(link, links, self.exclude)
+					self.add_url(link, self.found_links, self.exclude)
 
 		for link in links:
 			if link not in self.visited_links:
+				link = self.normalize(link)
+
 				self.visited_links.append(link)
-				self.crawl("{0}{1}".format(self.url, link))
+				self.crawl(urljoin(self.url, link))
 
 	def add_url(self, link, link_list, exclude_pattern=None):
-		link = link.rstrip("/")
-
-		if link:
-			url_parts = link.split("://")
+		link = self.normalize(link)
 
+		if link:			
 			not_in_list = link not in link_list
-			is_internal_link = link[0] is "/" or link.split("/")[0] is self.url.split("/")[0] 
+
 			excluded = False
 
 			if exclude_pattern:
 				excluded = re.search(exclude_pattern, link)
 
-			if not_in_list and is_internal_link and not excluded:
+			if not_in_list and not excluded:
 				link_list.append(link)
 
 
-
+	def normalize(self, url):
+		scheme, netloc, path, qs, anchor = urlsplit(url)
+		return urlunsplit((scheme, netloc, path, qs, anchor))
+
+	def is_internal(self, url):
+		host = urlparse(url).netloc
+		return host == self.host or host == ''	
+
+	def is_url(self, url):
+		scheme, netloc, path, qs, anchor = urlsplit(url)
+
+		if url != '' and scheme in ['http', 'https', '']:
+			return True 
+		else:
+			return False
diff --git a/sitemap.xml b/sitemap.xml
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+	<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>