In [115]:
import codecs
import csv
from collections import deque
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.parse import urlsplit

import requests
import requests.exceptions
from bs4 import BeautifulSoup

In [129]:
def get_page_title(url, timeout=10):
    """Fetch ``url`` and return the stripped text of its ``<title>`` element.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    str
        The page title with surrounding whitespace removed, or the literal
        string ``"None"`` when the page cannot be fetched or has no
        ``<title>`` element.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # RequestException is the base class of MissingSchema, InvalidURL,
        # InvalidSchema, ConnectionError and Timeout, so every fetch failure
        # maps to the same "None" sentinel instead of crashing the caller.
        return "None"

    title_tag = BeautifulSoup(response.text, 'lxml').title
    if title_tag is None:
        # Pages without a <title> would otherwise raise AttributeError.
        return "None"
    return title_tag.text.strip()

In [138]:
# Seed url: the crawl starts here.
url = 'http://www.bankofamerica.com'

# a queue of urls to be crawled next
new_urls = deque([url])
new_url_title = get_page_title(new_urls[0])

# Output file for the (source, target, title) edge list.  It is intentionally
# left open here; the crawl-loop cell keeps writing to `f` and closes it.
f = codecs.open("crawl.csv", "w", "utf-8")

# Write rows through csv.writer so that page titles containing commas or
# quotes are quoted instead of silently producing extra columns.
# lineterminator='\n' keeps the same line endings as the plain writes did.
header_writer = csv.writer(f, lineterminator='\n')
header_writer.writerow(['url_source', 'url_target', 'page_title_target'])
header_writer.writerow(['None', new_urls[0], new_url_title])

# a set of urls that we have already processed 
processed_urls = set()

# a set of urls inside the target website
local_urls = set()

# a set of urls outside the target website
foreign_urls = set()

# a set of broken urls
broken_urls = set()

In [None]:
# Breadth-first crawl: process queued urls one by one until the queue is
# exhausted or `max_link` urls have been recorded in the CSV.
link_counter = 1            # the seed url already counts as one recorded link
max_link = 30               # total number of urls to record before stopping
max_links_per_page = 20     # stop scanning a page after queuing this many links
request_timeout = 10        # seconds; requests.get has no timeout by default
debug = 1

# An anchor containing any of these substrings is never crawled.  Note that
# ':' rejects every anchor carrying an explicit scheme (http:, mailto:, ...).
skip_tokens = ('#', '?', 'javascript', '.pdf', '.svg', ':', '.go')

# csv.writer quotes titles that contain commas/quotes, keeping three columns.
row_writer = csv.writer(f, lineterminator='\n')

try:
    while len(new_urls) > 0:
        # move url from the queue to processed url set
        url = new_urls.popleft()
        processed_urls.add(url)

        # Download the page.  Any request-level failure (bad schema, DNS or
        # connection error, timeout, ...) marks the url as broken and the
        # crawl moves on to the next queued url.
        try:
            response = requests.get(url, timeout=request_timeout)
        except requests.exceptions.RequestException:
            broken_urls.add(url)
            continue

        # Pieces of the current url used to classify and resolve its links.
        parts = urlsplit(url)
        base = '{0.netloc}'.format(parts)
        strip_base = base.replace('www.', '')
        base_url = '{0.scheme}://{0.netloc}'.format(parts)

        # Parse the page source so the anchors can be enumerated.
        soup = BeautifulSoup(response.text, 'lxml')

        page_link_counter = 0
        for link in soup.find_all('a'):

            if page_link_counter == max_links_per_page:
                break

            # extract link url from the anchor ('' when there is no href)
            anchor = link.attrs.get('href', '')

            if debug: print ('----------------')
            if debug: print (anchor)

            # An anchor is crawlable only if it is longer than one character
            # and contains none of the skip tokens.
            well_formed = len(anchor) > 1 and not any(
                token in anchor for token in skip_tokens
            )

            if anchor.endswith('/'):
                anchor = anchor[:-1]

            # Classify the anchor.  `local_link` stays None for anything that
            # ends up in foreign_urls (which also receives rejected anchors).
            local_link = None
            if anchor.startswith('/') and well_formed:
                if debug: print ('here 1')
                local_link = base_url + anchor
            elif strip_base in anchor and well_formed:
                if debug: print ('here 2')
                local_link = anchor
            elif not anchor.startswith('http') and well_formed:
                if debug: print ('here 3')
                # urljoin resolves the relative anchor against the current
                # url; plain concatenation produced a doubled '/' here.
                local_link = urljoin(url, anchor)
            else:
                if debug: print ('here 4')
                foreign_urls.add(anchor)

            if local_link is not None:
                local_urls.add(local_link)

                # Queue the link unless it is already queued or processed.
                # Every earlier member of local_urls has been handled by a
                # previous iteration, so only the new link needs checking.
                #
                # N.B. 'append' adds to the END of the queue, which makes
                # this a breadth-first crawl.
                if (local_link not in new_urls
                        and local_link not in processed_urls
                        and link_counter < max_link):
                    new_urls.append(local_link)
                    page_link_counter = page_link_counter + 1
                    link_counter = link_counter + 1
                    print ("link_counter, page_link_counter = ",link_counter,page_link_counter)
                    new_url_title = get_page_title(local_link)
                    if debug: print ("%s,%s,%s" % (url,local_link,new_url_title))
                    row_writer.writerow([url, local_link, new_url_title])

            if link_counter >= max_link:
                break

        if link_counter >= max_link:
            print ('Done!!')
            break
finally:
    # Close the CSV even if the crawl is interrupted or raises, so the rows
    # written so far are flushed to disk.
    f.close()