In [231]:
import codecs
import csv
from collections import deque
from urllib.parse import urlparse
from urllib.parse import urlsplit

import requests
import requests.exceptions
from bs4 import BeautifulSoup

In [236]:
# Define a function that will return the title of the url passed to it.

def get_page_title(url, timeout=10):
    """Fetch ``url`` and return the stripped text of its ``<title>`` element.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so an unresponsive server cannot stall
        the caller indefinitely.  Defaults to 10 seconds.

    Returns
    -------
    str
        The title text with surrounding whitespace removed, or the literal
        string ``"None"`` when the page cannot be fetched or parsed, or when
        it has no ``<title>`` element.
    """
    try:
        response = requests.get(url, timeout=timeout)
        titlesoup = BeautifulSoup(response.text, 'lxml')
        # ``titlesoup.title`` is None for a page without a <title>; the
        # resulting AttributeError is handled below like any other failure.
        return titlesoup.title.text.strip()
    except Exception:
        # Best-effort lookup: any ordinary failure yields the "None"
        # placeholder.  Catching Exception (rather than a bare ``except``)
        # lets KeyboardInterrupt and SystemExit propagate, so a running crawl
        # can still be interrupted.
        return "None"

In [233]:
# Define a function that will return True if the anchor is "well formed", that is,
# if it is a basic website url, and False otherwise.

def well_formed_anchor(anchor,debug):
    """Report whether ``anchor`` passes every "plain link" check.

    Parameters
    ----------
    anchor : str
        The raw ``href`` value to inspect.
    debug : bool
        When true, print seven of the individual check results.

    Returns
    -------
    bool
        True only if the anchor is longer than one character and contains
        none of: ``#``, ``javascript``, ``?``, ``.pdf``, ``.svg``, ``.go``,
        ``..``, nor a ``:`` anywhere after its first six characters.
    """
    long_enough = len(anchor) > 1
    fragment_free = '#' not in anchor
    script_free = 'javascript' not in anchor
    query_free = '?' not in anchor
    pdf_free = '.pdf' not in anchor
    svg_free = '.svg' not in anchor
    # Only the text after the first six characters is searched for a colon,
    # presumably so the colon of a leading "http:" / "https:" is ignored.
    colon_free = ':' not in anchor[6:]
    go_free = '.go' not in anchor
    parent_ref_free = '..' not in anchor

    if debug:
        # The '.pdf' and '..' results are not part of the debug line.
        print (fragment_free,long_enough,script_free,query_free,svg_free,colon_free,go_free)

    return all((
        long_enough,
        parent_ref_free,
        fragment_free,
        script_free,
        query_free,
        pdf_free,
        svg_free,
        colon_free,
        go_free,
    ))

In [234]:
# Define the url of the root website
url = 'http://www.bankofamerica.com/'

# Create a queue of urls to be crawled; initialize it with the root website url
new_urls = deque([url])

# Get the title of the root website url
new_url_title = get_page_title(new_urls[0])
print ("%s,%s,%s" % ('None',new_urls[0],new_url_title))

# Open a file to write everything to.  The handle stays open on purpose: the
# crawl loop keeps appending rows to it and closes it when it finishes.
f = codecs.open("crawl.csv", "w", "utf-8")

# Write the header line and the line for the root website url to the file.
# The rows go through csv.writer so that a field containing a comma (page
# titles frequently do) is quoted instead of silently adding extra columns.
root_writer = csv.writer(f, lineterminator='\n')
root_writer.writerow(['url_source', 'url_target', 'page_title_target'])
root_writer.writerow(['None', new_urls[0], new_url_title])

# Now, create some sets to store various pieces of information

# 1. a set of urls that we have already processed 
processed_urls = set()

# 2. a set of domains inside the target website
local_urls = set()

# 3. a set of domains outside the target website
foreign_urls = set()

# 4. a set of broken urls
broken_urls = set()

None,http://www.bankofamerica.com/,Bank of America - Banking, Credit Cards, Loans and Merrill Investing


In [235]:
# Define a counter variable for the number of links that have been found across
# all pages crawled thus far.  Initialize it to 1, as we have already added the
# root website url to the list.
link_counter = 1

# Define the maximum number of links to find before stopping
max_link = 100

# Define the maximum number of new links to take from any single page
max_page_links = 20

# Seconds to wait on a server before the url is treated as broken
request_timeout = 10

# Define a flag to turn on debugging output, if necessary
debug = False

# Rows are written through csv.writer so that a url or page title containing a
# comma is quoted rather than silently producing extra columns.
row_writer = csv.writer(f, lineterminator='\n')

# The try/finally guarantees the output file is closed even if the crawl is
# interrupted or fails part-way through.
try:
    # process urls one by one until we exhaust the queue
    while len(new_urls)>0:

        # move url from the queue to processed url set
        url = new_urls.popleft()
        processed_urls.add(url)

        # Now, get the source code of the current url being processed.  If the
        # request fails for any reason requests reports (malformed url,
        # connection problem, timeout, too many redirects, ...), add this url
        # to the broken_urls set and move on.  RequestException is the common
        # base class of all of those errors.
        try:
            response = requests.get(url, timeout=request_timeout)
        except requests.exceptions.RequestException:
            broken_urls.add(url)
            continue

        # Break up the url into various useful parts and define some
        # variables that can be referred to later
        parts = urlsplit(url)
        base = '{0.netloc}'.format(parts)
        strip_base = base.replace('www.', '')
        base_url = '{0.scheme}://{0.netloc}'.format(parts)
        path = url[:url.rfind('/')+1] if '/' in parts.path else url

        # Add the base_url, with "www" removed, to the processed_urls set
        # This will mean "http://www.mydomain.com" and "http://mydomain.com"
        # will not be treated as separate pages.
        processed_urls.add(base_url.replace('www.',''))

        if debug: print (strip_base)
        if debug: print (base_url)
        if debug: print (path)

        # Extract the source code for this url into a BeautifulSoup object for processing
        soup = BeautifulSoup(response.text, 'lxml')

        # Define a counter for the number of links found within the current url being crawled
        page_link_counter = 0

        # Loop through all of the links found on the page corresponding to this url
        for link in soup.find_all('a'):

            # If we have already taken the per-page maximum, jump out of this loop
            if page_link_counter == max_page_links:
                break

            # extract link url from the anchor
            anchor = link.attrs.get('href', '')

            if debug: print ('----------------')
            if debug: print (anchor)

            # Check to see if the anchor is well-formed (this is done on the
            # anchor as written, before any trailing "/" is removed)
            well_formed = well_formed_anchor(anchor,debug)

            # If the anchor has a trailing "/" character, remove it
            if anchor.endswith('/'):
                anchor = anchor[:-1]

            # Check the form of the anchor, and based on this specific form, create a
            # local_link value that can be added to the set of local urls to be crawled.
            # Anything else (including anchors that are not well-formed) is recorded
            # in the set of foreign urls.
            local_link = None
            if anchor.startswith('/') and well_formed:
                if debug: print ('here 1')
                local_link = base_url + anchor
            elif strip_base in anchor and well_formed:
                if debug: print ('here 2')
                local_link = anchor
            elif not anchor.startswith('http') and well_formed:
                if debug: print ('here 3')
                if path.endswith('/'):
                    local_link = path + anchor
                else:
                    local_link = path + '/' + anchor
            else:
                if debug: print ('here 4')
                foreign_urls.add(anchor)

            if local_link is not None:
                local_urls.add(local_link)

                # Queue the link if it is not already waiting in new_urls, has not
                # already been processed, and the max_link count has not been reached.
                #
                # Only the link just discovered needs checking: every url added to
                # local_urls earlier was either queued at that time or was already
                # queued/processed, so re-scanning the whole set finds nothing new.
                #
                # N.B.  'append' adds the url to the END of new_urls,
                # so this is a breadth-first crawling algorithm!
                if (local_link not in new_urls
                        and local_link not in processed_urls
                        and link_counter < max_link):

                    new_urls.append(local_link)

                    # Update the counters
                    page_link_counter = page_link_counter + 1
                    link_counter = link_counter + 1

                    new_url_title = get_page_title(local_link)
                    row_writer.writerow([url, local_link, new_url_title])
                    if debug: print ("%s,%s,%s" % (url,local_link,new_url_title))

                    print ("link_counter, page_link_counter = ",link_counter,page_link_counter)

            if link_counter >= max_link:
                break

        if link_counter >= max_link:
            print ('Done!!')
            break
finally:
    # Close the output file
    f.close()

link_counter, page_link_counter =  2 1
link_counter, page_link_counter =  3 2
link_counter, page_link_counter =  4 3
link_counter, page_link_counter =  5 4
link_counter, page_link_counter =  6 5
link_counter, page_link_counter =  7 6
link_counter, page_link_counter =  8 7
link_counter, page_link_counter =  9 8
link_counter, page_link_counter =  10 9
link_counter, page_link_counter =  11 10
link_counter, page_link_counter =  12 11
link_counter, page_link_counter =  13 12
link_counter, page_link_counter =  14 13
link_counter, page_link_counter =  15 14
link_counter, page_link_counter =  16 15
link_counter, page_link_counter =  17 16
link_counter, page_link_counter =  18 17
link_counter, page_link_counter =  19 18
link_counter, page_link_counter =  20 19
link_counter, page_link_counter =  21 20
link_counter, page_link_counter =  22 1
link_counter, page_link_counter =  23 2
link_counter, page_link_counter =  24 3
link_counter, page_link_counter =  25 4
link_counter, page_link_counter =  26