In [20]:
import sqlite3
import urllib.error
import ssl
from urllib.parse import urljoin, urlparse
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Build a TLS context that performs no certificate or hostname verification
ctx = ssl.create_default_context()
# Tuple targets are assigned left to right, so hostname checking is switched
# off before verify_mode is relaxed to CERT_NONE (same order as two statements)
ctx.check_hostname, ctx.verify_mode = False, ssl.CERT_NONE

# Schema definitions, kept as data so they can be applied in one loop.
# Pages: one row per URL with its HTML, an error code and two rank columns.
_PAGES_DDL = (
    '\n'
    '            CREATE TABLE IF NOT EXISTS Pages\n'
    '            (id INTEGER PRIMARY KEY, \n'
    '            url TEXT UNIQUE, \n'
    '            html TEXT,\n'
    '            error INTEGER, old_rank REAL, new_rank REAL)\n'
    '            \n'
    '            '
)

# Links: directed edges between Pages ids; each (from_id, to_id) pair is unique.
_LINKS_DDL = (
    '\n'
    '            CREATE TABLE IF NOT EXISTS Links\n'
    '            (from_id INTEGER, \n'
    '            to_id INTEGER,\n'
    '            UNIQUE(from_id, to_id))\n'
    '            \n'
    '            '
)

# Webs: the URL prefixes of the sites being crawled.
_WEBS_DDL = '''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)'''

# Open the database file (SQLite creates it when it is missing)
conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

# Create whichever of the three tables do not exist yet, in the original order
for ddl in (_PAGES_DDL, _LINKS_DDL, _WEBS_DDL):
    cur.execute(ddl)

# A page with neither HTML nor an error recorded means an earlier crawl is unfinished
cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
row = cur.fetchone()
if row is None:
    # Nothing pending: ask where to start, falling back to the default site
    starturl = input('Enter web url or enter: ')
    if not starturl:
        starturl = 'http://www.dr-chuck.com/'  # Default URL
    if starturl[-1:] == '/':
        starturl = starturl[:-1]

    # For a .htm/.html page the tracked prefix is everything before the last '/';
    # otherwise the start URL itself is the prefix
    is_page = starturl.endswith(('.htm', '.html'))
    web = starturl[:starturl.rfind('/')] if is_page else starturl

    # Seed the Webs and Pages tables with the starting point
    if len(web) > 1:
        cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', (web,))
        cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', (starturl,))
        conn.commit()
else:
    print("Restarting existing crawl. Remove spider.sqlite to start a fresh crawl.")

# Load every tracked site prefix from the Webs table as a list of strings
cur.execute('''SELECT url FROM Webs''')
webs = [str(record[0]) for record in cur]

print(webs)

# Crawl loop: each pass picks one unretrieved page at random, downloads it,
# stores its HTML and records links to pages inside the tracked sites.
# After `many` pages the user is asked how many more to fetch; an empty
# answer ends the crawl.
many = 100  # Number of pages to retrieve before prompting again
site_prefixes = tuple(webs)  # str.startswith accepts a tuple of prefixes
while True:
    if many < 1:
        sval = input('How many pages:')
        if len(sval) < 1:
            break
        try:
            many = int(sval)
        except ValueError:
            # Non-numeric answer: ask again rather than crash the crawl
            print('Please enter a whole number')
            continue
    many = many - 1

    # Fetch the next page to crawl (no HTML stored and no error recorded)
    cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
    row = cur.fetchone()
    if row is None:
        print('No unretrieved HTML pages found')
        many = 0
        break
    fromid, url = row

    print(fromid, url, end=' ')

    # Drop any links previously recorded from this page before re-reading it
    cur.execute('DELETE from Links WHERE from_id=?', (fromid,))
    try:
        # Download the page; the with-block closes the response once it is read
        with urlopen(url, context=ctx) as document:
            html = document.read()
            status = document.getcode()
            content_type = document.info().get_content_type()

        # Record a non-200 status on the page row
        if status != 200:
            print("Error on page:", status)
            cur.execute('UPDATE Pages SET error=? WHERE url=?', (status, url))

        # Pages that are not HTML are removed from the crawl entirely
        if 'text/html' != content_type:
            print("Ignore non text/html page")
            cur.execute('DELETE FROM Pages WHERE url=?', (url,))
            conn.commit()
            continue

        print('(' + str(len(html)) + ')', end=' ')

        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")
    except KeyboardInterrupt:
        print('Program interrupted by user...')
        # Discard the pending DELETE so the interrupted page is left untouched
        conn.rollback()
        break
    except Exception:
        # Any download or parse failure marks the page so it is not retried
        print("Unable to retrieve or parse page")
        cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url,))
        conn.commit()
        continue

    # Store the HTML content; it is committed together with the page's links
    # below, so a page is never saved as retrieved without its outgoing links
    cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', (url,))
    cur.execute('UPDATE Pages SET html=? WHERE url=?', (memoryview(html), url))

    # Walk every anchor tag on the page
    count = 0
    for tag in soup('a'):
        href = tag.get('href', None)
        if href is None:
            continue

        # Resolve relative URLs to absolute URLs
        try:
            up = urlparse(href)
            if len(up.scheme) < 1:
                href = urljoin(url, href)
        except ValueError:
            # A malformed href must not abort the whole crawl; skip it
            continue

        # Ignore fragment identifiers (anything after #)
        ipos = href.find('#')
        if ipos > 1:
            href = href[:ipos]

        # Skip common image files and normalise away a trailing slash
        if href.endswith(('.png', '.jpg', '.gif')):
            continue
        if href.endswith('/'):
            href = href[:-1]

        if len(href) < 1:
            continue

        # Only follow links that stay inside one of the tracked sites
        if not href.startswith(site_prefixes):
            continue

        # Insert the new page URL into the Pages table (if not already present)
        cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', (href,))
        count = count + 1

        # Look up the id of the linked page
        cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', (href,))
        row = cur.fetchone()
        if row is None:
            print('Could not retrieve id')
            continue
        toid = row[0]

        # Record the edge from the current page to the linked page
        cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', (fromid, toid))

    # One commit per page: HTML, newly discovered pages and links land together
    conn.commit()
    print(count)

# Flush anything still pending, then release the cursor and the database
conn.commit()
cur.close()
conn.close()


Enter web url or enter:  https://cpdaccredited.wixsite.com/cld-institute


['http://www.dr-chuck.com', 'https://www.duepoint.net', 'https://generationalwealthafrica.com', 'https://cpdaccredited.wixsite.com/cld-institute']
29 https://cpdaccredited.wixsite.com/cld-institute (610705) 10
35 https://cpdaccredited.wixsite.com/cld-institute/news-resources-1 (631779) 10
30 https://cpdaccredited.wixsite.com/cld-institute/about (636488) 10
33 https://cpdaccredited.wixsite.com/cld-institute/blank-page-2 (625103) 10
36 https://cpdaccredited.wixsite.com/cld-institute/contact-us (650711) 10
31 https://cpdaccredited.wixsite.com/cld-institute/courses (881498) 10
34 https://cpdaccredited.wixsite.com/cld-institute/blank-page-1-1 (639465) 10
32 https://cpdaccredited.wixsite.com/cld-institute/publications-1 (672237) 10
No unretrieved HTML pages found
