In [3]:
import requests
import hashlib
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
import openai

# Access environment variable
api_key = os.environ.get('API_KEY')

# openai API key
openai.api_key = api_key

# Import ChunkedEncodingError from http.client
from http.client import IncompleteRead

# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^http[s]*://.+'

# Define root domain and URL to crawl
domain = "bradleyspiclin.github.io"
full_url = "https://bradleyspiclin.github.io/PortfolioProject/"

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)

        # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
        if tag == "a" and "href" in attrs:
            self.hyperlinks.append(attrs["href"])

# Function to get the hyperlinks from a URL
def get_hyperlinks(url):
    
    # Try to open the URL and read the HTML
    try:
        # Open the URL and read the HTML
        with urllib.request.urlopen(url) as response:

            # If the response is not HTML, return an empty list
            if not response.info().get('Content-Type').startswith("text/html"):
                return []
            
            # Decode the HTML
            html = response.read().decode('utf-8')
    except Exception as e:
        print(e)
        return []

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks

# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    clean_links = []
    for link in set(get_hyperlinks(url)):
        clean_link = None

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            # Parse the URL and check if the domain is the same
            url_obj = urlparse(link)
            if url_obj.netloc == local_domain:
                clean_link = link

        # If the link is not a URL, check if it is a relative link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.startswith("#") or link.startswith("mailto:"):
                continue
            clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            if clean_link.endswith("/"):
                clean_link = clean_link[:-1]
            clean_links.append(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(set(clean_links))

def clean_url_for_filename(url):
    # Remove any non-word characters (characters other than letters, digits, and underscores)
    cleaned_url = re.sub(r'\W+', '_', url)

    # Replace percent sign (%) with underscore (_) in the cleaned URL
    cleaned_url = cleaned_url.replace('%', '_')

    # Add a prefix to the filename to differentiate between different URLs from the same domain
    cleaned_url = "page_" + cleaned_url

    # Limit the filename length to a reasonable value to avoid long filenames
    max_filename_length = 100
    cleaned_url = cleaned_url[:max_filename_length]

    # Remove additional invalid characters
    invalid_chars = '<>:"/\\|?*'
    cleaned_url = re.sub(r'[{}]+'.format(re.escape(invalid_chars)), '', cleaned_url)

    # Remove any trailing dots or underscores
    cleaned_url = cleaned_url.strip('._')

    return cleaned_url

# Helper function to get last part of a URL
def get_last_url_part(url):
    return url.rsplit("/", 1)[-1]

def crawl(url):
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = set([url])

    # Create directories to store text files if they don't exist
    text_directory = os.path.join("text", local_domain)
    os.makedirs(text_directory, exist_ok=True)

    # While the queue is not empty, continue crawling
    while queue:

        # Get the next URL from the queue
        url = queue.pop()
        print(url) # for debugging and to see the progress

        # Get the SHA-1 hash of the URL
        url_hash = hashlib.sha1(url.encode()).hexdigest()

        # Save text from the url to a <hash>.txt file
        text_path = os.path.join(text_directory, url_hash + ".txt")
        
        # Check if the URL ends with certain extensions and exclude them
        if url.lower().endswith((".zip", ".pdf", ".xsl")):
            print("Skipping file:", url)
            continue
       
       # Get the text from the URL using BeautifulSoup
        try:
            soup = BeautifulSoup(requests.get(url).text, "html.parser")
        except IncompleteRead as e:
            print(f"Error occurred while fetching {url}: {e}")
            continue
        except Exception as e:
            print(f"Error occurred while fetching {url}: {e}")
            continue
        # Get the text but remove the tags
        text = soup.get_text()

        # If the crawler gets to a page that requires JavaScript, it will stop the crawl
        if "You need to enable JavaScript to run this app." in text:
            print("Unable to parse page " + url + " due to JavaScript being required")
            continue

        # Otherwise, write the text to the file in the text directory
        with open(text_path, "w", encoding="utf-8") as f:
            f.write(text)

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)

crawl(full_url)

https://bradleyspiclin.github.io/PortfolioProject/
https://bradleyspiclin.github.io/files/Bradley-Spiclin-Resume.pdf
Skipping file: https://bradleyspiclin.github.io/files/Bradley-Spiclin-Resume.pdf
https://bradleyspiclin.github.io/BookWebApp/#
