In [9]:
import aiohttp
import asyncio
from typing import Dict, Any, Callable
from bs4 import BeautifulSoup
from typing import List
import re


async def main():

    async with aiohttp.ClientSession() as session:
        async with session.get('http://python.org') as response:

            print("Status:", response.status)
            print("Content-type:", response.headers['content-type'])

            html = await response.text()
            print("Body:", html[:15], "...")


def pattern_filter(
    urls: List[str],
    regex_patterns: List[str],
) -> List[str]:
    """
    Searches a list of urls for regular expression patterns and keeps
    only those that match one of the patterns. This is a whitelist
    style pattern matching function.
    """
    regex_patterns = [re.compile(pattern) for pattern in regex_patterns]
    filtered_urls = []
    for url in urls:
        for pattern in regex_patterns:
            match = pattern.search(url)
            if match is not None:
                filtered_urls.append(url)
                break
    
    print(filtered_urls)


async def crawler(
    url_queue: asyncio.Queue,
    url_filter: Dict[str, Any],
    session: aiohttp.ClientSession,
    response_queue: asyncio.Queue,
    stop: asyncio.Event,
) -> None:
    """
    Simple asynchronous crawling function that continuously reads
    urls from the url queue and adds the responses to a response
    queue.
    """
    while not stop.is_set():
        url = await url_queue.get()
        print(url)

        # Get the data from the url
        soup = None
        async with session.get(url) as response:
            if response.status == 200:
                html = await response.text()
                soup = BeautifulSoup(html, "lxml")

        # Add all the relevant links to the queue
        if soup is not None:
            all_links = soup.find_all("a")
            all_links = [link.get("href") for link in all_links]

        filter_func = url_filter["filter_func"]
        filter_kwargs = url_filter["kwargs"]
        addable_urls = filter_func(all_links, **filter_kwargs)
        break


url_queue = asyncio.Queue()
event = asyncio.Event()
await url_queue.put("https://caseyhandmer.wordpress.com/")


import time

async with aiohttp.ClientSession() as session:
    await crawler(
        url_queue,
        url_filter={"filter_func": pattern_filter, "kwargs": {"regex_patterns": ["https://"]}},
        session=session,
        response_queue=[],
        stop=event,
    )
    event.set()

https://caseyhandmer.wordpress.com/
['https://caseyhandmer.wordpress.com/', 'https://caseyhandmer.wordpress.com/', 'https://caseyhandmer.wordpress.com/contact/', 'https://caseyhandmer.wordpress.com/2023/10/19/future-of-energy-reading-list/#comments', 'https://caseyhandmer.wordpress.com/2023/10/19/future-of-energy-reading-list/', 'https://caseyhandmer.wordpress.com/2023/10/19/future-of-energy-reading-list/', 'https://caseyhandmer.wordpress.com/2023/10/19/future-of-energy-reading-list/', 'https://caseyhandmer.wordpress.com/author/cjhandmer/', 'https://caseyhandmer.wordpress.com/2023/10/19/future-of-energy-reading-list/', 'https://caseyhandmer.wordpress.com/2022/12/13/mars-trilogy-technical-commentary/#respond', 'https://caseyhandmer.wordpress.com/2022/12/13/mars-trilogy-technical-commentary/', 'https://caseyhandmer.wordpress.com/2022/12/13/mars-trilogy-technical-commentary/', 'https://caseyhandmer.wordpress.com/2022/12/13/mars-trilogy-technical-commentary/', 'https://caseyhandmer.wordpre