# Threading

In [40]:
# Synchronous

import requests
import time
import concurrent.futures
import threading

def download_site(url):
    """Fetch ``url`` with a one-off GET and print a one-letter progress mark.

    The response is discarded. "J" is printed when the URL contains
    "jython", otherwise "R"; no newline is written and stdout is flushed.
    """
    requests.get(url)
    if "jython" in url:
        indicator = "J"
    else:
        indicator = "R"
    print(indicator, sep='', end='', flush=True)

def download_site_verbose(url):
    """Fetch ``url`` through a fresh ``requests.Session`` and print a progress mark.

    A new session is opened and closed on every call. "J" is printed when
    the URL contains "jython", otherwise "R" (no newline, flushed).
    """
    marker = "J" if "jython" in url else "R"
    with requests.Session() as session:
        session.get(url)
        print(marker, sep='', end='', flush=True)

# Two endpoints, alternating, 160 URLs in total.
sites = 80 * [
    "https://www.jython.org",
    "http://olympus.realpython.org/dice",
]

In [39]:
# Baseline: fetch every URL one after another on the calling thread.
start = time.perf_counter()
for url in sites:
    download_site(url)
duration = time.perf_counter() - start  # wall-clock seconds for the whole loop
print(f"Downloaded {len(sites)} sites in {duration} seconds")

JRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRJRDownloaded 160 sites in 8.650328458010335 seconds


In [31]:
# Threaded variant: up to five worker threads call download_site over `sites`.
start = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Drain the iterator returned by map(): an exception raised inside a
    # worker is only re-raised when its result is retrieved, so discarding
    # the iterator would silently hide failed downloads.
    list(executor.map(download_site, sites))
duration = time.perf_counter() - start  # includes waiting for the pool to shut down
print(f"Downloaded {len(sites)} sites in {duration} seconds")

RRJJJRRRJRRJJJJRRRJRJRJJJRRJRJJRRJRJRJRJRJRJRJJJRJRRRJRJRJJRJRRJJRJRRRJJRJRJJJRRRJRJJRRJJJRRJRJRJRRJJRJRRJRJJRJRJRJRJRRJRJJRRJRJRJJRJRRJJJRRRJJRRJRJRJJRJJRRJRJRDownloaded 160 sites in 1.5454192079923814 seconds


In [41]:
# Threaded variant using download_site_verbose (one Session per call).
start = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Drain the iterator returned by map(): an exception raised inside a
    # worker is only re-raised when its result is retrieved, so discarding
    # the iterator would silently hide failed downloads.
    list(executor.map(download_site_verbose, sites))
duration = time.perf_counter() - start  # includes waiting for the pool to shut down
print(f"Downloaded {len(sites)} sites in {duration} seconds")

JRRJRRJJRJJRRRJJJRRJRJRJRJRJRRJRJJRRJJJRRRJJRRJJRJRJJRJRJRRRJRJJRJRRJRJRJRRJJJRJRJRJRJRJJRRJRJJRRJJRRJRJRJRJRRJJJRRRJRRJJJRRJRJRRJJJRJJRRRJJRJRJJRJRRRJJRJRJRJRJDownloaded 160 sites in 1.9429268749954645 seconds


In [45]:
# Connection reuse leads to a speed-up

from itertools import repeat
def download_site_verbose(session, url):
    """GET ``url`` through the caller-supplied ``session`` and print a progress mark.

    Note: this definition replaces the one-argument ``download_site_verbose``
    defined earlier in the notebook. The response is discarded. "J" is
    printed when the URL contains "jython", otherwise "R" (no newline,
    flushed).
    """
    session.get(url)
    print("J" if "jython" in url else "R", sep='', end='', flush=True)

# Threaded download where every worker goes through the same Session.
# NOTE(review): one requests.Session is shared by all five threads here;
# confirm that sharing a Session across threads is acceptable for the
# requests version in use.
start = time.perf_counter()
with requests.Session() as session:
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # repeat(session) pairs the single session with every URL.
        # Drain the iterator so that an exception raised in a worker is
        # re-raised here instead of being silently dropped.
        list(executor.map(download_site_verbose, repeat(session), sites))
duration = time.perf_counter() - start
print(f"Downloaded {len(sites)} sites in {duration} seconds")

RRJJJJRRJRJRJJRRJRRJRJJRJRRJRJJRJRJJRJJRRRJJRJRRJJRRJRRJJRRJJRRJJRRJJRRJJJRJRRRJJRJJRRJJRRJJRJRRRJJRJJJRRRJRRJJJRRRRJJJJRRJJJRRRRJJRJJRRJJRJRRRJJRJJRRRJJJRRJRRJDownloaded 160 sites in 0.8504934589873301 seconds


In [87]:
# Locking, submit
class Account:
    """Toy account whose balance is modified under a lock."""

    def __init__(self):
        # `lock` serialises the read-modify-write sequence in update();
        # `balance` is the data shared between threads.
        self.lock = threading.Lock()
        self.balance = 100

    def update(self, transaction, amount):
        """Add ``amount`` (may be negative) to the balance.

        ``transaction`` is only used as a label in the progress messages.
        The read, the one-second pause and the write-back all happen while
        the lock is held.
        """
        print(f'{transaction} thread updating...')
        with self.lock:
            new_balance = self.balance + amount
            # Pause between the read and the write while holding the lock.
            time.sleep(1)
            self.balance = new_balance
        print(f'{transaction} thread finishing...')

# Run a deposit and a withdrawal against the same account on two threads.
account = Account()
print(f'starting with balance of {account.balance}')
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as ex:
    futures = [
        ex.submit(account.update, transaction, amount)
        for transaction, amount in [('deposit', 50), ('withdrawal', -150)]
    ]
# Leaving the `with` block waits for both tasks. Fetch each result so that an
# exception raised inside update() is re-raised here rather than being lost
# with the discarded Future.
for future in futures:
    future.result()
print(f'ending balance of {account.balance}')

starting with balance of 100
deposit thread updating...
withdrawal thread updating...
deposit thread finishing...
withdrawal thread finishing...
ending balance of 0


In [103]:
# Event
event = threading.Event()  # a freshly created Event reports is_set() == False
print(event.is_set())
event.set()  # raise the flag
print(event.is_set())
event.clear()  # lower the flag again
print(event.is_set())

False
True
False


In [107]:
# Semaphore counter demo: acquire() takes one unit, release() gives it back.
s = threading.Semaphore(value=10)
s.acquire()
# `_value` is a private attribute of the semaphore; it is read here only to
# display the current counter.
print(s._value)
s.release()
print(s._value)

9
10


In [123]:
import concurrent.futures
import random
import threading
import time

def welcome(semaphore, stop, poll_interval=0.1):
    """Admit visitors one at a time until ``stop`` is set.

    Each iteration prints a welcome line, takes one unit from ``semaphore``,
    bumps the visitor counter and then pauses for a random fraction of a
    second.

    Args:
        semaphore: ``threading.Semaphore`` limiting the number of visitors.
        stop: ``threading.Event``; once set, the function returns.
        poll_interval: seconds to wait on the semaphore before re-checking
            ``stop``. A plain blocking ``acquire()`` would never notice the
            stop event while the counter is zero, so the surrounding
            executor could hang on shutdown.
    """
    visitor_number = 0
    while not stop.is_set():
        print(f'welcome visitor #{visitor_number}')
        # acquire() decrements the counter; when it is zero, wait in short
        # slices so a stop request is honoured even if nobody releases.
        while not semaphore.acquire(timeout=poll_interval):
            if stop.is_set():
                return
        visitor_number += 1
        time.sleep(random.random())
    
def monitor(semaphore, stop):
    """Report the semaphore counter periodically and free a slot when it hits zero.

    Loops until ``stop`` is set: prints the counter, sleeps three seconds,
    and, if the counter is zero at that point, announces it and releases
    the semaphore once. The counter is read from the private ``_value``
    attribute.
    """
    while not stop.is_set():
        print(f'[monitor] semaphore={semaphore._value}')
        time.sleep(3)
        if semaphore._value != 0:
            continue
        print('[monitor] reached max users!')
        print('[monitor] kicking a user out...')
        semaphore.release()  # increments the counter
        time.sleep(0.05)

# Run welcome() and monitor() side by side for about seven seconds.
stop = threading.Event()
semaphore = threading.Semaphore(value=10)
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    executor.submit(welcome, semaphore, stop)
    executor.submit(monitor, semaphore, stop)
    try:
        time.sleep(7)
    finally:
        # Always signal the workers, even if the sleep is interrupted (e.g.
        # the kernel is interrupted); otherwise leaving the `with` block
        # would wait forever on loops that never see a stop request.
        stop.set()

# Counting is atomic. This means that there is a guarantee that the operating system will not swap out the thread in the middle of incrementing or decrementing the counter.
# If a thread calls .acquire() when the counter is zero, that thread will block until a different thread calls .release() and increments the counter to one.

welcome visitor #0
[monitor] semaphore=9
welcome visitor #1
welcome visitor #2
[monitor] semaphore=1
welcome visitor #3
welcome visitor #4
welcome visitor #5
[monitor] semaphore=4
welcome visitor #6
welcome visitor #7
welcome visitor #8
[monitor] semaphore=1
welcome visitor #9
welcome visitor #10
[monitor] reached max users!
[monitor] kicking a user out...
[monitor] semaphore=0
welcome visitor #11
[monitor] semaphore=1
[monitor] reached max users!
[monitor] kicking a user out...


# MultiProcessing

In [85]:
import multiprocessing

# Same download benchmark, fanned out over a pool of worker processes
# (Pool() is created with its default arguments).
start = time.perf_counter()
with multiprocessing.Pool() as pool:
    # NOTE(review): `download_site` and `sites` come from earlier cells. A
    # later cell redefines `download_site` as a coroutine taking
    # (session, url), so this cell depends on execution order.
    pool.map(download_site, sites)
duration = time.perf_counter() - start
print(f"Downloaded {len(sites)} sites in {duration} seconds")

# multiprocessing.Pool(initializer=set_global_session)
# If initializer is not None then each worker process will call initializer(*initargs) when it starts.

In [33]:
# Synchronous
import time

def calculate(limit):
    """Return the sum of ``i * i`` for every integer ``i`` in ``range(limit)``."""
    squares = (i * i for i in range(limit))
    return sum(squares)

# Twenty consecutive workloads starting at five million, run one by one.
numbers = list(range(5_000_000, 5_000_020))
start = time.perf_counter()
for number in numbers:
    calculate(number)
duration = time.perf_counter() - start
print(f"Duration {duration} seconds")

Duration 4.790480667004886 seconds


In [39]:
# Concurrent
# Spread calculate() over a pool of worker processes and time the whole map.
print(multiprocessing.cpu_count())  # CPU count reported by multiprocessing
start = time.perf_counter()
with multiprocessing.Pool() as pool:
    # `calculate` and `numbers` are defined in the previous cell.
    pool.map(calculate, numbers)
duration = time.perf_counter() - start
print(f"Duration {duration} seconds")

# Asyncio

## Basics

In [3]:
import asyncio
import itertools as it
import os
import random
import time

async def makeitem(size: int = 5) -> str:
    """Return ``size`` bytes from ``os.urandom`` rendered as a hex string."""
    raw = os.urandom(size)
    return raw.hex()

async def randsleep(caller=None) -> None:
    """Sleep (without blocking the event loop) for a random 0-10 whole seconds.

    When ``caller`` is truthy, the chosen delay is announced first using
    ``caller`` as the label.
    """
    delay = random.randint(0, 10)
    if caller:
        print(f"{caller} sleeping for {delay} seconds.")
    await asyncio.sleep(delay)

async def produce(name: int, q: asyncio.Queue) -> None:
    """Put a random number (0-10) of ``(item, timestamp)`` pairs on ``q``.

    Before each put the producer takes a random sleep, builds an item with
    ``makeitem()`` and stamps it with ``time.perf_counter()``. Items from one
    producer are created strictly one after another.
    """
    count = random.randint(0, 10)
    for _ in range(count):
        await randsleep(caller=f"Producer {name}")
        item = await makeitem()
        created_at = time.perf_counter()
        await q.put((item, created_at))
        print(f"Producer {name} added <{item}> to queue.")

async def consume(name: int, q: asyncio.Queue) -> None:
    """Endlessly take ``(item, timestamp)`` pairs from ``q`` and report them.

    Each round starts with a random sleep, then waits for an item, prints
    how long ago it was stamped and marks it done on the queue. The loop has
    no exit of its own.
    """
    while True:
        await randsleep(caller=f"Consumer {name}")
        item, created_at = await q.get()
        age = time.perf_counter() - created_at
        print(f"Consumer {name} got element <{item}> in {age:0.5f} seconds.")
        q.task_done()

async def main(nprod: int, ncon: int):
    """Run ``nprod`` producers and ``ncon`` consumers over one shared queue.

    Waits for every producer to finish, then for every queued item to be
    marked done, and finally cancels the consumers (which loop forever).
    """
    q = asyncio.Queue()
    producers = [asyncio.create_task(produce(n, q)) for n in range(nprod)]
    consumers = [asyncio.create_task(consume(n, q)) for n in range(ncon)]
    await asyncio.gather(*producers)
    # join() returns once task_done() has been called for every item that
    # was put on the queue; it does not wait for the consumer tasks to end.
    await q.join()
    for c in consumers:
        c.cancel()
    # cancel() only requests cancellation. Await the tasks so they have
    # actually finished before main() returns; return_exceptions=True keeps
    # their CancelledError from propagating.
    await asyncio.gather(*consumers, return_exceptions=True)

import argparse
# Fixed seed so the random sleep lengths and per-producer item counts repeat
# between runs.
random.seed(444)
start = time.perf_counter()
# asyncio.run(main(5,10))
# Top-level await is used in place of the asyncio.run() call commented out
# above.
await main(5,10)
elapsed = time.perf_counter() - start
print(f"Program completed in {elapsed:0.5f} seconds.")

Producer 0 sleeping for 4 seconds.
Producer 2 sleeping for 7 seconds.
Producer 3 sleeping for 4 seconds.
Producer 4 sleeping for 10 seconds.
Consumer 0 sleeping for 7 seconds.
Consumer 1 sleeping for 8 seconds.
Consumer 2 sleeping for 4 seconds.
Consumer 3 sleeping for 7 seconds.
Consumer 4 sleeping for 1 seconds.
Consumer 5 sleeping for 6 seconds.
Consumer 6 sleeping for 9 seconds.
Consumer 7 sleeping for 3 seconds.
Consumer 8 sleeping for 9 seconds.
Consumer 9 sleeping for 7 seconds.
Producer 0 added <2f1167ddfd> to queue.
Producer 0 sleeping for 10 seconds.
Producer 3 added <37bae73876> to queue.
Producer 3 sleeping for 0 seconds.
Consumer 2 got element <2f1167ddfd> in 0.00060 seconds.
Consumer 2 sleeping for 1 seconds.
Consumer 4 got element <37bae73876> in 0.00055 seconds.
Consumer 4 sleeping for 0 seconds.
Producer 3 added <2656374d36> to queue.
Producer 3 sleeping for 1 seconds.
Consumer 4 got element <2656374d36> in 0.00007 seconds.
Consumer 4 sleeping for 9 seconds.
Producer 3

## aiohttp

In [5]:
# Two endpoints, alternating, 160 URLs in total.
sites = 80 * [
    "https://www.jython.org",
    "http://olympus.realpython.org/dice",
]

In [63]:
import asyncio
import time
import aiohttp

async def download_site(session, url):
    """Issue a GET for ``url`` on ``session`` and print a one-letter progress mark.

    Note: this coroutine replaces the synchronous one-argument
    ``download_site`` defined earlier in the notebook. The response is
    entered as an async context manager and its body is not read. "J" is
    printed when the URL contains "jython", otherwise "R" (no newline,
    flushed).
    """
    async with session.get(url):
        if "jython" in url:
            indicator = "J"
        else:
            indicator = "R"
        print(indicator, sep='', end='', flush=True)

async def download_all_sites(sites):
    """Download every URL in ``sites`` concurrently over one ``ClientSession``.

    Returns:
        The list produced by ``asyncio.gather`` in the order of ``sites``:
        each entry is either the return value of ``download_site`` or the
        exception that request raised. Returning it lets the caller see
        failures, which would otherwise be swallowed by
        ``return_exceptions=True``.
    """
    async with aiohttp.ClientSession() as session:
        tasks = [download_site(session, url) for url in sites]
        # return_exceptions=True: one failing request does not abort the
        # batch; its exception is placed in the result list instead.
        return await asyncio.gather(*tasks, return_exceptions=True)

print("Starting downloads")
start = time.perf_counter()
# asyncio.run(download_all_sites(sites))
# Top-level await is used in place of the asyncio.run() call commented out
# above.
await download_all_sites(sites)
duration = time.perf_counter() - start
# Leading "\n" ends the line of J/R progress marks printed by download_site.
print(f"\nDownloaded {len(sites)} sites in {duration} seconds")

Starting downloads
JJJJJJJRJJRJRRJRRRRJRRJJRJRJJJRRRJJJJJJJJJJJRJJJRJRJRRJRRRJRJRRJRRJJJJRRRRRJRRRRRRRJRRRRRJJRRJRRJRJJJJJJRJJJJJJJJJJJRRRJRRRRRRJRJRRRRJJJJJJJJRRRJRRRRRJRRJJRRRRR
Downloaded 160 sites in 0.1582791249966249 seconds


## aiofiles

In [None]:
import asyncio
import logging
import re
import sys
from typing import IO
import urllib.error
import urllib.parse

import aiofiles
import aiohttp
from aiohttp import ClientSession

# Root logging configuration: DEBUG and above, written to stderr with a
# time-of-day prefix.
logging.basicConfig(
    format="%(asctime)s %(levelname)s:%(name)s: %(message)s",
    level=logging.DEBUG,
    datefmt="%H:%M:%S",
    stream=sys.stderr,
)
logger = logging.getLogger("areq")  # logger used by the crawler functions below
# Turn off the "chardet.charsetprober" logger entirely.
logging.getLogger("chardet.charsetprober").disabled = True

# Captures the value of every double-quoted href="..." attribute (non-greedy).
HREF_RE = re.compile(r'href="(.*?)"')

async def fetch_html(url: str, session: ClientSession, **kwargs) -> str:
    """GET request wrapper to fetch page HTML.

    kwargs are passed to `session.request()`.

    Returns the decoded response body. Any exception raised by
    ``resp.raise_for_status()`` or while reading the body propagates to the
    caller.
    """
    # Using the request as an async context manager guarantees the response
    # (and its connection) is released even if raise_for_status() or text()
    # raises.
    async with session.request(method="GET", url=url, **kwargs) as resp:
        resp.raise_for_status()
        logger.info("Got response [%s] for URL: %s", resp.status, url)
        return await resp.text()

async def parse(url: str, session: ClientSession, **kwargs) -> set:
    """Find HREFs in the HTML of `url`.

    Returns a set of absolute links (each href joined onto ``url``). If the
    page cannot be fetched, the error is logged and an empty set is
    returned.
    """
    found = set()
    try:
        html = await fetch_html(url=url, session=session, **kwargs)
    except (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError) as e:
        status = getattr(e, "status", None)
        message = getattr(e, "message", None)
        logger.error("aiohttp exception for %s [%s]: %s", url, status, message)
        return found
    except Exception as e:
        details = getattr(e, "__dict__", {})
        logger.exception("Non-aiohttp exception occured:  %s", details)
        return found

    # Fetch succeeded: resolve every captured href against the page URL.
    for link in HREF_RE.findall(html):
        try:
            found.add(urllib.parse.urljoin(url, link))
        except (urllib.error.URLError, ValueError):
            # Log the bad link and move on to the next one.
            logger.exception("Error parsing URL: %s", link)
    logger.info("Found %d links for %s", len(found), url)
    return found

async def write_one(file: IO, url: str, **kwargs) -> None:
    """Write the found HREFs from `url` to `file`.

    ``file`` is handed to ``aiofiles.open`` in append mode. One
    tab-separated ``url``/link line is written per link. Nothing is opened
    or written when ``parse`` yields no links. Extra kwargs go to ``parse``.
    """
    links = await parse(url=url, **kwargs)
    if links:
        async with aiofiles.open(file, "a") as out:
            for link in links:
                await out.write(f"{url}\t{link}\n")
            logger.info("Wrote results for source URL: %s", url)
    return None

async def bulk_crawl_and_write(file: IO, urls: set, **kwargs) -> None:
    """Crawl & write concurrently to `file` for multiple `urls`.

    One ``ClientSession`` is shared by all ``write_one`` coroutines, which
    are awaited together; extra kwargs are forwarded to each of them.
    """
    async with ClientSession() as session:
        jobs = [
            write_one(file=file, url=url, session=session, **kwargs)
            for url in urls
        ]
        await asyncio.gather(*jobs)

if __name__ == "__main__":
    import pathlib
    import sys

    assert sys.version_info >= (3, 7), "Script requires Python 3.7+."
    # `__file__` is only defined when this code runs from a .py file; inside
    # a notebook kernel it is missing, so fall back to the working directory.
    try:
        here = pathlib.Path(__file__).parent
    except NameError:
        here = pathlib.Path.cwd()

    # One URL per line; surrounding whitespace is stripped and duplicates
    # collapse in the set.
    with open(here.joinpath("urls.txt")) as infile:
        urls = set(map(str.strip, infile))

    # Start the output file afresh with a header row; the crawler appends.
    outpath = here.joinpath("foundurls.txt")
    with open(outpath, "w") as outfile:
        outfile.write("source_url\tparsed_url\n")

    crawl = bulk_crawl_and_write(file=outpath, urls=urls)
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop in this thread (plain script): run it directly.
        asyncio.run(crawl)
    else:
        # An event loop is already running in this thread (as in a notebook
        # cell), where asyncio.run() would raise RuntimeError. Run the crawl
        # on a worker thread with its own loop and wait for it to finish.
        import concurrent.futures

        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            pool.submit(asyncio.run, crawl).result()