### set up 

In [8]:
import asyncio
import re
from pathlib import Path, PurePosixPath
from urllib.parse import urlsplit

import aiohttp
import requests
from bs4 import BeautifulSoup

In [4]:
# Start from an empty temporary data folder: when it already exists, remove the
# regular files inside it (sub-directories are left alone); otherwise create it.
folder = Path(".") / "data"
if folder.exists():
    for file in folder.iterdir():
        if file.is_file():
            file.unlink()
else:
    folder.mkdir()

### define functions

In [18]:
async def fetch(url):
    """Request ``url`` through a one-off aiohttp session and return the body as bytes.

    ``raise_for_status()`` is called before the body is read. As a side effect
    the ``Content-Type`` header (empty string when absent) and the response
    charset are printed.
    """
    async with aiohttp.ClientSession() as session, session.get(url) as response:
        response.raise_for_status()
        content_type = response.headers.get('Content-Type', '')
        print(f"{content_type = }, {response.charset = }")
        body = await response.read()
    return body

In [7]:
async def download_binary_file(url, local_location, local_name):
    """Download a binary file from ``url`` and save it as ``local_location / local_name``.

    Args:
        url: Address of the file to download.
        local_location: Target directory, given as a ``Path`` or a plain string.
            It is created (including missing parents) when it does not exist.
        local_name: File name to use inside ``local_location``.

    Raises:
        aiohttp.ClientResponseError: Raised by ``raise_for_status()`` when the
            server answers with an error status.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            response.raise_for_status()
            content_type = response.headers.get('content-type')
            print(f"{content_type = }, {response.charset = }")

            # The whole body is read before anything is written to disk.
            data = await response.read()

    # Accept str as well as Path, and do not depend on another cell having
    # created the folder beforehand.
    target_dir = Path(local_location)
    target_dir.mkdir(parents=True, exist_ok=True)

    file_path = target_dir / local_name
    file_path.write_bytes(data)
    print(f"Saved to {file_path}")


In [24]:
async def process(url):
    """Fetch the page at ``url``, derive the storage URL of the file, and download it.

    The HTML returned for ``url`` is searched for a
    ``file-examples.com/storage/<token>/`` prefix. The download address is that
    prefix followed by ``<year>/<month>/<file name>``, where month and file name
    are the last two path segments of ``url``. The file is passed to
    ``download_binary_file`` with ``data`` as the target folder.

    Args:
        url: Page address whose path ends in ``.../<month>/<file name>``,
            normally preceded by a four-digit year segment.

    Raises:
        ValueError: If ``url`` has fewer than two path segments, or if the page
            contains no storage prefix.
    """
    # URL paths always use "/", so parse them as POSIX paths regardless of the
    # host OS, and drop query string / fragment so they cannot leak into the name.
    segments = [part for part in PurePosixPath(urlsplit(url).path).parts if part != "/"]
    if len(segments) < 2:
        raise ValueError(f"expected a URL ending in .../<folder>/<file name>, got {url!r}")
    name = segments[-1]
    second_last_part = segments[-2]

    # Take the year from the URL when a four-digit segment precedes the month;
    # otherwise keep the historical default of "2017".
    has_year = len(segments) >= 3 and re.fullmatch(r"\d{4}", segments[-3]) is not None
    year = segments[-3] if has_year else "2017"

    ## request and parse the HTML page
    response = await fetch(url)
    soup = BeautifulSoup(response, 'html.parser')

    ## find the resource URL
    pattern = r'file-examples\.com/storage/\w+/'
    match = re.search(pattern, str(soup))
    if match is None:
        # Without this guard the literal text "None" would be formatted into the URL.
        raise ValueError(f"no storage prefix matching {pattern!r} found in the page at {url!r}")
    matched_string = match.group(0)
    print(f"{matched_string = }")

    resource_url = f"https://{matched_string}{year}/{second_last_part}/{name}"
    print(f"{resource_url = }")

    ## download the file
    location = Path(".") / "data"

    print(f"calling download_binary_file with \n    {resource_url = }\n    {location = }\n    {name = }")
    await download_binary_file(resource_url, location, name)


### process

In [25]:
# Sample page to run the pipeline on. Top-level `await` is a notebook (IPython)
# feature; in a plain script this call would need asyncio.run(process(url)).
url = "https://file-examples.com/wp-content/storage/2017/02/zip_2MB.zip"
await process(url)

content_type = 'text/html; charset=UTF-8', response.charset = 'UTF-8'
matched_string = 'file-examples.com/storage/fe0707c5116828d4b9ad356/'
resource_url = 'https://file-examples.com/storage/fe0707c5116828d4b9ad356/2017/02/zip_2MB.zip'
calling download_binary_file with 
    resource_url = 'https://file-examples.com/storage/fe0707c5116828d4b9ad356/2017/02/zip_2MB.zip'
    location = WindowsPath('data')
    name = 'zip_2MB.zip'
content_type = 'application/zip', response.charset = None
Saved to data\zip_2MB.zip


### clean up

In [26]:
# Empty the temporary data folder again: delete the regular files it contains,
# or create the folder when it is missing so that it exists either way.
folder = Path(".") / "data"
if folder.exists():
    leftovers = [entry for entry in folder.iterdir() if entry.is_file()]
    for entry in leftovers:
        entry.unlink()
else:
    folder.mkdir()