### Summary  
This notebook downloads binary files either directly from a URL or indirectly, by first parsing an HTML page to find the real resource URL, and saves them in a `data` folder.

Main Steps
- Set up: create the `data` folder, or clear it if it already exists.
- Define functions
    - `download_binary_file` for direct downloads, using `requests`.
    - `process` for indirect downloads: it parses the HTML page and extracts the resource URL, using `BeautifulSoup` and `re`.
- Batch process multiple URLs for both methods.
- Clean up: delete the downloaded files from the `data` folder.

### set up 

In [1]:
import re
from pathlib import Path
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup

In [2]:
# Scratch folder that receives every download; each run starts with it free of files.
folder = Path(".") / "data"
if folder.exists():
    # Left over from an earlier run: delete its files, leave any sub-folders alone.
    for leftover in [entry for entry in folder.iterdir() if entry.is_file()]:
        leftover.unlink()
else:
    folder.mkdir()

### define functions

In [3]:
def download_binary_file(url, local_location, local_name, timeout=30):
    """Download a binary file from a URL and save it to a local folder under a given name.

    Args:
        url: Address fetched with an HTTP GET request.
        local_location: Folder to save into (``str`` or ``Path``). It is created,
            including missing parents, when it does not exist yet.
        local_name: File name to use inside ``local_location``.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds), so a
            stalled server cannot block the notebook forever.

    Raises:
        requests.HTTPError: If the response carries an error status
            (raised by ``raise_for_status``).
        requests.RequestException: For other request failures, including timeouts.
    """
    # make the request
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # .get() instead of [...]: a response without a Content-Type header must not
    # abort the download with a KeyError just because of this diagnostic print.
    content_type = response.headers.get("content-type")
    print(f"response.headers['content-type'] = {content_type!r}, {response.encoding = }")

    # save the file
    target_dir = Path(local_location)  # also accepts a plain string
    target_dir.mkdir(parents=True, exist_ok=True)
    file_path = target_dir / local_name
    with open(file_path, "wb") as f:
        f.write(response.content)
    print(f"Saved to {file_path}")

In [4]:
def _find_resource_url(html, pattern):
    """Return the first substring of ``html`` that matches ``pattern``, or ``None``."""
    match = re.search(pattern, html)
    return match.group(0) if match else None


def process(url, location=None, timeout=30):
    """Download a file whose public URL returns an HTML page instead of the file.

    The page at ``url`` is fetched and searched for a
    ``file-examples.com/storage/<token>/`` prefix. The real resource URL is then
    built from that prefix, the fixed ``2017`` segment, and the last two path
    segments of ``url``, and handed to ``download_binary_file``.

    Args:
        url: Page URL whose path ends in ``.../<folder>/<file name>``.
        location: Folder to save into; defaults to ``Path(".") / "data"``.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds) for
            the HTML page request.

    Raises:
        ValueError: If the URL path has fewer than two segments, or if the page
            contains no storage prefix.
        requests.HTTPError: If the page request returns an error status.
    """
    # Split the URL path itself rather than going through pathlib.Path, which is
    # OS-dependent and would keep a query string or fragment in the file name.
    segments = [part for part in urlsplit(url).path.split("/") if part]
    if len(segments) < 2:
        raise ValueError(
            f"expected a URL path ending in '<folder>/<file name>', got {url!r}"
        )
    second_last_part, name = segments[-2:]

    ## request and parse the HTML page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # .get() so a missing Content-Type header does not raise KeyError here
    content_type = response.headers.get("content-type")
    print(f"response.headers['content-type'] = {content_type!r}, {response.encoding = }")

    soup = BeautifulSoup(response.text, "html.parser")

    ## find the resource URL
    pattern = r"file-examples\.com/storage/\w+/"
    matched_string = _find_resource_url(str(soup), pattern)
    print(f"{matched_string = }")
    if matched_string is None:
        # Without this guard the f-string below would yield "https://None2017/..."
        raise ValueError(f"no storage URL matching {pattern!r} found in the page at {url!r}")

    # NOTE(review): the "2017" segment is hard-coded; it matches the URLs used in
    # this notebook, but confirm before using URLs from another year folder.
    resource_url = f"https://{matched_string}2017/{second_last_part}/{name}"
    print(f"{resource_url = }")

    ## download the file
    if location is None:
        location = Path(".") / "data"

    print(
        f"calling download_binary_file with \n    {resource_url = }\n    {location = }\n    {name = }"
    )
    download_binary_file(resource_url, location, name)

### process

In [5]:
# Run `process` on a single URL first; the batch of further URLs follows in the next cell.
url = "https://file-examples.com/wp-content/storage/2017/02/zip_2MB.zip"
process(url)

response.headers['content-type'] = 'text/html; charset=UTF-8', response.encoding = 'UTF-8'
matched_string = 'file-examples.com/storage/fe0707c5116828d4b9ad356/'
resource_url = 'https://file-examples.com/storage/fe0707c5116828d4b9ad356/2017/02/zip_2MB.zip'
calling download_binary_file with 
    resource_url = 'https://file-examples.com/storage/fe0707c5116828d4b9ad356/2017/02/zip_2MB.zip'
    location = WindowsPath('data')
    name = 'zip_2MB.zip'
response.headers['content-type'] = 'application/zip', response.encoding = None
Saved to data\zip_2MB.zip


In [6]:
# URLs covering several file types; every one is resolved and downloaded by `process`.
urls = [
    "https://file-examples.com/wp-content/storage/2017/10/file-sample_150kB.pdf",
    "https://file-examples.com/wp-content/storage/2017/02/file_example_JSON_1kb.json",
    "https://file-examples.com/wp-content/storage/2017/02/file_example_XML_24kb.xml",
    "https://file-examples.com/wp-content/storage/2017/02/index.html",
    "https://file-examples.com/wp-content/storage/2017/02/file_example_XLSX_5000.xlsx",
    "https://file-examples.com/wp-content/storage/2017/02/file_example_CSV_5000.csv",
]

for url in urls:
    process(url)
    # visual gap between the log output of consecutive downloads
    print("\n\n")

response.headers['content-type'] = 'text/html; charset=UTF-8', response.encoding = 'UTF-8'
matched_string = 'file-examples.com/storage/fe0707c5116828d4b9ad356/'
resource_url = 'https://file-examples.com/storage/fe0707c5116828d4b9ad356/2017/10/file-sample_150kB.pdf'
calling download_binary_file with 
    resource_url = 'https://file-examples.com/storage/fe0707c5116828d4b9ad356/2017/10/file-sample_150kB.pdf'
    location = WindowsPath('data')
    name = 'file-sample_150kB.pdf'
response.headers['content-type'] = 'application/pdf', response.encoding = None
Saved to data\file-sample_150kB.pdf



response.headers['content-type'] = 'text/html; charset=UTF-8', response.encoding = 'UTF-8'
matched_string = 'file-examples.com/storage/fe0707c5116828d4b9ad356/'
resource_url = 'https://file-examples.com/storage/fe0707c5116828d4b9ad356/2017/02/file_example_JSON_1kb.json'
calling download_binary_file with 
    resource_url = 'https://file-examples.com/storage/fe0707c5116828d4b9ad356/2017/02/file_examp

### clean up

In [7]:
# Clean up: delete the downloaded files from the data folder.
# The folder itself (and any sub-folders) is kept; unlike the set-up cell,
# nothing is created here when the folder is missing.
folder = Path(".") / "data"
if folder.is_dir():
    # Snapshot the listing first so files are not removed while iterating it.
    for file in list(folder.iterdir()):
        if file.is_file():
            file.unlink()