In [None]:
import asyncio

import bs4
import httpx

# Finder page that lists the funds; requested below with a ``page`` query parameter.
fund_base_url = "https://www.gov.uk/find-funding-for-land-or-farms"


def extract_fund_links(html_content):
    """Collect the hrefs of the ``govuk-link`` anchors inside the results container.

    Args:
        html_content: Parsed document exposing ``find`` (a BeautifulSoup
            object in this notebook).

    Returns:
        list: The ``href`` values of the matching anchors, in the order
        ``find_all`` yields them. Empty when the ``div#js-results`` container
        is absent. Anchors with a missing or empty ``href`` are skipped.
    """
    results = html_content.find("div", id="js-results")

    # ``find`` gives None when nothing matches; report "no links" instead of
    # failing with an AttributeError on the next call.
    if results is None:
        return []

    actions = results.find_all("a", {"class": "govuk-link"})

    # ``get`` gives None for anchors lacking an href; drop those so callers
    # only ever receive usable link strings.
    hrefs = (action.get("href") for action in actions)

    return [href for href in hrefs if href]


def extract_fund_code(link):
    """Derive a fund code from a link.

    The code is the text of the last ``/``-separated segment up to its first
    hyphen, upper-cased.
    """
    # Everything after the final slash (the whole string if there is none).
    slug = link.rpartition("/")[2]

    # Keep only what precedes the first hyphen.
    prefix, _, _ = slug.partition("-")

    return prefix.upper()


async def get_funds(page_num):
    """Fetch one page of the funding finder and list the funds linked from it.

    Args:
        page_num: Page number sent as the ``page`` query parameter.

    Returns:
        list[dict]: One ``{"code": ..., "link": ...}`` dict per link returned
        by ``extract_fund_links`` for that page.

    Raises:
        httpx.HTTPStatusError: If the server answers with a non-2xx status.
    """
    async with httpx.AsyncClient() as client:
        # Let httpx build the query string rather than formatting it by hand.
        response = await client.get(fund_base_url, params={"page": page_num})

        # Fail loudly on an error response instead of parsing an error page
        # as if it were the results listing.
        response.raise_for_status()

        soup = bs4.BeautifulSoup(response.text, "lxml")

        return [
            {"code": extract_fund_code(link), "link": link}
            for link in extract_fund_links(soup)
        ]


tasks = await asyncio.gather(*[get_funds(page_num) for page_num in range(1, 6)])

funds = []

for task in tasks:
    for fund in task:
        funds.append(fund)

In [None]:
from markdownify import markdownify as md

# Prefix for content API requests; a fund's link is appended to it and the response is read as JSON.
content_api_base_url = "https://www.gov.uk/api/content"


async def add_fund_content(fund):
    """Fetch a fund's page body from the content API and convert it to markdown.

    Despite the name, ``fund`` is not modified here: the markdown is returned
    and it is up to the caller to store it.

    Args:
        fund: Mapping whose ``"link"`` entry is appended to
            ``content_api_base_url`` to form the request URL.

    Returns:
        The markdownify conversion of the page body, using ATX-style headings.

    Raises:
        httpx.HTTPStatusError: If the server answers with a non-2xx status.
        KeyError: If the JSON payload has no ``details`` / ``body`` entry.
    """
    async with httpx.AsyncClient() as client:
        response = await client.get(f"{content_api_base_url}{fund['link']}")

        # Surface HTTP failures directly instead of as a confusing JSON or
        # KeyError problem further down.
        response.raise_for_status()

        html = response.json()["details"]["body"]

        # Round-trip through the lxml parser before conversion
        # (presumably to normalise the HTML fragment - confirm if changing).
        soup = bs4.BeautifulSoup(html, "lxml")

        return md(str(soup), heading_style="ATX")


batch_size = 10

batches = [funds[i : i + batch_size] for i in range(0, len(funds), batch_size)]

for i in range(len(batches)):
    batch = batches[i]

    contents = await asyncio.gather(*[add_fund_content(fund) for fund in batch])

    for fund, content in zip(batch, contents, strict=True):
        fund["content"] = content

    print(f"Completed batch {i + 1} out of {len(batches)}")

    await asyncio.sleep(2)

In [None]:
from pathlib import Path

import aiofiles

# Directory (relative to the working directory) that receives the markdown files.
output_dir = Path("outputs")
# exist_ok=True keeps this cell re-runnable when the directory already exists.
output_dir.mkdir(exist_ok=True)


async def save_fund(fund):
    """Write a fund's ``"content"`` to ``<code>.md`` inside ``output_dir``.

    The file is opened in write mode with UTF-8 encoding, so an existing file
    of the same name is overwritten.
    """
    target = output_dir / f"{fund['code']}.md"

    async with aiofiles.open(target, "w", encoding="utf-8") as handle:
        await handle.write(fund["content"])


await asyncio.gather(*[save_fund(fund) for fund in funds])

print(f"\nAll {len(funds)} markdown files saved to {output_dir.absolute()}")