### set up

In [1]:
import asyncio
import re
import shutil
import zipfile
from pathlib import Path

import aiohttp
from bs4 import BeautifulSoup

### functions

In [2]:
async def fetch(session, url):
    """Download ``url`` through ``session`` and return the raw response body.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response (an ``aiohttp.ClientSession`` in this notebook).
        url: Address to request.

    Returns:
        Whatever ``await response.read()`` yields (the undecoded body).

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error status.
    """
    async with session.get(url) as resp:
        # Fail loudly on HTTP errors instead of returning an error page.
        resp.raise_for_status()
        payload = await resp.read()
    return payload

In [None]:
# R scraping functions - locate cells and links inside a parsed CRAN package page


def find_2nd_td_following_certain_td(soup, td_text) -> "bs4.element.Tag | None":
    """Return the ``<td>`` that comes right after the first ``<td>`` matching ``td_text``.

    The page is searched for the first ``<td>`` whose string matches
    ``td_text``; the element returned is the next ``<td>`` in document order
    after that match (i.e. the second cell of the pair, hence the name).

    Args:
        soup: Parsed document (or any object offering a bs4-style ``find``).
        td_text: Text to look for. It is compiled with ``re.compile``, so it is
            treated as a regular expression and matches as a substring.

    Returns:
        The following ``<td>`` element, or ``None`` when either the labelled
        cell or a following cell cannot be found.
    """
    # First <td> whose string matches the requested label.
    label_td = soup.find("td", string=re.compile(td_text))
    if label_td is None:
        return None
    # find_next() itself yields None when no further <td> exists.
    return label_td.find_next("td")


def find_r_release(soup) -> "str | None":
    """Return the ``href`` of the link that directly follows the ``r-release:`` label.

    The cell after the first ``<td>`` matching ``"binaries"`` is scanned child
    by child. Once a text node containing ``"r-release:"`` is seen, the very
    next child is expected to be the link to the release binary.

    Args:
        soup: Parsed package page.

    Returns:
        The (relative) ``href`` of the r-release binary, or ``None`` when the
        page has no binaries cell, no ``r-release:`` label, or the node after
        the label is not an element carrying an ``href``.
    """
    td = find_2nd_td_following_certain_td(soup, "binaries")
    if td is None:
        # No binaries row on this page: nothing to download for this package.
        return None

    found = False
    for item in td:
        if found:
            # Only the node immediately after the label counts; looking any
            # further could pick up the link of a different R version.
            if item.name is None:
                # A bare text node cannot be indexed by attribute name.
                return None
            return item.get("href")
        # Text nodes have no tag name; the label lives in one of them.
        if item.name is None and "r-release:" in item.strip():
            found = True
    return None

def find_all_imports_refs(soup) -> list:
    """Collect the ``href`` of every link listed in the ``Imports:`` cell.

    Args:
        soup: Parsed package page.

    Returns:
        List of ``href`` values in document order; empty when the page has no
        ``Imports:`` row or the row contains no links.
    """
    imports_td = find_2nd_td_following_certain_td(soup, "Imports:")
    if not imports_td:
        return []
    return [anchor["href"] for anchor in imports_td.find_all("a")]

In [None]:
# R scraping functions - turn the scraped relative links into absolute CRAN URLs

# Base address that the relative links scraped from package pages are re-anchored to.
ROOT_URL = "https://cran.r-project.org"


def bin_link_scrap(page) -> str:
    """Extract the absolute URL of the r-release binary from a package page.

    Args:
        page: HTML of the package page (as returned by ``fetch``).

    Returns:
        ``ROOT_URL`` joined with the link's path components after the first
        three, or ``None`` when the page yields no r-release link.
    """
    relative_link = find_r_release(BeautifulSoup(page, "html.parser"))
    if not relative_link:
        return None
    # The first three components are the leading '..' segments; what remains
    # is the path below ROOT_URL.
    tail = Path(relative_link).parts[3:]
    return "/".join([ROOT_URL, *tail])


def imports_link_scrap(page) -> list:
    """Build the absolute page URLs of the packages listed under ``Imports:``.

    Args:
        page: HTML of the package page (as returned by ``fetch``).

    Returns:
        One URL per imported-package link, in page order; each is
        ``ROOT_URL/web/packages/`` followed by the link's path components
        after its leading '..'.
    """
    soup = BeautifulSoup(page, "html.parser")
    return [
        # parts[1:] drops the first '..' of the relative link.
        "/".join([ROOT_URL, "web", "packages", *Path(href).parts[1:]])
        for href in find_all_imports_refs(soup)
    ]

In [None]:
# R scraping functions - iterative process


async def get_r_bins(session, url, results=None, processed_urls=None) -> tuple:
    """Recursively collect binary download links for a package and its imports.

    The page at ``url`` is fetched, its r-release binary link is recorded, and
    every package page linked from its ``Imports:`` row is visited the same way
    (depth first, one request at a time). Pages already visited are skipped.

    Args:
        session: HTTP session handed to ``fetch``.
        url: Package page to start from.
        results: Set of binary links gathered so far (created when ``None``).
        processed_urls: Set of page URLs already visited (created when ``None``).

    Returns:
        ``(results, processed_urls)`` - the same two sets, updated in place.
    """
    if results is None:
        results = set()
    if processed_urls is None:
        processed_urls = set()

    print(f"processing {url}")
    page = await fetch(session, url)
    processed_urls.add(url)

    bin_link = bin_link_scrap(page)
    if bin_link:
        results.add(bin_link)
    else:
        # bin_link_scrap returns None when the page offers no r-release
        # binary; storing that None would break the download step later.
        print(f"no r-release binary found on {url}")

    for dep_url in imports_link_scrap(page):
        if dep_url not in processed_urls:
            results, processed_urls = await get_r_bins(
                session, dep_url, results, processed_urls
            )

    return results, processed_urls

In [None]:
# download functions


def write_data_to_file(data, file_path):
    """Write ``data`` to ``file_path`` in binary mode unless the file already exists.

    Args:
        data: Bytes to store.
        file_path: Destination; must offer ``exists()`` (e.g. a ``pathlib.Path``).
    """
    if file_path.exists():
        # Never overwrite a file that is already there.
        return
    with open(file_path, "wb") as sink:
        sink.write(data)


async def down_load_r_bins(session, urls, zip_path):
    """Download every URL in ``urls`` into ``zip_path``, all of them concurrently.

    A file whose name (last component of the URL) already exists in
    ``zip_path`` is reported and not downloaded again.

    Args:
        session: HTTP session handed to ``fetch``.
        urls: Iterable of download URLs.
        zip_path: Destination folder (must support ``/`` with a file name).
    """

    async def download_if_missing(url):
        name = Path(url).name
        if Path(zip_path / name).exists():
            print(f"{name} already exists, skipping download.")
            return

        print(f"downloading {url}")
        data = await fetch(session, url)

        # The blocking file write runs in a worker thread.
        await asyncio.to_thread(write_data_to_file, data, zip_path / name)

    await asyncio.gather(*(download_if_missing(url) for url in urls))

In [None]:
# unzip functions


def unzip_file(zip_path, extract_to):
    """Extract every member of the archive at ``zip_path`` into ``extract_to``.

    Args:
        zip_path: Path of the zip archive to read.
        extract_to: Folder the archive content is extracted into.
    """
    with zipfile.ZipFile(zip_path, mode="r") as archive:
        archive.extractall(path=extract_to)


async def process_zip_files(zip_files: list, extract_to: "str | Path"):
    """Unzip every archive in ``zip_files`` into ``extract_to``, concurrently.

    An archive is skipped when a folder named after its package already exists
    in ``extract_to``. The package name is the file stem with the trailing
    ``_<version>`` removed, where the version is a run of numbers separated by
    ``.`` or ``-`` (``mgcv_1.9-3`` -> ``mgcv``, ``data.table_1.17.2`` ->
    ``data.table``).
    NOTE(review): this presumes each archive extracts into a top-level folder
    named after the package - confirm against the archives in use.

    Args:
        zip_files: Paths of the zip archives to extract.
        extract_to: Destination folder, as a ``str`` or ``Path``.
    """
    # Accept both str and Path; a plain str does not support the "/" operator.
    target_dir = Path(extract_to)

    async def process_one_file(zip_path, extract_to):
        stem = Path(zip_path).stem
        # Version components may be joined by "-" as well as "." (e.g. 5.2-3).
        base_name = re.sub(r"_\d+(?:[.-]\d+)*$", "", stem)

        if (target_dir / base_name).exists():
            print(f"{base_name} already exists, skipping unzipping.")
            return

        print(f"unzipping {zip_path}")
        # Extraction is blocking work, so it runs in a worker thread.
        await asyncio.to_thread(unzip_file, zip_path, extract_to)

    tasks = [process_one_file(zip_file, extract_to) for zip_file in zip_files]
    await asyncio.gather(*tasks)

### process

In [7]:
# Set up the working folders: each one is emptied if it already exists
# (its whole content is deleted) and then (re)created.
# shutil comes from the import cell at the top of the notebook.
folders = [Path(".") / "zip", Path(".") / "unzip"]

for folder in folders:
    if folder.exists():
        shutil.rmtree(folder)
    folder.mkdir()

In [11]:
# Package page the crawl starts from.
url = "https://cran.r-project.org/web/packages/Hmisc/index.html"
# Folder the downloaded archives are written to.
zip_path = Path(".") / "zip"
# Folder the archives are extracted into.
unzip_path = Path(".") / "unzip"

In [16]:
# Crawl the start page and, recursively, the pages of the packages it imports.
# `results` holds the binary links found, `processed_urls` the pages visited.
async with aiohttp.ClientSession() as session:
    results, processed_urls = await get_r_bins(session, url)

# Number of distinct binary links collected.
print(f"{len(results) = }")

processing https://cran.r-project.org/web/packages/Hmisc/index.html
processing https://cran.r-project.org/web/packages/ggplot2/index.html
processing https://cran.r-project.org/web/packages/cli/index.html
processing https://cran.r-project.org/web/packages/glue/index.html
processing https://cran.r-project.org/web/packages/gtable/index.html
processing https://cran.r-project.org/web/packages/lifecycle/index.html
processing https://cran.r-project.org/web/packages/rlang/index.html
processing https://cran.r-project.org/web/packages/isoband/index.html
processing https://cran.r-project.org/web/packages/MASS/index.html
processing https://cran.r-project.org/web/packages/mgcv/index.html
processing https://cran.r-project.org/web/packages/Matrix/index.html
processing https://cran.r-project.org/web/packages/lattice/index.html
processing https://cran.r-project.org/web/packages/scales/index.html
processing https://cran.r-project.org/web/packages/farver/index.html
processing https://cran.r-project.org/w

In [19]:
# throttle the download
number_of_simultaneous_downloads = 4
list_of_urls = list(results)

async with aiohttp.ClientSession() as session:
    for i in range(0, len(results), number_of_simultaneous_downloads):
        urls_to_download = list_of_urls[i : i + number_of_simultaneous_downloads]
        await down_load_r_bins(session, urls_to_download, zip_path)
        await asyncio.sleep(1)  # Sleep for 1 second between batches

fansi_1.0.6.zip already exists, skipping download.
bslib_0.9.0.zip already exists, skipping download.
ggplot2_3.5.2.zip already exists, skipping download.
evaluate_1.0.3.zip already exists, skipping download.
rappdirs_0.3.3.zip already exists, skipping download.
mgcv_1.9-3.zip already exists, skipping download.
nnet_7.3-20.zip already exists, skipping download.
utf8_1.2.5.zip already exists, skipping download.
yaml_2.3.10.zip already exists, skipping download.
cachem_1.1.0.zip already exists, skipping download.
jquerylib_0.1.4.zip already exists, skipping download.
stringi_1.8.7.zip already exists, skipping download.
xfun_0.52.zip already exists, skipping download.
glue_1.8.0.zip already exists, skipping download.
labeling_0.4.3.zip already exists, skipping download.
htmlwidgets_1.6.4.zip already exists, skipping download.
jsonlite_2.0.0.zip already exists, skipping download.
cli_3.6.5.zip already exists, skipping download.
RColorBrewer_1.1-3.zip already exists, skipping download.
back

In [20]:
# Extract every downloaded archive found in zip_path into unzip_path.
zip_files = list(zip_path.glob("*.zip"))
await process_zip_files(zip_files, unzip_path)

backports already exists, skipping unzipping.
unzipping zip\base64enc_0.1-3.zip
bslib already exists, skipping unzipping.
cachem already exists, skipping unzipping.
checkmate already exists, skipping unzipping.
cli already exists, skipping unzipping.
cluster already exists, skipping unzipping.
unzipping zip\colorspace_2.1-1.zip
unzipping zip\data.table_1.17.2.zip
digest already exists, skipping unzipping.
evaluate already exists, skipping unzipping.
fansi already exists, skipping unzipping.
unzipping zip\farver_2.1.2.zip
unzipping zip\fastmap_1.2.0.zip
fontawesome already exists, skipping unzipping.
unzipping zip\foreign_0.8-90.zip
unzipping zip\Formula_1.2-5.zip
fs already exists, skipping unzipping.
ggplot2 already exists, skipping unzipping.
glue already exists, skipping unzipping.
gridExtra already exists, skipping unzipping.
gtable already exists, skipping unzipping.
highr already exists, skipping unzipping.
unzipping zip\Hmisc_5.2-3.zip
unzipping zip\htmlTable_2.4.3.zip
htmltools

In [21]:
# Clean up the working folders: each one is emptied if it already exists and
# then (re)created.
# NOTE: running this cell deletes everything currently stored in ./zip and ./unzip.
# shutil comes from the import cell at the top of the notebook.
folders = [Path(".") / "zip", Path(".") / "unzip"]

for folder in folders:
    if folder.exists():
        shutil.rmtree(folder)
    folder.mkdir()