In [2]:
import aiohttp
import gzip
from dataclasses import dataclass
import re
from typing import Dict, List, Set, Iterable, Optional

In [3]:
# Location of the gzip-compressed PACKAGES index under CRAN's src/contrib
# directory; load_cran_index() downloads and decompresses this URL.
CRAN_PACKAGES_URL = "https://cran.r-project.org/src/contrib/PACKAGES.gz"


@dataclass
class PackageMetadata:
    """
    Metadata for one package record of the CRAN PACKAGES index.

    As populated by _parse_packages_text(), the four dependency lists hold
    bare package names: _extract_dependencies() strips version constraints
    and drops the "R" entry before the names are stored here.
    """
    name: str  # value of the record's "Package" field
    version: str  # value of the record's "Version" field, kept as a string
    depends: List[str]  # package names taken from the "Depends" field
    imports: List[str]  # package names taken from the "Imports" field
    linking_to: List[str]  # package names taken from the "LinkingTo" field
    suggests: List[str]  # package names taken from the "Suggests" field


def _remove_version_constraints(token: str) -> str:
    """
    Remove version constraints in parentheses.
    e.g. 'dplyr (>= 1.0.0)' -> 'dplyr'
    """
    # Remove anything in parentheses, including space before
    return re.sub(r"\s*\(.*?\)", "", token).strip()


def _extract_dependencies(dep_string: Optional[str]) -> List[str]:
    """
    Turn a CRAN dependency string into a list of package names.

    Example:
        "R (>= 3.5.0), ggplot2, dplyr (>= 1.0.0)"
        -> ["ggplot2", "dplyr"]
    """
    if not dep_string:
        return []

    deps: List[str] = []
    for token in dep_string.split(","):
        token = token.strip()
        if not token:
            continue
        token = _remove_version_constraints(token)
        if not token or token == "R":
            continue
        deps.append(token)
    return deps


def _parse_packages_text(text: str) -> Dict[str, PackageMetadata]:
    """
    Parse the PACKAGES file content (plain text) into a dict:
        name -> PackageMetadata

    The text is read line by line:

    * a blank (empty or whitespace-only) line ends the current record;
    * an indented line continues the value of the field that precedes it
      and is joined to that value with a single space;
    * any other line containing ":" starts a "Key: value" field, and lines
      without ":" are ignored.

    Records without a non-empty "Package" or "Version" field are skipped.
    If a package name occurs more than once, the last record wins.

    Args:
        text: Decompressed content of a PACKAGES file. LF and CRLF line
            endings are both accepted.

    Returns:
        Mapping from package name to its PackageMetadata.
    """
    records: List[Dict[str, str]] = []
    fields: Dict[str, str] = {}
    last_key: Optional[str] = None

    # Working on individual lines (instead of splitting on a literal "\n\n")
    # keeps record boundaries intact for CRLF files and for separator lines
    # that contain stray whitespace.
    for line in text.strip().splitlines():
        if not line.strip():
            # Blank line: close the record collected so far.
            if fields:
                records.append(fields)
            fields, last_key = {}, None
            continue

        if line[0].isspace():
            # Continuation of the most recently seen field. A continuation
            # that appears before any field has nothing to attach to and is
            # dropped rather than raising.
            if last_key is not None:
                fields[last_key] += " " + line.strip()
            continue

        if ":" not in line:
            continue
        key, value = line.split(":", 1)
        # Track the key explicitly: dict insertion order does not change when
        # a repeated key is overwritten, so it cannot identify the last field.
        last_key = key.strip()
        fields[last_key] = value.strip()

    # The final record is normally not followed by a blank line.
    if fields:
        records.append(fields)

    index: Dict[str, PackageMetadata] = {}
    for record in records:
        name = record.get("Package")
        version = record.get("Version")
        if not name or not version:
            continue

        index[name] = PackageMetadata(
            name=name,
            version=version,
            depends=_extract_dependencies(record.get("Depends")),
            imports=_extract_dependencies(record.get("Imports")),
            linking_to=_extract_dependencies(record.get("LinkingTo")),
            suggests=_extract_dependencies(record.get("Suggests")),
        )

    return index


async def load_cran_index(session: aiohttp.ClientSession) -> Dict[str, PackageMetadata]:
    """
    Fetch PACKAGES.gz from CRAN through `session` and return the parsed index.

    The response body is read in full, gunzipped, decoded as UTF-8 and
    handed to _parse_packages_text(). An error HTTP status surfaces through
    resp.raise_for_status().
    """
    async with session.get(CRAN_PACKAGES_URL) as resp:
        resp.raise_for_status()
        compressed = await resp.read()

    raw_bytes = gzip.decompress(compressed)
    # Undecodable bytes are replaced instead of aborting the whole load.
    packages_text = raw_bytes.decode("utf-8", errors="replace")
    return _parse_packages_text(packages_text)

In [9]:
import aiohttp
import asyncio
import gzip

# Repeats the CRAN_PACKAGES_URL assignment made earlier in this file
# (same value).
CRAN_PACKAGES_URL = "https://cran.r-project.org/src/contrib/PACKAGES.gz"

# Global reusable index: None until build_cran_index() rebinds it to the
# parsed name -> PackageMetadata dict.
CRAN_INDEX = None


async def build_cran_index():
    """
    Download and parse the CRAN package index and store it in CRAN_INDEX.

    Opens a short-lived aiohttp.ClientSession, delegates the download,
    decompression and parsing to load_cran_index(), rebinds the module-level
    CRAN_INDEX to the result and prints how many packages were indexed.

    Returns:
        None. The index is published through the CRAN_INDEX global.

    Raises:
        Errors from the HTTP request (including an error status reported by
        raise_for_status()) and from gzip decompression propagate unchanged;
        CRAN_INDEX keeps its previous value in that case.
    """
    global CRAN_INDEX

    # Reuse load_cran_index() instead of repeating its fetch/decompress/parse
    # steps, so the two entry points cannot drift apart.
    async with aiohttp.ClientSession() as session:
        CRAN_INDEX = await load_cran_index(session)

    print(f"CRAN index built with {len(CRAN_INDEX)} packages.")


# Run it once to build the index.
# NOTE: top-level `await` relies on the notebook kernel running cells inside
# an event loop; a plain script would need asyncio.run(build_cran_index()).
await build_cran_index()

CRAN index built with 23082 packages.


In [10]:
# NOTE(review): evaluating the bare name displays the repr of the entire
# index dict; consider previewing only a few entries instead.
CRAN_INDEX

{'AalenJohansen': PackageMetadata(name='AalenJohansen', version='1.0', depends=[], imports=[], linking_to=[], suggests=['knitr', 'rmarkdown']),
 'aamatch': PackageMetadata(name='aamatch', version='0.3.7', depends=[], imports=['iTOS', 'stats'], linking_to=[], suggests=['DOS2', 'sensitivity2x2xk', 'sensitivitymv', 'weightedRank', 'xtable']),
 'AATtools': PackageMetadata(name='AATtools', version='0.0.3', depends=[], imports=['magrittr', 'dplyr', 'doParallel', 'foreach'], linking_to=[], suggests=[]),
 'ABACUS': PackageMetadata(name='ABACUS', version='1.0.0', depends=[], imports=['ggplot2', 'shiny'], linking_to=[], suggests=['rmarkdown', 'knitr']),
 'abasequence': PackageMetadata(name='abasequence', version='0.1.0', depends=[], imports=[], linking_to=[], suggests=[]),
 'abbreviate': PackageMetadata(name='abbreviate', version='0.1', depends=[], imports=[], linking_to=[], suggests=['testthat']),
 'abc': PackageMetadata(name='abc', version='2.2.2', depends=['abc.data', 'nnet', 'quantreg', 'MAS

In [11]:
# Spot-check a single entry by package name (KeyError if it is not indexed).
CRAN_INDEX["Hmisc"]

PackageMetadata(name='Hmisc', version='5.2-4', depends=[], imports=['methods', 'ggplot2', 'cluster', 'rpart', 'nnet', 'foreign', 'gtable', 'grid', 'gridExtra', 'data.table', 'htmlTable', 'viridisLite', 'htmltools', 'base64enc', 'colorspace', 'rmarkdown', 'knitr', 'Formula'], linking_to=[], suggests=['survival', 'qreport', 'acepack', 'chron', 'rms', 'mice', 'rstudioapi', 'tables', 'plotly', 'rlang', 'VGAM', 'leaps', 'pcaPP', 'digest', 'parallel', 'polspline', 'abind', 'kableExtra', 'rio', 'lattice', 'latticeExtra', 'gt', 'sparkline', 'jsonlite', 'htmlwidgets', 'qs', 'getPass', 'keyring', 'safer', 'htm2txt', 'boot'])

In [12]:
import pickle

# Serialize the in-memory index to cran_index.pkl; the relative path resolves
# against the kernel's current working directory.
with open("cran_index.pkl", "wb") as f:
    pickle.dump(CRAN_INDEX, f)

print("Saved cran_index.pkl")

Saved cran_index.pkl


In [13]:
# test loading and comparing
# Round-trip check: the unpickled dict must equal the in-memory index
# (PackageMetadata is a dataclass, so instances compare field by field).
# pickle.load is acceptable here because the file was just written by this
# notebook; never unpickle files from an untrusted source.
with open("cran_index.pkl", "rb") as f:
    loaded_index = pickle.load(f)
assert loaded_index == CRAN_INDEX
print("Loaded index matches the original.")

Loaded index matches the original.
