In [3]:
from pathlib import Path

def gather_py_files_with_content(
    root: str | Path,
    max_chars: int = 100_000,
    separator: str = "\n\n"
) -> str:
    """
    Recursively finds all .py files under `root`, but if a file's content
    is longer than `max_chars` characters it will be skipped. At the end,
    prints a list of skipped files with their character counts.

    Returns a single string where each included file is prefixed by its full path:
        ==== /full/path/to/module.py ====
        <file contents>
    """
    root = Path(root).expanduser().resolve()
    segments: list[str] = []
    ignored: list[tuple[str, int]] = []

    for py_file in root.rglob("*.py"):
        # read the file (with a UTF-8 -> Latin-1 fallback)
        try:
            text = py_file.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            text = py_file.read_text(encoding="latin-1", errors="ignore")

        length = len(text)
        if length > max_chars:
            ignored.append((str(py_file), length))
            continue

        header = f"==== {py_file} ===="
        segments.append(header)
        segments.append(text)

    # print out ignored files
    if ignored:
        print("Ignored files (exceeded character limit of "
              f"{max_chars}):")
        for path, count in ignored:
            print(f"{path}: {count:,} characters")

    return separator.join(segments)

In [4]:


# Collect every Python source file under the "realhf" directory into one
# string, keep it in `combined` for later cells, and dump it to stdout.
print(combined := gather_py_files_with_content("realhf"))


==== /home/roeseler/github/AReaL/realhf/version.py ====

import subprocess
from pathlib import Path

__version__ = "0.3.0-dev"
__branch__ = ""
__commit__ = ""
__is_dirty__ = False

try:
    __branch__ = (
        subprocess.check_output(
            ["git", "branch", "--show-current"],
            stderr=subprocess.DEVNULL,
            cwd=Path(__file__).parent,
        )
        .decode("utf-8")
        .strip()
    )
    __commit__ = (
        subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"],
            stderr=subprocess.DEVNULL,
            cwd=Path(__file__).parent,
        )
        .decode("utf-8")
        .strip()
    )
    __is_dirty__ = False
    try:
        subprocess.check_call(
            ["git", "diff-index", "--quiet", "HEAD", "--"],
            stderr=subprocess.DEVNULL,
            cwd=Path(__file__).parent,
        )
    except subprocess.CalledProcessError:
        __is_dirty__ = True
except (subprocess.CalledProcessError, FileNotFoundEr