feat(security): v0.7.1 SSRF guard with redirect re-validation + credential-dir denylist

AreteDriver · claude · AreteDriver · commit e4ba5c765c73 · 2026-05-13T01:38:55.000-07:00
Closes two MEDIUM findings from the 2026-05-11 semantic follow-up audit. F-NEW-2 — Web ingest SSRF guard (`memboot ingest <url>`): - Fetch moved off `trafilatura.fetch_url` to a stdlib `urllib` GET so the redirect chain is under our control. - `_validate_ingest_url` runs on the initial URL; `_ValidatingRedirectHandler` re-runs the same guard on every redirect hop, so a public URL can't 30x-bounce to 169.254.169.254 / RFC1918 / loopback. - Only http(s) schemes accepted. Multi-record DNS: any non-public address in the set rejects the URL. - Redirects capped at 5, body at 10 MiB (compression refused), 20 s timeout. - `MEMBOOT_INGEST_ALLOW_PRIVATE=1` overrides the IP check; the scheme check is always enforced. `trafilatura.extract` still does extraction. - Residual: DNS rebinding within a single fetch — out of scope for the single-user-CLI threat model. - Bare `except Exception` narrowed to `(OSError, ValueError)` (F-NEW-6). F-NEW-1 — Credential-directory denylist in default `ignore_patterns`: - Adds .env / .env.* / *.env, .aws, .ssh, .gnupg, .netrc, .docker, .kube, .gcloud, .azure, .terraform, secrets, secret, credentials, plus the file patterns *.pem, *.key, id_rsa*, id_ed25519*, *.p12, *.pfx, *.tfvars, *.tfvars.json. - Poka-yoke regression test in tests/test_indexer.py constructs a temp project with these surfaces and asserts none reach `discover_files` output. Tests: 649 passed (was 621), 90.9% coverage. Deps: - urllib3>=2.7.0 pinned (CVE-2026-44431, CVE-2026-44432). Version bump: 0.7.0 → 0.7.1. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "memboot"
-version = "0.7.0"
+version = "0.7.1"
 description = "Zero-infrastructure persistent memory for any LLM"
 readme = "README.md"
 license = {text = "MIT"}
@@ -52,6 +52,7 @@ dev = [
     "pillow>=12.2.0",
     "pygments>=2.20.0",
     "requests>=2.33.0",
+    "urllib3>=2.7.0",  # CVE-2026-44431, CVE-2026-44432
 ]
 
 [project.scripts]
diff --git a/src/memboot/__init__.py b/src/memboot/__init__.py
@@ -1,4 +1,4 @@
 """memboot -- Zero-infrastructure persistent memory for any LLM."""
 
-__version__ = "0.7.0"
+__version__ = "0.7.1"
 __all__ = ["__version__"]
diff --git a/src/memboot/ingest/web.py b/src/memboot/ingest/web.py
@@ -2,17 +2,148 @@
 
 from __future__ import annotations
 
+import ipaddress
 import json
+import os
+import socket
+import urllib.error
+import urllib.request
 from pathlib import Path
+from urllib.parse import urlparse
 from uuid import uuid4
 
+from memboot import __version__
 from memboot.chunker import _chunk_window
 from memboot.embedder import TfidfEmbedder, get_embedder
 from memboot.exceptions import IngestError
 from memboot.indexer import get_db_path
 from memboot.models import Chunk, ChunkType, MembootConfig
 from memboot.store import MembootStore
 
+# Environment variable that, when set to 1/true/yes, skips the non-public-IP check.
+_ALLOW_PRIVATE_ENV = "MEMBOOT_INGEST_ALLOW_PRIVATE"
+# Redirect cap for a single fetch (assigned to the redirect handler's max_redirections).
+_MAX_REDIRECTS = 5
+_MAX_BODY_BYTES = 10 * 1024 * 1024  # 10 MiB — generous for a single web page
+# Timeout, in seconds, passed to opener.open() when fetching a page.
+_FETCH_TIMEOUT = 20.0
+# User-Agent header value sent with the ingest request.
+_USER_AGENT = f"memboot/{__version__} (+https://github.com/AreteDriver/memboot)"
+
+
+def _is_blocked_ip(ip: ipaddress.IPv4Address | ipaddress.IPv6Address) -> bool:
+    """Return True if the address falls in any non-public range.
+
+    Blocks private (RFC1918), loopback, link-local (incl. AWS metadata at
+    169.254.169.254), multicast, reserved, and unspecified addresses for both
+    IPv4 and IPv6, plus anything else the stdlib does not classify as global.
+
+    The final ``is_global`` check exists for the RFC 6598 shared address
+    space (100.64.0.0/10): ``is_private`` is False for that range even though
+    it is not publicly routable, and it hosts internal endpoints such as the
+    Alibaba Cloud metadata service at 100.100.100.200.
+    """
+    return (
+        ip.is_private
+        or ip.is_loopback
+        or ip.is_link_local
+        or ip.is_multicast
+        or ip.is_reserved
+        or ip.is_unspecified
+        or not ip.is_global
+    )
+
+
+def _validate_ingest_url(url: str) -> None:
+    """Guard against SSRF before any network call.
+
+    Raises IngestError if:
+    - the URL cannot be parsed at all
+    - scheme is not http/https (rejects file://, gopher://, dict://, etc.)
+    - the URL has no hostname
+    - the hostname cannot be resolved
+    - the hostname resolves (any A/AAAA record) to a non-public address
+
+    Set MEMBOOT_INGEST_ALLOW_PRIVATE=1 to bypass the IP check for users who
+    legitimately need to ingest from internal docs servers. The scheme check
+    is always enforced.
+
+    Every redirect hop is re-validated through this function — see
+    _ValidatingRedirectHandler — so a public URL that 30x-redirects to a
+    private or cloud-metadata address is still refused.
+    """
+    try:
+        parsed = urlparse(url)
+    except ValueError as exc:
+        # urlparse rejects some malformed inputs (e.g. an unterminated IPv6
+        # literal). Report them as IngestError like every other rejection
+        # instead of leaking a raw ValueError to the caller.
+        raise IngestError(f"Invalid URL {url!r}: {exc}") from exc
+
+    if parsed.scheme not in ("http", "https"):
+        raise IngestError(
+            f"Unsupported URL scheme {parsed.scheme!r}. "
+            "Web ingestion accepts only http:// and https:// URLs."
+        )
+    hostname = parsed.hostname
+    if not hostname:
+        raise IngestError(f"URL has no hostname: {url}")
+
+    # Opt-out applies to the address check only; the scheme check above has
+    # already run by the time we get here.
+    if os.environ.get(_ALLOW_PRIVATE_ENV, "").strip().lower() in ("1", "true", "yes"):
+        return
+
+    try:
+        infos = socket.getaddrinfo(hostname, None)
+    except (OSError, UnicodeError) as exc:
+        # OSError covers resolver failures (socket.gaierror). UnicodeError is
+        # raised when the hostname cannot be IDNA-encoded, e.g. an empty or
+        # over-long label, and is not an OSError.
+        raise IngestError(f"Cannot resolve {hostname}: {exc}") from exc
+
+    # Multi-record DNS: one non-public address anywhere in the answer set is
+    # enough to reject the URL.
+    for info in infos:
+        addr = info[4][0]
+        try:
+            # Drop an IPv6 zone identifier ("fe80::1%eth0") before parsing.
+            ip = ipaddress.ip_address(addr.split("%", 1)[0])
+        except ValueError:
+            continue
+        if _is_blocked_ip(ip):
+            raise IngestError(
+                f"Refusing to fetch {url}: hostname {hostname!r} resolves to "
+                f"non-public address {ip}. Set {_ALLOW_PRIVATE_ENV}=1 to override "
+                "(e.g., internal documentation servers)."
+            )
+
+
+class _ValidatingRedirectHandler(urllib.request.HTTPRedirectHandler):
+    """Redirect handler that runs the SSRF guard on every hop.
+
+    By the time redirect_request is called, urllib has already joined a
+    relative Location header onto the current URL, so ``newurl`` is always
+    absolute. An IngestError raised by the guard is not handled here; it
+    propagates out of opener.open() and ends the redirect chain.
+    """
+
+    # Overrides the base handler's redirect limit.
+    max_redirections = _MAX_REDIRECTS
+
+    def redirect_request(self, req, fp, code, msg, headers, newurl):
+        """Validate ``newurl``, then defer to the stock redirect handling."""
+        _validate_ingest_url(newurl)
+        follow_up = super().redirect_request(req, fp, code, msg, headers, newurl)
+        return follow_up
+
+
+def _fetch_html(url: str) -> str | None:
+    """Fetch a URL's body as decoded text, re-validating every redirect hop.
+
+    Returns None if the response body is empty. Raises IngestError on an
+    unsafe redirect target, an HTTP error, a network failure, or a body that
+    exceeds _MAX_BODY_BYTES. Compression is refused (Accept-Encoding: identity)
+    so the size cap is a true cap, not a post-decompression one.
+
+    The body is decoded with the charset from the response headers (UTF-8
+    when none is given). If that charset names a codec Python cannot use,
+    decoding falls back to UTF-8 rather than failing. Undecodable bytes are
+    replaced, never raised on.
+    """
+    opener = urllib.request.build_opener(_ValidatingRedirectHandler())
+    req = urllib.request.Request(
+        url,
+        headers={
+            "User-Agent": _USER_AGENT,
+            "Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
+            "Accept-Encoding": "identity",
+        },
+    )
+    try:
+        with opener.open(req, timeout=_FETCH_TIMEOUT) as resp:
+            # Ask for one byte past the cap so an oversized body is detectable
+            # without reading it in full.
+            raw = resp.read(_MAX_BODY_BYTES + 1)
+            charset = resp.headers.get_content_charset() or "utf-8"
+    except urllib.error.HTTPError as exc:
+        raise IngestError(f"HTTP {exc.code} fetching {url}") from exc
+    except urllib.error.URLError as exc:
+        raise IngestError(f"Failed to fetch {url}: {exc.reason}") from exc
+    except (OSError, ValueError) as exc:
+        raise IngestError(f"Failed to fetch {url}: {exc}") from exc
+
+    if len(raw) > _MAX_BODY_BYTES:
+        raise IngestError(
+            f"Response from {url} exceeds the {_MAX_BODY_BYTES // (1024 * 1024)} MiB limit."
+        )
+    if not raw:
+        return None
+    try:
+        return raw.decode(charset, errors="replace")
+    except (LookupError, ValueError):
+        # The charset is server-controlled. An unknown or non-text codec name
+        # raises LookupError, and a few codecs reject the "replace" error
+        # handler with a ValueError subclass. Neither should crash ingest.
+        return raw.decode("utf-8", errors="replace")
+
 
 def ingest_url(
     url: str,
@@ -21,7 +152,8 @@ def ingest_url(
 ) -> list[Chunk]:
     """Ingest content from a URL into project memory.
 
-    Fetches and extracts main content using trafilatura, chunks via sliding
+    Fetches the page (validating the URL and every redirect hop against the
+    SSRF guard), extracts main content with trafilatura, chunks via sliding
     window, embeds, and stores. Requires Pro tier.
     """
     try:
@@ -33,20 +165,15 @@ def ingest_url(
 
     config = config or MembootConfig()
 
-    # Fetch and extract content
-    try:
-        downloaded = trafilatura.fetch_url(url)
-    except Exception as exc:
-        raise IngestError(f"Failed to fetch {url}: {exc}") from exc
-
+    _validate_ingest_url(url)
+    downloaded = _fetch_html(url)
     if downloaded is None:
         raise IngestError(f"Could not download content from {url}")
 
     text = trafilatura.extract(downloaded)
     if not text or not text.strip():
         raise IngestError(f"No extractable content from {url}")
 
-    # Chunk the extracted text
     chunk_results = _chunk_window(text, config)
 
     if not chunk_results:
@@ -64,7 +191,6 @@ def ingest_url(
         )
         chunks.append(chunk)
 
-    # Embed and store
     db_path = get_db_path(project_path.resolve())
     store = MembootStore(db_path)
 
diff --git a/src/memboot/models.py b/src/memboot/models.py
@@ -187,6 +187,7 @@ class MembootConfig(BaseModel):
     )
     ignore_patterns: list[str] = Field(
         default_factory=lambda: [
+            # Build / tooling caches
             "__pycache__",
             ".git",
             "node_modules",
@@ -200,6 +201,33 @@ class MembootConfig(BaseModel):
             "dist",
             "build",
             ".tox",
+            # Secret / credential surfaces — refuse to index by default.
+            # An allowed extension (.toml/.cfg/.ini/.json/.yaml/.yml/.txt) inside
+            # any of these would otherwise land in the embedding store and become
+            # recoverable via embedding inversion if the DB is ever shared.
+            ".env",
+            ".env.*",
+            "*.env",
+            ".aws",
+            ".ssh",
+            ".gnupg",
+            ".netrc",
+            ".docker",
+            ".kube",
+            ".gcloud",
+            ".azure",
+            ".terraform",
+            "secrets",
+            "secret",
+            "credentials",
+            "*.pem",
+            "*.key",
+            "id_rsa*",
+            "id_ed25519*",
+            "*.p12",
+            "*.pfx",
+            "*.tfvars",
+            "*.tfvars.json",
         ]
     )
 
diff --git a/tests/test_indexer.py b/tests/test_indexer.py
@@ -98,6 +98,100 @@ def test_empty_dir(self, tmp_path: Path):
         files = discover_files(project, config)
         assert files == []
 
+    def test_credential_dirs_skipped_when_explicitly_configured(self, tmp_path: Path):
+        """Mechanism check: an explicit ignore_patterns blocklist DOES skip credential paths.
+
+        Proves the discover_files plumbing handles dotted dir names and glob
+        patterns correctly. Pairs with test_credential_dirs_skipped_by_default
+        which checks that the *default* config carries the same patterns.
+        """
+        project = tmp_path / "proj"
+        project.mkdir()
+        (project / "good.py").write_text("x = 1\n")
+
+        # Single source of truth for both the fixture files and the assertions.
+        credential_paths = (
+            ".env",
+            ".aws/credentials",
+            ".ssh/config",
+            "secrets/api.toml",
+            "terraform.tfvars",
+            "id_rsa.pem",
+        )
+        for rel in credential_paths:
+            target = project / rel
+            target.parent.mkdir(parents=True, exist_ok=True)
+            target.write_text("SECRET=hunter2\n")
+
+        # The extension allowlist is widened (including "" for extension-less
+        # files) so that only ignore_patterns can be what excludes these paths.
+        config = MembootConfig(
+            file_extensions=[".py", ".toml", ".pem", ".tfvars", ""],
+            ignore_patterns=[
+                ".env",
+                ".aws",
+                ".ssh",
+                "secrets",
+                "*.pem",
+                "*.tfvars",
+            ],
+        )
+        files = discover_files(project, config)
+        # as_posix() keeps the comparison meaningful on Windows, where str()
+        # would produce backslashes and every "not in" check would pass
+        # vacuously.
+        relpaths = {f.relative_to(project).as_posix() for f in files}
+        for forbidden in credential_paths:
+            assert forbidden not in relpaths, (
+                f"discover_files leaked credential path {forbidden!r}: {relpaths}"
+            )
+        # Sanity: discovery actually ran and the non-credential file is indexed.
+        assert "good.py" in relpaths
+
+    def test_credential_dirs_skipped_by_default(self, tmp_path: Path):
+        """Regression test: default config must not index credential paths.
+
+        Constructs a temp project containing credential-style files (.env,
+        .git/, .aws/credentials, secrets/, credentials/, *.tfvars, *.pem,
+        .docker/, .kube/, .gnupg/) and asserts none appear in discover_files
+        output under a default MembootConfig.
+
+        NOTE(review): entries whose extension is not on the default
+        file_extensions allowlist are presumably excluded by the extension
+        filter regardless of ignore_patterns; the .toml/.json entries are the
+        ones that rely on the patterns themselves. Confirm against
+        MembootConfig.file_extensions.
+
+        This is the poka-yoke against silent regression on the default skip
+        list — any future "cleanup" that drops one of these patterns trips
+        this test.
+        """
+        project = tmp_path / "proj"
+        project.mkdir()
+        (project / "good.py").write_text("x = 1\n")
+
+        scenarios = {
+            ".env": "OPENAI_API_KEY=sk-...\n",
+            ".aws/credentials": "[default]\naws_access_key_id=AKIA...\n",
+            ".ssh/config": "Host *\n  IdentityFile ~/.ssh/id_rsa\n",
+            "secrets/prod.toml": '[db]\npassword = "hunter2"\n',
+            "credentials/app.json": '{"token": "shh"}\n',
+            "terraform.tfvars": 'aws_secret = "..."\n',
+            "id_rsa.pem": "-----BEGIN PRIVATE KEY-----\n",
+            ".docker/config.json": '{"auths": {"...": {}}}\n',
+            ".kube/config": "apiVersion: v1\n",
+            ".gnupg/private-keys-v1.d/key.key": "binary blob\n",
+            ".git/config": "[remote]\n",
+        }
+        for rel, body in scenarios.items():
+            target = project / rel
+            target.parent.mkdir(parents=True, exist_ok=True)
+            target.write_text(body)
+
+        config = MembootConfig()
+        files = discover_files(project, config)
+        # as_posix() keeps the comparison meaningful on Windows, where str()
+        # would produce backslashes and every "not in" check would pass
+        # vacuously.
+        relpaths = {f.relative_to(project).as_posix() for f in files}
+
+        for forbidden in scenarios:
+            assert forbidden not in relpaths, (
+                f"Default ignore_patterns leaked credential path {forbidden!r}.\n"
+                f"Discovered: {sorted(relpaths)}"
+            )
+        # Sanity: the non-credential file still indexed.
+        assert "good.py" in relpaths
+
 
 class TestIndexProject:
     def test_full_pipeline(self, tmp_project_dir: Path, monkeypatch):
diff --git a/tests/test_ingest.py b/tests/test_ingest.py