Skip to content

Commit e4ba5c7

Browse files
AreteDriver and claude committed
feat(security): v0.7.1 SSRF guard with redirect re-validation + credential-dir denylist
Closes two MEDIUM findings from the 2026-05-11 semantic follow-up audit. F-NEW-2 — Web ingest SSRF guard (`memboot ingest <url>`): - Fetch moved off `trafilatura.fetch_url` to a stdlib `urllib` GET so the redirect chain is under our control. - `_validate_ingest_url` runs on the initial URL; `_ValidatingRedirectHandler` re-runs the same guard on every redirect hop, so a public URL can't 30x-bounce to 169.254.169.254 / RFC1918 / loopback. - Only http(s) schemes accepted. Multi-record DNS: any non-public address in the set rejects the URL. - Redirects capped at 5, body at 10 MiB (compression refused), 20 s timeout. - `MEMBOOT_INGEST_ALLOW_PRIVATE=1` overrides the IP check; the scheme check is always enforced. `trafilatura.extract` still does extraction. - Residual: DNS rebinding within a single fetch — out of scope for the single-user-CLI threat model. - Bare `except Exception` narrowed to `(OSError, ValueError)` (F-NEW-6). F-NEW-1 — Credential-directory denylist in default `ignore_patterns`: - Adds .env / .env.* / *.env, .aws, .ssh, .gnupg, .netrc, .docker, .kube, .gcloud, .azure, .terraform, secrets, secret, credentials, plus the file patterns *.pem, *.key, id_rsa*, id_ed25519*, *.p12, *.pfx, *.tfvars, *.tfvars.json. - Poka-yoke regression test in tests/test_indexer.py constructs a temp project with these surfaces and asserts none reach `discover_files` output. Tests: 649 passed (was 621), 90.9% coverage. Deps: - urllib3>=2.7.0 pinned (CVE-2026-44431, CVE-2026-44432). Version bump: 0.7.0 → 0.7.1. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6dce896 commit e4ba5c7

6 files changed

Lines changed: 551 additions & 41 deletions

File tree

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "memboot"
7-
version = "0.7.0"
7+
version = "0.7.1"
88
description = "Zero-infrastructure persistent memory for any LLM"
99
readme = "README.md"
1010
license = {text = "MIT"}
@@ -52,6 +52,7 @@ dev = [
5252
"pillow>=12.2.0",
5353
"pygments>=2.20.0",
5454
"requests>=2.33.0",
55+
"urllib3>=2.7.0", # CVE-2026-44431, CVE-2026-44432
5556
]
5657

5758
[project.scripts]

src/memboot/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
"""memboot -- Zero-infrastructure persistent memory for any LLM."""
22

3-
__version__ = "0.7.0"
3+
__version__ = "0.7.1"
44
__all__ = ["__version__"]

src/memboot/ingest/web.py

Lines changed: 135 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,148 @@
22

33
from __future__ import annotations
44

5+
import ipaddress
56
import json
7+
import os
8+
import socket
9+
import urllib.error
10+
import urllib.request
611
from pathlib import Path
12+
from urllib.parse import urlparse
713
from uuid import uuid4
814

15+
from memboot import __version__
916
from memboot.chunker import _chunk_window
1017
from memboot.embedder import TfidfEmbedder, get_embedder
1118
from memboot.exceptions import IngestError
1219
from memboot.indexer import get_db_path
1320
from memboot.models import Chunk, ChunkType, MembootConfig
1421
from memboot.store import MembootStore
1522

23+
_ALLOW_PRIVATE_ENV = "MEMBOOT_INGEST_ALLOW_PRIVATE"
24+
_MAX_REDIRECTS = 5
25+
_MAX_BODY_BYTES = 10 * 1024 * 1024 # 10 MiB — generous for a single web page
26+
_FETCH_TIMEOUT = 20.0
27+
_USER_AGENT = f"memboot/{__version__} (+https://github.com/AreteDriver/memboot)"
28+
29+
30+
def _is_blocked_ip(ip: ipaddress.IPv4Address | ipaddress.IPv6Address) -> bool:
    """Return True if the address falls in any non-public range.

    Blocks private (RFC1918), loopback, link-local (incl. AWS metadata at
    169.254.169.254), multicast, reserved, and unspecified addresses for both
    IPv4 and IPv6.

    Also blocks anything the ``ipaddress`` module does not classify as
    globally reachable. That closes the gap left by ``is_private``, which is
    False for the shared address space 100.64.0.0/10 (RFC 6598, carrier-grade
    NAT) even though that range is not publicly routable.
    """
    return (
        ip.is_private
        or ip.is_loopback
        or ip.is_link_local
        or ip.is_multicast
        or ip.is_reserved
        or ip.is_unspecified
        # Catch-all for non-public ranges the flags above miss (100.64.0.0/10).
        # The explicit checks stay because globally-scoped multicast still
        # reports is_global=True.
        or not ip.is_global
    )
45+
46+
47+
def _validate_ingest_url(url: str) -> None:
    """Guard against SSRF before any network call.

    Raises IngestError if:
      - the URL cannot be parsed at all
      - scheme is not http/https (rejects file://, gopher://, dict://, etc.)
      - the URL has no hostname
      - the hostname cannot be resolved
      - the hostname resolves (any A/AAAA record) to a non-public address
      - none of the resolved addresses could be checked (fail closed)

    Set MEMBOOT_INGEST_ALLOW_PRIVATE=1 (or "true"/"yes") to bypass the IP
    check for users who legitimately need to ingest from internal docs
    servers. The scheme check is always enforced.

    Every redirect hop is re-validated through this function — see
    _ValidatingRedirectHandler — so a public URL that 30x-redirects to a
    private or cloud-metadata address is still refused.

    Note: the hostname is resolved here, separately from the later fetch, so
    this check does not defend against DNS rebinding between the two lookups.
    """
    try:
        parsed = urlparse(url)
    except ValueError as exc:
        # urlparse rejects malformed authorities (e.g. an unbalanced "[" in an
        # IPv6 literal); surface that as the module's own error type.
        raise IngestError(f"Invalid URL {url!r}: {exc}") from exc

    if parsed.scheme not in ("http", "https"):
        raise IngestError(
            f"Unsupported URL scheme {parsed.scheme!r}. "
            "Web ingestion accepts only http:// and https:// URLs."
        )
    hostname = parsed.hostname
    if not hostname:
        raise IngestError(f"URL has no hostname: {url}")

    # Explicit opt-out of the address check only; the scheme check above has
    # already run and cannot be bypassed.
    if os.environ.get(_ALLOW_PRIVATE_ENV, "").strip().lower() in ("1", "true", "yes"):
        return

    try:
        infos = socket.getaddrinfo(hostname, None)
    except (OSError, UnicodeError) as exc:
        # UnicodeError: the IDNA codec rejects empty or over-long labels
        # before any lookup happens, and it is not an OSError.
        raise IngestError(f"Cannot resolve {hostname}: {exc}") from exc

    validated_any = False
    for info in infos:
        addr = info[4][0]
        try:
            # Drop an IPv6 zone/scope suffix ("fe80::1%eth0") before parsing.
            ip = ipaddress.ip_address(addr.split("%", 1)[0])
        except ValueError:
            continue
        # A single non-public record is enough to refuse the whole URL.
        if _is_blocked_ip(ip):
            raise IngestError(
                f"Refusing to fetch {url}: hostname {hostname!r} resolves to "
                f"non-public address {ip}. Set {_ALLOW_PRIVATE_ENV}=1 to override "
                "(e.g., internal documentation servers)."
            )
        validated_any = True

    if not validated_any:
        # Fail closed: never approve a URL whose target address was not checked.
        raise IngestError(f"Cannot resolve {hostname}: no usable address returned")
93+
94+
95+
class _ValidatingRedirectHandler(urllib.request.HTTPRedirectHandler):
    """Redirect handler that runs the SSRF guard on every hop's target.

    By the time urllib calls redirect_request it has already resolved a
    relative Location header against the current URL, so ``newurl`` is always
    absolute. If the guard raises IngestError the redirect chain stops and
    the exception propagates out of opener.open().
    """

    # Upper bound on the length of a redirect chain.
    max_redirections = _MAX_REDIRECTS

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Check ``newurl`` with the SSRF guard, then defer to the stock logic."""
        _validate_ingest_url(newurl)
        follow_up = super().redirect_request(req, fp, code, msg, headers, newurl)
        return follow_up
109+
110+
111+
def _fetch_html(url: str) -> str | None:
    """Fetch a URL's body as decoded text, re-validating every redirect hop.

    Returns None if the response body is empty. Raises IngestError on an
    unsafe redirect target, an HTTP error, a network failure, a malformed or
    truncated HTTP response, or a body that exceeds _MAX_BODY_BYTES.
    Compression is refused (Accept-Encoding: identity) so the size cap is a
    true cap, not a post-decompression one.

    The body is decoded with the charset from the Content-Type header,
    falling back to UTF-8 when none is given or the advertised one is not a
    codec Python knows. Undecodable bytes are replaced, never raised on.
    """
    # Function-scope import: only needed for the exception type below.
    import http.client

    opener = urllib.request.build_opener(_ValidatingRedirectHandler())
    req = urllib.request.Request(
        url,
        headers={
            "User-Agent": _USER_AGENT,
            "Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
            "Accept-Encoding": "identity",
        },
    )
    try:
        with opener.open(req, timeout=_FETCH_TIMEOUT) as resp:
            # Read one byte past the cap so an oversized body is detectable
            # without downloading all of it.
            raw = resp.read(_MAX_BODY_BYTES + 1)
            charset = resp.headers.get_content_charset() or "utf-8"
    except urllib.error.HTTPError as exc:
        raise IngestError(f"HTTP {exc.code} fetching {url}") from exc
    except urllib.error.URLError as exc:
        raise IngestError(f"Failed to fetch {url}: {exc.reason}") from exc
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # HTTPException (bad status line, incomplete read, ...) is not an
        # OSError, so it needs its own entry to avoid a raw traceback.
        raise IngestError(f"Failed to fetch {url}: {exc}") from exc

    if len(raw) > _MAX_BODY_BYTES:
        raise IngestError(
            f"Response from {url} exceeds the {_MAX_BODY_BYTES // (1024 * 1024)} MiB limit."
        )
    if not raw:
        return None
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server advertised a charset Python has no text codec for.
        return raw.decode("utf-8", errors="replace")
146+
16147

17148
def ingest_url(
18149
url: str,
@@ -21,7 +152,8 @@ def ingest_url(
21152
) -> list[Chunk]:
22153
"""Ingest content from a URL into project memory.
23154
24-
Fetches and extracts main content using trafilatura, chunks via sliding
155+
Fetches the page (validating the URL and every redirect hop against the
156+
SSRF guard), extracts main content with trafilatura, chunks via sliding
25157
window, embeds, and stores. Requires Pro tier.
26158
"""
27159
try:
@@ -33,20 +165,15 @@ def ingest_url(
33165

34166
config = config or MembootConfig()
35167

36-
# Fetch and extract content
37-
try:
38-
downloaded = trafilatura.fetch_url(url)
39-
except Exception as exc:
40-
raise IngestError(f"Failed to fetch {url}: {exc}") from exc
41-
168+
_validate_ingest_url(url)
169+
downloaded = _fetch_html(url)
42170
if downloaded is None:
43171
raise IngestError(f"Could not download content from {url}")
44172

45173
text = trafilatura.extract(downloaded)
46174
if not text or not text.strip():
47175
raise IngestError(f"No extractable content from {url}")
48176

49-
# Chunk the extracted text
50177
chunk_results = _chunk_window(text, config)
51178

52179
if not chunk_results:
@@ -64,7 +191,6 @@ def ingest_url(
64191
)
65192
chunks.append(chunk)
66193

67-
# Embed and store
68194
db_path = get_db_path(project_path.resolve())
69195
store = MembootStore(db_path)
70196

src/memboot/models.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@ class MembootConfig(BaseModel):
187187
)
188188
ignore_patterns: list[str] = Field(
189189
default_factory=lambda: [
190+
# Build / tooling caches
190191
"__pycache__",
191192
".git",
192193
"node_modules",
@@ -200,6 +201,33 @@ class MembootConfig(BaseModel):
200201
"dist",
201202
"build",
202203
".tox",
204+
# Secret / credential surfaces — refuse to index by default.
205+
# An allowed extension (.toml/.cfg/.ini/.json/.yaml/.yml/.txt) inside
206+
# any of these would otherwise land in the embedding store and become
207+
# recoverable via embedding inversion if the DB is ever shared.
208+
".env",
209+
".env.*",
210+
"*.env",
211+
".aws",
212+
".ssh",
213+
".gnupg",
214+
".netrc",
215+
".docker",
216+
".kube",
217+
".gcloud",
218+
".azure",
219+
".terraform",
220+
"secrets",
221+
"secret",
222+
"credentials",
223+
"*.pem",
224+
"*.key",
225+
"id_rsa*",
226+
"id_ed25519*",
227+
"*.p12",
228+
"*.pfx",
229+
"*.tfvars",
230+
"*.tfvars.json",
203231
]
204232
)
205233

tests/test_indexer.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,100 @@ def test_empty_dir(self, tmp_path: Path):
9898
files = discover_files(project, config)
9999
assert files == []
100100

101+
def test_credential_dirs_skipped_when_explicitly_configured(self, tmp_path: Path):
    """Mechanism check: an explicit ignore_patterns blocklist DOES skip credential paths.

    Proves the discover_files plumbing handles dotted dir names and glob
    patterns correctly. Pairs with test_credential_dirs_skipped_by_default
    which checks that the *default* config carries the same patterns.

    file_extensions is widened (including "" for extension-less files) so
    that ignore_patterns is the only thing keeping these files out.
    """
    project = tmp_path / "proj"
    project.mkdir()
    (project / "good.py").write_text("x = 1\n")

    credential_paths = (
        ".env",
        ".aws/credentials",
        ".ssh/config",
        "secrets/api.toml",
        "terraform.tfvars",
        "id_rsa.pem",
    )
    for rel in credential_paths:
        target = project / rel
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text("SECRET=hunter2\n")

    config = MembootConfig(
        file_extensions=[".py", ".toml", ".pem", ".tfvars", ""],
        ignore_patterns=[
            ".env",
            ".aws",
            ".ssh",
            "secrets",
            "*.pem",
            "*.tfvars",
        ],
    )
    files = discover_files(project, config)
    # as_posix() keeps "/" separators on Windows; with str() the
    # "/"-separated names below could never match and the test would pass
    # vacuously there.
    relpaths = {f.relative_to(project).as_posix() for f in files}
    for forbidden in credential_paths:
        assert forbidden not in relpaths, (
            f"discover_files leaked credential path {forbidden!r}: {relpaths}"
        )
    # Sanity: discovery itself works, so the absences above are meaningful.
    assert "good.py" in relpaths
148+
149+
def test_credential_dirs_skipped_by_default(self, tmp_path: Path):
    """Regression test: the default ignore_patterns must not index credential paths.

    Constructs a temp project with the exact attack scenarios named in
    the audit (.env, .git/, .aws/credentials, secrets/, *.tfvars, *.pem)
    and asserts none appear in discover_files output.

    Only file_extensions is overridden; ignore_patterns stays at its default.
    The extension list is widened (including "" for extension-less files) so
    every scenario file gets past the extension filter. Otherwise files such
    as .env or id_rsa.pem would presumably be skipped for their extension
    alone and the test could not detect a pattern being dropped.
    NOTE(review): assumes the default file_extensions excludes these
    suffixes; confirm against MembootConfig.

    This is the poka-yoke against silent regression on the default skip
    list — any future "cleanup" that drops one of these patterns trips
    this test.
    """
    project = tmp_path / "proj"
    project.mkdir()
    (project / "good.py").write_text("x = 1\n")

    scenarios = {
        ".env": "OPENAI_API_KEY=sk-...\n",
        ".aws/credentials": "[default]\naws_access_key_id=AKIA...\n",
        ".ssh/config": "Host *\n  IdentityFile ~/.ssh/id_rsa\n",
        "secrets/prod.toml": '[db]\npassword = "hunter2"\n',
        "credentials/app.json": '{"token": "shh"}\n',
        "terraform.tfvars": 'aws_secret = "..."\n',
        "id_rsa.pem": "-----BEGIN PRIVATE KEY-----\n",
        ".docker/config.json": '{"auths": {"...": {}}}\n',
        ".kube/config": "apiVersion: v1\n",
        ".gnupg/private-keys-v1.d/key.key": "binary blob\n",
        ".git/config": "[remote]\n",
    }
    for rel, body in scenarios.items():
        target = project / rel
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(body)

    # ignore_patterns is deliberately NOT passed: the defaults are under test.
    config = MembootConfig(
        file_extensions=[".py", ".toml", ".json", ".tfvars", ".pem", ".key", ""],
    )
    files = discover_files(project, config)
    # as_posix() keeps "/" separators on Windows so the checks below can match.
    relpaths = {f.relative_to(project).as_posix() for f in files}

    for forbidden in scenarios:
        assert forbidden not in relpaths, (
            f"Default ignore_patterns leaked credential path {forbidden!r}.\n"
            f"Discovered: {sorted(relpaths)}"
        )
    # Sanity: the non-credential file still indexed.
    assert "good.py" in relpaths
194+
101195

102196
class TestIndexProject:
103197
def test_full_pipeline(self, tmp_project_dir: Path, monkeypatch):

0 commit comments

Comments
 (0)