Skip to content

Commit

Permalink
00399-cve-2023-24329.patch
Browse files Browse the repository at this point in the history
00399 #

* pythongh-102153: Start stripping C0 control and space chars in `urlsplit` (pythonGH-102508)

`urllib.parse.urlsplit` has already been respecting the WHATWG spec a bit pythonGH-25595.

This adds more sanitizing to respect the "Remove any leading C0 control or space from input" [rule](https://url.spec.whatwg.org/GH-url-parsing:~:text=Remove%20any%20leading%20and%20trailing%20C0%20control%20or%20space%20from%20input.) in response to [CVE-2023-24329](https://nvd.nist.gov/vuln/detail/CVE-2023-24329).

Backported to Python 2 from Python 3.12.

Co-authored-by: Illia Volochii <illia.volochii@gmail.com>
Co-authored-by: Gregory P. Smith [Google] <greg@krypto.org>
Co-authored-by: Lumir Balhar <lbalhar@redhat.com>
  • Loading branch information
3 people authored and stratakis committed May 26, 2023
1 parent b9dd9c2 commit ac6281c
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 0 deletions.
57 changes: 57 additions & 0 deletions Lib/test/test_urlparse.py
Expand Up @@ -666,7 +666,64 @@ def test_urlsplit_remove_unsafe_bytes(self):
self.assertEqual(p.scheme, "https")
self.assertEqual(p.geturl(), "https://www.python.org/javascript:alert('msg')/?query=something#fragment")

def test_urlsplit_strip_url(self):
noise = "".join([chr(i) for i in range(0, 0x20 + 1)])
base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag"

url = noise.decode("utf-8") + base_url
p = urlparse.urlsplit(url)
self.assertEqual(p.scheme, "http")
self.assertEqual(p.netloc, "User:Pass@www.python.org:080")
self.assertEqual(p.path, "/doc/")
self.assertEqual(p.query, "query=yes")
self.assertEqual(p.fragment, "frag")
self.assertEqual(p.username, "User")
self.assertEqual(p.password, "Pass")
self.assertEqual(p.hostname, "www.python.org")
self.assertEqual(p.port, 80)
self.assertEqual(p.geturl(), base_url)

url = noise + base_url.encode("utf-8")
p = urlparse.urlsplit(url)
self.assertEqual(p.scheme, b"http")
self.assertEqual(p.netloc, b"User:Pass@www.python.org:080")
self.assertEqual(p.path, b"/doc/")
self.assertEqual(p.query, b"query=yes")
self.assertEqual(p.fragment, b"frag")
self.assertEqual(p.username, b"User")
self.assertEqual(p.password, b"Pass")
self.assertEqual(p.hostname, b"www.python.org")
self.assertEqual(p.port, 80)
self.assertEqual(p.geturl(), base_url.encode("utf-8"))

# Test that trailing space is preserved as some applications rely on
# this within query strings.
query_spaces_url = "https://www.python.org:88/doc/?query= "
p = urlparse.urlsplit(noise.decode("utf-8") + query_spaces_url)
self.assertEqual(p.scheme, "https")
self.assertEqual(p.netloc, "www.python.org:88")
self.assertEqual(p.path, "/doc/")
self.assertEqual(p.query, "query= ")
self.assertEqual(p.port, 88)
self.assertEqual(p.geturl(), query_spaces_url)

p = urlparse.urlsplit("www.pypi.org ")
# That "hostname" gets considered a "path" due to the
# trailing space and our existing logic... YUCK...
# and re-assembles via geturl aka unurlsplit into the original.
# django.core.validators.URLValidator (at least through v3.2) relies on
# this, for better or worse, to catch it in a ValidationError via its
# regular expressions.
# Here we test the basic round trip concept of such a trailing space.
self.assertEqual(urlparse.urlunsplit(p), "www.pypi.org ")

# with scheme as cache-key
url = "//www.python.org/"
scheme = noise.decode("utf-8") + "https" + noise.decode("utf-8")
for _ in range(2):
p = urlparse.urlsplit(url, scheme=scheme)
self.assertEqual(p.scheme, "https")
self.assertEqual(p.geturl(), "https://www.python.org/")

def test_attributes_bad_port(self):
"""Check handling of non-integer ports."""
Expand Down
10 changes: 10 additions & 0 deletions Lib/urlparse.py
Expand Up @@ -26,6 +26,10 @@
parsing quirks from older RFCs are retained. The testcases in
test_urlparse.py provides a good indicator of parsing behavior.
The WHATWG URL Parser spec should also be considered. We are not compliant with
it either due to existing user code API behavior expectations (Hyrum's Law).
It serves as a useful guide when making changes.
"""

import re
Expand Down Expand Up @@ -63,6 +67,10 @@
'0123456789'
'+-.')

# Leading and trailing C0 control and space to be stripped per WHATWG spec.
# == "".join([chr(i) for i in range(0, 0x20 + 1)])
_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '

# Unsafe bytes to be removed per WHATWG spec
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']

Expand Down Expand Up @@ -201,6 +209,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
(e.g. netloc is a single string) and we don't expand % escapes."""
url = _remove_unsafe_bytes_from_url(url)
scheme = _remove_unsafe_bytes_from_url(scheme)
url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
allow_fragments = bool(allow_fragments)
key = url, scheme, allow_fragments, type(url), type(scheme)
cached = _parse_cache.get(key, None)
Expand Down

0 comments on commit ac6281c

Please sign in to comment.