Skip to content

Commit

Permalink
pythongh-102153: Start stripping C0 control and space chars in `urlsp…
Browse files Browse the repository at this point in the history
…lit`
  • Loading branch information
illia-v committed Mar 7, 2023
1 parent 4a3ea1f commit 5e67815
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 3 deletions.
9 changes: 7 additions & 2 deletions Doc/library/urllib.parse.rst
Expand Up @@ -324,8 +324,9 @@ or on combining URL components into a URL string.
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
decomposed before parsing, no error will be raised.

Following the `WHATWG spec`_ that updates RFC 3986, ASCII newline
``\n``, ``\r`` and tab ``\t`` characters are stripped from the URL.
Following the `WHATWG spec`_ that updates RFC 3986, leading and trailing C0
control and space characters are stripped from the URL. ``\n``, ``\r`` and
tab ``\t`` characters are removed from the URL at any position.

.. versionchanged:: 3.6
Out-of-range port numbers now raise :exc:`ValueError`, instead of
Expand All @@ -338,6 +339,10 @@ or on combining URL components into a URL string.
.. versionchanged:: 3.10
ASCII newline and tab characters are stripped from the URL.

.. versionchanged:: 3.12
Leading and trailing C0 control and space characters are stripped from
the URL.

.. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser

.. function:: urlunsplit(parts)
Expand Down
40 changes: 39 additions & 1 deletion Lib/test/test_urlparse.py
Expand Up @@ -649,14 +649,52 @@ def test_urlsplit_remove_unsafe_bytes(self):
self.assertEqual(p.scheme, "http")
self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")

def test_urlsplit_strip_url(self):
    """Check that urlsplit() strips surrounding C0 control and space chars.

    The URL (as both str and bytes) and the default scheme are wrapped in
    every C0 control character plus the space character; the parsed
    components must match those of the undecorated URL.
    """
    # All C0 control characters 0x00-0x1f *inclusive*, followed by the
    # space 0x20.  The range end is exclusive, hence the ``+ 1``.
    noise = bytes(range(0x20 + 1))
    base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag"

    # str input
    url = noise.decode() + base_url + noise.decode()
    p = urllib.parse.urlsplit(url)
    self.assertEqual(p.scheme, "http")
    self.assertEqual(p.netloc, "User:Pass@www.python.org:080")
    self.assertEqual(p.path, "/doc/")
    self.assertEqual(p.query, "query=yes")
    self.assertEqual(p.fragment, "frag")
    self.assertEqual(p.username, "User")
    self.assertEqual(p.password, "Pass")
    self.assertEqual(p.hostname, "www.python.org")
    self.assertEqual(p.port, 80)
    self.assertEqual(p.geturl(), base_url)

    # bytes input
    url = noise + base_url.encode() + noise
    p = urllib.parse.urlsplit(url)
    self.assertEqual(p.scheme, b"http")
    self.assertEqual(p.netloc, b"User:Pass@www.python.org:080")
    self.assertEqual(p.path, b"/doc/")
    self.assertEqual(p.query, b"query=yes")
    self.assertEqual(p.fragment, b"frag")
    self.assertEqual(p.username, b"User")
    self.assertEqual(p.password, b"Pass")
    self.assertEqual(p.hostname, b"www.python.org")
    self.assertEqual(p.port, 80)
    self.assertEqual(p.geturl(), base_url.encode())

    # with scheme as cache-key
    url = "//www.python.org/"
    scheme = noise.decode() + "https" + noise.decode()
    # Parse twice with identical arguments so that the second call can be
    # answered from a result cache keyed on the (noisy) scheme, if any.
    for _ in range(2):
        p = urllib.parse.urlsplit(url, scheme=scheme)
        self.assertEqual(p.scheme, "https")
        self.assertEqual(p.geturl(), "https://www.python.org/")

def test_attributes_bad_port(self):
"""Check handling of invalid ports."""
for bytes in (False, True):
for parse in (urllib.parse.urlsplit, urllib.parse.urlparse):
for port in ("foo", "1.5", "-1", "0x10", "-0", "1_1", " 1", "1 ", "६"):
with self.subTest(bytes=bytes, parse=parse, port=port):
netloc = "www.example.net:" + port
url = "http://" + netloc
url = "http://" + netloc + "/"
if bytes:
if netloc.isascii() and port.isascii():
netloc = netloc.encode("ascii")
Expand Down
5 changes: 5 additions & 0 deletions Lib/urllib/parse.py
Expand Up @@ -79,6 +79,9 @@
'0123456789'
'+-.')

# Characters stripped from both ends of a URL per WHATWG spec: the C0
# control characters (U+0000 through U+001F) and the space (U+0020).
_URL_CHARS_TO_STRIP = "".join(map(chr, range(0x20 + 1)))

# Unsafe characters removed from a URL at any position per WHATWG spec
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']

Expand Down Expand Up @@ -452,6 +455,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
"""

url, scheme, _coerce_result = _coerce_args(url, scheme)
url = url.strip(_URL_CHARS_TO_STRIP)
scheme = scheme.strip(_URL_CHARS_TO_STRIP)

for b in _UNSAFE_URL_BYTES_TO_REMOVE:
url = url.replace(b, "")
Expand Down
@@ -0,0 +1,3 @@
:func:`urllib.parse.urlsplit` now strips leading and trailing C0 control and
space characters following the controlling specification for URLs defined by
WHATWG in response to CVE-2023-24329. Patch by Illia Volochii.

0 comments on commit 5e67815

Please sign in to comment.