Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 37 additions & 99 deletions src/murfey/server/api/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,15 +190,15 @@
murfey.bootstrap is compatible with all relevant versions of Python.
This also ignores yanked releases, which again should be fine.
"""
full_path_response = http_session.get("https://pypi.org/simple/murfey")
full_path_response = http_session.get(f"{pypi_index_url.rstrip('/')}/murfey")

Check warning on line 193 in src/murfey/server/api/bootstrap.py

View check run for this annotation

Codecov / codecov/patch

src/murfey/server/api/bootstrap.py#L193

Added line #L193 was not covered by tests
wheels = {}

for wheel_file in re.findall(
b"<a [^>]*>([^<]*).whl</a>",
full_path_response.content,
):
try:
filename = wheel_file.decode("latin-1") + ".whl"
filename = wheel_file.decode("utf-8") + ".whl"

Check warning on line 201 in src/murfey/server/api/bootstrap.py

View check run for this annotation

Codecov / codecov/patch

src/murfey/server/api/bootstrap.py#L201

Added line #L201 was not covered by tests
version = packaging.version.parse(filename.split("-")[1])
wheels[version] = filename
except Exception:
Expand Down Expand Up @@ -261,7 +261,7 @@

mirror_priorities = {}
for mirror in mirrors.content.split(b"\n"):
mirror_line = mirror.decode("latin1").strip().split(";")
mirror_line = mirror.decode("utf-8").strip().split(";")

Check warning on line 264 in src/murfey/server/api/bootstrap.py

View check run for this annotation

Codecov / codecov/patch

src/murfey/server/api/bootstrap.py#L264

Added line #L264 was not covered by tests
if not mirror_line or len(mirror_line) < 4:
continue
if not mirror_line[0].startswith("http"):
Expand Down Expand Up @@ -493,7 +493,7 @@

# Parse and rewrite package index content
content: bytes = response.content # Get content in bytes
content_text: str = content.decode("latin1") # Convert to strings
content_text: str = content.decode("utf-8") # Convert to strings

Check warning on line 496 in src/murfey/server/api/bootstrap.py

View check run for this annotation

Codecov / codecov/patch

src/murfey/server/api/bootstrap.py#L496

Added line #L496 was not covered by tests
content_text_list = []
for line in content_text.splitlines():
if line.startswith("<a href"):
Expand All @@ -508,7 +508,7 @@

# Reconstruct content
content_text_new = str("\n".join(content_text_list)) # Regenerate HTML structure
content_new = content_text_new.encode("latin1") # Convert back to bytes
content_new = content_text_new.encode("utf-8") # Convert back to bytes

Check warning on line 511 in src/murfey/server/api/bootstrap.py

View check run for this annotation

Codecov / codecov/patch

src/murfey/server/api/bootstrap.py#L511

Added line #L511 was not covered by tests
return Response(
content=content_new,
status_code=response.status_code,
Expand Down Expand Up @@ -538,7 +538,7 @@

# Parse and rewrite package index content
content: bytes = response.content # Get content in bytes
content_text: str = content.decode("latin1") # Convert to strings
content_text: str = content.decode("utf-8") # Convert to strings

Check warning on line 541 in src/murfey/server/api/bootstrap.py

View check run for this annotation

Codecov / codecov/patch

src/murfey/server/api/bootstrap.py#L541

Added line #L541 was not covered by tests
content_text_list = []
for line in content_text.splitlines():
if line.startswith("<a href="):
Expand All @@ -552,7 +552,7 @@

# Reconstruct content
content_text_new = str("\n".join(content_text_list)) # Regenerate HTML structure
content_new = content_text_new.encode("latin1") # Convert back to bytes
content_new = content_text_new.encode("utf-8") # Convert back to bytes

Check warning on line 555 in src/murfey/server/api/bootstrap.py

View check run for this annotation

Codecov / codecov/patch

src/murfey/server/api/bootstrap.py#L555

Added line #L555 was not covered by tests
return Response(
content=content_new,
status_code=response.status_code,
Expand Down Expand Up @@ -1066,6 +1066,9 @@
=======================================================================================
"""

python_repo_url = "https://files.pythonhosted.org"
pypi_index_url = "https://pypi.org/simple/"


def _get_full_pypi_path_response(package: str) -> requests.Response:
"""
Expand All @@ -1082,73 +1085,55 @@
package_clean = quote(re.sub(r"[-_.]+", "-", package.lower()), safe="/")

# Get HTTP response
url = f"https://pypi.org/simple/{package_clean}"
url = f"{pypi_index_url.rstrip('/')}/{package_clean}"
response = http_session.get(url)
if response.status_code != 200:
raise HTTPException(status_code=response.status_code)
return response


@pypi.get("/", response_class=Response)
@pypi.get("/index/", response_class=Response)
def get_pypi_index():
"""
Obtain list of all PyPI packages via the simple API (PEP 503).
"""

response = http_session.get("https://pypi.org/simple/")
response = http_session.get(pypi_index_url)

Check warning on line 1101 in src/murfey/server/api/bootstrap.py

View check run for this annotation

Codecov / codecov/patch

src/murfey/server/api/bootstrap.py#L1101

Added line #L1101 was not covered by tests
return Response(
content=response.content,
status_code=response.status_code,
media_type=response.headers.get("Content-Type"),
)


@pypi.get("/{package}/", response_class=Response)
@pypi.get("/index/{package}/", response_class=Response)
def get_pypi_package_downloads_list(request: Request, package: str) -> Response:
"""
Obtain list of all package downloads from PyPI via the simple API (PEP 503), and
rewrite all download URLs to point to this server, under the current directory.
"""

def _rewrite_pypi_url(match):
"""
Use regular expression matching to rewrite the URLs. Points them from
pythonhosted.org to current server, and removes the hash from the URL as well
"""
# url = match.group(4) # Original
url = match.group(3)
return '<a href="' + url + '"' + match.group(2) + ">" + match.group(3) + "</a>"

logger.debug(f"Received request to access {str(request.url)!r}")

# Construct base URL to rewrite with
netloc = resolve_netloc(request)
scheme = request.headers.get("X-Forwarded-Proto", request.url.scheme)
router_path = request.url.path.removesuffix(f"/index/{package}/")
base_url = f"{scheme}://{netloc}{router_path}"

# Validate package and URL
full_path_response = _get_full_pypi_path_response(package)

# Process lines related to PyPI packages in response
content: bytes = full_path_response.content # In bytes
content_text: str = content.decode("latin1") # Convert to strings
content_text_list = []
for line in content_text.splitlines():
# Look for lines with hyperlinks
if "<a href" in line:
# Rewrite URL to point to current proxy server
line_new = re.sub(
'^<a href="([^">]*)"([^>]*)>([^<]*)</a>', # Regex search criteria
_rewrite_pypi_url, # Function to apply search criteria to
line,
)
content_text_list.append(line_new)

# Add entry for wheel metadata (PEP 658; see _expose_wheel_metadata)
if ".whl" in line_new:
line_metadata = line_new.replace(".whl", ".whl.metadata")
content_text_list.append(line_metadata)
else:
# Append other lines as normal
content_text_list.append(line)
content_text: str = content.decode("utf-8") # Convert to strings

content_text_new = str("\n".join(content_text_list)) # Regenerate HTML structure
content_new = content_text_new.encode("latin1") # Convert back to bytes
# PyPI's simple index now directly points to https://files.pythonhosted.org
# It also uses newlines partway through the '<a ...></a>' blocks now
# It's thus now better to use regex substitution on the page as a whole
content_text_new = re.sub(re.escape(python_repo_url), base_url, content_text)

content_new = content_text_new.encode("utf-8") # Convert back to bytes

return Response(
content=content_new,
Expand All @@ -1157,76 +1142,29 @@
)


@pypi.get("/{package}/{filename}", response_class=StreamingResponse)
@pypi.get("/packages/{a}/{b}/{c}/{filename}", response_class=StreamingResponse)
def get_pypi_file(
request: Request,
package: str,
a: str,
b: str,
c: str,
filename: str,
):
"""
Obtain and pass through a specific download for a PyPI package.
"""

def _expose_wheel_metadata(response_bytes: bytes) -> bytes:
"""
As of pip v22.3 (coinciding with PEP 658), pip expects to find an additional
".whl.metadata" file based on the URL of the ".whl" file present on the PyPI Simple
Index. However, because it is not listed on the webpage itself, it is not copied
across to the proxy. This function adds that URL to the proxy explicitly.
"""

# Analyse API response line-by-line
response_text: str = response_bytes.decode("latin1") # Convert to text
response_text_list = [] # Write line-by-line analysis to here

for line in response_text.splitlines():
# Process URLs
if r"<a href=" in line:
response_text_list.append(line) # Add to list

# Add new line to explicitly call for wheel metadata
if ".whl" in line:
# Add ".metadata" to URL and file name
line_new = line.replace(".whl", ".whl.metadata")
response_text_list.append(line_new) # Add to list

# Append all other lines as normal
else:
response_text_list.append(line)

# Recover original structure
response_text_new = str("\n".join(response_text_list))
response_bytes_new = bytes(response_text_new, encoding="latin-1")

return response_bytes_new

logger.debug(f"Received request to access {str(request.url)!r}")

# Validate package and URL
full_path_response = _get_full_pypi_path_response(package)

# Get filename in bytes
filename_bytes = re.escape(filename.encode("latin1"))

# Add explicit URLs for ".whl.metadata" files
content = _expose_wheel_metadata(full_path_response.content)

# Find package matching the specified filename
selected_package_link = re.search(
b'<a href="([^">]*)"[^>]*>' + filename_bytes + b"</a>",
content,
)
if not selected_package_link:
raise HTTPException(status_code=404, detail="File not found for package")
original_url = selected_package_link.group(1)
response = http_session.get(original_url)
package_url = f"{python_repo_url}/packages/{a}/{b}/{c}/{filename}"
logger.debug(f"Forwarding package request to {package_url!r}")
response = http_session.get(package_url, stream=True)

Check warning on line 1160 in src/murfey/server/api/bootstrap.py

View check run for this annotation

Codecov / codecov/patch

src/murfey/server/api/bootstrap.py#L1158-L1160

Added lines #L1158 - L1160 were not covered by tests

# Construct headers to return with response
headers: dict[str, str] = {}
if response.headers.get("Content-Length"):
headers["Content-Lengh"] = response.headers["Content-Length"]
if response.status_code != 200:
raise HTTPException(status_code=response.status_code)

Check warning on line 1165 in src/murfey/server/api/bootstrap.py

View check run for this annotation

Codecov / codecov/patch

src/murfey/server/api/bootstrap.py#L1165

Added line #L1165 was not covered by tests
return StreamingResponse(
content=response.iter_content(chunk_size=8192),
content=response.raw,
status_code=response.status_code,
headers=headers,
media_type=response.headers.get("Content-Type"),
Expand Down
12 changes: 8 additions & 4 deletions src/murfey/util/route_manifest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -215,22 +215,26 @@ murfey.server.api.bootstrap.plugins:
methods:
- GET
murfey.server.api.bootstrap.pypi:
- path: /pypi/
- path: /pypi/index/
function: get_pypi_index
path_params: []
methods:
- GET
- path: /pypi/{package}/
- path: /pypi/index/{package}/
function: get_pypi_package_downloads_list
path_params:
- name: package
type: str
methods:
- GET
- path: /pypi/{package}/{filename}
- path: /pypi/packages/{a}/{b}/{c}/{filename}
function: get_pypi_file
path_params:
- name: package
- name: a
type: str
- name: b
type: str
- name: c
type: str
- name: filename
type: str
Expand Down