diff --git a/pyproject.toml b/pyproject.toml index 4c30b9e8e..4de0c8673 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ classifiers = [ "Programming Language :: Python :: 3.12", ] dependencies = [ - "pydantic", + "pydantic<2", # Pip hops between installing v2.7 or v1.10 depending on which of the additional dependencies are requested "requests", "rich", "werkzeug", diff --git a/src/murfey/server/__init__.py b/src/murfey/server/__init__.py index 45bf9a853..92561fa60 100644 --- a/src/murfey/server/__init__.py +++ b/src/murfey/server/__init__.py @@ -111,7 +111,7 @@ def sanitise(in_string: str) -> str: return in_string.replace("\r\n", "").replace("\n", "") -def santise_path(in_path: Path) -> Path: +def sanitise_path(in_path: Path) -> Path: return Path("/".join(secure_filename(p) for p in in_path.parts)) diff --git a/src/murfey/server/api.py b/src/murfey/server/api.py index d02962041..2e49d619d 100644 --- a/src/murfey/server/api.py +++ b/src/murfey/server/api.py @@ -43,6 +43,7 @@ get_machine_config, get_microscope, get_tomo_preproc_params, + sanitise, templates, ) from murfey.server.config import from_file, settings @@ -110,10 +111,6 @@ router = APIRouter() -def sanitise(in_string: str) -> str: - return in_string.replace("\r\n", "").replace("\n", "") - - # This will be the homepage for a given microscope. @router.get("/", response_class=HTMLResponse) async def root(request: Request): diff --git a/src/murfey/server/bootstrap.py b/src/murfey/server/bootstrap.py index f78ad36a3..42eb318de 100644 --- a/src/murfey/server/bootstrap.py +++ b/src/murfey/server/bootstrap.py @@ -17,6 +17,7 @@ import logging import random import re +from urllib.parse import quote import packaging.version import requests @@ -41,10 +42,47 @@ log = logging.getLogger("murfey.server.bootstrap") +def _validate_package_name(package: str) -> bool: + """ + Check that a package name follows PEP 503 naming conventions, containing only + alphanumerics, "_", "-", or "." 
characters + """ + if re.match(r"^[a-z0-9\-\_\.]+$", package): + return True + else: + return False + + +def _get_full_path_response(package: str) -> requests.Response: + """ + Validates the package name, sanitises it if valid, and attempts to return a HTTP + response from PyPI. + """ + + if _validate_package_name(package): + # Sanitise and normalise package name (PEP 503) + package_clean = quote(re.sub(r"[-_.]+", "-", package.lower())) + + # Get HTTP response + url = f"https://pypi.org/simple/{package_clean}" + response = requests.get(url) + + if response.status_code == 200: + return response + else: + raise HTTPException(status_code=response.status_code) + else: + raise ValueError(f"{package} is not a valid package name") + + @pypi.get("/", response_class=Response) def get_pypi_index(): - """Obtain list of all PyPI packages via the simple API (PEP 503).""" + """ + Obtain list of all PyPI packages via the simple API (PEP 503). + """ + index = requests.get("https://pypi.org/simple/") + return Response( content=index.content, media_type=index.headers.get("Content-Type"), @@ -53,33 +91,52 @@ def get_pypi_index(): @pypi.get("/{package}/", response_class=Response) -def get_pypi_package_downloads_list(package: str): - """Obtain list of all package downloads from PyPI via the simple API (PEP 503), - and rewrite all download URLs to point to this server, - underneath the current directory.""" - full_path_response = requests.get(f"https://pypi.org/simple/{package}") - - def rewrite_pypi_url(match): - url = match.group(4) - return ( - b"" - + match.group(4) - + b"" - ) +def get_pypi_package_downloads_list(package: str) -> Response: + """ + Obtain list of all package downloads from PyPI via the simple API (PEP 503), and + rewrite all download URLs to point to this server, under the current directory. + """ + + def _rewrite_pypi_url(match): + """ + Use regular expression matching to rewrite the URLs. 
Points them from + pythonhosted.org to current server, and removes the hash from the URL as well + """ + # url = match.group(4) # Original + url = match.group(3) + return '" + match.group(3) + "" + + # Validate package and URL + full_path_response = _get_full_path_response(package) + + # Process lines related to PyPI packages in response + content: bytes = full_path_response.content # In bytes + content_text: str = content.decode("latin1") # Convert to strings + content_text_list = [] + for line in content_text.splitlines(): + # Look for lines with hyperlinks + if "]*)"([^>]*)>([^<]*)', # Regex search criteria + _rewrite_pypi_url, # Search criteria applied to this function + line, + ) + content_text_list.append(line_new) + + # Add entry for wheel metadata (PEP 658; see _expose_wheel_metadata) + if ".whl" in line_new: + line_metadata = line_new.replace(".whl", ".whl.metadata") + content_text_list.append(line_metadata) + else: + # Append other lines as normal + content_text_list.append(line) + + content_text_new = str("\n".join(content_text_list)) # Regenerate HTML structure + content_new = content_text_new.encode("latin1") # Convert back to bytes - content = re.sub( - b']*)href="([^">]*)"([^>]*)>([^<]*)', - rewrite_pypi_url, - full_path_response.content, - ) return Response( - content=content, + content=content_new, media_type=full_path_response.headers.get("Content-Type"), status_code=full_path_response.status_code, ) @@ -87,18 +144,62 @@ def rewrite_pypi_url(match): @pypi.get("/{package}/{filename}", response_class=Response) def get_pypi_file(package: str, filename: str): - """Obtain and pass through a specific download for a PyPI package.""" - full_path_response = requests.get(f"https://pypi.org/simple/{package}") + """ + Obtain and pass through a specific download for a PyPI package. 
+ """ + + def _expose_wheel_metadata(response_bytes: bytes) -> bytes: + """ + As of pip v22.3 (coinciding with PEP 658), pip expects to find an additional + ".whl.metadata" file based on the URL of the ".whl" file present on the PyPI Simple + Index. However, because it is not listed on the webpage itself, it is not copied + across to the proxy. This function adds that URL to the proxy explicitly. + """ + + # Analyse API response line-by-line + response_text: str = response_bytes.decode("latin1") # Convert to text + response_text_list = [] # Write line-by-line analysis to here + + for line in response_text.splitlines(): + # Process URLs + if r"]*?href="([^">]*)"[^>]*>' + filename_bytes + b"", - full_path_response.content, + b']*)"[^>]*>' + filename_bytes + b"", + content, ) if not selected_package_link: raise HTTPException(status_code=404, detail="File not found for package") original_url = selected_package_link.group(1) original_file = requests.get(original_url) + return Response( content=original_file.content, media_type=original_file.headers.get("Content-Type"), @@ -108,8 +209,10 @@ def get_pypi_file(package: str, filename: str): @plugins.get("/{package}", response_class=FileResponse) def get_plugin_wheel(package: str): + machine_config = get_machine_config() wheel_path = machine_config.plugin_packages.get(package) + if wheel_path is None: return None return FileResponse( @@ -124,6 +227,7 @@ def get_bootstrap_instructions(request: Request): Return a website containing instructions for installing the Murfey client on a machine with no internet access. """ + return respond_with_template( "bootstrap.html", { @@ -140,7 +244,10 @@ def get_pip_wheel(): This is only used during bootstrapping by the client to identify and then download the actually newest appropriate version of pip. 
""" - return get_pypi_file(package="pip", filename="pip-21.3.1-py3-none-any.whl") + return get_pypi_file( + package="pip", + filename="pip-22.2.2-py3-none-any.whl", # Highest version that works before PEP 658 change + ) @bootstrap.get("/murfey.whl", response_class=Response) @@ -153,6 +260,7 @@ def get_murfey_wheel(): """ full_path_response = requests.get("https://pypi.org/simple/murfey") wheels = {} + for wheel_file in re.findall( b"]*>([^<]*).whl", full_path_response.content, @@ -174,7 +282,7 @@ def get_murfey_wheel(): @cygwin.get("/setup-x86_64.exe", response_class=Response) def get_cygwin_setup(): """ - Obtain and past though a Cygwin installer from an official source. + Obtain and pass through a Cygwin installer from an official source. This is used during client bootstrapping and can download and install the Cygwin distribution that then remains on the client machines. """ diff --git a/src/murfey/server/demo_api.py b/src/murfey/server/demo_api.py index 82d023f3e..d2276ffc1 100644 --- a/src/murfey/server/demo_api.py +++ b/src/murfey/server/demo_api.py @@ -32,7 +32,7 @@ get_hostname, get_microscope, sanitise, - santise_path, + sanitise_path, ) from murfey.server import shutdown as _shutdown from murfey.server import templates @@ -968,7 +968,7 @@ def flush_tomography_processing( async def request_tomography_preprocessing( visit_name: str, client_id: int, proc_file: ProcessFile, db=murfey_db ): - if not santise_path(Path(proc_file.path)).exists(): + if not sanitise_path(Path(proc_file.path)).exists(): log.warning( f"{sanitise(str(proc_file.path))} has not been transferred before preprocessing" ) diff --git a/src/murfey/util/__init__.py b/src/murfey/util/__init__.py index db6f78b9b..a4b739d59 100644 --- a/src/murfey/util/__init__.py +++ b/src/murfey/util/__init__.py @@ -21,6 +21,10 @@ logger = logging.getLogger("murfey.util") +def sanitise(in_string: str) -> str: + return in_string.replace("\r\n", "").replace("\n", "") + + @lru_cache(maxsize=1) def 
get_machine_config(url: str, demo: bool = False) -> dict: return requests.get(f"{url}/machine/").json() diff --git a/src/murfey/util/lif.py b/src/murfey/util/lif.py index 358200f28..5dda3343c 100644 --- a/src/murfey/util/lif.py +++ b/src/murfey/util/lif.py @@ -15,14 +15,12 @@ from readlif.reader import LifFile from tifffile import imwrite +from murfey.util import sanitise + # Create logger object to output messages with logger = logging.getLogger("murfey.util.lif") -def sanitise(in_string: str) -> str: - return in_string.replace("\r\n", "").replace("\n", "") - - def get_xml_metadata( file: LifFile, save_xml: Optional[Path] = None,