diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index d2dd034d1..f27608ffc 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1,112 +1,107 @@ -# Architecture of SHARE/Trove -> NOTE: this document requires update (big ol' TODO) - +# Architecture of SHARE/trove This document is a starting point and reference to familiarize yourself with this codebase. ## Bird's eye view -In short, SHARE/Trove takes metadata records (in any supported input format), -ingests them, and makes them available in any supported output format. -``` - ┌───────────────────────────────────────────┐ - │ Ingest │ - │ ┌──────┐ │ - │ ┌─────────────────────────┐ ┌──►Format├─┼────┐ - │ │ Normalize │ │ └──────┘ │ │ - │ │ │ │ │ ▼ -┌───────┐ │ │ ┌─────────┐ ┌────────┐ │ │ ┌──────┐ │ save as -│Harvest├─┬─┼─┼─►Transform├──►Regulate├─┼─┬─┼──►Format├─┼─┬─►FormattedMetadataRecord -└───────┘ │ │ │ └─────────┘ └────────┘ │ │ │ └──────┘ │ │ - │ │ │ │ │ . │ │ ┌───────┐ - │ │ └─────────────────────────┘ │ . │ └──►Indexer│ - │ │ │ . │ └───────┘ - │ └─────────────────────────────┼─────────────┘ some formats also - │ │ indexed separately - ▼ ▼ - save as save as - RawDatum NormalizedData +In short, SHARE/trove holds metadata records that describe things and makes those records available for searching, browsing, and subscribing. 
+ +![overview of shtrove: metadata records in, search/browse/subscribe out](./project/static/img/shtroverview.png) + + +## Parts +a look at the tangles of communication between different parts of the system: + +```mermaid +graph LR; + subgraph shtrove; + subgraph web[api/web server]; + ingest; + search; + browse; + rss; + atom; + oaipmh; + end; + worker["background worker (celery)"]; + indexer["indexer daemon"]; + rabbitmq["task queue (rabbitmq)"]; + postgres["database (postgres)"]; + elasticsearch; + web---rabbitmq; + web---postgres; + web---elasticsearch; + worker---rabbitmq; + worker---postgres; + worker---elasticsearch; + indexer---rabbitmq; + indexer---postgres; + indexer---elasticsearch; + end; + source["metadata source (e.g. osf.io backend)"]; + user["web user, either by browsing directly or via web app (like osf.io)"]; + subscribers["feed subscription tools"]; + source-->ingest; + user-->search; + user-->browse; + subscribers-->rss; + subscribers-->atom; + subscribers-->oaipmh; ``` ## Code map A brief look at important areas of code as they happen to exist now. -### Static configuration - -`share/schema/` describes the "normalized" metadata schema/format that all -metadata records are converted into when ingested. - -`share/sources/` describes a starting set of metadata sources that the system -could harvest metadata from -- these will be put in the database and can be -updated or added to over time. - -`project/settings.py` describes system-level settings which can be set by -environment variables (and their default values), as well as settings -which cannot. - -`share/models/` describes the data layer using the [Django](https://www.djangoproject.com/) ORM. - -`share/subjects.yaml` describes the "central taxonomy" of subjects allowed -in `Subject.name` fields of `NormalizedData`. - -### Harvest and ingest - -`share/harvest/` and `share/harvesters/` describe how metadata records -are pulled from other metadata repositories. 
- -`share/transform/` and `share/transformers/` describe how raw data (possibly -in any format) are transformed to the "normalized" schema. +- `trove`: django app for rdf-based apis + - `trove.digestive_tract`: most of what happens after ingestion + - stores records and identifiers in the database + - initiates indexing + - `trove.extract`: parsing ingested metadata records into resource descriptions + - `trove.derive`: from a given resource description, create special non-rdf serializations + - `trove.render`: from an api response modeled as rdf graph, render the requested mediatype + - `trove.models`: database models for identifiers and resource descriptions + - `trove.trovesearch`: builds rdf-graph responses for trove search apis (using `IndexStrategy` implementations from `share.search`) + - `trove.vocab`: identifies and describes concepts used elsewhere + - `trove.vocab.trove`: describes types, properties, and api paths in the trove api + - `trove.vocab.osfmap`: describes metadata from osf.io (currently the only metadata ingested) + - `trove.openapi`: generate openapi json for the trove api from thesaurus in `trove.vocab.trove` +- `share`: django app with search indexes and remnants of sharev2 + - `share.models`: database models for external sources, users, and other system book-keeping + - `share.oaipmh`: provide data via [OAI-PMH](https://www.openarchives.org/OAI/openarchivesprotocol.html) + - `share.search`: all interaction with elasticsearch + - `share.search.index_strategy`: abstract base class `IndexStrategy` with multiple implementations, for different approaches to indexing the same data + - `share.search.daemon`: the "indexer daemon", an optimized background worker for batch-processing updates and sending to all active index strategies + - `share.search.index_messenger`: for sending messages to the indexer daemon +- `api`: django app with remnants of the legacy sharev2 api + - `api.views.feeds`: allows custom RSS and Atom feeds + - otherwise, subject 
to possible deprecation +- `osf_oauth2_adapter`: django app for login via osf.io +- `project`: the actual django project + - default settings at `project.settings` + - pulls together code from other directories implemented as django apps (`share`, `trove`, `api`, and `osf_oauth2_adapter`) -`share/regulate/` describes rules which are applied to every normalized datum, -regardless where or what format it originally come from. -`share/metadata_formats/` describes how a normalized datum can be formatted -into any supported output format. - -`share/tasks/` runs the harvest/ingest pipeline and stores each task's status -(including debugging info, if errored) as a `HarvestJob` or `IngestJob`. - -### Outward-facing views - -`share/search/` describes how the search indexes are structured, managed, and -updated when new metadata records are introduced -- this provides a view for -discovering items based on whatever search criteria. - -`share/oaipmh/` describes the [OAI-PMH](https://www.openarchives.org/OAI/openarchivesprotocol.html) -view for harvesting metadata from SHARE/Trove in bulk. - -`api/` describes a mostly REST-ful API that's useful for inspecting records for -a specific item of interest. - -### Internals - -`share/admin/` is a Django-app for administrative access to the SHARE database -and pipeline logs - -`osf_oauth2_adapter/` is a Django app to support logging in to SHARE via OSF +## Cross-cutting concerns -### Testing +### Resource descriptions -`tests/` are tests. 
+Uses the [resource description framework](https://www.w3.org/TR/rdf11-primer/#section-Introduction): +- the content of each ingested metadata record is an rdf graph focused on a specific resource +- all api responses from `trove` views are (experimentally) modeled as rdf graphs, which may be rendered a variety of ways -## Cross-cutting concerns +### Identifiers -### Immutable metadata +Whenever feasible, use full URI strings to identify resources, concepts, types, and properties that may be exposed outwardly. -Metadata records at all stages of the pipeline (`RawDatum`, `NormalizedData`, -`FormattedMetadataRecord`) should be considered immutable -- any updates -result in a new record being created, not an old record being altered. +Prefer using open, standard, well-defined namespaces wherever possible ([DCAT](https://www.w3.org/TR/vocab-dcat-3/) is a good place to start; see `trove.vocab.namespaces` for others already in use). When app-specific concepts must be defined, use the `TROVE` namespace (`https://share.osf.io/vocab/2023/trove/`). -Multiple records which describe the same item/object are grouped by a -"source-unique identifier" or "suid" -- essentially a two-tuple -`(source, identifier)` that uniquely and persistently identifies an item in -the source repository. In most outward-facing views, default to showing only -the most recent record for each suid. +A notable exception (non-URI identifier) is the "source-unique identifier" or "suid" -- essentially a two-tuple `(source, identifier)` that uniquely and persistently identifies a metadata record in a source repository. This `identifier` may be any string value, provided by the external source. 
### Conventions (an incomplete list) -- functions prefixed `pls_` ("please") are a request for something to happen +- local variables prefixed with underscore (to consistently distinguish between internal-only names and those imported/built-in) +- prefer full type annotations in python code, wherever reasonably feasible ## Why this? inspired by [this writeup](https://matklad.github.io/2021/02/06/ARCHITECTURE.md.html) diff --git a/CHANGELOG.md b/CHANGELOG.md index d8af0c86a..e2b4dab9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,25 @@ # Change Log +# [25.6.0] - 2025-10-30 +- bump dependencies + - `celery` to 5.5.3 + - `kombu` to 5.5.4 +- improve error handling in celery task-result backend +- use logging config in celery worker +- improve code docs (README.md et al.) +- add cardsearch feeds (rss and atom) + - /trove/index-card-search/rss.xml + - /trove/index-card-search/atom.xml +- fix: render >1 result in streamed index-value-search (csv, tsv, json) +- when browsing trove api in browser, wrap non-browser-friendly mediatypes in html (unless `withFileName`, which requests download) +- better trove.render test coverage +- code cleanliness + - de-collide "simple" names + - SimpleRendering => EntireRendering + - SimpleTrovesearchRenderer => TrovesearchCardOnlyRenderer + - consolidate more shared logic into trove.util + - more accurate type annotations + # [25.5.0] - 2025-07-15 - use python 3.13 - use `poetry` to manage dependencies diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d14287ddb..ca8dcf691 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,7 +1,18 @@ # CONTRIBUTING -TODO: how do we want to guide community contributors? 
+> note: this codebase is currently (and historically) rather entangled with [osf.io](https://osf.io), which has its shtrove at https://share.osf.io -- stay tuned for more-reusable open-source libraries and tools that should be more accessible to community contribution -For now, if you're interested in contributing to SHARE/Trove, feel free to +For now, if you're interested in contributing to SHARE/trove, feel free to [open an issue on github](https://github.com/CenterForOpenScience/SHARE/issues) and start a conversation. + +## Required checks + +All changes must pass the following checks with no errors: +- linting: `python -m flake8` +- static type-checking (on `trove/` code only, for now): `python -m mypy trove` +- tests: `python -m pytest -x tests/` + - note: some tests require other services running -- if [using the provided docker-compose.yml](./how-to/run-locally.md), recommend running in the background (upping worker ups all: `docker compose up -d worker`) and executing tests from within one of the python containers (`indexer`, `worker`, or `web`): + `docker compose exec indexer python -m pytest -x tests/` + +All new changes should also avoid decreasing test coverage, when reasonably possible (currently checked on github pull requests). diff --git a/README.md b/README.md index 27a21f903..201adfc2b 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,17 @@ -# SHARE/Trove +# SHARE/trove (aka SHARtrove, shtrove) -SHARE is creating a free, open dataset of research (meta)data. +> share (verb): to have or use in common. -> **Note**: SHARE’s open API tools and services help bring together scholarship distributed across research ecosystems for the purpose of greater discoverability. However, SHARE does not guarantee a complete aggregation of searched outputs. For this reason, SHARE results should not be used for methodological analyses, such as systematic reviews. +> trove (noun): a store of valuable or delightful things. 
-[![Coverage Status](https://coveralls.io/repos/github/CenterForOpenScience/SHARE/badge.svg?branch=develop)](https://coveralls.io/github/CenterForOpenScience/SHARE?branch=develop) +SHARE/trove (aka SHARtrove, shtrove) is a service meant to store (meta)data you wish to keep and offer openly. -## Documentation +note: this codebase is currently (and historically) rather entangled with [osf.io](https://osf.io), which has its shtrove at https://share.osf.io -- stay tuned for more-reusable open-source libraries and tools for working with (meta)data -### What is this? see [WHAT-IS-THIS-EVEN.md](./WHAT-IS-THIS-EVEN.md) +see [ARCHITECTURE.md](./ARCHITECTURE.md) for help navigating this codebase -### How can I use it? see [how-to/use-the-api.md](./how-to/use-the-api.md) +see [CONTRIBUTING.md](./CONTRIBUTING.md) for info about contributing changes -### How do I navigate this codebase? see [ARCHITECTURE.md](./ARCHITECTURE.md) -### How do I run a copy locally? see [how-to/run-locally.md](./how-to/run-locally.md) -## Running Tests -### Unit test suite - py.test -### BDD Suite - behave +see [how-to/use-the-api.md](./how-to/use-the-api.md) for help using the api to add and access (meta)data +see [how-to/run-locally.md](./how-to/run-locally.md) for help running a shtrove instance for local development diff --git a/TODO.md b/TODO.md new file mode 100644 index 000000000..6a3834d4b --- /dev/null +++ b/TODO.md @@ -0,0 +1,84 @@ +# TODO: +ways to better this mess + +## better shtrove api experience + +- better web-browsing experience + - include more explanatory docs (and better fill out those explanations) + - even more helpful (less erratic) visual design + - in each html rendering of an api response, include a `<form>`
 for adding/editing/viewing query params + - in browsable html, replace json literals with rdf rendered like the rest of the page + - (perf) add bare-minimal IndexcardDeriver (iris, types, namelikes); use for search-result display +- better tsv/csv experience + - set default columns for `index-value-search` (and/or broadly improve `fields` handling) +- better turtle experience + - quoted literal graphs also turtle + - omit unnecessary `^^rdf:string` +- better jsonld experience + - provide `@context` (via header, at least) + - accept jsonld at `/trove/ingest` (or at each `ldp:inbox`...) + + +## modular packaging +move actually-helpful logic into separate packages that can be used and maintained independently of +any particular web app/api/framework (and then use those packages in shtrove and osf) + +- `osfmap`: standalone OSFMAP definition + - define osfmap properties and shapes (following DCTAP) in static tsv files + - use `tapshoes` (below) to generate docs and helpful utility functions + - may replace/simplify: + - `osf.metadata.osf_gathering.OSFMAP` (and related constants) + - `trove.vocab.osfmap` + - `trove.derive.osfmap_json` +- `tapshoes`: for using and packaging [tabular application profiles](https://dcmi.github.io/dctap/) in python + - take a set of tsv/csv files as input + - should support any valid DCTAP (aim to be worth community interest) + - initial/immediate use case `osfmap` + - generate more human-readable docs of properties and shapes/types + - validate a given record (rdf graph) against a profile + - serialize a valid record in a consistent/stable way (according to the profile) + - enable publishing "official" application profiles as installable python packages + - learn from and consider using prior dctap work: + - dctap-python: https://pypi.org/project/dctap/ + - loads tabular files into more immediately usable form + - tap2shacl: https://pypi.org/project/tap2shacl/ + - builds shacl constraints from application profile + - could then validate a 
given graph with pyshacl: https://pypi.org/project/pyshacl/ +- metadata record crosswalk/serialization + - given a record (as rdf graph) and application profile to which it conforms (like OSFMAP), offer: + - crosswalking to a standard vocab (DCAT, schema.org, ...) + - stable rdf serialization (json-ld, turtle, xml, ...) + - special bespoke serialization (datacite xml/json, oai_dc, ...) + - may replace/simplify: + - `osf.metadata.serializers` + - `trove.derive` +- `shtrove`: reusable package with the good parts of share/trove + - python api and command-line tools + - given application profile + - digestive tract with pluggable storage/indexing interfaces + - methods for ingest, search, browse, subscribe +- `django-shtrove`: django wrapper for `shtrove` functionality + - set application profile via django setting + - django models for storage, elasticsearch for indexing + - django views for ingest, search, browse, subscribe + + +## open web standards +- data catalog vocabulary (DCAT) https://www.w3.org/TR/vocab-dcat-3/ + - an appropriate (and better thought-thru) vocab for a lot of what shtrove does + - already used in some ways, but would benefit from adopting more thoroughly + - replace bespoke types (like `trove:Indexcard`) with better-defined dcat equivalents (like `dcat:CatalogRecord`) + - rename various properties/types/variables similarly + - "catalog" vs "index" + - "record" vs "card" + - replace checksum-iris with `spdx:checksum` (added in dcat 3) +- linked data notifications (LDN) https://www.w3.org/TR/ldn/ + - shtrove incidentally (partially) aligns with linked-data principles -- could lean into that + - replace `/trove/ingest` with one or more `ldp:inbox` urls + - trove index-card like an inbox containing current/past resource descriptions + ``` + <://osf.example/blarg> ldp:inbox <://shtrove.example/index-card/0000-00...> . + <://shtrove.example/index-card/0000-00...> ldp:contains <://shtrove.example/description/0000-00...> . 
+ <://shtrove.example/description/0000-00...> foaf:primaryTopic <://osf.example/blarg> + ``` + (might consider renaming "index-card" for consistency/clarity) diff --git a/WHAT-IS-THIS-EVEN.md b/WHAT-IS-THIS-EVEN.md deleted file mode 100644 index 8dd64d7e1..000000000 --- a/WHAT-IS-THIS-EVEN.md +++ /dev/null @@ -1,42 +0,0 @@ -# "What is this, even?" - -Imagine a vast, public library full of the outputs and results of some scientific -research -- shelves full of articles, preprints, datasets, data analysis plans, -and so on. - -You can think of SHARE/Trove as that library's card catalog. - -## "...What is a card catalog?" - -A [card catalog](https://en.wikipedia.org/wiki/Card_catalog) is that weird, cool cabinet you might see at the front of a -library with a bunch of tiny drawers full of index cards -- each index card -contains information about some item on the library shelves. - -The card catalog is where you go when you want to: -- locate a specific item in the library -- discover items related to a specific topic, author, or other keywords -- make a new item easily discoverable by others - -## "OK but what 'library' is this?" -As of July 2021, SHARE/Trove contains metadata on over 4.5 million items originating from: -- [OSF](https://osf.io) (including OSF-hosted Registries and Preprint Providers) -- [REPEC](http://repec.org) -- [arXiv](https://arxiv.org) -- [ClinicalTrials.gov](https://clinicaltrials.gov) -- ...and more! - -Updates from OSF are reflected within seconds, while updates from third-party sources are -harvested once daily. - -## "How can I use it?" - -You can search the full SHARE/Trove catalog at -[share.osf.io/discover](https://share.osf.io/discover). - -Other search pages can also be built on SHARE/Trove, showing only a specific -collection of items. 
For example, [OSF Preprints](https://osf.io/preprints/discover) -and [OSF Registries](https://osf.io/registries/discover) show only registrations -and preprints, respectively, which are hosted on OSF infrastructure. - -To learn about using the API (instead of a user interface), see -[how-to/use-the-api.md](./how-to/use-the-api.md) diff --git a/api/middleware.py b/api/middleware.py index a27e1c2a4..72a7f82d7 100644 --- a/api/middleware.py +++ b/api/middleware.py @@ -27,7 +27,7 @@ def process_view(self, request, view_func, view_args, view_kwargs): if settings.HIDE_DEPRECATED_VIEWS and deprecation_level == DeprecationLevel.HIDDEN: return HttpResponse( - f'This path ({request.path}) has been removed. If you have built something that relies on it, please email us at share-support@osf.io', + f'This path ({request.path}) has been removed. If you have built something that relies on it, please email us at {settings.SHARE_SUPPORT_EMAIL}', status=410, ) diff --git a/api/views/feeds.py b/api/views/feeds.py index 85925591f..40378d1f8 100644 --- a/api/views/feeds.py +++ b/api/views/feeds.py @@ -1,3 +1,4 @@ +import datetime from xml.sax.saxutils import unescape import json import logging @@ -10,7 +11,6 @@ from share.search import index_strategy from share.search.exceptions import IndexStrategyError from share.util.xml import strip_illegal_xml_chars -from share.util.fromisoformat import fromisoformat logger = logging.getLogger(__name__) @@ -108,10 +108,10 @@ def item_author_name(self, item): return prepare_string('{}{}'.format(author_name, ' et al.' 
if len(authors) > 1 else '')) def item_pubdate(self, item): - return fromisoformat(item.get('date_published') or item.get('date_created')) + return datetime.datetime.fromisoformat(item.get('date_published') or item.get('date_created')) def item_updateddate(self, item): - return fromisoformat(item.get(self._order)) + return datetime.datetime.fromisoformat(item.get(self._order)) def item_categories(self, item): categories = item.get('subjects', []) diff --git a/how-to/add-a-source.rst b/how-to/add-a-source.rst deleted file mode 100644 index 8e31ea6ac..000000000 --- a/how-to/add-a-source.rst +++ /dev/null @@ -1,251 +0,0 @@ -.. _harvesters-and-transformers: - -Harvesters and Transformers -=========================== - -A `harvester` gathers raw data from a source using their API. - -A `transformer` takes the raw data gathered by a harvester and maps the fields to the defined :ref:`SHARE models `. - -Writing a Harvester and Transformer ------------------------------------ - -See the transformers and harvesters located in the ``share/transformers/`` and ``share/harvesters/`` directories for more examples of syntax and best practices. - -Adding a new source -""""""""""""""""""""" - -- Determine whether the source has an API to access their metadata -- Create a source folder at ``share/sources/{source name}`` - - Source names are typically the reversed domain name of the source, e.g. 
a source at ``http://example.com`` would have the name ``com.example`` -- Create a file named ``source.yaml`` in the source folder - - See :ref:`Writing a source.yaml file ` -- Determine whether the source makes their data available using the `OAI-PMH`_ protocol - - If the source is OAI see :ref:`Best practices for OAI sources ` -- Writing the harvester - - See :ref:`Best practices for writing a Harvester ` -- Writing the transformer - - See :ref:`Best practices for writing a Transformer ` -- Adding a sources's icon - - visit ``www.domain.com/favicon.ico`` and download the ``favicon.ico`` file - - place the favicon as ``icon.ico`` in the source folder -- Load the source - - To make the source available in your local SHARE, run ``./manage.py loadsources`` in the terminal - -.. _OAI-PMH: http://www.openarchives.org/OAI/openarchivesprotocol.html - - -.. _writing-yaml: - -Writing a source.yaml file -"""""""""""""""""""""""""" - -The ``source.yaml`` file contains information about the source itself, and one or more configs that describe how to harvest and transform data from that source. - -.. code-block:: yaml - - name: com.example - long_title: Example SHARE Source for Examples - home_page: http://example.com/ - user: sources.com.example - configs: - - label: com.example.oai - base_url: http://example.com/oai/ - harvester: oai - harvester_kwargs: - metadata_prefix: oai_datacite - rate_limit_allowance: 5 - rate_limit_period: 1 - transformer: org.datacite - transformer_kwargs: {} - -See the whitepaper_ for Source and SourceConfig tables for the available fields. - -.. _whitepaper: https://github.com/CenterForOpenScience/SHARE/blob/develop/whitepapers/Tables.md - -.. _oai-sources: - -Best practices for OAI sources -"""""""""""""""""""""""""""""" - -Sources that use OAI-PMH_ make it easy to harvest their metadata. - -- Set ``harvester: oai`` in the source config. -- Choose a metadata format to harvest. 
- - Use the ``ListMetadataFormats`` OAI verb to see what formats the source supports. - - Every OAI source supports ``oai_dc``, but they usually also support at least one other format that has richer, more structured data, like ``oai_datacite`` or ``mods``. - - Choose the format that seems to have the most useful data for SHARE, especially if a transformer for that format already exists. - - Choose ``oai_dc`` only as a last resort. -- Add ``metadata_prefix: {prefix}`` to the ``harvester_kwargs`` in the source config. -- If necessary, write a transformer for the chosen format. - - See :ref:`Best practices for writing a Transformer ` - - -.. _.gitignore: https://github.com/CenterForOpenScience/SHARE/blob/develop/.gitignore - - -.. _writing-harvesters: - -Best practices for writing a non-OAI Harvester -"""""""""""""""""""""""""""""""""""""""""""""" - -- The harvester should be defined in ``share/harvesters/{harvester name}.py``. -- When writing the harvester: - - Inherit from ``share.harvest.BaseHarvester`` - - Add the version of the harvester ``VERSION = 1`` - - Implement ``do_harvest(...)`` (and possibly additional helper functions) to make requests to the source and to yield the harvested records. - - Check to see if the data returned by the source is paginated. - - There will often be a resumption token to get the next page of results. - - Check to see if the source's API accepts a date range - - If the API does not then, if possible, check the date on each record returned and stop harvesting if the date on the record is older than the specified start date. -- Add the harvester to ``entry_points`` in ``setup.py`` - - e.g. ``'com.example = share.harvesters.com_example:ExampleHarvester',`` - - run ``python setup.py develop`` to make the harvester available in your local SHARE -- Test by :ref:`running the harvester ` - -.. 
_writing-transformers: - -Best practices for writing a non-OAI Transformer -"""""""""""""""""""""""""""""""""""""""""""""""" - -- The transformer should be defined in ``share/transformers/{transformer name}.py``. -- When writing the transformer: - - Determine what information from the source record should be stored as part of the ``CreativeWork`` :ref:`model ` (i.e. if the record clearly defines a title, description, contributors, etc.). - - Use the :ref:`chain transformer tools ` as necessary to correctly parse the raw data. - - Alternatively, implement ``share.transform.BaseTransformer`` to create a transformer from scratch. - - Utilize the ``Extra`` class - - Raw data that does not fit into a defined :ref:`share model ` should be stored here. - - Raw data that is otherwise altered in the transformer should also be stored here to ensure data integrity. -- Add the transformer to ``entry_points`` in ``setup.py`` - - e.g. ``'com.example = share.transformer.com_example:ExampleTransformer',`` - - run ``python setup.py develop`` to make the transformer available in your local SHARE -- Test by :ref:`running the transformer ` against raw data you have harvested. - -.. _chain-transformer: - -SHARE Chain Transformer -""""""""""""""""""""""" - -SHARE provides a set of tools for writing transformers, based on the idea of constructing chains for each field that lead from the root of the raw document to the data for that field. To write a chain transformer, add ``from share.transform.chain import links`` at the top of the file and make the transformer inherit ``share.transform.chain.ChainTransformer``. - - -.. code-block:: python - - from share.transform.chain import ctx, links, ChainTransformer, Parser - - - class CreativeWork(Parser): - title = ctx.title - - - class ExampleTransformer(ChainTransformer): - VERSION = 1 - root_parser = CreativeWork - - -- Concat - To combine list or singular elements into a flat list:: - - links.Concat(, ) - -.. 
_delegate-reference: - -- Delegate - To specify which class to use:: - - links.Delegate() - -- Join - To combine list elements into a single string:: - - links.Join(, joiner=' ') - - Elements are separated with the ``joiner``. - By default ``joiner`` is a newline. - -- Map - To designate the class used for each instance of a value found:: - - links.Map(links.Delegate(), ) - - See the :ref:`share models ` for what uses a through table (anything that sets ``through=``). - Uses the :ref:`Delegate ` tool. - -- Maybe - To transform data that is not consistently available:: - - links.Maybe(, '') - - Indexing further if the path exists:: - - links.Maybe(, '')[''] - - Nesting Maybe:: - - links.Maybe(links.Maybe(, '')[''], '') - - To avoid excessive nesting use the :ref:`Try link ` - -- OneOf - To specify two possible paths for a single value:: - - links.OneOf(, ) - -- ParseDate - To determine a date from a string:: - - links.ParseDate() - -- ParseLanguage - To determine the ISO language code (i.e. 'ENG') from a string (i.e. 'English'):: - - links.ParseLanguage() - - Uses pycountry_ package. - - .. _pycountry: https://pypi.python.org/pypi/pycountry - -- ParseName - To determine the parts of a name (i.e. first name) out of a string:: - - links.ParseName().first - - options:: - - first - last - middle - suffix - title - nickname - - Uses nameparser_ package. - - .. _nameparser: https://pypi.python.org/pypi/nameparser - -- RunPython - To run a defined python function:: - - links.RunPython('', , *args, **kwargs) - -- Static - To define a static field:: - - links.Static() - -- Subjects - To map a subject to the PLOS taxonomy based on defined mappings:: - - links.Subjects() - -.. 
_try-reference: - -- Try - To transform data that is not consistently available and may throw an exception:: - - links.Try() - -- XPath - To access data using xpath:: - - links.XPath(, "") diff --git a/how-to/run-locally.md b/how-to/run-locally.md index 99e4a523d..7d0e6eb05 100644 --- a/how-to/run-locally.md +++ b/how-to/run-locally.md @@ -1,14 +1,14 @@ # SHARE Quickstart or: How I Learned to Stop Worrying and Love the Dock -this guide guides you through setting up SHARE locally using Docker -for development and manual testing. +this guide guides you through setting up SHARE locally for development and manual testing +using the `docker-compose.yml` file included in this repository. this guide does NOT guide you to anything appropriate for the open Internet. ## pre-requisites -- [git](https://git-scm.com/) -- [docker](https://www.docker.com/) (including `docker-compose`) +- [git](https://git-scm.com/) or equivalent +- [docker](https://www.docker.com/) (including `docker-compose`) or equivalent ## getting a local SHARE running @@ -48,11 +48,11 @@ docker-compose run --rm --no-deps worker bash this will open a bash prompt within a temporary `worker` container -- from here we can run commands within SHARE's environment, including django's `manage.py` -from within that worker shell, use django's `migrate` command to set up tables in postgres: +from within that worker shell, use django's `migrate` command to create tables in postgres: ``` python manage.py migrate ``` -...and use `sharectl` to set up indexes in elasticsearch: +...and the `shtrove_search_setup` command to create indexes in elasticsearch: ``` python manage.py shtrove_search_setup --initial ``` diff --git a/how-to/use-the-api.md b/how-to/use-the-api.md index 2a220615b..7a89650d6 100644 --- a/how-to/use-the-api.md +++ b/how-to/use-the-api.md @@ -1,25 +1,29 @@ -# How to use the API +# how to use the api -(see [openapi docs](/trove/docs/openapi.html) for detail) +## searching and browsing -## Sample and search 
for index-cards +`GET /trove/index-card-search`: search for cards that identify and describe things -`GET /trove/index-card-search`: search index-cards +`GET /trove/index-value-search`: search for values (like identifiers) used on cards, which you can use in card-searches -`GET /trove/index-value-search`: search values for specific properties on index-cards +`GET /trove/browse?iri=...`: inquire about a thing you have already identified -## Posting index-cards +(see [openapi docs](/trove/docs/openapi.html) for detail and available parameters) + + +### Posting index-cards > NOTE: currently used only by other COS projects, not yet for public use, authorization required -`POST /trove/ingest?focus_iri=...&record_identifier=...`: +`POST /trove/ingest?focus_iri=...`: currently supports only `Content-Type: text/turtle` query params: - `focus_iri` (required): full iri of the focus resource, exactly as used in the request body -- `record_identifier` (required): a source-specific identifier for the metadata record (no format restrictions) -- sending another record with the same `record_identifier` is considered a full update (only the most recent is used) +- `record_identifier`: a source-specific identifier for the metadata record (if omitted, uses `focus_iri`) -- sending another record with the same `record_identifier` is considered a full update (only the most recent is used) - `nonurgent`: if present (regardless of value), ingestion may be given a lower priority -- recommended for bulk or background operations - `is_supplementary`: if present (regardless of value), this record's metadata will be added to all pre-existing index-cards from the same user with the same `focus_iri` (if any), but will not get an index-card of its own nor affect the last-updated timestamp (e.g. 
in OAI-PMH) of the index-cards it supplements + - note: supplementary records must have a different `record_identifier` from the primary records for the same focus - `expiration_date`: optional date (in format `YYYY-MM-DD`) when the record is no longer valid and should be removed ## Deleting index-cards @@ -32,4 +36,3 @@ query params: `/oaipmh` -- an implementation of the Open Access Initiative's [Protocol for Metadata Harvesting](https://www.openarchives.org/OAI/openarchivesprotocol.html), an open standard for harvesting metadata from open repositories. You can use this to list metadata in bulk, or query by a few simple parameters (date range or source). - diff --git a/poetry.lock b/poetry.lock index 3f1f85e66..134c106be 100644 --- a/poetry.lock +++ b/poetry.lock @@ -109,60 +109,60 @@ files = [ [[package]] name = "celery" -version = "5.4.0" +version = "5.5.3" description = "Distributed Task Queue." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "celery-5.4.0-py3-none-any.whl", hash = "sha256:369631eb580cf8c51a82721ec538684994f8277637edde2dfc0dacd73ed97f64"}, - {file = "celery-5.4.0.tar.gz", hash = "sha256:504a19140e8d3029d5acad88330c541d4c3f64c789d85f94756762d8bca7e706"}, + {file = "celery-5.5.3-py3-none-any.whl", hash = "sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525"}, + {file = "celery-5.5.3.tar.gz", hash = "sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5"}, ] [package.dependencies] -billiard = ">=4.2.0,<5.0" +billiard = ">=4.2.1,<5.0" click = ">=8.1.2,<9.0" click-didyoumean = ">=0.3.0" click-plugins = ">=1.1.1" click-repl = ">=0.2.0" -kombu = ">=5.3.4,<6.0" +kombu = ">=5.5.2,<5.6" python-dateutil = ">=2.8.2" -tzdata = ">=2022.7" vine = ">=5.1.0,<6.0" [package.extras] arangodb = ["pyArango (>=2.0.2)"] -auth = ["cryptography (==42.0.5)"] -azureblockblob = ["azure-storage-blob (>=12.15.0)"] +auth = ["cryptography (==44.0.2)"] +azureblockblob = ["azure-identity (>=1.19.0)", 
"azure-storage-blob (>=12.15.0)"] brotli = ["brotli (>=1.0.0) ; platform_python_implementation == \"CPython\"", "brotlipy (>=0.7.0) ; platform_python_implementation == \"PyPy\""] cassandra = ["cassandra-driver (>=3.25.0,<4)"] consul = ["python-consul2 (==0.1.5)"] cosmosdbsql = ["pydocumentdb (==2.3.5)"] couchbase = ["couchbase (>=3.0.0) ; platform_python_implementation != \"PyPy\" and (platform_system != \"Windows\" or python_version < \"3.10\")"] -couchdb = ["pycouchdb (==1.14.2)"] +couchdb = ["pycouchdb (==1.16.0)"] django = ["Django (>=2.2.28)"] dynamodb = ["boto3 (>=1.26.143)"] -elasticsearch = ["elastic-transport (<=8.13.0)", "elasticsearch (<=8.13.0)"] +elasticsearch = ["elastic-transport (<=8.17.1)", "elasticsearch (<=8.17.2)"] eventlet = ["eventlet (>=0.32.0) ; python_version < \"3.10\""] -gcs = ["google-cloud-storage (>=2.10.0)"] +gcs = ["google-cloud-firestore (==2.20.1)", "google-cloud-storage (>=2.10.0)", "grpcio (==1.67.0)"] gevent = ["gevent (>=1.5.0)"] librabbitmq = ["librabbitmq (>=2.0.0) ; python_version < \"3.11\""] memcache = ["pylibmc (==1.6.3) ; platform_system != \"Windows\""] -mongodb = ["pymongo[srv] (>=4.0.2)"] -msgpack = ["msgpack (==1.0.8)"] +mongodb = ["kombu[mongodb]"] +msgpack = ["kombu[msgpack]"] +pydantic = ["pydantic (>=2.4)"] pymemcache = ["python-memcached (>=1.61)"] pyro = ["pyro4 (==4.82) ; python_version < \"3.11\""] -pytest = ["pytest-celery[all] (>=1.0.0)"] -redis = ["redis (>=4.5.2,!=4.5.5,<6.0.0)"] +pytest = ["pytest-celery[all] (>=1.2.0,<1.3.0)"] +redis = ["kombu[redis]"] s3 = ["boto3 (>=1.26.143)"] -slmq = ["softlayer-messaging (>=1.0.3)"] -solar = ["ephem (==4.1.5) ; platform_python_implementation != \"PyPy\""] -sqlalchemy = ["sqlalchemy (>=1.4.48,<2.1)"] -sqs = ["boto3 (>=1.26.143)", "kombu[sqs] (>=5.3.4)", "pycurl (>=7.43.0.5) ; sys_platform != \"win32\" and platform_python_implementation == \"CPython\"", "urllib3 (>=1.26.16)"] +slmq = ["softlayer_messaging (>=1.0.3)"] +solar = ["ephem (==4.2) ; 
platform_python_implementation != \"PyPy\""] +sqlalchemy = ["kombu[sqlalchemy]"] +sqs = ["boto3 (>=1.26.143)", "kombu[sqs] (>=5.5.0)", "urllib3 (>=1.26.16)"] tblib = ["tblib (>=1.3.0) ; python_version < \"3.8.0\"", "tblib (>=1.5.0) ; python_version >= \"3.8.0\""] -yaml = ["PyYAML (>=3.10)"] +yaml = ["kombu[yaml]"] zookeeper = ["kazoo (>=1.3.1)"] -zstd = ["zstandard (==0.22.0)"] +zstd = ["zstandard (==0.23.0)"] [[package]] name = "certifi" @@ -617,14 +617,14 @@ test-randomorder = ["pytest-randomly"] [[package]] name = "django" -version = "5.2.3" +version = "5.2.7" description = "A high-level Python web framework that encourages rapid development and clean, pragmatic design." optional = false python-versions = ">=3.10" groups = ["main", "dev"] files = [ - {file = "django-5.2.3-py3-none-any.whl", hash = "sha256:c517a6334e0fd940066aa9467b29401b93c37cec2e61365d663b80922542069d"}, - {file = "django-5.2.3.tar.gz", hash = "sha256:335213277666ab2c5cac44a792a6d2f3d58eb79a80c14b6b160cd4afc3b75684"}, + {file = "django-5.2.7-py3-none-any.whl", hash = "sha256:59a13a6515f787dec9d97a0438cd2efac78c8aca1c80025244b0fe507fe0754b"}, + {file = "django-5.2.7.tar.gz", hash = "sha256:e0f6f12e2551b1716a95a63a1366ca91bbcd7be059862c1b18f989b1da356cdd"}, ] [package.dependencies] @@ -1102,19 +1102,20 @@ typing-extensions = ">=4.5.0" [[package]] name = "kombu" -version = "5.5.0" +version = "5.5.4" description = "Messaging library for Python." 
optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "kombu-5.5.0-py3-none-any.whl", hash = "sha256:526c6cf038c986b998639109a1eb762502f831e8da148cc928f1f95cd91eb874"}, - {file = "kombu-5.5.0.tar.gz", hash = "sha256:72e65c062e903ee1b4e8b68d348f63c02afc172eda409e3aca85867752e79c0b"}, + {file = "kombu-5.5.4-py3-none-any.whl", hash = "sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8"}, + {file = "kombu-5.5.4.tar.gz", hash = "sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363"}, ] [package.dependencies] amqp = ">=5.1.1,<6.0.0" -tzdata = {version = "2025.1", markers = "python_version >= \"3.9\""} +packaging = "*" +tzdata = {version = ">=2025.2", markers = "python_version >= \"3.9\""} vine = "5.1.0" [package.extras] @@ -1124,12 +1125,12 @@ confluentkafka = ["confluent-kafka (>=2.2.0)"] consul = ["python-consul2 (==0.1.5)"] gcpubsub = ["google-cloud-monitoring (>=2.16.0)", "google-cloud-pubsub (>=2.18.4)", "grpcio (==1.67.0)", "protobuf (==4.25.5)"] librabbitmq = ["librabbitmq (>=2.0.0) ; python_version < \"3.11\""] -mongodb = ["pymongo (>=4.1.1)"] +mongodb = ["pymongo (==4.10.1)"] msgpack = ["msgpack (==1.1.0)"] pyro = ["pyro4 (==4.82)"] qpid = ["qpid-python (>=0.26)", "qpid-tools (>=0.26)"] redis = ["redis (>=4.5.2,!=4.5.5,!=5.0.2,<=5.2.1)"] -slmq = ["softlayer-messaging (>=1.0.3)"] +slmq = ["softlayer_messaging (>=1.0.3)"] sqlalchemy = ["sqlalchemy (>=1.4.48,<2.1)"] sqs = ["boto3 (>=1.26.143)", "urllib3 (>=1.26.16)"] yaml = ["PyYAML (>=3.10)"] @@ -1451,7 +1452,7 @@ version = "25.0" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"}, {file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"}, @@ -1893,14 +1894,14 @@ files = [ 
[[package]] name = "tzdata" -version = "2025.1" +version = "2025.2" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" groups = ["main", "dev"] files = [ - {file = "tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639"}, - {file = "tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"}, + {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, + {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, ] [[package]] @@ -2033,4 +2034,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = ">=3.13,<3.14" -content-hash = "cb2722bceed3082c7039af5a541855a7ce39531401e843dddb0e6493b604adeb" +content-hash = "56cd9b3cb1ce48fa9c677fd6659ae943b48ab7f8829116f3628d4de77f1a342e" diff --git a/project/settings.py b/project/settings.py index a29abf4ef..95fed6109 100644 --- a/project/settings.py +++ b/project/settings.py @@ -326,6 +326,7 @@ def split(string, delim): RABBITMQ_HEARTBEAT_TIMEOUT = int(os.environ.get('RABBITMQ_HEARTBEAT_TIMEOUT', 60)) CELERY_BROKER_URL = os.environ.get('CELERY_BROKER_URL', 'amqp://{}:{}@{}:{}/{}'.format(RABBITMQ_USERNAME, RABBITMQ_PASSWORD, RABBITMQ_HOST, RABBITMQ_PORT, RABBITMQ_VHOST)) +CELERY_WORKER_HIJACK_ROOT_LOGGER = False CELERY_BEAT_SCHEDULER = 'django_celery_beat.schedulers:DatabaseScheduler' CELERY_BEAT_SCHEDULE = { @@ -336,6 +337,8 @@ def split(string, delim): } CELERY_RESULT_BACKEND = 'share.celery:CeleryDatabaseBackend' +CELERY_RESULT_BACKEND_ALWAYS_RETRY = True +CELERY_RESULT_BACKEND_MAX_RETRIES = int(os.environ.get('CELERY_RESULT_BACKEND_MAX_RETRIES', 17)) CELERY_RESULT_EXPIRES = int(os.environ.get( 'CELERY_RESULT_EXPIRES', 60 * 60 * 24 * 3, # 3 days @@ -442,6 +445,7 @@ def route_urgent_task(name, args, 
kwargs, options, task=None, **kw): PUBLIC_SENTRY_DSN = os.environ.get('PUBLIC_SENTRY_DSN') SHARE_WEB_URL = os.environ.get('SHARE_WEB_URL', 'http://localhost:8003').rstrip('/') + '/' +SHARE_SUPPORT_EMAIL = os.environ.get('SHARE_SUPPORT_EMAIL', 'share-support@cos.io') SHARE_USER_AGENT = os.environ.get('SHARE_USER_AGENT', 'SHAREbot/{} (+{})'.format(VERSION, SHARE_WEB_URL)) SHARE_ADMIN_USERNAME = os.environ.get('SHARE_ADMIN_USERNAME', 'admin') SHARE_ADMIN_PASSWORD = os.environ.get('SHARE_ADMIN_PASSWORD') diff --git a/project/static/img/shtroverview.png b/project/static/img/shtroverview.png new file mode 100644 index 000000000..0c78c3ebc Binary files /dev/null and b/project/static/img/shtroverview.png differ diff --git a/pyproject.toml b/pyproject.toml index 3dd8fa038..1680388b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "shtrove" -version = "25.5.0" +version = "25.6.0" description = "" authors = [ {name = "CenterForOpenScience", email = "share-support@cos.io"} @@ -9,7 +9,7 @@ readme = "README.md" requires-python = ">=3.13,<3.14" dependencies = [ "bcrypt==4.3.0", # Apache 2.0 - "celery==5.4.0", # BSD 3 Clause + "celery==5.5.3", # BSD 3 Clause "colorlog==6.9.0", # MIT "django-allauth==65.5.0", # MIT "django-celery-beat==2.8.1", # BSD 3 Clause @@ -17,10 +17,10 @@ dependencies = [ "django-extensions==3.2.3", # MIT "django-filter==25.1", # BSD "django-oauth-toolkit==3.0.1", # BSD - "django==5.2.*", # BSD 3 Clause + "django==5.2.7", # BSD 3 Clause "elasticsearch8==8.17.2", # Apache 2.0 "lxml==5.3.0", # BSD - "kombu==5.5.0", # BSD 3 Clause + "kombu==5.5.4", # BSD 3 Clause "markdown2==2.5.3", # MIT "psycopg2==2.9.10", # LGPL with exceptions or ZPL "rdflib==7.1.3", # BSD 3 Clause diff --git a/share/celery.py b/share/celery.py index 663ddbba9..962e08c11 100644 --- a/share/celery.py +++ b/share/celery.py @@ -4,14 +4,16 @@ from celery import states from celery.app.task import Context -from celery.backends.base import BaseDictBackend +from 
celery.backends.base import BaseBackend
 from celery.utils.time import maybe_timedelta
-
 from django.conf import settings
-from django.db import transaction
+from django.db import (
+    transaction,
+    IntegrityError as DjIntegrityError,
+    OperationalError as DjOperationalError,
+)
 from django.db.models import Q
 from django.utils import timezone
-
 import sentry_sdk
 
 from share.models import CeleryTaskResult
@@ -40,7 +42,7 @@ def wrapped(*args, **kwargs):
 
 
 # Based on https://github.com/celery/django-celery-results/commit/f88c677d66ba1eaf1b7cb1f3b8c910012990984f
-class CeleryDatabaseBackend(BaseDictBackend):
+class CeleryDatabaseBackend(BaseBackend):
     """
     Implemented from scratch rather than subclassed due to:
@@ -53,8 +55,53 @@ class CeleryDatabaseBackend(BaseDictBackend):
     """
     TaskModel = CeleryTaskResult
 
+    ###
+    # decorate some methods to fully stop/restart the worker on unhandled errors,
+    # including safe-to-retry errors that have been maximally retried
+    # (restarting may resolve some problems; others it will merely make more visible)
+
+    @die_on_unhandled
+    def get_task_meta(self, *args, **kwargs):
+        return super().get_task_meta(*args, **kwargs)
+
+    @die_on_unhandled
+    def store_result(self, *args, **kwargs):
+        return super().store_result(*args, **kwargs)
+    @die_on_unhandled
+    def forget(self, *args, **kwargs):
+        super().forget(*args, **kwargs)
+
+    @die_on_unhandled
+    def cleanup(self, expires=None):
+        # no super implementation
+        TaskResultCleaner(
+            success_ttl=(expires or self.expires),
+            nonsuccess_ttl=settings.FAILED_CELERY_RESULT_EXPIRES,
+        ).clean()
+
+    # END die_on_unhandled decorations
+    ###
+
+    # override BaseBackend
+    def exception_safe_to_retry(self, exc):
+        return isinstance(exc, (
+            DjOperationalError,  # connection errors and whatnot
+            DjIntegrityError,  # e.g.
overlapping transactions with conflicting `get_or_create` + )) + + # implement for BaseBackend def _store_result(self, task_id, result, status, traceback=None, request=None, **kwargs): + _already_successful = ( + self.TaskModel.objects + .filter(task_id=task_id, status=states.SUCCESS) + .exists() + ) + if _already_successful: + # avoid clobbering prior successful result, which could be caused by network partition or lost worker, ostensibly: + # https://github.com/celery/celery/blob/92514ac88afc4ccdff31f3a1018b04499607ca1e/celery/backends/base.py#L967-L972 + return + fields = { 'result': result, 'traceback': traceback, @@ -88,20 +135,14 @@ def _store_result(self, task_id, result, status, traceback=None, request=None, * setattr(obj, key, value) obj.save() - return obj - - @die_on_unhandled - def cleanup(self, expires=None): - TaskResultCleaner( - success_ttl=(expires or self.expires), - nonsuccess_ttl=settings.FAILED_CELERY_RESULT_EXPIRES, - ).clean() - - @die_on_unhandled + # implement for BaseBackend def _get_task_meta_for(self, task_id): - return self.TaskModel.objects.get(task_id=task_id).as_dict() + try: + return self.TaskModel.objects.get(task_id=task_id).as_dict() + except self.TaskModel.DoesNotExist: + return {'status': states.PENDING, 'result': None} - @die_on_unhandled + # implement for BaseBackend def _forget(self, task_id): try: self.TaskModel.objects.get(task_id=task_id).delete() diff --git a/share/models/index_backfill.py b/share/models/index_backfill.py index 93f18ab6a..7734cf292 100644 --- a/share/models/index_backfill.py +++ b/share/models/index_backfill.py @@ -185,5 +185,4 @@ def task__schedule_index_backfill(self, index_backfill_pk): except Exception as error: _index_backfill.pls_mark_error(error) raise error - else: - _index_backfill.pls_note_scheduling_has_finished() + _index_backfill.pls_note_scheduling_has_finished() diff --git a/share/oaipmh/indexcard_repository.py b/share/oaipmh/indexcard_repository.py index d9d855f75..72a3ee407 100644 --- 
a/share/oaipmh/indexcard_repository.py +++ b/share/oaipmh/indexcard_repository.py @@ -1,15 +1,16 @@ +import datetime import uuid from django.core.exceptions import ValidationError as DjangoValidationError +from django.conf import settings from django.db.models import OuterRef, Subquery, F from share.oaipmh import errors as oai_errors from share.oaipmh.verbs import OAIVerb from share.oaipmh.response_renderer import OAIRenderer -from share.oaipmh.util import format_datetime -from share.util.fromisoformat import fromisoformat from share import models as share_db from trove import models as trove_db +from trove.util.datetime import datetime_isoformat_z as format_datetime from trove.vocab.namespaces import OAI_DC @@ -18,7 +19,7 @@ class OaiPmhRepository: REPOSITORY_IDENTIFIER = 'share.osf.io' IDENTIFER_DELIMITER = ':' GRANULARITY = 'YYYY-MM-DD' - ADMIN_EMAILS = ['share-support@osf.io'] + ADMIN_EMAILS = [settings.SHARE_SUPPORT_EMAIL] # TODO better way of structuring this than a bunch of dictionaries? 
# this dictionary's keys are `metadataPrefix` values @@ -206,7 +207,7 @@ def _get_indexcard_page_queryset(self, kwargs, catch=True, last_id=None): ) if 'from' in kwargs: try: - _from = fromisoformat(kwargs['from']) + _from = datetime.datetime.fromisoformat(kwargs['from']) except ValueError: if not catch: raise @@ -217,7 +218,7 @@ def _get_indexcard_page_queryset(self, kwargs, catch=True, last_id=None): ) if 'until' in kwargs: try: - _until = fromisoformat(kwargs['until']) + _until = datetime.datetime.fromisoformat(kwargs['until']) except ValueError: if not catch: raise @@ -291,12 +292,12 @@ def _get_resumption_token(self, kwargs, last_id): _until = None if 'from' in kwargs: try: - _from = fromisoformat(kwargs['from']) + _from = datetime.datetime.fromisoformat(kwargs['from']) except ValueError: self.errors.append(oai_errors.BadArgument('Invalid value for', 'from')) if 'until' in kwargs: try: - _until = fromisoformat(kwargs['until']) + _until = datetime.datetime.fromisoformat(kwargs['until']) except ValueError: self.errors.append(oai_errors.BadArgument('Invalid value for', 'until')) _set_spec = kwargs.get('set', '') diff --git a/share/oaipmh/response_renderer.py b/share/oaipmh/response_renderer.py index c45aea770..c8e233e0a 100644 --- a/share/oaipmh/response_renderer.py +++ b/share/oaipmh/response_renderer.py @@ -4,7 +4,8 @@ from django.urls import reverse -from share.oaipmh.util import format_datetime, SubEl, ns, nsmap +from share.oaipmh.util import SubEl, ns, nsmap +from trove.util.datetime import datetime_isoformat_z as format_datetime class OAIRenderer: diff --git a/share/oaipmh/util.py b/share/oaipmh/util.py index 413ac0173..a7457d4ef 100644 --- a/share/oaipmh/util.py +++ b/share/oaipmh/util.py @@ -1,23 +1,11 @@ -import datetime from typing import Any from lxml import etree from primitive_metadata import primitive_rdf -from share.util.fromisoformat import fromisoformat from trove.vocab.namespaces import OAI, OAI_DC -def format_datetime(dt: datetime.datetime | 
primitive_rdf.Literal | str) -> str: - """OAI-PMH has specific time format requirements -- comply. - """ - if isinstance(dt, primitive_rdf.Literal): - dt = dt.unicode_value - if isinstance(dt, str): - dt = fromisoformat(dt) - return dt.strftime('%Y-%m-%dT%H:%M:%SZ') - - XML_NAMESPACES = { 'dc': 'http://purl.org/dc/elements/1.1/', 'oai': str(OAI), diff --git a/share/util/fromisoformat.py b/share/util/fromisoformat.py deleted file mode 100644 index 92ac3d4a8..000000000 --- a/share/util/fromisoformat.py +++ /dev/null @@ -1,10 +0,0 @@ -import datetime -import re - - -def fromisoformat(date_str: str) -> datetime.datetime: - # wrapper around `datetime.datetime.fromisoformat` that supports "Z" UTC suffix - # (may be removed in python 3.11+, when `fromisoformat` handles more iso-6801 formats) - return datetime.datetime.fromisoformat( - re.sub('Z$', '+00:00', date_str), # replace "Z" shorthand with explicit timezone offset - ) diff --git a/share/util/xml.py b/share/util/xml.py index d0979954c..6ff13f829 100644 --- a/share/util/xml.py +++ b/share/util/xml.py @@ -15,5 +15,5 @@ ) -def strip_illegal_xml_chars(string): +def strip_illegal_xml_chars(string: str) -> str: return RE_XML_ILLEGAL.sub('', string) diff --git a/templates/admin/login.html b/templates/admin/login.html new file mode 100644 index 000000000..dbe59e29c --- /dev/null +++ b/templates/admin/login.html @@ -0,0 +1,3 @@ +{% extends "admin/login.html" %} + +{% block content %}{{ block.super }}login with osf{% endblock %} diff --git a/templates/allauth/login_errored_cancelled.html b/templates/allauth/login_errored_cancelled.html index c850a15ec..f7a26ffe1 100644 --- a/templates/allauth/login_errored_cancelled.html +++ b/templates/allauth/login_errored_cancelled.html @@ -3,9 +3,6 @@ {% load static %} Login Failed - - -
diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py index 3d5f51e58..c7146a762 100644 --- a/tests/share/search/index_strategy/_common_trovesearch_tests.py +++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py @@ -117,10 +117,10 @@ def test_cardsearch_pagination(self): })) self._index_indexcards(_cards) # gather all pages results: - _querystring: str = f'page[size]={_page_size}' + _querystring: str | None = f'page[size]={_page_size}' _result_iris: set[str] = set() _page_count = 0 - while True: + while _querystring is not None: _cardsearch_handle = self.index_strategy.pls_handle_cardsearch( CardsearchParams.from_querystring(_querystring), ) @@ -133,9 +133,11 @@ def test_cardsearch_pagination(self): _result_iris.update(_page_iris) _page_count += 1 _next_cursor = _cardsearch_handle.cursor.next_cursor() - if _next_cursor is None: - break - _querystring = urlencode({'page[cursor]': _next_cursor.as_queryparam_value()}) + _querystring = ( + urlencode({'page[cursor]': _next_cursor.as_queryparam_value()}) + if _next_cursor is not None + else None # done + ) self.assertEqual(_page_count, math.ceil(_total_count / _page_size)) self.assertEqual(_result_iris, _expected_iris) diff --git a/tests/share/search/index_strategy/_with_real_services.py b/tests/share/search/index_strategy/_with_real_services.py index a4219b312..ec4076668 100644 --- a/tests/share/search/index_strategy/_with_real_services.py +++ b/tests/share/search/index_strategy/_with_real_services.py @@ -48,12 +48,6 @@ def tearDown(self): connections['default']._test_serialized_contents ) - def enterContext(self, context_manager): - # TestCase.enterContext added in python3.11 -- implementing here until then - result = context_manager.__enter__() - self.addCleanup(lambda: context_manager.__exit__(None, None, None)) - return result - @contextlib.contextmanager def _daemon_up(self): _daemon_control = 
IndexerDaemonControl(celery_app) diff --git a/tests/share/test_oaipmh_trove.py b/tests/share/test_oaipmh_trove.py index 0bdd7df1b..64b0e0b93 100644 --- a/tests/share/test_oaipmh_trove.py +++ b/tests/share/test_oaipmh_trove.py @@ -8,8 +8,8 @@ import pytest from share import models as share_db -from share.oaipmh.util import format_datetime from trove import models as trove_db +from trove.util.datetime import datetime_isoformat_z as format_datetime from trove.vocab.namespaces import OAI_DC from tests import factories @@ -232,11 +232,9 @@ def _assert_full_list(self, verb, params, request_method, expected_count, page_s pages = 0 count = 0 token = None - while True: - if token: - parsed = oai_request({'verb': verb, 'resumptionToken': token}, request_method) - else: - parsed = oai_request({'verb': verb, 'metadataPrefix': 'oai_dc', **params}, request_method) + next_params: dict[str, str] | None = {'verb': verb, 'metadataPrefix': 'oai_dc', **params} + while next_params is not None: + parsed = oai_request(next_params, request_method) page = parsed.xpath('//oai:header/oai:identifier', namespaces=NAMESPACES) pages += 1 count += len(page) @@ -245,9 +243,10 @@ def _assert_full_list(self, verb, params, request_method, expected_count, page_s token = token[0].text if token: assert len(page) == page_size + next_params = {'verb': verb, 'resumptionToken': token} else: assert len(page) <= page_size - break + next_params = None # done assert count == expected_count assert pages == math.ceil(expected_count / page_size) diff --git a/tests/trove/_input_output_tests.py b/tests/trove/_input_output_tests.py index 90590fda9..72ec269f6 100644 --- a/tests/trove/_input_output_tests.py +++ b/tests/trove/_input_output_tests.py @@ -28,12 +28,12 @@ def assert_outputs_equal(self, expected_output: typing.Any, actual_output: typin self.assertEqual(expected_output, actual_output) # (optional override, for when logic is more complicated) - def run_input_output_test(self, given_input, expected_output): + 
def run_input_output_test(self, given_input: typing.Any, expected_output: typing.Any) -> None: _actual_output = self.compute_output(given_input) self.assert_outputs_equal(expected_output, _actual_output) # (optional override, for when logic is more complicated) - def missing_case(self, name: str, given_input): + def missing_case(self, name: str, given_input: typing.Any) -> typing.Never: _cls = self.__class__ _actual_output = self.compute_output(given_input) raise NotImplementedError('\n'.join(( @@ -43,16 +43,10 @@ def missing_case(self, name: str, given_input): pprint.pformat(_actual_output), ))) - def enterContext(self, context_manager): - # TestCase.enterContext added in python3.11 -- implementing here until then - result = context_manager.__enter__() - self.addCleanup(lambda: context_manager.__exit__(None, None, None)) - return result - ### # private details - def __init_subclass__(cls, **kwargs): + def __init_subclass__(cls, **kwargs: typing.Any) -> None: super().__init_subclass__(**kwargs) # HACK: assign `test_*` method only on concrete subclasses, # so the test runner doesn't try instantiating a base class diff --git a/tests/trove/digestive_tract/test_expel.py b/tests/trove/digestive_tract/test_expel.py index 7f2345eb2..333280a80 100644 --- a/tests/trove/digestive_tract/test_expel.py +++ b/tests/trove/digestive_tract/test_expel.py @@ -40,12 +40,6 @@ def setUp(self): def _replacement_notify_indexcard_update(self, indexcards, **kwargs): self.notified_indexcard_ids.update(_card.id for _card in indexcards) - def enterContext(self, context_manager): - # TestCase.enterContext added in python3.11 -- implementing here until then - result = context_manager.__enter__() - self.addCleanup(lambda: context_manager.__exit__(None, None, None)) - return result - def test_setup(self): self.indexcard_1.refresh_from_db() self.indexcard_2.refresh_from_db() diff --git a/tests/trove/render/_base.py b/tests/trove/render/_base.py index 94b8f94a8..7e5b59ab9 100644 --- 
a/tests/trove/render/_base.py +++ b/tests/trove/render/_base.py @@ -1,4 +1,5 @@ import json +import typing from primitive_metadata import ( gather, @@ -7,7 +8,7 @@ from trove.trovesearch.trovesearch_gathering import trovesearch_by_indexstrategy from trove.render._base import BaseRenderer -from trove.render._rendering import ProtoRendering +from trove.render.rendering import ProtoRendering from trove.vocab.namespaces import RDF from tests.trove._input_output_tests import BasicInputOutputTestCase from ._inputs import UNRENDERED_RDF, UNRENDERED_SEARCH_RDF, RdfCase @@ -56,7 +57,7 @@ def compute_output(self, given_input: RdfCase): ) return _renderer.render_document() - def assert_outputs_equal(self, expected_output, actual_output) -> None: + def assert_outputs_equal(self, expected_output: typing.Any, actual_output: typing.Any) -> None: if expected_output is None: print(repr(actual_output)) raise NotImplementedError @@ -66,9 +67,9 @@ def assert_outputs_equal(self, expected_output, actual_output) -> None: self._get_rendered_output(actual_output), ) - def _get_rendered_output(self, rendering: ProtoRendering): + def _get_rendered_output(self, rendering: ProtoRendering) -> str: # for now, they always iter strings (update if/when bytes are in play) - return ''.join(rendering.iter_content()) # type: ignore[arg-type] + return ''.join(map(str, rendering.iter_content())) class TrovesearchRendererTests(TroveRendererTests): diff --git a/tests/trove/render/_inputs.py b/tests/trove/render/_inputs.py index 29d6cb9ad..3ca9c9151 100644 --- a/tests/trove/render/_inputs.py +++ b/tests/trove/render/_inputs.py @@ -29,7 +29,7 @@ class RdfCase: DCTERMS.issued: {rdf.literal(datetime.date(2024, 1, 1))}, DCTERMS.modified: {rdf.literal(datetime.date(2024, 1, 1))}, TROVE.resourceMetadata: {rdf.literal( - json.dumps({'@id': BLARG.anItem, 'title': 'an item, yes'}), + json.dumps({'@id': BLARG.anItem, 'title': [{'@value': 'an item, yes'}]}), datatype_iris=RDF.JSON, )}, }, @@ -83,7 +83,7 @@ class 
RdfCase: DCTERMS.issued: {rdf.literal(datetime.date(2024, 1, 1))}, DCTERMS.modified: {rdf.literal(datetime.date(2024, 1, 1))}, TROVE.resourceMetadata: {rdf.literal( - json.dumps({'@id': BLARG.anItem, 'title': 'an item, yes'}), + json.dumps({'@id': BLARG.anItem, 'title': [{'@value': 'an item, yes'}]}), datatype_iris=RDF.JSON, )}, }, @@ -94,7 +94,7 @@ class RdfCase: DCTERMS.issued: {rdf.literal(datetime.date(2024, 2, 2))}, DCTERMS.modified: {rdf.literal(datetime.date(2024, 2, 2))}, TROVE.resourceMetadata: {rdf.literal( - json.dumps({'@id': BLARG.anItemm, 'title': 'an itemm, yes'}), + json.dumps({'@id': BLARG.anItemm, 'title': [{'@value': 'an itemm, yes'}]}), datatype_iris=RDF.JSON, )}, }, @@ -105,7 +105,31 @@ class RdfCase: DCTERMS.issued: {rdf.literal(datetime.date(2024, 3, 3))}, DCTERMS.modified: {rdf.literal(datetime.date(2024, 3, 3))}, TROVE.resourceMetadata: {rdf.literal( - json.dumps({'@id': BLARG.anItemmm, 'title': 'an itemmm, yes'}), + json.dumps({ + '@id': BLARG.anItemmm, + "sameAs": [ + {"@id": "https://doi.example/13.0/anItemmm"} + ], + 'title': [{'@value': 'an itemmm, yes'}], + "creator": [ + { + "@id": BLARG.aPerson, + "resourceType": [ + {"@id": "Agent"}, + {"@id": "Person"} + ], + "identifier": [ + {"@value": BLARG.aPerson} + ], + "name": [ + {"@value": "a person indeed"} + ] + } + ], + "dateCreated": [ + {"@value": "2001-02-03"} + ], + }), datatype_iris=RDF.JSON, )}, }, diff --git a/tests/trove/render/test_cardsearch_atom_renderer.py b/tests/trove/render/test_cardsearch_atom_renderer.py new file mode 100644 index 000000000..bd8d7d9c4 --- /dev/null +++ b/tests/trove/render/test_cardsearch_atom_renderer.py @@ -0,0 +1,60 @@ +from unittest import mock +import datetime + +from trove.render.cardsearch_atom import CardsearchAtomRenderer +from trove.render.rendering import EntireRendering +from . 
import _base + + +# note: cardsearch only -- this renderer doesn't do arbitrary rdf + +class TestCardsearchAtomRenderer(_base.TrovesearchRendererTests): + renderer_class = CardsearchAtomRenderer + expected_outputs = { + 'no_results': EntireRendering( + mediatype='application/atom+xml', + entire_content=( + b"\n" + b'' + b'shtrove search results' + b'feed of metadata records matching given filters' + b'' + b'http://blarg.example/vocab/aSearch' + b'2345-06-07T08:09:10Z' + b'' + ), + ), + 'few_results': EntireRendering( + mediatype='application/atom+xml', + entire_content=( + b"\n" + b'' + b'shtrove search results' + b'feed of metadata records matching given filters' + b'' + b'http://blarg.example/vocab/aSearchFew' + b'2345-06-07T08:09:10Z' + b'' + b'' + b'http://blarg.example/vocab/aCard' + b'an item, yes' + b'' + b'' + b'http://blarg.example/vocab/aCardd' + b'an itemm, yes' + b'' + b'' + b'http://blarg.example/vocab/aCarddd' + b'an itemmm, yes' + b'2001-02-03T00:00:00Z' + b'a person indeedhttp://blarg.example/vocab/aPerson' + b'' + ), + ), + } + + def setUp(self): + self.enterContext(mock.patch( + 'django.utils.timezone.now', + return_value=datetime.datetime(2345, 6, 7, 8, 9, 10, tzinfo=datetime.UTC), + )) diff --git a/tests/trove/render/test_cardsearch_rss_renderer.py b/tests/trove/render/test_cardsearch_rss_renderer.py new file mode 100644 index 000000000..a376b6cda --- /dev/null +++ b/tests/trove/render/test_cardsearch_rss_renderer.py @@ -0,0 +1,53 @@ +from trove.render.cardsearch_rss import CardsearchRssRenderer +from trove.render.rendering import EntireRendering +from . 
import _base + + +# note: cardsearch only -- this renderer doesn't do arbitrary rdf + +class TestCardsearchRssRenderer(_base.TrovesearchRendererTests): + renderer_class = CardsearchRssRenderer + expected_outputs = { + 'no_results': EntireRendering( + mediatype='application/rss+xml', + entire_content=( + b"\n" + b'' + b'' + b'shtrove search results' + b'http://blarg.example/vocab/aSearch' + b'' + b'feed of metadata records matching given filters' + b'share-support@cos.io' + b'' + ), + ), + 'few_results': EntireRendering( + mediatype='application/rss+xml', + entire_content=( + b"\n" + b'' + b'' + b'shtrove search results' + b'http://blarg.example/vocab/aSearchFew' + b'' + b'feed of metadata records matching given filters' + b'share-support@cos.io' + b'' + b'http://blarg.example/vocab/anItem' + b'http://blarg.example/vocab/anItem' + b'an item, yes' + b'' + b'http://blarg.example/vocab/anItemm' + b'http://blarg.example/vocab/anItemm' + b'an itemm, yes' + b'' + b'http://blarg.example/vocab/anItemmm' + b'http://blarg.example/vocab/anItemmm' + b'an itemmm, yes' + b'Sat, 03 Feb 2001 00:00:00 -0000' + b'http://blarg.example/vocab/aPerson (a person indeed)' + b'' + ), + ), + } diff --git a/tests/trove/render/test_html_browse_renderer.py b/tests/trove/render/test_html_browse_renderer.py new file mode 100644 index 000000000..ee740248c --- /dev/null +++ b/tests/trove/render/test_html_browse_renderer.py @@ -0,0 +1,31 @@ +import html +import typing + +from trove.render.html_browse import RdfHtmlBrowseRenderer +from . 
import _base + + +# note: smoke tests only (TODO: better) + +class TestTrovesearchHtmlRenderer(_base.TrovesearchRendererTests): + renderer_class = RdfHtmlBrowseRenderer + expected_outputs = { + 'no_results': { + 'mediatype': 'text/html', + 'result_iris': [], + }, + 'few_results': { + 'mediatype': 'text/html', + 'result_iris': [ + 'http://blarg.example/vocab/anItem', + 'http://blarg.example/vocab/anItemm', + 'http://blarg.example/vocab/anItemmm', + ], + }, + } + + def assert_outputs_equal(self, expected_output: typing.Any, actual_output: typing.Any) -> None: + self.assertEqual(actual_output.mediatype, expected_output['mediatype']) + # smoke tests -- instead of asserting full rendered html page, just check the results are in there + for _result_iri in expected_output['result_iris']: + self.assertIn(html.escape(_result_iri), actual_output.entire_content) diff --git a/tests/trove/render/test_jsonapi_renderer.py b/tests/trove/render/test_jsonapi_renderer.py index 9357c5ff6..de3019739 100644 --- a/tests/trove/render/test_jsonapi_renderer.py +++ b/tests/trove/render/test_jsonapi_renderer.py @@ -2,7 +2,7 @@ from unittest import mock from trove.render.jsonapi import RdfJsonapiRenderer -from trove.render._rendering import SimpleRendering +from trove.render.rendering import EntireRendering from trove.vocab.namespaces import BLARG from . 
import _base @@ -31,9 +31,9 @@ def _get_rendered_output(self, rendering): class TestJsonapiRenderer(_BaseJsonapiRendererTest): expected_outputs = { - 'simple_card': SimpleRendering( + 'simple_card': EntireRendering( mediatype='application/vnd.api+json', - rendered_content=json.dumps({ + entire_content=json.dumps({ "data": { "id": "blarg:aCard", "type": "index-card", @@ -43,7 +43,7 @@ class TestJsonapiRenderer(_BaseJsonapiRendererTest): ], "resourceMetadata": { "@id": BLARG.anItem, - "title": "an item, yes" + "title": [{"@value": "an item, yes"}] } }, "links": { @@ -63,9 +63,9 @@ class TestJsonapiRenderer(_BaseJsonapiRendererTest): } }), ), - 'various_types': SimpleRendering( + 'various_types': EntireRendering( mediatype='application/vnd.api+json', - rendered_content=json.dumps({ + entire_content=json.dumps({ "data": { "id": "blarg:aSubject", "type": "blarg:aType", @@ -86,9 +86,9 @@ class TestJsonapiRenderer(_BaseJsonapiRendererTest): class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonRendererTests): expected_outputs = { - 'no_results': SimpleRendering( + 'no_results': EntireRendering( mediatype='application/vnd.api+json', - rendered_content=json.dumps({ + entire_content=json.dumps({ "data": { "id": "blarg:aSearch", "type": "index-card-search", @@ -101,9 +101,9 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR } }), ), - 'few_results': SimpleRendering( + 'few_results': EntireRendering( mediatype='application/vnd.api+json', - rendered_content=json.dumps({ + entire_content=json.dumps({ "data": { "id": "blarg:aSearchFew", "type": "index-card-search", @@ -189,7 +189,7 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR ], "resourceMetadata": { "@id": BLARG.anItem, - "title": "an item, yes" + "title": [{"@value": "an item, yes"}] } }, "links": { @@ -215,8 +215,29 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR BLARG.anItemmm ], 
"resourceMetadata": { - "@id": BLARG.anItemmm, - "title": "an itemmm, yes" + '@id': BLARG.anItemmm, + "sameAs": [ + {"@id": "https://doi.example/13.0/anItemmm"} + ], + 'title': [{'@value': 'an itemmm, yes'}], + "creator": [ + { + "@id": BLARG.aPerson, + "resourceType": [ + {"@id": "Agent"}, + {"@id": "Person"} + ], + "identifier": [ + {"@value": BLARG.aPerson} + ], + "name": [ + {"@value": "a person indeed"} + ] + } + ], + "dateCreated": [ + {"@value": "2001-02-03"} + ], } }, "links": { @@ -243,7 +264,7 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR ], "resourceMetadata": { "@id": BLARG.anItemm, - "title": "an itemm, yes" + "title": [{"@value": "an itemm, yes"}] } }, "links": { diff --git a/tests/trove/render/test_jsonld_renderer.py b/tests/trove/render/test_jsonld_renderer.py index eef657f1d..c983cad19 100644 --- a/tests/trove/render/test_jsonld_renderer.py +++ b/tests/trove/render/test_jsonld_renderer.py @@ -1,8 +1,8 @@ import json from trove.render.jsonld import RdfJsonldRenderer -from trove.render._rendering import SimpleRendering -from ._inputs import BLARG +from trove.render.rendering import EntireRendering +from trove.vocab.namespaces import BLARG from . 
import _base @@ -10,9 +10,9 @@ class TestJsonldRenderer(_base.TroveJsonRendererTests): renderer_class = RdfJsonldRenderer expected_outputs = { - 'simple_card': SimpleRendering( + 'simple_card': EntireRendering( mediatype='application/ld+json', - rendered_content=json.dumps({ + entire_content=json.dumps({ "@id": "blarg:aCard", "dcterms:issued": [ { @@ -38,13 +38,13 @@ class TestJsonldRenderer(_base.TroveJsonRendererTests): ], "trove:resourceMetadata": { "@id": BLARG.anItem, - "title": "an item, yes" + "title": [{"@value": "an item, yes"}] } }), ), - 'various_types': SimpleRendering( + 'various_types': EntireRendering( mediatype='application/ld+json', - rendered_content=json.dumps({ + entire_content=json.dumps({ "@id": "blarg:aSubject", "blarg:hasDateLiteral": [ { @@ -88,9 +88,9 @@ class TestJsonldSearchRenderer(_base.TrovesearchJsonRendererTests): renderer_class = RdfJsonldRenderer expected_outputs = { - 'no_results': SimpleRendering( + 'no_results': EntireRendering( mediatype='application/ld+json', - rendered_content=json.dumps({ + entire_content=json.dumps({ "@id": "blarg:aSearch", "rdf:type": [ {"@id": "trove:Cardsearch"} @@ -101,9 +101,9 @@ class TestJsonldSearchRenderer(_base.TrovesearchJsonRendererTests): } }), ), - 'few_results': SimpleRendering( + 'few_results': EntireRendering( mediatype='application/ld+json', - rendered_content=json.dumps({ + entire_content=json.dumps({ "@id": "blarg:aSearchFew", "rdf:type": [ {"@id": "trove:Cardsearch"} @@ -145,7 +145,7 @@ class TestJsonldSearchRenderer(_base.TrovesearchJsonRendererTests): ], "trove:resourceMetadata": { "@id": BLARG.anItem, - "title": "an item, yes" + "title": [{"@value": "an item, yes"}] } } }, @@ -181,7 +181,7 @@ class TestJsonldSearchRenderer(_base.TrovesearchJsonRendererTests): ], "trove:resourceMetadata": { "@id": BLARG.anItemm, - "title": "an itemm, yes" + "title": [{"@value": "an itemm, yes"}] } } }, @@ -214,8 +214,29 @@ class TestJsonldSearchRenderer(_base.TrovesearchJsonRendererTests): {"@value": 
BLARG.anItemmm} ], "trove:resourceMetadata": { - "@id": BLARG.anItemmm, - "title": "an itemmm, yes" + '@id': BLARG.anItemmm, + "sameAs": [ + {"@id": "https://doi.example/13.0/anItemmm"} + ], + 'title': [{'@value': 'an itemmm, yes'}], + "creator": [ + { + "@id": BLARG.aPerson, + "resourceType": [ + {"@id": "Agent"}, + {"@id": "Person"} + ], + "identifier": [ + {"@value": BLARG.aPerson} + ], + "name": [ + {"@value": "a person indeed"} + ] + } + ], + "dateCreated": [ + {"@value": "2001-02-03"} + ], } } } diff --git a/tests/trove/render/test_simple_csv_renderer.py b/tests/trove/render/test_simple_csv_renderer.py deleted file mode 100644 index ca06aa273..000000000 --- a/tests/trove/render/test_simple_csv_renderer.py +++ /dev/null @@ -1,24 +0,0 @@ -from trove.render.simple_csv import TrovesearchSimpleCsvRenderer -from trove.render._rendering import SimpleRendering -from . import _base - - -# note: trovesearch only -- this renderer doesn't do arbitrary rdf - -class TestSimpleCsvRenderer(_base.TrovesearchRendererTests): - renderer_class = TrovesearchSimpleCsvRenderer - expected_outputs = { - 'no_results': SimpleRendering( - mediatype='text/csv', - rendered_content='@id,sameAs,resourceType,resourceNature,title,name,dateCreated,dateModified,rights\r\n', - ), - 'few_results': SimpleRendering( - mediatype='text/csv', - rendered_content=''.join(( - '@id,sameAs,resourceType,resourceNature,title,name,dateCreated,dateModified,rights\r\n', - 'http://blarg.example/vocab/anItem,,,,"an item, yes",,,,\r\n', - 'http://blarg.example/vocab/anItemm,,,,"an itemm, yes",,,,\r\n', - 'http://blarg.example/vocab/anItemmm,,,,"an itemmm, yes",,,,\r\n', - )), - ), - } diff --git a/tests/trove/render/test_simple_json_renderer.py b/tests/trove/render/test_simple_json_renderer.py deleted file mode 100644 index 7f59c8a59..000000000 --- a/tests/trove/render/test_simple_json_renderer.py +++ /dev/null @@ -1,62 +0,0 @@ -import json - -from trove.render.simple_json import TrovesearchSimpleJsonRenderer -from 
trove.render._rendering import SimpleRendering -from trove.vocab.namespaces import BLARG -from . import _base - - -# note: trovesearch only -- this renderer doesn't do arbitrary rdf - -class TestSimpleJsonRenderer(_base.TrovesearchJsonRendererTests): - renderer_class = TrovesearchSimpleJsonRenderer - expected_outputs = { - 'no_results': SimpleRendering( - mediatype='application/json', - rendered_content=json.dumps({ - "data": [], - "links": {}, - "meta": { - "total": 0 - } - }), - ), - 'few_results': SimpleRendering( - mediatype='application/json', - rendered_content=json.dumps({ - "data": [ - { - "@id": BLARG.anItem, - "title": "an item, yes", - "foaf:isPrimaryTopicOf": [ - { - "@id": BLARG.aCard - } - ] - }, - { - "@id": BLARG.anItemm, - "title": "an itemm, yes", - "foaf:isPrimaryTopicOf": [ - { - "@id": BLARG.aCardd - } - ] - }, - { - "@id": BLARG.anItemmm, - "title": "an itemmm, yes", - "foaf:isPrimaryTopicOf": [ - { - "@id": BLARG.aCarddd - } - ] - } - ], - "links": {}, - "meta": { - "total": 3 - } - }), - ), - } diff --git a/tests/trove/render/test_simple_tsv_renderer.py b/tests/trove/render/test_simple_tsv_renderer.py deleted file mode 100644 index 752493362..000000000 --- a/tests/trove/render/test_simple_tsv_renderer.py +++ /dev/null @@ -1,24 +0,0 @@ -from trove.render.simple_tsv import TrovesearchSimpleTsvRenderer -from trove.render._rendering import SimpleRendering -from . 
import _base - - -# note: trovesearch only -- this renderer doesn't do arbitrary rdf - -class TestSimpleTsvRenderer(_base.TrovesearchRendererTests): - renderer_class = TrovesearchSimpleTsvRenderer - expected_outputs = { - 'no_results': SimpleRendering( - mediatype='text/tab-separated-values', - rendered_content='@id\tsameAs\tresourceType\tresourceNature\ttitle\tname\tdateCreated\tdateModified\trights\r\n', - ), - 'few_results': SimpleRendering( - mediatype='text/tab-separated-values', - rendered_content=''.join(( - '@id\tsameAs\tresourceType\tresourceNature\ttitle\tname\tdateCreated\tdateModified\trights\r\n', - 'http://blarg.example/vocab/anItem\t\t\t\tan item, yes\t\t\t\t\r\n', - 'http://blarg.example/vocab/anItemm\t\t\t\tan itemm, yes\t\t\t\t\r\n', - 'http://blarg.example/vocab/anItemmm\t\t\t\tan itemmm, yes\t\t\t\t\r\n', - )), - ), - } diff --git a/tests/trove/render/test_trovesearch_csv_renderer.py b/tests/trove/render/test_trovesearch_csv_renderer.py new file mode 100644 index 000000000..aa31651d1 --- /dev/null +++ b/tests/trove/render/test_trovesearch_csv_renderer.py @@ -0,0 +1,24 @@ +from trove.render.trovesearch_csv import TrovesearchCsvRenderer +from trove.render.rendering import EntireRendering +from . 
import _base + + +# note: trovesearch only -- this renderer doesn't do arbitrary rdf + +class TestTrovesearchCsvRenderer(_base.TrovesearchRendererTests): + renderer_class = TrovesearchCsvRenderer + expected_outputs = { + 'no_results': EntireRendering( + mediatype='text/csv', + entire_content='@id,sameAs,resourceType,resourceNature,title,name,dateCreated,dateModified,rights\r\n', + ), + 'few_results': EntireRendering( + mediatype='text/csv', + entire_content=''.join(( + '@id,sameAs,resourceType,resourceNature,title,name,dateCreated,dateModified,rights\r\n', + 'http://blarg.example/vocab/anItem,,,,"an item, yes",,,,\r\n', + 'http://blarg.example/vocab/anItemm,,,,"an itemm, yes",,,,\r\n', + 'http://blarg.example/vocab/anItemmm,https://doi.example/13.0/anItemmm,,,"an itemmm, yes",,2001-02-03,,\r\n', + )), + ), + } diff --git a/tests/trove/render/test_trovesearch_json_renderer.py b/tests/trove/render/test_trovesearch_json_renderer.py new file mode 100644 index 000000000..a0a9c4ad0 --- /dev/null +++ b/tests/trove/render/test_trovesearch_json_renderer.py @@ -0,0 +1,71 @@ +import json + +from trove.render.trovesearch_json import TrovesearchJsonRenderer +from trove.render.rendering import EntireRendering +from trove.vocab.namespaces import BLARG +from . 
import _base + + +# note: trovesearch only -- this renderer doesn't do arbitrary rdf + +class TestTrovesearchJsonRenderer(_base.TrovesearchJsonRendererTests): + renderer_class = TrovesearchJsonRenderer + expected_outputs = { + 'no_results': EntireRendering( + mediatype='application/json', + entire_content=json.dumps({ + "data": [], + "links": {}, + "meta": { + "total": 0 + } + }), + ), + 'few_results': EntireRendering( + mediatype='application/json', + entire_content=json.dumps({ + "data": [ + { + "@id": BLARG.anItem, + "title": [{"@value": "an item, yes"}], + "foaf:isPrimaryTopicOf": [{"@id": BLARG.aCard}] + }, + { + "@id": BLARG.anItemm, + "title": [{"@value": "an itemm, yes"}], + "foaf:isPrimaryTopicOf": [{"@id": BLARG.aCardd}] + }, + { + '@id': BLARG.anItemmm, + "sameAs": [ + {"@id": "https://doi.example/13.0/anItemmm"} + ], + 'title': [{'@value': 'an itemmm, yes'}], + "creator": [ + { + "@id": BLARG.aPerson, + "resourceType": [ + {"@id": "Agent"}, + {"@id": "Person"} + ], + "identifier": [ + {"@value": BLARG.aPerson} + ], + "name": [ + {"@value": "a person indeed"} + ] + } + ], + "dateCreated": [ + {"@value": "2001-02-03"} + ], + "foaf:isPrimaryTopicOf": [{"@id": BLARG.aCarddd}] + } + ], + "links": {}, + "meta": { + "total": 3 + } + }), + ), + } diff --git a/tests/trove/render/test_trovesearch_tsv_renderer.py b/tests/trove/render/test_trovesearch_tsv_renderer.py new file mode 100644 index 000000000..9d9782a82 --- /dev/null +++ b/tests/trove/render/test_trovesearch_tsv_renderer.py @@ -0,0 +1,24 @@ +from trove.render.trovesearch_tsv import TrovesearchTsvRenderer +from trove.render.rendering import EntireRendering +from . 
import _base + + +# note: trovesearch only -- this renderer doesn't do arbitrary rdf + +class TestTrovesearchTsvRenderer(_base.TrovesearchRendererTests): + renderer_class = TrovesearchTsvRenderer + expected_outputs = { + 'no_results': EntireRendering( + mediatype='text/tab-separated-values', + entire_content='@id\tsameAs\tresourceType\tresourceNature\ttitle\tname\tdateCreated\tdateModified\trights\r\n', + ), + 'few_results': EntireRendering( + mediatype='text/tab-separated-values', + entire_content=''.join(( + '@id\tsameAs\tresourceType\tresourceNature\ttitle\tname\tdateCreated\tdateModified\trights\r\n', + 'http://blarg.example/vocab/anItem\t\t\t\tan item, yes\t\t\t\t\r\n', + 'http://blarg.example/vocab/anItemm\t\t\t\tan itemm, yes\t\t\t\t\r\n', + 'http://blarg.example/vocab/anItemmm\thttps://doi.example/13.0/anItemmm\t\t\tan itemmm, yes\t\t2001-02-03\t\t\r\n', + )), + ), + } diff --git a/tests/trove/render/test_turtle_renderer.py b/tests/trove/render/test_turtle_renderer.py index 32f949278..3bf5ee3d8 100644 --- a/tests/trove/render/test_turtle_renderer.py +++ b/tests/trove/render/test_turtle_renderer.py @@ -1,7 +1,7 @@ from primitive_metadata import primitive_rdf as rdf from trove.render.turtle import RdfTurtleRenderer -from trove.render._rendering import SimpleRendering +from trove.render.rendering import EntireRendering from . import _base @@ -14,9 +14,9 @@ def _get_rendered_output(self, rendering): class TestTurtleRenderer(_BaseTurtleRendererTest): expected_outputs = { - 'simple_card': SimpleRendering( + 'simple_card': EntireRendering( mediatype='text/turtle', - rendered_content=''' + entire_content=''' @prefix blarg: . @prefix dcat: . @prefix dcterms: . 
@@ -30,12 +30,12 @@ class TestTurtleRenderer(_BaseTurtleRendererTest): dcterms:modified "2024-01-01"^^xsd:date ; foaf:primaryTopic blarg:anItem ; trove:focusIdentifier "http://blarg.example/vocab/anItem"^^rdf:string ; - trove:resourceMetadata "{\\"@id\\": \\"http://blarg.example/vocab/anItem\\", \\"title\\": \\"an item, yes\\"}"^^rdf:JSON . + trove:resourceMetadata "{\\"@id\\": \\"http://blarg.example/vocab/anItem\\", \\"title\\": [{\\"@value\\": \\"an item, yes\\"}]}"^^rdf:JSON . ''', ), - 'various_types': SimpleRendering( + 'various_types': EntireRendering( mediatype='text/turtle', - rendered_content=''' + entire_content=''' @prefix blarg: . @prefix rdf: . @prefix xsd: . @@ -54,9 +54,9 @@ class TestTurtleRenderer(_BaseTurtleRendererTest): class TestTurtleTrovesearchRenderer(_BaseTurtleRendererTest, _base.TrovesearchRendererTests): expected_outputs = { - 'no_results': SimpleRendering( + 'no_results': EntireRendering( mediatype='text/turtle', - rendered_content=''' + entire_content=''' @prefix blarg: . @prefix trove: . @prefix xsd: . @@ -65,9 +65,9 @@ class TestTurtleTrovesearchRenderer(_BaseTurtleRendererTest, _base.TrovesearchRe trove:totalResultCount 0 . ''', ), - 'few_results': SimpleRendering( + 'few_results': EntireRendering( mediatype='text/turtle', - rendered_content=''' + entire_content=''' @prefix blarg: . @prefix dcat: . @prefix dcterms: . @@ -99,21 +99,21 @@ class TestTurtleTrovesearchRenderer(_BaseTurtleRendererTest, _base.TrovesearchRe dcterms:modified "2024-01-01"^^xsd:date ; foaf:primaryTopic blarg:anItem ; trove:focusIdentifier "http://blarg.example/vocab/anItem"^^rdf:string ; - trove:resourceMetadata "{\\"@id\\": \\"http://blarg.example/vocab/anItem\\", \\"title\\": \\"an item, yes\\"}"^^rdf:JSON . + trove:resourceMetadata "{\\"@id\\": \\"http://blarg.example/vocab/anItem\\", \\"title\\": [{\\"@value\\": \\"an item, yes\\"}]}"^^rdf:JSON . 
blarg:aCardd a dcat:CatalogRecord, trove:Indexcard ; dcterms:issued "2024-02-02"^^xsd:date ; dcterms:modified "2024-02-02"^^xsd:date ; foaf:primaryTopic blarg:anItemm ; trove:focusIdentifier "http://blarg.example/vocab/anItemm"^^rdf:string ; - trove:resourceMetadata "{\\"@id\\": \\"http://blarg.example/vocab/anItemm\\", \\"title\\": \\"an itemm, yes\\"}"^^rdf:JSON . + trove:resourceMetadata "{\\"@id\\": \\"http://blarg.example/vocab/anItemm\\", \\"title\\": [{\\"@value\\": \\"an itemm, yes\\"}]}"^^rdf:JSON . blarg:aCarddd a dcat:CatalogRecord, trove:Indexcard ; dcterms:issued "2024-03-03"^^xsd:date ; dcterms:modified "2024-03-03"^^xsd:date ; foaf:primaryTopic blarg:anItemmm ; trove:focusIdentifier "http://blarg.example/vocab/anItemmm"^^rdf:string ; - trove:resourceMetadata "{\\"@id\\": \\"http://blarg.example/vocab/anItemmm\\", \\"title\\": \\"an itemmm, yes\\"}"^^rdf:JSON . + trove:resourceMetadata "{\\"@id\\": \\"http://blarg.example/vocab/anItemmm\\", \\"sameAs\\": [{\\"@id\\": \\"https://doi.example/13.0/anItemmm\\"}], \\"title\\": [{\\"@value\\": \\"an itemmm, yes\\"}], \\"creator\\": [{\\"@id\\": \\"http://blarg.example/vocab/aPerson\\", \\"resourceType\\": [{\\"@id\\": \\"Agent\\"}, {\\"@id\\": \\"Person\\"}], \\"identifier\\": [{\\"@value\\": \\"http://blarg.example/vocab/aPerson\\"}], \\"name\\": [{\\"@value\\": \\"a person indeed\\"}]}], \\"dateCreated\\": [{\\"@value\\": \\"2001-02-03\\"}]}"^^rdf:JSON . 
''', ), } diff --git a/tests/trove/test_doctest.py b/tests/trove/test_doctest.py index 18c77a18b..06baf8993 100644 --- a/tests/trove/test_doctest.py +++ b/tests/trove/test_doctest.py @@ -3,7 +3,9 @@ import trove.util.chainmap import trove.util.frozen import trove.util.iris +import trove.util.iter import trove.util.propertypath +import trove.vocab.mediatypes _DOCTEST_OPTIONFLAGS = ( doctest.ELLIPSIS @@ -14,7 +16,9 @@ trove.util.chainmap, trove.util.frozen, trove.util.iris, + trove.util.iter, trove.util.propertypath, + trove.vocab.mediatypes, ) diff --git a/trove/derive/oaidc_xml.py b/trove/derive/oaidc_xml.py index 610fb49fc..e8d3e0967 100644 --- a/trove/derive/oaidc_xml.py +++ b/trove/derive/oaidc_xml.py @@ -2,8 +2,9 @@ from lxml import etree from primitive_metadata import primitive_rdf as rdf -from share.oaipmh.util import format_datetime, ns, nsmap, SubEl +from share.oaipmh.util import ns, nsmap, SubEl +from trove.util.datetime import datetime_isoformat_z as format_datetime from trove.vocab.namespaces import ( DCTYPE, DCTERMS, diff --git a/trove/derive/osfmap_json.py b/trove/derive/osfmap_json.py index 69de39b26..21d3e2fad 100644 --- a/trove/derive/osfmap_json.py +++ b/trove/derive/osfmap_json.py @@ -151,8 +151,7 @@ def _list_or_single_value(self, predicate_iri: str, json_list: list[JsonValue]) (_only_obj,) = json_list except ValueError: return None - else: - return _only_obj + return _only_obj return ( sorted(json_list, key=json.dumps) if len(json_list) > 1 diff --git a/trove/links.py b/trove/links.py new file mode 100644 index 000000000..ae8feadeb --- /dev/null +++ b/trove/links.py @@ -0,0 +1,58 @@ +import dataclasses +import urllib.parse + +from django.conf import settings +from django.http import QueryDict +from django.urls import reverse + +from trove.vocab.namespaces import namespaces_shorthand + + +def is_local_url(iri: str) -> bool: + return iri.startswith(settings.SHARE_WEB_URL) + + +def trove_browse_link(iri: str) -> str: + return reverse( + 
'trove:browse-iri', + query={ + 'blendCards': True, + 'iri': namespaces_shorthand().compact_iri(iri), + }, + ) + + +@dataclasses.dataclass +class FeedLinks: + rss: str + atom: str + + +def cardsearch_feed_links(cardsearch_iri: str) -> FeedLinks | None: + _split_iri = urllib.parse.urlsplit(cardsearch_iri) + if _split_iri.path != reverse('trove:index-card-search'): + return None + _feed_query = _get_feed_query(_split_iri.query) + _rss_link = urllib.parse.urljoin( + settings.SHARE_WEB_URL, + reverse('trove:cardsearch-rss', query=_feed_query) + ) + _atom_link = urllib.parse.urljoin( + settings.SHARE_WEB_URL, + reverse('trove:cardsearch-atom', query=_feed_query) + ) + return FeedLinks(rss=_rss_link, atom=_atom_link) + + +def _get_feed_query(query_string: str) -> QueryDict: + _qparams = QueryDict(query_string, mutable=True) + for _param_name in list(filter(_irrelevant_feed_param, _qparams.keys())): + del _qparams[_param_name] + return _qparams + + +def _irrelevant_feed_param(query_param_name: str) -> bool: + return ( + query_param_name in ('sort', 'include', 'acceptMediatype', 'blendCards', 'page[cursor]') + or query_param_name.startswith('fields') + ) diff --git a/trove/openapi.py b/trove/openapi.py index 0ed880583..89c0bee67 100644 --- a/trove/openapi.py +++ b/trove/openapi.py @@ -46,7 +46,7 @@ def get_trove_openapi() -> dict[str, Any]: 'contact': { # 'name': # 'url': web-browsable version of this - 'email': 'share-support@osf.io', + 'email': settings.SHARE_SUPPORT_EMAIL, }, # 'license': 'version': get_shtrove_version(), diff --git a/trove/render/__init__.py b/trove/render/__init__.py index c5bf699a1..cd3189ef2 100644 --- a/trove/render/__init__.py +++ b/trove/render/__init__.py @@ -1,16 +1,17 @@ -from typing import Type - from django import http from trove import exceptions as trove_exceptions +from trove.vocab.mediatypes import strip_mediatype_parameters from ._base import BaseRenderer from .jsonapi import RdfJsonapiRenderer from .html_browse import 
RdfHtmlBrowseRenderer from .turtle import RdfTurtleRenderer from .jsonld import RdfJsonldRenderer -from .simple_csv import TrovesearchSimpleCsvRenderer -from .simple_json import TrovesearchSimpleJsonRenderer -from .simple_tsv import TrovesearchSimpleTsvRenderer +from .cardsearch_rss import CardsearchRssRenderer +from .cardsearch_atom import CardsearchAtomRenderer +from .trovesearch_csv import TrovesearchCsvRenderer +from .trovesearch_json import TrovesearchJsonRenderer +from .trovesearch_tsv import TrovesearchTsvRenderer __all__ = ('get_renderer_type', 'BaseRenderer') @@ -20,20 +21,20 @@ RdfJsonapiRenderer, RdfTurtleRenderer, RdfJsonldRenderer, - TrovesearchSimpleCsvRenderer, - TrovesearchSimpleJsonRenderer, - TrovesearchSimpleTsvRenderer, + TrovesearchCsvRenderer, + TrovesearchJsonRenderer, + TrovesearchTsvRenderer, +) +CARDSEARCH_ONLY_RENDERERS = ( # TODO: use/consider + CardsearchRssRenderer, + CardsearchAtomRenderer, ) - -RendersType = Type[ - BaseRenderer | RdfHtmlBrowseRenderer | RdfJsonapiRenderer | RdfTurtleRenderer | RdfJsonldRenderer | TrovesearchSimpleCsvRenderer | TrovesearchSimpleJsonRenderer | TrovesearchSimpleTsvRenderer -] RENDERER_BY_MEDIATYPE = { _renderer_type.MEDIATYPE: _renderer_type for _renderer_type in RENDERERS } -DEFAULT_RENDERER_TYPE = RdfJsonapiRenderer # the most stable one +DEFAULT_RENDERER_TYPE = RdfJsonapiRenderer # the most stable one? 
def get_renderer_type(request: http.HttpRequest) -> type[BaseRenderer]: @@ -42,7 +43,9 @@ def get_renderer_type(request: http.HttpRequest) -> type[BaseRenderer]: _requested_mediatype = request.GET.get('acceptMediatype') if _requested_mediatype: try: - _chosen_renderer_type = RENDERER_BY_MEDIATYPE[_requested_mediatype] + _chosen_renderer_type = RENDERER_BY_MEDIATYPE[ + strip_mediatype_parameters(_requested_mediatype) + ] except KeyError: raise trove_exceptions.CannotRenderMediatype(_requested_mediatype) else: diff --git a/trove/render/_base.py b/trove/render/_base.py index 49a3a52ec..5facde0d4 100644 --- a/trove/render/_base.py +++ b/trove/render/_base.py @@ -13,7 +13,10 @@ from trove.vocab import mediatypes from trove.vocab.trove import TROVE_API_THESAURUS from trove.vocab.namespaces import namespaces_shorthand -from ._rendering import ProtoRendering, SimpleRendering +from .rendering import ( + EntireRendering, + ProtoRendering, +) @dataclasses.dataclass @@ -52,26 +55,16 @@ def response_tripledict(self) -> rdf.RdfTripleDictionary: # TODO: self.response_gathering.ask_all_about or a default ask... 
return self.response_gathering.leaf_a_record() - def simple_render_document(self) -> str: - raise NotImplementedError - + @abc.abstractmethod def render_document(self) -> ProtoRendering: - try: - _content = self.simple_render_document() - except NotImplementedError: - raise NotImplementedError(f'class "{type(self)}" must implement either `render_document` or `simple_render_document`') - else: - return SimpleRendering( # type: ignore[return-value] # until ProtoRendering(typing.Protocol) with py3.12 - mediatype=self.MEDIATYPE, - rendered_content=_content, - ) + raise NotImplementedError @classmethod def render_error_document(cls, error: trove_exceptions.TroveError) -> ProtoRendering: # may override, but default to jsonapi - return SimpleRendering( # type: ignore[return-value] # until ProtoRendering(typing.Protocol) with py3.12 + return EntireRendering( mediatype=mediatypes.JSONAPI, - rendered_content=json.dumps( + entire_content=json.dumps( {'errors': [{ # https://jsonapi.org/format/#error-objects 'status': error.http_status, 'code': error.error_location, diff --git a/trove/render/_html.py b/trove/render/_html.py deleted file mode 100644 index 6daa1e037..000000000 --- a/trove/render/_html.py +++ /dev/null @@ -1,69 +0,0 @@ -from __future__ import annotations -from collections.abc import Generator -import contextlib -import dataclasses -from xml.etree.ElementTree import ( - Element, - SubElement, -) -from typing import Any - -from primitive_metadata import primitive_rdf as rdf - - -__all__ = ('HtmlBuilder',) - - -@dataclasses.dataclass -class HtmlBuilder: - given_root: Element - _: dataclasses.KW_ONLY - _nested_elements: list[Element] = dataclasses.field(default_factory=list) - _heading_depth: int = 0 - - def __post_init__(self) -> None: - self._nested_elements.append(self.given_root) - - @property - def root_element(self) -> Element: - return self._nested_elements[0] - - @property - def _current_element(self) -> Element: - return self._nested_elements[-1] - - ### - # 
html-building helper methods - - @contextlib.contextmanager - def nest_h_tag(self, **kwargs: Any) -> Generator[Element]: - _outer_heading_depth = self._heading_depth - if not _outer_heading_depth: - self._heading_depth = 1 - elif _outer_heading_depth < 6: # h6 deepest - self._heading_depth += 1 - _h_tag = f'h{self._heading_depth}' - with self.nest(_h_tag, **kwargs) as _nested: - try: - yield _nested - finally: - self._heading_depth = _outer_heading_depth - - @contextlib.contextmanager - def nest(self, tag_name: str, attrs: dict | None = None) -> Generator[Element]: - _attrs = {**attrs} if attrs else {} - _nested_element = SubElement(self._current_element, tag_name, _attrs) - self._nested_elements.append(_nested_element) - try: - yield self._current_element - finally: - _popped_element = self._nested_elements.pop() - assert _popped_element is _nested_element - - def leaf(self, tag_name: str, *, text: str | None = None, attrs: dict | None = None) -> None: - _leaf_element = SubElement(self._current_element, tag_name, attrs or {}) - if isinstance(text, rdf.Literal): - # TODO: lang - _leaf_element.text = text.unicode_value - elif text is not None: - _leaf_element.text = text diff --git a/trove/render/_rendering.py b/trove/render/_rendering.py deleted file mode 100644 index 0de9b015a..000000000 --- a/trove/render/_rendering.py +++ /dev/null @@ -1,47 +0,0 @@ -import abc -import dataclasses -from typing import Iterator, Generator - -from trove import exceptions as trove_exceptions - - -class ProtoRendering(abc.ABC): - '''base class for all renderings - - (TODO: typing.Protocol (when py3.12+)) - ''' - - @property - @abc.abstractmethod - def mediatype(self) -> str: - '''`mediatype`: required readable attribute - ''' - raise NotImplementedError - - @abc.abstractmethod - def iter_content(self) -> Iterator[str | bytes | memoryview]: - '''`iter_content`: (only) required method - ''' - yield from () - - -@dataclasses.dataclass -class SimpleRendering: # implements ProtoRendering - 
mediatype: str - rendered_content: str = '' - - def iter_content(self) -> Generator[str]: - yield self.rendered_content - - -@dataclasses.dataclass -class StreamableRendering: # implements ProtoRendering - mediatype: str - content_stream: Iterator[str | bytes | memoryview] - _started_already: bool = False - - def iter_content(self) -> Iterator[str | bytes | memoryview]: - if self._started_already: - raise trove_exceptions.CannotRenderStreamTwice - self._started_already = True - yield from self.content_stream diff --git a/trove/render/_simple_trovesearch.py b/trove/render/_trovesearch_card_only.py similarity index 63% rename from trove/render/_simple_trovesearch.py rename to trove/render/_trovesearch_card_only.py index 36bc36c4b..f1bc3378e 100644 --- a/trove/render/_simple_trovesearch.py +++ b/trove/render/_trovesearch_card_only.py @@ -1,6 +1,8 @@ from __future__ import annotations -from collections.abc import Generator, Iterator +import abc +from collections.abc import Generator, Iterator, Sequence import json +import logging from typing import Any, TYPE_CHECKING from primitive_metadata import primitive_rdf as rdf @@ -9,42 +11,30 @@ from trove.vocab.jsonapi import JSONAPI_LINK_OBJECT from trove.vocab.namespaces import TROVE, RDF from ._base import BaseRenderer -from ._rendering import ProtoRendering, SimpleRendering if TYPE_CHECKING: from trove.util.json import JsonObject + from trove.render.rendering import ProtoRendering +_logger = logging.getLogger(__name__) -class SimpleTrovesearchRenderer(BaseRenderer): - '''for "simple" search api responses (including only result metadata) - (very entangled with trove/trovesearch/trovesearch_gathering.py) +class TrovesearchCardOnlyRenderer(BaseRenderer, abc.ABC): + '''for search api responses that include only metadata about results + + very entangled with trove/trovesearch/trovesearch_gathering.py and trove/derive/osfmap_json.py ''' PASSIVE_RENDER = False # knows the properties it cares about - _page_links: set[str] + 
INDEXCARD_DERIVER_IRI = TROVE['derive/osfmap_json'] # assumes osfmap_json + _page_links: set[str] # for use *after* iterating cards/card_pages __already_iterated_cards = False - def simple_unicard_rendering(self, card_iri: str, osfmap_json: JsonObject) -> str: - raise NotImplementedError - - def simple_multicard_rendering(self, cards: Iterator[tuple[str, JsonObject]]) -> str: - raise NotImplementedError + @abc.abstractmethod + def multicard_rendering(self, card_pages: Iterator[Sequence[tuple[str, JsonObject]]]) -> ProtoRendering: + raise NotImplementedError(f'{self.__class__.__name__} must implement `multicard_rendering`') def unicard_rendering(self, card_iri: str, osfmap_json: JsonObject) -> ProtoRendering: - return SimpleRendering( # type: ignore[return-value] - mediatype=self.MEDIATYPE, - rendered_content=self.simple_unicard_rendering(card_iri, osfmap_json), - ) - - def multicard_rendering(self, card_pages: Iterator[dict[str, JsonObject]]) -> ProtoRendering: - _cards = ( - (_card_iri, _card_contents) - for _page in card_pages - for _card_iri, _card_contents in _page.items() - ) - return SimpleRendering( # type: ignore[return-value] - mediatype=self.MEDIATYPE, - rendered_content=self.simple_multicard_rendering(_cards), - ) + _page = [(card_iri, osfmap_json)] + return self.multicard_rendering(card_pages=iter([_page])) def render_document(self) -> ProtoRendering: _focustypes = self.response_focus.type_iris @@ -57,7 +47,7 @@ def render_document(self) -> ProtoRendering: ) raise trove_exceptions.UnsupportedRdfType(_focustypes) - def _iter_card_pages(self) -> Generator[dict[str, JsonObject]]: + def _iter_card_pages(self) -> Generator[list[tuple[str, JsonObject]]]: assert not self.__already_iterated_cards self.__already_iterated_cards = True self._page_links = set() @@ -67,22 +57,22 @@ def _iter_card_pages(self) -> Generator[dict[str, JsonObject]]: if (RDF.type, JSONAPI_LINK_OBJECT) in _page: self._page_links.add(_page) elif rdf.is_container(_page): - _cardpage = [] - 
for _search_result in rdf.container_objects(_page): + _cardpage: list[tuple[str, JsonObject]] = [] + for _search_result_blanknode in rdf.container_objects(_page): try: _card = next( _obj - for _pred, _obj in _search_result + for _pred, _obj in _search_result_blanknode if _pred == TROVE.indexCard ) except StopIteration: pass # skip malformed else: - _cardpage.append(_card) - yield { - self._get_card_iri(_card): self._get_card_content(_card, _page_graph) - for _card in _cardpage - } + _cardpage.append(( + self._get_card_iri(_card), + self._get_card_content(_card, _page_graph), + )) + yield _cardpage def _get_card_iri(self, card: str | rdf.RdfBlanknode) -> str: return card if isinstance(card, str) else '' diff --git a/trove/render/cardsearch_atom.py b/trove/render/cardsearch_atom.py new file mode 100644 index 000000000..9d8188b1d --- /dev/null +++ b/trove/render/cardsearch_atom.py @@ -0,0 +1,83 @@ +from __future__ import annotations +import itertools +import typing + +from django.utils import timezone +from django.utils.translation import gettext as _ +from primitive_metadata import primitive_rdf as rdf + +from trove.render.rendering import EntireRendering +from trove.util.datetime import datetime_isoformat_z +from trove.util.json import ( + json_strs, + json_vals, + json_datetimes, +) +from trove.util.xml import XmlBuilder +from trove.vocab import mediatypes +from trove.vocab.trove import trove_indexcard_namespace +from ._trovesearch_card_only import TrovesearchCardOnlyRenderer + +if typing.TYPE_CHECKING: + from collections.abc import Iterator, Sequence + from trove.util.json import JsonObject + from trove.render.rendering import ProtoRendering + + +class CardsearchAtomRenderer(TrovesearchCardOnlyRenderer): + '''render card-search results into Atom following https://www.rfc-editor.org/rfc/rfc4287 + ''' + MEDIATYPE = mediatypes.ATOM + + def multicard_rendering(self, card_pages: Iterator[Sequence[tuple[str, JsonObject]]]) -> ProtoRendering: + def _strs(*path: str) -> 
Iterator[str]: + yield from json_strs(_osfmap_json, path, coerce_str=True) + + def _dates(*path: str) -> Iterator[str]: + yield from map(datetime_isoformat_z, json_datetimes(_osfmap_json, path)) + + _xb = XmlBuilder('feed', {'xmlns': 'http://www.w3.org/2005/Atom'}) + _xb.leaf('title', text=_('shtrove search results')) + _xb.leaf('subtitle', text=_('feed of metadata records matching given filters')) + _xb.leaf('link', {'href': self.response_focus.single_iri()}) + _xb.leaf('id', text=self.response_focus.single_iri()) + _xb.leaf('updated', text=datetime_isoformat_z(timezone.now())) + for _card_iri, _osfmap_json in itertools.chain.from_iterable(card_pages): + with _xb.nest('entry'): + _iri = _osfmap_json.get('@id', _card_iri) + _xb.leaf('link', {'href': _iri}) + _xb.leaf('id', text=self._atom_id(_card_iri)) + for _title in _strs('title'): + _xb.leaf('title', text=_title) + for _filename in _strs('fileName'): + _xb.leaf('title', text=_filename) + for _desc in _strs('description'): + _xb.leaf('summary', text=_desc) + for _keyword in _strs('keyword'): + _xb.leaf('category', {'term': _keyword}) + for _created in _dates('dateCreated'): + _xb.leaf('published', text=_created) + for _modified in _dates('dateModified'): + _xb.leaf('updated', text=_modified) + _creator_objs = list(json_vals(_osfmap_json, ['creator'])) + if not _creator_objs: + _creator_objs = list(json_vals(_osfmap_json, ['isContainedBy', 'creator'])) + for _creator_obj in _creator_objs: + assert isinstance(_creator_obj, dict) + with _xb.nest('author'): + for _name in json_strs(_creator_obj, ['name']): + _xb.leaf('name', text=_name) + _creator_iri = _creator_obj.get('@id') + if _creator_iri: + _xb.leaf('uri', text=_creator_iri) + return EntireRendering( + mediatype=self.MEDIATYPE, + entire_content=bytes(_xb), + ) + + def _atom_id(self, card_iri: str) -> str: + try: + _uuid = rdf.iri_minus_namespace(card_iri, namespace=trove_indexcard_namespace()) + except ValueError: + return card_iri + return 
f'urn:uuid:{_uuid}' diff --git a/trove/render/cardsearch_rss.py b/trove/render/cardsearch_rss.py new file mode 100644 index 000000000..2d93ea54a --- /dev/null +++ b/trove/render/cardsearch_rss.py @@ -0,0 +1,75 @@ +from __future__ import annotations +from email.utils import format_datetime as rfc2822_datetime +import itertools +import typing + +from django.conf import settings +from django.utils.translation import gettext as _ + +from trove.render.rendering import EntireRendering +from trove.util.json import ( + json_datetimes, + json_vals, + json_strs, +) +from trove.util.xml import XmlBuilder +from trove.vocab import mediatypes +from ._trovesearch_card_only import TrovesearchCardOnlyRenderer + +if typing.TYPE_CHECKING: + from collections.abc import Iterator, Sequence + from trove.util.json import JsonObject + from trove.render.rendering import ProtoRendering + + +class CardsearchRssRenderer(TrovesearchCardOnlyRenderer): + '''render card-search results into RSS following https://www.rssboard.org/rss-specification + ''' + MEDIATYPE = mediatypes.RSS + + def multicard_rendering(self, card_pages: Iterator[Sequence[tuple[str, JsonObject]]]) -> ProtoRendering: + def _strs(*path: str) -> Iterator[str]: + yield from json_strs(_osfmap_json, path, coerce_str=True) + + def _dates(*path: str) -> Iterator[str]: + for _dt in json_datetimes(_osfmap_json, path): + yield rfc2822_datetime(_dt) + + _xb = XmlBuilder('rss', { + 'version': '2.0', + 'xmlns:dc': 'http://purl.org/dc/elements/1.1/', + 'xmlns:atom': 'http://www.w3.org/2005/Atom', + }) + with _xb.nest('channel'): + # see https://www.rssboard.org/rss-specification#requiredChannelElements + _xb.leaf('title', text=_('shtrove search results')) + _xb.leaf('link', text=self.response_focus.single_iri()) + _xb.leaf('atom:link', { + 'rel': 'self', + 'href': self.response_focus.single_iri(), + }) + _xb.leaf('description', text=_('feed of metadata records matching given filters')) + _xb.leaf('webMaster', 
text=settings.SHARE_SUPPORT_EMAIL) + for _card_iri, _osfmap_json in itertools.chain.from_iterable(card_pages): + with _xb.nest('item'): + # see https://www.rssboard.org/rss-specification#hrelementsOfLtitemgt + _iri = _osfmap_json.get('@id', _card_iri) + _xb.leaf('link', text=_iri) + _xb.leaf('guid', {'isPermaLink': 'true'}, text=_iri) + _titles = itertools.chain(_strs('title'), _strs('fileName')) + _xb.leaf('title', text=next(_titles, '')) + for _desc in _strs('description'): + _xb.leaf('description', text=_desc) + for _keyword in _strs('keyword'): + _xb.leaf('category', text=_keyword) + for _created_date in _dates('dateCreated'): + _xb.leaf('pubDate', text=_created_date) + for _creator_obj in json_vals(_osfmap_json, ['creator']): + assert isinstance(_creator_obj, dict) + _creator_name = next(json_strs(_creator_obj, ['name'])) + _creator_id = _creator_obj.get('@id', _creator_name) + _xb.leaf('dc:creator', text=f'{_creator_id} ({_creator_name})') + return EntireRendering( + mediatype=self.MEDIATYPE, + entire_content=bytes(_xb), + ) diff --git a/trove/render/html_browse.py b/trove/render/html_browse.py index 1f5bffd6f..bb5d3c650 100644 --- a/trove/render/html_browse.py +++ b/trove/render/html_browse.py @@ -1,7 +1,4 @@ -from collections.abc import ( - Iterator, - Generator, -) +from collections.abc import Generator import contextlib import dataclasses import datetime @@ -12,11 +9,9 @@ from urllib.parse import quote, urlsplit, urlunsplit from xml.etree.ElementTree import ( Element, - tostring as etree_tostring, fromstring as etree_fromstring, ) -from django.conf import settings from django.contrib.staticfiles.storage import staticfiles_storage from django.http import QueryDict from django.urls import reverse @@ -24,14 +19,22 @@ import markdown2 from primitive_metadata import primitive_rdf as rdf +from trove.links import ( + trove_browse_link, + is_local_url, +) +from trove.util.html import HtmlBuilder from trove.util.iris import get_sufficiently_unique_iri from 
trove.util.randomness import shuffled from trove.vocab import mediatypes -from trove.vocab.namespaces import RDF, RDFS, SKOS, DCTERMS, FOAF, DC +from trove.vocab import jsonapi +from trove.vocab.namespaces import RDF, RDFS, SKOS, DCTERMS, FOAF, DC, OSFMAP, TROVE from trove.vocab.static_vocab import combined_thesaurus__suffuniq -from trove.vocab.trove import trove_browse_link from ._base import BaseRenderer -from ._html import HtmlBuilder +from .rendering import ( + EntireRendering, + ProtoRendering, +) STABLE_MEDIATYPES = (mediatypes.JSONAPI,) UNSTABLE_MEDIATYPES = ( @@ -42,6 +45,11 @@ mediatypes.TSV, mediatypes.CSV, ) +SEARCHONLY_MEDIATYPES = frozenset(( + mediatypes.JSON, + mediatypes.TSV, + mediatypes.CSV, +)) _LINK_TEXT_PREDICATES = ( SKOS.prefLabel, @@ -50,52 +58,64 @@ DCTERMS.title, DC.title, FOAF.name, + OSFMAP.fileName, ) _IMPLICIT_DATATYPES = frozenset(( RDF.string, RDF.langString, )) +_PREDICATES_RENDERED_SPECIAL = frozenset(( + RDF.type, +)) +_PRIMITIVE_LITERAL_TYPES = (float, int, datetime.date) _QUERYPARAM_SPLIT_RE = re.compile(r'(?=[?&])') _PHI = (math.sqrt(5) + 1) / 2 -_HTML_DOCTYPE = '' - @dataclasses.dataclass class RdfHtmlBrowseRenderer(BaseRenderer): - MEDIATYPE: ClassVar[str] = 'text/html; charset=utf-8' - __current_data: rdf.RdfTripleDictionary = dataclasses.field(init=False) + MEDIATYPE: ClassVar[str] = mediatypes.HTML + __current_data: rdf.RdfGraph = dataclasses.field(init=False) __visiting_iris: set[str] = dataclasses.field(init=False) __hb: HtmlBuilder = dataclasses.field(init=False) __last_hue_turn: float = dataclasses.field(default_factory=random.random) def __post_init__(self) -> None: # TODO: lang (according to request -- also translate) - self.__current_data = self.response_tripledict + self.__current_data = self.response_data self.__visiting_iris = set() @property def is_data_blended(self) -> bool | None: return self.response_gathering.gatherer_kwargs.get('blend_cards') + @property + def is_search(self) -> bool: + return not 
self.response_focus.type_iris.isdisjoint(( + TROVE.Cardsearch, + TROVE.Valuesearch, + )) + # override BaseRenderer - def simple_render_document(self) -> str: - self.__hb = HtmlBuilder(Element('html')) + def render_document(self) -> ProtoRendering: + return EntireRendering(self.MEDIATYPE, self.render_html_str()) + + def render_html_str(self) -> str: + self.__hb = HtmlBuilder() self.render_html_head() - _body_attrs = { - 'class': 'BrowseWrapper', - 'style': self._hue_turn_css(), - } - with self.__hb.nest('body', attrs=_body_attrs): + with ( + self._hue_turn_css() as _hue_turn_style, + self.__hb.nest('body', attrs={ + 'class': 'BrowseWrapper', + 'style': _hue_turn_style, + }), + ): self.render_nav() self.render_main() self.render_footer() - return '\n'.join(( - _HTML_DOCTYPE, - etree_tostring(self.__hb.root_element, encoding='unicode', method='html'), - )) + return self.__hb.as_html_doc() def render_html_head(self) -> None: with self.__hb.nest('head'): @@ -123,7 +143,10 @@ def render_footer(self) -> None: def __alternate_mediatypes_card(self) -> None: with self.__nest_card('details'): self.__hb.leaf('summary', text=_('alternate mediatypes')) - for _mediatype in shuffled((*STABLE_MEDIATYPES, *UNSTABLE_MEDIATYPES)): + _linked_mediatypes = {*STABLE_MEDIATYPES, *UNSTABLE_MEDIATYPES} + if not self.is_search: + _linked_mediatypes -= SEARCHONLY_MEDIATYPES + for _mediatype in shuffled(_linked_mediatypes): with self.__hb.nest('span', attrs={'class': 'Browse__literal'}): self.__mediatype_link(_mediatype) @@ -153,67 +176,69 @@ def __mediatype_link(self, mediatype: str) -> None: with self.__hb.nest('a', attrs={'href': reverse('trove:docs')}) as _link: _link.text = _('(stable for documented use)') - def __render_subj(self, subj_iri: str, *, start_collapsed: bool | None = None) -> None: - _twopledict = self.__current_data.get(subj_iri, {}) - with self.__visiting(subj_iri): + def __render_subj(self, subj_iri: str, *, include_details: bool = True) -> None: + with 
self.__visiting(subj_iri) as _h_tag: with self.__nest_card('article'): with self.__hb.nest('header'): - _compact = self.iri_shorthand.compact_iri(subj_iri) - _is_compactable = (_compact != subj_iri) - _should_link = (subj_iri not in self.response_focus.iris) - with self.__hb.nest_h_tag(attrs={'id': quote(subj_iri)}) as _h: - if _should_link: - with self.__nest_link(subj_iri) as _link: - if _is_compactable: - _link.text = _compact - else: - self.__split_iri_pre(subj_iri) + with self.__hb.nest(_h_tag, attrs={'id': quote(subj_iri)}): + if self.__is_focus(subj_iri): + self.__split_iri_pre(subj_iri) else: - if _is_compactable: - _h.text = _compact - else: + with self.__nest_link(subj_iri): self.__split_iri_pre(subj_iri) self.__iri_subheaders(subj_iri) - if _twopledict: - with self.__hb.nest('details') as _details: - _detail_depth = sum((_el.tag == 'details') for _el in self.__hb._nested_elements) - _should_open = ( - _detail_depth < 3 - if start_collapsed is None - else not start_collapsed - ) - if _should_open: - _details.set('open', '') + if self.__is_focus(subj_iri): + self.__hb.leaf('pre', text=subj_iri) + if include_details and (_twopledict := self.__current_data.tripledict.get(subj_iri, {})): + _details_attrs = ( + {'open': ''} + if (self.__is_focus(subj_iri) or is_local_url(subj_iri)) + else {} + ) + with self.__hb.nest('details', _details_attrs): self.__hb.leaf('summary', text=_('more details...')) self.__twoples(_twopledict) def __twoples(self, twopledict: rdf.RdfTwopleDictionary) -> None: with self.__hb.nest('dl', {'class': 'Browse__twopleset'}): - for _pred, _obj_set in shuffled(twopledict.items()): + for _pred, _obj_set in self.__order_twopledict(twopledict): with self.__hb.nest('dt', attrs={'class': 'Browse__predicate'}): self.__compact_link(_pred) for _text in self.__iri_thesaurus_labels(_pred): self.__literal(_text) with self.__hb.nest('dd'): - for _obj in shuffled(_obj_set): + for _obj in _obj_set: self.__obj(_obj) + def __order_twopledict(self, 
twopledict: rdf.RdfTwopleDictionary) -> Generator[tuple[str, list[rdf.RdfObject]]]: + _items_with_sorted_objs = ( + (_pred, sorted(_obj_set, key=_obj_ordering_key)) + for _pred, _obj_set in twopledict.items() + if _pred not in _PREDICATES_RENDERED_SPECIAL + ) + yield from sorted( + _items_with_sorted_objs, + key=lambda _item: _obj_ordering_key(_item[1][0]), + ) + def __obj(self, obj: rdf.RdfObject) -> None: if isinstance(obj, str): # iri # TODO: detect whether indexcard? - if (obj in self.__current_data) and (obj not in self.__visiting_iris): + if (obj in self.__current_data.tripledict) and (obj not in self.__visiting_iris): self.__render_subj(obj) else: with self.__hb.nest('article', attrs={'class': 'Browse__object'}): self.__iri_link_and_labels(obj) elif isinstance(obj, frozenset): # blanknode - if (RDF.type, RDF.Seq) in obj: + if _is_jsonapi_link_obj(obj): + self.__jsonapi_link_obj(obj) + elif _is_sequence_obj(obj): self.__sequence(obj) else: self.__blanknode(obj) elif isinstance(obj, rdf.Literal): self.__literal(obj, is_rdf_object=True) - elif isinstance(obj, (float, int, datetime.date)): + elif isinstance(obj, _PRIMITIVE_LITERAL_TYPES): self.__literal(rdf.literal(obj), is_rdf_object=True) elif isinstance(obj, rdf.QuotedGraph): self.__quoted_graph(obj) @@ -240,7 +265,7 @@ def __literal( if _is_markdown: # TODO: tests for safe_mode _html = markdown2.markdown(_lit.unicode_value, safe_mode='escape') - self.__hb._current_element.append(etree_fromstring(f'{_html}')) + self.__hb.current_element.append(etree_fromstring(f'{_html}')) else: self.__hb.leaf('q', text=_lit) @@ -255,8 +280,16 @@ def __sequence(self, sequence_twoples: frozenset[rdf.RdfTwople]) -> None: self.__obj(_seq_obj) def __quoted_graph(self, quoted_graph: rdf.QuotedGraph) -> None: - with self.__quoted_data(quoted_graph.tripledict): - self.__render_subj(quoted_graph.focus_iri) # , start_collapsed=True) + _should_include_details = ( + self.__is_focus(quoted_graph.focus_iri) + or (( # primary topic of 
response focus + self.response_focus.single_iri(), + FOAF.primaryTopic, + quoted_graph.focus_iri, + ) in self.response_data) + ) + with self.__quoted_data(quoted_graph): + self.__render_subj(quoted_graph.focus_iri, include_details=_should_include_details) def __blanknode(self, blanknode: rdf.RdfTwopleDictionary | frozenset) -> None: _twopledict = ( @@ -264,28 +297,46 @@ def __blanknode(self, blanknode: rdf.RdfTwopleDictionary | frozenset) -> None: if isinstance(blanknode, dict) else rdf.twopledict_from_twopleset(blanknode) ) - with self.__hb.nest('details', attrs={ - 'open': '', - 'class': 'Browse__blanknode Browse__object', - 'style': self._hue_turn_css(), - }): - self.__hb.leaf('summary', text='(blank node)') + with ( + self._hue_turn_css() as _hue_turn_style, + self.__hb.nest('details', attrs={ + 'open': '', + 'class': 'Browse__blanknode Browse__object', + 'style': _hue_turn_style, + }), + ): + with self.__hb.nest('summary'): + for _type_iri in _twopledict.get(RDF.type, ()): + self.__compact_link(_type_iri) self.__twoples(_twopledict) + def __jsonapi_link_obj(self, twopleset: frozenset[rdf.RdfTwople]) -> None: + _iri = next( + (str(_obj) for (_pred, _obj) in twopleset if _pred == RDF.value), + '', + ) + _text = next( + (_obj.unicode_value for (_pred, _obj) in twopleset if _pred == jsonapi.JSONAPI_MEMBERNAME), + '', + ) + with self.__nest_link(_iri, attrs={'class': 'Browse__blanknode Browse__object'}) as _a: + _a.text = _('link: %(linktext)s') % {'linktext': _text} + def __split_iri_pre(self, iri: str) -> None: - self.__hb.leaf('pre', text='\n'.join(self.__iri_lines(iri))) + self.__hb.leaf('pre', text='\n'.join(self.__iri_display_lines(iri))) @contextlib.contextmanager - def __visiting(self, iri: str) -> Iterator[None]: + def __visiting(self, iri: str) -> Generator[str]: assert iri not in self.__visiting_iris self.__visiting_iris.add(iri) try: - yield + with self.__hb.deeper_heading() as _h_tag: + yield _h_tag finally: self.__visiting_iris.remove(iri) 
@contextlib.contextmanager - def __quoted_data(self, quoted_data: dict) -> Generator[None]: + def __quoted_data(self, quoted_data: rdf.RdfGraph) -> Generator[None]: _outer_data = self.__current_data _outer_visiting_iris = self.__visiting_iris self.__current_data = quoted_data @@ -301,27 +352,32 @@ def __iri_link_and_labels(self, iri: str) -> None: for _text in self.__iri_thesaurus_labels(iri): self.__literal(_text) - def __nest_link(self, iri: str) -> contextlib.AbstractContextManager[Element]: + def __nest_link(self, iri: str, attrs: dict[str, str] | None = None) -> contextlib.AbstractContextManager[Element]: _href = ( iri - if _is_local_url(iri) + if is_local_url(iri) else trove_browse_link(iri) ) - return self.__hb.nest('a', attrs={'href': _href}) + return self.__hb.nest('a', attrs={**(attrs or {}), 'href': _href}) def __compact_link(self, iri: str) -> Element: with self.__nest_link(iri) as _a: - _a.text = self.iri_shorthand.compact_iri(iri) + _a.text = ''.join(self.__iri_display_lines(iri)) return _a - def __nest_card(self, tag: str) -> contextlib.AbstractContextManager[Element]: - return self.__hb.nest( - tag, - attrs={ - 'class': 'Browse__card', - 'style': self._hue_turn_css(), - }, - ) + @contextlib.contextmanager + def __nest_card(self, tag: str) -> Generator[Element]: + with ( + self._hue_turn_css() as _hue_turn_style, + self.__hb.nest( + tag, + attrs={ + 'class': 'Browse__card', + 'style': _hue_turn_style, + }, + ) as _element, + ): + yield _element def __iri_thesaurus_labels(self, iri: str) -> list[str]: # TODO: consider requested language @@ -331,20 +387,25 @@ def __iri_thesaurus_labels(self, iri: str) -> list[str]: if _thesaurus_entry: for _pred in _LINK_TEXT_PREDICATES: _labels.update(_thesaurus_entry.get(_pred, ())) - _twoples = self.__current_data.get(iri) + _twoples = self.__current_data.tripledict.get(iri) if _twoples: for _pred in _LINK_TEXT_PREDICATES: _labels.update(_twoples.get(_pred, ())) return shuffled(_labels) - def _hue_turn_css(self) -> 
str: - _hue_turn = (self.__last_hue_turn + _PHI) % 1.0 + @contextlib.contextmanager + def _hue_turn_css(self) -> Generator[str]: + _prior_turn = self.__last_hue_turn + _hue_turn = (_prior_turn + _PHI) % 1.0 self.__last_hue_turn = _hue_turn - return f'--hue-turn: {_hue_turn}turn;' + try: + yield f'--hue-turn: {_hue_turn}turn;' + finally: + self.__last_hue_turn = _prior_turn def _queryparam_href(self, param_name: str, param_value: str | None) -> str: _base_url = self.response_focus.single_iri() - if not _is_local_url(_base_url): + if not is_local_url(_base_url): _base_url = trove_browse_link(_base_url) (_scheme, _netloc, _path, _query, _fragment) = urlsplit(_base_url) _qparams = QueryDict(_query, mutable=True) @@ -364,26 +425,34 @@ def _queryparam_href(self, param_name: str, param_value: str | None) -> str: )) def __iri_subheaders(self, iri: str) -> None: - _type_iris = self.__current_data.get(iri, {}).get(RDF.type, ()) - if _type_iris: - for _type_iri in _type_iris: - self.__compact_link(_type_iri) + for _type_iri in self.__current_data.q(iri, RDF.type): + self.__compact_link(_type_iri) _labels = self.__iri_thesaurus_labels(iri) if _labels: for _label in _labels: self.__literal(_label) - def __iri_lines(self, iri: str) -> Iterator[str]: - (_scheme, _netloc, _path, _query, _fragment) = urlsplit(iri) - yield ( - f'://{_netloc}{_path}' - if _netloc - else f'{_scheme}:{_path}' - ) - if _query: - yield from filter(bool, _QUERYPARAM_SPLIT_RE.split(f'?{_query}')) - if _fragment: - yield f'#{_fragment}' + def __iri_display_lines(self, iri: str) -> Generator[str]: + _compact = self.iri_shorthand.compact_iri(iri) + if _compact != iri: + yield _compact + else: + (_scheme, _netloc, _path, _query, _fragment) = urlsplit(iri) + # first line with path + if is_local_url(iri): + yield f'/{_path.lstrip('/')}' + elif _netloc: + yield f'://{_netloc}{_path}' + else: + yield f'{_scheme}:{_path}' + # query and fragment separate + if _query: + yield from filter(bool, 
_QUERYPARAM_SPLIT_RE.split(f'?{_query}')) + if _fragment: + yield f'#{_fragment}' + + def __is_focus(self, iri: str) -> bool: + return (iri in self.response_focus.iris) def _append_class(el: Element, element_class: str) -> None: @@ -393,5 +462,23 @@ def _append_class(el: Element, element_class: str) -> None: ) -def _is_local_url(iri: str) -> bool: - return iri.startswith(settings.SHARE_WEB_URL) +def _is_sequence_obj(obj: rdf.RdfObject) -> bool: + return ( + isinstance(obj, frozenset) + and (RDF.type, RDF.Seq) in obj + ) + + +def _is_jsonapi_link_obj(obj: rdf.RdfObject) -> bool: + return ( + isinstance(obj, frozenset) + and (RDF.type, jsonapi.JSONAPI_LINK_OBJECT) in obj + ) + + +def _obj_ordering_key(obj: rdf.RdfObject) -> tuple[bool, ...]: + return ( + not isinstance(obj, (rdf.Literal, *_PRIMITIVE_LITERAL_TYPES)), # literal values first + not isinstance(obj, str), # iris next + _is_jsonapi_link_obj(obj), # jsonapi link objects last + ) diff --git a/trove/render/jsonapi.py b/trove/render/jsonapi.py index e60fc2338..11a78708c 100644 --- a/trove/render/jsonapi.py +++ b/trove/render/jsonapi.py @@ -7,13 +7,17 @@ import itertools import json import time -from typing import Iterable, Union, List, Any, Dict, Tuple, Iterator +from typing import Iterable, Union, Any, Iterator -from typing import Optional from primitive_metadata import primitive_rdf from trove import exceptions as trove_exceptions +from trove.util.json import ( + JsonObject, + JsonValue, +) from trove.vocab.jsonapi import ( + JSONAPI_LINK, JSONAPI_MEMBERNAME, JSONAPI_RELATIONSHIP, JSONAPI_ATTRIBUTE, @@ -29,6 +33,10 @@ ) from trove.vocab.trove import trove_indexcard_namespace from ._base import BaseRenderer +from .rendering import ( + EntireRendering, + ProtoRendering, +) # a jsonapi resource may pull rdf data using an iri or blank node @@ -38,15 +46,11 @@ def _resource_ids_defaultdict() -> defaultdict[Any, str]: _prefix = str(time.time_ns()) - _ints = itertools.count() - - def _iter_ids() -> Iterator[str]: - 
while True: - _id = next(_ints) - yield f'{_prefix}-{_id}' - - _ids = _iter_ids() - return defaultdict(lambda: next(_ids)) + _infinite_ids = ( + f'{_prefix}-{_id}' + for _id in itertools.count() + ) + return defaultdict(_infinite_ids.__next__) @dataclasses.dataclass @@ -84,15 +88,16 @@ class RdfJsonapiRenderer(BaseRenderer): def get_deriver_iri(cls, card_blending: bool) -> str | None: return (None if card_blending else super().get_deriver_iri(card_blending)) - def simple_render_document(self) -> str: - return json.dumps( + def render_document(self) -> ProtoRendering: + _json_str = json.dumps( self.render_dict(self.response_focus.single_iri()), indent=2, # TODO: pretty-print query param? ) + return EntireRendering(self.MEDIATYPE, _json_str) - def render_dict(self, primary_iris: Union[str, Iterable[str]]) -> dict[str, Any]: - _primary_data: dict | list | None = None - _included_data = [] + def render_dict(self, primary_iris: Union[str, Iterable[str]]) -> JsonObject: + _primary_data: JsonValue = None + _included_data: list[JsonValue] = [] with self._contained__to_include() as _to_include: if isinstance(primary_iris, str): _already_included = {primary_iris} @@ -108,26 +113,37 @@ def render_dict(self, primary_iris: Union[str, Iterable[str]]) -> dict[str, Any] if _next not in _already_included: _already_included.add(_next) _included_data.append(self.render_resource_object(_next)) - _document = {'data': _primary_data} + _document: JsonObject = {'data': _primary_data} if _included_data: _document['included'] = _included_data return _document - def render_resource_object(self, iri_or_blanknode: _IriOrBlanknode) -> dict[str, Any]: - _resource_object = {**self.render_identifier_object(iri_or_blanknode)} + def render_resource_object(self, iri_or_blanknode: _IriOrBlanknode) -> JsonObject: + _resource_object: JsonObject = {**self.render_identifier_object(iri_or_blanknode)} _twopledict = ( (self.response_data.tripledict.get(iri_or_blanknode) or {}) if isinstance(iri_or_blanknode, 
str) else primitive_rdf.twopledict_from_twopleset(iri_or_blanknode) ) + _links: JsonObject = {} for _pred, _obj_set in _twopledict.items(): - if _pred != RDF.type: - self._render_field(_pred, _obj_set, into=_resource_object) + if _pred == JSONAPI_LINK: + _links.update( + self._render_link_object(_link_obj) + for _link_obj in _obj_set + ) + elif _pred != RDF.type: + _doc_key, _field_key, _field_value = self._render_field(_pred, _obj_set) + _doc_obj = _resource_object.setdefault(_doc_key, {}) + assert isinstance(_doc_obj, dict) + _doc_obj[_field_key] = _field_value if isinstance(iri_or_blanknode, str): - _resource_object.setdefault('links', {})['self'] = iri_or_blanknode + _links['self'] = iri_or_blanknode + if _links: + _resource_object['links'] = _links return _resource_object - def render_identifier_object(self, iri_or_blanknode: _IriOrBlanknode) -> Any | dict[str, Any]: + def render_identifier_object(self, iri_or_blanknode: _IriOrBlanknode) -> JsonObject: try: return self._identifier_object_cache[iri_or_blanknode] except KeyError: @@ -156,7 +172,7 @@ def render_identifier_object(self, iri_or_blanknode: _IriOrBlanknode) -> Any | d self._identifier_object_cache[iri_or_blanknode] = _id_obj return _id_obj - def _single_typename(self, type_iris: list[str]) -> Optional[str]: + def _single_typename(self, type_iris: list[str]) -> str: if not type_iris: return '' if len(type_iris) == 1: @@ -168,7 +184,7 @@ def _single_typename(self, type_iris: list[str]) -> Optional[str]: return self._membername_for_iri(_type_iris[0]) return self._membername_for_iri(sorted(type_iris)[0]) - def _membername_for_iri(self, iri: str) -> Optional[str] | Any: + def _membername_for_iri(self, iri: str) -> str: try: _membername = next(self.thesaurus.q(iri, JSONAPI_MEMBERNAME)) except StopIteration: @@ -193,12 +209,12 @@ def _resource_id_for_iri(self, iri: str) -> Any: # as fallback, encode the iri into a valid jsonapi member name return base64.urlsafe_b64encode(iri.encode()).decode() - def 
_render_field(self, predicate_iri: str, object_set: Iterable[Any], *, into: dict[str, Any]) -> None: + def _render_field(self, predicate_iri: str, object_set: Iterable[Any]) -> tuple[str, str, JsonValue]: _is_relationship = (predicate_iri, RDF.type, JSONAPI_RELATIONSHIP) in self.thesaurus _is_attribute = (predicate_iri, RDF.type, JSONAPI_ATTRIBUTE) in self.thesaurus _field_key = self._membername_for_iri(predicate_iri) _doc_key = 'meta' # unless configured for jsonapi, default to unstructured 'meta' - if ':' not in _field_key: # type: ignore + if ':' not in _field_key: if _is_relationship: _doc_key = 'relationships' elif _is_attribute: @@ -207,10 +223,9 @@ def _render_field(self, predicate_iri: str, object_set: Iterable[Any], *, into: _fieldvalue = self._render_relationship_object(predicate_iri, object_set) else: _fieldvalue = self._one_or_many(predicate_iri, self._attribute_datalist(object_set)) # type: ignore - # update the given `into` resource object - into.setdefault(_doc_key, {})[_field_key] = _fieldvalue + return _doc_key, _field_key, _fieldvalue - def _one_or_many(self, predicate_iri: str, datalist: list[Any]) -> Union[list[Any], Any, None]: + def _one_or_many(self, predicate_iri: str, datalist: list[Any]) -> JsonValue: _only_one = (predicate_iri, RDF.type, OWL.FunctionalProperty) in self.thesaurus if _only_one: if len(datalist) > 1: @@ -218,19 +233,19 @@ def _one_or_many(self, predicate_iri: str, datalist: list[Any]) -> Union[list[An return datalist[0] if datalist else None return datalist - def _attribute_datalist(self, object_set: Iterable[Any]) -> List[Any]: + def _attribute_datalist(self, object_set: Iterable[Any]) -> list[Any]: return [ self._render_attribute_datum(_obj) for _obj in object_set ] def _render_relationship_object( - self, - predicate_iri: str, - object_set: Iterable[Union[frozenset[Any], str]] - ) -> Dict[str, Any]: + self, + predicate_iri: str, + object_set: Iterable[Union[frozenset[Any], str]] + ) -> JsonObject: _data = [] - _links = {} 
+ _links: JsonObject = {} for _obj in object_set: if isinstance(_obj, frozenset): if (RDF.type, RDF.Seq) in _obj: @@ -247,14 +262,14 @@ def _render_relationship_object( assert isinstance(_obj, str) _data.append(self.render_identifier_object(_obj)) self._pls_include(_obj) - _relationship_obj = { + _relationship_obj: JsonObject = { 'data': self._one_or_many(predicate_iri, _data), } if _links: _relationship_obj['links'] = _links return _relationship_obj - def _render_link_object(self, link_obj: frozenset[Tuple[Any, Any]]) -> Tuple[str, Dict[str, Any]]: + def _render_link_object(self, link_obj: frozenset[tuple[Any, Any]]) -> tuple[str, JsonObject]: _membername = next( _obj.unicode_value for _pred, _obj in link_obj @@ -296,14 +311,14 @@ def _pls_include(self, item: Any) -> None: if self.__to_include is not None: self.__to_include.add(item) - def _render_attribute_datum(self, rdfobject: primitive_rdf.RdfObject) -> dict[Any, Any] | list[Any] | str | float | int: + def _render_attribute_datum(self, rdfobject: primitive_rdf.RdfObject) -> JsonValue: if isinstance(rdfobject, frozenset): if (RDF.type, RDF.Seq) in rdfobject: return [ self._render_attribute_datum(_seq_obj) for _seq_obj in primitive_rdf.sequence_objects_in_order(rdfobject) ] - _json_blanknode = {} + _json_blanknode: JsonObject = {} for _pred, _obj_set in primitive_rdf.twopledict_from_twopleset(rdfobject).items(): _key = self._membername_for_iri(_pred) _json_blanknode[_key] = self._one_or_many(_pred, self._attribute_datalist(_obj_set)) diff --git a/trove/render/jsonld.py b/trove/render/jsonld.py index a7ca263c6..5c7299f1f 100644 --- a/trove/render/jsonld.py +++ b/trove/render/jsonld.py @@ -10,6 +10,10 @@ from trove.vocab.namespaces import RDF, OWL, TROVE from trove.vocab import mediatypes from ._base import BaseRenderer +from .rendering import ( + EntireRendering, + ProtoRendering, +) if TYPE_CHECKING: from trove.util.json import ( JsonObject, @@ -29,12 +33,13 @@ class RdfJsonldRenderer(BaseRenderer): 
__visiting_iris: set[str] | None = None - def simple_render_document(self) -> str: - return json.dumps( + def render_document(self) -> ProtoRendering: + _json_str = json.dumps( self.render_jsonld(self.response_data, self.response_focus.single_iri()), indent=2, sort_keys=True, ) + return EntireRendering(self.MEDIATYPE, _json_str) def render_jsonld( self, @@ -152,8 +157,7 @@ def _list_or_single_value(self, predicate_iri: str, objectlist: list[JsonValue]) (_only_obj,) = objectlist except ValueError: return None - else: - return _only_obj + return _only_obj if predicate_iri in _PREDICATES_OF_FLEXIBLE_CARDINALITY and len(objectlist) == 1: return objectlist[0] return sorted(objectlist, key=_naive_sort_key) diff --git a/trove/render/rendering/__init__.py b/trove/render/rendering/__init__.py new file mode 100644 index 000000000..9e8cb29b8 --- /dev/null +++ b/trove/render/rendering/__init__.py @@ -0,0 +1,4 @@ +from .proto import ProtoRendering +from .entire import EntireRendering + +__all__ = ('ProtoRendering', 'EntireRendering') diff --git a/trove/render/rendering/entire.py b/trove/render/rendering/entire.py new file mode 100644 index 000000000..45c7abc0f --- /dev/null +++ b/trove/render/rendering/entire.py @@ -0,0 +1,17 @@ +from collections.abc import Generator +import dataclasses + +from .proto import ProtoRendering + +__all__ = ('EntireRendering',) + + +@dataclasses.dataclass +class EntireRendering(ProtoRendering): + '''EntireRendering: for response content rendered in its entirety before being sent + ''' + mediatype: str + entire_content: str | bytes = '' + + def iter_content(self) -> Generator[str] | Generator[bytes]: + yield self.entire_content diff --git a/trove/render/rendering/html_wrapped.py b/trove/render/rendering/html_wrapped.py new file mode 100644 index 000000000..4aadaff58 --- /dev/null +++ b/trove/render/rendering/html_wrapped.py @@ -0,0 +1,22 @@ +import dataclasses +import html +from typing import Iterator + +from trove.vocab import mediatypes +from 
trove.util.html import HTML_DOCTYPE +from .proto import ProtoRendering + + +@dataclasses.dataclass +class HtmlWrappedRendering(ProtoRendering): + inner_rendering: ProtoRendering + mediatype: str = mediatypes.HTML + + def iter_content(self) -> Iterator[str]: + yield HTML_DOCTYPE + yield '
<html><body><pre>'
+        for _content in self.inner_rendering.iter_content():
+            if not isinstance(_content, str):
+                _content = _content.decode()
+            yield html.escape(_content)
+        yield '</pre></body></html>
' diff --git a/trove/render/rendering/proto.py b/trove/render/rendering/proto.py new file mode 100644 index 000000000..955940acb --- /dev/null +++ b/trove/render/rendering/proto.py @@ -0,0 +1,16 @@ +from typing import ( + Iterator, + Protocol, +) + +__all__ = ('ProtoRendering',) + + +class ProtoRendering(Protocol): + '''protocol for all renderings + ''' + mediatype: str # required attribute + + def iter_content(self) -> Iterator[str] | Iterator[bytes]: + '''`iter_content`: (only) required method + ''' diff --git a/trove/render/rendering/streamable.py b/trove/render/rendering/streamable.py new file mode 100644 index 000000000..c61ff6bcc --- /dev/null +++ b/trove/render/rendering/streamable.py @@ -0,0 +1,20 @@ +from collections.abc import Iterator +import dataclasses + +from trove import exceptions as trove_exceptions +from .proto import ProtoRendering + + +@dataclasses.dataclass +class StreamableRendering(ProtoRendering): + '''StreamableRendering: for response content that may be rendered incrementally while being streamed + ''' + mediatype: str + content_stream: Iterator[str] | Iterator[bytes] = iter(()) + _started_already: bool = False + + def iter_content(self) -> Iterator[str] | Iterator[bytes]: + if self._started_already: + raise trove_exceptions.CannotRenderStreamTwice + self._started_already = True + yield from self.content_stream diff --git a/trove/render/simple_tsv.py b/trove/render/simple_tsv.py deleted file mode 100644 index 30b01a8a6..000000000 --- a/trove/render/simple_tsv.py +++ /dev/null @@ -1,10 +0,0 @@ -import csv - -from trove.vocab import mediatypes - -from .simple_csv import TrovesearchSimpleCsvRenderer - - -class TrovesearchSimpleTsvRenderer(TrovesearchSimpleCsvRenderer): - MEDIATYPE = mediatypes.TSV - CSV_DIALECT = csv.excel_tab diff --git a/trove/render/simple_csv.py b/trove/render/trovesearch_csv.py similarity index 57% rename from trove/render/simple_csv.py rename to trove/render/trovesearch_csv.py index 52c9d700b..a6174f4f4 100644 --- 
a/trove/render/simple_csv.py +++ b/trove/render/trovesearch_csv.py @@ -2,31 +2,36 @@ from collections.abc import ( Generator, Iterator, - Iterable, Sequence, ) import csv +import dataclasses import functools import itertools -import dataclasses +import logging from typing import TYPE_CHECKING, ClassVar from trove.trovesearch.search_params import ( CardsearchParams, ValuesearchParams, ) +from trove.util.iter import iter_unique +from trove.util.json import json_prims from trove.util.propertypath import Propertypath, GLOB_PATHSTEP from trove.vocab import mediatypes from trove.vocab import osfmap -from trove.vocab.namespaces import TROVE -from ._simple_trovesearch import SimpleTrovesearchRenderer -from ._rendering import StreamableRendering, ProtoRendering +from ._trovesearch_card_only import TrovesearchCardOnlyRenderer +from .rendering import ProtoRendering +from .rendering.streamable import StreamableRendering if TYPE_CHECKING: from trove.util.trove_params import BasicTroveParams - from trove.util.json import JsonValue, JsonObject + from trove.util.json import ( + JsonObject, + JsonPath, + ) +_logger = logging.getLogger(__name__) -type Jsonpath = Sequence[str] # path of json keys type CsvValue = str | int | float | None _MULTIVALUE_DELIMITER = ' ; ' # possible improvement: smarter in-value delimiting? 
@@ -34,20 +39,16 @@ _ID_JSONPATH = ('@id',) -class TrovesearchSimpleCsvRenderer(SimpleTrovesearchRenderer): +class TrovesearchCsvRenderer(TrovesearchCardOnlyRenderer): MEDIATYPE = mediatypes.CSV - INDEXCARD_DERIVER_IRI = TROVE['derive/osfmap_json'] CSV_DIALECT: ClassVar[type[csv.Dialect]] = csv.excel - def unicard_rendering(self, card_iri: str, osfmap_json: JsonObject) -> ProtoRendering: - return self.multicard_rendering(card_pages=iter([{card_iri: osfmap_json}])) - - def multicard_rendering(self, card_pages: Iterator[dict[str, JsonObject]]) -> ProtoRendering: + def multicard_rendering(self, card_pages: Iterator[Sequence[tuple[str, JsonObject]]]) -> ProtoRendering: _doc = TabularDoc( card_pages, trove_params=getattr(self.response_focus, 'search_params', None), ) - return StreamableRendering( # type: ignore[return-value] + return StreamableRendering( mediatype=self.MEDIATYPE, content_stream=csv_stream(self.CSV_DIALECT, _doc.header(), _doc.rows()), ) @@ -66,22 +67,18 @@ def csv_stream( @dataclasses.dataclass class TabularDoc: - card_pages: Iterator[dict[str, JsonObject]] + card_pages: Iterator[Sequence[tuple[str, JsonObject]]] trove_params: BasicTroveParams | None = None _started: bool = False @functools.cached_property - def column_jsonpaths(self) -> tuple[Jsonpath, ...]: + def column_jsonpaths(self) -> tuple[JsonPath, ...]: _column_jsonpaths = ( _osfmap_jsonpath(_path) for _path in self._column_paths() ) return (_ID_JSONPATH, *_column_jsonpaths) - @functools.cached_property - def first_page(self) -> dict[str, JsonObject]: - return next(self.card_pages, {}) - def _column_paths(self) -> Iterator[Propertypath]: _pathlists: list[Sequence[Propertypath]] = [] if self.trove_params is not None: # hacks @@ -102,29 +99,16 @@ def _column_paths(self) -> Iterator[Propertypath]: _pathlists.append(_pathlist) if not _pathlists: _pathlists.append(osfmap.DEFAULT_TABULAR_SEARCH_COLUMN_PATHS) - return self.iter_unique(itertools.chain.from_iterable(_pathlists)) - - @staticmethod - def 
iter_unique[T](iterable: Iterable[T]) -> Generator[T]: - _seen = set() - for _item in iterable: - if _item not in _seen: - _seen.add(_item) - yield _item - - def _iter_card_pages(self) -> Generator[dict[str, JsonObject]]: - assert not self._started - self._started = True - if self.first_page: - yield self.first_page - yield from self.card_pages + return iter_unique(itertools.chain.from_iterable(_pathlists)) def header(self) -> list[CsvValue]: return ['.'.join(_path) for _path in self.column_jsonpaths] def rows(self) -> Generator[list[CsvValue]]: - for _page in self._iter_card_pages(): - for _card_iri, _osfmap_json in _page.items(): + assert not self._started + self._started = True + for _page in self.card_pages: + for _card_iri, _osfmap_json in _page: yield self._row_values(_osfmap_json) def _row_values(self, osfmap_json: JsonObject) -> list[CsvValue]: @@ -133,10 +117,11 @@ def _row_values(self, osfmap_json: JsonObject) -> list[CsvValue]: for _field_path in self.column_jsonpaths ] - def _row_field_value(self, osfmap_json: JsonObject, field_path: Jsonpath) -> CsvValue: + def _row_field_value(self, osfmap_json: JsonObject, field_path: JsonPath) -> CsvValue: _rendered_values = [ - _render_tabularly(_obj) - for _obj in _iter_values(osfmap_json, field_path) + _obj + for _obj in json_prims(osfmap_json, field_path, _VALUE_KEY_PREFERENCE) + if _obj is not None ] if len(_rendered_values) == 1: return _rendered_values[0] # preserve type for single numbers @@ -144,7 +129,7 @@ def _row_field_value(self, osfmap_json: JsonObject, field_path: Jsonpath) -> Csv return _MULTIVALUE_DELIMITER.join(map(str, _rendered_values)) -def _osfmap_jsonpath(iri_path: Propertypath) -> Jsonpath: +def _osfmap_jsonpath(iri_path: Propertypath) -> JsonPath: _shorthand = osfmap.osfmap_json_shorthand() return tuple( _shorthand.compact_iri(_pathstep) @@ -152,50 +137,6 @@ def _osfmap_jsonpath(iri_path: Propertypath) -> Jsonpath: ) -def _has_value(osfmap_json: JsonObject, path: Jsonpath) -> bool: - try: - 
next(_iter_values(osfmap_json, path)) - except StopIteration: - return False - else: - return True - - -def _iter_values(osfmap_json: JsonObject, path: Jsonpath) -> Generator[JsonValue]: - assert path - (_step, *_rest) = path - _val = osfmap_json.get(_step) - if _rest: - if isinstance(_val, dict): - yield from _iter_values(_val, _rest) - elif isinstance(_val, list): - for _val_obj in _val: - if isinstance(_val_obj, dict): - yield from _iter_values(_val_obj, _rest) - else: - if isinstance(_val, list): - yield from _val - elif _val is not None: - yield _val - - -def _render_tabularly(json_val: JsonValue) -> CsvValue: - if isinstance(json_val, (str, int, float)): - return json_val - if isinstance(json_val, dict): - for _key in _VALUE_KEY_PREFERENCE: - _val = json_val.get(_key) - if isinstance(_val, list): - return ( - _render_tabularly(_val[0]) - if _val - else None - ) - if _val is not None: - return _render_tabularly(_val) - return None - - class _Echo: '''a write-only file-like object, to convince `csv.csvwriter.writerow` to return strings diff --git a/trove/render/simple_json.py b/trove/render/trovesearch_json.py similarity index 63% rename from trove/render/simple_json.py rename to trove/render/trovesearch_json.py index 753d6ee6e..e5b4b4087 100644 --- a/trove/render/simple_json.py +++ b/trove/render/trovesearch_json.py @@ -1,4 +1,5 @@ from __future__ import annotations +import itertools import json import re import typing @@ -6,42 +7,54 @@ from primitive_metadata import primitive_rdf as rdf from trove.vocab.jsonapi import ( + JSONAPI_LINK, JSONAPI_LINK_OBJECT, JSONAPI_MEMBERNAME, ) from trove.vocab import mediatypes from trove.vocab.namespaces import TROVE, RDF -from ._rendering import StreamableRendering, ProtoRendering -from ._simple_trovesearch import SimpleTrovesearchRenderer +from .rendering import ( + ProtoRendering, + EntireRendering, +) +from .rendering.streamable import StreamableRendering +from ._trovesearch_card_only import TrovesearchCardOnlyRenderer 
if typing.TYPE_CHECKING: + from collections.abc import ( + Generator, + Iterator, + Sequence, + ) from trove.util.json import JsonObject -class TrovesearchSimpleJsonRenderer(SimpleTrovesearchRenderer): +class TrovesearchJsonRenderer(TrovesearchCardOnlyRenderer): '''for "simple json" search api -- very entangled with trove/trovesearch/trovesearch_gathering.py ''' MEDIATYPE = mediatypes.JSON - INDEXCARD_DERIVER_IRI = TROVE['derive/osfmap_json'] - def simple_unicard_rendering(self, card_iri: str, osfmap_json: dict[str, typing.Any]) -> str: - return json.dumps({ - 'data': self._render_card_content(card_iri, osfmap_json), - 'links': self._render_links(), - 'meta': self._render_meta(), - }, indent=2) + def unicard_rendering(self, card_iri: str, osfmap_json: JsonObject) -> ProtoRendering: + return EntireRendering( + mediatype=self.MEDIATYPE, + entire_content=json.dumps({ + 'data': self._render_card_content(card_iri, osfmap_json), + 'links': self._render_links(), + 'meta': self._render_meta(), + }, indent=2), + ) - def multicard_rendering(self, card_pages: typing.Iterator[dict[str, dict[str, typing.Any]]]) -> ProtoRendering: - return StreamableRendering( # type: ignore[return-value] + def multicard_rendering(self, card_pages: Iterator[Sequence[tuple[str, JsonObject]]]) -> ProtoRendering: + return StreamableRendering( mediatype=self.MEDIATYPE, content_stream=self._stream_json(card_pages), ) - def _stream_json(self, card_pages: typing.Iterator[dict[str, typing.Any]]) -> typing.Generator[str]: + def _stream_json(self, card_pages: Iterator[Sequence[tuple[str, JsonObject]]]) -> Generator[str]: _prefix = '{"data": [' yield _prefix _datum_prefix = None for _page in card_pages: - for _card_iri, _osfmap_json in _page.items(): + for _card_iri, _osfmap_json in _page: if _datum_prefix is not None: yield _datum_prefix yield json.dumps(self._render_card_content(_card_iri, _osfmap_json), indent=2) @@ -78,18 +91,19 @@ def _render_meta(self) -> dict[str, int | str]: pass return _meta - def 
_render_links(self) -> dict[str, typing.Any]: + def _render_links(self) -> JsonObject: _links = {} - for _pagelink in self._page_links: - _twopledict = rdf.twopledict_from_twopleset(_pagelink) + _response_links = self.response_gathering.ask(JSONAPI_LINK, focus=self.response_focus) + for _link_obj in itertools.chain(self._page_links, _response_links): + _twopledict = rdf.twopledict_from_twopleset(_link_obj) if JSONAPI_LINK_OBJECT in _twopledict.get(RDF.type, ()): (_membername,) = _twopledict[JSONAPI_MEMBERNAME] (_link_url,) = _twopledict[RDF.value] _links[_membername.unicode_value] = _link_url return _links - def _add_twople(self, json_dict: dict[str, typing.Any], predicate_iri: str, object_iri: str) -> None: - _obj_ref = {'@id': object_iri} + def _add_twople(self, json_dict: JsonObject, predicate_iri: str, object_iri: str) -> None: + _obj_ref: JsonObject = {'@id': object_iri} _obj_list = json_dict.setdefault(predicate_iri, []) if isinstance(_obj_list, list): _obj_list.append(_obj_ref) diff --git a/trove/render/trovesearch_tsv.py b/trove/render/trovesearch_tsv.py new file mode 100644 index 000000000..b58882591 --- /dev/null +++ b/trove/render/trovesearch_tsv.py @@ -0,0 +1,10 @@ +import csv + +from trove.vocab import mediatypes + +from .trovesearch_csv import TrovesearchCsvRenderer + + +class TrovesearchTsvRenderer(TrovesearchCsvRenderer): + MEDIATYPE = mediatypes.TSV + CSV_DIALECT = csv.excel_tab diff --git a/trove/render/turtle.py b/trove/render/turtle.py index 869e12472..afad46e96 100644 --- a/trove/render/turtle.py +++ b/trove/render/turtle.py @@ -1,9 +1,11 @@ -from typing import Any - from primitive_metadata import primitive_rdf as rdf from trove.vocab.namespaces import TROVE from ._base import BaseRenderer +from .rendering import ( + EntireRendering, + ProtoRendering, +) class RdfTurtleRenderer(BaseRenderer): @@ -11,7 +13,10 @@ class RdfTurtleRenderer(BaseRenderer): # include indexcard metadata as JSON literals (because QuotedGraph is non-standard) 
INDEXCARD_DERIVER_IRI = TROVE['derive/osfmap_json'] - def simple_render_document(self) -> Any: + def render_document(self) -> ProtoRendering: + return EntireRendering(self.MEDIATYPE, self._render_turtle()) + + def _render_turtle(self) -> str: return rdf.turtle_from_tripledict( self.response_data.tripledict, focus=self.response_focus.single_iri(), diff --git a/trove/static/css/browse.css b/trove/static/css/browse.css index 643bcfcf2..75adadddc 100644 --- a/trove/static/css/browse.css +++ b/trove/static/css/browse.css @@ -20,7 +20,7 @@ flex-wrap: wrap; gap: var(--gutter-1); margin: 0; - padding: 1rem; + padding: var(--gutter-2); min-height: 100vh; background-color: lch(var(--bg-luminance) var(--bg-chroma) var(--hue-turn)); } @@ -36,7 +36,7 @@ .Browse__card { display: flex; flex-direction: column; - padding: var(--gutter-2) var(--gutter-3); + padding: var(--gutter-3) var(--gutter-4); background-color: lch(var(--bg-luminance) var(--bg-chroma) var(--hue-turn)); border-color: lch(59% var(--bg-chroma) var(--hue-turn)); border-style: solid; @@ -44,10 +44,10 @@ border-block-start-width: var(--gutter-4); border-inline-end-width: 0; border-block-end-width: 0; - /* - border-start-end-radius: 1rem; - border-end-start-radius: 1rem; - */ +} + +.BrowseWrapper details > summary { + padding-left: var(--gutter-4); } .BrowseWrapper details > summary::before { @@ -65,16 +65,22 @@ .Browse__card > header { display: flex; flex-direction: row; - gap: var(--gutter-2); + flex-wrap: wrap; + gap: var(--gutter-3); align-items: baseline; - border-bottom: solid 1px rgba(0,0,0,0.382); - margin-bottom: var(--gutter-3); + padding-left: var(--gutter-3); } .Browse__card > header > :first-child { margin: 0; } +.Browse__card > header:not(:last-child) { + border-bottom: solid 1px rgba(0,0,0,0.382); + padding-bottom: var(--gutter-3); + margin-bottom: var(--gutter-3); +} + .Browse__card > footer { padding: var(--gutter-2); } @@ -86,7 +92,7 @@ dl.Browse__twopleset { [twople-obj] 1fr ; grid-auto-flow: row; - 
row-gap: var(--gutter-2); + row-gap: var(--gutter-3); margin: 0; padding: 0; } @@ -126,8 +132,7 @@ dl.Browse__twopleset > dd { .Browse__literal { display: flex; flex-direction: row; - gap: var(--gutter-3); - padding: var(--gutter-4); + gap: var(--gutter-5); } .Browse__literal > q { @@ -140,10 +145,18 @@ dl.Browse__twopleset > dd { .Browse__predicate { background-color: lch(from var(--bg-color-initial) 89% c var(--hue-turn)); - padding: var(--gutter-4); + padding: 0 var(--gutter-4); +} + +.Browse__predicate .Browse__literal { + padding: 0 var(--gutter-3); } .Browse__object { background-color: lch(from var(--bg-color-initial) 93% c var(--hue-turn)); - padding: var(--gutter-4); + padding: 0 var(--gutter-4); +} + +.Browse__object.Browse__blanknode { + background-color: lch(var(--bg-luminance) var(--bg-chroma) var(--hue-turn)); } diff --git a/trove/trovebrowse_gathering.py b/trove/trovebrowse_gathering.py index f8efb9a60..8145ed9ef 100644 --- a/trove/trovebrowse_gathering.py +++ b/trove/trovebrowse_gathering.py @@ -39,14 +39,21 @@ def gather_cards_focused_on(focus: gather.Focus, *, blend_cards: bool) -> GathererGenerator: _identifier_qs = trove_db.ResourceIdentifier.objects.queryset_for_iris(focus.iris) _indexcard_qs = trove_db.Indexcard.objects.filter(focus_identifier_set__in=_identifier_qs) + _lrd_qs = ( + trove_db.LatestResourceDescription.objects + .filter(indexcard__in=_indexcard_qs) + .select_related('indexcard') + ) if blend_cards: - for _latest_resource_description in trove_db.LatestResourceDescription.objects.filter(indexcard__in=_indexcard_qs): - yield from rdf.iter_tripleset(_latest_resource_description.as_rdf_tripledict()) + for _resource_description in _lrd_qs: + yield from rdf.iter_tripleset(_resource_description.as_rdfdoc_with_supplements().tripledict) + yield (ns.FOAF.isPrimaryTopicOf, _resource_description.indexcard.get_iri()) else: - for _indexcard in _indexcard_qs: - _card_iri = _indexcard.get_iri() + for _resource_description in _lrd_qs: + _card_iri 
= _resource_description.indexcard.get_iri() yield (ns.FOAF.isPrimaryTopicOf, _card_iri) yield (_card_iri, ns.RDF.type, ns.TROVE.Indexcard) + yield (_card_iri, ns.TROVE.resourceMetadata, _resource_description.as_quoted_graph()) @trovebrowse.gatherer(ns.TROVE.thesaurusEntry) diff --git a/trove/trovesearch/page_cursor.py b/trove/trovesearch/page_cursor.py index 5bbdf5ac0..4f52dd40a 100644 --- a/trove/trovesearch/page_cursor.py +++ b/trove/trovesearch/page_cursor.py @@ -17,7 +17,6 @@ DEFAULT_PAGE_SIZE = 13 MAX_PAGE_SIZE = 101 -UNBOUNDED_PAGE_SIZE = math.inf # json-serialized as "Infinity" @dataclasses.dataclass diff --git a/trove/trovesearch/search_handle.py b/trove/trovesearch/search_handle.py index b3ce4a8f7..ec3fb74ce 100644 --- a/trove/trovesearch/search_handle.py +++ b/trove/trovesearch/search_handle.py @@ -39,7 +39,8 @@ class CardsearchHandle(BasicSearchHandle): search_result_page: typing.Iterable[CardsearchResult] = () related_propertypath_results: list[PropertypathUsage] = dataclasses.field(default_factory=list) - def __post_init__(self): # type: ignore + def __post_init__(self) -> None: + # update cursor and/or search_result_page to agree with each other _cursor = self.cursor _page = self.search_result_page if ( # TODO: move this logic into the... cursor? 
@@ -60,7 +61,6 @@ def __post_init__(self): # type: ignore elif not _cursor.has_many_more(): # visiting first page for the first time _cursor.first_page_ids = [_result.card_id for _result in _page] - return _page def get_next_streaming_handle(self) -> typing.Self | None: if self.cursor.is_complete_page: diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index dfe047a49..5149ba941 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -35,6 +35,7 @@ get_single_value, ) from trove.vocab import osfmap +from trove.vocab.jsonapi import JSONAPI_LINK from trove.vocab.trove import trove_json_shorthand from trove.vocab.namespaces import RDF, TROVE, OWL, FOAF, DCTERMS if typing.TYPE_CHECKING: @@ -82,6 +83,7 @@ (TROVE.totalResultCount,), (TROVE.cardSearchText,), (TROVE.cardSearchFilter,), + (JSONAPI_LINK,), ], TROVE.Valuesearch: [ (TROVE.propertyPath,), diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index 14138cbf0..8b3b16a6e 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -9,9 +9,11 @@ from trove import models as trove_db from trove.derive.osfmap_json import _RdfOsfmapJsonldRenderer +from trove.links import cardsearch_feed_links from trove.util.iris import get_sufficiently_unique_iri from trove.vocab.namespaces import RDF, FOAF, DCTERMS, RDFS, DCAT, TROVE from trove.vocab.jsonapi import ( + JSONAPI_LINK, JSONAPI_LINK_OBJECT, JSONAPI_MEMBERNAME, ) @@ -40,7 +42,7 @@ ) -logger = logging.getLogger(__name__) +_logger = logging.getLogger(__name__) type GathererGenerator = Generator[rdf.RdfTriple | rdf.RdfTwople] @@ -313,6 +315,17 @@ def gather_valuesearch_count(focus: ValuesearchFocus, **kwargs: Any) -> Gatherer yield (TROVE.totalResultCount, focus.search_handle.total_result_count) +@trovesearch_by_indexstrategy.gatherer( + JSONAPI_LINK, + focustype_iris={TROVE.Cardsearch}, +) +def 
gather_feed_links(focus: CardsearchFocus, **kwargs: Any) -> GathererGenerator: + _feed_links = cardsearch_feed_links(focus.single_iri()) + if _feed_links is not None: + yield (JSONAPI_LINK, _jsonapi_link('rss', _feed_links.rss)) + yield (JSONAPI_LINK, _jsonapi_link('atom', _feed_links.atom)) + + # @trovesearch_by_indexstrategy.gatherer( # focustype_iris={TROVE.Indexcard}, # ) @@ -484,8 +497,7 @@ def _osfmap_or_unknown_iri_as_json(iri: str) -> rdf.Literal: _twopledict = osfmap.OSFMAP_THESAURUS[iri] except KeyError: return rdf.literal_json({'@id': iri}) - else: - return _osfmap_json({iri: _twopledict}, focus_iri=iri) + return _osfmap_json({iri: _twopledict}, focus_iri=iri) def _valuesearch_result_as_json(result: ValuesearchResult) -> rdf.Literal: diff --git a/trove/urls.py b/trove/urls.py index 64f4b4e3c..cb729facd 100644 --- a/trove/urls.py +++ b/trove/urls.py @@ -1,16 +1,20 @@ from django.urls import path, re_path from .views.browse import BrowseIriView +from .views.docs import ( + OpenapiHtmlView, + OpenapiJsonView, +) +from .views.feeds import ( + CardsearchRssView, + CardsearchAtomView, +) from .views.ingest import RdfIngestView from .views.indexcard import IndexcardView from .views.search import ( CardsearchView, ValuesearchView, ) -from .views.docs import ( - OpenapiHtmlView, - OpenapiJsonView, -) app_name = 'trove' @@ -19,6 +23,8 @@ path('index-card/', view=IndexcardView.as_view(), name='index-card'), path('index-card-search', view=CardsearchView.as_view(), name='index-card-search'), path('index-value-search', view=ValuesearchView.as_view(), name='index-value-search'), + path('index-card-search/rss.xml', view=CardsearchRssView.as_view(), name='cardsearch-rss'), + path('index-card-search/atom.xml', view=CardsearchAtomView.as_view(), name='cardsearch-atom'), path('browse', view=BrowseIriView.as_view(), name='browse-iri'), path('ingest', view=RdfIngestView.as_view(), name='ingest-rdf'), path('docs/openapi.json', view=OpenapiJsonView.as_view(), 
name='docs.openapi-json'), diff --git a/trove/util/datetime.py b/trove/util/datetime.py new file mode 100644 index 000000000..ce437e79c --- /dev/null +++ b/trove/util/datetime.py @@ -0,0 +1,18 @@ +import datetime + +from primitive_metadata import primitive_rdf as rdf + + +def datetime_isoformat_z(dt: datetime.datetime | rdf.Literal | str) -> str: + """format (or reformat) a datetime in UTC with 'Z' timezone indicator + + for complying with standards that require the 'Z', like OAI-PMH + https://www.openarchives.org/OAI/openarchivesprotocol.html#Dates + """ + if isinstance(dt, rdf.Literal): + dt = dt.unicode_value + if isinstance(dt, str): + dt = datetime.datetime.fromisoformat(dt) + if isinstance(dt, datetime.datetime) and dt.tzinfo is None: + dt = dt.astimezone(datetime.UTC) + return dt.strftime('%Y-%m-%dT%H:%M:%SZ') diff --git a/trove/util/django.py b/trove/util/django.py index 77cf184bd..9b79165ee 100644 --- a/trove/util/django.py +++ b/trove/util/django.py @@ -16,18 +16,16 @@ def pk_chunked(queryset: QuerySet, chunksize: int) -> Generator[list]: ''' _ordered_qs = queryset.order_by('pk') _prior_end_pk = None - while True: # for each chunk: - _qs = ( - _ordered_qs - if _prior_end_pk is None - else _ordered_qs.filter(pk__gt=_prior_end_pk) - ) + _chunk_qs: QuerySet | None = _ordered_qs + while _chunk_qs is not None: # for each chunk: # load primary key values only - _pks = list(_qs.values_list('pk', flat=True)[:chunksize]) - if not _pks: - break # done - _end_pk = _pks[-1] - if (_prior_end_pk is not None) and (_end_pk <= _prior_end_pk): - raise RuntimeError(f'sentinel pks not ascending?? got {_end_pk} after {_prior_end_pk}') - _prior_end_pk = _end_pk - yield _pks + _pks = list(_chunk_qs.values_list('pk', flat=True)[:chunksize]) + if _pks: + _end_pk = _pks[-1] + if (_prior_end_pk is not None) and (_end_pk <= _prior_end_pk): + raise RuntimeError(f'sentinel pks not ascending?? 
got {_end_pk} after {_prior_end_pk}') + yield _pks + _prior_end_pk = _end_pk + _chunk_qs = _ordered_qs.filter(pk__gt=_prior_end_pk) + else: + _chunk_qs = None # done diff --git a/trove/util/html.py b/trove/util/html.py new file mode 100644 index 000000000..1cef3bb5e --- /dev/null +++ b/trove/util/html.py @@ -0,0 +1,43 @@ +from __future__ import annotations +from collections.abc import Generator +import contextlib +import dataclasses +from xml.etree.ElementTree import tostring as etree_tostring + +from trove.util.xml import XmlBuilder + + +__all__ = ('HtmlBuilder',) + +HTML_DOCTYPE = '' + + +@dataclasses.dataclass +class HtmlBuilder(XmlBuilder): + root_tag_name: str = 'html' + _: dataclasses.KW_ONLY + _heading_depth: int = 0 + + ### + # html-building helper methods + + @contextlib.contextmanager + def deeper_heading(self) -> Generator[str]: + _outer_heading_depth = self._heading_depth + if not _outer_heading_depth: + self._heading_depth = 1 + elif _outer_heading_depth < 6: # h6 deepest + self._heading_depth += 1 + try: + yield f'h{self._heading_depth}' + finally: + self._heading_depth = _outer_heading_depth + + def as_html_doc(self) -> str: + return '\n'.join((HTML_DOCTYPE, str(self))) + + def __str__(self) -> str: + return etree_tostring(self.root_element, encoding='unicode', method='html') + + def __bytes__(self) -> bytes: + return etree_tostring(self.root_element, encoding='utf-8', method='html') diff --git a/trove/util/iter.py b/trove/util/iter.py new file mode 100644 index 000000000..414febee5 --- /dev/null +++ b/trove/util/iter.py @@ -0,0 +1,19 @@ +from collections.abc import ( + Generator, + Hashable, + Iterable, +) + + +def iter_unique[T: Hashable](iterable: Iterable[T]) -> Generator[T]: + ''' + >>> list(iter_unique([1,1,1])) + [1] + >>> list(iter_unique([1,2,3,2,4,2,1,5])) + [1, 2, 3, 4, 5] + ''' + _seen = set() + for _item in iterable: + if _item not in _seen: + _seen.add(_item) + yield _item diff --git a/trove/util/json.py b/trove/util/json.py index 
aa647681c..496a0607a 100644 --- a/trove/util/json.py +++ b/trove/util/json.py @@ -1,6 +1,99 @@ from __future__ import annotations +from collections.abc import ( + Iterable, + Sequence, + Generator, +) +import datetime +### +# types for json-serializable stuff + +JsonPrimitive = str | int | float | bool | None + +type JsonValue = JsonPrimitive | list[JsonValue] | JsonObject + +type JsonNonArrayValue = JsonPrimitive | JsonObject + type JsonObject = dict[str, JsonValue] -type JsonValue = str | int | float | list[JsonValue] | JsonObject | None +type JsonPath = Sequence[str] # path of json keys + +JSONLD_VALUE_KEYS = ('@value', '@id') + +### +# utils for navigating nested json in the style of trove.derive.osfmap_json +# (TODO: more general json-ld utils) + + +def json_vals(json_obj: JsonObject, path: JsonPath) -> Generator[JsonValue]: + assert path + (_step, *_rest) = path + try: + _val = json_obj[_step] + except KeyError: + return + if _rest: + if isinstance(_val, dict): + yield from json_vals(_val, _rest) + elif isinstance(_val, list): + for _val_obj in _val: + if isinstance(_val_obj, dict): + yield from json_vals(_val_obj, _rest) + else: + if isinstance(_val, list): + yield from _val + else: + yield _val + + +def json_prims( + json_val: JsonValue, + path: JsonPath, + value_key_options: Iterable[str] = JSONLD_VALUE_KEYS, +) -> Generator[JsonPrimitive]: + if isinstance(json_val, list): + for _list_val in json_val: + yield from json_prims(_list_val, path, value_key_options) + elif path: + if isinstance(json_val, dict): + for _path_val in json_vals(json_val, path): + yield from json_prims(_path_val, (), value_key_options) + else: # no path; not list + if isinstance(json_val, JsonPrimitive): + yield json_val + elif isinstance(json_val, dict): + try: + yield next( + _val + for _key in value_key_options + if _key in json_val and isinstance(_val := json_val[_key], JsonPrimitive) + ) + except StopIteration: + pass + + +def json_strs( + json_val: JsonValue, + path: JsonPath, + 
value_key_options: Iterable[str] = JSONLD_VALUE_KEYS, + coerce_str: bool = False, +) -> Generator[str]: + for _prim in json_prims(json_val, path, value_key_options): + if isinstance(_prim, str): + yield _prim + elif coerce_str and (_prim is not None): + yield str(_prim) + + +def json_datetimes( + json_val: JsonValue, + path: JsonPath, +) -> Generator[datetime.datetime]: + for _prim in json_prims(json_val, path): + if isinstance(_prim, str): + try: + yield datetime.datetime.fromisoformat(_prim) + except ValueError: + pass diff --git a/trove/util/queryparams.py b/trove/util/queryparams.py index 664e63971..feb85c898 100644 --- a/trove/util/queryparams.py +++ b/trove/util/queryparams.py @@ -113,8 +113,7 @@ def get_single_value( (_singlevalue,) = _paramvalues except ValueError: raise trove_exceptions.InvalidRepeatedQueryParam(str(queryparam_name)) - else: - return _singlevalue + return _singlevalue def get_bool_value( diff --git a/trove/util/trove_params.py b/trove/util/trove_params.py index 8801e7d5b..77633841d 100644 --- a/trove/util/trove_params.py +++ b/trove/util/trove_params.py @@ -72,8 +72,7 @@ def _gather_shorthand(cls, queryparams: _qp.QueryparamDict) -> rdf.IriShorthand: (_shortname,) = _qp_name.bracketed_names except ValueError: raise trove_exceptions.InvalidQueryParamName(_qp_name) - else: - _prefixmap[_shortname] = _iri + _prefixmap[_shortname] = _iri _shorthand = cls._default_shorthand() if _prefixmap: _shorthand = _shorthand.with_update(_prefixmap) diff --git a/trove/util/xml.py b/trove/util/xml.py new file mode 100644 index 000000000..79ca0f972 --- /dev/null +++ b/trove/util/xml.py @@ -0,0 +1,66 @@ +from __future__ import annotations +from collections.abc import Generator +import contextlib +import dataclasses +from xml.etree.ElementTree import ( + Element, + SubElement, + tostring as etree_tostring, +) + +from primitive_metadata import primitive_rdf as rdf + + +__all__ = ('XmlBuilder',) + + +@dataclasses.dataclass +class XmlBuilder: + '''XmlBuilder: for 
building XML (an alternate convenience wrapper around xml.etree) + + >>> _xb = XmlBuilder('foo') + >>> with _xb.nest('bar', {'blib': 'bloz'}): + ... _xb.leaf('baz', text='hello') + ... _xb.leaf('boz', {'blib': 'blab'}, text='world') + >>> str(_xb) + ''' + root_tag_name: str + root_attrs: dict = dataclasses.field(default_factory=dict) + _: dataclasses.KW_ONLY + _nested_elements: list[Element] = dataclasses.field(repr=False, init=False) + + def __post_init__(self) -> None: + self._nested_elements = [Element(self.root_tag_name, self.root_attrs)] + + @property + def root_element(self) -> Element: + return self._nested_elements[0] + + @property + def current_element(self) -> Element: + return self._nested_elements[-1] + + @contextlib.contextmanager + def nest(self, tag_name: str, attrs: dict | None = None) -> Generator[Element]: + _attrs = {**attrs} if attrs else {} + _nested_element = SubElement(self.current_element, tag_name, _attrs) + self._nested_elements.append(_nested_element) + try: + yield self.current_element + finally: + _popped_element = self._nested_elements.pop() + assert _popped_element is _nested_element + + def leaf(self, tag_name: str, attrs: dict | None = None, *, text: str | rdf.Literal | None = None) -> None: + _leaf_element = SubElement(self.current_element, tag_name, attrs or {}) + if isinstance(text, rdf.Literal): + # TODO: lang + _leaf_element.text = text.unicode_value + elif text is not None: + _leaf_element.text = text + + def __str__(self) -> str: + return etree_tostring(self.root_element, encoding='unicode') + + def __bytes__(self) -> bytes: + return etree_tostring(self.root_element, encoding='utf-8', xml_declaration=True) diff --git a/trove/views/_base.py b/trove/views/_base.py index 802aa56e2..feede764b 100644 --- a/trove/views/_base.py +++ b/trove/views/_base.py @@ -26,7 +26,7 @@ if TYPE_CHECKING: from django.http import HttpResponse, StreamingHttpResponse, HttpRequest from trove.render import BaseRenderer - from trove.render._rendering 
import ProtoRendering + from trove.render.rendering import ProtoRendering __all__ = ( @@ -45,7 +45,7 @@ def _render_response_content(self, request, params, renderer_type: type[BaseRend def get(self, request: HttpRequest, **kwargs: str) -> HttpResponse | StreamingHttpResponse: try: - _renderer_type = get_renderer_type(request) + _renderer_type = self._get_renderer_type(request) except trove_exceptions.CannotRenderMediatype as _error: return make_http_error_response( error=_error, @@ -63,6 +63,9 @@ def get(self, request: HttpRequest, **kwargs: str) -> HttpResponse | StreamingHt renderer_type=_renderer_type, ) + def _get_renderer_type(self, request: HttpRequest): + return get_renderer_type(request) + def _parse_params(self, request: HttpRequest): return self.params_type.from_querystring(request.META['QUERY_STRING']) @@ -74,6 +77,8 @@ class GatheredTroveView(BaseTroveView, abc.ABC): focus_type_iris: ClassVar[Container[str]] = () def _render_response_content(self, request, params, renderer_type: type[BaseRenderer], url_kwargs): + '''implement abstract method from BaseTroveView + ''' _focus = self._build_focus(request, params, url_kwargs) _renderer = self._gather_to_renderer(_focus, params, renderer_type) return _renderer.render_document() @@ -123,6 +128,8 @@ def cached_static_triples(cls, focus_iri): return cls.get_static_triples(focus_iri) def _render_response_content(self, request, params, renderer_type: type[BaseRenderer], url_kwargs): + '''implement abstract method from BaseTroveView + ''' _focus_iri = self.get_focus_iri() _triples = self.cached_static_triples(_focus_iri) _focus = gather.Focus.new( diff --git a/trove/views/_responder.py b/trove/views/_responder.py index 1d3365742..cada5e74d 100644 --- a/trove/views/_responder.py +++ b/trove/views/_responder.py @@ -5,14 +5,23 @@ from django import http as djhttp from trove.render._base import BaseRenderer -from trove.render._rendering import ( - ProtoRendering, - StreamableRendering, -) +from trove.render.rendering 
import ProtoRendering +from trove.render.rendering.streamable import StreamableRendering +from trove.render.rendering.html_wrapped import HtmlWrappedRendering from trove.exceptions import TroveError from trove.vocab import mediatypes +_BROWSER_FRIENDLY_MEDIATYPES = { + mediatypes.HTML, + mediatypes.JSON, + mediatypes.JSONLD, + mediatypes.JSONAPI, + mediatypes.ATOM, + mediatypes.RSS, +} + + def make_http_response( *, content_rendering: ProtoRendering, @@ -24,15 +33,26 @@ def make_http_response( if isinstance(content_rendering, StreamableRendering) else djhttp.HttpResponse ) + _download_filename = ( + http_request.GET.get('withFileName') + if http_request is not None + else None + ) + if ( + _download_filename is None + and content_rendering.mediatype not in _BROWSER_FRIENDLY_MEDIATYPES + and http_request is not None + and 'Accept' in http_request.headers + and http_request.accepts(mediatypes.HTML) + ): # when browsing in browser, return html (unless given filename) + content_rendering = HtmlWrappedRendering(content_rendering) _response = _response_type( content_rendering.iter_content(), - content_type=content_rendering.mediatype, + content_type=_make_content_type(content_rendering.mediatype), ) - if http_request is not None: - _requested_filename = http_request.GET.get('withFileName') - if _requested_filename is not None: - _file_name = _get_file_name(_requested_filename, content_rendering.mediatype) - _response.headers['Content-Disposition'] = _disposition(_file_name) + if _download_filename is not None: + _file_name = _get_file_name(_download_filename, content_rendering.mediatype) + _response.headers['Content-Disposition'] = _disposition(_file_name) return _response @@ -46,7 +66,7 @@ def make_http_error_response( return djhttp.HttpResponse( _content_rendering.iter_content(), status=error.http_status, - content_type=_content_rendering.mediatype, + content_type=_make_content_type(_content_rendering.mediatype), ) @@ -70,3 +90,13 @@ def _disposition(filename: str) -> 
bytes: b'filename=' + filename.encode('latin-1', errors='replace'), b"filename*=utf-8''" + filename.encode(), )) + + +def _make_content_type(mediatype: str) -> str: + """make a content-type header value from a mediatype + + currently just adds "charset=utf-8" to text mediatypes that don't already have one + """ + if mediatype.startswith('text/') and ('charset' not in mediatype): + return f'{mediatype};charset=utf-8' + return mediatype diff --git a/trove/views/browse.py b/trove/views/browse.py index 6739b53d7..e50b41721 100644 --- a/trove/views/browse.py +++ b/trove/views/browse.py @@ -47,6 +47,11 @@ def _default_include(cls): _ns.TROVE.usedAtPath, )) + def to_querydict(self): + _querydict = super().to_querydict() + _querydict['iri'] = self.iri + return _querydict + class BrowseIriView(GatheredTroveView): gathering_organizer = trovebrowse diff --git a/trove/views/feeds.py b/trove/views/feeds.py new file mode 100644 index 000000000..ae4b90eb8 --- /dev/null +++ b/trove/views/feeds.py @@ -0,0 +1,48 @@ +from __future__ import annotations +import dataclasses +from typing import TYPE_CHECKING + +from trove.render.cardsearch_rss import CardsearchRssRenderer +from trove.render.cardsearch_atom import CardsearchAtomRenderer +from trove.trovesearch.search_params import ( + CardsearchParams, + SortParam, + ValueType, +) +from trove.views.search import CardsearchView +from trove.vocab.namespaces import DCTERMS + +if TYPE_CHECKING: + from django.http import HttpRequest + + +class CardsearchRssView(CardsearchView): + def _get_renderer_type(self, request: HttpRequest): + '''override method from BaseTroveView + + ignore requested mediatype; always render RSS + ''' + return CardsearchRssRenderer + + def _parse_params(self, request: HttpRequest): + '''override method from BaseTroveView + + ignore requested sort; always sort by date created, descending + ''' + _params: CardsearchParams = super()._parse_params(request) + return dataclasses.replace(_params, sort_list=( + SortParam( + 
value_type=ValueType.DATE, + propertypath=(DCTERMS.created,), + descending=True, + ), + )) + + +class CardsearchAtomView(CardsearchRssView): + def _get_renderer_type(self, request: HttpRequest): + '''override method from BaseTroveView + + ignore requested mediatype; always render Atom + ''' + return CardsearchAtomRenderer diff --git a/trove/views/ingest.py b/trove/views/ingest.py index a6b21590a..4c634bf00 100644 --- a/trove/views/ingest.py +++ b/trove/views/ingest.py @@ -61,9 +61,8 @@ def post(self, request: HttpRequest) -> HttpResponse: except trove_exceptions.DigestiveError as e: logger.exception(str(e)) return http.HttpResponse(str(e), status=HTTPStatus.BAD_REQUEST) - else: - # TODO: include (link to?) extracted card(s) - return http.HttpResponse(status=HTTPStatus.CREATED) + # TODO: include (link to?) extracted card(s) + return http.HttpResponse(status=HTTPStatus.CREATED) def delete(self, request: HttpRequest) -> HttpResponse: # TODO: cleaner permissions diff --git a/trove/vocab/mediatypes.py b/trove/vocab/mediatypes.py index 66495683a..24dad5053 100644 --- a/trove/vocab/mediatypes.py +++ b/trove/vocab/mediatypes.py @@ -5,6 +5,8 @@ HTML = 'text/html' TSV = 'text/tab-separated-values' CSV = 'text/csv' +RSS = 'application/rss+xml' +ATOM = 'application/atom+xml' _file_extensions = { @@ -15,11 +17,31 @@ HTML: '.html', TSV: '.tsv', CSV: '.csv', + RSS: '.xml', + ATOM: '.xml', } +_PARAMETER_DELIMITER = ';' + + +def strip_mediatype_parameters(mediatype: str) -> str: + """from a full mediatype that may have parameters, get only the base mediatype + + >>> strip_mediatype_parameters('text/plain;charset=utf-8') + 'text/plain' + >>> strip_mediatype_parameters('text/plain') + 'text/plain' + + note: does not validate that the mediatype exists or makes sense + >>> strip_mediatype_parameters('application/whatever ; blarg=foo') + 'application/whatever' + """ + (_base, _, __) = mediatype.partition(_PARAMETER_DELIMITER) + return _base.strip() + def dot_extension(mediatype: str) -> 
str: try: - return _file_extensions[mediatype] + return _file_extensions[strip_mediatype_parameters(mediatype)] except KeyError: raise ValueError(f'unrecognized mediatype: {mediatype}') diff --git a/trove/vocab/namespaces.py b/trove/vocab/namespaces.py index c0ebf1cb6..db86e679c 100644 --- a/trove/vocab/namespaces.py +++ b/trove/vocab/namespaces.py @@ -47,6 +47,8 @@ SHAREv2 = rdf.IriNamespace('https://share.osf.io/vocab/2017/sharev2/') # for the OSF metadata application profile (TODO: update to resolvable URL, when there is one) OSFMAP = rdf.IriNamespace('https://osf.io/vocab/2022/') +# non-standard namespace used by OSF for datacite terms (resolves to datacite docs) +DATACITE = rdf.IriNamespace('https://schema.datacite.org/meta/kernel-4/#') # for identifying jsonapi concepts with linked anchors on the jsonapi spec (probably fine) JSONAPI = rdf.IriNamespace('https://jsonapi.org/format/1.1/#') @@ -58,6 +60,7 @@ 'jsonapi': JSONAPI, 'oai': OAI, 'oai_dc': OAI_DC, + 'datacite': DATACITE, } if __debug__: # blarg: a nothing namespace for examples and testing diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py index 7dd6d1a9e..5649db6b8 100644 --- a/trove/vocab/trove.py +++ b/trove/vocab/trove.py @@ -1,10 +1,8 @@ import functools -import urllib.parse from typing import Union, Any from uuid import UUID from django.conf import settings -from django.urls import reverse from primitive_metadata.primitive_rdf import ( IriNamespace, IriShorthand, @@ -44,14 +42,6 @@ def _literal_markdown(text: str, *, language: str) -> literal: return literal(text, language=language, mediatype='text/markdown;charset=utf-8') -def trove_browse_link(iri: str) -> str: - _compact = namespaces_shorthand().compact_iri(iri) - return urllib.parse.urljoin( - reverse('trove:browse-iri'), - f'?iri={urllib.parse.quote(_compact)}', - ) - - TROVE_API_THESAURUS: RdfTripleDictionary = { TROVE.search_api: { RDFS.label: {literal('trove search api', language='en')}, @@ -494,7 +484,7 @@ def trove_browse_link(iri: 
str) -> str: unstable mediatypes (may change or sometimes respond 500): -* `text/html;charset=utf-8`: rdf as browsable html +* `text/html`: rdf as browsable html * `text/turtle`: rdf as [turtle](https://www.w3.org/TR/turtle/) * `application/ld+json`: rdf as [json-ld](https://www.w3.org/TR/json-ld11/)