From f897e23a0c17bd80eaf169f2a3641271e0f599fc Mon Sep 17 00:00:00 2001 From: Tim Sherratt Date: Wed, 25 Oct 2023 22:21:47 +1100 Subject: [PATCH] Update Trove Harvester package and add RO-Crate file --- .gitignore | 1 + .zenodo.json | 8 +- Explore-harvested-text-files.ipynb | 9 + Exploring-your-TroveHarvester-data.ipynb | 9 + README.md | 2 +- ...er-to-get-newspaper-articles-in-bulk.ipynb | 9 + display_harvest_results_using_datasette.ipynb | 9 + harvest-specific-days.ipynb | 9 + newspaper_harvester_app.ipynb | 9 + requirements-dev.in | 6 +- requirements-dev.txt | 343 +++++++++++++++++- requirements.in | 1 - requirements.txt | 6 +- ro-crate-metadata.json | 223 ++++++++++++ scripts/extract_metadata.py | 39 +- scripts/update_crate.py | 50 ++- scripts/update_version.sh | 2 +- 17 files changed, 671 insertions(+), 64 deletions(-) create mode 100644 ro-crate-metadata.json diff --git a/.gitignore b/.gitignore index 5861864..0cec4f0 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ getting-started/my_nma_dataset.csv anzac-day* front_pages* Untitled* +scripts/add_nb_metadata.py diff --git a/.zenodo.json b/.zenodo.json index ff16b28..237832b 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -5,7 +5,7 @@ "related_identifiers": [ { "scheme": "url", - "identifier": "https://github.com/GLAM-Workbench/trove-newspaper-harvester/tree/v2.0.0", + "identifier": "https://github.com/GLAM-Workbench/trove-newspaper-harvester/tree/v2.0.1", "relation": "isDerivedFrom", "resource_type": "software" }, @@ -22,7 +22,7 @@ "resource_type": "other" } ], - "version": "v2.0.0", + "version": "v2.0.1", "upload_type": "software", "keywords": [ "Trove", @@ -30,7 +30,7 @@ "Jupyter", "GLAM Workbench" ], - "publication_date": "2023-08-31", + "publication_date": "2023-10-25", "creators": [ { "orcid": "0000-0001-7956-4498", @@ -38,5 +38,5 @@ } ], "access_right": "open", - "description": "

Current version: v2.0.0

The Trove Newspaper & Gazette Harvester Harvester makes it easy to download large quantities of digitised articles from Trove’s newspapers and gazettes. Just give it a search from the Trove web interface, and the harvester will save the metadata of all the articles in a CSV (spreadsheet) file for further analysis. You can also save the full text of every article, as well as copies of the articles as JPG images, and even PDFs. While the web interface will only show you the first 2,000 results matching your search, the Newspaper & Gazette Harvester will get everything.

The Jupyter notebooks in this repository use the Trove Newspaper and Gazette Harvester to download large quantities of digitised newspaper articles from Trove. There’s also a few examples of how you can analyse and explore the harvested data.

The notebooks include:

See the GLAM Workbench for more details.

Cite as

See the GLAM Workbench or Zenodo for up-to-date citation details.


This repository is part of the GLAM Workbench.
If you think this project is worthwhile, you might like to sponsor me on GitHub.

" + "description": "

Current version: v2.0.1

The Trove Newspaper & Gazette Harvester makes it easy to download large quantities of digitised articles from Trove’s newspapers and gazettes. Just give it a search from the Trove web interface, and the harvester will save the metadata of all the articles in a CSV (spreadsheet) file for further analysis. You can also save the full text of every article, as well as copies of the articles as JPG images, and even PDFs. While the web interface will only show you the first 2,000 results matching your search, the Newspaper & Gazette Harvester will get everything.

The Jupyter notebooks in this repository use the Trove Newspaper and Gazette Harvester to download large quantities of digitised newspaper articles from Trove. There are also a few examples of how you can analyse and explore the harvested data.

The notebooks include:

See the GLAM Workbench for more details.

Cite as

See the GLAM Workbench or Zenodo for up-to-date citation details.


This repository is part of the GLAM Workbench.
If you think this project is worthwhile, you might like to sponsor me on GitHub.

" } diff --git a/Explore-harvested-text-files.ipynb b/Explore-harvested-text-files.ipynb index b55e59e..4dab10e 100644 --- a/Explore-harvested-text-files.ipynb +++ b/Explore-harvested-text-files.ipynb @@ -1399,6 +1399,15 @@ "pygments_lexer": "ipython3", "version": "3.10.12" }, + "rocrate": { + "author": [ + { + "name": "Sherratt, Tim", + "orcid": "https://orcid.org/0000-0001-7956-4498" + } + ], + "name": "Explore harvested text files" + }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, diff --git a/Exploring-your-TroveHarvester-data.ipynb b/Exploring-your-TroveHarvester-data.ipynb index deeb764..3f9d417 100644 --- a/Exploring-your-TroveHarvester-data.ipynb +++ b/Exploring-your-TroveHarvester-data.ipynb @@ -1673,6 +1673,15 @@ "pygments_lexer": "ipython3", "version": "3.10.12" }, + "rocrate": { + "author": [ + { + "name": "Sherratt, Tim", + "orcid": "https://orcid.org/0000-0001-7956-4498" + } + ], + "name": "Exploring your harvested data" + }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, diff --git a/README.md b/README.md index d075d43..c188059 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Trove Newspaper and Gazette Harvester -Current version: [v2.0.0](https://github.com/GLAM-Workbench/trove-newspaper-harvester/releases/tag/v2.0.0) +Current version: [v2.0.1](https://github.com/GLAM-Workbench/trove-newspaper-harvester/releases/tag/v2.0.1) The [Trove Newspaper & Gazette Harvester Harvester](https://pypi.org/project/trove-newspaper-harvester/) makes it easy to download large quantities of digitised articles from Trove's newspapers and gazettes. Just give it a search from the Trove web interface, and the harvester will save the metadata of all the articles in a CSV (spreadsheet) file for further analysis. You can also save the full text of every article, as well as copies of the articles as JPG images, and even PDFs. 
While the web interface will only show you the first 2,000 results matching your search, the Newspaper & Gazette Harvester will get **everything**. diff --git a/Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb b/Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb index ddbf06c..97294ea 100644 --- a/Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb +++ b/Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb @@ -423,6 +423,15 @@ "pygments_lexer": "ipython3", "version": "3.10.12" }, + "rocrate": { + "author": [ + { + "name": "Sherratt, Tim", + "orcid": "https://orcid.org/0000-0001-7956-4498" + } + ], + "name": "Using TroveHarvester to get newspaper and gazette articles in bulk" + }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, diff --git a/display_harvest_results_using_datasette.ipynb b/display_harvest_results_using_datasette.ipynb index 578f799..5b0a9a7 100644 --- a/display_harvest_results_using_datasette.ipynb +++ b/display_harvest_results_using_datasette.ipynb @@ -341,6 +341,15 @@ "pygments_lexer": "ipython3", "version": "3.10.12" }, + "rocrate": { + "author": [ + { + "name": "Sherratt, Tim", + "orcid": "https://orcid.org/0000-0001-7956-4498" + } + ], + "name": "Display the results of a harvest as a searchable database using Datasette" + }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, diff --git a/harvest-specific-days.ipynb b/harvest-specific-days.ipynb index 9903999..b2b5aba 100644 --- a/harvest-specific-days.ipynb +++ b/harvest-specific-days.ipynb @@ -533,6 +533,15 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" + }, + "rocrate": { + "author": [ + { + "name": "Sherratt, Tim", + "orcid": "https://orcid.org/0000-0001-7956-4498" + } + ], + "name": "Harvesting articles that mention \"Anzac Day\" on Anzac Day" } }, "nbformat": 4, diff --git a/newspaper_harvester_app.ipynb b/newspaper_harvester_app.ipynb index 3c1de9e..3cf423e 100644 --- 
a/newspaper_harvester_app.ipynb +++ b/newspaper_harvester_app.ipynb @@ -289,6 +289,15 @@ "pygments_lexer": "ipython3", "version": "3.10.12" }, + "rocrate": { + "author": [ + { + "name": "Sherratt, Tim", + "orcid": "https://orcid.org/0000-0001-7956-4498" + } + ], + "name": "Trove Newspaper & Gazette Harvester" + }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": { diff --git a/requirements-dev.in b/requirements-dev.in index 61a8817..648b628 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -5,4 +5,8 @@ nbqa black[jupyter] isort flake8 -pre-commit \ No newline at end of file +pre-commit +rocrate +giturlparse +jupyterlab-code-formatter +gitpython \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index 22c2617..4aa258f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,13 +4,46 @@ # # pip-compile requirements-dev.in # +aiohttp==3.8.5 + # via + # -c requirements.txt + # tuspy +aiosignal==1.3.1 + # via + # -c requirements.txt + # aiohttp +anyio==3.7.1 + # via + # -c requirements.txt + # jupyter-server +arcp==0.2.1 + # via + # -c requirements.txt + # rocrate +argon2-cffi==23.1.0 + # via + # -c requirements.txt + # jupyter-server +argon2-cffi-bindings==21.2.0 + # via + # -c requirements.txt + # argon2-cffi +arrow==1.2.3 + # via + # -c requirements.txt + # isoduration asttokens==2.2.1 # via # -c requirements.txt # stack-data +async-timeout==4.0.3 + # via + # -c requirements.txt + # aiohttp attrs==23.1.0 # via # -c requirements.txt + # aiohttp # jsonschema # referencing autopep8==2.0.4 @@ -19,19 +52,49 @@ backcall==0.2.0 # via # -c requirements.txt # ipython -black[jupyter]==23.7.0 +beautifulsoup4==4.12.2 + # via + # -c requirements.txt + # nbconvert +bioblend==1.2.0 + # via + # -c requirements.txt + # gxformat2 +black[jupyter]==23.10.0 # via -r requirements-dev.in +bleach==6.0.0 + # via + # -c requirements.txt + # nbconvert +cachecontrol[filecache]==0.13.1 + # via + # -c requirements.txt + # 
schema-salad +certifi==2023.7.22 + # via + # -c requirements.txt + # requests +cffi==1.15.1 + # via + # -c requirements.txt + # argon2-cffi-bindings cfgv==3.4.0 # via pre-commit +charset-normalizer==3.2.0 + # via + # -c requirements.txt + # aiohttp + # requests click==8.1.7 # via # -c requirements.txt # black + # rocrate comm==0.1.4 # via # -c requirements.txt # ipykernel -coverage==7.3.0 +coverage==7.3.2 # via nbval debugpy==1.6.7.post1 # via @@ -41,11 +104,16 @@ decorator==5.1.1 # via # -c requirements.txt # ipython +defusedxml==0.7.1 + # via + # -c requirements.txt + # nbconvert distlib==0.3.7 # via virtualenv exceptiongroup==1.1.3 # via # -c requirements.txt + # anyio # pytest executing==1.2.0 # via @@ -58,11 +126,50 @@ fastjsonschema==2.18.0 filelock==3.12.3 # via # -c requirements.txt + # cachecontrol # virtualenv flake8==6.1.0 # via -r requirements-dev.in -identify==2.5.27 +fqdn==1.5.1 + # via + # -c requirements.txt + # jsonschema +frozenlist==1.4.0 + # via + # -c requirements.txt + # aiohttp + # aiosignal +future==0.18.3 + # via + # -c requirements.txt + # tuspy +galaxy2cwl==0.1.4 + # via + # -c requirements.txt + # rocrate +gitdb==4.0.11 + # via gitpython +gitpython==3.1.40 + # via -r requirements-dev.in +giturlparse==0.12.0 + # via -r requirements-dev.in +gxformat2==0.18.0 + # via + # -c requirements.txt + # galaxy2cwl +identify==2.5.30 # via pre-commit +idna==3.4 + # via + # -c requirements.txt + # anyio + # jsonschema + # requests + # yarl +importlib-resources==6.0.1 + # via + # -c requirements.txt + # schema-salad iniconfig==2.0.0 # via pytest ipykernel==6.25.1 @@ -75,15 +182,34 @@ ipython==8.14.0 # black # ipykernel # nbqa +isodate==0.6.1 + # via + # -c requirements.txt + # rdflib +isoduration==20.11.0 + # via + # -c requirements.txt + # jsonschema isort==5.12.0 # via -r requirements-dev.in jedi==0.19.0 # via # -c requirements.txt # ipython -jsonschema==4.19.0 +jinja2==3.1.2 + # via + # -c requirements.txt + # jupyter-server + # nbconvert + # rocrate 
+jsonpointer==2.4 # via # -c requirements.txt + # jsonschema +jsonschema[format-nongpl]==4.19.0 + # via + # -c requirements.txt + # jupyter-events # nbformat jsonschema-specifications==2023.7.1 # via @@ -93,13 +219,41 @@ jupyter-client==8.3.1 # via # -c requirements.txt # ipykernel + # jupyter-server + # nbclient # nbval jupyter-core==5.3.1 # via # -c requirements.txt # ipykernel # jupyter-client + # jupyter-server + # nbclient + # nbconvert # nbformat +jupyter-events==0.7.0 + # via + # -c requirements.txt + # jupyter-server +jupyter-server==2.7.2 + # via + # -c requirements.txt + # jupyterlab-code-formatter +jupyter-server-terminals==0.4.4 + # via + # -c requirements.txt + # jupyter-server +jupyterlab-code-formatter==2.2.1 + # via -r requirements-dev.in +jupyterlab-pygments==0.2.2 + # via + # -c requirements.txt + # nbconvert +markupsafe==2.1.3 + # via + # -c requirements.txt + # jinja2 + # nbconvert matplotlib-inline==0.1.6 # via # -c requirements.txt @@ -107,13 +261,39 @@ matplotlib-inline==0.1.6 # ipython mccabe==0.7.0 # via flake8 +mistune==2.0.5 + # via + # -c requirements.txt + # nbconvert + # schema-salad +msgpack==1.0.5 + # via + # -c requirements.txt + # cachecontrol +multidict==6.0.4 + # via + # -c requirements.txt + # aiohttp + # yarl mypy-extensions==1.0.0 # via # -c requirements.txt # black + # schema-salad +nbclient==0.7.4 + # via + # -c requirements.txt + # nbconvert +nbconvert==7.7.4 + # via + # -c requirements.txt + # jupyter-server nbformat==5.9.2 # via # -c requirements.txt + # jupyter-server + # nbclient + # nbconvert # nbval nbqa==1.7.0 # via -r requirements-dev.in @@ -125,12 +305,23 @@ nest-asyncio==1.5.7 # ipykernel nodeenv==1.8.0 # via pre-commit +overrides==7.4.0 + # via + # -c requirements.txt + # jupyter-server packaging==23.1 # via # -c requirements.txt # black # ipykernel + # jupyter-server + # jupyterlab-code-formatter + # nbconvert # pytest +pandocfilters==1.5.0 + # via + # -c requirements.txt + # nbconvert parso==0.8.3 # via # -c 
requirements.txt @@ -155,8 +346,12 @@ pluggy==1.3.0 # via # -c requirements.txt # pytest -pre-commit==3.3.3 +pre-commit==3.5.0 # via -r requirements-dev.in +prometheus-client==0.17.1 + # via + # -c requirements.txt + # jupyter-server prompt-toolkit==3.0.39 # via # -c requirements.txt @@ -169,56 +364,151 @@ ptyprocess==0.7.0 # via # -c requirements.txt # pexpect + # terminado pure-eval==0.2.2 # via # -c requirements.txt # stack-data -pycodestyle==2.11.0 +pycodestyle==2.11.1 # via # autopep8 # flake8 +pycparser==2.21 + # via + # -c requirements.txt + # cffi pyflakes==3.1.0 # via flake8 pygments==2.16.1 # via # -c requirements.txt # ipython -pytest==7.4.0 + # nbconvert +pyparsing==3.0.9 + # via + # -c requirements.txt + # rdflib +pytest==7.4.2 # via # -r requirements-dev.in # nbval python-dateutil==2.8.2 # via # -c requirements.txt + # arrow # jupyter-client + # rocrate +python-json-logger==2.0.7 + # via + # -c requirements.txt + # jupyter-events pyyaml==6.0.1 # via # -c requirements.txt + # galaxy2cwl + # gxformat2 + # jupyter-events # pre-commit pyzmq==25.1.1 # via # -c requirements.txt # ipykernel # jupyter-client + # jupyter-server +rdflib==6.3.2 + # via + # -c requirements.txt + # schema-salad referencing==0.30.2 # via # -c requirements.txt # jsonschema # jsonschema-specifications + # jupyter-events +requests==2.31.0 + # via + # -c requirements.txt + # bioblend + # cachecontrol + # requests-toolbelt + # rocrate + # schema-salad + # tuspy +requests-toolbelt==1.0.0 + # via + # -c requirements.txt + # bioblend +rfc3339-validator==0.1.4 + # via + # -c requirements.txt + # jsonschema + # jupyter-events +rfc3986-validator==0.1.1 + # via + # -c requirements.txt + # jsonschema + # jupyter-events +rocrate==0.9.0 + # via + # -c requirements.txt + # -r requirements-dev.in rpds-py==0.10.0 # via # -c requirements.txt # jsonschema # referencing +ruamel-yaml==0.17.32 + # via + # -c requirements.txt + # schema-salad +ruamel-yaml-clib==0.2.7 + # via + # -c requirements.txt + # 
ruamel-yaml +schema-salad==8.4.20230808163024 + # via + # -c requirements.txt + # gxformat2 +send2trash==1.8.2 + # via + # -c requirements.txt + # jupyter-server six==1.16.0 # via # -c requirements.txt # asttokens + # bleach + # isodate # python-dateutil + # rfc3339-validator + # tuspy +smmap==5.0.1 + # via gitdb +sniffio==1.3.0 + # via + # -c requirements.txt + # anyio +soupsieve==2.4.1 + # via + # -c requirements.txt + # beautifulsoup4 stack-data==0.6.2 # via # -c requirements.txt # ipython +terminado==0.17.1 + # via + # -c requirements.txt + # jupyter-server + # jupyter-server-terminals +tinycss2==1.2.1 + # via + # -c requirements.txt + # nbconvert +tinydb==4.8.0 + # via + # -c requirements.txt + # tuspy tokenize-rt==5.2.0 # via # black @@ -235,6 +525,8 @@ tornado==6.3.3 # -c requirements.txt # ipykernel # jupyter-client + # jupyter-server + # terminado traitlets==5.9.0 # via # -c requirements.txt @@ -243,18 +535,53 @@ traitlets==5.9.0 # ipython # jupyter-client # jupyter-core + # jupyter-events + # jupyter-server # matplotlib-inline + # nbclient + # nbconvert # nbformat +tuspy==1.0.1 + # via + # -c requirements.txt + # bioblend typing-extensions==4.7.1 # via # -c requirements.txt + # bioblend + # black # filelock -virtualenv==20.24.3 +uri-template==1.3.0 + # via + # -c requirements.txt + # jsonschema +urllib3==2.0.4 + # via + # -c requirements.txt + # requests +virtualenv==20.24.5 # via pre-commit wcwidth==0.2.6 # via # -c requirements.txt # prompt-toolkit +webcolors==1.13 + # via + # -c requirements.txt + # jsonschema +webencodings==0.5.1 + # via + # -c requirements.txt + # bleach + # tinycss2 +websocket-client==1.6.2 + # via + # -c requirements.txt + # jupyter-server +yarl==1.9.2 + # via + # -c requirements.txt + # aiohttp # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/requirements.in b/requirements.in index fb4b393..0ef345a 100644 --- a/requirements.in +++ b/requirements.in @@ -16,7 +16,6 @@ ipywidgets 
voila voila-material @ git+https://github.com/GLAM-Workbench/voila-material.git wordcloud -trove-newspaper-harvester datasette datasette-media datasette-json-html diff --git a/requirements.txt b/requirements.txt index 15b1c89..48b8d3d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile requirements.in +# pip-compile # aiofiles==23.2.1 # via datasette @@ -434,7 +434,7 @@ rfc3986-validator==0.1.1 # via # jsonschema # jupyter-events -rocrate==0.8.0 +rocrate==0.9.0 # via trove-newspaper-harvester rpds-py==0.10.0 # via @@ -522,7 +522,7 @@ traitlets==5.9.0 # nbconvert # nbformat # voila -trove-newspaper-harvester==0.7.1 +trove-newspaper-harvester==0.7.2 # via -r requirements.in trove-newspaper-images==0.2.1 # via trove-newspaper-harvester diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json new file mode 100644 index 0000000..0f4deb9 --- /dev/null +++ b/ro-crate-metadata.json @@ -0,0 +1,223 @@ +{ + "@context": "https://w3id.org/ro/crate/1.1/context", + "@graph": [ + { + "@id": "./", + "@type": "Dataset", + "author": [ + { + "@id": "0000-0001-7956-4498" + } + ], + "datePublished": "2023-10-25", + "description": "A GLAM Workbench repository", + "hasPart": [ + { + "@id": "newspaper_harvester_app.ipynb" + }, + { + "@id": "Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb" + }, + { + "@id": "Explore-harvested-text-files.ipynb" + }, + { + "@id": "display_harvest_results_using_datasette.ipynb" + }, + { + "@id": "Exploring-your-TroveHarvester-data.ipynb" + }, + { + "@id": "harvest-specific-days.ipynb" + } + ], + "license": { + "@id": "https://spdx.org/licenses/MIT" + }, + "name": "trove-newspaper-harvester", + "url": "https://github.com/GLAM-Workbench/trove-newspaper-harvester", + "version": "v2.0.1" + }, + { + "@id": "ro-crate-metadata.json", + "@type": "CreativeWork", + "about": { + "@id": "./" + }, + "conformsTo": { + "@id": 
"https://w3id.org/ro/crate/1.1" + }, + "license": { + "@id": "https://creativecommons.org/publicdomain/zero/1.0/" + } + }, + { + "@id": "newspaper_harvester_app.ipynb", + "@type": [ + "File", + "SoftwareSourceCode" + ], + "author": [ + { + "@id": "https://orcid.org/0000-0001-7956-4498" + } + ], + "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester", + "conformsTo": { + "@id": "https://purl.archive.org/textcommons/profile#Notebook" + }, + "description": "", + "encodingFormat": "application/x-ipynb+json", + "name": "Trove Newspaper & Gazette Harvester", + "programmingLanguage": { + "@id": "https://www.python.org/downloads/release/python-31012/" + } + }, + { + "@id": "Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb", + "@type": [ + "File", + "SoftwareSourceCode" + ], + "author": [ + { + "@id": "https://orcid.org/0000-0001-7956-4498" + } + ], + "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester", + "conformsTo": { + "@id": "https://purl.archive.org/textcommons/profile#Notebook" + }, + "description": "", + "encodingFormat": "application/x-ipynb+json", + "name": "Using TroveHarvester to get newspaper and gazette articles in bulk", + "programmingLanguage": { + "@id": "https://www.python.org/downloads/release/python-31012/" + } + }, + { + "@id": "Explore-harvested-text-files.ipynb", + "@type": [ + "File", + "SoftwareSourceCode" + ], + "author": [ + { + "@id": "https://orcid.org/0000-0001-7956-4498" + } + ], + "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester", + "conformsTo": { + "@id": "https://purl.archive.org/textcommons/profile#Notebook" + }, + "description": "", + "encodingFormat": "application/x-ipynb+json", + "name": "Explore harvested text files", + "programmingLanguage": { + "@id": "https://www.python.org/downloads/release/python-31012/" + } + }, + { + "@id": "display_harvest_results_using_datasette.ipynb", + "@type": [ + "File", + "SoftwareSourceCode" + ], + "author": 
[ + { + "@id": "https://orcid.org/0000-0001-7956-4498" + } + ], + "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester", + "conformsTo": { + "@id": "https://purl.archive.org/textcommons/profile#Notebook" + }, + "description": "", + "encodingFormat": "application/x-ipynb+json", + "name": "Display the results of a harvest as a searchable database using Datasette", + "programmingLanguage": { + "@id": "https://www.python.org/downloads/release/python-31012/" + } + }, + { + "@id": "Exploring-your-TroveHarvester-data.ipynb", + "@type": [ + "File", + "SoftwareSourceCode" + ], + "author": [ + { + "@id": "https://orcid.org/0000-0001-7956-4498" + } + ], + "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester", + "conformsTo": { + "@id": "https://purl.archive.org/textcommons/profile#Notebook" + }, + "description": "", + "encodingFormat": "application/x-ipynb+json", + "name": "Exploring your harvested data", + "programmingLanguage": { + "@id": "https://www.python.org/downloads/release/python-31012/" + } + }, + { + "@id": "harvest-specific-days.ipynb", + "@type": [ + "File", + "SoftwareSourceCode" + ], + "author": [ + { + "@id": "https://orcid.org/0000-0001-7956-4498" + } + ], + "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester", + "conformsTo": { + "@id": "https://purl.archive.org/textcommons/profile#Notebook" + }, + "description": "", + "encodingFormat": "application/x-ipynb+json", + "name": "Harvesting articles that mention \"Anzac Day\" on Anzac Day", + "programmingLanguage": { + "@id": "https://www.python.org/downloads/release/python-31012/" + } + }, + { + "@id": "https://orcid.org/0000-0001-7956-4498", + "@type": "Person", + "name": "Sherratt, Tim" + }, + { + "@id": "https://spdx.org/licenses/MIT", + "@type": "CreativeWork", + "name": "MIT License", + "url": "https://spdx.org/licenses/MIT.html" + }, + { + "@id": "https://creativecommons.org/publicdomain/zero/1.0/", + "@type": "CreativeWork", + 
"name": "CC0 Public Domain Dedication", + "url": "https://creativecommons.org/publicdomain/zero/1.0/" + }, + { + "@id": "https://www.python.org/downloads/release/python-31012/", + "@type": [ + "ComputerLanguage", + "SoftwareApplication" + ], + "name": "Python 3.10.12", + "url": "https://www.python.org/downloads/release/python-31012/", + "version": "3.10.12" + }, + { + "@id": "#create_version_v2_0_1", + "@type": "UpdateAction", + "actionStatus": { + "@id": "http://schema.org/CompletedActionStatus" + }, + "endDate": "2023-10-25", + "name": "Create version v2.0.1" + } + ] +} \ No newline at end of file diff --git a/scripts/extract_metadata.py b/scripts/extract_metadata.py index 07258e2..30c61f9 100644 --- a/scripts/extract_metadata.py +++ b/scripts/extract_metadata.py @@ -2,39 +2,24 @@ from pathlib import Path from typing import Any, Dict, List, Optional import nbformat +import re AuthorInfo = Dict[str, str] -DEFAULT_AUTHOR = { - "name": "Unknown", - "orcid": "https://orcid.org/0000-0000-0000-0000", -} -CREATORS_KEY = "creators" LISTIFY = ["author", "object", "input"] -def extract_metadata(metadata): - with open(metadata) as file: - data = json.load(file) - return data - -def extract_default_authors(metadata: Path) -> List[AuthorInfo]: - """Attempts to extract author information from the metadata.json file within - the repository. If none are found, returns a dummy value. 
- - Parameters: - metadata: The path to the metadata file, commonly metadata.json - """ - with open(metadata) as file: - data = json.load(file) - - return data.get(CREATORS_KEY, [DEFAULT_AUTHOR]) - def listify(value): if not isinstance(value, list): return [value] return value +def extract_notebook_title(nb): + md_cells = [c for c in nb.cells if c["cell_type"] == "markdown"] + for cell in md_cells: + if title := re.search(r"^# (.+)(\n|$)", cell["source"]): + return title.group(1) + def extract_notebook_metadata(notebook: Path, keys: Dict[str, Any]) -> Dict[str, Any]: """Attempts to extract metadata from the notebook. @@ -46,16 +31,6 @@ def extract_notebook_metadata(notebook: Path, keys: Dict[str, Any]) -> Dict[str, Returns: A dictionary containing the retrieved metadata for each key. """ - """ - with open(notebook) as file: - data = json.load(file) - - metadata = data["metadata"] - result = {} - - for key, default in keys.items(): - result[key] = metadata.get(key, default) - """ result = {} nb = nbformat.read(notebook, nbformat.NO_CONVERT) metadata = nb.metadata.rocrate diff --git a/scripts/update_crate.py b/scripts/update_crate.py index d86bf66..b245b65 100755 --- a/scripts/update_crate.py +++ b/scripts/update_crate.py @@ -3,6 +3,7 @@ import datetime import requests from giturlparse import parse as ghparse +from git import Repo from pathlib import Path from typing import Union, List, Dict, Tuple import mimetypes @@ -38,6 +39,11 @@ "@type": ["ComputerLanguage", "SoftwareApplication"], } +DEFAULT_AUTHORS = [{ + "name": "Sherratt, Tim", + "orcid": "0000-0001-7956-4498" +}] + def main(version: str): # Make working directory the parent of the scripts directory @@ -45,7 +51,7 @@ def main(version: str): # Get a list of paths to notebooks in the cwd notebooks = get_notebooks() - + print(notebooks) # Update the crate update_crate(version, notebooks) @@ -59,8 +65,9 @@ def get_notebooks() -> List[Path]: Returns: Paths of the notebooks found in the directory """ - files = 
[Path(file) for file in os.listdir()] - is_notebook = lambda file: file.suffix == NOTEBOOK_EXTENSION + # files = [Path(file) for file in os.listdir()] + files = Path(".").glob("*.ipynb") + is_notebook = lambda file: not file.name.lower().startswith(("draft", "untitled")) return list(filter(is_notebook, files)) @@ -300,6 +307,7 @@ def add_notebook(crate: ROCrate, notebook: Path) -> None: "result": [], }, ) + print(notebook_metadata) # Check if this notebook is already in the crate nb_current = crate.get(notebook.name) @@ -313,8 +321,7 @@ def add_notebook(crate: ROCrate, notebook: Path) -> None: properties.update( { "name": notebook_metadata["name"], - "description": notebook_metadata["description"], - "author": [], + "description": notebook_metadata["description"] } ) else: @@ -342,21 +349,22 @@ def add_notebook(crate: ROCrate, notebook: Path) -> None: nb_new = crate.add_file(notebook, properties=properties) # Add a CreateAction that links the notebook run with the input and output files - add_action(crate, nb_new, input_files, output_files) + if input_files or output_files: + add_action(crate, nb_new, input_files, output_files) # If the notebook has author info, add people to crate if notebook_metadata["author"]: # Add people referenced in notebook metadata persons = add_people(crate, notebook_metadata["author"]) - # If people are not already attached to notebook, append them to the author property - for person in persons: - if person not in nb_current["author"]: - nb_new.append_to("author", person) - # Otherwise add crate root authors to notebook else: - nb_new.append_to("author", root["author"]) + persons = root["author"] + + # If people are not already attached to notebook, append them to the author property + for person in persons: + if (nb_current and person not in nb_current.get("author", [])) or not nb_current: + nb_new.append_to("author", person) def remove_deleted_files(crate: ROCrate) -> None: @@ -442,7 +450,23 @@ def update_crate(version: str, notebooks: 
List[Path]) -> None: notebooks: The notebooks to include in the crate """ # Load existing crate from cwd - crate = ROCrate(source="./") + try: + crate = ROCrate(source="./") + except ValueError: + crate = ROCrate() + repo = Repo(".") + repo_url = repo.git.config("--get", "remote.origin.url").rstrip(".git") + repo_name = repo_url.split("/")[-1] + crate.update_jsonld( + { + "@id": "./", + "name": repo_name, + "description": "A GLAM Workbench repository", + "url": repo_url, + "author": id_ify([a["orcid"] for a in DEFAULT_AUTHORS]) + } + ) + add_people(crate, DEFAULT_AUTHORS) # If this is a new version, change version number and add UpdateAction if version: diff --git a/scripts/update_version.sh b/scripts/update_version.sh index e198fda..3824b95 100755 --- a/scripts/update_version.sh +++ b/scripts/update_version.sh @@ -11,4 +11,4 @@ jq --arg text "$text" '.description = $text' .zenodo.json \ | jq --arg pdate "$pdate" '.publication_date = $pdate' > zenodo.json; rm .zenodo.json; mv zenodo.json .zenodo.json; -#python scripts/update_crate.py --version $1 \ No newline at end of file +python scripts/update_crate.py --version $1 \ No newline at end of file