diff --git a/.gitignore b/.gitignore
index 5861864..0cec4f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@ getting-started/my_nma_dataset.csv
anzac-day*
front_pages*
Untitled*
+scripts/add_nb_metadata.py
diff --git a/.zenodo.json b/.zenodo.json
index ff16b28..237832b 100644
--- a/.zenodo.json
+++ b/.zenodo.json
@@ -5,7 +5,7 @@
"related_identifiers": [
{
"scheme": "url",
- "identifier": "https://github.com/GLAM-Workbench/trove-newspaper-harvester/tree/v2.0.0",
+ "identifier": "https://github.com/GLAM-Workbench/trove-newspaper-harvester/tree/v2.0.1",
"relation": "isDerivedFrom",
"resource_type": "software"
},
@@ -22,7 +22,7 @@
"resource_type": "other"
}
],
- "version": "v2.0.0",
+ "version": "v2.0.1",
"upload_type": "software",
"keywords": [
"Trove",
@@ -30,7 +30,7 @@
"Jupyter",
"GLAM Workbench"
],
- "publication_date": "2023-08-31",
+ "publication_date": "2023-10-25",
"creators": [
{
"orcid": "0000-0001-7956-4498",
@@ -38,5 +38,5 @@
}
],
"access_right": "open",
- "description": "
Current version: v2.0.0
The Trove Newspaper & Gazette Harvester Harvester makes it easy to download large quantities of digitised articles from Trove’s newspapers and gazettes. Just give it a search from the Trove web interface, and the harvester will save the metadata of all the articles in a CSV (spreadsheet) file for further analysis. You can also save the full text of every article, as well as copies of the articles as JPG images, and even PDFs. While the web interface will only show you the first 2,000 results matching your search, the Newspaper & Gazette Harvester will get everything.
The Jupyter notebooks in this repository use the Trove Newspaper and Gazette Harvester to download large quantities of digitised newspaper articles from Trove. There’s also a few examples of how you can analyse and explore the harvested data.
The notebooks include:
- Using TroveHarvester to get newspaper articles in bulk — an easy introduction to the TroveHarvester tool
- Trove Harvester web app — a simple web interface to the TroveHarvester, the easiest way to harvest data from Trove (runs in Voila)
- Harvesting articles that mention “Anzac Day” on Anzac Day – import the Harvester as a Python library to harvest a complex search
- Display the results of a harvest as a searchable database using Datasette – load your harvested data into a SQLite database and explore it using Datasette
- Exploring your TroveHarvester data — use Pandas to analyse your data and create some visualisations
- Explore harvested text files (experimental) — analyse the full text content of harvested articles
See the GLAM Workbench for more details.
Cite as
See the GLAM Workbench or Zenodo for up-to-date citation details.
This repository is part of the GLAM Workbench.
If you think this project is worthwhile, you might like to sponsor me on GitHub.
"
+ "description": "Current version: v2.0.1
The Trove Newspaper & Gazette Harvester Harvester makes it easy to download large quantities of digitised articles from Trove’s newspapers and gazettes. Just give it a search from the Trove web interface, and the harvester will save the metadata of all the articles in a CSV (spreadsheet) file for further analysis. You can also save the full text of every article, as well as copies of the articles as JPG images, and even PDFs. While the web interface will only show you the first 2,000 results matching your search, the Newspaper & Gazette Harvester will get everything.
The Jupyter notebooks in this repository use the Trove Newspaper and Gazette Harvester to download large quantities of digitised newspaper articles from Trove. There’s also a few examples of how you can analyse and explore the harvested data.
The notebooks include:
- Using TroveHarvester to get newspaper articles in bulk — an easy introduction to the TroveHarvester tool
- Trove Harvester web app — a simple web interface to the TroveHarvester, the easiest way to harvest data from Trove (runs in Voila)
- Harvesting articles that mention “Anzac Day” on Anzac Day – import the Harvester as a Python library to harvest a complex search
- Display the results of a harvest as a searchable database using Datasette – load your harvested data into a SQLite database and explore it using Datasette
- Exploring your TroveHarvester data — use Pandas to analyse your data and create some visualisations
- Explore harvested text files (experimental) — analyse the full text content of harvested articles
See the GLAM Workbench for more details.
Cite as
See the GLAM Workbench or Zenodo for up-to-date citation details.
This repository is part of the GLAM Workbench.
If you think this project is worthwhile, you might like to sponsor me on GitHub.
"
}
diff --git a/Explore-harvested-text-files.ipynb b/Explore-harvested-text-files.ipynb
index b55e59e..4dab10e 100644
--- a/Explore-harvested-text-files.ipynb
+++ b/Explore-harvested-text-files.ipynb
@@ -1399,6 +1399,15 @@
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
+ "rocrate": {
+ "author": [
+ {
+ "name": "Sherratt, Tim",
+ "orcid": "https://orcid.org/0000-0001-7956-4498"
+ }
+ ],
+ "name": "Explore harvested text files"
+ },
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
diff --git a/Exploring-your-TroveHarvester-data.ipynb b/Exploring-your-TroveHarvester-data.ipynb
index deeb764..3f9d417 100644
--- a/Exploring-your-TroveHarvester-data.ipynb
+++ b/Exploring-your-TroveHarvester-data.ipynb
@@ -1673,6 +1673,15 @@
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
+ "rocrate": {
+ "author": [
+ {
+ "name": "Sherratt, Tim",
+ "orcid": "https://orcid.org/0000-0001-7956-4498"
+ }
+ ],
+ "name": "Exploring your harvested data"
+ },
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
diff --git a/README.md b/README.md
index d075d43..c188059 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# Trove Newspaper and Gazette Harvester
-Current version: [v2.0.0](https://github.com/GLAM-Workbench/trove-newspaper-harvester/releases/tag/v2.0.0)
+Current version: [v2.0.1](https://github.com/GLAM-Workbench/trove-newspaper-harvester/releases/tag/v2.0.1)
The [Trove Newspaper & Gazette Harvester Harvester](https://pypi.org/project/trove-newspaper-harvester/) makes it easy to download large quantities of digitised articles from Trove's newspapers and gazettes. Just give it a search from the Trove web interface, and the harvester will save the metadata of all the articles in a CSV (spreadsheet) file for further analysis. You can also save the full text of every article, as well as copies of the articles as JPG images, and even PDFs. While the web interface will only show you the first 2,000 results matching your search, the Newspaper & Gazette Harvester will get **everything**.
diff --git a/Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb b/Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb
index ddbf06c..97294ea 100644
--- a/Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb
+++ b/Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb
@@ -423,6 +423,15 @@
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
+ "rocrate": {
+ "author": [
+ {
+ "name": "Sherratt, Tim",
+ "orcid": "https://orcid.org/0000-0001-7956-4498"
+ }
+ ],
+ "name": "Using TroveHarvester to get newspaper and gazette articles in bulk"
+ },
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
diff --git a/display_harvest_results_using_datasette.ipynb b/display_harvest_results_using_datasette.ipynb
index 578f799..5b0a9a7 100644
--- a/display_harvest_results_using_datasette.ipynb
+++ b/display_harvest_results_using_datasette.ipynb
@@ -341,6 +341,15 @@
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
+ "rocrate": {
+ "author": [
+ {
+ "name": "Sherratt, Tim",
+ "orcid": "https://orcid.org/0000-0001-7956-4498"
+ }
+ ],
+ "name": "Display the results of a harvest as a searchable database using Datasette"
+ },
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
diff --git a/harvest-specific-days.ipynb b/harvest-specific-days.ipynb
index 9903999..b2b5aba 100644
--- a/harvest-specific-days.ipynb
+++ b/harvest-specific-days.ipynb
@@ -533,6 +533,15 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
+ },
+ "rocrate": {
+ "author": [
+ {
+ "name": "Sherratt, Tim",
+ "orcid": "https://orcid.org/0000-0001-7956-4498"
+ }
+ ],
+ "name": "Harvesting articles that mention \"Anzac Day\" on Anzac Day"
}
},
"nbformat": 4,
diff --git a/newspaper_harvester_app.ipynb b/newspaper_harvester_app.ipynb
index 3c1de9e..3cf423e 100644
--- a/newspaper_harvester_app.ipynb
+++ b/newspaper_harvester_app.ipynb
@@ -289,6 +289,15 @@
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
+ "rocrate": {
+ "author": [
+ {
+ "name": "Sherratt, Tim",
+ "orcid": "https://orcid.org/0000-0001-7956-4498"
+ }
+ ],
+ "name": "Trove Newspaper & Gazette Harvester"
+ },
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {
diff --git a/requirements-dev.in b/requirements-dev.in
index 61a8817..648b628 100644
--- a/requirements-dev.in
+++ b/requirements-dev.in
@@ -5,4 +5,8 @@ nbqa
black[jupyter]
isort
flake8
-pre-commit
\ No newline at end of file
+pre-commit
+rocrate
+giturlparse
+jupyterlab-code-formatter
+gitpython
\ No newline at end of file
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 22c2617..4aa258f 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -4,13 +4,46 @@
#
# pip-compile requirements-dev.in
#
+aiohttp==3.8.5
+ # via
+ # -c requirements.txt
+ # tuspy
+aiosignal==1.3.1
+ # via
+ # -c requirements.txt
+ # aiohttp
+anyio==3.7.1
+ # via
+ # -c requirements.txt
+ # jupyter-server
+arcp==0.2.1
+ # via
+ # -c requirements.txt
+ # rocrate
+argon2-cffi==23.1.0
+ # via
+ # -c requirements.txt
+ # jupyter-server
+argon2-cffi-bindings==21.2.0
+ # via
+ # -c requirements.txt
+ # argon2-cffi
+arrow==1.2.3
+ # via
+ # -c requirements.txt
+ # isoduration
asttokens==2.2.1
# via
# -c requirements.txt
# stack-data
+async-timeout==4.0.3
+ # via
+ # -c requirements.txt
+ # aiohttp
attrs==23.1.0
# via
# -c requirements.txt
+ # aiohttp
# jsonschema
# referencing
autopep8==2.0.4
@@ -19,19 +52,49 @@ backcall==0.2.0
# via
# -c requirements.txt
# ipython
-black[jupyter]==23.7.0
+beautifulsoup4==4.12.2
+ # via
+ # -c requirements.txt
+ # nbconvert
+bioblend==1.2.0
+ # via
+ # -c requirements.txt
+ # gxformat2
+black[jupyter]==23.10.0
# via -r requirements-dev.in
+bleach==6.0.0
+ # via
+ # -c requirements.txt
+ # nbconvert
+cachecontrol[filecache]==0.13.1
+ # via
+ # -c requirements.txt
+ # schema-salad
+certifi==2023.7.22
+ # via
+ # -c requirements.txt
+ # requests
+cffi==1.15.1
+ # via
+ # -c requirements.txt
+ # argon2-cffi-bindings
cfgv==3.4.0
# via pre-commit
+charset-normalizer==3.2.0
+ # via
+ # -c requirements.txt
+ # aiohttp
+ # requests
click==8.1.7
# via
# -c requirements.txt
# black
+ # rocrate
comm==0.1.4
# via
# -c requirements.txt
# ipykernel
-coverage==7.3.0
+coverage==7.3.2
# via nbval
debugpy==1.6.7.post1
# via
@@ -41,11 +104,16 @@ decorator==5.1.1
# via
# -c requirements.txt
# ipython
+defusedxml==0.7.1
+ # via
+ # -c requirements.txt
+ # nbconvert
distlib==0.3.7
# via virtualenv
exceptiongroup==1.1.3
# via
# -c requirements.txt
+ # anyio
# pytest
executing==1.2.0
# via
@@ -58,11 +126,50 @@ fastjsonschema==2.18.0
filelock==3.12.3
# via
# -c requirements.txt
+ # cachecontrol
# virtualenv
flake8==6.1.0
# via -r requirements-dev.in
-identify==2.5.27
+fqdn==1.5.1
+ # via
+ # -c requirements.txt
+ # jsonschema
+frozenlist==1.4.0
+ # via
+ # -c requirements.txt
+ # aiohttp
+ # aiosignal
+future==0.18.3
+ # via
+ # -c requirements.txt
+ # tuspy
+galaxy2cwl==0.1.4
+ # via
+ # -c requirements.txt
+ # rocrate
+gitdb==4.0.11
+ # via gitpython
+gitpython==3.1.40
+ # via -r requirements-dev.in
+giturlparse==0.12.0
+ # via -r requirements-dev.in
+gxformat2==0.18.0
+ # via
+ # -c requirements.txt
+ # galaxy2cwl
+identify==2.5.30
# via pre-commit
+idna==3.4
+ # via
+ # -c requirements.txt
+ # anyio
+ # jsonschema
+ # requests
+ # yarl
+importlib-resources==6.0.1
+ # via
+ # -c requirements.txt
+ # schema-salad
iniconfig==2.0.0
# via pytest
ipykernel==6.25.1
@@ -75,15 +182,34 @@ ipython==8.14.0
# black
# ipykernel
# nbqa
+isodate==0.6.1
+ # via
+ # -c requirements.txt
+ # rdflib
+isoduration==20.11.0
+ # via
+ # -c requirements.txt
+ # jsonschema
isort==5.12.0
# via -r requirements-dev.in
jedi==0.19.0
# via
# -c requirements.txt
# ipython
-jsonschema==4.19.0
+jinja2==3.1.2
+ # via
+ # -c requirements.txt
+ # jupyter-server
+ # nbconvert
+ # rocrate
+jsonpointer==2.4
# via
# -c requirements.txt
+ # jsonschema
+jsonschema[format-nongpl]==4.19.0
+ # via
+ # -c requirements.txt
+ # jupyter-events
# nbformat
jsonschema-specifications==2023.7.1
# via
@@ -93,13 +219,41 @@ jupyter-client==8.3.1
# via
# -c requirements.txt
# ipykernel
+ # jupyter-server
+ # nbclient
# nbval
jupyter-core==5.3.1
# via
# -c requirements.txt
# ipykernel
# jupyter-client
+ # jupyter-server
+ # nbclient
+ # nbconvert
# nbformat
+jupyter-events==0.7.0
+ # via
+ # -c requirements.txt
+ # jupyter-server
+jupyter-server==2.7.2
+ # via
+ # -c requirements.txt
+ # jupyterlab-code-formatter
+jupyter-server-terminals==0.4.4
+ # via
+ # -c requirements.txt
+ # jupyter-server
+jupyterlab-code-formatter==2.2.1
+ # via -r requirements-dev.in
+jupyterlab-pygments==0.2.2
+ # via
+ # -c requirements.txt
+ # nbconvert
+markupsafe==2.1.3
+ # via
+ # -c requirements.txt
+ # jinja2
+ # nbconvert
matplotlib-inline==0.1.6
# via
# -c requirements.txt
@@ -107,13 +261,39 @@ matplotlib-inline==0.1.6
# ipython
mccabe==0.7.0
# via flake8
+mistune==2.0.5
+ # via
+ # -c requirements.txt
+ # nbconvert
+ # schema-salad
+msgpack==1.0.5
+ # via
+ # -c requirements.txt
+ # cachecontrol
+multidict==6.0.4
+ # via
+ # -c requirements.txt
+ # aiohttp
+ # yarl
mypy-extensions==1.0.0
# via
# -c requirements.txt
# black
+ # schema-salad
+nbclient==0.7.4
+ # via
+ # -c requirements.txt
+ # nbconvert
+nbconvert==7.7.4
+ # via
+ # -c requirements.txt
+ # jupyter-server
nbformat==5.9.2
# via
# -c requirements.txt
+ # jupyter-server
+ # nbclient
+ # nbconvert
# nbval
nbqa==1.7.0
# via -r requirements-dev.in
@@ -125,12 +305,23 @@ nest-asyncio==1.5.7
# ipykernel
nodeenv==1.8.0
# via pre-commit
+overrides==7.4.0
+ # via
+ # -c requirements.txt
+ # jupyter-server
packaging==23.1
# via
# -c requirements.txt
# black
# ipykernel
+ # jupyter-server
+ # jupyterlab-code-formatter
+ # nbconvert
# pytest
+pandocfilters==1.5.0
+ # via
+ # -c requirements.txt
+ # nbconvert
parso==0.8.3
# via
# -c requirements.txt
@@ -155,8 +346,12 @@ pluggy==1.3.0
# via
# -c requirements.txt
# pytest
-pre-commit==3.3.3
+pre-commit==3.5.0
# via -r requirements-dev.in
+prometheus-client==0.17.1
+ # via
+ # -c requirements.txt
+ # jupyter-server
prompt-toolkit==3.0.39
# via
# -c requirements.txt
@@ -169,56 +364,151 @@ ptyprocess==0.7.0
# via
# -c requirements.txt
# pexpect
+ # terminado
pure-eval==0.2.2
# via
# -c requirements.txt
# stack-data
-pycodestyle==2.11.0
+pycodestyle==2.11.1
# via
# autopep8
# flake8
+pycparser==2.21
+ # via
+ # -c requirements.txt
+ # cffi
pyflakes==3.1.0
# via flake8
pygments==2.16.1
# via
# -c requirements.txt
# ipython
-pytest==7.4.0
+ # nbconvert
+pyparsing==3.0.9
+ # via
+ # -c requirements.txt
+ # rdflib
+pytest==7.4.2
# via
# -r requirements-dev.in
# nbval
python-dateutil==2.8.2
# via
# -c requirements.txt
+ # arrow
# jupyter-client
+ # rocrate
+python-json-logger==2.0.7
+ # via
+ # -c requirements.txt
+ # jupyter-events
pyyaml==6.0.1
# via
# -c requirements.txt
+ # galaxy2cwl
+ # gxformat2
+ # jupyter-events
# pre-commit
pyzmq==25.1.1
# via
# -c requirements.txt
# ipykernel
# jupyter-client
+ # jupyter-server
+rdflib==6.3.2
+ # via
+ # -c requirements.txt
+ # schema-salad
referencing==0.30.2
# via
# -c requirements.txt
# jsonschema
# jsonschema-specifications
+ # jupyter-events
+requests==2.31.0
+ # via
+ # -c requirements.txt
+ # bioblend
+ # cachecontrol
+ # requests-toolbelt
+ # rocrate
+ # schema-salad
+ # tuspy
+requests-toolbelt==1.0.0
+ # via
+ # -c requirements.txt
+ # bioblend
+rfc3339-validator==0.1.4
+ # via
+ # -c requirements.txt
+ # jsonschema
+ # jupyter-events
+rfc3986-validator==0.1.1
+ # via
+ # -c requirements.txt
+ # jsonschema
+ # jupyter-events
+rocrate==0.9.0
+ # via
+ # -c requirements.txt
+ # -r requirements-dev.in
rpds-py==0.10.0
# via
# -c requirements.txt
# jsonschema
# referencing
+ruamel-yaml==0.17.32
+ # via
+ # -c requirements.txt
+ # schema-salad
+ruamel-yaml-clib==0.2.7
+ # via
+ # -c requirements.txt
+ # ruamel-yaml
+schema-salad==8.4.20230808163024
+ # via
+ # -c requirements.txt
+ # gxformat2
+send2trash==1.8.2
+ # via
+ # -c requirements.txt
+ # jupyter-server
six==1.16.0
# via
# -c requirements.txt
# asttokens
+ # bleach
+ # isodate
# python-dateutil
+ # rfc3339-validator
+ # tuspy
+smmap==5.0.1
+ # via gitdb
+sniffio==1.3.0
+ # via
+ # -c requirements.txt
+ # anyio
+soupsieve==2.4.1
+ # via
+ # -c requirements.txt
+ # beautifulsoup4
stack-data==0.6.2
# via
# -c requirements.txt
# ipython
+terminado==0.17.1
+ # via
+ # -c requirements.txt
+ # jupyter-server
+ # jupyter-server-terminals
+tinycss2==1.2.1
+ # via
+ # -c requirements.txt
+ # nbconvert
+tinydb==4.8.0
+ # via
+ # -c requirements.txt
+ # tuspy
tokenize-rt==5.2.0
# via
# black
@@ -235,6 +525,8 @@ tornado==6.3.3
# -c requirements.txt
# ipykernel
# jupyter-client
+ # jupyter-server
+ # terminado
traitlets==5.9.0
# via
# -c requirements.txt
@@ -243,18 +535,53 @@ traitlets==5.9.0
# ipython
# jupyter-client
# jupyter-core
+ # jupyter-events
+ # jupyter-server
# matplotlib-inline
+ # nbclient
+ # nbconvert
# nbformat
+tuspy==1.0.1
+ # via
+ # -c requirements.txt
+ # bioblend
typing-extensions==4.7.1
# via
# -c requirements.txt
+ # bioblend
+ # black
# filelock
-virtualenv==20.24.3
+uri-template==1.3.0
+ # via
+ # -c requirements.txt
+ # jsonschema
+urllib3==2.0.4
+ # via
+ # -c requirements.txt
+ # requests
+virtualenv==20.24.5
# via pre-commit
wcwidth==0.2.6
# via
# -c requirements.txt
# prompt-toolkit
+webcolors==1.13
+ # via
+ # -c requirements.txt
+ # jsonschema
+webencodings==0.5.1
+ # via
+ # -c requirements.txt
+ # bleach
+ # tinycss2
+websocket-client==1.6.2
+ # via
+ # -c requirements.txt
+ # jupyter-server
+yarl==1.9.2
+ # via
+ # -c requirements.txt
+ # aiohttp
# The following packages are considered to be unsafe in a requirements file:
# setuptools
diff --git a/requirements.in b/requirements.in
index fb4b393..0ef345a 100644
--- a/requirements.in
+++ b/requirements.in
@@ -16,7 +16,6 @@ ipywidgets
voila
voila-material @ git+https://github.com/GLAM-Workbench/voila-material.git
wordcloud
-trove-newspaper-harvester
datasette
datasette-media
datasette-json-html
diff --git a/requirements.txt b/requirements.txt
index 15b1c89..48b8d3d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
-# pip-compile requirements.in
+# pip-compile
#
aiofiles==23.2.1
# via datasette
@@ -434,7 +434,7 @@ rfc3986-validator==0.1.1
# via
# jsonschema
# jupyter-events
-rocrate==0.8.0
+rocrate==0.9.0
# via trove-newspaper-harvester
rpds-py==0.10.0
# via
@@ -522,7 +522,7 @@ traitlets==5.9.0
# nbconvert
# nbformat
# voila
-trove-newspaper-harvester==0.7.1
+trove-newspaper-harvester==0.7.2
# via -r requirements.in
trove-newspaper-images==0.2.1
# via trove-newspaper-harvester
diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json
new file mode 100644
index 0000000..0f4deb9
--- /dev/null
+++ b/ro-crate-metadata.json
@@ -0,0 +1,223 @@
+{
+ "@context": "https://w3id.org/ro/crate/1.1/context",
+ "@graph": [
+ {
+ "@id": "./",
+ "@type": "Dataset",
+ "author": [
+ {
+ "@id": "0000-0001-7956-4498"
+ }
+ ],
+ "datePublished": "2023-10-25",
+ "description": "A GLAM Workbench repository",
+ "hasPart": [
+ {
+ "@id": "newspaper_harvester_app.ipynb"
+ },
+ {
+ "@id": "Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb"
+ },
+ {
+ "@id": "Explore-harvested-text-files.ipynb"
+ },
+ {
+ "@id": "display_harvest_results_using_datasette.ipynb"
+ },
+ {
+ "@id": "Exploring-your-TroveHarvester-data.ipynb"
+ },
+ {
+ "@id": "harvest-specific-days.ipynb"
+ }
+ ],
+ "license": {
+ "@id": "https://spdx.org/licenses/MIT"
+ },
+ "name": "trove-newspaper-harvester",
+ "url": "https://github.com/GLAM-Workbench/trove-newspaper-harvester",
+ "version": "v2.0.1"
+ },
+ {
+ "@id": "ro-crate-metadata.json",
+ "@type": "CreativeWork",
+ "about": {
+ "@id": "./"
+ },
+ "conformsTo": {
+ "@id": "https://w3id.org/ro/crate/1.1"
+ },
+ "license": {
+ "@id": "https://creativecommons.org/publicdomain/zero/1.0/"
+ }
+ },
+ {
+ "@id": "newspaper_harvester_app.ipynb",
+ "@type": [
+ "File",
+ "SoftwareSourceCode"
+ ],
+ "author": [
+ {
+ "@id": "https://orcid.org/0000-0001-7956-4498"
+ }
+ ],
+ "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester",
+ "conformsTo": {
+ "@id": "https://purl.archive.org/textcommons/profile#Notebook"
+ },
+ "description": "",
+ "encodingFormat": "application/x-ipynb+json",
+ "name": "Trove Newspaper & Gazette Harvester",
+ "programmingLanguage": {
+ "@id": "https://www.python.org/downloads/release/python-31012/"
+ }
+ },
+ {
+ "@id": "Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb",
+ "@type": [
+ "File",
+ "SoftwareSourceCode"
+ ],
+ "author": [
+ {
+ "@id": "https://orcid.org/0000-0001-7956-4498"
+ }
+ ],
+ "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester",
+ "conformsTo": {
+ "@id": "https://purl.archive.org/textcommons/profile#Notebook"
+ },
+ "description": "",
+ "encodingFormat": "application/x-ipynb+json",
+ "name": "Using TroveHarvester to get newspaper and gazette articles in bulk",
+ "programmingLanguage": {
+ "@id": "https://www.python.org/downloads/release/python-31012/"
+ }
+ },
+ {
+ "@id": "Explore-harvested-text-files.ipynb",
+ "@type": [
+ "File",
+ "SoftwareSourceCode"
+ ],
+ "author": [
+ {
+ "@id": "https://orcid.org/0000-0001-7956-4498"
+ }
+ ],
+ "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester",
+ "conformsTo": {
+ "@id": "https://purl.archive.org/textcommons/profile#Notebook"
+ },
+ "description": "",
+ "encodingFormat": "application/x-ipynb+json",
+ "name": "Explore harvested text files",
+ "programmingLanguage": {
+ "@id": "https://www.python.org/downloads/release/python-31012/"
+ }
+ },
+ {
+ "@id": "display_harvest_results_using_datasette.ipynb",
+ "@type": [
+ "File",
+ "SoftwareSourceCode"
+ ],
+ "author": [
+ {
+ "@id": "https://orcid.org/0000-0001-7956-4498"
+ }
+ ],
+ "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester",
+ "conformsTo": {
+ "@id": "https://purl.archive.org/textcommons/profile#Notebook"
+ },
+ "description": "",
+ "encodingFormat": "application/x-ipynb+json",
+ "name": "Display the results of a harvest as a searchable database using Datasette",
+ "programmingLanguage": {
+ "@id": "https://www.python.org/downloads/release/python-31012/"
+ }
+ },
+ {
+ "@id": "Exploring-your-TroveHarvester-data.ipynb",
+ "@type": [
+ "File",
+ "SoftwareSourceCode"
+ ],
+ "author": [
+ {
+ "@id": "https://orcid.org/0000-0001-7956-4498"
+ }
+ ],
+ "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester",
+ "conformsTo": {
+ "@id": "https://purl.archive.org/textcommons/profile#Notebook"
+ },
+ "description": "",
+ "encodingFormat": "application/x-ipynb+json",
+ "name": "Exploring your harvested data",
+ "programmingLanguage": {
+ "@id": "https://www.python.org/downloads/release/python-31012/"
+ }
+ },
+ {
+ "@id": "harvest-specific-days.ipynb",
+ "@type": [
+ "File",
+ "SoftwareSourceCode"
+ ],
+ "author": [
+ {
+ "@id": "https://orcid.org/0000-0001-7956-4498"
+ }
+ ],
+ "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester",
+ "conformsTo": {
+ "@id": "https://purl.archive.org/textcommons/profile#Notebook"
+ },
+ "description": "",
+ "encodingFormat": "application/x-ipynb+json",
+ "name": "Harvesting articles that mention \"Anzac Day\" on Anzac Day",
+ "programmingLanguage": {
+ "@id": "https://www.python.org/downloads/release/python-31012/"
+ }
+ },
+ {
+ "@id": "https://orcid.org/0000-0001-7956-4498",
+ "@type": "Person",
+ "name": "Sherratt, Tim"
+ },
+ {
+ "@id": "https://spdx.org/licenses/MIT",
+ "@type": "CreativeWork",
+ "name": "MIT License",
+ "url": "https://spdx.org/licenses/MIT.html"
+ },
+ {
+ "@id": "https://creativecommons.org/publicdomain/zero/1.0/",
+ "@type": "CreativeWork",
+ "name": "CC0 Public Domain Dedication",
+ "url": "https://creativecommons.org/publicdomain/zero/1.0/"
+ },
+ {
+ "@id": "https://www.python.org/downloads/release/python-31012/",
+ "@type": [
+ "ComputerLanguage",
+ "SoftwareApplication"
+ ],
+ "name": "Python 3.10.12",
+ "url": "https://www.python.org/downloads/release/python-31012/",
+ "version": "3.10.12"
+ },
+ {
+ "@id": "#create_version_v2_0_1",
+ "@type": "UpdateAction",
+ "actionStatus": {
+ "@id": "http://schema.org/CompletedActionStatus"
+ },
+ "endDate": "2023-10-25",
+ "name": "Create version v2.0.1"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/scripts/extract_metadata.py b/scripts/extract_metadata.py
index 07258e2..30c61f9 100644
--- a/scripts/extract_metadata.py
+++ b/scripts/extract_metadata.py
@@ -2,39 +2,24 @@
from pathlib import Path
from typing import Any, Dict, List, Optional
import nbformat
+import re
AuthorInfo = Dict[str, str]
-DEFAULT_AUTHOR = {
- "name": "Unknown",
- "orcid": "https://orcid.org/0000-0000-0000-0000",
-}
-CREATORS_KEY = "creators"
LISTIFY = ["author", "object", "input"]
-def extract_metadata(metadata):
- with open(metadata) as file:
- data = json.load(file)
- return data
-
-def extract_default_authors(metadata: Path) -> List[AuthorInfo]:
- """Attempts to extract author information from the metadata.json file within
- the repository. If none are found, returns a dummy value.
-
- Parameters:
- metadata: The path to the metadata file, commonly metadata.json
- """
- with open(metadata) as file:
- data = json.load(file)
-
- return data.get(CREATORS_KEY, [DEFAULT_AUTHOR])
-
def listify(value):
if not isinstance(value, list):
return [value]
return value
+def extract_notebook_title(nb):
+ md_cells = [c for c in nb.cells if c["cell_type"] == "markdown"]
+ for cell in md_cells:
+ if title := re.search(r"^# (.+)(\n|$)", cell["source"]):
+ return title.group(1)
+
def extract_notebook_metadata(notebook: Path, keys: Dict[str, Any]) -> Dict[str, Any]:
"""Attempts to extract metadata from the notebook.
@@ -46,16 +31,6 @@ def extract_notebook_metadata(notebook: Path, keys: Dict[str, Any]) -> Dict[str,
Returns:
A dictionary containing the retrieved metadata for each key.
"""
- """
- with open(notebook) as file:
- data = json.load(file)
-
- metadata = data["metadata"]
- result = {}
-
- for key, default in keys.items():
- result[key] = metadata.get(key, default)
- """
result = {}
nb = nbformat.read(notebook, nbformat.NO_CONVERT)
metadata = nb.metadata.rocrate
diff --git a/scripts/update_crate.py b/scripts/update_crate.py
index d86bf66..b245b65 100755
--- a/scripts/update_crate.py
+++ b/scripts/update_crate.py
@@ -3,6 +3,7 @@
import datetime
import requests
from giturlparse import parse as ghparse
+from git import Repo
from pathlib import Path
from typing import Union, List, Dict, Tuple
import mimetypes
@@ -38,6 +39,11 @@
"@type": ["ComputerLanguage", "SoftwareApplication"],
}
+DEFAULT_AUTHORS = [{
+ "name": "Sherratt, Tim",
+ "orcid": "0000-0001-7956-4498"
+}]
+
def main(version: str):
# Make working directory the parent of the scripts directory
@@ -45,7 +51,7 @@ def main(version: str):
# Get a list of paths to notebooks in the cwd
notebooks = get_notebooks()
-
+ print(notebooks)
# Update the crate
update_crate(version, notebooks)
@@ -59,8 +65,9 @@ def get_notebooks() -> List[Path]:
Returns:
Paths of the notebooks found in the directory
"""
- files = [Path(file) for file in os.listdir()]
- is_notebook = lambda file: file.suffix == NOTEBOOK_EXTENSION
+ # files = [Path(file) for file in os.listdir()]
+ files = Path(".").glob("*.ipynb")
+ is_notebook = lambda file: not file.name.lower().startswith(("draft", "untitled"))
return list(filter(is_notebook, files))
@@ -300,6 +307,7 @@ def add_notebook(crate: ROCrate, notebook: Path) -> None:
"result": [],
},
)
+ print(notebook_metadata)
# Check if this notebook is already in the crate
nb_current = crate.get(notebook.name)
@@ -313,8 +321,7 @@ def add_notebook(crate: ROCrate, notebook: Path) -> None:
properties.update(
{
"name": notebook_metadata["name"],
- "description": notebook_metadata["description"],
- "author": [],
+ "description": notebook_metadata["description"]
}
)
else:
@@ -342,21 +349,22 @@ def add_notebook(crate: ROCrate, notebook: Path) -> None:
nb_new = crate.add_file(notebook, properties=properties)
# Add a CreateAction that links the notebook run with the input and output files
- add_action(crate, nb_new, input_files, output_files)
+ if input_files or output_files:
+ add_action(crate, nb_new, input_files, output_files)
# If the notebook has author info, add people to crate
if notebook_metadata["author"]:
# Add people referenced in notebook metadata
persons = add_people(crate, notebook_metadata["author"])
- # If people are not already attached to notebook, append them to the author property
- for person in persons:
- if person not in nb_current["author"]:
- nb_new.append_to("author", person)
-
# Otherwise add crate root authors to notebook
else:
- nb_new.append_to("author", root["author"])
+ persons = root["author"]
+
+ # If people are not already attached to notebook, append them to the author property
+ for person in persons:
+ if (nb_current and person not in nb_current.get("author", [])) or not nb_current:
+ nb_new.append_to("author", person)
def remove_deleted_files(crate: ROCrate) -> None:
@@ -442,7 +450,23 @@ def update_crate(version: str, notebooks: List[Path]) -> None:
notebooks: The notebooks to include in the crate
"""
# Load existing crate from cwd
- crate = ROCrate(source="./")
+ try:
+ crate = ROCrate(source="./")
+ except ValueError:
+ crate = ROCrate()
+ repo = Repo(".")
+        repo_url = repo.git.config("--get", "remote.origin.url").removesuffix(".git")
+ repo_name = repo_url.split("/")[-1]
+ crate.update_jsonld(
+ {
+ "@id": "./",
+ "name": repo_name,
+ "description": "A GLAM Workbench repository",
+ "url": repo_url,
+ "author": id_ify([a["orcid"] for a in DEFAULT_AUTHORS])
+ }
+ )
+ add_people(crate, DEFAULT_AUTHORS)
# If this is a new version, change version number and add UpdateAction
if version:
diff --git a/scripts/update_version.sh b/scripts/update_version.sh
index e198fda..3824b95 100755
--- a/scripts/update_version.sh
+++ b/scripts/update_version.sh
@@ -11,4 +11,4 @@ jq --arg text "$text" '.description = $text' .zenodo.json \
| jq --arg pdate "$pdate" '.publication_date = $pdate' > zenodo.json;
rm .zenodo.json;
mv zenodo.json .zenodo.json;
-#python scripts/update_crate.py --version $1
\ No newline at end of file
+python scripts/update_crate.py --version $1
\ No newline at end of file