From f897e23a0c17bd80eaf169f2a3641271e0f599fc Mon Sep 17 00:00:00 2001
From: Tim Sherratt <tim@discontents.com.au>
Date: Wed, 25 Oct 2023 22:21:47 +1100
Subject: [PATCH] Update Trove Harvester package and add RO-Crate file

---
 .gitignore                                    |   1 +
 .zenodo.json                                  |   8 +-
 Explore-harvested-text-files.ipynb            |   9 +
 Exploring-your-TroveHarvester-data.ipynb      |   9 +
 README.md                                     |   2 +-
 ...er-to-get-newspaper-articles-in-bulk.ipynb |   9 +
 display_harvest_results_using_datasette.ipynb |   9 +
 harvest-specific-days.ipynb                   |   9 +
 newspaper_harvester_app.ipynb                 |   9 +
 requirements-dev.in                           |   6 +-
 requirements-dev.txt                          | 343 +++++++++++++++++-
 requirements.in                               |   1 -
 requirements.txt                              |   6 +-
 ro-crate-metadata.json                        | 223 ++++++++++++
 scripts/extract_metadata.py                   |  39 +-
 scripts/update_crate.py                       |  50 ++-
 scripts/update_version.sh                     |   2 +-
 17 files changed, 671 insertions(+), 64 deletions(-)
 create mode 100644 ro-crate-metadata.json
diff --git a/.gitignore b/.gitignore
index 5861864..0cec4f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@ getting-started/my_nma_dataset.csv
 anzac-day*
 front_pages*
 Untitled*
+scripts/add_nb_metadata.py
diff --git a/.zenodo.json b/.zenodo.json
index ff16b28..237832b 100644
--- a/.zenodo.json
+++ b/.zenodo.json
@@ -5,7 +5,7 @@
   "related_identifiers": [
     {
       "scheme": "url",
-      "identifier": "https://github.com/GLAM-Workbench/trove-newspaper-harvester/tree/v2.0.0",
+      "identifier": "https://github.com/GLAM-Workbench/trove-newspaper-harvester/tree/v2.0.1",
       "relation": "isDerivedFrom",
       "resource_type": "software"
     },
@@ -22,7 +22,7 @@
       "resource_type": "other"
     }
   ],
-  "version": "v2.0.0",
+  "version": "v2.0.1",
   "upload_type": "software",
   "keywords": [
     "Trove",
@@ -30,7 +30,7 @@
     "Jupyter",
     "GLAM Workbench"
   ],
-  "publication_date": "2023-08-31",
+  "publication_date": "2023-10-25",
   "creators": [
     {
       "orcid": "0000-0001-7956-4498",
@@ -38,5 +38,5 @@
     }
   ],
   "access_right": "open",
-  "description": "<p>Current version: <a href=\"https://github.com/GLAM-Workbench/trove-newspaper-harvester/releases/tag/v2.0.0\">v2.0.0</a></p> <p>The <a href=\"https://pypi.org/project/trove-newspaper-harvester/\">Trove Newspaper &amp; Gazette Harvester Harvester</a> makes it easy to download large quantities of digitised articles from Trove’s newspapers and gazettes. Just give it a search from the Trove web interface, and the harvester will save the metadata of all the articles in a CSV (spreadsheet) file for further analysis. You can also save the full text of every article, as well as copies of the articles as JPG images, and even PDFs. While the web interface will only show you the first 2,000 results matching your search, the Newspaper &amp; Gazette Harvester will get <strong>everything</strong>.</p> <p>The Jupyter notebooks in this repository use the Trove Newspaper and Gazette Harvester to download large quantities of digitised newspaper articles from Trove. There’s also a few examples of how you can analyse and explore the harvested data.</p> <p>The notebooks include:</p> <ul> <li><strong>Using TroveHarvester to get newspaper articles in bulk</strong> — an easy introduction to the TroveHarvester tool</li> <li><strong>Trove Harvester web app</strong> — a simple web interface to the TroveHarvester, the easiest way to harvest data from Trove (runs in Voila)</li> <li><strong>Harvesting articles that mention “Anzac Day” on Anzac Day</strong> – import the Harvester as a Python library to harvest a complex search</li> <li><strong>Display the results of a harvest as a searchable database using Datasette</strong> – load your harvested data into a SQLite database and explore it using Datasette</li> <li><strong>Exploring your TroveHarvester data</strong> — use Pandas to analyse your data and create some visualisations</li> <li><strong>Explore harvested text files</strong> (experimental) — analyse the full text content of harvested articles</li> </ul> <p>See the <a 
href=\"https://glam-workbench.github.io/trove-harvester/\">GLAM Workbench for more details</a>.</p> <h2 id=\"cite-as\">Cite as</h2> <p>See the GLAM Workbench or <a href=\"https://doi.org/10.5281/zenodo.3545044\">Zenodo</a> for up-to-date citation details.</p> <hr /> <p>This repository is part of the <a href=\"https://glam-workbench.github.io/\">GLAM Workbench</a>.<br /> If you think this project is worthwhile, you might like <a href=\"https://github.com/sponsors/wragge?o=esb\">to sponsor me on GitHub</a>.</p>"
+  "description": "<p>Current version: <a href=\"https://github.com/GLAM-Workbench/trove-newspaper-harvester/releases/tag/v2.0.1\">v2.0.1</a></p> <p>The <a href=\"https://pypi.org/project/trove-newspaper-harvester/\">Trove Newspaper &amp; Gazette Harvester Harvester</a> makes it easy to download large quantities of digitised articles from Trove’s newspapers and gazettes. Just give it a search from the Trove web interface, and the harvester will save the metadata of all the articles in a CSV (spreadsheet) file for further analysis. You can also save the full text of every article, as well as copies of the articles as JPG images, and even PDFs. While the web interface will only show you the first 2,000 results matching your search, the Newspaper &amp; Gazette Harvester will get <strong>everything</strong>.</p> <p>The Jupyter notebooks in this repository use the Trove Newspaper and Gazette Harvester to download large quantities of digitised newspaper articles from Trove. There’s also a few examples of how you can analyse and explore the harvested data.</p> <p>The notebooks include:</p> <ul> <li><strong>Using TroveHarvester to get newspaper articles in bulk</strong> — an easy introduction to the TroveHarvester tool</li> <li><strong>Trove Harvester web app</strong> — a simple web interface to the TroveHarvester, the easiest way to harvest data from Trove (runs in Voila)</li> <li><strong>Harvesting articles that mention “Anzac Day” on Anzac Day</strong> – import the Harvester as a Python library to harvest a complex search</li> <li><strong>Display the results of a harvest as a searchable database using Datasette</strong> – load your harvested data into a SQLite database and explore it using Datasette</li> <li><strong>Exploring your TroveHarvester data</strong> — use Pandas to analyse your data and create some visualisations</li> <li><strong>Explore harvested text files</strong> (experimental) — analyse the full text content of harvested articles</li> </ul> <p>See the <a 
href=\"https://glam-workbench.github.io/trove-harvester/\">GLAM Workbench for more details</a>.</p> <h2 id=\"cite-as\">Cite as</h2> <p>See the GLAM Workbench or <a href=\"https://doi.org/10.5281/zenodo.3545044\">Zenodo</a> for up-to-date citation details.</p> <hr /> <p>This repository is part of the <a href=\"https://glam-workbench.github.io/\">GLAM Workbench</a>.<br /> If you think this project is worthwhile, you might like <a href=\"https://github.com/sponsors/wragge?o=esb\">to sponsor me on GitHub</a>.</p>"
 }
diff --git a/Explore-harvested-text-files.ipynb b/Explore-harvested-text-files.ipynb
index b55e59e..4dab10e 100644
--- a/Explore-harvested-text-files.ipynb
+++ b/Explore-harvested-text-files.ipynb
@@ -1399,6 +1399,15 @@
    "pygments_lexer": "ipython3",
    "version": "3.10.12"
   },
+  "rocrate": {
+   "author": [
+    {
+     "name": "Sherratt, Tim",
+     "orcid": "https://orcid.org/0000-0001-7956-4498"
+    }
+   ],
+   "name": "Explore harvested text files"
+  },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {
     "state": {},
diff --git a/Exploring-your-TroveHarvester-data.ipynb b/Exploring-your-TroveHarvester-data.ipynb
index deeb764..3f9d417 100644
--- a/Exploring-your-TroveHarvester-data.ipynb
+++ b/Exploring-your-TroveHarvester-data.ipynb
@@ -1673,6 +1673,15 @@
    "pygments_lexer": "ipython3",
    "version": "3.10.12"
   },
+  "rocrate": {
+   "author": [
+    {
+     "name": "Sherratt, Tim",
+     "orcid": "https://orcid.org/0000-0001-7956-4498"
+    }
+   ],
+   "name": "Exploring your harvested data"
+  },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {
     "state": {},
diff --git a/README.md b/README.md
index d075d43..c188059 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Trove Newspaper and Gazette Harvester
 
-Current version: [v2.0.0](https://github.com/GLAM-Workbench/trove-newspaper-harvester/releases/tag/v2.0.0)
+Current version: [v2.0.1](https://github.com/GLAM-Workbench/trove-newspaper-harvester/releases/tag/v2.0.1)
 
 The [Trove Newspaper & Gazette Harvester Harvester](https://pypi.org/project/trove-newspaper-harvester/) makes it easy to download large quantities of digitised articles from Trove's newspapers and gazettes. Just give it a search from the Trove web interface, and the harvester will save the metadata of all the articles in a CSV (spreadsheet) file for further analysis. You can also save the full text of every article, as well as copies of the articles as JPG images, and even PDFs. While the web interface will only show you the first 2,000 results matching your search, the Newspaper & Gazette Harvester will get **everything**.
 
diff --git a/Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb b/Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb
index ddbf06c..97294ea 100644
--- a/Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb
+++ b/Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb
@@ -423,6 +423,15 @@
    "pygments_lexer": "ipython3",
    "version": "3.10.12"
   },
+  "rocrate": {
+   "author": [
+    {
+     "name": "Sherratt, Tim",
+     "orcid": "https://orcid.org/0000-0001-7956-4498"
+    }
+   ],
+   "name": "Using TroveHarvester to get newspaper and gazette articles in bulk"
+  },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {
     "state": {},
diff --git a/display_harvest_results_using_datasette.ipynb b/display_harvest_results_using_datasette.ipynb
index 578f799..5b0a9a7 100644
--- a/display_harvest_results_using_datasette.ipynb
+++ b/display_harvest_results_using_datasette.ipynb
@@ -341,6 +341,15 @@
    "pygments_lexer": "ipython3",
    "version": "3.10.12"
   },
+  "rocrate": {
+   "author": [
+    {
+     "name": "Sherratt, Tim",
+     "orcid": "https://orcid.org/0000-0001-7956-4498"
+    }
+   ],
+   "name": "Display the results of a harvest as a searchable database using Datasette"
+  },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {
     "state": {},
diff --git a/harvest-specific-days.ipynb b/harvest-specific-days.ipynb
index 9903999..b2b5aba 100644
--- a/harvest-specific-days.ipynb
+++ b/harvest-specific-days.ipynb
@@ -533,6 +533,15 @@
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.10.12"
+  },
+  "rocrate": {
+   "author": [
+    {
+     "name": "Sherratt, Tim",
+     "orcid": "https://orcid.org/0000-0001-7956-4498"
+    }
+   ],
+   "name": "Harvesting articles that mention \"Anzac Day\" on Anzac Day"
   }
  },
  "nbformat": 4,
diff --git a/newspaper_harvester_app.ipynb b/newspaper_harvester_app.ipynb
index 3c1de9e..3cf423e 100644
--- a/newspaper_harvester_app.ipynb
+++ b/newspaper_harvester_app.ipynb
@@ -289,6 +289,15 @@
    "pygments_lexer": "ipython3",
    "version": "3.10.12"
   },
+  "rocrate": {
+   "author": [
+    {
+     "name": "Sherratt, Tim",
+     "orcid": "https://orcid.org/0000-0001-7956-4498"
+    }
+   ],
+   "name": "Trove Newspaper & Gazette Harvester"
+  },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {
     "state": {
diff --git a/requirements-dev.in b/requirements-dev.in
index 61a8817..648b628 100644
--- a/requirements-dev.in
+++ b/requirements-dev.in
@@ -5,4 +5,8 @@ nbqa
 black[jupyter]
 isort
 flake8
-pre-commit
\ No newline at end of file
+pre-commit
+rocrate
+giturlparse
+jupyterlab-code-formatter
+gitpython
\ No newline at end of file
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 22c2617..4aa258f 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -4,13 +4,46 @@
 #
 #    pip-compile requirements-dev.in
 #
+aiohttp==3.8.5
+    # via
+    #   -c requirements.txt
+    #   tuspy
+aiosignal==1.3.1
+    # via
+    #   -c requirements.txt
+    #   aiohttp
+anyio==3.7.1
+    # via
+    #   -c requirements.txt
+    #   jupyter-server
+arcp==0.2.1
+    # via
+    #   -c requirements.txt
+    #   rocrate
+argon2-cffi==23.1.0
+    # via
+    #   -c requirements.txt
+    #   jupyter-server
+argon2-cffi-bindings==21.2.0
+    # via
+    #   -c requirements.txt
+    #   argon2-cffi
+arrow==1.2.3
+    # via
+    #   -c requirements.txt
+    #   isoduration
 asttokens==2.2.1
     # via
     #   -c requirements.txt
     #   stack-data
+async-timeout==4.0.3
+    # via
+    #   -c requirements.txt
+    #   aiohttp
 attrs==23.1.0
     # via
     #   -c requirements.txt
+    #   aiohttp
     #   jsonschema
     #   referencing
 autopep8==2.0.4
@@ -19,19 +52,49 @@ backcall==0.2.0
     # via
     #   -c requirements.txt
     #   ipython
-black[jupyter]==23.7.0
+beautifulsoup4==4.12.2
+    # via
+    #   -c requirements.txt
+    #   nbconvert
+bioblend==1.2.0
+    # via
+    #   -c requirements.txt
+    #   gxformat2
+black[jupyter]==23.10.0
     # via -r requirements-dev.in
+bleach==6.0.0
+    # via
+    #   -c requirements.txt
+    #   nbconvert
+cachecontrol[filecache]==0.13.1
+    # via
+    #   -c requirements.txt
+    #   schema-salad
+certifi==2023.7.22
+    # via
+    #   -c requirements.txt
+    #   requests
+cffi==1.15.1
+    # via
+    #   -c requirements.txt
+    #   argon2-cffi-bindings
 cfgv==3.4.0
     # via pre-commit
+charset-normalizer==3.2.0
+    # via
+    #   -c requirements.txt
+    #   aiohttp
+    #   requests
 click==8.1.7
     # via
     #   -c requirements.txt
     #   black
+    #   rocrate
 comm==0.1.4
     # via
     #   -c requirements.txt
     #   ipykernel
-coverage==7.3.0
+coverage==7.3.2
     # via nbval
 debugpy==1.6.7.post1
     # via
@@ -41,11 +104,16 @@ decorator==5.1.1
     # via
     #   -c requirements.txt
     #   ipython
+defusedxml==0.7.1
+    # via
+    #   -c requirements.txt
+    #   nbconvert
 distlib==0.3.7
     # via virtualenv
 exceptiongroup==1.1.3
     # via
     #   -c requirements.txt
+    #   anyio
     #   pytest
 executing==1.2.0
     # via
@@ -58,11 +126,50 @@ fastjsonschema==2.18.0
 filelock==3.12.3
     # via
     #   -c requirements.txt
+    #   cachecontrol
     #   virtualenv
 flake8==6.1.0
     # via -r requirements-dev.in
-identify==2.5.27
+fqdn==1.5.1
+    # via
+    #   -c requirements.txt
+    #   jsonschema
+frozenlist==1.4.0
+    # via
+    #   -c requirements.txt
+    #   aiohttp
+    #   aiosignal
+future==0.18.3
+    # via
+    #   -c requirements.txt
+    #   tuspy
+galaxy2cwl==0.1.4
+    # via
+    #   -c requirements.txt
+    #   rocrate
+gitdb==4.0.11
+    # via gitpython
+gitpython==3.1.40
+    # via -r requirements-dev.in
+giturlparse==0.12.0
+    # via -r requirements-dev.in
+gxformat2==0.18.0
+    # via
+    #   -c requirements.txt
+    #   galaxy2cwl
+identify==2.5.30
     # via pre-commit
+idna==3.4
+    # via
+    #   -c requirements.txt
+    #   anyio
+    #   jsonschema
+    #   requests
+    #   yarl
+importlib-resources==6.0.1
+    # via
+    #   -c requirements.txt
+    #   schema-salad
 iniconfig==2.0.0
     # via pytest
 ipykernel==6.25.1
@@ -75,15 +182,34 @@ ipython==8.14.0
     #   black
     #   ipykernel
     #   nbqa
+isodate==0.6.1
+    # via
+    #   -c requirements.txt
+    #   rdflib
+isoduration==20.11.0
+    # via
+    #   -c requirements.txt
+    #   jsonschema
 isort==5.12.0
     # via -r requirements-dev.in
 jedi==0.19.0
     # via
     #   -c requirements.txt
     #   ipython
-jsonschema==4.19.0
+jinja2==3.1.2
+    # via
+    #   -c requirements.txt
+    #   jupyter-server
+    #   nbconvert
+    #   rocrate
+jsonpointer==2.4
     # via
     #   -c requirements.txt
+    #   jsonschema
+jsonschema[format-nongpl]==4.19.0
+    # via
+    #   -c requirements.txt
+    #   jupyter-events
     #   nbformat
 jsonschema-specifications==2023.7.1
     # via
@@ -93,13 +219,41 @@ jupyter-client==8.3.1
     # via
     #   -c requirements.txt
     #   ipykernel
+    #   jupyter-server
+    #   nbclient
     #   nbval
 jupyter-core==5.3.1
     # via
     #   -c requirements.txt
     #   ipykernel
     #   jupyter-client
+    #   jupyter-server
+    #   nbclient
+    #   nbconvert
     #   nbformat
+jupyter-events==0.7.0
+    # via
+    #   -c requirements.txt
+    #   jupyter-server
+jupyter-server==2.7.2
+    # via
+    #   -c requirements.txt
+    #   jupyterlab-code-formatter
+jupyter-server-terminals==0.4.4
+    # via
+    #   -c requirements.txt
+    #   jupyter-server
+jupyterlab-code-formatter==2.2.1
+    # via -r requirements-dev.in
+jupyterlab-pygments==0.2.2
+    # via
+    #   -c requirements.txt
+    #   nbconvert
+markupsafe==2.1.3
+    # via
+    #   -c requirements.txt
+    #   jinja2
+    #   nbconvert
 matplotlib-inline==0.1.6
     # via
     #   -c requirements.txt
@@ -107,13 +261,39 @@ matplotlib-inline==0.1.6
     #   ipython
 mccabe==0.7.0
     # via flake8
+mistune==2.0.5
+    # via
+    #   -c requirements.txt
+    #   nbconvert
+    #   schema-salad
+msgpack==1.0.5
+    # via
+    #   -c requirements.txt
+    #   cachecontrol
+multidict==6.0.4
+    # via
+    #   -c requirements.txt
+    #   aiohttp
+    #   yarl
 mypy-extensions==1.0.0
     # via
     #   -c requirements.txt
     #   black
+    #   schema-salad
+nbclient==0.7.4
+    # via
+    #   -c requirements.txt
+    #   nbconvert
+nbconvert==7.7.4
+    # via
+    #   -c requirements.txt
+    #   jupyter-server
 nbformat==5.9.2
     # via
     #   -c requirements.txt
+    #   jupyter-server
+    #   nbclient
+    #   nbconvert
     #   nbval
 nbqa==1.7.0
     # via -r requirements-dev.in
@@ -125,12 +305,23 @@ nest-asyncio==1.5.7
     #   ipykernel
 nodeenv==1.8.0
     # via pre-commit
+overrides==7.4.0
+    # via
+    #   -c requirements.txt
+    #   jupyter-server
 packaging==23.1
     # via
     #   -c requirements.txt
     #   black
     #   ipykernel
+    #   jupyter-server
+    #   jupyterlab-code-formatter
+    #   nbconvert
     #   pytest
+pandocfilters==1.5.0
+    # via
+    #   -c requirements.txt
+    #   nbconvert
 parso==0.8.3
     # via
     #   -c requirements.txt
@@ -155,8 +346,12 @@ pluggy==1.3.0
     # via
     #   -c requirements.txt
     #   pytest
-pre-commit==3.3.3
+pre-commit==3.5.0
     # via -r requirements-dev.in
+prometheus-client==0.17.1
+    # via
+    #   -c requirements.txt
+    #   jupyter-server
 prompt-toolkit==3.0.39
     # via
     #   -c requirements.txt
@@ -169,56 +364,151 @@ ptyprocess==0.7.0
     # via
     #   -c requirements.txt
     #   pexpect
+    #   terminado
 pure-eval==0.2.2
     # via
     #   -c requirements.txt
     #   stack-data
-pycodestyle==2.11.0
+pycodestyle==2.11.1
     # via
     #   autopep8
     #   flake8
+pycparser==2.21
+    # via
+    #   -c requirements.txt
+    #   cffi
 pyflakes==3.1.0
     # via flake8
 pygments==2.16.1
     # via
     #   -c requirements.txt
     #   ipython
-pytest==7.4.0
+    #   nbconvert
+pyparsing==3.0.9
+    # via
+    #   -c requirements.txt
+    #   rdflib
+pytest==7.4.2
     # via
     #   -r requirements-dev.in
     #   nbval
 python-dateutil==2.8.2
     # via
     #   -c requirements.txt
+    #   arrow
     #   jupyter-client
+    #   rocrate
+python-json-logger==2.0.7
+    # via
+    #   -c requirements.txt
+    #   jupyter-events
 pyyaml==6.0.1
     # via
     #   -c requirements.txt
+    #   galaxy2cwl
+    #   gxformat2
+    #   jupyter-events
     #   pre-commit
 pyzmq==25.1.1
     # via
     #   -c requirements.txt
     #   ipykernel
     #   jupyter-client
+    #   jupyter-server
+rdflib==6.3.2
+    # via
+    #   -c requirements.txt
+    #   schema-salad
 referencing==0.30.2
     # via
     #   -c requirements.txt
     #   jsonschema
     #   jsonschema-specifications
+    #   jupyter-events
+requests==2.31.0
+    # via
+    #   -c requirements.txt
+    #   bioblend
+    #   cachecontrol
+    #   requests-toolbelt
+    #   rocrate
+    #   schema-salad
+    #   tuspy
+requests-toolbelt==1.0.0
+    # via
+    #   -c requirements.txt
+    #   bioblend
+rfc3339-validator==0.1.4
+    # via
+    #   -c requirements.txt
+    #   jsonschema
+    #   jupyter-events
+rfc3986-validator==0.1.1
+    # via
+    #   -c requirements.txt
+    #   jsonschema
+    #   jupyter-events
+rocrate==0.9.0
+    # via
+    #   -c requirements.txt
+    #   -r requirements-dev.in
 rpds-py==0.10.0
     # via
     #   -c requirements.txt
     #   jsonschema
     #   referencing
+ruamel-yaml==0.17.32
+    # via
+    #   -c requirements.txt
+    #   schema-salad
+ruamel-yaml-clib==0.2.7
+    # via
+    #   -c requirements.txt
+    #   ruamel-yaml
+schema-salad==8.4.20230808163024
+    # via
+    #   -c requirements.txt
+    #   gxformat2
+send2trash==1.8.2
+    # via
+    #   -c requirements.txt
+    #   jupyter-server
 six==1.16.0
     # via
     #   -c requirements.txt
     #   asttokens
+    #   bleach
+    #   isodate
     #   python-dateutil
+    #   rfc3339-validator
+    #   tuspy
+smmap==5.0.1
+    # via gitdb
+sniffio==1.3.0
+    # via
+    #   -c requirements.txt
+    #   anyio
+soupsieve==2.4.1
+    # via
+    #   -c requirements.txt
+    #   beautifulsoup4
 stack-data==0.6.2
     # via
     #   -c requirements.txt
     #   ipython
+terminado==0.17.1
+    # via
+    #   -c requirements.txt
+    #   jupyter-server
+    #   jupyter-server-terminals
+tinycss2==1.2.1
+    # via
+    #   -c requirements.txt
+    #   nbconvert
+tinydb==4.8.0
+    # via
+    #   -c requirements.txt
+    #   tuspy
 tokenize-rt==5.2.0
     # via
     #   black
@@ -235,6 +525,8 @@ tornado==6.3.3
     #   -c requirements.txt
     #   ipykernel
     #   jupyter-client
+    #   jupyter-server
+    #   terminado
 traitlets==5.9.0
     # via
     #   -c requirements.txt
@@ -243,18 +535,53 @@ traitlets==5.9.0
     #   ipython
     #   jupyter-client
     #   jupyter-core
+    #   jupyter-events
+    #   jupyter-server
     #   matplotlib-inline
+    #   nbclient
+    #   nbconvert
     #   nbformat
+tuspy==1.0.1
+    # via
+    #   -c requirements.txt
+    #   bioblend
 typing-extensions==4.7.1
     # via
     #   -c requirements.txt
+    #   bioblend
+    #   black
     #   filelock
-virtualenv==20.24.3
+uri-template==1.3.0
+    # via
+    #   -c requirements.txt
+    #   jsonschema
+urllib3==2.0.4
+    # via
+    #   -c requirements.txt
+    #   requests
+virtualenv==20.24.5
     # via pre-commit
 wcwidth==0.2.6
     # via
     #   -c requirements.txt
     #   prompt-toolkit
+webcolors==1.13
+    # via
+    #   -c requirements.txt
+    #   jsonschema
+webencodings==0.5.1
+    # via
+    #   -c requirements.txt
+    #   bleach
+    #   tinycss2
+websocket-client==1.6.2
+    # via
+    #   -c requirements.txt
+    #   jupyter-server
+yarl==1.9.2
+    # via
+    #   -c requirements.txt
+    #   aiohttp
 
 # The following packages are considered to be unsafe in a requirements file:
 # setuptools
diff --git a/requirements.in b/requirements.in
index fb4b393..0ef345a 100644
--- a/requirements.in
+++ b/requirements.in
@@ -16,7 +16,6 @@ ipywidgets
 voila
 voila-material @ git+https://github.com/GLAM-Workbench/voila-material.git
 wordcloud
-trove-newspaper-harvester
 datasette
 datasette-media
 datasette-json-html
diff --git a/requirements.txt b/requirements.txt
index 15b1c89..48b8d3d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-#    pip-compile requirements.in
+#    pip-compile
 #
 aiofiles==23.2.1
     # via datasette
@@ -434,7 +434,7 @@ rfc3986-validator==0.1.1
     # via
     #   jsonschema
     #   jupyter-events
-rocrate==0.8.0
+rocrate==0.9.0
     # via trove-newspaper-harvester
 rpds-py==0.10.0
     # via
@@ -522,7 +522,7 @@ traitlets==5.9.0
     #   nbconvert
     #   nbformat
     #   voila
-trove-newspaper-harvester==0.7.1
+trove-newspaper-harvester==0.7.2
     # via -r requirements.in
 trove-newspaper-images==0.2.1
     # via trove-newspaper-harvester
diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json
new file mode 100644
index 0000000..0f4deb9
--- /dev/null
+++ b/ro-crate-metadata.json
@@ -0,0 +1,223 @@
+{
+    "@context": "https://w3id.org/ro/crate/1.1/context",
+    "@graph": [
+        {
+            "@id": "./",
+            "@type": "Dataset",
+            "author": [
+                {
+                    "@id": "https://orcid.org/0000-0001-7956-4498"
+                }
+            ],
+            "datePublished": "2023-10-25",
+            "description": "A GLAM Workbench repository",
+            "hasPart": [
+                {
+                    "@id": "newspaper_harvester_app.ipynb"
+                },
+                {
+                    "@id": "Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb"
+                },
+                {
+                    "@id": "Explore-harvested-text-files.ipynb"
+                },
+                {
+                    "@id": "display_harvest_results_using_datasette.ipynb"
+                },
+                {
+                    "@id": "Exploring-your-TroveHarvester-data.ipynb"
+                },
+                {
+                    "@id": "harvest-specific-days.ipynb"
+                }
+            ],
+            "license": {
+                "@id": "https://spdx.org/licenses/MIT"
+            },
+            "name": "trove-newspaper-harvester",
+            "url": "https://github.com/GLAM-Workbench/trove-newspaper-harvester",
+            "version": "v2.0.1"
+        },
+        {
+            "@id": "ro-crate-metadata.json",
+            "@type": "CreativeWork",
+            "about": {
+                "@id": "./"
+            },
+            "conformsTo": {
+                "@id": "https://w3id.org/ro/crate/1.1"
+            },
+            "license": {
+                "@id": "https://creativecommons.org/publicdomain/zero/1.0/"
+            }
+        },
+        {
+            "@id": "newspaper_harvester_app.ipynb",
+            "@type": [
+                "File",
+                "SoftwareSourceCode"
+            ],
+            "author": [
+                {
+                    "@id": "https://orcid.org/0000-0001-7956-4498"
+                }
+            ],
+            "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester",
+            "conformsTo": {
+                "@id": "https://purl.archive.org/textcommons/profile#Notebook"
+            },
+            "description": "",
+            "encodingFormat": "application/x-ipynb+json",
+            "name": "Trove Newspaper & Gazette Harvester",
+            "programmingLanguage": {
+                "@id": "https://www.python.org/downloads/release/python-31012/"
+            }
+        },
+        {
+            "@id": "Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb",
+            "@type": [
+                "File",
+                "SoftwareSourceCode"
+            ],
+            "author": [
+                {
+                    "@id": "https://orcid.org/0000-0001-7956-4498"
+                }
+            ],
+            "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester",
+            "conformsTo": {
+                "@id": "https://purl.archive.org/textcommons/profile#Notebook"
+            },
+            "description": "",
+            "encodingFormat": "application/x-ipynb+json",
+            "name": "Using TroveHarvester to get newspaper and gazette articles in bulk",
+            "programmingLanguage": {
+                "@id": "https://www.python.org/downloads/release/python-31012/"
+            }
+        },
+        {
+            "@id": "Explore-harvested-text-files.ipynb",
+            "@type": [
+                "File",
+                "SoftwareSourceCode"
+            ],
+            "author": [
+                {
+                    "@id": "https://orcid.org/0000-0001-7956-4498"
+                }
+            ],
+            "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester",
+            "conformsTo": {
+                "@id": "https://purl.archive.org/textcommons/profile#Notebook"
+            },
+            "description": "",
+            "encodingFormat": "application/x-ipynb+json",
+            "name": "Explore harvested text files",
+            "programmingLanguage": {
+                "@id": "https://www.python.org/downloads/release/python-31012/"
+            }
+        },
+        {
+            "@id": "display_harvest_results_using_datasette.ipynb",
+            "@type": [
+                "File",
+                "SoftwareSourceCode"
+            ],
+            "author": [
+                {
+                    "@id": "https://orcid.org/0000-0001-7956-4498"
+                }
+            ],
+            "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester",
+            "conformsTo": {
+                "@id": "https://purl.archive.org/textcommons/profile#Notebook"
+            },
+            "description": "",
+            "encodingFormat": "application/x-ipynb+json",
+            "name": "Display the results of a harvest as a searchable database using Datasette",
+            "programmingLanguage": {
+                "@id": "https://www.python.org/downloads/release/python-31012/"
+            }
+        },
+        {
+            "@id": "Exploring-your-TroveHarvester-data.ipynb",
+            "@type": [
+                "File",
+                "SoftwareSourceCode"
+            ],
+            "author": [
+                {
+                    "@id": "https://orcid.org/0000-0001-7956-4498"
+                }
+            ],
+            "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester",
+            "conformsTo": {
+                "@id": "https://purl.archive.org/textcommons/profile#Notebook"
+            },
+            "description": "",
+            "encodingFormat": "application/x-ipynb+json",
+            "name": "Exploring your harvested data",
+            "programmingLanguage": {
+                "@id": "https://www.python.org/downloads/release/python-31012/"
+            }
+        },
+        {
+            "@id": "harvest-specific-days.ipynb",
+            "@type": [
+                "File",
+                "SoftwareSourceCode"
+            ],
+            "author": [
+                {
+                    "@id": "https://orcid.org/0000-0001-7956-4498"
+                }
+            ],
+            "codeRepository": "https://github.com/GLAM-Workbench/trove-newspaper-harvester",
+            "conformsTo": {
+                "@id": "https://purl.archive.org/textcommons/profile#Notebook"
+            },
+            "description": "",
+            "encodingFormat": "application/x-ipynb+json",
+            "name": "Harvesting articles that mention \"Anzac Day\" on Anzac Day",
+            "programmingLanguage": {
+                "@id": "https://www.python.org/downloads/release/python-31012/"
+            }
+        },
+        {
+            "@id": "https://orcid.org/0000-0001-7956-4498",
+            "@type": "Person",
+            "name": "Sherratt, Tim"
+        },
+        {
+            "@id": "https://spdx.org/licenses/MIT",
+            "@type": "CreativeWork",
+            "name": "MIT License",
+            "url": "https://spdx.org/licenses/MIT.html"
+        },
+        {
+            "@id": "https://creativecommons.org/publicdomain/zero/1.0/",
+            "@type": "CreativeWork",
+            "name": "CC0 Public Domain Dedication",
+            "url": "https://creativecommons.org/publicdomain/zero/1.0/"
+        },
+        {
+            "@id": "https://www.python.org/downloads/release/python-31012/",
+            "@type": [
+                "ComputerLanguage",
+                "SoftwareApplication"
+            ],
+            "name": "Python 3.10.12",
+            "url": "https://www.python.org/downloads/release/python-31012/",
+            "version": "3.10.12"
+        },
+        {
+            "@id": "#create_version_v2_0_1",
+            "@type": "UpdateAction",
+            "actionStatus": {
+                "@id": "http://schema.org/CompletedActionStatus"
+            },
+            "endDate": "2023-10-25",
+            "name": "Create version v2.0.1"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/scripts/extract_metadata.py b/scripts/extract_metadata.py
index 07258e2..30c61f9 100644
--- a/scripts/extract_metadata.py
+++ b/scripts/extract_metadata.py
@@ -2,39 +2,24 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 import nbformat
+import re
 
 AuthorInfo = Dict[str, str]
 
-DEFAULT_AUTHOR = {
-    "name": "Unknown",
-    "orcid": "https://orcid.org/0000-0000-0000-0000",
-}
-CREATORS_KEY = "creators"
 
 LISTIFY = ["author", "object", "input"]
 
-def extract_metadata(metadata):
-    with open(metadata) as file:
-        data = json.load(file)
-    return data
-
-def extract_default_authors(metadata: Path) -> List[AuthorInfo]:
-    """Attempts to extract author information from the metadata.json file within
-    the repository. If none are found, returns a dummy value.
-
-    Parameters:
-        metadata: The path to the metadata file, commonly metadata.json
-    """
-    with open(metadata) as file:
-        data = json.load(file)
-
-    return data.get(CREATORS_KEY, [DEFAULT_AUTHOR])
-
 def listify(value):
     if not isinstance(value, list):
         return [value]
     return value
 
+def extract_notebook_title(nb):
+    """Return the first "# " heading that opens a markdown cell, or None."""
+    markdown = [c for c in nb.cells if c["cell_type"] == "markdown"]
+    found = (re.search(r"^# (.+)(\n|$)", c["source"]) for c in markdown)
+    return next((m.group(1) for m in found if m), None)
+
 def extract_notebook_metadata(notebook: Path, keys: Dict[str, Any]) -> Dict[str, Any]:
     """Attempts to extract metadata from the notebook.
 
@@ -46,16 +31,6 @@ def extract_notebook_metadata(notebook: Path, keys: Dict[str, Any]) -> Dict[str,
     Returns:
         A dictionary containing the retrieved metadata for each key.
     """
-    """
-    with open(notebook) as file:
-        data = json.load(file)
-
-    metadata = data["metadata"]
-    result = {}
-
-    for key, default in keys.items():
-        result[key] = metadata.get(key, default)
-    """
     result = {}
     nb = nbformat.read(notebook, nbformat.NO_CONVERT)
     metadata = nb.metadata.rocrate
diff --git a/scripts/update_crate.py b/scripts/update_crate.py
index d86bf66..b245b65 100755
--- a/scripts/update_crate.py
+++ b/scripts/update_crate.py
@@ -3,6 +3,7 @@
 import datetime
 import requests
 from giturlparse import parse as ghparse
+from git import Repo
 from pathlib import Path
 from typing import Union, List, Dict, Tuple
 import mimetypes
@@ -38,6 +39,11 @@
     "@type": ["ComputerLanguage", "SoftwareApplication"],
 }
 
+DEFAULT_AUTHORS = [{
+    "name": "Sherratt, Tim",
+    "orcid": "0000-0001-7956-4498"
+}]
+
 
 def main(version: str):
     # Make working directory the parent of the scripts directory
@@ -45,7 +51,7 @@ def main(version: str):
 
     # Get a list of paths to notebooks in the cwd
     notebooks = get_notebooks()
-
+    print(notebooks)
     # Update the crate
     update_crate(version, notebooks)
 
@@ -59,8 +65,9 @@ def get_notebooks() -> List[Path]:
     Returns:
         Paths of the notebooks found in the directory
     """
-    files = [Path(file) for file in os.listdir()]
-    is_notebook = lambda file: file.suffix == NOTEBOOK_EXTENSION
+    # Top-level *.ipynb files only (non-recursive); skip draft*/untitled* scratch notebooks
+    files = Path(".").glob("*.ipynb")
+    is_notebook = lambda file: not file.name.lower().startswith(("draft", "untitled"))
     return list(filter(is_notebook, files))
 
 
@@ -300,6 +307,7 @@ def add_notebook(crate: ROCrate, notebook: Path) -> None:
             "result": [],
         },
     )
+    print(notebook_metadata)
 
     # Check if this notebook is already in the crate
     nb_current = crate.get(notebook.name)
@@ -313,8 +321,7 @@ def add_notebook(crate: ROCrate, notebook: Path) -> None:
         properties.update(
             {
                 "name": notebook_metadata["name"],
-                "description": notebook_metadata["description"],
-                "author": [],
+                "description": notebook_metadata["description"]
             }
         )
     else:
@@ -342,21 +349,30 @@ def add_notebook(crate: ROCrate, notebook: Path) -> None:
     nb_new = crate.add_file(notebook, properties=properties)
 
     # Add a CreateAction that links the notebook run with the input and output files
-    add_action(crate, nb_new, input_files, output_files)
+    if input_files or output_files:
+        add_action(crate, nb_new, input_files, output_files)
 
     # If the notebook has author info, add people to crate
     if notebook_metadata["author"]:
         # Add people referenced in notebook metadata
         persons = add_people(crate, notebook_metadata["author"])
 
-        # If people are not already attached to notebook, append them to the author property
-        for person in persons:
-            if person not in nb_current["author"]:
-                nb_new.append_to("author", person)
-
     # Otherwise add crate root authors to notebook
     else:
-        nb_new.append_to("author", root["author"])
+        persons = root["author"]
+
+    # A crate property can come back as a lone entity rather than a list, so
+    # normalise both sides before iterating / testing membership.
+    if not isinstance(persons, list):
+        persons = [persons]
+    current_authors = nb_current.get("author", []) if nb_current else []
+    if not isinstance(current_authors, list):
+        current_authors = [current_authors]
+
+    # If people are not already attached to notebook, append them to the author property
+    for person in persons:
+        if person not in current_authors:
+            nb_new.append_to("author", person)
 
 
 def remove_deleted_files(crate: ROCrate) -> None:
@@ -442,7 +450,30 @@ def update_crate(version: str, notebooks: List[Path]) -> None:
         notebooks: The notebooks to include in the crate
     """
     # Load existing crate from cwd
-    crate = ROCrate(source="./")
+    try:
+        crate = ROCrate(source="./")
+    except ValueError:
+        # NOTE(review): presumably raised when cwd holds no crate metadata yet;
+        # in that case start a fresh crate described from the git remote.
+        crate = ROCrate()
+        repo = Repo(".")
+        repo_url = repo.git.config("--get", "remote.origin.url")
+        # Remove only a literal ".git" suffix. str.rstrip(".git") strips any
+        # trailing run of the characters ".", "g", "i", "t", so a name such as
+        # "my-toolkit.git" would be cut down to "my-toolk".
+        if repo_url.endswith(".git"):
+            repo_url = repo_url[: -len(".git")]
+        repo_name = repo_url.split("/")[-1]
+        crate.update_jsonld(
+            {
+                "@id": "./",
+                "name": repo_name,
+                "description": "A GLAM Workbench repository",
+                "url": repo_url,
+                "author": id_ify([a["orcid"] for a in DEFAULT_AUTHORS])
+            }
+        )
+        add_people(crate, DEFAULT_AUTHORS)
 
     # If this is a new version, change version number and add UpdateAction
     if version:
diff --git a/scripts/update_version.sh b/scripts/update_version.sh
index e198fda..3824b95 100755
--- a/scripts/update_version.sh
+++ b/scripts/update_version.sh
@@ -11,4 +11,4 @@ jq --arg text "$text" '.description = $text' .zenodo.json \
 | jq --arg pdate "$pdate" '.publication_date = $pdate' > zenodo.json;
 rm .zenodo.json;
 mv zenodo.json .zenodo.json;
-#python scripts/update_crate.py --version $1
\ No newline at end of file
+python scripts/update_crate.py --version "$1"
\ No newline at end of file