generated from GLAM-Workbench/glam-workbench-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
crate_maker.py
114 lines (101 loc) · 3.61 KB
/
crate_maker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from rocrate.rocrate import ContextEntity, ROCrate
import ipynbname
import nbformat
import mimetypes
from datetime import datetime
from giturlparse import parse as ghparse
import requests
def add_gh_file(crate, url):
datafile = url.replace("/raw/", "/blob/")
gh_parts = ghparse(datafile)
# API url to get the latest commit for this file
gh_commit_url = f"https://api.github.com/repos/{gh_parts.owner}/{gh_parts.repo}/commits?path={gh_parts.path_raw.split('/')[-1]}"
try:
response = requests.get(gh_commit_url)
# Get the date of the last commit
date = response.json()[0]["commit"]["committer"]["date"][:10]
except (IndexError, KeyError):
date = None
# Different API endpoint for file data
gh_file_url = f"https://api.github.com/repos/{gh_parts.owner}/{gh_parts.repo}/contents/{gh_parts.path_raw.split('/')[-1]}"
try:
response = requests.get(gh_file_url)
contents_data = response.json()
# Get the file size
try:
size = contents_data["size"]
except TypeError:
size = None
except KeyError:
size = None
obj_properties = {
"@type": [
"File",
"Dataset"
],
"contentSize": size,
"dateModified": date,
"name": gh_parts.path_raw.split('/')[-1],
"url": datafile
}
crate.add_file(datafile, properties=obj_properties)
def create_rocrate(subject, file_path, start_date, end_date):
"""
Create an RO-Crate metadata file describing the downloaded dataset.
"""
crate = ROCrate()
# Initialise crate with dataset
crate.add_file(file_path)
# Add notebook details
nb_path = ipynbname.path()
nb = nbformat.read(nb_path, nbformat.NO_CONVERT)
metadata = nb.metadata.rocrate
nb_url = metadata.get("url", "")
nb_properties = {
"@type": ["File", "SoftwareSourceCode"],
"name": metadata.get("name", ""),
"description": metadata.get("description", ""),
"encodingFormat": "application/x-ipynb+json",
"codeRepository": metadata.get("codeRepository", ""),
"url": nb_url,
}
crate.add(ContextEntity(crate, nb_url, properties=nb_properties))
# Add action
action_id = f"{nb_path.stem}_run"
action_properties = {
"@type": "CreateAction",
"instrument": {"@id": nb_url},
"actionStatus": {"@id": "http://schema.org/CompletedActionStatus"},
"name": f"Run of notebook: {nb_path.name}",
"result": {"@id": f"{file_path.name}/"},
"object": [{"@id": o["url"]} for o in metadata["action"][0]["object"]],
"query": f"{subject['id']} ({subject['name']})",
"startDate": start_date,
"endDate": end_date,
}
# If there are any GitHub references in action objects, add them to the crate
for obj in metadata["action"][0]["object"]:
if "github.com" in obj["url"]:
add_gh_file(crate, obj["url"])
# Update dataset details
encoding = mimetypes.guess_type(file_path)[0]
stats = file_path.stat()
size = stats.st_size
date = datetime.fromtimestamp(stats.st_mtime).strftime("%Y-%m-%d")
rows = 0
with file_path.open("r") as df:
for line in df:
rows += 1
crate.update_jsonld(
{
"@id": file_path.name,
"dateModified": date,
"contentSize": size,
"size": rows,
"encodingFormat": encoding,
}
)
crate.add(ContextEntity(crate, action_id, properties=action_properties))
# Save the crate
crate.write(file_path.parent)
crate.write_zip(file_path.parent)