-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathdans_scrape.py
175 lines (131 loc) · 6.44 KB
/
dans_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""
Module for scraping file metadata from the DANS Archaeology Datastation.
The scrape harvests only metadata from the files as originally deposited, not the files being offered as converted to
preferred formats. This way, we can analyze the original file formats in use at the time of deposition.
Good example: https://archaeology.datastations.nl/dataset.xhtml?persistentId=doi:10.17026/dans-zbe-b8h5
In order to extract this specific metadata, the `main` script filters data on the following:
1. It iterates over all pages in the datasets index of the Archaeology Datastation (~120k results in ~12k pages)
2. It extracts the digital object identifier (DOI) from the links to the datasets
3. It rejects all datasets having a description "Files not yet migrated to Data Station"
4. It gets the versions metadata for the dataset
5. It appends the versions metadata to a scrape log as an ndjson-formatted file
TODO:
- One version labeled "EASY Migration":
see https://archaeology.datastations.nl/dataset.xhtml?persistentId=doi:10.17026/dans-28e-mmdt
- Two or more versions labeled "EASY Migration":
see https://archaeology.datastations.nl/dataset.xhtml?persistentId=doi:10.17026/dans-x8z-d4he,
"""
import datetime
import json
import logging
from argparse import ArgumentParser
from http.client import HTTPSConnection
from math import ceil
from typing import List, Dict, Optional
from bs4 import BeautifulSoup
from ruamel.yaml import CommentedMap
from tqdm import tqdm
from analysis.config import load_config
from analysis.loaders_dumpers import get
def main(config: CommentedMap) -> int:
    """
    Iterates over all pages in the datasets index of the Archaeology Datastation
    (~120k results in ~12k pages), appending each dataset's versions metadata to
    an ndjson scrape log.

    :param config: an analysis.config.Config instance
    :return: 0
    """
    start = datetime.datetime.now()
    dans_cfg = config['data']['dans']

    url = dans_cfg['root_url']
    # Derive the bare hostname from the root URL for the HTTPS connection
    connection = HTTPSConnection(url.split('/')[-1])
    res_text = get(url, connection)
    soup = BeautifulSoup(res_text, features="html.parser")
    # The results-count element reads like "1 to 10 of 120,000"; index 2 is the
    # page size and index 4 the comma-formatted total
    results_count = soup.find(class_='results-count').text.split(' ')
    total = int(results_count[4].replace(',', ''))
    page_size = int(results_count[2])
    num_pages = ceil(total / page_size)

    num_skipped_datasets = 0
    for page_num in tqdm(range(dans_cfg['start_page'], num_pages)):
        dois = dois_from_results(page_num, connection, dans_cfg)
        # Extract the file metadata for each dataset (by DOI)
        for doi in dois:
            version_metadata = scrape_version_metadata(doi, connection, dans_cfg)
            if version_metadata is None:
                # Dataset rejected (e.g. "Files not yet migrated to Data Station")
                num_skipped_datasets += 1
                continue

            # Append the version metadata for the dataset to the newline-delimited json scrape log
            with open(dans_cfg['scrape_log_path'], 'at') as f:
                f.write(json.dumps(version_metadata) + '\n')

            # NOTE(review): `version_metadata` is a dict (see scrape_version_metadata),
            # so this tuple-unpack iterates its KEYS — `filenames` and `deposit_date`
            # receive key strings, not a file list and a date. Confirm intent; this
            # looks like a leftover from an earlier tuple-returning implementation.
            filenames, deposit_date = version_metadata
            filetype_counts: Dict[str, int] = {}
            # Update the monthly file extension counts by going over all the v1 files in the dataset
            # NOTE(review): `filetype_counts` is rebuilt per dataset and never read
            # afterwards — effectively dead code as written.
            for file in filenames:
                # Skip filenames without file extension
                if '.' not in file:
                    continue
                extension = file.split('.')[-1]
                filetype_counts.setdefault(extension, 0)
                filetype_counts[extension] += 1

    end = datetime.datetime.now()
    logging.info(f'Script took {end - start}')

    return 0
def dois_from_results(page_num: int, conn: HTTPSConnection, dans_cfg: CommentedMap) -> List[str]:
    """
    Fetches one results page of the Archaeology Datastation datasets index and
    extracts the DOIs it links to.

    :param page_num: The page number of the complete result set, as a whole positive number
    :param conn: An open HTTPS connection to the datastation host
    :param dans_cfg: The DANS configuration parsed extracted from a Config instance
    :return: A list of DOIs as strings
    """
    page_url = dans_cfg['root_url'] + dans_cfg['page_subpath'].format(page=page_num)
    page_html = get(page_url, conn)
    return extract_dois(page_html)
def extract_dois(res_text: str) -> List[str]:
    """
    Extracts the digital object identifiers (DOIs) from the links to the datasets
    on a datasets results page.

    Each result card's first hyperlink points at the dataset landing page with the
    DOI in its query string (e.g. "...?persistentId=doi:10.17026/..."), so the DOI
    is whatever follows the '=' in the href.

    :param res_text: The HTML text of a Archaeology Datastation datasets results page
    :return: A list of extracted DOIs as strings
    """
    parsed = BeautifulSoup(res_text, features="html.parser")
    return [
        card.a['href'].split('=')[1]
        for card in parsed.find_all(class_='card-title-icon-block')
    ]
def scrape_version_metadata(
        doi: str,
        conn: HTTPSConnection,
        dans_cfg: CommentedMap
) -> Optional[dict[str, list[dict[str, int]]]]:
    """
    Fetches the versions metadata for the dataset designated by `doi`.

    It returns None for a dataset having
    - a description "Files not yet migrated to Data Station"

    :param doi: Digital object identifier for the dataset
    :param conn: An open HTTPS connection to the datastation host
    :param dans_cfg: DANS Archaeology datastation extracted from a Config instance
    :return: The parsed dataset-versions API response as a dict, or None if
        requirements are not met. (NOTE(review): an earlier version of this
        docstring claimed a tuple of filenames and a deposit date; the code
        returns the versions dict — callers unpacking a tuple will misbehave.)
    """
    root_url = dans_cfg['root_url']
    overview_subpath = dans_cfg['dataset_overview_api_subpath']
    url = root_url + overview_subpath.format(doi=doi)
    res = json.loads(get(url, conn))

    # Return None for datasets not yet migrated, detected via the citation
    # metadata block's description fields
    for citation_field in res['data']['latestVersion']['metadataBlocks']['citation']['fields']:
        if citation_field['typeName'] == 'dsDescription':
            for value in citation_field['value']:
                if 'not yet migrated' in value['dsDescriptionValue']['value']:
                    logging.debug(f"Skipping {doi}: not yet migrated")
                    return None

    # Inspect the available dataset versions
    versions_subpath = dans_cfg['dataset_versions_api_subpath']
    url = root_url + versions_subpath.format(doi=doi)
    versions = json.loads(get(url, conn))
    # json.loads already yields a dict here, so dict(versions) is a shallow copy
    return dict(versions)
if __name__ == '__main__':
    # Bug fix: ArgumentParser's first positional parameter is `prog` (the program
    # name shown in usage lines), not the help text — pass `description=` instead.
    parser = ArgumentParser(
        description='Performs the Data Archiving and Networked Services file metadata analysis')
    parser.add_argument('-c', '--config', default='config.yaml',
                        help='Path to the YAML configuration file')
    args = parser.parse_args()
    config = load_config(args.config)
    # SystemExit propagates main's return value (0) as the process exit status
    raise SystemExit(main(config))