-
Notifications
You must be signed in to change notification settings - Fork 4
/
__init__.py
34 lines (28 loc) · 1.09 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from datagovharvester.extract.dcatus import parse_catalog
from datagovharvester.extract.utils import download_json
from datagovharvester.utils.s3_utilities import upload_to_S3
from datagovharvester.validate.dcat_us import is_dcatus_schema
# ruff: noqa: F841
def main(job_info, S3_client):
    """Extract a catalog, lightly validate it, and upload its records to S3.

    Args:
        job_info (dict): Info on the job. ``job_id`` and ``url`` are read
            here; the whole dict is also passed on to ``parse_catalog``.
        S3_client: S3 client, handed unchanged to ``upload_to_S3``
            (presumably a ``boto3.client`` -- confirm against callers).

    Returns:
        One of four shapes, kept as-is for backward compatibility:

        * dict -- ``{"job_id": ..., "s3_paths": [...]}`` on success, where
          ``s3_paths`` holds the ``"Key"`` of every uploaded record.
        * Exception -- the exception instance raised while downloading.
        * str -- ``"invalid dcatus catalog"`` when the schema check fails.
        * bool -- ``False`` when parsing or uploading raises.

    Raises:
        KeyError: if ``job_info`` has no ``"job_id"`` entry (looked up
            before any error handling starts).
    """
    # Local import keeps this block self-contained; no file-level change needed.
    import logging

    logger = logging.getLogger(__name__)

    output = {"job_id": job_info["job_id"], "s3_paths": []}

    # download file
    try:
        catalog = download_json(job_info["url"])
    except Exception as e:
        # Record the failure with traceback instead of only handing the
        # exception object back to the caller.
        logger.exception(
            "failed to download catalog for job %s", output["job_id"]
        )
        return e

    # check schema
    if not is_dcatus_schema(catalog):
        logger.warning(
            "catalog for job %s is not a valid dcatus catalog",
            output["job_id"],
        )
        return "invalid dcatus catalog"

    # parse catalog and upload records
    try:
        for record_info in parse_catalog(catalog, job_info):
            upload_to_S3(S3_client, record_info)
            output["s3_paths"].append(record_info["Key"])
    except Exception:
        # Previously swallowed silently; log it so the cause is not lost.
        # NOTE(review): records uploaded before the failure stay in S3, but
        # their keys are discarded because only False is returned.
        logger.exception(
            "failed to parse/upload records for job %s", output["job_id"]
        )
        return False

    return output