Skip to content

Commit

Permalink
black formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
robert-bryson committed Jun 1, 2023
1 parent 4f6554a commit 6426179
Show file tree
Hide file tree
Showing 9 changed files with 59 additions and 45 deletions.
3 changes: 1 addition & 2 deletions datagovharvester/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@

# configuration settings
bucket_name = "test-bucket"
bucket_name = "test-bucket"
content_types = {
"json": "application/json",
}
Expand Down
23 changes: 12 additions & 11 deletions datagovharvester/extract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,31 @@

# ruff: noqa: F841

def main(job_info, S3_client):
    """Extract a catalog file, mildly validate it, upload its records to s3.

    job_info (dict) : info on the job ( e.g. source_id, job_id, url )
    S3_client (boto3.client) : S3 client

    Returns one of:
      * dict {"job_id": ..., "s3_paths": [...]} on success, where "s3_paths"
        holds the key of every record whose upload returned a response;
      * the exception raised while downloading the catalog;
      * the string "invalid dcatus catalog" when the schema check fails;
      * False when parsing or uploading raises.
    """
    output = {"job_id": job_info["job_id"], "s3_paths": []}

    # download file
    try:
        catalog = download_json(job_info["url"])
    except Exception as e:
        # The exception is handed back (not re-raised) because callers
        # inspect str(result) to find out what went wrong.
        return e

    # check schema
    if not is_dcatus_schema(catalog):
        return "invalid dcatus catalog"

    # parse catalog and upload records
    try:
        for record_info in parse_catalog(catalog, job_info):
            # upload_to_S3 swallows upload errors and returns None in that
            # case; only record keys that were actually stored.
            if upload_to_S3(S3_client, record_info) is None:
                continue
            output["s3_paths"].append(record_info["Key"])
    except Exception:
        return False

    return output
12 changes: 7 additions & 5 deletions datagovharvester/extract/dcatus.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,18 @@

# ruff: noqa: F841

def parse_catalog(catalog, job_info):
    """Parse the catalog and yield each record as an S3 data upload dict.

    catalog (dict) : dcatus catalog json
    job_info (dict) : info on the job ( e.g. source_id, job_id, url )

    A record whose upload data cannot be built is logged and skipped so that
    one bad record does not stop the remaining ones from being yielded.
    """
    import logging

    logger = logging.getLogger(__name__)

    for idx, record in enumerate(catalog["dataset"]):
        try:
            body = json.dumps(record)
            key_name = "{}/{}/{}/{}.json".format(
                extract_feat_name, job_info["source_id"], job_info["job_id"], idx
            )
            upload_data = create_s3_upload_data(
                body, key_name, content_types["json"]
            )
        except Exception:
            logger.exception("skipping record %s: cannot build upload data", idx)
            continue

        # Yield outside the try block so errors raised by the consumer are
        # not mistaken for a bad record.
        yield upload_data
4 changes: 2 additions & 2 deletions datagovharvester/extract/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@


def download_json(url):
""" download file and pull json from response
"""download file and pull json from response
url (str) : path to the file to be downloaded.
"""
try:
resp = requests.get(url)
except requests.exceptions.RequestException as e:
raise Exception(e)
except requests.exceptions.JSONDecodeError as e:
raise Exception(e)
raise Exception(e)

if resp.status_code != 200:
raise Exception("non-200 status code")
Expand Down
3 changes: 2 additions & 1 deletion datagovharvester/utils/json_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

# ruff: noqa: F841


def open_json(file_path):
""" open input json file as dictionary
"""open input json file as dictionary
file_path (str) : json file path.
"""
try:
Expand Down
18 changes: 10 additions & 8 deletions datagovharvester/utils/s3_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,21 @@

# ruff: noqa: F841


def create_s3_client(s3_config):
    """Create boto3.client object.

    s3_config (dict) : configuration dict, passed to boto3.client as
        keyword arguments.

    Returns the S3 client, or None when boto3 raises a ClientError (the
    error is logged rather than propagated).
    """
    import logging

    try:
        return boto3.client("s3", **s3_config)
    except botocore.exceptions.ClientError:
        logging.getLogger(__name__).exception("could not create S3 client")
        return None


def create_s3_upload_data(body, key_name, content_type):
""" create s3 data to be uploaded to the default bucket
json_str (str) : data to be placed in s3 bucket as json string.
key_name (str) : name of the file to be placed in the s3 bucket.
"""create s3 data to be uploaded to the default bucket
json_str (str) : data to be placed in s3 bucket as json string.
key_name (str) : name of the file to be placed in the s3 bucket.
"""
return {
"Body": body,
Expand All @@ -28,11 +30,11 @@ def create_s3_upload_data(body, key_name, content_type):


def upload_to_S3(S3, s3_upload_data):
    """Store the s3 payload.

    S3 (boto3 client) : boto3 S3 client
    s3_upload_data (dict) : payload to be stored in s3 bucket; passed to
        S3.put_object as keyword arguments.

    Returns the response of S3.put_object, or None when the call raises
    (the failure is logged rather than propagated).
    """
    import logging

    try:
        return S3.put_object(**s3_upload_data)
    except Exception:
        logging.getLogger(__name__).exception(
            "S3 put_object failed for key %s", s3_upload_data.get("Key")
        )
        return None
6 changes: 3 additions & 3 deletions datagovharvester/validate/dcat_us.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
from jsonschema.exceptions import ValidationError


def is_dcatus_schema(catalog):
    """Return True when catalog is a mapping with a top-level "dataset" key.

    catalog : parsed catalog json.

    Non-mapping values are rejected outright: a bare `in` test would accept
    the string "dataset" or the list ["dataset"], and would raise on None.
    """
    from collections.abc import Mapping

    return isinstance(catalog, Mapping) and "dataset" in catalog


def parse_errors(errors):
error_message = ""

Expand Down
6 changes: 5 additions & 1 deletion tests/extract/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def get_dcatus_job():
"job_id": str(uuid4()),
}


@pytest.fixture
def get_bad_url():
"""example dcatus job payload with bad url"""
Expand All @@ -25,6 +26,7 @@ def get_bad_url():
"job_id": str(uuid4()),
}


@pytest.fixture
def get_bad_json():
"""example bad json with missing enclosing bracket"""
Expand All @@ -34,6 +36,7 @@ def get_bad_json():
"job_id": str(uuid4()),
}


@pytest.fixture
def get_no_dataset_key_dcatus_json():
"""example dcatus json with no 'dataset' key"""
Expand All @@ -43,9 +46,10 @@ def get_no_dataset_key_dcatus_json():
"job_id": str(uuid4()),
}


@pytest.fixture
def create_client_config():
"""create s3 configuration dictionary intended
"""create s3 configuration dictionary intended
to be passed to boto3.client("s3", **s3_config)"""
config = {}
load_dotenv()
Expand Down
29 changes: 17 additions & 12 deletions tests/extract/test_dcatus.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,51 +3,56 @@


def test_extract_dcatus(get_dcatus_job, create_client):
    """download dcat-us json file and store result in s3 bucket.
    get_dcatus_job (dict) : fixture containing job data
    create_client (boto3.client) : S3 client object
    """

    S3_client = create_client
    S3_client.create_bucket(Bucket=bucket_name)

    result = main(get_dcatus_job, S3_client)

    # main hands back truthy values (an exception or an error string) on
    # some failures, so a bare truthiness check would pass on failure too.
    assert isinstance(result, dict)
    assert result["job_id"] == get_dcatus_job["job_id"]


def test_extract_bad_url(get_bad_url, create_client):
    """attempt to download a bad url.
    get_bad_url (dict) : fixture containing job data with bad url
    create_client (boto3.client) : S3 client object
    """

    S3_client = create_client
    S3_client.create_bucket(Bucket=bucket_name)

    # Assert directly: wrapping the comparison in `if ...: assert True`
    # can never fail, so it would not detect a regression.
    assert str(main(get_bad_url, S3_client)) == "non-200 status code"


def test_extract_bad_json(get_bad_json, create_client):
    """attempt to download a file containing bad json.
    get_bad_json (dict) : fixture containing job data with bad json
    create_client (boto3.client) : S3 client object
    """

    S3_client = create_client
    S3_client.create_bucket(Bucket=bucket_name)

    error = (
        "Expecting property name enclosed "
        "in double quotes: line 4 column 1 (char 25)"
    )

    # Assert directly: wrapping the comparison in `if ...: assert True`
    # can never fail, so it would not detect a regression.
    assert str(main(get_bad_json, S3_client)) == error


def test_extract_no_dataset_key(get_no_dataset_key_dcatus_json, create_client):
    """attempt to download an invalid dcatus catalog.
    get_no_dataset_key_dcatus_json (dict)
        : fixture containing dcatus with no 'dataset' key
    create_client (boto3.client) : S3 client object
    """

    S3_client = create_client
    S3_client.create_bucket(Bucket=bucket_name)

    # Assert directly: wrapping the comparison in `if ...: assert True`
    # can never fail, so it would not detect a regression.
    result = main(get_no_dataset_key_dcatus_json, S3_client)
    assert result == "invalid dcatus catalog"

0 comments on commit 6426179

Please sign in to comment.