Skip to content

Commit

Permalink
Review fixes lsst#8
Browse files Browse the repository at this point in the history
Schemeless URIs handled more smoothly in ButlerURI.

Removed s3CheckFileExistsGET and s3CheckFileExistsLIST for
a singular s3CheckFileExists function, as botocore is up
to date now on coda and on pip so GET/HEAD requests make
no performance difference anymore.
Live testing shows possibility that HTTP 403 response is
returned for HEAD requests when file does not exist and
user does not have s3:ListBucket permissions. Added a
more informative error message.

Changed the s3utils function signatures to look more
alike.

Changed an instance of os.path to posixpath join in
test_butler to be more consistent with what is expected
from used URIs.

Corrected Config.__initFromS3Yaml to use an URL instead
of path and Config.__initFromYaml to use ospath.
  • Loading branch information
DinoBektesevic committed Aug 4, 2019
1 parent 61109e5 commit 964e55d
Show file tree
Hide file tree
Showing 8 changed files with 88 additions and 139 deletions.
4 changes: 3 additions & 1 deletion python/lsst/daf/butler/butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,10 @@ def makeRepo(root, config=None, standalone=False, createRegistry=True, searchPat
if isinstance(config, (ButlerConfig, ConfigSubset)):
raise ValueError("makeRepo must be passed a regular Config without defaults applied.")

# for "file" schemes we are assuming POSIX semantics for paths, for
# schemeless URIs we are assuming os.path semantics.
uri = ButlerURI(root)
if uri.scheme == "file":
if uri.scheme == "file" or not uri.scheme:
if not os.path.isdir(uri.ospath):
safeMakeDir(uri.ospath)
elif uri.scheme == "s3":
Expand Down
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/core/butlerConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def __init__(self, other=None, searchPaths=None):

if isinstance(other, str):
uri = ButlerURI(other)
if uri.scheme == "file":
if uri.scheme == "file" or not uri.scheme:
if os.path.isdir(uri.ospath):
other = os.path.join(uri.ospath, "butler.yaml")
elif uri.scheme == "s3":
Expand Down
10 changes: 5 additions & 5 deletions python/lsst/daf/butler/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,13 +244,13 @@ def __initFromFile(self, path):
uri = ButlerURI(path)
if uri.path.endswith("yaml"):
if uri.scheme == "s3":
self.__initFromS3YamlFile(path)
self.__initFromS3YamlFile(uri.geturl())
else:
self.__initFromYamlFile(path)
self.__initFromYamlFile(uri.ospath)
else:
raise RuntimeError("Unhandled config file type:%s" % uri)

def __initFromS3YamlFile(self, path):
def __initFromS3YamlFile(self, url):
"""Load a file at a given S3 Bucket uri and attempts to load it from
yaml.
Expand All @@ -263,7 +263,7 @@ def __initFromS3YamlFile(self, path):
raise ModuleNotFoundError("boto3 not found."
"Are you sure it is installed?")

uri = ButlerURI(path)
uri = ButlerURI(url)
s3 = boto3.client("s3")
try:
response = s3.get_object(Bucket=uri.netloc, Key=uri.relativeToPathRoot)
Expand All @@ -273,7 +273,7 @@ def __initFromS3YamlFile(self, path):
# boto3 response is a `StreamingBody`, but not a valid Python IOStream.
# Loader will raise an error that the stream has no name. A hackish
# solution is to name it explicitly.
response["Body"].name = path
response["Body"].name = url
self.__initFromYaml(response["Body"])
response["Body"].close()

Expand Down
11 changes: 8 additions & 3 deletions python/lsst/daf/butler/core/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,16 +144,19 @@ def path(self):
@property
def ospath(self):
"""Path component of the URI localized to current OS."""
if self.scheme == 's3':
raise AttributeError('S3 URIs have no OS path.')
return posix2os(self._uri.path)

@property
def relativeToPathRoot(self):
"""Returns path relative to network location.
Effectively, this is the path property with posix separator stipped
Effectively, this is the path property with posix separator stripped
from the left hand side of the path.
"""
return self._uri.path.lstrip('/')
p = PurePosixPath(self._uri.path)
return str(p.relative_to(p.root))

@property
def fragment(self):
Expand Down Expand Up @@ -395,7 +398,9 @@ def relativeToPathRoot(self):
Effectively, this is the path property with posix separator stipped
from the left hand side of the path.
"""
return self.path.lstrip("/")
p = PurePosixPath(os2posix(self.path))
stripped = p.relative_to(p.root)
return str(posix2os(stripped))

def updateExtension(self, ext):
"""Update the file extension associated with this `Location`.
Expand Down
157 changes: 48 additions & 109 deletions python/lsst/daf/butler/core/s3utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,20 @@
from lsst.daf.butler.core.location import ButlerURI, Location


def s3CheckFileExistsGET(client, bucket, filepath):
def s3CheckFileExists(path, bucket=None, client=None):
"""Returns (True, filesize) if file exists in the bucket and (False, -1) if
the file is not found.
Parameters
----------
client : `boto3.client`
S3 Client object to query.
bucket : `str`
Name of the bucket in which to look.
filepath : `str`
Path to file.
path : `Location`, `ButlerURI`, `str`
Location or ButlerURI containing the bucket name and filepath.
bucket : `str`, optional
Name of the bucket in which to look. If provided, path will be assumed
to correspond to be relative to the given bucket.
client : `boto3.client`, optional
S3 Client object to query, if not supplied boto3 will try to resolve
the credentials as in order described in its manual_.
Returns
-------
Expand All @@ -51,131 +53,68 @@ def s3CheckFileExistsGET(client, bucket, filepath):
Notes
-----
A Bucket HEAD request will be charged against your account. This is on
average 10x cheaper than a LIST request (see `s3CheckFileExistsLIST`
function) but can be up to 90% slower whenever additional work is done with
the s3 client. See `PR-1248<https://github.com/boto/botocore/issues/1248>`_
and `PR-1128<https://github.com/boto/boto3/issues/1128>`_ for details. In
boto3 versions >=1.11.0 the loss of performance should not be an issue
anymore.
S3 Paths are sensitive to leading and trailing path separators.
.. _manual: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/\
configuration.html#configuring-credentials
"""
if boto3 is None:
raise ModuleNotFoundError(("Could not find boto3. "
"Are you sure it is installed?"))

if client is None:
client = boto3.client('s3')

if isinstance(path, str):
if bucket is not None:
filepath = path
else:
uri = ButlerURI(path)
bucket = uri.netloc
filepath = uri.relativeToPathRoot
elif isinstance(path, (ButlerURI, Location)):
bucket = path.netloc
filepath = path.relativeToPathRoot

try:
obj = client.head_object(Bucket=bucket, Key=filepath)
return (True, obj["ContentLength"])
except client.exceptions.ClientError as err:
# resource unreachable error means key does not exist
if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
return (False, -1)
# head_object returns 404 when object does not exist only when user has
# s3:ListBucket permission. If list permission does not exist a 403 is
# returned. In practical terms this generally means that the file does
# not exist, but it could also mean user lacks s3:GetObject permission:
# https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
# I don't think its possible to discern which case is it with certainty
if err.response["ResponseMetadata"]["HTTPStatusCode"] == 403:
raise PermissionError("Forbidden HEAD operation error occured. "
"Verify s3:ListBucket and s3:GetObject "
"permissions are granted for your IAM user. ") from err
raise


def s3CheckFileExistsLIST(client, bucket, filepath):
"""Returns (True, filesize) if file exists in the bucket and (False, -1) if
the file is not found.
Parameters
----------
client : `boto3.client`
S3 Client object to query.
bucket : `str`
Name of the bucket in which to look.
filepath : `str`
Path to file.
Returns
-------
exists : `bool`
True if key exists, False otherwise
size : `int`
Size of the key, if key exists, in bytes, otherwise -1
Notes
-----
You are getting charged for a Bucket LIST request. This is on average 10x
more expensive than a GET request (see `s3CheckFileExistsGET` fucntion) but
can be up to 90% faster whenever additional work is done with the s3
client. See `PR-1248<https://github.com/boto/botocore/issues/1248>`_
and `PR-1128<https://github.com/boto/boto3/issues/1128>`_ for details. In
boto3 versions >=1.11.0 the loss of performance should not be an issue
anymore.
A LIST request can, by default, return up to 1000 matching keys, however,
the LIST response is filtered on `filepath` key which, in this context, is
expected to be unique. Non-unique matches are treated as a non-existant
file. This function can not be used to retrieve all keys that start with
`filepath`.
S3 Paths are sensitive to leading and trailing path separators.
"""
response = client.list_objects_v2(
Bucket=bucket,
Prefix=filepath
)
# Hopefully multiple identical files will never exist?
matches = [x for x in response.get("Contents", []) if x["Key"] == filepath]
if len(matches) == 1:
return (True, matches[0]["Size"])
else:
return (False, -1)


def s3CheckFileExists(client, path=None, bucket=None, filepath=None, slow=True):
"""Returns (True, filesize) if file exists in the bucket and (False, -1) if
the file is not found.
Accepts a fully specified Location or ButlerURI to the file or can accept
bucket name and filepath as separate strings.
Parameters
----------
client : `boto3.client`
S3 Client object to query.
path : `Location`, `ButlerURI`, optional
Location or ButlerURI containing the bucket name and filepath.
bucket : `str`, optional
Name of the bucket in which to look.
filepath : `str`, optional
Path to file.
slow : `bool`, optional
If True, makes a GET request to S3 instead of a LIST request. This
is cheaper, but also slower. See `s3CheckFileExistsGET` or
`s3CheckFileExistsLIST` for more details.
Returns
-------
exists : `bool`
True if file exists, False otherwise
size : `int`
Size of the key, if key exists, in bytes, otherwise -1
Notes
-----
S3 Paths are sensitive to leading and trailing path separators.
"""
if isinstance(path, (ButlerURI, Location)):
bucket = path.netloc
filepath = path.relativeToPathRoot

if bucket is None and filepath is None:
raise ValueError(("Expected ButlerURI, Location or (bucket, filepath) pair "
f"but got {path}, ({bucket}, {filepath}) instead."))

if slow:
return s3CheckFileExistsGET(client, bucket=bucket, filepath=filepath)
return s3CheckFileExistsLIST(client, bucket=bucket, filepath=filepath)


def bucketExists(bucketName):
def bucketExists(bucketName, client=None):
"""Check if the S3 bucket with the given name actually exists.
Parameters
----------
bucketName : `str`
Name of the S3 Bucket
client : `boto3.client`, optional
S3 Client object to query, if not supplied boto3 will try to resolve
the credentials as in order described in its manual_.
Returns
-------
exists : `bool`
True if it exists, False if no Bucket with specified parameters is
found.
.. _manual: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/\
configuration.html#configuring-credentials
"""
if boto3 is None:
raise ModuleNotFoundError(("Could not find boto3. "
Expand Down
15 changes: 8 additions & 7 deletions python/lsst/daf/butler/datastores/s3Datastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ def exists(self, ref):
return False

loc = self.locationFactory.fromPath(storedFileInfo.path)
return s3CheckFileExists(self.client, loc)[0]
return s3CheckFileExists(loc, client=self.client)[0]

def get(self, ref, parameters=None):
"""Load an InMemoryDataset from the store.
Expand Down Expand Up @@ -415,7 +415,7 @@ def put(self, inMemoryDataset, ref):
# instead, if it does not the insert key operation will be equivalent
# to creating a directory and the file together.
location.updateExtension(formatter.extension)
if s3CheckFileExists(self.client, location)[0]:
if s3CheckFileExists(location, client=self.client,)[0]:
raise FileExistsError(f"Cannot write file for ref {ref} as "
f"output file {location.uri} exists.")

Expand Down Expand Up @@ -500,7 +500,7 @@ def ingest(self, path, ref, formatter=None, transfer=None):
elif transfer == "move" or transfer == "copy":
if uri.scheme == "file":
# uploading file from local disk and potentially deleting it
if s3CheckFileExists(self.client, uri)[0]:
if s3CheckFileExists(uri, client=self.client)[0]:
raise FileExistsError(f"File '{path}' exists!")

template = self.templates.getTemplate(ref)
Expand All @@ -512,7 +512,7 @@ def ingest(self, path, ref, formatter=None, transfer=None):
os.remove(path)
elif uri.scheme == "s3":
# copying files between buckets, potentially deleting src files
if s3CheckFileExists(self.client, uri)[0]:
if s3CheckFileExists(uri, client=self.client)[0]:
raise FileExistsError(f"File '{uri}' exists.")

relpath = uri.relativeToPathRoot
Expand All @@ -533,8 +533,9 @@ def ingest(self, path, ref, formatter=None, transfer=None):
location = self.locationFactory.fromPath(path)

# the file should exist on the bucket by now
exists, size = s3CheckFileExists(self.client, bucket=location.netloc,
filepath=uri.relativeToPathRoot)
exists, size = s3CheckFileExists(path=uri.relativeToPathRoot,
bucket=location.netloc,
client=self.client)
self.registry.addDatasetLocation(ref, self.name)

# Associate this dataset with the formatter for later read.
Expand Down Expand Up @@ -622,7 +623,7 @@ def remove(self, ref):
except KeyError:
raise FileNotFoundError(f"Requested dataset ({ref}) does not exist")
location = self.locationFactory.fromPath(storedFileInfo.path)
if not s3CheckFileExists(self.client, location):
if not s3CheckFileExists(location, client=self.client):
raise FileNotFoundError("No such file: {0}".format(location.uri))

# https://github.com/boto/boto3/issues/507 - there is no way of knowing
Expand Down
5 changes: 3 additions & 2 deletions tests/test_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"""

import os
import posixpath
import unittest
import tempfile
import shutil
Expand Down Expand Up @@ -631,7 +632,7 @@ def setUp(self):
self.datastoreStr = f"datastore={self.root}"
self.datastoreName = [f"S3Datastore@{rooturi}"]
Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
self.tmpConfigFile = os.path.join(rooturi, "butler.yaml")
self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")

def tearDown(self):
s3 = boto3.resource("s3")
Expand All @@ -657,7 +658,7 @@ def checkFileExists(self, root, relpath):
"""
uri = ButlerURI(root)
client = boto3.client("s3")
return s3CheckFileExists(client, uri)[0]
return s3CheckFileExists(uri, client=client)[0]


if __name__ == "__main__":
Expand Down

0 comments on commit 964e55d

Please sign in to comment.