Skip to content

Commit

Permalink
Review fixes lsst#6
Browse files Browse the repository at this point in the history
Changes
-------

- added `ospath` property to ButlerURI that localizes the posix-like
  uri.path
 - Updated docstrings in ButlerURI class to correctly describe
  what properties/methods do.
- File vs Dir is now resolved by checking if "." is present in path
  for both Config and ButlerConfig. If not Dir, path is no longer
  forced to be the top level directory before updating file name.
- Fixed various badly merged Formatter docstrings and file IO.
- Removed file extension update call in 'to/from'Bytes' methods
  in Formatters.
- Restored `super().setConfigRoot` call in SqliteRegistry.
- Removed extra boto3 present check from test_butler.py
- Fixed badly formatted docstrings in s3utils.
- Renamed `cheap` kwarg to `slow` in `s3CheckFileExists`.
  • Loading branch information
DinoBektesevic committed Aug 5, 2019
1 parent 8636422 commit 35dbbb0
Show file tree
Hide file tree
Showing 12 changed files with 86 additions and 101 deletions.
7 changes: 3 additions & 4 deletions python/lsst/daf/butler/butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
from .core.exceptions import ValidationError
from .core.repoRelocation import BUTLER_ROOT_TAG
from .core.safeFileIo import safeMakeDir
from . core.location import ButlerURI
from .core.location import ButlerURI

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -182,9 +182,8 @@ def makeRepo(root, config=None, standalone=False, createRegistry=True, searchPat

uri = ButlerURI(root)
if uri.scheme == 'file':
root = os.path.abspath(root)
if not os.path.isdir(root):
safeMakeDir(root)
if not os.path.isdir(uri.ospath):
safeMakeDir(uri.ospath)
elif uri.scheme == 's3':
s3 = boto3.resource('s3')
# implies bucket exists, if not another level of checks
Expand Down
10 changes: 4 additions & 6 deletions python/lsst/daf/butler/core/butlerConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,11 @@ def __init__(self, other=None, searchPaths=None):
if isinstance(other, str):
uri = ButlerURI(other)
if uri.scheme == 'file':
if os.path.isdir(uri.path):
other = os.path.join(other, "butler.yaml")
if os.path.isdir(uri.ospath):
other = os.path.join(uri.ospath, "butler.yaml")
elif uri.scheme == 's3':
path, ext = posixpath.splitext(uri.path)
if not ext:
if not uri.path.endswith('/'):
uri = ButlerURI(uri.geturl()+'/')
head, filename = posixpath.split(uri.path)
if "." not in filename:
uri.updateFile('butler.yaml')
other = uri.geturl()
else:
Expand Down
19 changes: 7 additions & 12 deletions python/lsst/daf/butler/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,8 +243,8 @@ def __initFromS3YamlFile(self, path):
To a persisted config file.
"""
if boto3 is None:
raise ModuleNotFoundError(('boto3 not found.'
'Are you sure it is installed?'))
raise ModuleNotFoundError('boto3 not found.'
'Are you sure it is installed?')

uri = ButlerURI(path)
s3 = boto3.client('s3')
Expand Down Expand Up @@ -782,13 +782,8 @@ def dumpToUri(self, uri, updateFile=True, defaultFileName='butler.yaml'):
uri = ButlerURI(os.path.join(uri.path, defaultFileName))
self.dumpToFile(uri.path)
elif uri.scheme == 's3':
# Assumed a file if it has a filetype extension
path, ext = posixpath.splitext(uri.path)
if not ext and updateFile:
# trailing '/' is mandatory for dirs, otherwise ButlerURI
# has a hard time updating files in URIs.
if not uri.path.endswith('/'):
uri = ButlerURI(uri.geturl()+'/')
head, filename = posixpath.split(uri.path)
if "." not in filename:
uri.updateFile(defaultFileName)
self.dumpToS3File(uri.netloc, uri.path.lstrip('/'))
else:
Expand Down Expand Up @@ -816,8 +811,8 @@ def dumpToS3File(self, bucket, key):
Path to the file to use for output, relative to the bucket.
"""
if boto3 is None:
raise ModuleNotFoundError(("Could not find boto3. "
"Are you sure it is installed?"))
raise ModuleNotFoundError("Could not find boto3. "
"Are you sure it is installed?")

s3 = boto3.client('s3')
with io.StringIO() as stream:
Expand Down Expand Up @@ -900,7 +895,7 @@ def updateParameters(configType, config, full, toUpdate=None, toCopy=None, overw
for key in toCopy:
if key in localConfig and not overwrite:
log.debug("Not overriding key '%s' from defaults in config %s",
key, value, localConfig.__class__.__name__)
key, localConfig.__class__.__name__)
else:
localConfig[key] = localFullConfig[key]

Expand Down
12 changes: 3 additions & 9 deletions python/lsst/daf/butler/core/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def fromBytes(self, serializedDataset, fileDescriptor, component=None):
Parameters
----------
dataset : `bytes`
serializedDataset : `bytes`
Bytes object to unserialize.
fileDescriptor : `FileDescriptor`
Identifies type to read it as and parameters to be used for
Expand All @@ -126,12 +126,6 @@ def fromBytes(self, serializedDataset, fileDescriptor, component=None):
inMemoryDataset : `object`
The requested data as a Python object. The type of object
is controlled by the specific formatter.
Raises
------
ValueError
Component requested but this Dataset does not seem to be a concrete
composite.
"""
raise NotImplementedError("Type does not support reading from bytes.")

Expand All @@ -148,8 +142,8 @@ def toBytes(self, inMemoryDataset, fileDescriptor):
Returns
-------
serializedDataset : `str`
bytes representing the serialized dataset.
serializedDataset : `bytes`
Bytes representing the serialized dataset.
"""
raise NotImplementedError("Type does not support writing to bytes.")

Expand Down
23 changes: 12 additions & 11 deletions python/lsst/daf/butler/core/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,11 @@ def path(self):
"""The path component of the URI."""
return self._uri.path

@property
def ospath(self):
"""Path component of the URI localized to current OS."""
return posix2os(self._uri.path)

@property
def fragment(self):
"""The fragment component of the URI."""
Expand All @@ -164,8 +169,7 @@ def geturl(self):
url : `str`
String form of URI.
"""
return urllib.parse.urlunparse(self._uri)
# return self._uri.geturl()
return self._uri.geturl()

def replace(self, **kwargs):
"""Replace components in a URI with new values and return a new
Expand Down Expand Up @@ -391,16 +395,16 @@ def pathInStore(self):

@property
def netloc(self):
"""If Location is an S3 protocol, returns the bucket name."""
"""The URI network location."""
return self._datastoreRootUri.netloc

@property
def relativeToNetloc(self):
"""Returns the path in an S3 Bucket, normalized so that directory
structure in the bucket matches that of a local filesystem.
"""Returns the path component of the URI relative to the network
location.
Effectively, this is the path property with posix separator stipped
from the left hand side of the path, i.e. path relative to netloc.
from the left hand side of the path.
"""
return self.path.lstrip('/')

Expand Down Expand Up @@ -452,11 +456,8 @@ def __str__(self):

@property
def netloc(self):
"""If Location is an S3 protocol, returns the bucket name."""
if self._datastoreRootUri.scheme == 's3':
return self._datastoreRootUri.netloc
else:
raise AttributeError(f'Path {self} is not a valid S3 protocol.')
"""Returns the network location of datastoreRoot."""
return self._datastoreRootUri.netloc

def fromPath(self, path):
"""Factory function to create a `Location` from a POSIX path.
Expand Down
36 changes: 22 additions & 14 deletions python/lsst/daf/butler/core/s3utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,19 +44,20 @@ def s3CheckFileExistsGET(client, bucket, filepath):
Returns
-------
(`bool`, `int`) : `tuple`
Tuple (exists, size). If file exists (True, filesize)
and (False, -1) when the file is not found.
exists : `bool`
True if key exists, False otherwise.
size : `int`
Size of the key, if key exists, in bytes, otherwise -1
Notes
-----
A Bucket GET request will be charged against your account. This is on
A Bucket HEAD request will be charged against your account. This is on
average 10x cheaper than a LIST request (see `s3CheckFileExistsLIST`
function) but can be up to 90% slower whenever additional work is done with
the s3 client. See PR:
``https://github.com/boto/botocore/issues/1248``
for details, in boto3 versions >=1.11.0 the loss of performance should not
be an issue anymore.
the s3 client. See `PR-1248<https://github.com/boto/botocore/issues/1248>`_
and `PR-1128<https://github.com/boto/boto3/issues/1128>`_ for details. In
boto3 versions >=1.11.0 the loss of performance should not be an issue
anymore.
S3 Paths are sensitive to leading and trailing path separators.
"""
try:
Expand All @@ -83,19 +84,26 @@ def s3CheckFileExistsLIST(client, bucket, filepath):
Returns
-------
(`bool`, `int`) : `tuple`
Tuple (exists, size). If file exists (True, filesize)
and (False, -1) when the file is not found.
exists : `bool`
True if key exists, False otherwise
size : `int`
Size of the key, if key exists, in bytes, otherwise -1
Notes
-----
You are getting charged for a Bucket LIST request. This is on average 10x
more expensive than a GET request (see `s3CheckFileExistsGET` fucntion) but
can be up to 90% faster whenever additional work is done with the s3
client. For details, see PR:
``https://github.com/boto/botocore/issues/1248``
Boto3 versions >=1.11.0 the loss of performance should not be an issue
client. See `PR-1248<https://github.com/boto/botocore/issues/1248>`_
and `PR-1128<https://github.com/boto/boto3/issues/1128>`_ for details. In
boto3 versions >=1.11.0 the loss of performance should not be an issue
anymore.
A LIST request can, by default, return up to 1000 matching keys, however,
the LIST response is filtered on `filepath` key which, in this context, is
expected to be unique. Non-unique matches are treated as a non-existant
file.
This function can not be used to retrieve all keys that start with
`filepath`.
"""
response = client.list_objects_v2(
Bucket=bucket,
Expand Down
28 changes: 11 additions & 17 deletions python/lsst/daf/butler/formatters/fileFormatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,15 +87,16 @@ def _assembleDataset(self, data, fileDescriptor, component):
Identifies the file to read, type to read it into and parameters
to be used for reading.
data : `dict` or `object`
a composite or a dict that, or which component, needs to be
coerced to a ptype
Composite or a dict that, or which component, needs to be
coerced to the python type specified in "fileDescriptor"
fileDescriptor : `FileDescriptor`
Identifies the file to read, type to read it into and parameters
to be used for reading.
component : `str`, optional
Component to read from the file. Only used if the `StorageClass`
for reading differed from the `StorageClass` used to write the
file.
Returns
-------
inMemoryDataset : `object`
Expand Down Expand Up @@ -175,27 +176,28 @@ def read(self, component=None):
ValueError
Component requested but this file does not seem to be a concrete
composite.
NotImplementedError
Formatter does not implement a method to read from files.
"""

# Read the file naively
path = fileDescriptor.location.path
data = self._readFile(path, fileDescriptor.storageClass.pytype)

# Assemble the requested dataset and potentially return only its
# component coercing it to its appropriate ptype
# Assemble the requested dataset return it as its appropriate ptype
data = self._assembleDataset(data, fileDescriptor, component)

if data is None:
raise ValueError(f"Unable to read data with URI {fileDescriptor.location.uri}")

return data

def fromBytes(self, inMemoryDataset, fileDescriptor, component=None):
def fromBytes(self, serializedDataset, fileDescriptor, component=None):
"""Reads serialized data into a Dataset or its component.
Parameters
----------
dataset : `bytes`
serializedDataset : `bytes`
Bytes object to unserialize.
fileDescriptor : `FileDescriptor`
Identifies read type and parameters to be used for reading.
Expand All @@ -209,17 +211,11 @@ def fromBytes(self, inMemoryDataset, fileDescriptor, component=None):
inMemoryDataset : `object`
The requested data as a Python object. The type of object
is controlled by the specific formatter.
Raises
------
ValueError
Component requested but this Dataset does not seem to be a concrete
composite.
"""
if not hasattr(self, '_fromBytes'):
raise NotImplementedError("Type does not support reading from bytes.")

data = self._fromBytes(inMemoryDataset,
data = self._fromBytes(serializedDataset,
fileDescriptor.storageClass.pytype)

# Assemble the requested dataset and potentially return only its
Expand Down Expand Up @@ -258,20 +254,18 @@ def toBytes(self, inMemoryDataset, fileDescriptor):
Parameters
----------
inMemoryDataset : `object`
The Python object to serialize.
Object to serialize.
fileDescriptor : `FileDescriptor`
Identifies read type and parameters to be used for reading.
Returns
-------
serializedDataset : `bytes`
bytes representing the serialized dataset.
Bytes representing the serialized dataset.
"""
if not hasattr(self, '_toBytes'):
raise NotImplementedError("Type does not support reading from bytes.")

fileDescriptor.location.updateExtension(self.extension)

return self._toBytes(inMemoryDataset)

@classmethod
Expand Down
20 changes: 9 additions & 11 deletions python/lsst/daf/butler/formatters/jsonFormatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def _readFile(self, path, pytype=None):
if the file could not be opened.
"""
try:
with open(path, "r") as fd:
with open(path, "rb") as fd:
data = self._fromBytes(fd.read())
except FileNotFoundError:
data = None
Expand All @@ -76,26 +76,24 @@ def _writeFile(self, inMemoryDataset):
with open(self.fileDescriptor.location.path, "w") as fd:
if hasattr(inMemoryDataset, "_asdict"):
inMemoryDataset = inMemoryDataset._asdict()
fd.write(self._toBytes(inMemoryDataset).decode())
fd.write(self._toBytes(inMemoryDataset))

def _fromBytes(self, bytesObject, pytype=None):
def _fromBytes(self, serializedDataset):
"""Read the bytes object as a python object.
Parameters
----------
serializedDataset : `bytes`
Bytes object to unserialize.
pytype : `class`, optional
Not used by this implementation.
Returns
-------
data : `object`
Either data as Python object read from bytes, or None if the string
could not be read.
inMemoryDataset : `object`
The requested data as a Python object or None if the string could
not be read.
"""
try:
data = json.loads(bytesObject)
data = json.loads(serializedDataset)
except json.JSONDecodeError:
data = None

Expand All @@ -111,8 +109,8 @@ def _toBytes(self, inMemoryDataset):
Returns
-------
data : `bytes`
Object stored as bytes.
serializedDataset : `bytes`
bytes representing the serialized dataset.
Raises
------
Expand Down

0 comments on commit 35dbbb0

Please sign in to comment.