Skip to content

Commit

Permalink
Merge pull request #304 from 4dn-dcic/minor-smaht-submitr-related-fix…
Browse files Browse the repository at this point in the history
…es-20240412

Minor fix in structured_data to not try to resolve empty refs.
  • Loading branch information
dmichaels-harvard committed Apr 22, 2024
2 parents 9c53bb0 + 480bcde commit 5702f21
Show file tree
Hide file tree
Showing 11 changed files with 734 additions and 56 deletions.
15 changes: 15 additions & 0 deletions CHANGELOG.rst
Expand Up @@ -7,6 +7,21 @@ Change Log
----------


8.8.4
=====
* Minor fix in structured_data to not try to resolve empty refs in norefs mode;
and added StructuredDataSet.unchecked_refs; not functionally substantive as
used (only) with smaht-submitr/submit-metadata-bundle --info --refs.
* Added nrows and nsheets to data_reader; convenience for smaht-submitr/submit-metadata-bundle --info.
* Added test_progress_bar module for progress_bar testing; would like to add more tests.
* Fixed up captured_output module to handle UTF-8 encoding to help unit testing progress_bar.
* Added hooks to progress_bar to help unit testing.
* Added find_nth_from_end and set_nth to misc_utils to help progress_bar unit testing.
* Added format_size and format_duration misc_utils; refactor from smaht-submitr.
* Added format_datetime, parse_datetime to datetime_utils; refactor from smaht-submitr; and some tests.
* Added check_only flag to portal_utils.Portal.{post,patch}_metadata (came up in ad hoc troubleshooting).


8.8.3
=====
* Minor fix in structured_data related to smaht-submitr progress monitoring.
Expand Down
22 changes: 20 additions & 2 deletions dcicutils/captured_output.py
Expand Up @@ -9,7 +9,7 @@


@contextmanager
def captured_output(capture: bool = True):
def captured_output(capture: bool = True, encoding: Optional[str] = None):
"""
Context manager to capture any/all output to stdout or stderr, and not actually output it to stdout
or stderr. Yields and object with a get_captured_output() method to get the output captured thus far,
Expand All @@ -24,7 +24,9 @@ def captured_output(capture: bool = True):

original_stdout = _real_stdout
original_stderr = _real_stderr
captured_output = io.StringIO()
# FYI: This encoding business with _EncodedStringIO was introduced (circa April 2024)
# when ran into issues unit testing progress_bar which outputs those funny block characters.
captured_output = io.StringIO() if not encoding else _EncodedStringIO(encoding)

def set_original_output() -> None:
sys.stdout = original_stdout
Expand Down Expand Up @@ -68,3 +70,19 @@ def uncaptured_output():
finally:
sys.stdout = original_stdout
sys.stderr = original_stderr


class _EncodedStringIO:
def __init__(self, encoding: str = "utf-8"):
self.encoding = encoding
self.buffer = io.BytesIO()
def write(self, s): # noqa
self.buffer.write(s.encode(self.encoding))
def flush(self): # noqa
self.buffer.flush()
def getvalue(self): # noqa
return self.buffer.getvalue().decode(self.encoding)
def __str__(self): # noqa
return self.getvalue()
def __repr__(self): # noqa
return repr(self.getvalue())
11 changes: 11 additions & 0 deletions dcicutils/data_readers.py
Expand Up @@ -66,6 +66,13 @@ def cell_value(self, value: Optional[Any]) -> str:
else:
return value

@property
def nrows(self) -> int:
    """
    Returns the number of rows, determined by iterating over
    this object from start to finish and counting the items.
    """
    return sum(1 for _ in self)

def open(self) -> None:
pass

Expand Down Expand Up @@ -192,6 +199,10 @@ def is_hidden_sheet(self, sheet: openpyxl.worksheet.worksheet.Worksheet) -> bool
return True
return False

@property
def nsheets(self) -> int:
    """
    Returns the number of sheets, i.e. the length of this object's sheet_names.
    """
    return len(self.sheet_names)

def __del__(self) -> None:
if (workbook := self._workbook) is not None:
self._workbook = None
Expand Down
213 changes: 208 additions & 5 deletions dcicutils/datetime_utils.py
@@ -1,6 +1,23 @@
from dcicutils.misc_utils import normalize_spaces
from datetime import datetime, timedelta, timezone
from typing import Optional, Tuple
from dateutil import parser as datetime_parser
from typing import Optional, Tuple, Union

# Local timezone properties; all computed once, at import time.
TIMEZONE_LOCAL = datetime.now().astimezone().tzinfo  # type: datetime.timezone
TIMEZONE_LOCAL_NAME = TIMEZONE_LOCAL.tzname(None)  # type: str
TIMEZONE_LOCAL_OFFSET = TIMEZONE_LOCAL.utcoffset(None)  # type: datetime.timedelta
TIMEZONE_LOCAL_OFFSET_TOTAL_MINUTES = int(TIMEZONE_LOCAL_OFFSET.total_seconds()) // 60  # type: int
# Split the MAGNITUDE of the offset into hours/minutes and then re-apply the sign to the
# hours only. Floor division directly on a negative total rounds away from zero, which
# for (say) -03:30 would wrongly yield hours -4 and minutes 30, i.e. a "-04:30" suffix.
TIMEZONE_LOCAL_OFFSET_HOURS, TIMEZONE_LOCAL_OFFSET_MINUTES = divmod(abs(TIMEZONE_LOCAL_OFFSET_TOTAL_MINUTES), 60)
if TIMEZONE_LOCAL_OFFSET_TOTAL_MINUTES < 0:
    TIMEZONE_LOCAL_OFFSET_HOURS = -TIMEZONE_LOCAL_OFFSET_HOURS
# The sign is written explicitly so that it is not lost when the hours part is zero.
TIMEZONE_LOCAL_SUFFIX = (
    f"{'-' if TIMEZONE_LOCAL_OFFSET_TOTAL_MINUTES < 0 else '+'}"
    f"{abs(TIMEZONE_LOCAL_OFFSET_HOURS):02d}:{TIMEZONE_LOCAL_OFFSET_MINUTES:02d}")  # type: str

# UTC timezone properties.
TIMEZONE_UTC = timezone.utc  # type: datetime.timezone
TIMEZONE_UTC_NAME = TIMEZONE_UTC.tzname(None)  # type: str
TIMEZONE_UTC_OFFSET = timedelta(0)  # type: datetime.timedelta
TIMEZONE_UTC_OFFSET_TOTAL_MINUTES = 0  # type: int
TIMEZONE_UTC_OFFSET_HOURS = 0  # type: int
TIMEZONE_UTC_OFFSET_MINUTES = 0  # type: int
TIMEZONE_UTC_SUFFIX = "Z"  # type: str


def parse_datetime_string(value: str) -> Optional[datetime]:
Expand Down Expand Up @@ -82,17 +99,203 @@ def normalize_date_string(value: str) -> Optional[str]:
return d.strftime("%Y-%m-%d") if d else None


def get_timezone(hours_or_timedelta: Union[int, timedelta], minutes: Optional[int] = None) -> timezone:
    """
    Returns a datetime.timezone for the given UTC offset, which is specified either as
    a timedelta (in which case the minutes argument is ignored), or as a number of hours
    plus an optional number of minutes. If a timezone cannot be created from the given
    arguments then the local timezone (TIMEZONE_LOCAL) is returned.
    """
    try:
        offset = hours_or_timedelta
        if not isinstance(offset, timedelta):
            offset = timedelta(hours=offset, minutes=minutes or 0)
        return timezone(offset)
    except Exception:
        return TIMEZONE_LOCAL


def get_timezone_offset(tz: timezone) -> timedelta:
    """
    Returns the UTC offset of the given timezone as a timedelta; if that cannot
    be obtained from the given value then returns the local timezone offset.
    """
    try:
        offset = tz.utcoffset(None)
    except Exception:
        offset = TIMEZONE_LOCAL_OFFSET
    return offset


def get_timezone_hours_minutes(tz: timezone) -> Tuple[int, int]:
    """
    Returns a tuple with the integer hours and minutes offset for the given timezone.
    If negative then only the hours is negative; the minutes is always positive;
    this is okay because there are no timezones less than one hour from UTC.
    For example a timezone at UTC-03:30 yields (-3, 30).
    """
    tz_offset_total_minutes = int(get_timezone_offset(tz).total_seconds()) // 60
    # Split the MAGNITUDE into hours/minutes and then re-apply the sign to the hours.
    # Floor division directly on a negative total rounds away from zero, which for
    # -03:30 (-210 minutes) would yield -4 hours and 30 minutes rather than -3 and 30.
    tz_offset_hours, tz_offset_minutes = divmod(abs(tz_offset_total_minutes), 60)
    if tz_offset_total_minutes < 0:
        tz_offset_hours = -tz_offset_hours
    return tz_offset_hours, tz_offset_minutes


def get_utc_timezone() -> timezone:
    """
    Returns the UTC timezone (the module-level TIMEZONE_UTC) as a datetime.timezone object.
    """
    return TIMEZONE_UTC


def get_local_timezone() -> timezone:
    """
    Returns current/local timezone as a datetime.timezone object.
    This is the module-level TIMEZONE_LOCAL value, not a fresh lookup.
    """
    return TIMEZONE_LOCAL


def get_local_timezone_string() -> str:
    """
    Returns current/local timezone in format like: "-05:00".
    This is the module-level TIMEZONE_LOCAL_SUFFIX value.
    """
    # The body previously had two consecutive returns, the second (this one) unreachable;
    # only the constant-based return is kept.
    return TIMEZONE_LOCAL_SUFFIX


def get_local_timezone_hours_minutes() -> Tuple[int, int]:
    """
    Returns a tuple with the integer hours and minutes offset for the current/local timezone.
    If negative then only the hours is negative; the minutes is always positive;
    this is okay because there are no timezones less than one hour from UTC.
    These are the module-level TIMEZONE_LOCAL_OFFSET_HOURS/MINUTES values.
    """
    # The body previously had two consecutive returns, the second (this one) unreachable;
    # only the constant-based return is kept.
    return TIMEZONE_LOCAL_OFFSET_HOURS, TIMEZONE_LOCAL_OFFSET_MINUTES


def parse_datetime(value: str, utc: bool = False, tz: Optional[timezone] = None) -> Optional[datetime]:
    """
    Parses the given string into a datetime and returns it, or returns None if it
    cannot be parsed (or is neither a string nor a datetime). A value which is
    already a datetime is returned as-is. If the parsed value carries its own
    timezone it is converted to the requested one, if any (UTC if the utc argument
    is True, which takes precedence, otherwise the tz argument if it is a
    datetime.timezone), else left as parsed. If the parsed value has no timezone
    then it is tagged (not converted) with the requested timezone, defaulting to
    the local timezone.
    """
    if isinstance(value, datetime):
        return value
    if not isinstance(value, str):
        return None
    try:
        # This dateutil.parser handles quite a wide variety of formats and suits our needs.
        parsed = datetime_parser.parse(value)
        # The utc argument, if True, trumps any given tz argument.
        requested_tz = timezone.utc if utc is True else tz
        have_requested_tz = isinstance(requested_tz, timezone)
        if parsed.tzinfo is None:
            # No explicit timezone in the given value; just attach one.
            return parsed.replace(tzinfo=requested_tz if have_requested_tz else get_local_timezone())
        # The given value had an explicit timezone specified.
        return parsed.astimezone(requested_tz) if have_requested_tz else parsed
    except Exception:
        return None


def format_datetime(value: datetime,
                    utc: bool = False,
                    tz: Optional[Union[timezone, bool]] = None,
                    iso: bool = False,
                    notz: bool = False,
                    noseconds: bool = False,
                    ms: bool = False,
                    verbose: bool = False,
                    noseparator: bool = False,
                    noday: bool = False,
                    nodate: bool = False,
                    notime: bool = False) -> str:
    """
    Returns the given datetime as a string in "YYYY-MM-DD hh:mm:ss tz" format, for
    example "2024-04-17 15:42:26 EDT". If the given notz argument is True then omits
    the timezone; if the noseconds argument is True then omits the seconds (and any
    microseconds); if the ms argument is True then includes microseconds. If the given
    verbose argument is True then returns a really verbose version of the datetime, for
    example "Wednesday, April 17, 2024 | 3:42:26 PM EDT"; if the noseparator argument is
    True then omits the "|" separator; if the noday argument is True then omits the day
    of week part. If the iso argument is True then returns an ISO-8601 style string, for
    example "2024-04-17T15:42:26-04:00". If the nodate or notime argument is True then
    omits the date or time part, respectively; if both are True returns an empty string.

    The timezone of the returned datetime string will default to the local one; if the
    given utc argument is True then it will be UTC; or if the given tz argument is a
    datetime.timezone it will be in that timezone. The tz argument may alternatively
    be a boolean, in which case it only controls whether the timezone is displayed:
    True forces it to be shown and False is equivalent to notz being True.

    The given value may also be a string, in which case it is first parsed with
    parse_datetime. Returns an empty string if the value is not usable or if
    formatting fails for any reason.
    """
    if nodate is True and notime is True:
        return ""
    if not isinstance(value, datetime):
        if not isinstance(value, str) or not (value := parse_datetime(value)):
            return ""
    try:
        # A boolean tz is just a show/hide switch for the timezone; this has to be looked
        # at BEFORE tz is replaced below with an actual timezone object, otherwise these
        # checks could never be true.
        if tz is True:
            notz = False
        elif tz is False:
            notz = True
        if utc is True:
            tz = timezone.utc
        elif not isinstance(tz, timezone):
            tz = get_local_timezone()
        if noseconds is True:
            ms = False
        value = value.astimezone(tz)
        if iso:
            if notz is True:
                value = value.replace(tzinfo=None)
            if ms is not True:
                value = value.replace(microsecond=0)
            # UTC offset as "+HH:MM"; this is empty if the timezone was dropped just above.
            offset = value.strftime("%z")
            if len(offset) > 3:
                offset = offset[:3] + ":" + offset[3:]
            if noseconds is True:
                if nodate is True:
                    return value.strftime("%H:%M") + offset
                elif notime is True:
                    return value.strftime("%Y-%m-%d") + offset
                return value.strftime("%Y-%m-%dT%H:%M") + offset
            if nodate is True:
                return value.strftime(f"%H:%M:%S{'.%f' if ms is True else ''}") + offset
            elif notime is True:
                return value.strftime("%Y-%m-%d")
            return value.isoformat()
        seconds_format = "" if noseconds is True else ":%S"
        ms_format = ".%f" if ms is True else ""
        tz_format = "" if notz is True else " %Z"
        if verbose:
            # The unpadded hour and day-of-month are formatted directly, rather than via the
            # "%-I" and "%-d" strftime directives, as those are not supported on all platforms.
            hour = value.hour % 12 or 12
            time_format = f"{hour}:%M{seconds_format}{ms_format} %p{tz_format}"
            date_format = f"{'' if noday is True else '%A, '}%B {value.day}, %Y"
            if nodate is True:
                return value.strftime(time_format)
            elif notime is True:
                return value.strftime(date_format)
            return value.strftime(f"{date_format}{'' if noseparator is True else ' |'} {time_format}")
        time_format = f"%H:%M{seconds_format}{ms_format}{tz_format}"
        if nodate is True:
            return value.strftime(time_format)
        elif notime is True:
            return value.strftime("%Y-%m-%d")
        return value.strftime(f"%Y-%m-%d {time_format}")
    except Exception:
        # Best effort; consistent with the other not-usable-value cases above (and with
        # the declared return type) an empty string, rather than None, is returned.
        return ""


def format_date(value: datetime,
                utc: bool = False,
                tz: Optional[Union[timezone, bool]] = None,
                verbose: bool = False,
                noday: bool = False) -> str:
    """
    Returns just the date part of the given datetime as a string; this is
    format_datetime called with the given arguments and with notime set to True.
    """
    options = dict(utc=utc, tz=tz, verbose=verbose, noday=noday, notime=True)
    return format_datetime(value, **options)


def format_time(value: datetime,
                utc: bool = False,
                iso: bool = False,
                tz: Optional[Union[timezone, bool]] = None,
                ms: bool = False,
                notz: bool = False,
                noseconds: bool = False,
                verbose: bool = False,
                noday: bool = False) -> str:
    """
    Returns just the time part of the given datetime as a string; this is
    format_datetime called with the given arguments and with nodate set to True.
    Note that the noday argument is accepted but is not passed on to format_datetime.
    """
    options = dict(utc=utc, tz=tz, iso=iso, ms=ms, notz=notz,
                   noseconds=noseconds, verbose=verbose, nodate=True)
    return format_datetime(value, **options)
65 changes: 65 additions & 0 deletions dcicutils/misc_utils.py
Expand Up @@ -2548,6 +2548,71 @@ def normalize_spaces(value: str) -> str:
return re.sub(r"\s+", " ", value).strip()


def find_nth_from_end(string: str, substring: str, nth: int) -> int:
    """
    Returns the index of the nth occurrence of the given substring within
    the given string from the END of the given string; or -1 if not found.
    The nth argument is one-based, i.e. 1 means the last occurrence, and if
    it is less than one then -1 is returned. The returned index is that of
    the start of the occurrence. Occurrences are allowed to overlap.
    """
    index = -1
    end = len(string)
    for _ in range(nth):
        if end < 0:
            # Nothing left to search; only reachable for an empty substring.
            return -1
        index = string.rfind(substring, 0, end)
        if index < 0:
            # Fewer than nth occurrences; stop here rather than search again.
            return -1
        # The next occurrence (going backwards) must start before this one.
        end = index + len(substring) - 1
    return index


def set_nth(string: str, nth: int, replacement: str) -> str:
    """
    Returns the given string with its nth (zero-based) character replaced by the given
    replacement string (which may be of any length). A negative nth counts from the end
    of the string. If nth is out of range, or if any argument is not of the expected
    type, then the given string is returned unchanged.
    """
    if not (isinstance(string, str) and isinstance(nth, int) and isinstance(replacement, str)):
        return string
    position = nth + len(string) if nth < 0 else nth
    if position < 0 or position >= len(string):
        return string
    return string[:position] + replacement + string[position + 1:]


def format_size(nbytes: Union[int, float], precision: int = 2, nospace: bool = False, terse: bool = False) -> str:
    """
    Returns the given number of bytes as a human readable string, using units which are
    multiples of 1024, for example "1.50 KB"; values which are less than 1024 are shown
    as a whole number of bytes, for example "1 byte" or "1023 bytes". The precision
    argument is the number of decimal places (negative values are treated as zero; and
    it is reduced by one, if not already zero, for values not greater than 1024). If the
    nospace argument is True then no space is placed between the number and the unit; if
    the terse argument is True then single letter units are used. A string of digits is
    also accepted; for any other kind of value an empty string is returned.
    """
    if isinstance(nbytes, str) and nbytes.isdigit():
        nbytes = int(nbytes)
    elif not isinstance(nbytes, (int, float)):
        return ""
    kilo = 1024
    unit_names = ('bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
    unit_names_terse = ('b', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
    precision = max(precision, 0)
    if precision and nbytes <= kilo:
        precision -= 1
    magnitude = 0
    largest_magnitude = len(unit_names) - 1
    while magnitude < largest_magnitude and abs(nbytes) >= kilo:
        nbytes /= kilo
        magnitude += 1
    if magnitude == 0:
        count = int(nbytes)
        plural = "" if count == 1 else "s"
        return f"{count} byte{plural}"
    separator = "" if nospace else " "
    unit = unit_names_terse[magnitude] if terse else unit_names[magnitude]
    return f"{nbytes:.{precision}f}{separator}{unit}"


def format_duration(seconds: Union[int, float]) -> str:
    """
    Returns the given number of seconds as a human readable duration string.
    A duration which rounds to less than a minute is shown as fractional seconds,
    to one decimal place, for example "12.3 seconds". Anything longer is rounded to
    the nearest second and shown as its non-zero years (of 365 days), days, hours,
    minutes, and seconds, for example "1 hour 1 minute 1 second" or "2 minutes".
    Negative values are treated as zero for the purpose of this breakdown.
    """
    total_seconds = round(max(seconds, 0))
    if total_seconds < 60:
        return f"{seconds:.1f} seconds"
    durations = [("year", 31536000), ("day", 86400), ("hour", 3600), ("minute", 60), ("second", 1)]
    parts = []
    remaining = total_seconds
    for name, duration in durations:
        count, remaining = divmod(remaining, duration)
        if count:
            parts.append(f"{count} {name}{'' if count == 1 else 's'}")
    # Note that an exact multiple of a single unit (e.g. 120 or 3600) results in just
    # that one part, i.e. "2 minutes" or "1 hour", rather than a total in seconds.
    return " ".join(parts)


class JsonLinesReader:

def __init__(self, fp, padded=False, padding=None):
Expand Down

0 comments on commit 5702f21

Please sign in to comment.