Merge pull request #304 from 4dn-dcic/minor-smaht-submitr-related-fix…

…es-20240412 Minor fix in structured_data to not try to resolve empty refs.
4dn-dcic · Apr 22, 2024 · 5702f21 · 5702f21
2 parents 9c53bb0 + 480bcde
commit 5702f21
Show file tree

Hide file tree

Showing 11 changed files with 734 additions and 56 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -7,6 +7,21 @@ Change Log
 ----------
 
 
+8.8.4
+=====
+* Minor fix in structured_data to not try to resolve empty refs in norefs mode;
+  and added StructuredDataSet.unchecked_refs; not functionally substantive as
+  used (only) with smaht-submitr/submit-metadata-bundle --info --refs.
+* Added nrows and nsheets to data_reader; convenience for smaht-submitr/submit-metadata-bundle --info.
+* Added test_progress_bar module for progress_bar testing; would like to add more tests.
+* Fixed up captured_output module to handle UTF-8 encoding to help unit testing progress_bar.
+* Added hooks to progress_bar to help unit testing.
+* Added a find_nth_from_end and set_nth to misc_utils to help progress_bar unit testing.
+* Added format_size and format_duration to misc_utils; refactor from smaht-submitr.
+* Added format_datetime, parse_datetime to datetime_utils; refactor from smaht-submitr; and some tests.
+* Added check_only flag to portal_utils.Portal.{post,patch}_metadata (came up in ad hoc troubleshooting).
+
+
 8.8.3
 =====
 * Minor fix in structured_data related to smaht-submitr progress monitoring.

diff --git a/dcicutils/captured_output.py b/dcicutils/captured_output.py
@@ -9,7 +9,7 @@
 
 
 @contextmanager
-def captured_output(capture: bool = True):
+def captured_output(capture: bool = True, encoding: Optional[str] = None):
     """
     Context manager to capture any/all output to stdout or stderr, and not actually output it to stdout
     or stderr. Yields an object with a get_captured_output() method to get the output captured thus far,
@@ -24,7 +24,9 @@ def captured_output(capture: bool = True):
 
     original_stdout = _real_stdout
     original_stderr = _real_stderr
-    captured_output = io.StringIO()
+    # FYI: This encoding business with _EncodedStringIO was introduced (circa April 2024)
+    # when we ran into issues unit testing progress_bar, which outputs those funny block characters.
+    captured_output = io.StringIO() if not encoding else _EncodedStringIO(encoding)
 
     def set_original_output() -> None:
         sys.stdout = original_stdout
@@ -68,3 +70,19 @@ def uncaptured_output():
     finally:
         sys.stdout = original_stdout
         sys.stderr = original_stderr
+
+
+class _EncodedStringIO:
+    """
+    Minimal text-stream stand-in (write/flush/getvalue) which stores whatever is written
+    to it as bytes in the given encoding, and decodes those bytes on the way back out.
+    Used by captured_output in place of io.StringIO when an explicit encoding is given.
+    """
+
+    def __init__(self, encoding: str = "utf-8"):
+        self.encoding = encoding
+        self.buffer = io.BytesIO()
+
+    def write(self, s) -> int:
+        """
+        Encodes and stores the given string; returns the number of characters written,
+        as text streams do, so callers using the result of a write() keep working.
+        """
+        self.buffer.write(s.encode(self.encoding))
+        return len(s)
+
+    def flush(self) -> None:
+        self.buffer.flush()
+
+    def getvalue(self) -> str:
+        """
+        Returns everything written so far, decoded back into a string.
+        """
+        return self.buffer.getvalue().decode(self.encoding)
+
+    def __str__(self) -> str:
+        return self.getvalue()
+
+    def __repr__(self) -> str:
+        return repr(self.getvalue())
diff --git a/dcicutils/data_readers.py b/dcicutils/data_readers.py
@@ -66,6 +66,13 @@ def cell_value(self, value: Optional[Any]) -> str:
         else:
             return value
 
+    @property
+    def nrows(self) -> int:
+        """
+        Returns the number of rows, determined by iterating over this object.
+        """
+        return sum(1 for _ in self)
+
     def open(self) -> None:
         pass
 
@@ -192,6 +199,10 @@ def is_hidden_sheet(self, sheet: openpyxl.worksheet.worksheet.Worksheet) -> bool
                 return True
         return False
 
+    @property
+    def nsheets(self) -> int:
+        """
+        Returns the number of sheets, i.e. the number of entries in sheet_names.
+        """
+        names = self.sheet_names
+        return len(names)
+
     def __del__(self) -> None:
         if (workbook := self._workbook) is not None:
             self._workbook = None

diff --git a/dcicutils/datetime_utils.py b/dcicutils/datetime_utils.py
@@ -1,6 +1,23 @@
 from dcicutils.misc_utils import normalize_spaces
 from datetime import datetime, timedelta, timezone
-from typing import Optional, Tuple
+from dateutil import parser as datetime_parser
+from typing import Optional, Tuple, Union
+
+# Local timezone facts; computed once, at import time.
+TIMEZONE_LOCAL = datetime.now().astimezone().tzinfo  # type: datetime.timezone
+TIMEZONE_LOCAL_NAME = TIMEZONE_LOCAL.tzname(None)  # type: str
+TIMEZONE_LOCAL_OFFSET = TIMEZONE_LOCAL.utcoffset(None)  # type: datetime.timedelta
+TIMEZONE_LOCAL_OFFSET_TOTAL_MINUTES = int(TIMEZONE_LOCAL_OFFSET.total_seconds()) // 60  # type: int
+# Truncate toward zero (rather than floor) for the hours, and take the minutes from the
+# magnitude, so that a negative non-whole-hour offset like -03:30 yields -3 and 30, not -4 and 30.
+TIMEZONE_LOCAL_OFFSET_HOURS = int(TIMEZONE_LOCAL_OFFSET_TOTAL_MINUTES / 60)  # type: int
+TIMEZONE_LOCAL_OFFSET_MINUTES = abs(TIMEZONE_LOCAL_OFFSET_TOTAL_MINUTES) % 60  # type: int
+TIMEZONE_LOCAL_SUFFIX = f"{TIMEZONE_LOCAL_OFFSET_HOURS:+03d}:{TIMEZONE_LOCAL_OFFSET_MINUTES:02d}"  # type: str
+
+# The same set of facts for UTC.
+TIMEZONE_UTC = timezone.utc  # type: datetime.timezone
+TIMEZONE_UTC_NAME = TIMEZONE_UTC.tzname(None)  # type: str
+TIMEZONE_UTC_OFFSET = timedelta(0)  # type: datetime.timedelta
+TIMEZONE_UTC_OFFSET_TOTAL_MINUTES = 0  # type: int
+TIMEZONE_UTC_OFFSET_HOURS = 0  # type: int
+TIMEZONE_UTC_OFFSET_MINUTES = 0  # type: int
+TIMEZONE_UTC_SUFFIX = "Z"  # type: str
 
 
 def parse_datetime_string(value: str) -> Optional[datetime]:
@@ -82,17 +99,203 @@ def normalize_date_string(value: str) -> Optional[str]:
     return d.strftime("%Y-%m-%d") if d else None
 
 
+def get_timezone(hours_or_timedelta: Union[int, timedelta], minutes: Optional[int] = None) -> timezone:
+    """
+    Returns a datetime.timezone for the given offset, specified either as a timedelta
+    or as a number of hours plus optional minutes. If a timezone cannot be constructed
+    from the given arguments then returns the local timezone.
+    """
+    try:
+        if isinstance(hours_or_timedelta, timedelta):
+            offset = hours_or_timedelta
+        else:
+            offset = timedelta(hours=hours_or_timedelta, minutes=minutes or 0)
+        return timezone(offset)
+    except Exception:
+        return TIMEZONE_LOCAL
+
+
+def get_timezone_offset(tz: timezone) -> timedelta:
+    """
+    Returns the UTC offset of the given timezone as a timedelta; if that
+    cannot be obtained then returns the offset of the local timezone.
+    """
+    try:
+        offset = tz.utcoffset(None)
+    except Exception:
+        offset = TIMEZONE_LOCAL_OFFSET
+    return offset
+
+
+def get_timezone_hours_minutes(tz: timezone) -> Tuple[int, int]:
+    """
+    Returns a tuple with the integer hours and minutes offset for the given timezone.
+    If negative then only the hours is negative; the minutes is always positive;
+    this is okay because there are no timezones less than one hour from UTC.
+    For example an offset of -03:30 yields (-3, 30).
+    """
+    tz_offset = get_timezone_offset(tz)
+    tz_offset_total_minutes = int(tz_offset.total_seconds()) // 60
+    # Split the magnitude and then re-apply the sign to the hours; floor-dividing
+    # a negative total directly would turn -03:30 into (-4, 30).
+    tz_offset_hours, tz_offset_minutes = divmod(abs(tz_offset_total_minutes), 60)
+    if tz_offset_total_minutes < 0:
+        tz_offset_hours = -tz_offset_hours
+    return tz_offset_hours, tz_offset_minutes
+
+
+def get_utc_timezone() -> timezone:
+    """
+    Returns the UTC timezone as a datetime.timezone object.
+    """
+    return TIMEZONE_UTC
+
+
+def get_local_timezone() -> timezone:
+    """
+    Returns current/local timezone as a datetime.timezone object; this is
+    the module-level TIMEZONE_LOCAL value, which is computed at import time.
+    """
+    return TIMEZONE_LOCAL
+
+
 def get_local_timezone_string() -> str:
     """
     Returns current/local timezone in format like: "-05:00".
     """
-    tz_hours, tz_minutes = get_local_timezone_hours_minutes()
-    return f"{tz_hours:+03d}:{tz_minutes:02d}"
+    return TIMEZONE_LOCAL_SUFFIX
 
 
 def get_local_timezone_hours_minutes() -> Tuple[int, int]:
     """
     Returns a tuple with the integer hours and minutes offset for the current/local timezone.
+    If negative then only the hours is negative; the minutes is always positive;
+    this is okay because there are no timezones less than one hour from UTC.
     """
-    tz_minutes = datetime.now(timezone.utc).astimezone().utcoffset().total_seconds() / 60
-    return int(tz_minutes // 60), int(abs(tz_minutes % 60))
+    return TIMEZONE_LOCAL_OFFSET_HOURS, TIMEZONE_LOCAL_OFFSET_MINUTES
+
+
+def parse_datetime(value: str, utc: bool = False, tz: Optional[timezone] = None) -> Optional[datetime]:
+    """
+    Parses the given string into a datetime, if possible, and returns that value, or
+    None if not able to parse. A value which is already a datetime is returned as-is.
+    The timezone of a parsed datetime will be the local timezone; or if the given utc
+    argument is True then it will be UTC; or if the given tz argument is a
+    datetime.timezone then the returned datetime will be in that timezone.
+    """
+    if isinstance(value, datetime):
+        return value
+    if not isinstance(value, str):
+        return None
+    try:
+        # This dateutil.parser handles quite a wide variety of formats and suits our needs.
+        parsed = datetime_parser.parse(value)
+        # If the given utc argument is True then it trumps any tz argument if given.
+        target = timezone.utc if utc is True else tz
+        have_target = isinstance(target, timezone)
+        if parsed.tzinfo is None:
+            # No timezone in the given value; simply stamp it with the desired one.
+            return parsed.replace(tzinfo=target if have_target else get_local_timezone())
+        # The given value had an explicit timezone specified; convert only if asked to.
+        return parsed.astimezone(target) if have_target else parsed
+    except Exception:
+        return None
+
+
+def format_datetime(value: datetime,
+                    utc: bool = False,
+                    tz: Optional[Union[timezone, bool]] = None,
+                    iso: bool = False,
+                    notz: bool = False,
+                    noseconds: bool = False,
+                    ms: bool = False,
+                    verbose: bool = False,
+                    noseparator: bool = False,
+                    noday: bool = False,
+                    nodate: bool = False,
+                    notime: bool = False) -> str:
+    """
+    Returns the given datetime (or parsable datetime string) as a string in
+    "YYYY-MM-DD hh:mm:ss tz" format, for example "2024-04-17 15:42:26 EDT". If the
+    given notz argument is True then omits the timezone; if the noseconds argument is
+    True then omits the seconds; if the ms argument is True then includes fractional
+    seconds. If the given iso argument is True then returns an ISO-8601 style string.
+    If the given verbose argument is True then returns a really verbose version of the
+    datetime, for example "Wednesday, April 17, 2024 | 15:42:26 EDT"; if the noseparator
+    argument is True then omits the "|" separator; if the noday argument is True then
+    omits the day of week part. If the nodate (or notime) argument is True then only the
+    time (or date) part is returned. The timezone of the returned datetime string will
+    default to the local one; if the given utc argument is True then it will be UTC; or if
+    the given tz argument is a datetime.timezone it will be in that timezone; a boolean tz
+    argument is shorthand for showing (True) or hiding (False) the local timezone.
+    Returns an empty string if both nodate and notime are True, if the given value is not
+    a datetime or a parsable datetime string, or if the formatting fails for any reason.
+    """
+    if nodate is True and notime is True:
+        return ""
+    if not isinstance(value, datetime):
+        if not isinstance(value, str) or not (value := parse_datetime(value)):
+            return ""
+    try:
+        if utc is True:
+            tz = timezone.utc
+        elif not isinstance(tz, timezone):
+            # A boolean tz just toggles the timezone display; this must be looked at
+            # BEFORE tz gets replaced with the actual local timezone object below.
+            if tz is True:
+                notz = False
+            elif tz is False:
+                notz = True
+            tz = get_local_timezone()
+        if noseconds is True:
+            ms = False
+        value = value.astimezone(tz)
+        if iso:
+            if notz is True:
+                value = value.replace(tzinfo=None)
+            if not (ms is True):
+                value = value.replace(microsecond=0)
+            if noseconds is True:
+                if notz is True:
+                    if nodate is True:
+                        return value.strftime("%H:%M")
+                    elif notime is True:
+                        return value.strftime("%Y-%m-%d")
+                    else:
+                        return value.strftime("%Y-%m-%dT%H:%M")
+                # Turn the "+hhmm" produced by %z into the ISO "+hh:mm" form.
+                if len(tz := value.strftime("%z")) > 3:
+                    tz = tz[:3] + ":" + tz[3:]
+                if nodate is True:
+                    return value.strftime("%H:%M") + tz
+                elif notime is True:
+                    return value.strftime("%Y-%m-%d") + tz
+                else:
+                    return value.strftime("%Y-%m-%dT%H:%M") + tz
+            if nodate is True:
+                if (not (notz is True)) and len(tz := value.strftime("%z")) > 3:
+                    tz = tz[:3] + ":" + tz[3:]
+                else:
+                    tz = ""
+                return value.strftime(f"%H:%M:%S{'.%f' if ms is True else ''}") + tz
+            elif notime is True:
+                return value.strftime("%Y-%m-%d")
+            else:
+                return value.isoformat()
+        # NOTE(review): the "%-I" and "%-d" (no zero padding) directives used for the
+        # verbose format are platform-specific strftime extensions; confirm the targets.
+        if verbose:
+            if nodate is True:
+                return value.strftime(
+                    f"%-I:%M{'' if noseconds is True else ':%S'}"
+                    f"{'.%f' if ms is True else ''} %p{'' if notz is True else ' %Z'}")
+            elif notime is True:
+                return value.strftime(f"{'' if noday is True else '%A, '}%B %-d, %Y")
+            else:
+                return value.strftime(
+                    f"{'' if noday is True else '%A, '}%B %-d, %Y{'' if noseparator is True else ' |'}"
+                    f" %-I:%M{'' if noseconds is True else ':%S'}"
+                    f"{'.%f' if ms is True else ''} %p{'' if notz is True else ' %Z'}")
+        else:
+            if nodate is True:
+                return value.strftime(
+                    f"%H:%M{'' if noseconds is True else ':%S'}"
+                    f"{'.%f' if ms is True else ''}{'' if notz is True else ' %Z'}")
+            elif notime is True:
+                return value.strftime("%Y-%m-%d")
+            else:
+                return value.strftime(
+                    f"%Y-%m-%d %H:%M{'' if noseconds is True else ':%S'}"
+                    f"{'.%f' if ms is True else ''}{'' if notz is True else ' %Z'}")
+    except Exception:
+        # Keep to the declared str return type, consistent with the other failure paths above.
+        return ""
+
+
+def format_date(value: datetime,
+                utc: bool = False,
+                tz: Optional[Union[timezone, bool]] = None,
+                verbose: bool = False,
+                noday: bool = False) -> str:
+    """
+    Returns only the date part of the given datetime as a string; this is a
+    convenience wrapper which calls format_datetime with notime set to True.
+    """
+    return format_datetime(value, utc=utc, tz=tz, verbose=verbose, noday=noday, notime=True)
+
+
+def format_time(value: datetime,
+                utc: bool = False,
+                iso: bool = False,
+                tz: Optional[Union[timezone, bool]] = None,
+                ms: bool = False,
+                notz: bool = False,
+                noseconds: bool = False,
+                verbose: bool = False,
+                noday: bool = False) -> str:
+    """
+    Returns only the time part of the given datetime as a string; this is a
+    convenience wrapper which calls format_datetime with nodate set to True.
+    """
+    # NOTE(review): the noday argument is accepted but is not passed on to format_datetime.
+    return format_datetime(value, utc=utc, tz=tz, iso=iso, ms=ms, notz=notz,
+                           noseconds=noseconds, verbose=verbose, nodate=True)
diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
@@ -2548,6 +2548,71 @@ def normalize_spaces(value: str) -> str:
     return re.sub(r"\s+", " ", value).strip()
 
 
+def find_nth_from_end(string: str, substring: str, nth: int) -> int:
+    """
+    Returns the index of the nth occurrence of the given substring within
+    the given string from the END of the given string; or -1 if not found,
+    including when there are fewer than nth occurrences or nth is less than one.
+    Occurrences are allowed to overlap.
+    """
+    index = -1
+    end = len(string)
+    for _ in range(nth):
+        # Search backwards within string[0:end]; stop at the first miss so that an nth
+        # beyond the number of occurrences is reported as not found (no wrapping around).
+        if end < 0 or (index := string.rfind(substring, 0, end)) < 0:
+            return -1
+        # The next (earlier) occurrence must end before the end of the current one.
+        end = index + len(substring) - 1
+    return index
+
+
+def set_nth(string: str, nth: int, replacement: str) -> str:
+    """
+    Returns the given string with its nth character replaced by the given replacement
+    string. A negative nth counts from the end of the string. The given string is
+    returned unchanged if nth is out of range or if any argument is of the wrong type.
+    """
+    arguments_ok = isinstance(string, str) and isinstance(nth, int) and isinstance(replacement, str)
+    if not arguments_ok:
+        return string
+    length = len(string)
+    position = nth + length if nth < 0 else nth
+    if position < 0 or position >= length:
+        return string
+    head, tail = string[:position], string[position + 1:]
+    return head + replacement + tail
+
+
+def format_size(nbytes: Union[int, float], precision: int = 2, nospace: bool = False, terse: bool = False) -> str:
+    """
+    Returns the given number of bytes as a human readable string using 1024-based
+    units, for example "1.50 KB"; values below 1024 are given as whole bytes, for
+    example "123 bytes". The precision argument is the number of decimal places; if
+    the nospace argument is True then there is no space between the number and the
+    unit; if the terse argument is True then single letter units are used. A string
+    of digits is accepted for nbytes; any other non-numeric value yields an empty string.
+    """
+    if isinstance(nbytes, str) and nbytes.isdigit():
+        nbytes = int(nbytes)
+    elif not isinstance(nbytes, (int, float)):
+        return ""
+    if terse:
+        units = ['b', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y']
+    else:
+        units = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
+    kilo = 1024
+    last_unit = len(units) - 1
+    precision = max(precision, 0)
+    if precision and nbytes <= kilo:
+        # One less decimal place for values not above one K (e.g. exactly 1024 is "1.0 KB").
+        precision -= 1
+    unit_index = 0
+    while unit_index < last_unit and abs(nbytes) >= kilo:
+        nbytes /= kilo
+        unit_index += 1
+    if unit_index == 0:
+        whole = int(nbytes)
+        plural = "" if whole == 1 else "s"
+        return f"{whole} byte{plural}"
+    separator = "" if nospace else " "
+    return f"{nbytes:.{precision}f}{separator}{units[unit_index]}"
+
+
+def format_duration(seconds: Union[int, float]) -> str:
+    """
+    Returns the given number of seconds as a human readable duration string, for
+    example "1 hour 1 minute 1 second". A duration which rounds to less than one
+    minute is given in seconds with one decimal place, for example "45.0 seconds".
+    """
+    seconds_actual = seconds
+    seconds = round(max(seconds, 0))
+    if seconds < 60:
+        # Sub-minute durations keep their fractional part rather than being rounded.
+        return f"{seconds_actual:.1f} seconds"
+    durations = [("year", 31536000), ("day", 86400), ("hour", 3600), ("minute", 60), ("second", 1)]
+    parts = []
+    for name, duration in durations:
+        count, seconds = divmod(seconds, duration)
+        if count:
+            parts.append(f"{count} {name}{'' if count == 1 else 's'}")
+    # A duration which is a whole number of a single unit (e.g. 3600) yields just that
+    # one part (e.g. "1 hour") rather than falling back to a raw count of seconds.
+    return " ".join(parts)
+
+
 class JsonLinesReader:
 
     def __init__(self, fp, padded=False, padding=None):