Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Guess start and end date when no time bounds are available #116

Merged
merged 3 commits into from
Sep 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 90 additions & 32 deletions src/access_nri_intake/source/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import re
import warnings
from datetime import timedelta
from pathlib import Path

import cftime
Expand All @@ -15,10 +16,56 @@
pass


def get_timeinfo(ds, time_dim="time"):
def _add_month_start(time, n):
"""Add months to cftime datetime and truncate to start"""
year = time.year + ((time.month + n - 1) // 12)
month = (time.month + n - 1) % 12 + 1
return time.replace(
year=year, month=month, day=1, hour=0, minute=0, second=0, microsecond=0
)


def _add_year_start(time, n):
"""Add years to cftime datetime and truncate to start"""
return time.replace(
year=time.year + n, month=1, day=1, hour=0, minute=0, second=0, microsecond=0
)


def _guess_start_end_dates(ts, te, frequency):
"""Guess the start and end bounded times for a given frequency"""
warnings.warn(
"Time coordinate does not include bounds information. Guessing "
"start and end times."
)
num, unit = frequency
if unit == "yr":
step_back = -int(num / 2)
step_fwd = num + step_back
ts = _add_year_start(ts, step_back)
te = _add_year_start(te, step_fwd)
elif unit == "mon":
step_back = -int(num / 2)
step_fwd = num + step_back
ts = _add_month_start(ts, step_back)
te = _add_month_start(te, step_fwd)
elif unit == "day":
dt = timedelta(days=num) / 2
ts = ts - dt
te = te + dt
elif unit == "hr":
dt = timedelta(hours=num) / 2
ts = ts - dt
te = te + dt
else:
warnings.warn("Cannot infer start and end times for subhourly frequencies.")

Check warning on line 61 in src/access_nri_intake/source/utils.py

View check run for this annotation

Codecov / codecov/patch

src/access_nri_intake/source/utils.py#L61

Added line #L61 was not covered by tests
return ts, te


def get_timeinfo(ds, filename_frequency, time_dim):
"""
Get start time, end time and frequency of a xarray dataset. Stolen and slightly adapted
from cosima cookbook, see
Get start date, end date and frequency of a xarray dataset. Stolen and adapted from the
cosima cookbook, see
https://github.com/COSIMA/cosima-cookbook/blob/master/cosima_cookbook/database.py#L565

Parameters
Expand All @@ -33,11 +80,12 @@
return cftime.num2date(t, time_var.units, calendar=time_var.calendar)

time_format = "%Y-%m-%d, %H:%M:%S"
start_time = "none"
end_time = "none"
start_date = "none"
end_date = "none"
frequency = "fx"
has_time = time_dim in ds

if time_dim in ds:
if has_time:
time_var = ds[time_dim]

if len(time_var) == 0:
Expand All @@ -64,22 +112,40 @@
# TODO: This is not a very good way to get the frequency
if dt.days >= 365:
years = round(dt.days / 365)
frequency = f"{years}yr"
frequency = (years, "yr")
elif dt.days >= 28:
months = round(dt.days / 30)
frequency = f"{months}mon"
frequency = (months, "mon")
elif dt.days >= 1:
frequency = f"{dt.days}day"
frequency = (dt.days, "day")
elif dt.seconds >= 3600:
hours = round(dt.seconds / 3600)
frequency = f"{hours}hr"
frequency = (hours, "hr")
else:
frequency = "subhr"
frequency = (None, "subhr")

if filename_frequency:
if filename_frequency != frequency:
msg = (
f"The frequency '{filename_frequency}' determined from filename does not "
f"match the frequency '{frequency}' determined from the file contents."
)
if frequency == "fx":
frequency = filename_frequency
warnings.warn(f"{msg} Using '{frequency}'.")

start_time = ts.strftime(time_format)
end_time = te.strftime(time_format)
if has_time & (frequency != "fx"):
if not has_bounds:
ts, te = _guess_start_end_dates(ts, te, frequency)
start_date = ts.strftime(time_format)
dougiesquire marked this conversation as resolved.
Show resolved Hide resolved
end_date = te.strftime(time_format)

return (start_time, end_time, frequency)
if frequency[0]:
frequency = f"{str(frequency[0])}{frequency[1]}"
else:
frequency = frequency[1]

return start_date, end_date, frequency


def parse_access_filename(filename):
Expand Down Expand Up @@ -117,12 +183,12 @@
}
# Frequency translations
frequencies = {
"daily": "1day",
"_dai$": "1day",
"month": "1mon",
"_mon$": "1mon",
"yearly": "1yr",
"_ann$": "1yr",
"daily": (1, "day"),
"_dai$": (1, "day"),
"month": (1, "mon"),
"_mon$": (1, "mon"),
"yearly": (1, "yr"),
"_ann$": (1, "yr"),
}
redaction_fill = "X"

Expand Down Expand Up @@ -151,14 +217,16 @@
return file_id, timestamp, frequency


def parse_access_ncfile(file):
def parse_access_ncfile(file, time_dim="time"):
"""
Get Intake-ESM datastore entry info from an ACCESS netcdf file

Parameters
----------
file: str
The path to the netcdf file
time_dim: str
The name of the time dimension

Returns
-------
Expand Down Expand Up @@ -190,21 +258,11 @@
if "cell_methods" in attrs:
variable_cell_methods_list.append(attrs["cell_methods"])

start_date, end_date, frequency = get_timeinfo(ds)
start_date, end_date, frequency = get_timeinfo(ds, filename_frequency, time_dim)

if not variable_list:
raise EmptyFileError("This file contains no variables")

if filename_frequency:
if filename_frequency != frequency:
msg = (
f"The frequency '{filename_frequency}' determined from filename {filename} does not "
f"match the frequency '{frequency}' determined from the file contents."
)
if frequency == "fx":
frequency = filename_frequency
warnings.warn(f"{msg} Using '{frequency}'.")

outputs = (
filename,
file_id,
Expand Down
Loading
Loading