Calendar mapping bug (#70)

* Move to ruff for formatting and import sorting * Add test and fix calendar mapping issue * Pin pandas to <2.2 for now * Update changelog * Add changes to linters/formatters to changelog
AI4S2S · Feb 29, 2024 · 4c336c0 · 4c336c0
1 parent c0ef954
commit 4c336c0
Show file tree

Hide file tree

Showing 6 changed files with 53 additions and 46 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/).
 
 ## [Unreleased]
 
+### Changed
+ - Moved to ruff formatter instead of black ([#70](https://github.com/AI4S2S/lilio/pull/70))
+ - Do import sorting with ruff instead of isort ([#70](https://github.com/AI4S2S/lilio/pull/70))
+
+### Fixed
+ - Fixed issue with calendar generation when the (rightmost) target period crossed into the new year ([#70](https://github.com/AI4S2S/lilio/pull/70)).
+
 ## 0.4.2 (2024-01-19)
 ### Changed
 - Consistent output type of train-test split as input ([#62](https://github.com/AI4S2S/lilio/pull/62)).

diff --git a/lilio/calendar.py b/lilio/calendar.py
@@ -488,7 +488,7 @@ def _set_year_range_from_timestamps(self):
 
         # ensure that the input data could always cover the advent calendar
         # last date check
-        if self._map_year(max_year).iloc[0].right > self._last_timestamp:
+        while self._map_year(max_year).iloc[0].right > self._last_timestamp:
             max_year -= 1
         # first date check
         while self._map_year(min_year).iloc[-1].right <= self._first_timestamp:

diff --git a/lilio/resampling.py b/lilio/resampling.py
@@ -42,7 +42,7 @@ def _check_valid_resampling_methods(method: ResamplingMethod):
 
 
 def _mark_target_period(
-    input_data: Union[pd.DataFrame, xr.Dataset]
+    input_data: Union[pd.DataFrame, xr.Dataset],
 ) -> Union[pd.DataFrame, xr.Dataset]:
     """Mark interval periods that fall within the given number of target periods.
 
@@ -77,7 +77,7 @@ def _mark_target_period(
 
 
 def _resample_bins_constructor(
-    intervals: Union[pd.Series, pd.DataFrame]
+    intervals: Union[pd.Series, pd.DataFrame],
 ) -> pd.DataFrame:
     """Restructures the interval object into a tidy DataFrame.
 

diff --git a/lilio/utils.py b/lilio/utils.py
@@ -16,7 +16,7 @@
 
 
 def check_timeseries(
-    data: Union[pd.Series, pd.DataFrame, xr.DataArray, xr.Dataset]
+    data: Union[pd.Series, pd.DataFrame, xr.DataArray, xr.Dataset],
 ) -> None:
     """Check if input data contains valid time data.
 
@@ -96,7 +96,7 @@ def check_empty_intervals(indices_list: list[np.ndarray]) -> None:
 
 
 def infer_input_data_freq(
-    data: Union[pd.Series, pd.DataFrame, xr.DataArray, xr.Dataset]
+    data: Union[pd.Series, pd.DataFrame, xr.DataArray, xr.Dataset],
 ) -> pd.Timedelta:
     """Infer the frequency of the input data, for comparison with the calendar freq.
 
@@ -207,7 +207,7 @@ def convert_interval_to_bounds(data: xr.Dataset) -> xr.Dataset:
 
 
 def check_reserved_names(
-    input_data: Union[pd.Series, pd.DataFrame, xr.DataArray, xr.Dataset]
+    input_data: Union[pd.Series, pd.DataFrame, xr.DataArray, xr.Dataset],
 ) -> None:
     """Check if reserved names are already in the input data. E.g. "anchor_year"."""
     reserved_names_pd = ["anchor_year", "i_interval", "is_target"]

diff --git a/pyproject.toml b/pyproject.toml
@@ -57,7 +57,7 @@ classifiers = [
 dependencies = [
   "netcdf4",
   "numpy",
-  "pandas",
+  "pandas<2.2",
   "matplotlib",
   "xarray",
   "scikit-learn",
@@ -69,8 +69,6 @@ dev = [
   "bump-my-version",
   "hatch",
   "ruff",
-  "isort",
-  "black",
   "mypy",
   "pytest",
   "pytest-cov",
@@ -96,10 +94,9 @@ features = ["dev", "bokeh"]
 lint = [
   "ruff check .",
   "mypy .",
-  "isort --check-only --diff .",
-  "black --check --diff .",
+  "ruff format . --check",
 ]
-format = ["isort .", "black .", "lint",]
+format = ["ruff format .", "ruff check . --fix", "lint",]
 test = ["pytest ./lilio/ ./tests/ --doctest-modules",]
 coverage = [
   "pytest --cov --cov-report term --cov-report xml --junitxml=xunit-result.xml tests/",
@@ -120,19 +117,19 @@ testpaths = ["tests"]
 ignore_missing_imports = true
 python_version = "3.10"
 
-[tool.black]
+[tool.ruff]
+target-version = "py39"
 line-length = 88
-target-version = ['py39', 'py310', 'py311']
-include = '\.pyi?$'
+exclude = ["docs", "build"]
 
-[tool.ruff]
+[tool.ruff.lint]
 select = [
   "E",  # pycodestyle
   "F",  # pyflakes
   "B",  # flake8-bugbear
   "D",  # pydocstyle
   "C90",  # mccabe complexity
-#  "I",  # isort (autosort not working correctly, disabled for now).
+  "I",  # isort (autosort not working correctly, disabled for now).
   "N",  # PEP8-naming
   "UP",  # pyupgrade (upgrade syntax to current syntax)
   "PLE",  # Pylint error https://github.com/charliermarsh/ruff#error-ple
@@ -148,41 +145,20 @@ extend-select = [
 ignore = [
   "PLR2004",  # magic value used in comparsion (i.e. `if ndays == 28: month_is_feb`).
 ]
-# Allow autofix for all enabled rules (when `--fix`) is provided.
-fixable = ["A", "B", "C90", "D", "E", "F", "TID252", "UP"]
 unfixable = []
-line-length = 88
-exclude = ["docs", "build"]
 # Allow unused variables when underscore-prefixed.
 dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
-target-version = "py39"
+pydocstyle.convention = "google"
+mccabe.max-complexity = 10
 
-[tool.ruff.per-file-ignores]
+[tool.ruff.lint.per-file-ignores]
 "tests/**" = ["D"]
 
-[tool.ruff.pydocstyle]
-convention = "google"
-
-[tool.ruff.mccabe]
-max-complexity = 10
-
-# Configuration for when ruff's import sorting is fixed.
-# [tool.ruff.isort]
-# known-first-party = ["lilio"]
-# force-single-line = true
-# lines-after-imports = 2
-# no-lines-before = ["future","standard-library","third-party","first-party","local-folder"]
-
-[tool.isort]
-py_version=39
-skip = [".gitignore", ".dockerignore"]
-skip_glob = ["docs/*"]
-force_single_line = true
-lines_after_imports = 2
-no_lines_before = ["FUTURE","STDLIB","THIRDPARTY","FIRSTPARTY","LOCALFOLDER"]
-known_first_party = ["lilio"]
-src_paths = ["lilio", "tests"]
-line_length = 120
+[tool.ruff.lint.isort]
+known-first-party = ["lilio"]
+force-single-line = true
+lines-after-imports = 2
+no-lines-before = ["future","standard-library","third-party","first-party","local-folder"]
 
 [tool.coverage.run]
 branch = true

diff --git a/tests/test_calendar.py b/tests/test_calendar.py
@@ -223,6 +223,30 @@ def test_allow_overlap(self, allow_overlap, expected_anchors):
         cal.map_years(2020, 2022)
         assert np.array_equal(expected_anchors, cal.get_intervals().index.values)
 
+    def test_extra_year_edgecase(self):
+        """Weird things can happen when a calendar interval crosses over the new year.
+
+        2 years have to be subtracted from the last datapoint's year.
+        """
+        cal = Calendar("12-25")
+        cal.add_intervals("target", length="1M")
+        dates_inclusive = pd.date_range(start="2007-01-01", end="2010-01-01", freq="1d")
+        dates_exclusive = pd.date_range(start="2007-01-01", end="2009-12-31", freq="1d")
+        test_data_incl = pd.Series(
+            data=np.zeros(len(dates_inclusive)), index=dates_inclusive
+        )
+        test_data_excl = pd.Series(
+            data=np.zeros(len(dates_exclusive)), index=dates_exclusive
+        )
+
+        cal.map_to_data(test_data_incl)
+        n_years_inclusive = len(cal.get_intervals())
+        cal.map_to_data(test_data_excl)
+        n_years_exclusive = len(cal.get_intervals())
+
+        assert n_years_exclusive == 2
+        assert n_years_inclusive == 2
+
 
 class TestAnchorKwarg:
     correct_inputs = [  # format: (anchor, anchor_fmt, anchor_str)