Skip to content

Commit

Permalink
Fix: Force index type to str (#28)
Browse files Browse the repository at this point in the history
  • Loading branch information
adrien-berchet committed Aug 3, 2022
1 parent e61fbe2 commit 04db804
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 2 deletions.
6 changes: 5 additions & 1 deletion data_validation_framework/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,10 +295,14 @@ def kwargs(self):
def read_dataset(self):
"""Import the dataset to a :class:`pandas.DataFrame`.
+ Note that the index column is loaded as a string.
This method can be overridden to load custom data (e.g. GeoDataFrame, etc.).
The dataset should always be loaded from the path given by `self.dataset_df`.
"""
- return pd.read_csv(self.dataset_df, index_col=self.input_index_col)
+ return pd.read_csv(
+     self.dataset_df, index_col=self.input_index_col, dtype={self.input_index_col: str}
+ )

def pre_process(self, df, args, kwargs):
"""Method executed before applying the external function."""
Expand Down
4 changes: 3 additions & 1 deletion tests/test_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -1274,7 +1274,9 @@ def check_exception(failed_task, exception): # pylint: disable=unused-variable
assert not luigi.build([failing_task], local_scheduler=True)

assert failed_tasks == [str(failing_task)]
- assert exceptions == [str(IndexError("The following index values are duplicated: [0, 1]"))]
+ assert exceptions == [
+     str(IndexError("The following index values are duplicated: ['0', '1']"))
+ ]

def test_change_index(self, tmpdir, TestTask):
dataset_df_path = str(tmpdir / "dataset.csv")
Expand Down

0 comments on commit 04db804

Please sign in to comment.