Skip to content

Commit

Permalink
Merge pull request #33 from Breaka84/add_null_cleanser
Browse files Browse the repository at this point in the history
fix cleanser logs type mismatch bug and bump new version 3.3.6
  • Loading branch information
emavgl authored Nov 19, 2021
2 parents 1eb91c8 + 0f85540 commit d5294cf
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 2 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
Changelog
=========

3.3.6 (2021-11-19)
-------------------
* [FIX] Fix Cleaner logs when the cleansed field's type is not string

3.3.5 (2021-11-16)
-------------------
* [ADD] Add Null Cleaner spooq.transformer.NullCleaner
Expand Down
2 changes: 1 addition & 1 deletion spooq/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.3.5"
__version__ = "3.3.6"
2 changes: 1 addition & 1 deletion spooq/transformer/base_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def _only_keep_cleansed_values(col_name, temporary_col_name):
return (F.when(F.col(temporary_col_name) == F.col(col_name), F.lit(None))
.otherwise(F.when(F.col(temporary_col_name).isNull() & F.col(col_name).isNull(), F.lit(None))
.otherwise(F.when(F.col(temporary_col_name).isNull() & F.col(col_name).isNotNull(), F.lit("null"))
.otherwise(F.col(temporary_col_name))))
.otherwise(F.col(temporary_col_name).cast("string"))))
)

for column_name, temporary_column_name in zip(column_names, temporary_column_names):
Expand Down
22 changes: 22 additions & 0 deletions tests/unit/transformer/test_null_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,26 @@ def test_single_cleansing_rule_log_as_map(self, input_df, spark_session):
output_df = NullCleaner(
cleansing_definitions, column_to_log_cleansed_values="cleansed_values_null", store_as_map=True
).transform(input_df)
assert_df_equality(expected_output_df, output_df)

def test_single_cleansing_boolean_log_as_map(self, input_df, spark_session):
    """Check that NullCleaner logs a cleansed boolean column into a string map.

    A single boolean column ``b`` holding ``True``, ``None`` and ``False`` is
    cleansed with a default of ``False``. The expected result keeps the
    column boolean, replaces the null with ``False`` and records
    ``{"b": "null"}`` in the ``cleansed_values_null`` map column for that row
    only; rows that were not cleansed carry a null log entry.

    The ``input_df`` fixture is requested by the signature but not used here;
    the test builds its own boolean input frame.
    """
    # Input: one boolean column, with the middle row being null.
    source_rows = [Row(b=True), Row(b=None), Row(b=False)]
    boolean_df = spark_session.createDataFrame(source_rows)

    # Expected: column stays boolean, log column is map<string, string>.
    log_map_type = T.MapType(T.StringType(), T.StringType(), valueContainsNull=True)
    expected_schema = T.StructType(
        [
            T.StructField("b", T.BooleanType(), nullable=True),
            T.StructField("cleansed_values_null", log_map_type, nullable=True),
        ]
    )
    expected_rows = [
        (True, None),
        (False, {"b": "null"}),
        (False, None),
    ]
    expected_df = spark_session.createDataFrame(expected_rows, schema=expected_schema)

    # Replace nulls in "b" with False and log the cleansed values as a map.
    cleaner = NullCleaner(
        {"b": {"default": False}},
        column_to_log_cleansed_values="cleansed_values_null",
        store_as_map=True,
    )
    actual_df = cleaner.transform(boolean_df)

    assert_df_equality(expected_df, actual_df)

0 comments on commit d5294cf

Please sign in to comment.