Fix FixedString pandas type (#357)

ClickHouse · Jun 4, 2024 · 402e21d · 402e21d
1 parent 8515e1d
commit 402e21d
Show file tree

Hide file tree

Showing 4 changed files with 18 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,12 @@ release (0.8.0), unrecognized arguments/keywords for these methods of creating a
 instead of being passed as ClickHouse server settings. This is in conjunction with some refactoring in Client construction.
 The supported method of passing ClickHouse server settings is to prefix such arguments/query parameters with `ch_`.  
 
+## 0.7.12, 2024-06-04
+### Bug Fix
+- When using `query_df` on a FixedString column with a read format of 'string' (and the default `query_df` setting
+`use_extended_dtypes=True`), the resulting DataFrame column is now correctly set to the pandas extended `string` dtype.
+Fixes https://github.com/ClickHouse/clickhouse-connect/issues/356
+
 ## 0.7.11, 2024-05-26
 ### Improvement
 - Python or Pandas float value to ClickHouse Decimal now correctly rounds Float values for more accurate conversions.  Thanks

diff --git a/clickhouse_connect/__version__.py b/clickhouse_connect/__version__.py
@@ -1 +1 @@
-version = '0.7.11'
+version = '0.7.12'
diff --git a/clickhouse_connect/datatypes/string.py b/clickhouse_connect/datatypes/string.py
@@ -80,6 +80,11 @@ def _read_column_binary(self, source: ByteSource, num_rows: int, ctx: QueryConte
             return source.read_fixed_str_col(self.byte_size, num_rows, ctx.encoding or self.encoding )
         return source.read_bytes_col(self.byte_size, num_rows)
 
+    def _finalize_column(self, column: Sequence, ctx: QueryContext) -> Sequence:
+        if ctx.use_extended_dtypes and self.read_format(ctx) == 'string':
+            return pd.array(column, dtype=pd.StringDtype())
+        return column
+
     # pylint: disable=too-many-branches,duplicate-code
     def _write_column_binary(self, column: Union[Sequence, MutableSequence], dest: bytearray, ctx: InsertContext):
         ext = dest.extend

diff --git a/tests/integration_tests/test_pandas.py b/tests/integration_tests/test_pandas.py
@@ -247,15 +247,18 @@ def test_pandas_date32(test_client: Client, table_context:Callable):
 
 
 def test_pandas_row_df(test_client: Client, table_context:Callable):
-    with table_context('test_pandas_row_df', ['key UInt64', 'dt DateTime64(6)']):
+    with table_context('test_pandas_row_df', ['key UInt64', 'dt DateTime64(6)', 'fs FixedString(5)']):
         df = pd.DataFrame({'key': [1, 2],
-                          'dt': [pd.Timestamp(2023, 5, 4, 10, 20), pd.Timestamp(2023, 10, 15, 14, 50, 2, 4038)]})
+                          'dt': [pd.Timestamp(2023, 5, 4, 10, 20), pd.Timestamp(2023, 10, 15, 14, 50, 2, 4038)],
+                           'fs': ['seven', 'bit']})
         df = df.iloc[1:]
         source_df = df.copy()
         test_client.insert_df('test_pandas_row_df', df)
-        result_df = test_client.query_df('SELECT * FROM test_pandas_row_df')
+        result_df = test_client.query_df('SELECT * FROM test_pandas_row_df', column_formats={'fs': 'string'})
+        assert str(result_df.dtypes[2]) == 'string'
         assert result_df.iloc[0]['key'] == 2
         assert result_df.iloc[0]['dt'] == pd.Timestamp(2023, 10, 15, 14, 50, 2, 4038)
+        assert result_df.iloc[0]['fs'] == 'bit\0\0'
         assert len(result_df) == 1
         assert source_df.equals(df)