Skip to content

Commit

Permalink
v0.5.4 -- Numpy/Pandas improvements, stream refactoring (#105)
Browse files Browse the repository at this point in the history
* Add C conversion for IPv4 native

* Tiny cleanup

* Add DateTime to c data_conv

* First work on specialized numpy columns

* numpy rework checkpoint

* numpy rework checkpoint

* date calculation checkpoint

* date calculation checkpoint

* Cleanup checkpoint

* query context cleanup

* numpy checkpoint

* Numpy read checkpoint

* context rework checkpoint

* format rework checkpoint

* format rework checkpoint

* progress toward pd/np reads

* progress toward pd/np reads

* stream cleanup checkpoint

* Add C optimization for UUID

* Minor cleanup, ready for tests

* Fix lint

* First work on optimizing None/NULL handling in Cython

* Tweak retries/connection resets

* Context cleanup checkpoint

* Fix lint

* Finalize for 0.5.4 release

* Fix lint

* Fix weird ORDER BY in test

* Tweak Cython UUID transform

* Add product_name, fix Numpy for Superset

* Downgrade Numpy Build for Python 3.7

* Clean up numpy build dependencies

* Fix numpy typo

* Don't build PyPy 3.7 binary wheels

* build debugging

* Tweak release/test actions

* Fix client_name refs

* Update test matrix, try to make cloud tests less flaky

* One more attempt at flaky cloud tests
  • Loading branch information
genzgd committed Feb 1, 2023
1 parent 71c2ea7 commit d6bfb61
Show file tree
Hide file tree
Showing 56 changed files with 1,788 additions and 767 deletions.
13 changes: 7 additions & 6 deletions .github/workflows/on_push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install setuptools wheel
pip install -r tests/test_requirements.txt
pip install -r tests/superset_requirements.txt
pip install -r tests/test_requirements.txt
pip install pylint==2.14
python setup.py build_ext --inplace
- name: Run Pylint
Expand All @@ -51,9 +51,9 @@ jobs:
clickhouse-version:
- '22.3'
- '22.8'
- '22.10'
- '22.11'
- '22.12'
- '23.1'
- latest

name: Local Tests Py=${{ matrix.python-version }} CH=${{ matrix.clickhouse-version }}
Expand All @@ -73,13 +73,13 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install Dependencies
run: |
python -m pip install --upgrade pip
pip install -r tests/test_requirements.txt
- name: Install pip
run: python -m pip install --upgrade pip
- name: Install Superset dependencies
if: contains(fromJson('["3.8", "3.9"]'), matrix.python-version)
run: pip install -r tests/superset_requirements.txt
- name: Install Test Dependencies
run: pip install -r tests/test_requirements.txt
- name: Build cython extensions
run: python setup.py build_ext --inplace
- name: Run tests
Expand Down Expand Up @@ -117,6 +117,7 @@ jobs:
CLICKHOUSE_CONNECT_TEST_FUZZ: 10
CLICKHOUSE_CONNECT_TEST_DOCKER: 'False'
CLICKHOUSE_CONNECT_TEST_PORT: 8443
CLICKHOUSE_CONNECT_TEST_INSERT_QUORUM: 3
CLICKHOUSE_CONNECT_TEST_HOST: ${{ secrets.INTEGRATIONS_TEAM_TESTS_CLOUD_HOST }}
CLICKHOUSE_CONNECT_TEST_PASSWORD: ${{ secrets.INTEGRATIONS_TEAM_TESTS_CLOUD_PASSWORD }}
run: pytest tests/integration_tests
70 changes: 62 additions & 8 deletions .github/workflows/on_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,26 @@ on:

env:
CIBW_SKIP: 'cp36-*'
CIBW_BEFORE_BUILD: pip install "cython<3"

jobs:
build_x86_linux_wheels:
name: Build x86 wheels on Linux
build_x86_manylinux_wheels:
name: Build x86 manylinux wheels on Linux
runs-on: ubuntu-latest
env:
CIBW_SKIP: 'cp36-* *-musllinux*'
steps:
- uses: actions/checkout@v3
- name: Build wheels
uses: pypa/cibuildwheel@v2.11.2
- uses: actions/upload-artifact@v3
with:
path: ./wheelhouse/*.whl

build_x86_musllinux_wheels:
name: Build x86 musllinux wheels on Linux
runs-on: ubuntu-latest
env:
CIBW_SKIP: 'cp36-* *-manylinux*'
steps:
- uses: actions/checkout@v3
- name: Build wheels
Expand All @@ -23,11 +37,48 @@ jobs:
with:
path: ./wheelhouse/*.whl

build_nonx86_wheels:
name: Build non x86 wheels
build_aarch64_manylinux_wheels:
name: Build aarch64 manylinux wheels
runs-on: ubuntu-latest
env:
CIBW_ARCHS_LINUX: aarch64
CIBW_SKIP: 'cp36-* pp* *-musllinux*'
steps:
- uses: actions/checkout@v3
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
with:
platforms: all
- name: Build wheels
uses: pypa/cibuildwheel@v2.11.2
- uses: actions/upload-artifact@v3
with:
path: ./wheelhouse/*.whl

build_aarch64_musllinux_wheels:
name: Build aarch64 musllinux wheels
runs-on: ubuntu-latest
env:
CIBW_ARCHS_LINUX: aarch64
CIBW_SKIP: 'cp36-* pp* *-manylinux*'
steps:
- uses: actions/checkout@v3
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
with:
platforms: all
- name: Build wheels
uses: pypa/cibuildwheel@v2.11.2
- uses: actions/upload-artifact@v3
with:
path: ./wheelhouse/*.whl

build_aarch64_pypy_wheels:
name: Build aarch64 PyPy wheels
runs-on: ubuntu-latest
env:
CIBW_ARCHS_LINUX: aarch64
CIBW_BUILD: 'pp*'
steps:
- uses: actions/checkout@v3
- name: Set up QEMU
Expand All @@ -54,7 +105,7 @@ jobs:
path: ./wheelhouse/*.whl

build_windows_wheels:
name: Build x86 wheels on Windows
name: Build wheels on Windows
runs-on: windows-latest
steps:
- uses: actions/checkout@v3
Expand All @@ -68,8 +119,11 @@ jobs:
needs:
- build_macos_wheels
- build_windows_wheels
- build_x86_linux_wheels
- build_nonx86_wheels
- build_x86_musllinux_wheels
- build_x86_manylinux_wheels
- build_aarch64_musllinux_wheels
- build_aarch64_manylinux_wheels
- build_aarch64_pypy_wheels
name: Publish to PyPI
runs-on: ubuntu-latest
steps:
Expand Down
50 changes: 49 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,54 @@
# ClickHouse Connect ChangeLog

## 0.5.4, 2023-01-31

### Deprecation Warning -- Context interface and stream* methods to be removed from QueryResult
In 0.5.x releases, streaming was implemented by returning generators from the QueryResult methods
`stream_column_blocks`, `stream_row_blocks`, and `stream_rows`. Safe usage of these methods required executing them
within a Python context created `with` the QueryResult itself. Otherwise, the HTTP response could be left open in the
case of an exception or other failure to consume the stream.

Using QueryResult as a context is unintuitive, and that usage pattern is deprecated and will be completely disabled in
a future release. Instead, streaming query results should be obtained using the new Client `*stream` methods described
under New Features, below.

### New Features
* Several streaming query methods have been added to the core ClickHouse Connect client. Each of these methods returns a StreamContext object, which must be used as a Python `with` context to stream data (this ensures the underlying
streaming response is properly closed/consumed). For simple examples, see the basic [tests](https://github.com/ClickHouse/clickhouse-connect/blob/main/tests/integration_tests/test_streaming.py).
* `query_column_block_stream` -- returns a generator of blocks in column oriented (Native) format. Fastest method for retrieving data in native Python format
* `query_row_block_stream` -- returns a generator of blocks in row oriented format. Used for processing data in a "batch" of rows at a time while limiting memory usage
* `query_rows_stream` -- returns a convenience generator to process rows one at a time (data is still loaded in ClickHouse blocks to preserve memory)
* `query_np_stream` -- returns a generator where each ClickHouse data block is transformed into a Numpy array
* `query_df_stream` -- returns a generator where each ClickHouse data block is transformed into a Pandas Dataframe
* The `client_name` is now reported in a standardized way to ClickHouse (as the `http_user_agent`). For better tracking of your
Python application, use the new `product_name` common setting or set the `client_name` parameter of `get_client` to identify your product
as "<your-product-name>/<app-version>".

### Performance Improvements
* C/Cython optimizations for transforming ClickHouse data to Python types have been improved, and additional datatypes have been
optimized in Cython. The performance increase over the previous 0.5.x version is approximately 10% for "normal" read queries.
* Transformation of Numpy arrays and Pandas Dataframes has been completely rewritten to avoid an intermediate conversion to
Python types. As a result, querying in Numpy format, and especially Pandas format, has been **significantly** improved -- from 2x
for small datasets to 5x or more for very large Pandas DataFrames (even without streaming). Queries including Numpy datetime64 or
Pandas Timestamp objects have particularly benefited from the new implementation.

### Bug Fixes
* The default `maxsize` for concurrent HTTP connections to a single host was accidentally dropped in the 0.5.x release. It
has been restored to 8 for better performance when using multiple client objects.
* A single low level retry has been restored for HTTP connections on ConnectionReset or RemoteDisconnected exceptions. This
should reduce connection errors related to ClickHouse closing expired KeepAlive connections.

### Internal Changes
* As noted above, streaming, contexts and exception handling have been tightened up to avoid leaving HTTP responses open
when querying streams.
* Previous versions used `threading.local()` variables to store context information during query processing. The architecture
has been changed to pass the relevant Query or Insert Context to transformation methods instead of relying on thread local
variables. This is significantly safer in an environment where multiple queries can conceivably be open at the same time on the
same thread (for example, if using async functions).
* Per query formatting logic has moved from `ClickHouseType` to the `QueryContext`.
* `ClickHouseType` methods have been renamed to remove outdated references to `native` format (everything is native now)
* Upgraded Cython Build to 3.0.11alpha release

## 0.5.3, 2023-01-23

### Bug Fix
Expand Down Expand Up @@ -191,7 +240,6 @@ clickhouse-sqlalchemy to clickhouse-connect. Thanks to [Eugene Torap](https://g
* Update QueryContext.updated_copy method to preserve settings, parameters, etc. https://github.com/ClickHouse/clickhouse-connect/issues/65



## 0.3.5, 2022-10-28

### New Features
Expand Down
2 changes: 1 addition & 1 deletion clickhouse_connect/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.5.3
0.5.4
3 changes: 3 additions & 0 deletions clickhouse_connect/cc_superset/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from datetime import datetime
from typing import Dict, List, Optional, Type

from flask import current_app
from flask_babel import gettext as __
from marshmallow import Schema, fields
from marshmallow.validate import Range
Expand All @@ -16,6 +17,7 @@
from superset.models.core import Database

from clickhouse_connect import driver_name
from clickhouse_connect.common import set_setting
from clickhouse_connect.driver import default_port
from clickhouse_connect.cc_sqlalchemy.datatypes.base import sqla_type_from_name
from clickhouse_connect.cc_superset.datatypes import configure_types
Expand All @@ -24,6 +26,7 @@
logger = logging.getLogger(__name__)

configure_types()
set_setting('product_name', f"superset/{current_app.config.get('VERSION_STRING', 'dev')}")


class ClickHouseParametersSchema(Schema):
Expand Down
12 changes: 11 additions & 1 deletion clickhouse_connect/common.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import sys
from dataclasses import dataclass
from typing import Any, Sequence, Optional, Dict

Expand All @@ -24,6 +25,14 @@ class CommonSetting:
_common_settings: Dict[str, CommonSetting] = {}


def build_client_name(client_name: str) -> str:
    """Build the client identification string for this driver.

    The result has the form
    ``[<client_name> ][<product_name> ]clickhouse-connect/<version> (lv:py/<python version>; os:<platform>)``
    where ``product_name`` is read from the ``product_name`` common setting.

    :param client_name: optional caller-supplied prefix; an empty, ``None`` or
        whitespace-only value is omitted from the result
    :return: the assembled client name string
    """
    # Strip each optional prefix *before* testing it for emptiness, so that a
    # whitespace-only name cannot leave a stray leading or doubled space.
    candidates = (client_name, get_setting('product_name'))
    stripped = (part.strip() for part in candidates if part)
    prefix = ''.join(f'{part} ' for part in stripped if part)
    # sys.version looks like '3.9.16 (main, ...)'; keep only the version number.
    py_version = sys.version.split(' ', maxsplit=1)[0]
    return f'{prefix}clickhouse-connect/{version()} (lv:py/{py_version}; os:{sys.platform})'


def get_setting(name: str):
setting = _common_settings.get(name)
if setting is None:
Expand All @@ -35,7 +44,7 @@ def set_setting(name: str, value: Any):
setting = _common_settings.get(name)
if setting is None:
raise ProgrammingError(f'Unrecognized common setting {name}')
if value not in setting.options:
if setting.options and value not in setting.options:
raise ProgrammingError(f'Unrecognized option {value} for setting {name})')
if value == setting.default:
setting.value = None
Expand All @@ -50,3 +59,4 @@ def _init_common(name: str, options: Sequence[Any], default: Any):
_init_common('autogenerate_session_id', (True, False), True)
_init_common('dict_parameter_format', ('json', 'map'), 'json')
_init_common('invalid_setting_action', ('send', 'drop', 'error'), 'error')
_init_common('product_name', (), '')

0 comments on commit d6bfb61

Please sign in to comment.