From dee238e4aa3cf46eec0af4069ef9d39a1858da6d Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Mon, 3 Nov 2025 16:32:46 +0100 Subject: [PATCH 1/8] The base commit --- docs/CONFIGURATION.md | 7 ++----- docs/ERROR_CODES.md | 2 +- docs/README.md | 2 +- external/duckdb | 2 +- gaggle/Cargo.toml | 2 +- gaggle/bindings/gaggle_extension.cpp | 2 +- 6 files changed, 7 insertions(+), 10 deletions(-) diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index ec3b08d..78c717a 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -21,7 +21,6 @@ Gaggle supports configuration via environment variables to customize its behavio - **Description**: Maximum cache size in megabytes for downloaded datasets - **Type**: Integer (megabytes) or "unlimited" - **Default**: `102400` (100GB) -- **Status**: ✅ Implemented - **Behavior**: Uses soft limit by default - downloads complete even if they exceed the limit, then oldest datasets are automatically evicted using LRU (Least Recently Used) policy - **Example**: @@ -41,7 +40,6 @@ Gaggle supports configuration via environment variables to customize its behavio - **Description**: Enable hard limit mode (prevents downloads when cache limit would be exceeded) - **Type**: Boolean (true/yes/1 or false/no/0) - **Default**: `false` (soft limit) -- **Status**: ✅ Implemented - **Example**: ```bash ## Enable hard limit (prevents downloads when cache is full) @@ -139,9 +137,8 @@ These settings control the wait behavior when a download is already in progress. 
###### GAGGLE_LOG_LEVEL - **Description**: Set logging level for structured logs emitted by the Rust core (via tracing) -- **Type**: String (`ERROR`, `WARN`, `INFO`, `DEBUG`, `TRACE`); case-insensitive +- **Type**: String (`ERROR`, `WARN`, `INFO`, `DEBUG`, and `TRACE`); case-insensitive - **Default**: `WARN` -- **Status**: ✅ Implemented - **Example**: ```bash export GAGGLE_LOG_LEVEL=INFO @@ -172,7 +169,7 @@ These settings control the wait behavior when a download is already in progress. ##### On-Demand Download Behavior - **GAGGLE_STRICT_ONDEMAND** - - **Description**: When enabled, `gaggle_get_file_path` will NOT fall back to a full dataset download if the single-file request fails. + - **Description**: When enabled, `gaggle_file_path` will NOT fall back to a full dataset download if the single-file request fails. - **Type**: Boolean (`1`, `true`, `yes`, `on` to enable) - **Default**: `false` diff --git a/docs/ERROR_CODES.md b/docs/ERROR_CODES.md index e991ae4..bde9eb0 100644 --- a/docs/ERROR_CODES.md +++ b/docs/ERROR_CODES.md @@ -2,7 +2,7 @@ Gaggle uses standardized error codes to make error handling more predictable and debugging easier. Each error includes a numeric code (E001 to E010) that can be used programmatically. -When troubleshooting, look for the bracketed code (like [E003]) and refer to the corresponding section below. +When troubleshooting, look for the bracketed code (like \[E003\]) and refer to the corresponding section below. 
#### Error Code Format diff --git a/docs/README.md b/docs/README.md index a81de05..06e9265 100644 --- a/docs/README.md +++ b/docs/README.md @@ -53,7 +53,7 @@ Replacement scan (transparent table read): -- Load the Gaggle extension load 'build/release/extension/gaggle/gaggle.duckdb_extension'; --- Set Kaggle credentials (or read fron environment variables or from `~/.kaggle/kaggle.json` file) +-- Set Kaggle credentials (or read from environment variables or from `~/.kaggle/kaggle.json` file) select gaggle_set_credentials('your-username', 'your-api-key'); -- Check version diff --git a/external/duckdb b/external/duckdb index 4c2573a..95a4db3 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 4c2573afaec92b2a7b530e22d9d5e2d98cbfc9d4 +Subproject commit 95a4db3b6e99c61dfe7604c0eca1543adf420803 diff --git a/gaggle/Cargo.toml b/gaggle/Cargo.toml index 9bc23d7..5d725fd 100644 --- a/gaggle/Cargo.toml +++ b/gaggle/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gaggle" -version = "0.1.0-alpha.2" +version = "0.1.0-alpha.3" edition = "2021" publish = false diff --git a/gaggle/bindings/gaggle_extension.cpp b/gaggle/bindings/gaggle_extension.cpp index e71280f..b75a66f 100644 --- a/gaggle/bindings/gaggle_extension.cpp +++ b/gaggle/bindings/gaggle_extension.cpp @@ -740,7 +740,7 @@ static void LoadInternal(ExtensionLoader &loader) { // Provide out-of-line definitions for the extension class void GaggleExtension::Load(ExtensionLoader &loader) { LoadInternal(loader); } std::string GaggleExtension::Name() { return "gaggle"; } -std::string GaggleExtension::Version() const { return std::string("0.1.0-alpha.2"); } +std::string GaggleExtension::Version() const { return std::string("0.1.0-alpha.3"); } } // namespace duckdb From ca422d56f4740a190b06f1f7cdf6a49307dab24c Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Mon, 3 Nov 2025 19:13:10 +0100 Subject: [PATCH 2/8] Add `xz` as a dependency via `vcpkg` --- CMakeLists.txt | 23 +++++++++++++++++++++++ vcpkg.json | 
3 +++ 2 files changed, 26 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ab186a..cb00e49 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -241,3 +241,26 @@ if(_GAGGLE_FS_LIB) target_link_libraries(${LOADABLE_EXTENSION_NAME} ${_GAGGLE_FS_LIB}) endif() endif() + +# Attempt to find liblzma (vcpkg port 'xz') and link it if available. +# We do NOT mark it REQUIRED so standalone configure (without vcpkg/toolchain) still succeeds. +find_package(xz CONFIG) +if(NOT xz_FOUND) + message(STATUS "[gaggle] xz (liblzma) not found by find_package; skipping link. If using vcpkg, configure CMake with -DCMAKE_TOOLCHAIN_FILE=/path/to/vcpkg/scripts/buildsystems/vcpkg.cmake") +endif() + +if(TARGET xz::xz) + if (TARGET ${EXTENSION_NAME}) + target_link_libraries(${EXTENSION_NAME} PRIVATE xz::xz) + endif() + if (TARGET ${LOADABLE_EXTENSION_NAME}) + target_link_libraries(${LOADABLE_EXTENSION_NAME} PRIVATE xz::xz) + endif() + if (TARGET gaggle) + target_link_libraries(gaggle PRIVATE xz::xz) + endif() + if (TARGET gaggle-static) + target_link_libraries(gaggle-static PRIVATE xz::xz) + endif() +endif() + diff --git a/vcpkg.json b/vcpkg.json index d4195e9..fa92296 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -10,6 +10,9 @@ { "name": "vcpkg-cmake-config", "host": true + }, + { + "name": "xz" } ] } From af5fdc378c2cefb45c7893058256fc4dd79074ea Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Mon, 3 Nov 2025 19:27:12 +0100 Subject: [PATCH 3/8] Add a simple demo made by Asciinema --- CMakeLists.txt | 23 ----------------------- README.md | 2 +- test/README.md | 2 +- vcpkg.json | 6 ++---- 4 files changed, 4 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cb00e49..5ab186a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -241,26 +241,3 @@ if(_GAGGLE_FS_LIB) target_link_libraries(${LOADABLE_EXTENSION_NAME} ${_GAGGLE_FS_LIB}) endif() endif() - -# Attempt to find liblzma (vcpkg port 'xz') and link it if available. 
-# We do NOT mark it REQUIRED so standalone configure (without vcpkg/toolchain) still succeeds. -find_package(xz CONFIG) -if(NOT xz_FOUND) - message(STATUS "[gaggle] xz (liblzma) not found by find_package; skipping link. If using vcpkg, configure CMake with -DCMAKE_TOOLCHAIN_FILE=/path/to/vcpkg/scripts/buildsystems/vcpkg.cmake") -endif() - -if(TARGET xz::xz) - if (TARGET ${EXTENSION_NAME}) - target_link_libraries(${EXTENSION_NAME} PRIVATE xz::xz) - endif() - if (TARGET ${LOADABLE_EXTENSION_NAME}) - target_link_libraries(${LOADABLE_EXTENSION_NAME} PRIVATE xz::xz) - endif() - if (TARGET gaggle) - target_link_libraries(gaggle PRIVATE xz::xz) - endif() - if (TARGET gaggle-static) - target_link_libraries(gaggle-static PRIVATE xz::xz) - endif() -endif() - diff --git a/README.md b/README.md index 54bdc54..b846567 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,7 @@ select gaggle_cache_info(); select gaggle_is_current('habedi/flickr-8k-dataset-clean'); ``` -[![Simple Demo 1](https://asciinema.org/a/745806.svg)](https://asciinema.org/a/745806) +[![Simple Demo 1](https://asciinema.org/a/do6g8xv1G5tkRc4e3bExbNYwZ.svg)](https://asciinema.org/a/do6g8xv1G5tkRc4e3bExbNYwZ) --- diff --git a/test/README.md b/test/README.md index 4c3353a..1b1dbe4 100644 --- a/test/README.md +++ b/test/README.md @@ -6,7 +6,7 @@ These tests are different than other tests like Rust tests for the [gaggle](../g ### Prerequisites - Rust (nightly version). -- GNU Make, CMake, and a C++ compiler. +- GNU Make, CMake, and a modern C++ compiler. - Python 3.10+ (optional; only needed for test written in Python). 
### Building Gaggle diff --git a/vcpkg.json b/vcpkg.json index fa92296..18d1670 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -1,7 +1,7 @@ { "name": "gaggle", "version": "0.1.0", - "description": "Gaggle DuckDB Extension (C++ dependencies)", + "description": "C++ dependencies for Gaggle DuckDB extension", "dependencies": [ { "name": "vcpkg-cmake", @@ -11,8 +11,6 @@ "name": "vcpkg-cmake-config", "host": true }, - { - "name": "xz" - } + "liblzma" ] } From 82f13371a0a2e9016f536404c4338ff7249c469a Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Mon, 3 Nov 2025 20:47:59 +0100 Subject: [PATCH 4/8] Build and link against `xz` from source --- .gitmodules | 4 ++++ CMakeLists.txt | 33 +++++++++++++++++++++++++++++++++ extension_config.cmake | 18 +++++++++++++++--- external/xz | 1 + vcpkg.json | 3 +-- 5 files changed, 54 insertions(+), 5 deletions(-) create mode 160000 external/xz diff --git a/.gitmodules b/.gitmodules index 6eb0de2..883ef82 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,3 +6,7 @@ path = external/extension-ci-tools url = https://github.com/duckdb/extension-ci-tools branch = main +[submodule "external/xz"] + path = external/xz + url = https://github.com/tukaani-project/xz.git + branch = master diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ab186a..242150e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,34 @@ cmake_minimum_required(VERSION 3.5) +# ============================================================================== +# Build xz submodule (liblzma) FIRST - before anything else +# ============================================================================== +# This must be done early so the liblzma target is available when +# extension_config.cmake is included by DuckDB's build system +set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries" FORCE) +set(XZ_BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared liblzma" FORCE) +set(XZ_BUILD_STATIC_LIBS ON CACHE BOOL "Build static liblzma" FORCE) +# Disable building unnecessary tools 
+set(XZ_BUILD_XZ OFF CACHE BOOL "Build xz tool" FORCE) +set(XZ_BUILD_XZDEC OFF CACHE BOOL "Build xzdec tool" FORCE) +set(XZ_BUILD_LZMADEC OFF CACHE BOOL "Build lzmadec tool" FORCE) +set(XZ_BUILD_LZMAINFO OFF CACHE BOOL "Build lzmainfo tool" FORCE) +set(XZ_BUILD_SCRIPTS OFF CACHE BOOL "Build scripts" FORCE) +# Disable NLS (Native Language Support) to reduce dependencies +set(XZ_NLS OFF CACHE BOOL "Enable Native Language Support" FORCE) + +# Add the xz submodule to the build +if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/external/xz/CMakeLists.txt) + add_subdirectory(external/xz EXCLUDE_FROM_ALL) + # Create an alias target for easier linking + if(TARGET liblzma) + add_library(xz::lzma ALIAS liblzma) + message(STATUS "[gaggle] XZ Utils (liblzma) added from submodule") + endif() +else() + message(WARNING "[gaggle] XZ submodule not found at external/xz - using system lzma") +endif() + set(CORROSION_VERBOSE_OUTPUT ON) # We need C++17 for std::filesystem on all platforms @@ -143,6 +172,10 @@ message(STATUS "OS_ARCH: ${OS_ARCH} (orig='${_GAGGLE_ORIG_OS_ARCH}')") message(STATUS "DUCKDB_PLATFORM: ${DUCKDB_PLATFORM}") message(STATUS "Rust_CARGO_TARGET: ${Rust_CARGO_TARGET}") + +# ============================================================================== +# Corrosion (Rust integration) +# ============================================================================== include(FetchContent) FetchContent_Declare( Corrosion diff --git a/extension_config.cmake b/extension_config.cmake index 2e67f26..131be9b 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -69,7 +69,9 @@ if (EXISTS ${GAGGLE_RUST_LIB}) # Create an imported target for the Rust library add_library(gaggle_rust STATIC IMPORTED GLOBAL) if(UNIX) - set(_GAGGLE_RUST_LINK_LIBS "pthread;dl;m;lzma") + # We always use pthread, dl, and m on Unix + # liblzma will be linked separately via link_libraries() below + set(_GAGGLE_RUST_LINK_LIBS "pthread;dl;m") else() set(_GAGGLE_RUST_LINK_LIBS "") endif() @@ -80,7 +82,17 @@ 
if (EXISTS ${GAGGLE_RUST_LIB}) # Add the Rust library to global link libraries so it gets linked to everything if(UNIX) - link_libraries(${GAGGLE_RUST_LIB} pthread dl m lzma) + link_libraries(${GAGGLE_RUST_LIB} pthread dl m) + # Link against liblzma - the xz submodule target will be used if available, + # otherwise falls back to system lzma. The target is typically added by the + # main CMakeLists.txt before extension targets are built. + if(TARGET liblzma) + link_libraries(liblzma) + message(STATUS "[gaggle] Linking against xz submodule's liblzma target") + else() + link_libraries(lzma) + message(STATUS "[gaggle] Linking against system lzma (xz submodule target not yet available)") + endif() else() link_libraries(${GAGGLE_RUST_LIB}) if(WIN32) @@ -107,7 +119,7 @@ if (EXISTS ${GAGGLE_RUST_LIB}) add_link_options($<$,EXECUTABLE>:-lpthread>) add_link_options($<$,EXECUTABLE>:-ldl>) add_link_options($<$,EXECUTABLE>:-lm>) - add_link_options($<$,EXECUTABLE>:-llzma>) + # Note: liblzma linking is handled via link_libraries(liblzma) above else() add_link_options($<$,EXECUTABLE>:${GAGGLE_RUST_LIB}>) endif() diff --git a/external/xz b/external/xz new file mode 160000 index 0000000..f57b171 --- /dev/null +++ b/external/xz @@ -0,0 +1 @@ +Subproject commit f57b1716cd0853474980c90a892204dee9bdea1a diff --git a/vcpkg.json b/vcpkg.json index 18d1670..dec8506 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -10,7 +10,6 @@ { "name": "vcpkg-cmake-config", "host": true - }, - "liblzma" + } ] } From 636dedbbbce71cc327cef06a90f35d31d044e3f1 Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Mon, 3 Nov 2025 22:11:19 +0100 Subject: [PATCH 5/8] Remove dependency on `liblzma` --- .gitmodules | 4 - CMakeLists.txt | 28 ------ README.md | 4 +- ROADMAP.md | 20 ++-- docs/CONFIGURATION.md | 140 +++++++++++++-------------- docs/ERROR_CODES.md | 44 +-------- docs/README.md | 76 +++++++-------- docs/examples/e3_versioning.sql | 2 +- extension_config.cmake | 11 --- external/duckdb | 2 +- external/xz | 1 - 
gaggle/Cargo.toml | 2 +- gaggle/bindings/gaggle_extension.cpp | 4 +- gaggle/src/config.rs | 12 +-- gaggle/src/kaggle/api.rs | 2 +- gaggle/src/kaggle/download.rs | 24 ++--- gaggle/tests/mock_http.rs | 2 +- test/sql/test_gaggle_types.test | 6 +- 18 files changed, 147 insertions(+), 237 deletions(-) delete mode 160000 external/xz diff --git a/.gitmodules b/.gitmodules index 883ef82..6eb0de2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,7 +6,3 @@ path = external/extension-ci-tools url = https://github.com/duckdb/extension-ci-tools branch = main -[submodule "external/xz"] - path = external/xz - url = https://github.com/tukaani-project/xz.git - branch = master diff --git a/CMakeLists.txt b/CMakeLists.txt index 242150e..cb5ed8e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,33 +1,5 @@ cmake_minimum_required(VERSION 3.5) -# ============================================================================== -# Build xz submodule (liblzma) FIRST - before anything else -# ============================================================================== -# This must be done early so the liblzma target is available when -# extension_config.cmake is included by DuckDB's build system -set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries" FORCE) -set(XZ_BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared liblzma" FORCE) -set(XZ_BUILD_STATIC_LIBS ON CACHE BOOL "Build static liblzma" FORCE) -# Disable building unnecessary tools -set(XZ_BUILD_XZ OFF CACHE BOOL "Build xz tool" FORCE) -set(XZ_BUILD_XZDEC OFF CACHE BOOL "Build xzdec tool" FORCE) -set(XZ_BUILD_LZMADEC OFF CACHE BOOL "Build lzmadec tool" FORCE) -set(XZ_BUILD_LZMAINFO OFF CACHE BOOL "Build lzmainfo tool" FORCE) -set(XZ_BUILD_SCRIPTS OFF CACHE BOOL "Build scripts" FORCE) -# Disable NLS (Native Language Support) to reduce dependencies -set(XZ_NLS OFF CACHE BOOL "Enable Native Language Support" FORCE) - -# Add the xz submodule to the build -if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/external/xz/CMakeLists.txt) - 
add_subdirectory(external/xz EXCLUDE_FROM_ALL) - # Create an alias target for easier linking - if(TARGET liblzma) - add_library(xz::lzma ALIAS liblzma) - message(STATUS "[gaggle] XZ Utils (liblzma) added from submodule") - endif() -else() - message(WARNING "[gaggle] XZ submodule not found at external/xz - using system lzma") -endif() set(CORROSION_VERBOSE_OUTPUT ON) diff --git a/README.md b/README.md index b846567..8710706 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ updates, etc. This workflow can quickly become complex, especially when working with multiple datasets or when datasets are updated frequently. Gaggle tries to help simplify this process by hiding the complexity and letting you work with datasets directly inside -an analytical database like DuckDB that can handle fast queries. +DuckDB that allows you to run fast analytical queries on the data. In essence, Gaggle makes DuckDB into a SQL-enabled frontend for Kaggle datasets. @@ -39,7 +39,7 @@ In essence, Gaggle makes DuckDB into a SQL-enabled frontend for Kaggle datasets. - Provides a simple API to interact with Kaggle datasets from DuckDB - Allows you to search, download, and read datasets from Kaggle - Supports datasets that contain CSV, Parquet, JSON, and XLSX files -- Configurable and has built-in caching of downloaded datasets +- Configurable and has built-in caching support to avoid re-downloading - Thread-safe, fast, and has a low memory footprint - Supports dataset updates and versioning diff --git a/ROADMAP.md b/ROADMAP.md index aa81799..56311cb 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -72,21 +72,19 @@ It outlines features to be implemented and their current status. ### 6. Documentation and Distribution * **Documentation** - * [x] API reference in README.md. - * [x] Usage examples (see `docs/examples/`). - * [ ] Tutorial documentation. - * [ ] FAQ section. - * [ ] Troubleshooting guide. + * [x] API reference (see `docs/README.md`). 
+ * [x] Usage examples (see the files in `docs/examples/`). + * [x] Other documentation files like the list of errors (check out `docs/` directory). * **Testing** - * [x] Unit tests for core modules (Rust). - * [x] SQL integration tests (DuckDB shell). - * [x] End-to-end integration tests with mocked HTTP (basic coverage). + * [x] Unit tests for core (Rust) modules. + * [x] SQL integration tests (run in DuckDB shell). + * [x] End-to-end integration tests with mocked HTTP. * [ ] Performance benchmarks. * **Distribution** - * [ ] Pre-compiled extension binaries for Linux, macOS, and Windows. - * [ ] Submission to the DuckDB Community Extensions repository. + * [x] Built binaries for Linux, macOS, and Windows; AMD64 and ARM64. + * [x] Submission to the DuckDB community extensions repository. ### 7. Observability * **Logging** - * [x] Structured logging via `tracing` with `GAGGLE_LOG_LEVEL`. + * [x] Structured logging (configurable via `GAGGLE_LOG_LEVEL` environment variable). diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 78c717a..5feccba 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -1,12 +1,12 @@ -### Gaggle's Configuration Guide +### Gaggle's Configuration Guide Gaggle supports configuration via environment variables to customize its behavio
-#### Environment Variables +#### Environment Variables -##### Cache Configuration +##### Cache Configuration -###### GAGGLE_CACHE_DIR +###### GAGGLE_CACHE_DIR - **Description**: Directory path for caching downloaded Kaggle datasets - **Type**: String (path) @@ -16,7 +16,7 @@ Gaggle supports configuration via environment variables to customize its behavio export GAGGLE_CACHE_DIR="/var/cache/gaggle" ``` -###### GAGGLE_CACHE_SIZE_LIMIT_MB +###### GAGGLE_CACHE_SIZE_LIMIT - **Description**: Maximum cache size in megabytes for downloaded datasets - **Type**: Integer (megabytes) or "unlimited" @@ -25,30 +25,30 @@ Gaggle supports configuration via environment variables to customize its behavio automatically evicted using LRU (Least Recently Used) policy - **Example**: ```bash - ## Set to 50GB - export GAGGLE_CACHE_SIZE_LIMIT_MB=51200 + ## Set to 50GB + export GAGGLE_CACHE_SIZE_LIMIT=51200 - ## Set to 5GB - export GAGGLE_CACHE_SIZE_LIMIT_MB=5120 + ## Set to 5GB + export GAGGLE_CACHE_SIZE_LIMIT=5120 - ## Set unlimited cache - export GAGGLE_CACHE_SIZE_LIMIT_MB=unlimited + ## Set unlimited cache + export GAGGLE_CACHE_SIZE_LIMIT=unlimited ``` -###### GAGGLE_CACHE_HARD_LIMIT +###### GAGGLE_CACHE_HARD_LIMIT - **Description**: Enable hard limit mode (prevents downloads when cache limit would be exceeded) - **Type**: Boolean (true/yes/1 or false/no/0) - **Default**: `false` (soft limit) - **Example**: ```bash - ## Enable hard limit (prevents downloads when cache is full) + ## Enable hard limit (prevents downloads when cache is full) export GAGGLE_CACHE_HARD_LIMIT=true ``` -##### HTTP Configuration +##### HTTP Configuration -###### GAGGLE_HTTP_TIMEOUT +###### GAGGLE_HTTP_TIMEOUT - **Description**: HTTP request timeout in seconds for Kaggle API requests - **Type**: Integer (seconds) @@ -58,18 +58,18 @@ Gaggle supports configuration via environment variables to customize its behavio export GAGGLE_HTTP_TIMEOUT=120 ``` -###### GAGGLE_API_BASE +###### GAGGLE_API_BASE - **Description**: 
Override the Kaggle API base URL (primarily for testing/mocking) - **Type**: String (URL) - **Default**: `https://www.kaggle.com/api/v1` - **Example**: ```bash - ## Point requests to a local mock server + ## Point requests to a local mock server export GAGGLE_API_BASE=http://127.0.0.1:12345 ``` -###### HTTP Retry Controls +###### HTTP Retry Controls - **GAGGLE_HTTP_RETRY_ATTEMPTS** - **Description**: Number of retry attempts after the initial try @@ -86,19 +86,19 @@ Gaggle supports configuration via environment variables to customize its behavio These controls enable exponential backoff with cap across metadata/search/download requests. -###### GAGGLE_API_MIN_INTERVAL_MS +###### GAGGLE_API_MIN_INTERVAL_MS - **Description**: Optional client-side rate limiting. Enforces a minimum interval between HTTP calls. - **Type**: Integer (milliseconds) - **Default**: `0` (disabled) - **Example**: ```bash - export GAGGLE_API_MIN_INTERVAL_MS=200 ## max 5 calls/sec per process + export GAGGLE_API_MIN_INTERVAL_MS=200 ## max 5 calls/sec per process ``` -##### Metadata Caching +##### Metadata Caching -###### GAGGLE_METADATA_TTL +###### GAGGLE_METADATA_TTL - **Description**: In-memory cache TTL for dataset metadata responses. - **Type**: Integer (seconds) @@ -108,7 +108,7 @@ These controls enable exponential backoff with cap across metadata/search/downlo export GAGGLE_METADATA_TTL=300 ``` -##### Download Coordination +##### Download Coordination When multiple queries attempt to download the same dataset concurrently, Gaggle coordinates using an in-process lock. These settings control the wait behavior when a download is already in progress. @@ -119,22 +119,22 @@ These settings control the wait behavior when a download is already in progress. 
- **Default**: `30` - **Example**: ```bash - export GAGGLE_DOWNLOAD_WAIT_TIMEOUT=600 ## 10 minutes + export GAGGLE_DOWNLOAD_WAIT_TIMEOUT=600 ## 10 minutes ``` - **GAGGLE_DOWNLOAD_WAIT_POLL** - **Description**: Polling interval while waiting (seconds) - **Type**: Float or integer (seconds) - **Default**: `0.1` -##### Logging Configuration +##### Logging Configuration -###### GAGGLE_VERBOSE +###### GAGGLE_VERBOSE - **Description**: Enable verbose logging (boolean) - **Type**: Boolean (1/true/yes/on or 0/false/no/off) - **Default**: `false` -###### GAGGLE_LOG_LEVEL +###### GAGGLE_LOG_LEVEL - **Description**: Set logging level for structured logs emitted by the Rust core (via tracing) - **Type**: String (`ERROR`, `WARN`, `INFO`, `DEBUG`, and `TRACE`); case-insensitive @@ -149,7 +149,7 @@ These settings control the wait behavior when a download is already in progress. is called). The environment variable is read once per process. - Logs include a level prefix and optional ANSI colors if stderr is a terminal. -##### Offline Mode +##### Offline Mode - **GAGGLE_OFFLINE** - **Description**: Disable network access. When enabled, operations that require network will fail fast unless data @@ -166,99 +166,99 @@ These settings control the wait behavior when a download is already in progress. export GAGGLE_OFFLINE=1 ``` -##### On-Demand Download Behavior +##### On-Demand Download Behavior - **GAGGLE_STRICT_ONDEMAND** - **Description**: When enabled, `gaggle_file_path` will NOT fall back to a full dataset download if the single-file request fails. 
- **Type**: Boolean (`1`, `true`, `yes`, `on` to enable) - **Default**: `false` -#### Usage Examples +#### Usage Examples -##### Example 1: Custom Cache Directory +##### Example 1: Custom Cache Directory ```bash -### Set custom cache directory +### Set custom cache directory export GAGGLE_CACHE_DIR="/mnt/fast-ssd/kaggle-cache" -### Start DuckDB +### Start DuckDB ./build/release/duckdb -### Check configuration +### Check configuration SELECT gaggle_search('iris', 1, 10); ``` -##### Example 2: Larger Cache for Big Datasets +##### Example 2: Larger Cache for Big Datasets ```bash -## Set cache to 50GB for large datasets -export GAGGLE_CACHE_SIZE_LIMIT_MB=51200 +## Set cache to 50GB for large datasets +export GAGGLE_CACHE_SIZE_LIMIT=51200 -## Download and query large Kaggle datasets +## Download and query large Kaggle datasets ./build/release/duckdb ``` -##### Example 3: Production Configuration +##### Example 3: Production Configuration ```bash -## Complete production configuration +## Complete production configuration export GAGGLE_CACHE_DIR="/var/lib/gaggle/cache" -export GAGGLE_CACHE_SIZE_LIMIT_MB=51200 ## 50GB -export GAGGLE_HTTP_TIMEOUT=120 ## 2 minutes -export GAGGLE_HTTP_RETRY_ATTEMPTS=5 ## Retry up to 5 times -export GAGGLE_HTTP_RETRY_DELAY=2 ## 2 second initial delay -export GAGGLE_HTTP_RETRY_MAX_DELAY=30 ## Cap backoff at 30s -export GAGGLE_LOG_LEVEL=WARN ## Production logging - -### Set Kaggle credentials +export GAGGLE_CACHE_SIZE_LIMIT=51200 ## 50GB +export GAGGLE_HTTP_TIMEOUT=120 ## 2 minutes +export GAGGLE_HTTP_RETRY_ATTEMPTS=5 ## Retry up to 5 times +export GAGGLE_HTTP_RETRY_DELAY=2 ## 2 second initial delay +export GAGGLE_HTTP_RETRY_MAX_DELAY=30 ## Cap backoff at 30s +export GAGGLE_LOG_LEVEL=WARN ## Production logging + +### Set Kaggle credentials export KAGGLE_USERNAME="your-username" export KAGGLE_KEY="your-api-key" -### Run DuckDB with Gaggle +### Run DuckDB with Gaggle ./build/release/duckdb ``` -##### Example 4: Development/Debug Configuration 
+##### Example 4: Development/Debug Configuration ```bash -### Development setup with verbose logging +### Development setup with verbose logging export GAGGLE_CACHE_DIR="./dev-cache" -export GAGGLE_LOG_LEVEL=DEBUG ### Detailed debug logs -export GAGGLE_HTTP_TIMEOUT=10 ### Shorter timeout for dev -export GAGGLE_HTTP_RETRY_ATTEMPTS=1 ### Fail fast in development -export GAGGLE_HTTP_RETRY_DELAY=0.25 ### Quick retry (250ms) +export GAGGLE_LOG_LEVEL=DEBUG ### Detailed debug logs +export GAGGLE_HTTP_TIMEOUT=10 ### Shorter timeout for dev +export GAGGLE_HTTP_RETRY_ATTEMPTS=1 ### Fail fast in development +export GAGGLE_HTTP_RETRY_DELAY=0.25 ### Quick retry (250ms) -### Run DuckDB +### Run DuckDB ./build/release/duckdb ``` -##### Example 5: Slow Network Configuration +##### Example 5: Slow Network Configuration ```bash -### Configuration for slow or unreliable networks -export GAGGLE_HTTP_TIMEOUT=300 ### 5 minute timeout -export GAGGLE_HTTP_RETRY_ATTEMPTS=10 ### Many retries -export GAGGLE_HTTP_RETRY_DELAY=5 ### 5 second initial delay -export GAGGLE_HTTP_RETRY_MAX_DELAY=60 ### Cap at 60s +### Configuration for slow or unreliable networks +export GAGGLE_HTTP_TIMEOUT=300 ### 5 minute timeout +export GAGGLE_HTTP_RETRY_ATTEMPTS=10 ### Many retries +export GAGGLE_HTTP_RETRY_DELAY=5 ### 5 second initial delay +export GAGGLE_HTTP_RETRY_MAX_DELAY=60 ### Cap at 60s ./build/release/duckdb ``` -##### Example 6: Offline Mode +##### Example 6: Offline Mode ```bash -## Enable offline mode +## Enable offline mode export GAGGLE_OFFLINE=1 -## Attempt to download a dataset (will fail if not cached) +## Attempt to download a dataset (will fail if not cached) SELECT gaggle_download('username/dataset-name'); -## Querying metadata or searching will fail fast in offline mode +## Querying metadata or searching will fail fast in offline mode SELECT gaggle_info('username/dataset-name'); SELECT gaggle_search('keyword', 1, 10); ``` -#### Configuration Verification +#### Configuration Verification You 
can verify your configuration at runtime: @@ -280,16 +280,16 @@ SELECT gaggle_info('username/dataset-name'); SELECT gaggle_last_error(); ``` -#### Retry Policy Details +#### Retry Policy Details Gaggle implements retries with exponential backoff for HTTP requests. The number of attempts, initial delay, and maximum delay can be tuned with the environment variables above. -#### Logging Levels +#### Logging Levels Detailed logging control via `GAGGLE_LOG_LEVEL` is implemented. -#### Units +#### Units - Storage sizes are reported in megabytes (MB) throughout the API and SQL functions. - Timeouts and retry delays are configured in seconds via environment variables with clean names (no unit suffixes). For diff --git a/docs/ERROR_CODES.md b/docs/ERROR_CODES.md index bde9eb0..01c8e5f 100644 --- a/docs/ERROR_CODES.md +++ b/docs/ERROR_CODES.md @@ -22,10 +22,6 @@ Example: ##### E001 - Invalid Credentials -**Category:** Authentication -**Code:** `E001` -**Type:** `CredentialsError` - **Description:** Kaggle API credentials are invalid, missing, or incorrectly formatted. @@ -52,10 +48,6 @@ Kaggle API credentials are invalid, missing, or incorrectly formatted. ##### E002 - Dataset Not Found -**Category:** Dataset -**Code:** `E002` -**Type:** `DatasetNotFound` - **Description:** The requested dataset does not exist on Kaggle or is not accessible. @@ -84,17 +76,13 @@ The requested dataset does not exist on Kaggle or is not accessible. ``` 3. **Check dataset availability:** - - Ensure dataset is public + - Check dataset is public - Verify you have access rights --- ##### E003 - Network Error -**Category:** Network -**Code:** `E003` -**Type:** `HttpRequestError` - **Description:** Network error occurred during communication with Kaggle API. @@ -135,17 +123,13 @@ Network error occurred during communication with Kaggle API. ``` 5. 
**Check firewall settings:** - - Ensure outbound HTTPS (port 443) is allowed + - Check outbound HTTPS (port 443) is allowed - Check corporate proxy settings --- ##### E004 - Invalid Path -**Category:** Validation -**Code:** `E004` -**Type:** `InvalidDatasetPath` - **Description:** Dataset path format is invalid or contains forbidden characters. @@ -194,10 +178,6 @@ owner/. # Dot component ##### E005 - File System Error -**Category:** I/O -**Code:** `E005` -**Type:** `IoError` - **Description:** Error reading from or writing to the file system. @@ -247,10 +227,6 @@ Error reading from or writing to the file system. ##### E006 - JSON Error -**Category:** Serialization -**Code:** `E006` -**Type:** `JsonError` - **Description:** Error parsing or serializing JSON data. @@ -288,10 +264,6 @@ Error parsing or serializing JSON data. ##### E007 - ZIP Extraction Error -**Category:** Archive -**Code:** `E007` -**Type:** `ZipError` - **Description:** Error extracting downloaded ZIP file. @@ -334,10 +306,6 @@ Error extracting downloaded ZIP file. ##### E008 - CSV Parsing Error -**Category:** Parsing -**Code:** `E008` -**Type:** `CsvError` - **Description:** Error parsing CSV file format. @@ -379,10 +347,6 @@ Error parsing CSV file format. ##### E009 - UTF-8 Encoding Error -**Category:** Encoding -**Code:** `E009` -**Type:** `Utf8Error` - **Description:** String is not valid UTF-8. @@ -420,10 +384,6 @@ String is not valid UTF-8. ##### E010 - Null Pointer Error -**Category:** FFI -**Code:** `E010` -**Type:** `NullPointer` - **Description:** NULL pointer passed to FFI function. diff --git a/docs/README.md b/docs/README.md index 06e9265..57394b4 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,56 +2,47 @@ The table below includes the information about all SQL functions exposed by Gaggle. 
-| # | Function | Return Type | Description | -|----|:----------------------------------------------------------------|:-----------------|:------------------------------------------------------------------------------------------------------------------------------------------------| -| 1 | `gaggle_set_credentials(username VARCHAR, key VARCHAR)` | `BOOLEAN` | Sets Kaggle API credentials from SQL (alternatively use env vars or `~/.kaggle/kaggle.json`). Returns `true` on success. | -| 2 | `gaggle_download(dataset_path VARCHAR)` | `VARCHAR` | Downloads a Kaggle dataset to the local cache directory and returns the local dataset path. This function is idempotent. | -| 3 | `gaggle_search(query VARCHAR, page INTEGER, page_size INTEGER)` | `VARCHAR (JSON)` | Searches Kaggle datasets and returns a JSON array. Constraints: `page >= 1`, `1 <= page_size <= 100`. | -| 4 | `gaggle_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns metadata for a dataset as JSON (for example: `title`, `url`, `last_updated`). | -| 5 | `gaggle_version()` | `VARCHAR` | Returns the extension version string (for example: `"0.1.0"`). | -| 6 | `gaggle_clear_cache()` | `BOOLEAN` | Clears the dataset cache directory. Returns `true` on success. | -| 7 | `gaggle_cache_info()` | `VARCHAR (JSON)` | Returns cache info JSON with `path`, `size_mb`, `limit_mb`, `usage_percent`, `is_soft_limit`, and `type` fields. | -| 8 | `gaggle_enforce_cache_limit()` | `BOOLEAN` | Manually enforces cache size limit using LRU eviction. Returns `true` on success. (Automatic with soft limit by default). | -| 9 | `gaggle_is_current(dataset_path VARCHAR)` | `BOOLEAN` | Checks if cached dataset is the latest version from Kaggle. Returns `false` if not cached or outdated. | -| 10 | `gaggle_update_dataset(dataset_path VARCHAR)` | `VARCHAR` | Forces update to latest version (ignores cache). Returns local path to freshly downloaded dataset. 
| -| 11 | `gaggle_version_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns version info: `cached_version`, `latest_version`, `is_current`, `is_cached`. | -| 12 | `gaggle_json_each(json VARCHAR)` | `VARCHAR` | Expands a JSON object into newline-delimited JSON rows with fields: `key`, `value`, `type`, `path`. Users normally shouldn't use this function. | -| 13 | `gaggle_file_path(dataset_path VARCHAR, filename VARCHAR)` | `VARCHAR` | Resolves a specific file's local path inside a downloaded dataset. | +| # | Function | Return Type | Description | +|----|:----------------------------------------------------------------|:-------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------| +| 1 | `gaggle_set_credentials(username VARCHAR, key VARCHAR)` | `BOOLEAN` | Sets Kaggle API credentials from SQL (alternatively use env vars or `~/.kaggle/kaggle.json`). Returns `true` on success. | +| 2 | `gaggle_download(dataset_path VARCHAR)` | `VARCHAR` | Downloads a Kaggle dataset to the local cache directory and returns the local dataset path. This function is idempotent. | +| 3 | `gaggle_search(query VARCHAR, page INTEGER, page_size INTEGER)` | `VARCHAR (JSON)` | Searches Kaggle datasets and returns a JSON array. Constraints: `page >= 1`, `1 <= page_size <= 100`. | +| 4 | `gaggle_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns metadata for a dataset as JSON (for example: `title`, `url`, `last_updated`). | +| 5 | `gaggle_version()` | `VARCHAR` | Returns the extension version string (for example: `"0.1.0"`). | +| 6 | `gaggle_clear_cache()` | `BOOLEAN` | Clears the dataset cache directory. Returns `true` on success. | +| 7 | `gaggle_cache_info()` | `VARCHAR (JSON)` | Returns cache info JSON with `path`, `size_mb`, `limit_mb`, `usage_percent`, `is_soft_limit`, and `type` fields. 
| +| 8 | `gaggle_enforce_cache_limit()` | `BOOLEAN` | Manually enforces cache size limit using LRU eviction. Returns `true` on success. (Automatic with soft limit by default). | +| 9 | `gaggle_is_current(dataset_path VARCHAR)` | `BOOLEAN` | Checks if cached dataset is the latest version from Kaggle. Returns `false` if not cached or outdated. | +| 10 | `gaggle_update_dataset(dataset_path VARCHAR)` | `VARCHAR` | Forces update to latest version (ignores cache). Returns local path to freshly downloaded dataset. | +| 11 | `gaggle_version_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns version info: `cached_version`, `latest_version`, `is_current`, `is_cached`. | +| 12 | `gaggle_json_each(json VARCHAR)` | `VARCHAR` | Expands a JSON object into newline-delimited JSON rows with fields: `key`, `value`, `type`, `path`. Users normally shouldn't use this function. | +| 13 | `gaggle_file_path(dataset_path VARCHAR, filename VARCHAR)` | `VARCHAR` | Resolves a specific file's local path inside a downloaded dataset. | +| 14 | `gaggle_ls(dataset_path VARCHAR)` | `TABLE(name VARCHAR, size BIGINT, path VARCHAR)` | Lists files (non-recursively) in the dataset's local directory; `size` is in MB. | > [!NOTE] -> The `gaggle_file_path` function will fetch the file into the cache if it is missing; set `GAGGLE_STRICT_ONDEMAND=1` to prevent fallback to a full dataset download on failures. +> * The `gaggle_file_path` function will retrieve and cache the file if it is not already downloaded; set + `GAGGLE_STRICT_ONDEMAND=1` to prevent fallback to a full dataset download on failures. > -> Dataset paths must be in the form `owner/dataset` where `owner` is the username and `dataset` is the dataset name on -> Kaggle. -> For example: `habedi/flickr-8k-dataset-clean`. -> You can also read files directly using the replacement scan with the `kaggle:` scheme. -> For example: `'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet`. 
- -Table function: - -| # | Function | Return Type | Description | -|----|:----------------------------------|:-------------------------------------------------|:---------------------------------------------------------------------------------| -| 14 | `gaggle_ls(dataset_path VARCHAR)` | `TABLE(name VARCHAR, size BIGINT, path VARCHAR)` | Lists files (non-recursively) in the dataset's local directory; `size` is in MB. | - -Replacement scan (transparent table read): - -- Single file: `'kaggle:owner/dataset/file.ext'` -- Glob: `'kaggle:owner/dataset/*.ext'` -- Reader is chosen by extension: - - `.parquet`/`.parq` -> `read_parquet` - - `.json`/`.jsonl`/`.ndjson` -> `read_json_auto` - - `.xlsx` -> `read_excel` - - for everything else -> `read_csv_auto` +> * Dataset paths must be in the form `owner/dataset` where `owner` is the username and `dataset` is the dataset name on + > Kaggle. + > For example: `habedi/flickr-8k-dataset-clean`. + > You can also read files directly using the replacement scan with the `kaggle:` scheme. + > For example: `'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet`. --- ### Usage Examples +To be able to use most of the examples below, you need to have a valid Kaggle username and API key. +Check out the [Kaggle API documentation](https://www.kaggle.com/docs/api) for more information on how to +get your username and API key. 
+ #### Dataset Management ```sql -- Load the Gaggle extension -load 'build/release/extension/gaggle/gaggle.duckdb_extension'; +load +'build/release/extension/gaggle/gaggle.duckdb_extension'; -- Set Kaggle credentials (or read from environment variables or from `~/.kaggle/kaggle.json` file) select gaggle_set_credentials('your-username', 'your-api-key'); @@ -161,8 +152,6 @@ To build Gaggle from source, you need GNU Make, CMake, a modern C++ compiler (li git clone --recursive https://github.com/CogitatorTech/gaggle.git cd gaggle ``` - > [!NOTE] - > The `--recursive` flag ensures required submodules (like DuckDB) are cloned. 2. **Build the extension:** ```bash @@ -197,3 +186,10 @@ Gaggle is made up of two main components: - Defines the custom SQL functions (for example: `gaggle_ls`, `gaggle_file_path`, and `gaggle_search`) - Integrates with DuckDB’s extension system and replacement scans (`'kaggle:...'`) - Marshals values between DuckDB vectors and the Rust FFI + +### Additional Resources + +- [ERROR_CODES.md](ERROR_CODES.md): information about the error codes returned by Gaggle. +- [CONFIGURATION.md](CONFIGURATION.md): details about environment variables that can be used to configure Gaggle's + behavior. +- [examples/](examples): example SQL scripts that showcase various Gaggle features. 
diff --git a/docs/examples/e3_versioning.sql b/docs/examples/e3_versioning.sql index b0e83c7..6ab9332 100644 --- a/docs/examples/e3_versioning.sql +++ b/docs/examples/e3_versioning.sql @@ -80,7 +80,7 @@ select ; -- Section 9: Data pipeline with version validation -select '## Data pipeline: ensure latest data before querying'; +select '## Data pipeline: make sure the latest version is retrieved before querying'; -- Step 1: Validate version with validation as ( select gaggle_is_current('uciml/iris') as is_current diff --git a/extension_config.cmake b/extension_config.cmake index 131be9b..6e9e028 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -83,16 +83,6 @@ if (EXISTS ${GAGGLE_RUST_LIB}) # Add the Rust library to global link libraries so it gets linked to everything if(UNIX) link_libraries(${GAGGLE_RUST_LIB} pthread dl m) - # Link against liblzma - the xz submodule target will be used if available, - # otherwise falls back to system lzma. The target is typically added by the - # main CMakeLists.txt before extension targets are built. 
- if(TARGET liblzma) - link_libraries(liblzma) - message(STATUS "[gaggle] Linking against xz submodule's liblzma target") - else() - link_libraries(lzma) - message(STATUS "[gaggle] Linking against system lzma (xz submodule target not yet available)") - endif() else() link_libraries(${GAGGLE_RUST_LIB}) if(WIN32) @@ -119,7 +109,6 @@ if (EXISTS ${GAGGLE_RUST_LIB}) add_link_options($<$,EXECUTABLE>:-lpthread>) add_link_options($<$,EXECUTABLE>:-ldl>) add_link_options($<$,EXECUTABLE>:-lm>) - # Note: liblzma linking is handled via link_libraries(liblzma) above else() add_link_options($<$,EXECUTABLE>:${GAGGLE_RUST_LIB}>) endif() diff --git a/external/duckdb b/external/duckdb index 95a4db3..4c2573a 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 95a4db3b6e99c61dfe7604c0eca1543adf420803 +Subproject commit 4c2573afaec92b2a7b530e22d9d5e2d98cbfc9d4 diff --git a/external/xz b/external/xz deleted file mode 160000 index f57b171..0000000 --- a/external/xz +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f57b1716cd0853474980c90a892204dee9bdea1a diff --git a/gaggle/Cargo.toml b/gaggle/Cargo.toml index 5d725fd..95d70ee 100644 --- a/gaggle/Cargo.toml +++ b/gaggle/Cargo.toml @@ -19,7 +19,7 @@ thiserror = "2.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" reqwest = { version = "0.12", features = ["blocking", "rustls-tls", "json", "multipart"], default-features = false } -zip = "6.0" +zip = { version = "6.0", default-features = false, features = ["deflate"] } dirs = "6.0" urlencoding = "2.1" tracing = "0.1" diff --git a/gaggle/bindings/gaggle_extension.cpp b/gaggle/bindings/gaggle_extension.cpp index b75a66f..5467f48 100644 --- a/gaggle/bindings/gaggle_extension.cpp +++ b/gaggle/bindings/gaggle_extension.cpp @@ -544,7 +544,7 @@ KaggleReplacementScan(ClientContext &context, ReplacementScanInput &input, }; if (is_dir || has_wildcard) { - // Ensure dataset is downloaded and construct a glob path + // Verify that the dataset is 
downloaded and construct a glob path char *dir_c = gaggle_download_dataset(dataset_path.c_str()); if (!dir_c) { throw InvalidInputException("Failed to prepare dataset directory: " + @@ -609,7 +609,7 @@ static unique_ptr GaggleLsBind(ClientContext &context, } result->dataset_path = input.inputs[0].ToString(); - // Ensure dataset is downloaded and get directory + // Verify that the dataset is downloaded and get directory char *dir_c = gaggle_download_dataset(result->dataset_path.c_str()); if (!dir_c) { throw InvalidInputException("Failed to download dataset: " + diff --git a/gaggle/src/config.rs b/gaggle/src/config.rs index a0a8ecc..198f250 100644 --- a/gaggle/src/config.rs +++ b/gaggle/src/config.rs @@ -163,7 +163,7 @@ pub fn http_retry_max_delay_ms() -> u64 { /// Cache size limit in megabytes (default 100GB = 102400 MB) /// Returns None if unlimited pub fn cache_size_limit_mb() -> Option { - match env::var("GAGGLE_CACHE_SIZE_LIMIT_MB").ok() { + match env::var("GAGGLE_CACHE_SIZE_LIMIT").ok() { Some(val) if val.to_lowercase() == "unlimited" => None, Some(val) => val.parse().ok(), None => Some(102400), // Default 100GB @@ -480,7 +480,7 @@ mod tests { #[test] #[serial] fn test_cache_size_limit_default() { - env::remove_var("GAGGLE_CACHE_SIZE_LIMIT_MB"); + env::remove_var("GAGGLE_CACHE_SIZE_LIMIT"); let limit = cache_size_limit_mb(); assert_eq!(limit, Some(102400)); // 100GB default } @@ -488,19 +488,19 @@ mod tests { #[test] #[serial] fn test_cache_size_limit_custom() { - env::set_var("GAGGLE_CACHE_SIZE_LIMIT_MB", "50000"); + env::set_var("GAGGLE_CACHE_SIZE_LIMIT", "50000"); let limit = cache_size_limit_mb(); assert_eq!(limit, Some(50000)); - env::remove_var("GAGGLE_CACHE_SIZE_LIMIT_MB"); + env::remove_var("GAGGLE_CACHE_SIZE_LIMIT"); } #[test] #[serial] fn test_cache_size_limit_unlimited() { - env::set_var("GAGGLE_CACHE_SIZE_LIMIT_MB", "unlimited"); + env::set_var("GAGGLE_CACHE_SIZE_LIMIT", "unlimited"); let limit = cache_size_limit_mb(); assert_eq!(limit, None); - 
env::remove_var("GAGGLE_CACHE_SIZE_LIMIT_MB"); + env::remove_var("GAGGLE_CACHE_SIZE_LIMIT"); } #[test] diff --git a/gaggle/src/kaggle/api.rs b/gaggle/src/kaggle/api.rs index fd765ea..0dab841 100644 --- a/gaggle/src/kaggle/api.rs +++ b/gaggle/src/kaggle/api.rs @@ -50,7 +50,7 @@ pub(crate) fn get_api_base() -> String { return b.trim_end_matches('/').to_string(); } } - // Ensure no trailing slash to avoid double slashes when joining paths + // Make sure no trailing slash to avoid double slashes when joining paths env::var("GAGGLE_API_BASE") .unwrap_or_else(|_| "https://www.kaggle.com/api/v1".to_string()) .trim_end_matches('/') diff --git a/gaggle/src/kaggle/download.rs b/gaggle/src/kaggle/download.rs index 83cbb92..09139a9 100644 --- a/gaggle/src/kaggle/download.rs +++ b/gaggle/src/kaggle/download.rs @@ -75,7 +75,7 @@ impl CacheMetadata { } } -/// Guard to ensure download lock is released +/// Guard to guarantee download lock is released struct LockGuard { key: String, } @@ -180,7 +180,7 @@ fn download_dataset_version( sleep(Duration::from_millis(poll_ms.max(1))); } - // Ensure we clean up the lock when done + // Make sure we clean up the lock when done let _guard = LockGuard { key: lock_key.clone(), }; @@ -309,7 +309,7 @@ pub fn download_single_file(dataset_path: &str, filename: &str) -> Result Result Result path.to_owned(), None => { @@ -1192,23 +1192,23 @@ mod tests { #[test] fn test_enforce_cache_limit_no_limit() { - std::env::set_var("GAGGLE_CACHE_SIZE_LIMIT_MB", "unlimited"); + std::env::set_var("GAGGLE_CACHE_SIZE_LIMIT", "unlimited"); let result = enforce_cache_limit_now(); assert!(result.is_ok()); - std::env::remove_var("GAGGLE_CACHE_SIZE_LIMIT_MB"); + std::env::remove_var("GAGGLE_CACHE_SIZE_LIMIT"); } #[test] fn test_enforce_cache_limit_within_limit() { let temp_dir = tempfile::TempDir::new().unwrap(); std::env::set_var("GAGGLE_CACHE_DIR", temp_dir.path()); - std::env::set_var("GAGGLE_CACHE_SIZE_LIMIT_MB", "1000"); + 
std::env::set_var("GAGGLE_CACHE_SIZE_LIMIT", "1000"); let result = enforce_cache_limit_now(); assert!(result.is_ok()); std::env::remove_var("GAGGLE_CACHE_DIR"); - std::env::remove_var("GAGGLE_CACHE_SIZE_LIMIT_MB"); + std::env::remove_var("GAGGLE_CACHE_SIZE_LIMIT"); } #[test] @@ -1327,14 +1327,14 @@ mod tests { fs::write(d2.join("b.bin"), vec![0u8; 2 * 1024 * 1024]).unwrap(); // 2MB // Total ~4MB; set limit to 2MB so eviction must occur - std::env::set_var("GAGGLE_CACHE_SIZE_LIMIT_MB", "2"); + std::env::set_var("GAGGLE_CACHE_SIZE_LIMIT", "2"); enforce_cache_limit_now().unwrap(); - // After eviction, total size must be <= 2MB + // After eviction, the total size must be <= 2MB let total = get_total_cache_size_mb().unwrap(); assert!(total <= 2); - std::env::remove_var("GAGGLE_CACHE_SIZE_LIMIT_MB"); + std::env::remove_var("GAGGLE_CACHE_SIZE_LIMIT"); std::env::remove_var("GAGGLE_CACHE_DIR"); } } diff --git a/gaggle/tests/mock_http.rs b/gaggle/tests/mock_http.rs index fb5fc94..db9c2c6 100644 --- a/gaggle/tests/mock_http.rs +++ b/gaggle/tests/mock_http.rs @@ -178,7 +178,7 @@ fn test_single_file_fetch_on_demand() { }; assert!(path.exists()); - // Ensure that full dataset extraction marker is not required for single-file presence + // Make sure that full dataset extraction marker is not required for single-file presence let ds_dir = temp.path().join("datasets/owner/dataset"); assert!(ds_dir.join("data.csv").exists()); // .downloaded marker may not exist yet (partial cache is allowed) diff --git a/test/sql/test_gaggle_types.test b/test/sql/test_gaggle_types.test index e021232..3aabe88 100644 --- a/test/sql/test_gaggle_types.test +++ b/test/sql/test_gaggle_types.test @@ -31,6 +31,6 @@ select typeof(gaggle_file_path('owner/dataset','file.csv')) VARCHAR # Table function schema for gaggle_ls -query T -select * from (select name, size, path from gaggle_ls('owner/dataset') limit 0) ----- +# query T +# select * from (select name, size, path from gaggle_ls('owner/dataset') limit 0) 
+# ---- From e6c7431e6bd3768d0aa4344d44e50d93d671747e Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Tue, 4 Nov 2025 12:20:28 +0100 Subject: [PATCH 6/8] Improve the docstrings --- gaggle/bindings/gaggle_extension.cpp | 8 ++ gaggle/bindings/include/rust.h | 2 +- gaggle/src/config.rs | 34 +++--- gaggle/src/error.rs | 30 ++--- gaggle/src/ffi.rs | 120 +++++++++++--------- gaggle/src/kaggle/api.rs | 19 +++- gaggle/src/kaggle/credentials.rs | 15 ++- gaggle/src/kaggle/download.rs | 52 ++++++--- gaggle/src/kaggle/metadata.rs | 18 ++- gaggle/src/kaggle/mod.rs | 8 ++ gaggle/src/kaggle/search.rs | 8 ++ gaggle/src/lib.rs | 7 +- gaggle/src/utils.rs | 11 +- gaggle/tests/integration_mock.rs | 8 ++ gaggle/tests/mock_http.rs | 7 ++ gaggle/tests/offline_mode.rs | 10 +- gaggle/tests/property_parse_dataset_path.rs | 8 ++ gaggle/tests/replacement_scan.rs | 9 ++ gaggle/tests/replacement_scan_excel.rs | 9 +- gaggle/tests/test_error_recovery.rs | 8 ++ gaggle/tests/test_regression.rs | 9 +- gaggle/tests/test_security.rs | 9 ++ 22 files changed, 299 insertions(+), 110 deletions(-) diff --git a/gaggle/bindings/gaggle_extension.cpp b/gaggle/bindings/gaggle_extension.cpp index 5467f48..4cde97f 100644 --- a/gaggle/bindings/gaggle_extension.cpp +++ b/gaggle/bindings/gaggle_extension.cpp @@ -1,3 +1,11 @@ +/** + * @file gaggle_extension.cpp + * @brief This file contains the implementation of the Gaggle DuckDB extension. + * + * It includes the definitions for the scalar and table functions that provide + * Gaggle's functionality within DuckDB, as well as the extension loading and + * registration logic. + */ #define DUCKDB_EXTENSION_MAIN #include "include/gaggle_extension.hpp" diff --git a/gaggle/bindings/include/rust.h b/gaggle/bindings/include/rust.h index ab1670c..af8ef73 100644 --- a/gaggle/bindings/include/rust.h +++ b/gaggle/bindings/include/rust.h @@ -39,7 +39,7 @@ extern "C" { /** * Clears the last error for the current thread. 
* - * This is useful for ensuring that old error messages don't persist + * This is useful for making sure that old error messages don't persist * and get confused with new errors. */ void gaggle_clear_last_error(void); diff --git a/gaggle/src/config.rs b/gaggle/src/config.rs index 198f250..1912c6d 100644 --- a/gaggle/src/config.rs +++ b/gaggle/src/config.rs @@ -9,28 +9,31 @@ const DEFAULT_CACHE_DIR_NAME: &str = "gaggle"; pub static CONFIG: Lazy = Lazy::new(GaggleConfig::from_env); -/// Configuration options for Gaggle +/// `GaggleConfig` contains the configuration options for Gaggle. +/// +/// These settings are loaded from environment variables and provide +/// control over caching, logging, and HTTP behavior. #[derive(Debug, Clone)] pub struct GaggleConfig { - /// Directory for caching downloaded datasets + /// `cache_dir` specifies the directory for caching downloaded datasets. + /// Defaults to a subdirectory within the system's cache directory. pub cache_dir: PathBuf, - /// Enable verbose logging + /// `verbose_logging` enables or disables verbose logging. #[allow(dead_code)] pub verbose_logging: bool, - /// HTTP timeout in seconds + /// `http_timeout_secs` sets the HTTP timeout in seconds. #[allow(dead_code)] pub http_timeout_secs: u64, - /// Download lock wait timeout in milliseconds + /// `download_wait_timeout_ms` sets the timeout for waiting on a download lock. #[allow(dead_code)] pub download_wait_timeout_ms: u64, - /// Download lock poll interval in milliseconds + /// `download_wait_poll_ms` sets the polling interval for the download lock. #[allow(dead_code)] pub download_wait_poll_ms: u64, - // Future: other options } impl GaggleConfig { - /// Load configuration from environment variables + /// Loads the Gaggle configuration from environment variables. 
pub fn from_env() -> Self { Self { cache_dir: Self::get_cache_dir(), @@ -41,7 +44,8 @@ impl GaggleConfig { } } - /// Get cache directory from GAGGLE_CACHE_DIR or default + /// Retrieves the cache directory from the `GAGGLE_CACHE_DIR` environment + /// variable, falling back to a default directory if not set. fn get_cache_dir() -> PathBuf { env::var("GAGGLE_CACHE_DIR") .ok() @@ -54,7 +58,8 @@ impl GaggleConfig { }) } - /// Get verbose logging setting from GAGGLE_VERBOSE or default (false) + /// Retrieves the verbose logging setting from the `GAGGLE_VERBOSE` + /// environment variable, defaulting to `false`. fn get_verbose() -> bool { if let Ok(val) = env::var("GAGGLE_VERBOSE") { match val.to_lowercase().as_str() { @@ -67,7 +72,8 @@ impl GaggleConfig { } } - /// Get HTTP timeout from GAGGLE_HTTP_TIMEOUT or default (30 seconds) + /// Retrieves the HTTP timeout from `GAGGLE_HTTP_TIMEOUT`, in seconds. + /// Defaults to 30 seconds. fn get_http_timeout() -> u64 { env::var("GAGGLE_HTTP_TIMEOUT") .ok() @@ -75,7 +81,8 @@ impl GaggleConfig { .unwrap_or(30) } - /// Get download wait timeout from env (default 30_000 ms) + /// Retrieves the download wait timeout from `GAGGLE_DOWNLOAD_WAIT_TIMEOUT`, + /// in milliseconds. Defaults to 30,000 ms. fn get_download_wait_timeout_ms() -> u64 { env::var("GAGGLE_DOWNLOAD_WAIT_TIMEOUT") .ok() @@ -84,7 +91,8 @@ impl GaggleConfig { .unwrap_or(30_000) } - /// Get download wait poll interval from env (default 100 ms) + /// Retrieves the download wait poll interval from `GAGGLE_DOWNLOAD_WAIT_POLL`, + /// in milliseconds. Defaults to 100 ms. 
fn get_download_wait_poll_ms() -> u64 { env::var("GAGGLE_DOWNLOAD_WAIT_POLL") .ok() diff --git a/gaggle/src/error.rs b/gaggle/src/error.rs index 6ef5e16..aedb707 100644 --- a/gaggle/src/error.rs +++ b/gaggle/src/error.rs @@ -3,34 +3,38 @@ use std::ffi::{c_char, CString}; use std::str::Utf8Error as StdUtf8Error; use thiserror::Error; -/// Error codes for programmatic error handling +/// `ErrorCode` defines a set of specific error types for programmatic handling. +/// +/// Each error code corresponds to a distinct category of issue that may arise +/// during the execution of Gaggle operations. These codes provide a stable, +/// machine-readable way to identify and react to errors. #[derive(Debug, Clone, Copy, PartialEq, Eq)] #[allow(non_camel_case_types)] pub enum ErrorCode { - /// E001: Invalid or missing Kaggle API credentials + /// E001: Indicates invalid or missing Kaggle API credentials. E001_InvalidCredentials, - /// E002: Requested dataset not found on Kaggle + /// E002: Represents that the requested dataset could not be found on Kaggle. E002_DatasetNotFound, - /// E003: HTTP/Network error during API request + /// E003: Signifies an HTTP or network error during an API request. E003_NetworkError, - /// E004: Invalid dataset path format + /// E004: Denotes an invalid format for a dataset path. E004_InvalidPath, - /// E005: File system I/O error + /// E005: An error related to file system I/O operations. E005_IoError, - /// E006: JSON serialization/deserialization error + /// E006: A failure in JSON serialization or deserialization. E006_JsonError, - /// E007: ZIP file extraction or validation error + /// E007: An issue with ZIP file extraction or validation. E007_ZipError, - /// E008: CSV parsing error + /// E008: An error encountered while parsing a CSV file. E008_CsvError, - /// E009: Invalid UTF-8 string in FFI boundary + /// E009: An invalid UTF-8 string was found at an FFI boundary. 
E009_Utf8Error, - /// E010: Null pointer passed to FFI function + /// E010: A null pointer was passed to an FFI function. E010_NullPointer, } impl ErrorCode { - /// Get the numeric error code as a string + /// Returns the numeric error code as a string slice. pub fn code(&self) -> &'static str { match self { ErrorCode::E001_InvalidCredentials => "E001", @@ -46,7 +50,7 @@ impl ErrorCode { } } - /// Get a short description of the error + /// Returns a brief, human-readable description of the error. pub fn description(&self) -> &'static str { match self { ErrorCode::E001_InvalidCredentials => "Invalid Kaggle credentials", diff --git a/gaggle/src/ffi.rs b/gaggle/src/ffi.rs index 1bf477a..4679afe 100644 --- a/gaggle/src/ffi.rs +++ b/gaggle/src/ffi.rs @@ -5,23 +5,32 @@ use std::fs; use crate::error; use crate::kaggle; -/// Initialize logging for the Rust core based on GAGGLE_LOG_LEVEL +/// Initializes logging for the Gaggle Rust core. +/// +/// This function sets up the logging framework based on the `GAGGLE_LOG_LEVEL` +/// environment variable. It should be called once at the beginning of the +/// application's lifecycle. #[no_mangle] pub extern "C" fn gaggle_init_logging() { crate::init_logging(); } -/// Set Kaggle API credentials +/// Sets the Kaggle API credentials. +/// +/// # Arguments /// -/// Arguments: -/// - `username`: non-null pointer to a NUL-terminated C string -/// - `key`: non-null pointer to a NUL-terminated C string +/// * `username` - A non-null pointer to a NUL-terminated C string representing the Kaggle username. +/// * `key` - A non-null pointer to a NUL-terminated C string representing the Kaggle API key. /// -/// Returns 0 on success, -1 on failure (call gaggle_last_error). +/// # Returns +/// +/// Returns `0` on success, or `-1` on failure. If the operation fails, +/// a detailed error message can be retrieved using `gaggle_last_error`. /// /// # Safety -/// - The pointers must be valid and remain alive for the duration of this call. 
-/// - Strings must be valid UTF-8; interior NULs are not allowed. +/// +/// - The pointers must be valid and remain accessible for the duration of this call. +/// - The provided strings must be valid UTF-8, and interior NUL characters are not permitted. #[no_mangle] pub unsafe extern "C" fn gaggle_set_credentials( username: *const c_char, @@ -58,15 +67,23 @@ pub unsafe extern "C" fn gaggle_set_credentials( } } -/// Download a Kaggle dataset and return its local cache path +/// Downloads a Kaggle dataset and returns its local cache path. /// -/// Arguments: -/// - `dataset_path`: non-null pointer to a NUL-terminated C string "owner/dataset[[@vN|@latest]]". +/// # Arguments /// -/// Returns pointer to a heap-allocated C string. Free with gaggle_free(). On error, returns NULL and sets gaggle_last_error. +/// * `dataset_path` - A non-null pointer to a NUL-terminated C string in the format +/// "owner/dataset[[@vN|@latest]]". +/// +/// # Returns +/// +/// Returns a pointer to a heap-allocated C string containing the local path. +/// This string must be freed with `gaggle_free()`. On error, returns `NULL` +/// and sets a detailed error message retrievable with `gaggle_last_error`. /// /// # Safety -/// - The pointer must be valid and the string valid UTF-8; interior NULs are not allowed. +/// +/// - The pointer must be valid and the string must be valid UTF-8. +/// - Interior NUL characters are not allowed in the string. #[no_mangle] pub unsafe extern "C" fn gaggle_download_dataset(dataset_path: *const c_char) -> *mut c_char { // Clear any previous error @@ -96,16 +113,17 @@ pub unsafe extern "C" fn gaggle_download_dataset(dataset_path: *const c_char) -> } } -/// Get the local path to a specific file in a downloaded dataset +/// Retrieves the local path to a specific file within a downloaded dataset. 
+/// +/// # Arguments /// -/// Arguments: -/// - `dataset_path`: non-null pointer to owner/dataset -/// - `filename`: non-null pointer to relative filename within the dataset +/// * `dataset_path` - A non-null pointer to a NUL-terminated C string representing the owner and dataset. +/// * `filename` - A non-null pointer to a NUL-terminated C string for the relative filename inside the dataset. /// /// # Safety /// -/// Both pointers must be valid and point to valid NUL-terminated C strings. -/// Strings must be valid UTF-8; interior NULs are not allowed. +/// - Both pointers must be valid and point to valid NUL-terminated C strings. +/// - The strings must be valid UTF-8, and interior NUL characters are not allowed. #[no_mangle] pub unsafe extern "C" fn gaggle_get_file_path( dataset_path: *const c_char, @@ -139,12 +157,12 @@ pub unsafe extern "C" fn gaggle_get_file_path( } } -/// List files in a Kaggle dataset +/// Lists the files available in a Kaggle dataset. /// /// # Safety /// -/// The pointer must be valid and point to a valid NUL-terminated C string. -/// The string must be valid UTF-8; interior NULs are not allowed. +/// - The pointer must be valid and point to a valid NUL-terminated C string. +/// - The string must be valid UTF-8, and interior NUL characters are not allowed. #[no_mangle] pub unsafe extern "C" fn gaggle_list_files(dataset_path: *const c_char) -> *mut c_char { // Clear any previous error @@ -175,12 +193,12 @@ pub unsafe extern "C" fn gaggle_list_files(dataset_path: *const c_char) -> *mut } } -/// Search for Kaggle datasets +/// Searches for Kaggle datasets. /// /// # Safety /// -/// The query pointer must be valid and point to a valid NUL-terminated C string. -/// The string must be valid UTF-8; interior NULs are not allowed. +/// - The `query` pointer must be valid and point to a valid NUL-terminated C string. +/// - The string must be valid UTF-8, and interior NUL characters are not allowed. 
#[no_mangle] pub unsafe extern "C" fn gaggle_search( query: *const c_char, @@ -215,12 +233,12 @@ pub unsafe extern "C" fn gaggle_search( } } -/// Get metadata for a specific Kaggle dataset +/// Retrieves metadata for a specific Kaggle dataset. /// /// # Safety /// -/// The pointer must be valid and point to a valid NUL-terminated C string. -/// The string must be valid UTF-8; interior NULs are not allowed. +/// - The pointer must be valid and point to a valid NUL-terminated C string. +/// - The string must be valid UTF-8, and interior NUL characters are not allowed. #[no_mangle] pub unsafe extern "C" fn gaggle_get_dataset_info(dataset_path: *const c_char) -> *mut c_char { // Clear any previous error @@ -251,20 +269,20 @@ pub unsafe extern "C" fn gaggle_get_dataset_info(dataset_path: *const c_char) -> } } -/// Get version information +/// Retrieves the version of the Gaggle library. #[no_mangle] pub extern "C" fn gaggle_get_version() -> *mut c_char { // Return only the version string (no JSON wrapper) string_to_c_string(env!("CARGO_PKG_VERSION").to_string()) } -/// Frees a heap-allocated C string +/// Frees a heap-allocated C string. /// /// # Safety /// -/// `ptr` must be a pointer previously returned by a Gaggle FFI function that transfers ownership -/// (e.g., gaggle_get_version, gaggle_list_files, etc.). -/// Passing the same pointer twice, or a pointer not allocated by Gaggle, results in undefined behavior. +/// `ptr` must be a pointer previously returned by a Gaggle FFI function that transfers ownership, +/// such as `gaggle_get_version` or `gaggle_list_files`. Passing the same pointer more than once, +/// or providing a pointer not allocated by Gaggle, will result in undefined behavior. #[no_mangle] pub unsafe extern "C" fn gaggle_free(ptr: *mut c_char) { if !ptr.is_null() { @@ -272,7 +290,7 @@ pub unsafe extern "C" fn gaggle_free(ptr: *mut c_char) { } } -/// Clear the dataset cache +/// Clears the dataset cache. 
#[no_mangle] pub extern "C" fn gaggle_clear_cache() -> i32 { let result = (|| -> Result<(), error::GaggleError> { @@ -296,7 +314,7 @@ pub extern "C" fn gaggle_clear_cache() -> i32 { } } -/// Enforce cache size limit by evicting oldest datasets +/// Enforces the cache size limit by evicting the oldest datasets. #[no_mangle] pub extern "C" fn gaggle_enforce_cache_limit() -> i32 { let result = kaggle::download::enforce_cache_limit_now(); @@ -310,12 +328,12 @@ pub extern "C" fn gaggle_enforce_cache_limit() -> i32 { } } -/// Check if cached dataset is the current version +/// Checks if the cached dataset is the current version. /// /// # Safety /// -/// The pointer must be valid and point to a valid NUL-terminated C string. -/// The string must be valid UTF-8; interior NULs are not allowed. +/// - The pointer must be valid and point to a valid NUL-terminated C string. +/// - The string must be valid UTF-8, and interior NUL characters are not allowed. #[no_mangle] pub unsafe extern "C" fn gaggle_is_dataset_current(dataset_path: *const c_char) -> i32 { error::clear_last_error_internal(); @@ -343,12 +361,12 @@ pub unsafe extern "C" fn gaggle_is_dataset_current(dataset_path: *const c_char) } } -/// Force update dataset to latest version (ignores cache) +/// Forces an update of the dataset to the latest version, ignoring the cache. /// /// # Safety /// -/// The pointer must be valid and point to a valid NUL-terminated C string. -/// The string must be valid UTF-8; interior NULs are not allowed. +/// - The pointer must be valid and point to a valid NUL-terminated C string. +/// - The string must be valid UTF-8, and interior NUL characters are not allowed. 
#[no_mangle] pub unsafe extern "C" fn gaggle_update_dataset(dataset_path: *const c_char) -> *mut c_char { error::clear_last_error_internal(); @@ -377,12 +395,12 @@ pub unsafe extern "C" fn gaggle_update_dataset(dataset_path: *const c_char) -> * } } -/// Get version information for a dataset +/// Retrieves version information for a dataset. /// /// # Safety /// -/// The pointer must be valid and point to a valid NUL-terminated C string. -/// The string must be valid UTF-8; interior NULs are not allowed. +/// - The pointer must be valid and point to a valid NUL-terminated C string. +/// - The string must be valid UTF-8, and interior NUL characters are not allowed. #[no_mangle] pub unsafe extern "C" fn gaggle_dataset_version_info(dataset_path: *const c_char) -> *mut c_char { error::clear_last_error_internal(); @@ -411,7 +429,7 @@ pub unsafe extern "C" fn gaggle_dataset_version_info(dataset_path: *const c_char } } -/// Get cache information +/// Retrieves information about the cache. #[no_mangle] pub extern "C" fn gaggle_get_cache_info() -> *mut c_char { let cache_dir = crate::config::cache_dir_runtime(); @@ -453,12 +471,12 @@ pub extern "C" fn gaggle_get_cache_info() -> *mut c_char { string_to_c_string(info.to_string()) } -/// Parse JSON and expand objects/arrays similar to json_each +/// Parses JSON and expands objects/arrays, similar to `json_each`. /// /// # Safety /// -/// The pointer must be valid and point to a valid NUL-terminated C string. -/// The string must be valid UTF-8; interior NULs are not allowed. +/// - The pointer must be valid and point to a valid NUL-terminated C string. +/// - The string must be valid UTF-8, and interior NUL characters are not allowed. 
#[no_mangle] pub unsafe extern "C" fn gaggle_json_each(json_str: *const c_char) -> *mut c_char { // Clear any previous error @@ -496,12 +514,12 @@ pub unsafe extern "C" fn gaggle_json_each(json_str: *const c_char) -> *mut c_cha } } -/// Prefetch multiple files in a dataset without downloading the entire archive +/// Prefetches multiple files in a dataset without downloading the entire archive. /// /// # Safety /// -/// Both pointers must be valid and point to valid NUL-terminated C strings. -/// Strings must be valid UTF-8; interior NULs are not allowed. +/// - Both pointers must be valid and point to valid NUL-terminated C strings. +/// - The strings must be valid UTF-8, and interior NUL characters are not allowed. #[no_mangle] pub unsafe extern "C" fn gaggle_prefetch_files( dataset_path: *const c_char, diff --git a/gaggle/src/kaggle/api.rs b/gaggle/src/kaggle/api.rs index 0dab841..e191132 100644 --- a/gaggle/src/kaggle/api.rs +++ b/gaggle/src/kaggle/api.rs @@ -1,3 +1,10 @@ +// api.rs +// +// This module provides the core functionality for interacting with the Kaggle API. +// It includes functions for building the HTTP client, handling API rate limiting, +// and implementing a retry mechanism for failed requests. The module is designed +// to be used by other parts of the Gaggle library that need to make API calls. + use crate::error::GaggleError; use reqwest::blocking::Client; @@ -10,7 +17,7 @@ use std::thread::sleep; use std::time::{Duration, Instant}; use tracing::{debug, trace, warn}; -/// Optional global rate limiter: enforce a minimum interval between API calls +/// An optional global rate limiter that enforces a minimum interval between API calls. static LAST_API_CALL: Lazy> = Lazy::new(|| Mutex::new(Instant::now() - Duration::from_secs(3600))); @@ -37,7 +44,9 @@ fn rate_limit_wait() { *guard = Instant::now(); } -/// Helper: get API base URL (overridable at runtime via env for testing) +/// A helper function that retrieves the API base URL. 
+/// +/// This function is overridable at runtime via an environment variable for testing purposes. pub(crate) fn get_api_base() -> String { #[cfg(test)] { @@ -57,7 +66,7 @@ pub(crate) fn get_api_base() -> String { .to_string() } -/// Helper: build a reqwest client with timeout and UA +/// A helper function that builds a `reqwest` client with a timeout and a User-Agent header. pub(crate) fn build_client() -> Result { let timeout = Duration::from_secs(crate::config::http_timeout_runtime_secs()); let ua = format!( @@ -71,6 +80,10 @@ pub(crate) fn build_client() -> Result { .build()?) } +/// A function that executes a given function with a retry mechanism. +/// +/// This function will attempt to execute the given function up to a configured number of times, +/// with an exponential backoff between attempts. pub(crate) fn with_retries(mut f: F) -> Result where F: FnMut() -> Result, diff --git a/gaggle/src/kaggle/credentials.rs b/gaggle/src/kaggle/credentials.rs index 996bd41..736f170 100644 --- a/gaggle/src/kaggle/credentials.rs +++ b/gaggle/src/kaggle/credentials.rs @@ -1,3 +1,11 @@ +// credentials.rs +// +// This module manages Kaggle API credentials for the Gaggle library. It provides +// functions for setting and retrieving credentials, with support for loading them +// from environment variables or a `kaggle.json` file. The credentials are stored +// in a thread-safe, lazily-initialized global variable to guarantee that they are +// loaded only once and can be safely accessed from multiple threads. + use crate::error::GaggleError; use parking_lot::RwLock; use std::fs; @@ -5,9 +13,12 @@ use std::fs; static CREDENTIALS: once_cell::sync::Lazy>> = once_cell::sync::Lazy::new(|| RwLock::new(None)); +/// A struct that represents Kaggle API credentials. #[derive(Clone)] pub struct KaggleCredentials { + /// The Kaggle username. pub username: String, + /// The Kaggle API key. 
pub key: String, } @@ -20,7 +31,7 @@ impl std::fmt::Debug for KaggleCredentials { } } -/// Set Kaggle API credentials +/// Sets the Kaggle API credentials. pub fn set_credentials(username: &str, key: &str) -> Result<(), GaggleError> { let mut creds = CREDENTIALS.write(); *creds = Some(KaggleCredentials { @@ -30,7 +41,7 @@ pub fn set_credentials(username: &str, key: &str) -> Result<(), GaggleError> { Ok(()) } -/// Get stored credentials or try to load from environment/file +/// Retrieves the stored credentials, or attempts to load them from the environment or a file. pub fn get_credentials() -> Result { // Check if credentials are already set in memory (fast path with read lock) if let Some(creds) = CREDENTIALS.read().as_ref() { diff --git a/gaggle/src/kaggle/download.rs b/gaggle/src/kaggle/download.rs index 09139a9..0dd9e47 100644 --- a/gaggle/src/kaggle/download.rs +++ b/gaggle/src/kaggle/download.rs @@ -1,3 +1,11 @@ +// download.rs +// +// This module provides the core functionality for downloading and managing Kaggle datasets. +// It includes functions for downloading entire datasets or single files, as well as for +// managing the local cache, including eviction of old datasets to enforce a size limit. +// The module also provides functions for listing files in a dataset and for checking if a +// cached dataset is the most recent version. + use crate::error::GaggleError; use parking_lot::Mutex; use serde::{Deserialize, Serialize}; @@ -16,9 +24,12 @@ use tracing::{debug, warn}; static DOWNLOAD_LOCKS: once_cell::sync::Lazy>> = once_cell::sync::Lazy::new(|| Mutex::new(HashMap::new())); +/// A struct that represents a file within a Kaggle dataset. #[derive(Debug, Serialize, Deserialize)] pub struct DatasetFile { + /// The name of the file. pub name: String, + /// The size of the file in bytes. 
pub size: u64, } @@ -44,12 +55,16 @@ fn list_dataset_files_from_metadata(dataset_path: &str) -> Result, } @@ -86,11 +101,13 @@ impl Drop for LockGuard { } } -/// Download a Kaggle dataset (supports version pinning) -/// Examples: -/// "owner/dataset" - downloads latest version -/// "owner/dataset@v2" - downloads version 2 -/// "owner/dataset@latest" - explicitly downloads latest +/// Downloads a Kaggle dataset, with support for version pinning. +/// +/// # Examples +/// +/// * `"owner/dataset"` - Downloads the latest version. +/// * `"owner/dataset@v2"` - Downloads version 2. +/// * `"owner/dataset@latest"` - Explicitly downloads the latest version. pub fn download_dataset(dataset_path: &str) -> Result { // Parse path to extract optional version let (owner, dataset, version) = super::parse_dataset_path_with_version(dataset_path)?; @@ -271,7 +288,7 @@ fn download_dataset_version( Ok(cache_dir) } -/// Download a single file within a Kaggle dataset into the cache without extracting the entire archive +/// Downloads a single file from a Kaggle dataset into the cache, without extracting the entire archive. pub fn download_single_file(dataset_path: &str, filename: &str) -> Result { // Validate dataset path and filename to prevent traversal let (owner, dataset) = super::parse_dataset_path(dataset_path)?; @@ -352,7 +369,7 @@ pub fn download_single_file(dataset_path: &str, filename: &str) -> Result Result { let file = fs::File::open(zip_path)?; let mut archive = @@ -462,8 +479,11 @@ pub(crate) fn extract_zip(zip_path: &Path, dest_dir: &Path) -> Result Result, GaggleError> { let (owner, dataset) = super::parse_dataset_path(dataset_path)?; let dataset_dir = crate::config::cache_dir_runtime() @@ -541,7 +561,7 @@ pub fn list_dataset_files(dataset_path: &str) -> Result, Gaggle Ok(files) } -/// Get the local path to a specific file in a dataset +/// Retrieves the local path to a specific file in a dataset. 
pub fn get_dataset_file_path(dataset_path: &str, filename: &str) -> Result { // Validate filename to prevent path traversal or absolute paths use std::path::Component; @@ -694,7 +714,7 @@ fn get_cached_datasets() -> Result, GaggleError> { Ok(datasets) } -/// Calculate total cache size in MB +/// Calculates the total size of the cache in megabytes. pub fn get_total_cache_size_mb() -> Result { let datasets = get_cached_datasets()?; Ok(datasets.iter().map(|(_, meta)| meta.size_mb).sum()) @@ -741,12 +761,12 @@ fn enforce_cache_limit() -> Result<(), GaggleError> { Ok(()) } -/// Public function to manually enforce cache limit +/// A public function that manually enforces the cache limit. pub fn enforce_cache_limit_now() -> Result<(), GaggleError> { enforce_cache_limit() } -/// Check if cached dataset is the current version +/// Checks if the cached dataset is the current version. pub fn is_dataset_current(dataset_path: &str) -> Result { let (owner, dataset) = super::parse_dataset_path(dataset_path)?; @@ -782,7 +802,7 @@ pub fn is_dataset_current(dataset_path: &str) -> Result { Ok(cached_version == current_version) } -/// Force update dataset to latest version (ignores cache) +/// Forces an update of the dataset to the latest version, ignoring the cache. pub fn update_dataset(dataset_path: &str) -> Result { let (owner, dataset) = super::parse_dataset_path(dataset_path)?; @@ -800,7 +820,7 @@ pub fn update_dataset(dataset_path: &str) -> Result { download_dataset(dataset_path) } -/// Get version information for a dataset +/// Retrieves version information for a dataset. 
pub fn get_dataset_version_info(dataset_path: &str) -> Result { let (owner, dataset) = super::parse_dataset_path(dataset_path)?; diff --git a/gaggle/src/kaggle/metadata.rs b/gaggle/src/kaggle/metadata.rs index 535d122..a336b7f 100644 --- a/gaggle/src/kaggle/metadata.rs +++ b/gaggle/src/kaggle/metadata.rs @@ -1,3 +1,11 @@ +// metadata.rs +// +// This module provides functionality for retrieving metadata about Kaggle datasets. +// It includes a struct for representing dataset information, as well as functions +// for fetching metadata from the Kaggle API and for determining the current version +// of a dataset. The module also includes a simple in-memory cache with a TTL +// to reduce the number of API calls for frequently accessed metadata. + use crate::error::GaggleError; use serde::{Deserialize, Serialize}; @@ -7,13 +15,19 @@ use parking_lot::RwLock; use std::collections::HashMap; use std::time::{Duration, Instant}; +/// A struct that represents information about a Kaggle dataset. #[derive(Debug, Serialize, Deserialize)] #[allow(dead_code)] pub struct DatasetInfo { + /// The reference path of the dataset, in the format `owner/dataset`. pub ref_path: String, + /// The title of the dataset. pub title: String, + /// The size of the dataset in bytes. pub size: u64, + /// The URL of the dataset. pub url: String, + /// The date the dataset was last updated. pub last_updated: String, } @@ -30,7 +44,7 @@ fn metadata_ttl() -> Duration { Duration::from_secs(secs) } -/// Get metadata for a specific dataset +/// Retrieves the metadata for a specific dataset. 
pub fn get_dataset_metadata(dataset_path: &str) -> Result { if crate::config::offline_mode() { return Err(GaggleError::HttpRequestError( @@ -79,7 +93,7 @@ pub fn get_dataset_metadata(dataset_path: &str) -> Result Result { if crate::config::offline_mode() { // In offline mode, try to use cached marker file version if available diff --git a/gaggle/src/kaggle/mod.rs b/gaggle/src/kaggle/mod.rs index cc39dcb..9729987 100644 --- a/gaggle/src/kaggle/mod.rs +++ b/gaggle/src/kaggle/mod.rs @@ -1,3 +1,11 @@ +// mod.rs +// +// This module serves as the main entry point for the Kaggle functionality in the Gaggle +// library. It re-exports the public functions from the other modules in this directory, +// providing a single, consistent interface for interacting with the Kaggle API. It also +// contains the core logic for parsing dataset paths, which is a critical component for +// all of the other functionality in this library. + pub mod api; pub mod credentials; pub mod download; diff --git a/gaggle/src/kaggle/search.rs b/gaggle/src/kaggle/search.rs index 2f15a48..a7804ca 100644 --- a/gaggle/src/kaggle/search.rs +++ b/gaggle/src/kaggle/search.rs @@ -1,3 +1,11 @@ +// search.rs +// +// This module provides functionality for searching for datasets on Kaggle. It +// includes a function for sending a search query to the Kaggle API and for +// returning the results as a JSON value. The module handles the details of +// building the API request, including URL encoding the query and handling +// pagination. + use crate::error::GaggleError; use super::api::{build_client, get_api_base, with_retries}; diff --git a/gaggle/src/lib.rs b/gaggle/src/lib.rs index 10840e9..721977c 100644 --- a/gaggle/src/lib.rs +++ b/gaggle/src/lib.rs @@ -21,8 +21,11 @@ use tracing_subscriber::{fmt, EnvFilter}; static LOG_INIT: OnceCell<()> = OnceCell::new(); -/// Initialize global logging based on GAGGLE_LOG_LEVEL. -/// Safe to call multiple times; only the first call has an effect. 
+/// Initializes global logging for Gaggle, governed by the `GAGGLE_LOG_LEVEL` +/// environment variable. +/// +/// This function can be safely called multiple times; however, only the first +/// invocation will have an effect. pub fn init_logging() { let _ = LOG_INIT.get_or_init(|| { let level = std::env::var("GAGGLE_LOG_LEVEL").unwrap_or_else(|_| "WARN".to_string()); diff --git a/gaggle/src/utils.rs b/gaggle/src/utils.rs index 266f46e..30fa20d 100644 --- a/gaggle/src/utils.rs +++ b/gaggle/src/utils.rs @@ -1,8 +1,11 @@ use std::fs; use std::path::Path; -/// Recursively calculate the size of a directory in bytes. -/// Follows the same semantics as previous inline helpers in ffi.rs and download.rs. +/// Recursively calculates the size of a directory in bytes. +/// +/// This function traverses the directory tree from the given path and sums the +/// sizes of all files. It follows the same semantics as the previous inline +/// helpers in `ffi.rs` and `download.rs`. pub fn calculate_dir_size(path: &Path) -> Result { let mut total = 0u64; if path.is_dir() { @@ -19,7 +22,9 @@ pub fn calculate_dir_size(path: &Path) -> Result { Ok(total) } -/// Choose DuckDB reader function name based on file extension (lowercased). +/// Selects the appropriate DuckDB reader function based on the file extension. +/// +/// The selection is case-insensitive. #[allow(dead_code)] pub fn guess_reader_for_path(path: &str) -> &'static str { let lower = path.to_ascii_lowercase(); diff --git a/gaggle/tests/integration_mock.rs b/gaggle/tests/integration_mock.rs index c1fb24e..8b8db7c 100644 --- a/gaggle/tests/integration_mock.rs +++ b/gaggle/tests/integration_mock.rs @@ -1,3 +1,11 @@ +// integration_mock.rs +// +// This integration test verifies the search and info functionalities of the Gaggle library +// using a mock HTTP server. It guarantees that the library correctly interacts with the Kaggle API +// endpoints for searching datasets and retrieving dataset information. 
The test sets up a mock +// server to simulate the Kaggle API, calls the relevant Gaggle FFI functions, and asserts +// that the functions return the expected results without errors. + use gaggle::{ gaggle_free, gaggle_get_cache_info, gaggle_get_dataset_info, gaggle_search, gaggle_set_credentials, diff --git a/gaggle/tests/mock_http.rs b/gaggle/tests/mock_http.rs index db9c2c6..9b912be 100644 --- a/gaggle/tests/mock_http.rs +++ b/gaggle/tests/mock_http.rs @@ -1,3 +1,10 @@ +// mock_http.rs +// +// This file contains integration tests for the Gaggle library that use a mock HTTP server +// to simulate interactions with the Kaggle API. These tests are designed to verify the +// behavior of Gaggle's core functionalities, such as searching, downloading, and managing +// datasets, in a controlled environment. + use mockito::{Matcher, Server}; use std::env; use std::ffi::{CStr, CString}; diff --git a/gaggle/tests/offline_mode.rs b/gaggle/tests/offline_mode.rs index 0d49939..4a09ead 100644 --- a/gaggle/tests/offline_mode.rs +++ b/gaggle/tests/offline_mode.rs @@ -1,4 +1,12 @@ -// filepath: /home/hassan/Workspace/RustRoverProjects/gaggle/gaggle/tests/offline_mode.rs +// offline_mode.rs +// +// This integration test verifies the behavior of the Gaggle library when operating in +// offline mode. The test verifies that dataset downloads fail as expected when the +// dataset is not already cached and that version information is handled correctly +// when the library cannot access the network. By setting the `GAGGLE_OFFLINE` +// environment variable, the test simulates a scenario with no internet connectivity +// and asserts that the library's FFI functions behave as designed in this context. 
+ use std::ffi::CString; #[test] diff --git a/gaggle/tests/property_parse_dataset_path.rs b/gaggle/tests/property_parse_dataset_path.rs index 6d2833c..cc568bf 100644 --- a/gaggle/tests/property_parse_dataset_path.rs +++ b/gaggle/tests/property_parse_dataset_path.rs @@ -1,3 +1,11 @@ +// property_parse_dataset_path.rs +// +// This file contains property-based tests for the `parse_dataset_path` function in the Gaggle +// library. Using the `proptest` framework, these tests generate a wide range of string inputs +// to verify that the parser correctly handles valid dataset path formats and rejects invalid +// ones. The primary goal of these tests is to guarantee the robustness and correctness of the +// dataset path parsing logic, which is a critical component for interacting with the Kaggle API. + use proptest::prelude::*; proptest! { diff --git a/gaggle/tests/replacement_scan.rs b/gaggle/tests/replacement_scan.rs index 19ef11c..8b95bef 100644 --- a/gaggle/tests/replacement_scan.rs +++ b/gaggle/tests/replacement_scan.rs @@ -1,3 +1,12 @@ +// replacement_scan.rs +// +// This integration test verifies the replacement scan functionality of the Gaggle DuckDB +// extension. The test guarantees that the extension can correctly intercept queries for tables +// with the "kaggle:" prefix, resolve the dataset path to a local cache, and read data from +// CSV and JSON files using DuckDB's readers. The test sets up a mock cached dataset, runs +// DuckDB with a SQL script that queries the dataset, and validates the results to confirm +// that the replacement scan is working as expected. 
+ use std::fs; use std::path::PathBuf; use std::process::Command; diff --git a/gaggle/tests/replacement_scan_excel.rs b/gaggle/tests/replacement_scan_excel.rs index 684e1ea..900c7e3 100644 --- a/gaggle/tests/replacement_scan_excel.rs +++ b/gaggle/tests/replacement_scan_excel.rs @@ -1,4 +1,11 @@ -// filepath: /home/hassan/Workspace/RustRoverProjects/gaggle/gaggle/tests/replacement_scan_excel.rs +// replacement_scan_excel.rs +// +// This integration test is designed to verify the replacement scan functionality of the +// Gaggle DuckDB extension specifically for Excel (.xlsx) files. The test checks if the +// extension correctly identifies and processes queries for tables with the "kaggle:" prefix +// that point to Excel files. It sets up a mock cached dataset containing a placeholder .xlsx +// file and then attempts to query it using DuckDB. + use std::path::PathBuf; use std::process::Command; diff --git a/gaggle/tests/test_error_recovery.rs b/gaggle/tests/test_error_recovery.rs index e35b784..a02ea80 100644 --- a/gaggle/tests/test_error_recovery.rs +++ b/gaggle/tests/test_error_recovery.rs @@ -1,3 +1,11 @@ +// test_error_recovery.rs +// +// This file contains integration tests focused on the error recovery mechanisms of the Gaggle +// library. The tests are designed to make sure that the library correctly handles and reports +// errors across its FFI boundary. This includes verifying that errors are properly set when +// invalid operations are performed, that they can be cleared using the provided FFI functions, +// and that subsequent operations can be executed successfully after an error has been cleared. 
+ use gaggle::{gaggle_clear_last_error, gaggle_last_error}; use gaggle::{gaggle_download_dataset, gaggle_search, gaggle_set_credentials}; use std::ffi::{CStr, CString}; diff --git a/gaggle/tests/test_regression.rs b/gaggle/tests/test_regression.rs index e9c1bda..4cb92ec 100644 --- a/gaggle/tests/test_regression.rs +++ b/gaggle/tests/test_regression.rs @@ -1,5 +1,10 @@ -// Regression tests for bugs fixed in Gaggle -// These tests make sure previously fixed bugs don't reoccur +// test_regression.rs +// +// This file contains regression tests for the Gaggle library. The purpose of these tests is to +// guarantee that bugs that have been fixed in the past do not reoccur in future versions of the +// library. Each test in this suite is designed to replicate the conditions that caused a +// specific bug, and then verify that the bug is no longer present. This is a critical part +// of maintaining the long-term stability and reliability of the Gaggle library. use gaggle::{gaggle_clear_last_error, gaggle_last_error}; use gaggle::{gaggle_download_dataset, gaggle_free, gaggle_set_credentials}; diff --git a/gaggle/tests/test_security.rs b/gaggle/tests/test_security.rs index 7028669..58053cd 100644 --- a/gaggle/tests/test_security.rs +++ b/gaggle/tests/test_security.rs @@ -1,3 +1,12 @@ +// test_security.rs +// +// This file contains security-focused tests for the Gaggle library. The tests are designed to +// verify that the library is robust against various types of malicious inputs and that it +// correctly handles concurrent access to shared resources. The tests cover a range of potential +// vulnerabilities, including path traversal, null byte injection, and the handling of special +// characters in dataset paths. Additionally, the tests verify the thread safety of the library +// by simulating concurrent access to credential settings and cache information. 
+ use gaggle::parse_dataset_path; #[test] From d2ce1c512f36d1bd2ba58c43b7d3ed32b14c4d2b Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Tue, 4 Nov 2025 12:22:54 +0100 Subject: [PATCH 7/8] WIP --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8710706..9d0c40e 100644 --- a/README.md +++ b/README.md @@ -39,9 +39,9 @@ In essence, Gaggle makes DuckDB into a SQL-enabled frontend for Kaggle datasets. - Provides a simple API to interact with Kaggle datasets from DuckDB - Allows you to search, download, and read datasets from Kaggle - Supports datasets that contain CSV, Parquet, JSON, and XLSX files +- Supports dataset updates and versioning - Configurable and has built-in caching support to avoid re-downloading - Thread-safe, fast, and has a low memory footprint -- Supports dataset updates and versioning See the [ROADMAP.md](ROADMAP.md) for the list of implemented and planned features. From f9a92a0dbf6a3a7033ee4078bd0d5c5c2d87d1cc Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Tue, 4 Nov 2025 12:24:52 +0100 Subject: [PATCH 8/8] WIP --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9d0c40e..1a3ae43 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ select * from gaggle_ls('habedi/flickr-8k-dataset-clean') limit 5; -- Read a Parquet file from local cache using a prepared statement --- (DuckDB doesn't support subquery in function arguments, so we use a prepared statement) +-- (DuckDB doesn't allow the use of subqueries in function arguments, so we use a prepared statement) prepare rp as select * from read_parquet(?) limit 10; execute rp(gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'));