From 02dbd9c90a2fae32b47c93d7bcab73722f1f8a9f Mon Sep 17 00:00:00 2001
From: Hassan Abedi
Date: Sun, 2 Nov 2025 23:15:22 +0100
Subject: [PATCH 1/4] The base commit

---
 README.md                            | 20 ++------------------
 gaggle/Cargo.toml                    |  2 +-
 gaggle/bindings/gaggle_extension.cpp |  2 +-
 gaggle/src/kaggle/api.rs             |  6 +++---
 4 files changed, 7 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 418a84b..9be8d85 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,7 @@ This workflow can quickly become complex, especially when working with multiple
frequently.
Gaggle tries to help simplify this process by hiding the complexity and letting you work with datasets
directly inside an analytical database like DuckDB that can handle fast queries.
+In essence, Gaggle makes DuckDB into a SQL-enabled frontend for Kaggle datasets.

### Features

@@ -95,14 +96,11 @@ make release
-- Load the Gaggle extension (only needed if you built from source)
--load 'build/release/extension/gaggle/gaggle.duckdb_extension';

--- Manually, set your Kaggle credentials (or use `~/.kaggle/kaggle.json`)
-select gaggle_set_credentials('your-username', 'your-api-key');
-
-- Get extension version
select gaggle_version();

-- List files in the dataset
--- (Note that if the datasets is not downloaded, it will be downloaded and cached automatically)
+-- (Note that if the dataset is not downloaded, it will be downloaded and cached)
select *
from gaggle_ls('habedi/flickr-8k-dataset-clean')
limit 5;

@@ -115,25 +113,11 @@ execute rp(gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'
select count(*)
from 'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet';

--- Or glob Parquet files in a dataset directory
-select count(*)
-from 'kaggle:habedi/flickr-8k-dataset-clean/*.parquet';
-
-- Optionally, we check cache info
select gaggle_cache_info();

--- Clear cache and enforce cache size limit manually
-select gaggle_clear_cache();
-select gaggle_enforce_cache_limit();
-
-- Check if cached dataset is current (is newest version?)
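+-- (Returns false if the dataset is not cached locally or if a newer
+-- version exists on Kaggle)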
select gaggle_is_current('habedi/flickr-8k-dataset-clean');
-
--- Force update to latest version if needed
---select gaggle_update_dataset('habedi/flickr-8k-dataset-clean');
-
--- Download specific version (version pinning for reproducibility)
---select gaggle_download('habedi/flickr-8k-dataset-clean@v2');
```

[![Simple Demo 1](https://asciinema.org/a/745806.svg)](https://asciinema.org/a/745806)

diff --git a/gaggle/Cargo.toml b/gaggle/Cargo.toml
index e8217f5..54eaa81 100644
--- a/gaggle/Cargo.toml
+++ b/gaggle/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "gaggle"
-version = "0.1.0-alpha.1"
+version = "0.1.0-alpha.2"
edition = "2021"
publish = false

diff --git a/gaggle/bindings/gaggle_extension.cpp b/gaggle/bindings/gaggle_extension.cpp
index 4e4c3dc..e71280f 100644
--- a/gaggle/bindings/gaggle_extension.cpp
+++ b/gaggle/bindings/gaggle_extension.cpp
@@ -740,7 +740,7 @@ static void LoadInternal(ExtensionLoader &loader) {
// Provide out-of-line definitions for the extension class
void GaggleExtension::Load(ExtensionLoader &loader) { LoadInternal(loader); }
std::string GaggleExtension::Name() { return "gaggle"; }
-std::string GaggleExtension::Version() const { return std::string("0.1.0-alpha.1"); }
+std::string GaggleExtension::Version() const { return std::string("0.1.0-alpha.2"); }

} // namespace duckdb

diff --git a/gaggle/src/kaggle/api.rs b/gaggle/src/kaggle/api.rs
index d99f8f1..95cdee8 100644
--- a/gaggle/src/kaggle/api.rs
+++ b/gaggle/src/kaggle/api.rs
@@ -165,7 +165,7 @@ mod tests {

#[test]
fn test_with_retries_exponential_backoff() {
- env::set_var("GAGGLE_HTTP_RETRY_DELAY", "0.01");
+ env::set_var("GAGGLE_HTTP_RETRY_DELAY", "0.05");
env::set_var("GAGGLE_HTTP_RETRY_MAX_DELAY", "0.1");

let start = std::time::Instant::now();
@@ -180,8 +180,8 @@ mod tests {
});

let elapsed = start.elapsed();
- // Should have some delay between retries (at least ~10ms)
- assert!(elapsed.as_millis() >= 10);
+ // Retry delay is configured at ~50ms; assert a looser 20ms lower bound to avoid timing flakiness
+ assert!(elapsed.as_millis() >= 20);

env::remove_var("GAGGLE_HTTP_RETRY_DELAY");
env::remove_var("GAGGLE_HTTP_RETRY_MAX_DELAY");

From cf24e93b2ea986757b5794d714a50b051fe9c400 Mon Sep 17 00:00:00 2001
From: Hassan Abedi
Date: Mon, 3 Nov 2025 12:43:09 +0100
Subject: [PATCH 2/4] Fix linter warnings

---
 .github/workflows/tests.yml   |  2 +-
 gaggle/Cargo.toml             |  1 -
 gaggle/src/kaggle/download.rs | 20 +++++++++++++++-----
 gaggle/src/kaggle/metadata.rs |  9 ++-------
 gaggle/src/kaggle/search.rs   | 14 ++++----------
 5 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index ae4424a..32fef03 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -59,7 +59,7 @@ jobs:
continue-on-error: false

sqllogictest:
- name: Sqllogicest Tests
+ name: Sqllogictest Tests
runs-on: ubuntu-latest
steps:
- name: Checkout Code

diff --git a/gaggle/Cargo.toml b/gaggle/Cargo.toml
index 54eaa81..9bc23d7 100644
--- a/gaggle/Cargo.toml
+++ b/gaggle/Cargo.toml
@@ -24,7 +24,6 @@ dirs = "6.0"
urlencoding = "2.1"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
-is-terminal = "0.4"

[dev-dependencies]
tempfile = "3.10"

diff --git a/gaggle/src/kaggle/download.rs b/gaggle/src/kaggle/download.rs
index 6ac3c43..960454e 100644
--- a/gaggle/src/kaggle/download.rs
+++ b/gaggle/src/kaggle/download.rs
@@ -531,6 +531,11 @@ pub fn is_dataset_current(dataset_path: &str) -> Result {

// Get current version from Kaggle
let current_version =
super::metadata::get_current_version(dataset_path)?; + // If we cannot determine current version, conservatively report not current + if current_version == "unknown" { + return Ok(false); + } + Ok(cached_version == current_version) } @@ -579,10 +584,15 @@ pub fn get_dataset_version_info(dataset_path: &str) -> Result { - panic!("Should not have path validation error") - } - _ => {} // HTTP or credentials error expected - } + if let Err(GaggleError::InvalidDatasetPath(_)) = result { + panic!("Should not have path validation error"); } std::env::remove_var("KAGGLE_USERNAME"); diff --git a/gaggle/src/kaggle/search.rs b/gaggle/src/kaggle/search.rs index b9fa45c..2f15a48 100644 --- a/gaggle/src/kaggle/search.rs +++ b/gaggle/src/kaggle/search.rs @@ -122,11 +122,8 @@ mod tests { } Err(e) => { // Should be HTTP error, not validation error - match e { - GaggleError::InvalidDatasetPath(_) => { - panic!("Should not have validation error for valid params") - } - _ => {} // HTTP or credentials error is expected + if let GaggleError::InvalidDatasetPath(_) = e { + panic!("Should not have validation error for valid params"); } } } @@ -147,11 +144,8 @@ mod tests { Ok(_) => {} // Succeeded with real credentials Err(e) => { // Should not be a validation error - match e { - GaggleError::InvalidDatasetPath(_) => { - panic!("Should not have validation error for page=1, size=1") - } - _ => {} // HTTP or credentials error is OK + if let GaggleError::InvalidDatasetPath(_) = e { + panic!("Should not have validation error for page=1, size=1"); } } } From 98a428367ac1e28b8c56fec3770cc0ed60a29b34 Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Mon, 3 Nov 2025 12:59:11 +0100 Subject: [PATCH 3/4] Improve the storage handling --- README.md | 12 +- ROADMAP.md | 4 +- docs/CONFIGURATION.md | 171 ++++++++------- docs/ERROR_CODES.md | 219 +++---------------- docs/README.md | 115 ++++------ gaggle/bindings/include/rust.h | 188 +++-------------- gaggle/src/config.rs | 23 ++ gaggle/src/ffi.rs | 331 +++++++++++++++++------------ gaggle/src/kaggle/api.rs | 40 +++- gaggle/src/kaggle/credentials.rs | 22 +- gaggle/src/kaggle/download.rs | 350 +++++++++++++++++++++++++++---- gaggle/src/kaggle/metadata.rs | 29 +++ gaggle/src/kaggle/mod.rs | 63 ++++-- gaggle/src/lib.rs | 3 +- gaggle/tests/mock_http.rs | 167 +++++++++++++++ 15 files changed, 1034 insertions(+), 703 deletions(-) diff --git a/README.md b/README.md index 9be8d85..54bdc54 100644 --- a/README.md +++ b/README.md @@ -18,9 +18,10 @@ Access and query Kaggle datasets from DuckDB --- -Gaggle is a DuckDB extension that allows you to work with Kaggle datasets directly in SQL queries, as if -they were DuckDB tables. -It is written in Rust and uses the Kaggle API to search, download, and manage the datasets. +Gaggle is a DuckDB extension that allows you to work with [Kaggle datasets](https://www.kaggle.com/datasets) +directly in SQL queries, as if they were DuckDB tables. +It is written in Rust and uses the [Kaggle API](https://www.kaggle.com/docs/api) +to search, download, and manage the datasets. Kaggle hosts a large collection of very useful datasets for data science and machine learning. 
Accessing these datasets typically involves manually downloading a dataset (as a ZIP file), @@ -93,9 +94,6 @@ make release #### Trying Gaggle ```sql --- Load the Gaggle extension (only needed if you built from source) ---load 'build/release/extension/gaggle/gaggle.duckdb_extension'; - -- Get extension version select gaggle_version(); @@ -105,7 +103,7 @@ select * from gaggle_ls('habedi/flickr-8k-dataset-clean') limit 5; -- Read a Parquet file from local cache using a prepared statement --- (Note that DuckDB doesn't support subquery in function arguments, so we use a prepared statement) +-- (DuckDB doesn't support subquery in function arguments, so we use a prepared statement) prepare rp as select * from read_parquet(?) limit 10; execute rp(gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet')); diff --git a/ROADMAP.md b/ROADMAP.md index cef6b1d..aa81799 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -6,7 +6,7 @@ It outlines features to be implemented and their current status. > [!IMPORTANT] > This roadmap is a work in progress and is subject to change. -### 1. Kaggle API Integration +### 1. Kaggle API * **Authentication** * [x] Set Kaggle API credentials programmatically. @@ -57,7 +57,7 @@ It outlines features to be implemented and their current status. * [ ] Incremental cache updates. * [ ] Background cache synchronization. -### 5. Error Handling and Resilience +### 5. Error Handling * **Error Messages** * [x] Clear error messages for invalid credentials. diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 03f855d..ec3b08d 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -1,22 +1,22 @@ -## Gaggle's Configuration Guide +### Gaggle's Configuration Guide Gaggle supports configuration via environment variables to customize its behavior without code changes. 
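+
+For a quick start (a minimal sketch; both variables are documented below), set the
+variables in the shell that will launch DuckDB:
+
+```bash
+export GAGGLE_CACHE_DIR="/var/cache/gaggle"
+export GAGGLE_HTTP_TIMEOUT=120
+./build/release/duckdb
+```
+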
-### Environment Variables +#### Environment Variables -#### Cache Configuration +##### Cache Configuration -##### GAGGLE_CACHE_DIR +###### GAGGLE_CACHE_DIR - **Description**: Directory path for caching downloaded Kaggle datasets - **Type**: String (path) -- **Default**: `$XDG_CACHE_HOME/gaggle` (typically `~/.cache/gaggle`) +- **Default**: `$XDG_CACHE_HOME/gaggle` (normally `~/.cache/gaggle`) - **Example**: ```bash export GAGGLE_CACHE_DIR="/var/cache/gaggle" ``` -##### GAGGLE_CACHE_SIZE_LIMIT_MB +###### GAGGLE_CACHE_SIZE_LIMIT_MB - **Description**: Maximum cache size in megabytes for downloaded datasets - **Type**: Integer (megabytes) or "unlimited" @@ -26,31 +26,31 @@ Gaggle supports configuration via environment variables to customize its behavio automatically evicted using LRU (Least Recently Used) policy - **Example**: ```bash - # Set to 50GB + ## Set to 50GB export GAGGLE_CACHE_SIZE_LIMIT_MB=51200 - # Set to 5GB + ## Set to 5GB export GAGGLE_CACHE_SIZE_LIMIT_MB=5120 - # Set unlimited cache + ## Set unlimited cache export GAGGLE_CACHE_SIZE_LIMIT_MB=unlimited ``` -##### GAGGLE_CACHE_HARD_LIMIT +###### GAGGLE_CACHE_HARD_LIMIT - **Description**: Enable hard limit mode (prevents downloads when cache limit would be exceeded) -- **Type**: Boolean (accepts: true, yes, 1 for hard limit; false, no, 0 for soft limit) +- **Type**: Boolean (true/yes/1 or false/no/0) - **Default**: `false` (soft limit) - **Status**: ✅ Implemented - **Example**: ```bash - # Enable hard limit (prevents downloads when cache is full) + ## Enable hard limit (prevents downloads when cache is full) export GAGGLE_CACHE_HARD_LIMIT=true ``` -#### HTTP Configuration +##### HTTP Configuration -##### GAGGLE_HTTP_TIMEOUT +###### GAGGLE_HTTP_TIMEOUT - **Description**: HTTP request timeout in seconds for Kaggle API requests - **Type**: Integer (seconds) @@ -60,18 +60,18 @@ Gaggle supports configuration via environment variables to customize its behavio export GAGGLE_HTTP_TIMEOUT=120 ``` -##### GAGGLE_API_BASE +###### GAGGLE_API_BASE - **Description**: Override the Kaggle API base URL (primarily for testing/mocking) - **Type**: String (URL) - **Default**: `https://www.kaggle.com/api/v1` - **Example**: ```bash - # Point requests to a local mock server + ## Point requests to a local mock server export GAGGLE_API_BASE=http://127.0.0.1:12345 ``` -##### HTTP Retry Controls +###### HTTP Retry Controls - **GAGGLE_HTTP_RETRY_ATTEMPTS** - **Description**: Number of retry attempts after the initial try @@ -88,7 +88,29 @@ Gaggle supports configuration via environment variables to customize its behavio These controls enable exponential backoff with cap across metadata/search/download requests. -#### Download Coordination +###### GAGGLE_API_MIN_INTERVAL_MS + +- **Description**: Optional client-side rate limiting. Enforces a minimum interval between HTTP calls. +- **Type**: Integer (milliseconds) +- **Default**: `0` (disabled) +- **Example**: + ```bash + export GAGGLE_API_MIN_INTERVAL_MS=200 ## max 5 calls/sec per process + ``` + +##### Metadata Caching + +###### GAGGLE_METADATA_TTL + +- **Description**: In-memory cache TTL for dataset metadata responses. +- **Type**: Integer (seconds) +- **Default**: `600` (10 minutes) +- **Example**: + ```bash + export GAGGLE_METADATA_TTL=300 + ``` + +##### Download Coordination When multiple queries attempt to download the same dataset concurrently, Gaggle coordinates using an in-process lock. These settings control the wait behavior when a download is already in progress. 
@@ -99,28 +121,24 @@ These settings control the wait behavior when a download is already in progress. - **Default**: `30` - **Example**: ```bash - export GAGGLE_DOWNLOAD_WAIT_TIMEOUT=600 # 10 minutes + export GAGGLE_DOWNLOAD_WAIT_TIMEOUT=600 ## 10 minutes ``` - **GAGGLE_DOWNLOAD_WAIT_POLL** - **Description**: Polling interval while waiting (seconds) - **Type**: Float or integer (seconds) - **Default**: `0.1` -#### Logging Configuration +##### Logging Configuration -##### GAGGLE_VERBOSE +###### GAGGLE_VERBOSE - **Description**: Enable verbose logging (boolean) -- **Type**: Boolean (accepts: 1, true, yes, on, 0, false, no, off) +- **Type**: Boolean (1/true/yes/on or 0/false/no/off) - **Default**: `false` -- **Example**: - ```bash - export GAGGLE_VERBOSE=1 - ``` -##### GAGGLE_LOG_LEVEL +###### GAGGLE_LOG_LEVEL -- **Description**: Set logging level for structured logs emitted by the Rust core (via `tracing`) +- **Description**: Set logging level for structured logs emitted by the Rust core (via tracing) - **Type**: String (`ERROR`, `WARN`, `INFO`, `DEBUG`, `TRACE`); case-insensitive - **Default**: `WARN` - **Status**: ✅ Implemented @@ -134,7 +152,7 @@ These settings control the wait behavior when a download is already in progress. is called). The environment variable is read once per process. - Logs include a level prefix and optional ANSI colors if stderr is a terminal. -#### Offline Mode +##### Offline Mode - **GAGGLE_OFFLINE** - **Description**: Disable network access. When enabled, operations that require network will fail fast unless data @@ -142,101 +160,108 @@ These settings control the wait behavior when a download is already in progress. - **Type**: Boolean (`1`, `true`, `yes`, `on` to enable) - **Default**: `false` - **Effects**: - - `gaggle_download(...)` fails if the dataset isn’t cached. - - `gaggle_version_info` reports `latest_version` as "unknown" if no cache metadata exists. - - `gaggle_is_current` and other version checks use cached `.downloaded` metadata when available. - - `gaggle_search` and `gaggle_info` also fail fast in offline mode (no network attempts). + - Downloads fail if dataset isn’t cached. + - Search and metadata fetch fail fast. + - Version checks use cached .downloaded metadata when available; otherwise latest_version becomes "unknown". + - **Example**: ```bash export GAGGLE_OFFLINE=1 ``` -### Usage Examples +##### On-Demand Download Behavior + +- **GAGGLE_STRICT_ONDEMAND** + - **Description**: When enabled, `gaggle_get_file_path` will NOT fall back to a full dataset download if the single-file request fails. 
+ - **Type**: Boolean (`1`, `true`, `yes`, `on` to enable) + - **Default**: `false` + +#### Usage Examples -#### Example 1: Custom Cache Directory +##### Example 1: Custom Cache Directory ```bash -## Set custom cache directory +### Set custom cache directory export GAGGLE_CACHE_DIR="/mnt/fast-ssd/kaggle-cache" -## Start DuckDB +### Start DuckDB ./build/release/duckdb -## Check configuration +### Check configuration SELECT gaggle_search('iris', 1, 10); ``` -#### Example 2: Larger Cache for Big Datasets +##### Example 2: Larger Cache for Big Datasets ```bash -# Set cache to 50GB for large datasets +## Set cache to 50GB for large datasets export GAGGLE_CACHE_SIZE_LIMIT_MB=51200 -# Download and query large Kaggle datasets +## Download and query large Kaggle datasets ./build/release/duckdb ``` -#### Example 3: Production Configuration +##### Example 3: Production Configuration ```bash -# Complete production configuration +## Complete production configuration export GAGGLE_CACHE_DIR="/var/lib/gaggle/cache" -export GAGGLE_CACHE_SIZE_LIMIT_MB=51200 # 50GB -export GAGGLE_HTTP_TIMEOUT=120 # 2 minutes -export GAGGLE_HTTP_RETRY_ATTEMPTS=5 # Retry up to 5 times -export GAGGLE_HTTP_RETRY_DELAY=2 # 2 second initial delay -export GAGGLE_HTTP_RETRY_MAX_DELAY=30 # Cap backoff at 30s -export GAGGLE_LOG_LEVEL=WARN # Production logging - -## Set Kaggle credentials +export GAGGLE_CACHE_SIZE_LIMIT_MB=51200 ## 50GB +export GAGGLE_HTTP_TIMEOUT=120 ## 2 minutes +export GAGGLE_HTTP_RETRY_ATTEMPTS=5 ## Retry up to 5 times +export GAGGLE_HTTP_RETRY_DELAY=2 ## 2 second initial delay +export GAGGLE_HTTP_RETRY_MAX_DELAY=30 ## Cap backoff at 30s +export GAGGLE_LOG_LEVEL=WARN ## Production logging + +### Set Kaggle credentials export KAGGLE_USERNAME="your-username" export KAGGLE_KEY="your-api-key" -## Run DuckDB with Gaggle +### Run DuckDB with Gaggle ./build/release/duckdb ``` -#### Example 4: Development/Debug Configuration +##### Example 4: Development/Debug Configuration ```bash -## Development setup with verbose logging +### Development setup with verbose logging export GAGGLE_CACHE_DIR="./dev-cache" -export GAGGLE_LOG_LEVEL=DEBUG ## Detailed debug logs -export GAGGLE_HTTP_TIMEOUT=10 ## Shorter timeout for dev -export GAGGLE_HTTP_RETRY_ATTEMPTS=1 ## Fail fast in development -export GAGGLE_HTTP_RETRY_DELAY=0.25 ## Quick retry (250ms) +export GAGGLE_LOG_LEVEL=DEBUG ### Detailed debug logs +export GAGGLE_HTTP_TIMEOUT=10 ### Shorter timeout for dev +export GAGGLE_HTTP_RETRY_ATTEMPTS=1 ### Fail fast in development +export GAGGLE_HTTP_RETRY_DELAY=0.25 ### Quick retry (250ms) -## Run DuckDB +### Run DuckDB ./build/release/duckdb ``` -#### Example 5: Slow Network Configuration +##### Example 5: Slow Network Configuration ```bash -## Configuration for slow or unreliable networks -export GAGGLE_HTTP_TIMEOUT=300 ## 5 minute timeout -export GAGGLE_HTTP_RETRY_ATTEMPTS=10 ## Many retries -export GAGGLE_HTTP_RETRY_DELAY=5 ## 5 second initial delay -export GAGGLE_HTTP_RETRY_MAX_DELAY=60 ## Cap at 60s +### Configuration for slow or unreliable networks +export GAGGLE_HTTP_TIMEOUT=300 ### 5 minute timeout +export GAGGLE_HTTP_RETRY_ATTEMPTS=10 ### Many retries +export GAGGLE_HTTP_RETRY_DELAY=5 ### 5 second initial delay +export GAGGLE_HTTP_RETRY_MAX_DELAY=60 ### Cap at 60s ./build/release/duckdb ``` -#### Example 6: Offline Mode +##### Example 6: Offline Mode ```bash -# Enable offline mode +## Enable offline mode export GAGGLE_OFFLINE=1 -# Attempt to download a dataset (will fail if not cached) +## Attempt to download a dataset (will 
fail if not cached) SELECT gaggle_download('username/dataset-name'); -# Querying metadata or searching will fail fast in offline mode +## Querying metadata or searching will fail fast in offline mode SELECT gaggle_info('username/dataset-name'); SELECT gaggle_search('keyword', 1, 10); ``` -### Configuration Verification +#### Configuration Verification You can verify your configuration at runtime: @@ -258,16 +283,16 @@ SELECT gaggle_info('username/dataset-name'); SELECT gaggle_last_error(); ``` -### Retry Policy Details +#### Retry Policy Details Gaggle implements retries with exponential backoff for HTTP requests. The number of attempts, initial delay, and maximum delay can be tuned with the environment variables above. -### Logging Levels +#### Logging Levels Detailed logging control via `GAGGLE_LOG_LEVEL` is implemented. -### Units +#### Units - Storage sizes are reported in megabytes (MB) throughout the API and SQL functions. - Timeouts and retry delays are configured in seconds via environment variables with clean names (no unit suffixes). For diff --git a/docs/ERROR_CODES.md b/docs/ERROR_CODES.md index c1126f0..e991ae4 100644 --- a/docs/ERROR_CODES.md +++ b/docs/ERROR_CODES.md @@ -1,14 +1,10 @@ -# Gaggle Error Codes Reference +### Overview -**Version:** 0.1.0 -**Date:** November 2, 2025 +Gaggle uses standardized error codes to make error handling more predictable and debugging easier. +Each error includes a numeric code (E001 to E010) that can be used programmatically. +When troubleshooting, look for the bracketed code (like [E003]) and refer to the corresponding section below. -## Overview - -Gaggle uses standardized error codes to make error handling more predictable and debugging easier. Each error includes a -numeric code (E001-E010) that can be used programmatically. - -## Error Code Format +#### Error Code Format All errors follow this format: @@ -22,9 +18,9 @@ Example: [E002] Dataset not found: owner/invalid-dataset ``` -## Error Codes +#### Error Codes -### E001 - Invalid Credentials +##### E001 - Invalid Credentials **Category:** Authentication **Code:** `E001` @@ -48,37 +44,13 @@ Kaggle API credentials are invalid, missing, or incorrectly formatted. **Solutions:** -1. **Set credentials via SQL:** - ```sql - SELECT gaggle_set_credentials('your-username', 'your-api-key'); - ``` - -2. **Set environment variables:** - ```bash - export KAGGLE_USERNAME=your-username - export KAGGLE_KEY=your-api-key - ``` - -3. **Create kaggle.json file:** - ```bash - mkdir -p ~/.kaggle - cat > ~/.kaggle/kaggle.json << EOF - { - "username": "your-username", - "key": "your-api-key" - } - EOF - chmod 600 ~/.kaggle/kaggle.json - ``` - -4. **Get your API key from Kaggle:** - - Go to https://www.kaggle.com/settings/account - - Click "Create New API Token" - - Download kaggle.json +- Set credentials via SQL: `select gaggle_set_credentials('your-username', 'your-api-key');` +- Or via env: `export KAGGLE_USERNAME=...` and `export KAGGLE_KEY=...` +- Or create `~/.kaggle/kaggle.json` with username/key (chmod 600) --- -### E002 - Dataset Not Found +##### E002 - Dataset Not Found **Category:** Dataset **Code:** `E002` @@ -108,7 +80,7 @@ The requested dataset does not exist on Kaggle or is not accessible. 2. **Search for the dataset:** ```sql - SELECT gaggle_search('dataset keywords', 1, 10); + select gaggle_search('dataset keywords', 1, 10); ``` 3. **Check dataset availability:** @@ -117,7 +89,7 @@ The requested dataset does not exist on Kaggle or is not accessible. 
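+   For example (a sketch; the dataset reference is hypothetical), the bracketed
+   code in the last error message shows whether the dataset was reachable:
+
+   ```sql
+   select gaggle_info('owner/possibly-private-dataset');
+   select gaggle_last_error();
+   -- "[E002] Dataset not found: owner/possibly-private-dataset"
+   ```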
--- -### E003 - Network Error +##### E003 - Network Error **Category:** Network **Code:** `E003` @@ -131,7 +103,7 @@ Network error occurred during communication with Kaggle API. - No internet connection - Kaggle API is down - Firewall blocking requests -- Timeout +- A timeout happened - Rate limiting **Example:** @@ -153,8 +125,7 @@ Network error occurred during communication with Kaggle API. ``` 3. **Check Kaggle API status:** - - Visit https://www.kaggle.com - - Check https://status.kaggle.com (if available) + - Check https://www.kaggle.com is accessible 4. **Retry with backoff:** ```bash @@ -169,7 +140,7 @@ Network error occurred during communication with Kaggle API. --- -### E004 - Invalid Path +##### E004 - Invalid Path **Category:** Validation **Code:** `E004` @@ -212,7 +183,7 @@ owner/. # Dot component 1. **Use correct format:** ```sql - SELECT gaggle_download('owner/dataset-name'); + select gaggle_download('owner/dataset-name'); ``` 2. **Check for special characters:** @@ -221,7 +192,7 @@ owner/. # Dot component --- -### E005 - File System Error +##### E005 - File System Error **Category:** I/O **Code:** `E005` @@ -259,7 +230,7 @@ Error reading from or writing to the file system. 3. **Verify cache directory:** ```sql - SELECT gaggle_cache_info(); + select gaggle_cache_info(); ``` 4. **Change cache directory:** @@ -269,12 +240,12 @@ Error reading from or writing to the file system. 5. **Clean up cache:** ```sql - SELECT gaggle_clear_cache(); + select gaggle_clear_cache(); ``` --- -### E006 - JSON Error +##### E006 - JSON Error **Category:** Serialization **Code:** `E006` @@ -300,12 +271,12 @@ Error parsing or serializing JSON data. 1. **Clear cache:** ```sql - SELECT gaggle_clear_cache(); + select gaggle_clear_cache(); ``` 2. **Re-download dataset:** ```sql - SELECT gaggle_update_dataset('owner/dataset'); + select gaggle_update_dataset('owner/dataset'); ``` 3. **Check Kaggle API response manually:** @@ -315,7 +286,7 @@ Error parsing or serializing JSON data. --- -### E007 - ZIP Extraction Error +##### E007 - ZIP Extraction Error **Category:** Archive **Code:** `E007` @@ -342,12 +313,12 @@ Error extracting downloaded ZIP file. 1. **Re-download dataset:** ```sql - SELECT gaggle_update_dataset('owner/dataset'); + select gaggle_update_dataset('owner/dataset'); ``` 2. **Check dataset size:** ```sql - SELECT gaggle_info('owner/dataset'); + select gaggle_info('owner/dataset'); ``` 3. **For large datasets:** @@ -361,7 +332,7 @@ Error extracting downloaded ZIP file. --- -### E008 - CSV Parsing Error +##### E008 - CSV Parsing Error **Category:** Parsing **Code:** `E008` @@ -392,13 +363,13 @@ Error parsing CSV file format. 2. **Use DuckDB's flexible CSV reader:** ```sql - SELECT * FROM read_csv_auto('kaggle:owner/dataset/file.csv', + select * FROM read_csv_auto('kaggle:owner/dataset/file.csv', ignore_errors := true); ``` 3. **Try different parser options:** ```sql - SELECT * FROM read_csv('kaggle:owner/dataset/file.csv', + select * FROM read_csv('kaggle:owner/dataset/file.csv', delim := ';', quote := '"', escape := '\\'); @@ -406,7 +377,7 @@ Error parsing CSV file format. --- -### E009 - UTF-8 Encoding Error +##### E009 - UTF-8 Encoding Error **Category:** Encoding **Code:** `E009` @@ -442,12 +413,12 @@ String is not valid UTF-8. 3. 
**Use DuckDB encoding options:** ```sql - SELECT * FROM read_csv('file.csv', encoding := 'ISO-8859-1'); + select * FROM read_csv('file.csv', encoding := 'ISO-8859-1'); ``` --- -### E010 - Null Pointer Error +##### E010 - Null Pointer Error **Category:** FFI **Code:** `E010` @@ -473,129 +444,3 @@ NULL pointer passed to FFI function. - This is typically an internal error - Report as a bug if you encounter this - Include reproduction steps - ---- - -## Programmatic Error Handling - -### In Rust - -```rust -use gaggle::error::{GaggleError, ErrorCode}; - -match gaggle::download_dataset("owner/dataset") { -Ok(path) => println ! ("Downloaded to: {:?}", path), -Err(e) => { -match e.code() { -ErrorCode::E001_InvalidCredentials => { -// Handle authentication error -eprintln ! ("Please set Kaggle credentials"); -} -ErrorCode::E002_DatasetNotFound => { -// Handle missing dataset -eprintln ! ("Dataset not found, trying alternative..."); -} -ErrorCode::E003_NetworkError => { -// Handle network error -eprintln ! ("Network error, retrying..."); -} -_ => { -// Handle other errors -eprintln ! ("Error: {}", e); -} -} -} -} -``` - -### In SQL - -```sql --- Check last error after failure -SELECT gaggle_download('owner/invalid'); -- This fails -SELECT gaggle_last_error(); --- Get error message with code - --- Example output: --- "[E002] Dataset not found: owner/invalid" -``` - -### Parsing Error Codes in Application - -```python -# Python example -error_msg = execute_sql("SELECT gaggle_last_error()") - -if "[E001]" in error_msg: - # Handle credentials error - setup_credentials() -elif "[E002]" in error_msg: - # Handle dataset not found - search_alternative_dataset() -elif "[E003]" in error_msg: - # Handle network error - retry_with_backoff() -``` - -## Error Recovery Strategies - -### Transient Errors (Retry) - -- **E003** - Network errors (automatic retry with backoff) -- **E006** - JSON errors (may be temporary API issue) - -### Configuration Errors (User Action Required) - -- **E001** - Invalid credentials -- **E004** - Invalid path format - -### Resource Errors (Check System) - -- **E005** - I/O errors (disk space, permissions) -- **E007** - ZIP errors (space, corruption) - -### Data Errors (Check Dataset) - -- **E002** - Dataset not found -- **E008** - CSV parsing errors - -## Best Practices - -1. **Always check error codes in production:** - ```sql - SELECT CASE - WHEN gaggle_is_current('owner/dataset') THEN 'OK' - ELSE gaggle_last_error() - END; - ``` - -2. **Log errors with codes:** - - Include error code in logs - - Helps with debugging and monitoring - -3. **Implement retry logic for transient errors:** - - E003 (Network) - retry with exponential backoff - - E006 (JSON) - retry once or twice - -4. **Alert on specific error codes:** - - E001 (Credentials) - immediate alert - - E002 (Not Found) - dataset issue alert - -5. 
**Document error codes in your application:** - - Link to this reference - - Provide context-specific solutions - -## Changelog - -### Version 0.1.0 (November 2, 2025) - -- Initial error code implementation -- 10 error codes defined (E001-E010) -- All error messages updated with codes - -## See Also - -- [TROUBLESHOOTING.md](TROUBLESHOOTING.md) - Troubleshooting guide -- [FAQ.md](FAQ.md) - Frequently asked questions -- [CONFIGURATION.md](CONFIGURATION.md) - Configuration options -- [README.md](../README.md) - Main documentation diff --git a/docs/README.md b/docs/README.md index e54648b..e195c84 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,23 +2,25 @@ The table below includes the information about all SQL functions exposed by Gaggle. -| # | Function | Return Type | Description | -|----|:----------------------------------------------------------------|:-----------------|:--------------------------------------------------------------------------------------------------------------------------| -| 1 | `gaggle_set_credentials(username VARCHAR, key VARCHAR)` | `BOOLEAN` | Sets Kaggle API credentials from SQL (alternatively use env vars or `~/.kaggle/kaggle.json`). Returns `true` on success. | -| 2 | `gaggle_download(dataset_path VARCHAR)` | `VARCHAR` | Downloads a Kaggle dataset to the local cache directory and returns the local dataset path. Idempotent. | -| 3 | `gaggle_search(query VARCHAR, page INTEGER, page_size INTEGER)` | `VARCHAR (JSON)` | Searches Kaggle datasets and returns a JSON array. Constraints: `page >= 1`, `1 <= page_size <= 100`. | -| 4 | `gaggle_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns metadata for a dataset as JSON (for example: title, url, last_updated). | -| 5 | `gaggle_version()` | `VARCHAR` | Returns the extension version string (for example: `"0.1.0"`). | -| 6 | `gaggle_clear_cache()` | `BOOLEAN` | Clears the dataset cache directory. Returns `true` on success. | -| 7 | `gaggle_cache_info()` | `VARCHAR (JSON)` | Returns cache info JSON with `path`, `size_mb`, `limit_mb`, `usage_percent`, `is_soft_limit`, and `type` fields. | -| 8 | `gaggle_enforce_cache_limit()` | `BOOLEAN` | Manually enforces cache size limit using LRU eviction. Returns `true` on success. (Automatic with soft limit by default). | -| 9 | `gaggle_is_current(dataset_path VARCHAR)` | `BOOLEAN` | Checks if cached dataset is the latest version from Kaggle. Returns `false` if not cached or outdated. | -| 10 | `gaggle_update_dataset(dataset_path VARCHAR)` | `VARCHAR` | Forces update to latest version (ignores cache). Returns local path to freshly downloaded dataset. | -| 11 | `gaggle_version_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns version info: `cached_version`, `latest_version`, `is_current`, `is_cached`. | -| 12 | `gaggle_json_each(json VARCHAR)` | `VARCHAR` | Expands a JSON object/array into newline-delimited JSON rows with fields: `key`, `value`, `type`, `path`. | -| 13 | `gaggle_file_path(dataset_path VARCHAR, filename VARCHAR)` | `VARCHAR` | Resolves a specific file's local path inside a downloaded dataset. | +| # | Function | Return Type | Description | +|----|:----------------------------------------------------------------|:-----------------|:------------------------------------------------------------------------------------------------------------------------------------------------| +| 1 | `gaggle_set_credentials(username VARCHAR, key VARCHAR)` | `BOOLEAN` | Sets Kaggle API credentials from SQL (alternatively use env vars or `~/.kaggle/kaggle.json`). 
Returns `true` on success. | +| 2 | `gaggle_download(dataset_path VARCHAR)` | `VARCHAR` | Downloads a Kaggle dataset to the local cache directory and returns the local dataset path. This function is idempotent. | +| 3 | `gaggle_search(query VARCHAR, page INTEGER, page_size INTEGER)` | `VARCHAR (JSON)` | Searches Kaggle datasets and returns a JSON array. Constraints: `page >= 1`, `1 <= page_size <= 100`. | +| 4 | `gaggle_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns metadata for a dataset as JSON (for example: `title`, `url`, `last_updated`). | +| 5 | `gaggle_version()` | `VARCHAR` | Returns the extension version string (for example: `"0.1.0"`). | +| 6 | `gaggle_clear_cache()` | `BOOLEAN` | Clears the dataset cache directory. Returns `true` on success. | +| 7 | `gaggle_cache_info()` | `VARCHAR (JSON)` | Returns cache info JSON with `path`, `size_mb`, `limit_mb`, `usage_percent`, `is_soft_limit`, and `type` fields. | +| 8 | `gaggle_enforce_cache_limit()` | `BOOLEAN` | Manually enforces cache size limit using LRU eviction. Returns `true` on success. (Automatic with soft limit by default). | +| 9 | `gaggle_is_current(dataset_path VARCHAR)` | `BOOLEAN` | Checks if cached dataset is the latest version from Kaggle. Returns `false` if not cached or outdated. | +| 10 | `gaggle_update_dataset(dataset_path VARCHAR)` | `VARCHAR` | Forces update to latest version (ignores cache). Returns local path to freshly downloaded dataset. | +| 11 | `gaggle_version_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns version info: `cached_version`, `latest_version`, `is_current`, `is_cached`. | +| 12 | `gaggle_json_each(json VARCHAR)` | `VARCHAR` | Expands a JSON object into newline-delimited JSON rows with fields: `key`, `value`, `type`, `path`. Users normally shouldn't use this function. | +| 13 | `gaggle_file_path(dataset_path VARCHAR, filename VARCHAR)` | `VARCHAR` | Resolves a specific file's local path inside a downloaded dataset. | > [!NOTE] +> The `gaggle_file_path` function will fetch the file into the cache if it is missing; set `GAGGLE_STRICT_ONDEMAND=1` to prevent fallback to a full dataset download on failures. +> > Dataset paths must be in the form `owner/dataset` where `owner` is the username and `dataset` is the dataset name on > Kaggle. > For example: `habedi/flickr-8k-dataset-clean`. @@ -27,9 +29,9 @@ The table below includes the information about all SQL functions exposed by Gagg Table function: -| # | Function | Return Type | Description | -|----|:----------------------------------|:-------------------------------------------------|:-------------------------------------------------------------------------------| -| 14 | `gaggle_ls(dataset_path VARCHAR)` | `TABLE(name VARCHAR, size BIGINT, path VARCHAR)` | Lists files (non-recursive) in the dataset's local directory; `size` is in MB. | +| # | Function | Return Type | Description | +|----|:----------------------------------|:-------------------------------------------------|:---------------------------------------------------------------------------------| +| 14 | `gaggle_ls(dataset_path VARCHAR)` | `TABLE(name VARCHAR, size BIGINT, path VARCHAR)` | Lists files (non-recursively) in the dataset's local directory; `size` is in MB. 
| Replacement scan (transparent table read): @@ -38,8 +40,8 @@ Replacement scan (transparent table read): - Reader is chosen by extension: - `.parquet`/`.parq` -> `read_parquet` - `.json`/`.jsonl`/`.ndjson` -> `read_json_auto` - - `.xlsx` -> `read_excel` (requires DuckDB to be built with the Excel reader) - - everything else -> `read_csv_auto` + - `.xlsx` -> `read_excel` + - for everything else -> `read_csv_auto` --- @@ -49,24 +51,24 @@ Replacement scan (transparent table read): ```sql -- Load the Gaggle extension -load -'build/release/extension/gaggle/gaggle.duckdb_extension'; +load 'build/release/extension/gaggle/gaggle.duckdb_extension'; --- Set Kaggle credentials (or rely on env vars / ~/.kaggle/kaggle.json) +-- Set Kaggle credentials (or read fron environment variables or from `~/.kaggle/kaggle.json` file) select gaggle_set_credentials('your-username', 'your-api-key'); -- Check version select gaggle_version(); --- Search datasets (JSON string) +-- Search datasets (returns a JSON array) +-- (This function is disabled in offline mode (when GAGGLE_OFFLINE=1)) select gaggle_search('iris', 1, 5); --- Disabled in offline mode (GAGGLE_OFFLINE=1) -- Download a dataset and get its local path select gaggle_download('uciml/iris') as local_path; --- Get dataset metadata (JSON) -select gaggle_info('uciml/iris') as dataset_metadata; -- Disabled in offline mode (GAGGLE_OFFLINE=1) +-- Get dataset metadata (as a JSON object) +-- (This function is disabled in offline mode (when GAGGLE_OFFLINE=1)) +select gaggle_info('uciml/iris') as dataset_metadata; ``` #### Reading Data @@ -86,7 +88,7 @@ execute rp(gaggle_file_path('owner/dataset', 'file.parquet')); ``` ```sql --- Replacement scan: read a single Parquet file via Kaggle: URL +-- Replacement scan: read a single Parquet file via `kaggle:` URL scheme select count(*) from 'kaggle:owner/dataset/file.parquet'; @@ -121,20 +123,21 @@ select * from 'kaggle:owner/dataset@v5/*.parquet'; -- Smart download: update only if outdated -select CASE - WHEN gaggle_is_current('owner/dataset') THEN gaggle_download('owner/dataset') - ELSE gaggle_update_dataset('owner/dataset') - END as path; +select case + when gaggle_is_current('owner/dataset') then gaggle_download('owner/dataset') + else gaggle_update_dataset('owner/dataset') + end as path; ``` #### Utility Functions ```sql --- Purge cache and inspect info +-- Purge cache and see cache info select gaggle_clear_cache(); select gaggle_cache_info(); --- Manually enforce cache size limit (LRU eviction of oldest datasets) +-- Manually enforce cache size limit +-- (Automatic enforcement is done with a soft limit by default and older files are removed first) select gaggle_enforce_cache_limit(); -- Expand JSON into newline-delimited rows @@ -147,14 +150,11 @@ select json_extract_string(value, '$.ref') as ref, from json_each((select j from s)) limit 5; ``` -> [!IMPORTANT] -> When you use a dataset, you read and process its files locally. Only use datasets from trusted sources. - --- ### Building Gaggle from Source -To build Gaggle from source, you need GNU Make, CMake, a C++ compiler (GCC/Clang), Rust and Cargo. +To build Gaggle from source, you need GNU Make, CMake, a modern C++ compiler (like GCC or Clang), Rust and Cargo. 1. **Clone the repository:** ```bash @@ -182,40 +182,7 @@ To build Gaggle from source, you need GNU Make, CMake, a C++ compiler (GCC/Clang --- -### Configuration - -See [CONFIGURATION.md](CONFIGURATION.md) for full details. 
Key environment variables: - -- `GAGGLE_CACHE_DIR` — cache directory path (default: `~/.cache/gaggle`) -- `GAGGLE_HTTP_TIMEOUT` — HTTP timeout (in seconds) -- `GAGGLE_HTTP_RETRY_ATTEMPTS` — retry attempts after the initial try -- `GAGGLE_HTTP_RETRY_DELAY` — initial backoff delay (in seconds) -- `GAGGLE_HTTP_RETRY_MAX_DELAY` — maximum backoff delay cap (in seconds) -- `GAGGLE_LOG_LEVEL` — structured log level for the Rust core (e.g., `INFO`, `DEBUG`) -- `GAGGLE_OFFLINE` — disable network; only use cached data (downloads fail fast if not cached) -- `KAGGLE_USERNAME` and `KAGGLE_KEY` — Kaggle credentials (alternative to the SQL call) - -> [!NOTE] -> Environment variables are case-sensitive on Unix-like systems. Changes take effect for subsequent operations in the -> same process. - -#### Units - -- Storage sizes are reported in megabytes (MB) across SQL/API (for example: `gaggle_cache_info()` returns `size_mb`). -- Timeouts and retry delays are configured in seconds (via clean environment variables without unit suffixes). - -### Replacement Scan Readers - -Gaggle selects the DuckDB reader based on file extension: - -- `.parquet`/`.parq` -> `read_parquet` -- `.json`/`.jsonl`/`.ndjson` -> `read_json_auto` -- `.xlsx` -> `read_excel` (requires DuckDB to be built with the Excel reader) -- everything else -> `read_csv_auto` - ---- - -### Architecture +### Architecture Overview Gaggle is made up of two main components: @@ -224,9 +191,9 @@ Gaggle is made up of two main components: - HTTP client with timeout and exponential backoff - Dataset download with safe ZIP extraction and file resolution - Search and metadata requests - - C-compatible FFI surface + - A few C-compatible FFI functions for use by DuckDB 2. **C++ DuckDB Bindings (`gaggle/bindings/`)** that: - - Defines the custom SQL functions (for example: `gaggle_ls`, `gaggle_file_path`, `gaggle_search`) + - Defines the custom SQL functions (for example: `gaggle_ls`, `gaggle_file_path`, and `gaggle_search`) - Integrates with DuckDB’s extension system and replacement scans (`'kaggle:...'`) - Marshals values between DuckDB vectors and the Rust FFI diff --git a/gaggle/bindings/include/rust.h b/gaggle/bindings/include/rust.h index bfec325..ab1670c 100644 --- a/gaggle/bindings/include/rust.h +++ b/gaggle/bindings/include/rust.h @@ -52,39 +52,28 @@ extern "C" { /** * Set Kaggle API credentials * - * # Arguments + * Arguments: + * - `username`: non-null pointer to a NUL-terminated C string + * - `key`: non-null pointer to a NUL-terminated C string * - * * `username` - A pointer to a null-terminated C string representing the Kaggle username. - * * `key` - A pointer to a null-terminated C string representing the Kaggle API key. + * Returns 0 on success, -1 on failure (call gaggle_last_error). * - * # Returns - * - * * `0` on success. - * * `-1` on failure. Call `gaggle_last_error()` to get a descriptive error message. - * - * # Safety - * - * * The `username` and `key` pointers must not be null. - * * The memory pointed to by `username` and `key` must be valid, null-terminated C strings. + * Safety: + * - The pointers must be valid and remain alive for the duration of this call. + * - Strings must be valid UTF-8; interior NULs are not allowed. */ int32_t gaggle_set_credentials(const char *username, const char *key); /** * Download a Kaggle dataset and return its local cache path * - * # Arguments - * - * * `dataset_path` - A pointer to a null-terminated C string representing the dataset path (e.g., "owner/dataset-name"). 
+ * Arguments: + * - `dataset_path`: non-null pointer to a NUL-terminated C string "owner/dataset[[@vN|@latest]]". * - * # Returns - * - * A pointer to a null-terminated C string containing the local path, or NULL on failure. - * The caller must free this pointer using `gaggle_free()`. - * - * # Safety + * Returns pointer to a heap-allocated C string. Free with gaggle_free(). On error, returns NULL and sets gaggle_last_error. * - * * The `dataset_path` pointer must not be null. - * * The memory pointed to by `dataset_path` must be a valid, null-terminated C string. + * Safety: + * - The pointer must be valid and the string valid UTF-8; interior NULs are not allowed. */ char *gaggle_download_dataset(const char *dataset_path); @@ -92,208 +81,83 @@ char *gaggle_download_dataset(const char *dataset_path); /** * Get the local path to a specific file in a downloaded dataset * - * # Arguments - * - * * `dataset_path` - A pointer to a null-terminated C string representing the dataset path. - * * `filename` - A pointer to a null-terminated C string representing the filename. - * - * # Returns - * - * A pointer to a null-terminated C string containing the file path, or NULL on failure. - * The caller must free this pointer using `gaggle_free()`. - * - * # Safety - * - * * The pointers must not be null. - * * The memory pointed to must be valid, null-terminated C strings. + * Arguments: + * - `dataset_path`: non-null pointer to owner/dataset + * - `filename`: non-null pointer to relative filename within the dataset */ char *gaggle_get_file_path(const char *dataset_path, const char *filename); /** * List files in a Kaggle dataset - * - * # Arguments - * - * * `dataset_path` - A pointer to a null-terminated C string representing the dataset path. - * - * # Returns - * - * A pointer to a null-terminated C string containing JSON array of files, or NULL on failure. - * The caller must free this pointer using `gaggle_free()`. - * - * # Safety - * - * * The `dataset_path` pointer must not be null. - * * The memory pointed to by `dataset_path` must be a valid, null-terminated C string. */ char *gaggle_list_files(const char *dataset_path); /** * Search for Kaggle datasets - * - * # Arguments - * - * * `query` - A pointer to a null-terminated C string representing the search query. - * * `page` - Page number (1-indexed). - * * `page_size` - Number of results per page. - * - * # Returns - * - * A pointer to a null-terminated C string containing JSON search results, or NULL on failure. - * The caller must free this pointer using `gaggle_free()`. - * - * # Safety - * - * * The `query` pointer must not be null. - * * The memory pointed to by `query` must be a valid, null-terminated C string. */ char *gaggle_search(const char *query, int32_t page, int32_t page_size); /** * Get metadata for a specific Kaggle dataset - * - * # Arguments - * - * * `dataset_path` - A pointer to a null-terminated C string representing the dataset path. - * - * # Returns - * - * A pointer to a null-terminated C string containing JSON metadata, or NULL on failure. - * The caller must free this pointer using `gaggle_free()`. - * - * # Safety - * - * * The `dataset_path` pointer must not be null. - * * The memory pointed to by `dataset_path` must be a valid, null-terminated C string. */ char *gaggle_get_dataset_info(const char *dataset_path); /** * Get version information - * - * # Returns - * - * A pointer to a null-terminated C string containing the version string (e.g., "0.1.0"). 
- * The caller must free this pointer using `gaggle_free()`. */ char *gaggle_get_version(void); /** * Frees a heap-allocated C string * - * # Safety - * - * The `ptr` must be a non-null pointer to a C string that was previously allocated - * by a Gaggle function. + * Safety: + * - `ptr` must be a pointer previously returned by a Gaggle FFI function that transfers ownership + * (e.g., gaggle_get_version, gaggle_list_files, etc.). + * - Passing the same pointer twice, or a pointer not allocated by Gaggle, results in undefined behavior. */ - void gaggle_free(char *ptr); + +void gaggle_free(char *ptr); /** * Clear the dataset cache - * - * # Returns - * - * * `0` on success. - * * `-1` on failure. */ int32_t gaggle_clear_cache(void); /** * Enforce cache size limit by evicting oldest datasets - * - * # Returns - * - * * `0` on success. - * * `-1` on failure. */ int32_t gaggle_enforce_cache_limit(void); /** * Check if cached dataset is the current version - * - * # Arguments - * - * * `dataset_path` - A pointer to a null-terminated C string representing the dataset path. - * - * # Returns - * - * * `1` if cached version is current. - * * `0` if cached version is outdated or not cached. - * * `-1` on error. - * - * # Safety - * - * * The `dataset_path` pointer must not be null. - * * The memory pointed to by `dataset_path` must be a valid, null-terminated C string. */ int32_t gaggle_is_dataset_current(const char *dataset_path); /** * Force update dataset to latest version (ignores cache) - * - * # Arguments - * - * * `dataset_path` - A pointer to a null-terminated C string representing the dataset path. - * - * # Returns - * - * A pointer to a null-terminated C string containing the local path, or NULL on failure. - * The caller must free this pointer using `gaggle_free()`. - * - * # Safety - * - * * The `dataset_path` pointer must not be null. - * * The memory pointed to by `dataset_path` must be a valid, null-terminated C string. */ char *gaggle_update_dataset(const char *dataset_path); /** * Get version information for a dataset - * - * # Arguments - * - * * `dataset_path` - A pointer to a null-terminated C string representing the dataset path. - * - * # Returns - * - * A pointer to a null-terminated C string containing JSON version info, or NULL on failure. - * The caller must free this pointer using `gaggle_free()`. - * - * # Safety - * - * * The `dataset_path` pointer must not be null. - * * The memory pointed to by `dataset_path` must be a valid, null-terminated C string. */ char *gaggle_dataset_version_info(const char *dataset_path); /** * Get cache information - * - * # Returns - * - * A pointer to a null-terminated C string containing JSON cache info. - * The caller must free this pointer using `gaggle_free()`. */ char *gaggle_get_cache_info(void); /** * Parse JSON and expand objects/arrays similar to json_each - * - * # Arguments - * - * * `json_str` - A pointer to a null-terminated C string containing JSON data - * - * # Returns - * - * A pointer to a null-terminated C string containing newline-delimited JSON objects - * - * # Safety - * - * * The `json_str` pointer must not be null. - * * The memory pointed to by `json_str` must be a valid, null-terminated C string. 
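+ * Returns newline-delimited JSON rows as a heap-allocated C string; free with gaggle_free().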
*/ char *gaggle_json_each(const char *json_str); +/** + * Prefetch multiple files in a dataset without downloading the entire archive + */ + char *gaggle_prefetch_files(const char *dataset_path, const char *file_list); + #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/gaggle/src/config.rs b/gaggle/src/config.rs index bf89ee3..a0a8ecc 100644 --- a/gaggle/src/config.rs +++ b/gaggle/src/config.rs @@ -206,6 +206,15 @@ pub fn offline_mode() -> bool { .unwrap_or(false) } +/// Whether strict on-demand mode is enabled. When true, gaggle_get_file_path will NOT fall back to +/// full dataset download if single-file fetch fails. +pub fn strict_on_demand() -> bool { + std::env::var("GAGGLE_STRICT_ONDEMAND") + .ok() + .map(|v| matches!(v.to_lowercase().as_str(), "1" | "true" | "yes" | "on")) + .unwrap_or(false) +} + #[cfg(test)] mod tests { use super::*; @@ -533,4 +542,18 @@ mod tests { assert!(!offline_mode()); std::env::remove_var("GAGGLE_OFFLINE"); } + + #[test] + #[serial] + fn test_strict_on_demand_env_parsing() { + std::env::remove_var("GAGGLE_STRICT_ONDEMAND"); + assert!(!strict_on_demand()); + std::env::set_var("GAGGLE_STRICT_ONDEMAND", "1"); + assert!(strict_on_demand()); + std::env::set_var("GAGGLE_STRICT_ONDEMAND", "true"); + assert!(strict_on_demand()); + std::env::set_var("GAGGLE_STRICT_ONDEMAND", "off"); + assert!(!strict_on_demand()); + std::env::remove_var("GAGGLE_STRICT_ONDEMAND"); + } } diff --git a/gaggle/src/ffi.rs b/gaggle/src/ffi.rs index d3b397f..1bf477a 100644 --- a/gaggle/src/ffi.rs +++ b/gaggle/src/ffi.rs @@ -13,20 +13,15 @@ pub extern "C" fn gaggle_init_logging() { /// Set Kaggle API credentials /// -/// # Arguments +/// Arguments: +/// - `username`: non-null pointer to a NUL-terminated C string +/// - `key`: non-null pointer to a NUL-terminated C string /// -/// * `username` - A pointer to a null-terminated C string representing the Kaggle username. -/// * `key` - A pointer to a null-terminated C string representing the Kaggle API key. -/// -/// # Returns -/// -/// * `0` on success. -/// * `-1` on failure. Call `gaggle_last_error()` to get a descriptive error message. +/// Returns 0 on success, -1 on failure (call gaggle_last_error). /// /// # Safety -/// -/// * The `username` and `key` pointers must not be null. -/// * The memory pointed to by `username` and `key` must be valid, null-terminated C strings. +/// - The pointers must be valid and remain alive for the duration of this call. +/// - Strings must be valid UTF-8; interior NULs are not allowed. #[no_mangle] pub unsafe extern "C" fn gaggle_set_credentials( username: *const c_char, @@ -42,6 +37,14 @@ pub unsafe extern "C" fn gaggle_set_credentials( let username_str = CStr::from_ptr(username).to_str()?; let key_str = CStr::from_ptr(key).to_str()?; + // Input length guardrails to avoid accidental huge strings + const MAX_LEN: usize = 8192; + if username_str.len() > MAX_LEN || key_str.len() > MAX_LEN { + return Err(error::GaggleError::InvalidDatasetPath( + "input too long".to_string(), + )); + } + kaggle::credentials::set_credentials(username_str, key_str)?; Ok(()) })(); @@ -57,19 +60,13 @@ pub unsafe extern "C" fn gaggle_set_credentials( /// Download a Kaggle dataset and return its local cache path /// -/// # Arguments -/// -/// * `dataset_path` - A pointer to a null-terminated C string representing the dataset path (e.g., "owner/dataset-name"). -/// -/// # Returns +/// Arguments: +/// - `dataset_path`: non-null pointer to a NUL-terminated C string "owner/dataset[[@vN|@latest]]". 
/// -/// A pointer to a null-terminated C string containing the local path, or NULL on failure. -/// The caller must free this pointer using `gaggle_free()`. +/// Returns pointer to a heap-allocated C string. Free with gaggle_free(). On error, returns NULL and sets gaggle_last_error. /// /// # Safety -/// -/// * The `dataset_path` pointer must not be null. -/// * The memory pointed to by `dataset_path` must be a valid, null-terminated C string. +/// - The pointer must be valid and the string valid UTF-8; interior NULs are not allowed. #[no_mangle] pub unsafe extern "C" fn gaggle_download_dataset(dataset_path: *const c_char) -> *mut c_char { // Clear any previous error @@ -80,6 +77,11 @@ pub unsafe extern "C" fn gaggle_download_dataset(dataset_path: *const c_char) -> return Err(error::GaggleError::NullPointer); } let path_str = CStr::from_ptr(dataset_path).to_str()?; + if path_str.len() > 4096 { + return Err(error::GaggleError::InvalidDatasetPath( + "dataset path too long".to_string(), + )); + } let local_path = kaggle::download_dataset(path_str)?; Ok(local_path.to_string_lossy().to_string()) @@ -96,20 +98,14 @@ pub unsafe extern "C" fn gaggle_download_dataset(dataset_path: *const c_char) -> /// Get the local path to a specific file in a downloaded dataset /// -/// # Arguments -/// -/// * `dataset_path` - A pointer to a null-terminated C string representing the dataset path. -/// * `filename` - A pointer to a null-terminated C string representing the filename. -/// -/// # Returns -/// -/// A pointer to a null-terminated C string containing the file path, or NULL on failure. -/// The caller must free this pointer using `gaggle_free()`. +/// Arguments: +/// - `dataset_path`: non-null pointer to owner/dataset +/// - `filename`: non-null pointer to relative filename within the dataset /// /// # Safety /// -/// * The pointers must not be null. -/// * The memory pointed to must be valid, null-terminated C strings. +/// Both pointers must be valid and point to valid NUL-terminated C strings. +/// Strings must be valid UTF-8; interior NULs are not allowed. #[no_mangle] pub unsafe extern "C" fn gaggle_get_file_path( dataset_path: *const c_char, @@ -124,6 +120,11 @@ pub unsafe extern "C" fn gaggle_get_file_path( } let path_str = CStr::from_ptr(dataset_path).to_str()?; let filename_str = CStr::from_ptr(filename).to_str()?; + if path_str.len() > 4096 || filename_str.len() > 4096 { + return Err(error::GaggleError::InvalidDatasetPath( + "input too long".to_string(), + )); + } let file_path = kaggle::get_dataset_file_path(path_str, filename_str)?; Ok(file_path.to_string_lossy().to_string()) @@ -140,19 +141,10 @@ pub unsafe extern "C" fn gaggle_get_file_path( /// List files in a Kaggle dataset /// -/// # Arguments -/// -/// * `dataset_path` - A pointer to a null-terminated C string representing the dataset path. -/// -/// # Returns -/// -/// A pointer to a null-terminated C string containing JSON array of files, or NULL on failure. -/// The caller must free this pointer using `gaggle_free()`. -/// /// # Safety /// -/// * The `dataset_path` pointer must not be null. -/// * The memory pointed to by `dataset_path` must be a valid, null-terminated C string. +/// The pointer must be valid and point to a valid NUL-terminated C string. +/// The string must be valid UTF-8; interior NULs are not allowed. 
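+///
+/// Returns a heap-allocated JSON array (as a C string) listing the dataset's files,
+/// or NULL on failure; the caller must free it with `gaggle_free()`.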
 #[no_mangle]
 pub unsafe extern "C" fn gaggle_get_file_path(
     dataset_path: *const c_char,
     filename: *const c_char,
@@ -124,6 +120,11 @@ pub unsafe extern "C" fn gaggle_get_file_path(
         }
         let path_str = CStr::from_ptr(dataset_path).to_str()?;
         let filename_str = CStr::from_ptr(filename).to_str()?;
+        if path_str.len() > 4096 || filename_str.len() > 4096 {
+            return Err(error::GaggleError::InvalidDatasetPath(
+                "input too long".to_string(),
+            ));
+        }
 
         let file_path = kaggle::get_dataset_file_path(path_str, filename_str)?;
         Ok(file_path.to_string_lossy().to_string())
@@ -140,19 +141,10 @@ pub unsafe extern "C" fn gaggle_get_file_path(
 
 /// List files in a Kaggle dataset
 ///
-/// # Arguments
-///
-/// * `dataset_path` - A pointer to a null-terminated C string representing the dataset path.
-///
-/// # Returns
-///
-/// A pointer to a null-terminated C string containing JSON array of files, or NULL on failure.
-/// The caller must free this pointer using `gaggle_free()`.
-///
 /// # Safety
 ///
-/// * The `dataset_path` pointer must not be null.
-/// * The memory pointed to by `dataset_path` must be a valid, null-terminated C string.
+/// The pointer must be valid and point to a valid NUL-terminated C string.
+/// The string must be valid UTF-8; interior NULs are not allowed.
 #[no_mangle]
 pub unsafe extern "C" fn gaggle_list_files(dataset_path: *const c_char) -> *mut c_char {
     // Clear any previous error
@@ -163,6 +155,11 @@ pub unsafe extern "C" fn gaggle_list_files(dataset_path: *const c_char) -> *mut
             return Err(error::GaggleError::NullPointer);
         }
         let path_str = CStr::from_ptr(dataset_path).to_str()?;
+        if path_str.len() > 4096 {
+            return Err(error::GaggleError::InvalidDatasetPath(
+                "dataset path too long".to_string(),
+            ));
+        }
 
         let files = kaggle::list_dataset_files(path_str)?;
         let json = serde_json::to_string(&files)?;
@@ -180,21 +177,10 @@ pub unsafe extern "C" fn gaggle_list_files(dataset_path: *const c_char) -> *mut
 
 /// Search for Kaggle datasets
 ///
-/// # Arguments
-///
-/// * `query` - A pointer to a null-terminated C string representing the search query.
-/// * `page` - Page number (1-indexed).
-/// * `page_size` - Number of results per page.
-///
-/// # Returns
-///
-/// A pointer to a null-terminated C string containing JSON search results, or NULL on failure.
-/// The caller must free this pointer using `gaggle_free()`.
-///
 /// # Safety
 ///
-/// * The `query` pointer must not be null.
-/// * The memory pointed to by `query` must be a valid, null-terminated C string.
+/// The query pointer must be valid and point to a valid NUL-terminated C string.
+/// The string must be valid UTF-8; interior NULs are not allowed.
 #[no_mangle]
 pub unsafe extern "C" fn gaggle_search(
     query: *const c_char,
@@ -209,6 +195,11 @@ pub unsafe extern "C" fn gaggle_search(
         return Err(error::GaggleError::NullPointer);
     }
     let query_str = CStr::from_ptr(query).to_str()?;
+    if query_str.len() > 8192 {
+        return Err(error::GaggleError::InvalidDatasetPath(
+            "query too long".to_string(),
+        ));
+    }
 
     let results = kaggle::search_datasets(query_str, page, page_size)?;
     let json = serde_json::to_string(&results)?;
@@ -226,19 +217,10 @@ pub unsafe extern "C" fn gaggle_search(
 
 /// Get metadata for a specific Kaggle dataset
 ///
-/// # Arguments
-///
-/// * `dataset_path` - A pointer to a null-terminated C string representing the dataset path.
-///
-/// # Returns
-///
-/// A pointer to a null-terminated C string containing JSON metadata, or NULL on failure.
-/// The caller must free this pointer using `gaggle_free()`.
-///
 /// # Safety
 ///
-/// * The `dataset_path` pointer must not be null.
-/// * The memory pointed to by `dataset_path` must be a valid, null-terminated C string.
+/// The pointer must be valid and point to a valid NUL-terminated C string.
+/// The string must be valid UTF-8; interior NULs are not allowed.
 #[no_mangle]
 pub unsafe extern "C" fn gaggle_get_dataset_info(dataset_path: *const c_char) -> *mut c_char {
     // Clear any previous error
@@ -249,6 +231,11 @@ pub unsafe extern "C" fn gaggle_get_dataset_info(dataset_path: *const c_char) ->
         return Err(error::GaggleError::NullPointer);
     }
     let path_str = CStr::from_ptr(dataset_path).to_str()?;
+    if path_str.len() > 4096 {
+        return Err(error::GaggleError::InvalidDatasetPath(
+            "dataset path too long".to_string(),
+        ));
+    }
 
     let metadata = kaggle::get_dataset_metadata(path_str)?;
     let json = serde_json::to_string(&metadata)?;
@@ -265,11 +252,6 @@ pub unsafe extern "C" fn gaggle_get_dataset_info(dataset_path: *const c_char) ->
 }
 
 /// Get version information
-///
-/// # Returns
-///
-/// A pointer to a null-terminated C string containing the version string (e.g., "0.1.0").
-/// The caller must free this pointer using `gaggle_free()`.
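+///
+/// As with the other string-returning functions, the result is heap-allocated and must be
+/// released with gaggle_free().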
 #[no_mangle]
 pub extern "C" fn gaggle_get_version() -> *mut c_char {
     // Return only the version string (no JSON wrapper)
@@ -280,8 +262,9 @@
 ///
 /// # Safety
 ///
-/// The `ptr` must be a non-null pointer to a C string that was previously allocated
-/// by a Gaggle function.
+/// `ptr` must be a pointer previously returned by a Gaggle FFI function that transfers ownership
+/// (e.g., gaggle_get_version, gaggle_list_files, etc.).
+/// Passing the same pointer twice, or a pointer not allocated by Gaggle, results in undefined behavior.
 #[no_mangle]
 pub unsafe extern "C" fn gaggle_free(ptr: *mut c_char) {
     if !ptr.is_null() {
@@ -290,11 +273,6 @@ pub unsafe extern "C" fn gaggle_free(ptr: *mut c_char) {
 }
 
 /// Clear the dataset cache
-///
-/// # Returns
-///
-/// * `0` on success.
-/// * `-1` on failure.
 #[no_mangle]
 pub extern "C" fn gaggle_clear_cache() -> i32 {
     let result = (|| -> Result<(), error::GaggleError> {
@@ -319,11 +297,6 @@ pub extern "C" fn gaggle_clear_cache() -> i32 {
 }
 
 /// Enforce cache size limit by evicting oldest datasets
-///
-/// # Returns
-///
-/// * `0` on success.
-/// * `-1` on failure.
 #[no_mangle]
 pub extern "C" fn gaggle_enforce_cache_limit() -> i32 {
     let result = kaggle::download::enforce_cache_limit_now();
@@ -339,20 +312,10 @@ pub extern "C" fn gaggle_enforce_cache_limit() -> i32 {
 
 /// Check if cached dataset is the current version
 ///
-/// # Arguments
-///
-/// * `dataset_path` - A pointer to a null-terminated C string representing the dataset path.
-///
-/// # Returns
-///
-/// * `1` if cached version is current.
-/// * `0` if cached version is outdated or not cached.
-/// * `-1` on error.
-///
 /// # Safety
 ///
-/// * The `dataset_path` pointer must not be null.
-/// * The memory pointed to by `dataset_path` must be a valid, null-terminated C string.
+/// The pointer must be valid and point to a valid NUL-terminated C string.
+/// The string must be valid UTF-8; interior NULs are not allowed.
 #[no_mangle]
 pub unsafe extern "C" fn gaggle_is_dataset_current(dataset_path: *const c_char) -> i32 {
     error::clear_last_error_internal();
@@ -362,6 +325,11 @@ pub unsafe extern "C" fn gaggle_is_dataset_current(dataset_path: *const c_char)
             return Err(error::GaggleError::NullPointer);
         }
         let path_str = CStr::from_ptr(dataset_path).to_str()?;
+        if path_str.len() > 4096 {
+            return Err(error::GaggleError::InvalidDatasetPath(
+                "dataset path too long".to_string(),
+            ));
+        }
 
         kaggle::is_dataset_current(path_str)
     })();
@@ -377,19 +345,10 @@ pub unsafe extern "C" fn gaggle_is_dataset_current(dataset_path: *const c_char)
 
 /// Force update dataset to latest version (ignores cache)
 ///
-/// # Arguments
-///
-/// * `dataset_path` - A pointer to a null-terminated C string representing the dataset path.
-///
-/// # Returns
-///
-/// A pointer to a null-terminated C string containing the local path, or NULL on failure.
-/// The caller must free this pointer using `gaggle_free()`.
-///
 /// # Safety
 ///
-/// * The `dataset_path` pointer must not be null.
-/// * The memory pointed to by `dataset_path` must be a valid, null-terminated C string.
+/// The pointer must be valid and point to a valid NUL-terminated C string.
+/// The string must be valid UTF-8; interior NULs are not allowed.
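+///
+/// Unlike gaggle_download_dataset, this bypasses any cached copy and always fetches the
+/// latest version (it delegates to kaggle::update_dataset).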
 #[no_mangle]
 pub unsafe extern "C" fn gaggle_update_dataset(dataset_path: *const c_char) -> *mut c_char {
     error::clear_last_error_internal();
@@ -399,6 +358,11 @@ pub unsafe extern "C" fn gaggle_update_dataset(dataset_path: *const c_char) -> *
             return Err(error::GaggleError::NullPointer);
         }
         let path_str = CStr::from_ptr(dataset_path).to_str()?;
+        if path_str.len() > 4096 {
+            return Err(error::GaggleError::InvalidDatasetPath(
+                "dataset path too long".to_string(),
+            ));
+        }
 
         let local_path = kaggle::update_dataset(path_str)?;
         Ok(local_path.to_string_lossy().to_string())
@@ -415,19 +379,10 @@ pub unsafe extern "C" fn gaggle_update_dataset(dataset_path: *const c_char) -> *
 
 /// Get version information for a dataset
 ///
-/// # Arguments
-///
-/// * `dataset_path` - A pointer to a null-terminated C string representing the dataset path.
-///
-/// # Returns
-///
-/// A pointer to a null-terminated C string containing JSON version info, or NULL on failure.
-/// The caller must free this pointer using `gaggle_free()`.
-///
 /// # Safety
 ///
-/// * The `dataset_path` pointer must not be null.
-/// * The memory pointed to by `dataset_path` must be a valid, null-terminated C string.
+/// The pointer must be valid and point to a valid NUL-terminated C string.
+/// The string must be valid UTF-8; interior NULs are not allowed.
 #[no_mangle]
 pub unsafe extern "C" fn gaggle_dataset_version_info(dataset_path: *const c_char) -> *mut c_char {
     error::clear_last_error_internal();
@@ -437,6 +392,11 @@ pub unsafe extern "C" fn gaggle_dataset_version_info(dataset_path: *const c_char
             return Err(error::GaggleError::NullPointer);
         }
         let path_str = CStr::from_ptr(dataset_path).to_str()?;
+        if path_str.len() > 4096 {
+            return Err(error::GaggleError::InvalidDatasetPath(
+                "dataset path too long".to_string(),
+            ));
+        }
 
         let info = kaggle::get_dataset_version_info(path_str)?;
         Ok(info.to_string())
@@ -452,11 +412,6 @@ pub unsafe extern "C" fn gaggle_dataset_version_info(dataset_path: *const c_char
 }
 
 /// Get cache information
-///
-/// # Returns
-///
-/// A pointer to a null-terminated C string containing JSON cache info.
-/// The caller must free this pointer using `gaggle_free()`.
 #[no_mangle]
 pub extern "C" fn gaggle_get_cache_info() -> *mut c_char {
     let cache_dir = crate::config::cache_dir_runtime();
@@ -500,18 +455,10 @@ pub extern "C" fn gaggle_get_cache_info() -> *mut c_char {
 
 /// Parse JSON and expand objects/arrays similar to json_each
 ///
-/// # Arguments
-///
-/// * `json_str` - A pointer to a null-terminated C string containing JSON data
-///
-/// # Returns
-///
-/// A pointer to a null-terminated C string containing newline-delimited JSON objects
-///
 /// # Safety
 ///
-/// * The `json_str` pointer must not be null.
-/// * The memory pointed to by `json_str` must be a valid, null-terminated C string.
+/// The pointer must be valid and point to a valid NUL-terminated C string.
+/// The string must be valid UTF-8; interior NULs are not allowed.
 #[no_mangle]
 pub unsafe extern "C" fn gaggle_json_each(json_str: *const c_char) -> *mut c_char {
     // Clear any previous error
@@ -549,6 +496,53 @@ pub unsafe extern "C" fn gaggle_json_each(json_str: *const c_char) -> *mut c_cha
     }
 }
 
+/// Prefetch multiple files in a dataset without downloading the entire archive
+///
+/// # Safety
+///
+/// Both pointers must be valid and point to valid NUL-terminated C strings.
+/// Strings must be valid UTF-8; interior NULs are not allowed.
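+///
+/// # Example
+///
+/// An illustrative sketch mirroring this crate's own tests; the JSON shape matches the
+/// implementation below:
+///
+/// ```no_run
+/// use std::ffi::{CStr, CString};
+/// let ds = CString::new("owner/dataset").unwrap();
+/// // file_list is newline-separated; blank lines and surrounding whitespace are ignored.
+/// let files = CString::new("a.csv\nb.csv").unwrap();
+/// let ptr = unsafe { gaggle::gaggle_prefetch_files(ds.as_ptr(), files.as_ptr()) };
+/// if !ptr.is_null() {
+///     // e.g. {"dataset":"owner/dataset","files":[{"name":"a.csv","status":"ok",...}]}
+///     let json = unsafe { CStr::from_ptr(ptr) }.to_str().unwrap().to_string();
+///     unsafe { gaggle::gaggle_free(ptr) };
+///     println!("{json}");
+/// }
+/// ```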
+#[no_mangle]
+pub unsafe extern "C" fn gaggle_prefetch_files(
+    dataset_path: *const c_char,
+    file_list: *const c_char,
+) -> *mut c_char {
+    error::clear_last_error_internal();
+
+    let result = (|| -> Result<String, error::GaggleError> {
+        if dataset_path.is_null() || file_list.is_null() {
+            return Err(error::GaggleError::NullPointer);
+        }
+        let ds = CStr::from_ptr(dataset_path).to_str()?;
+        let files_str = CStr::from_ptr(file_list).to_str()?;
+        if ds.len() > 4096 || files_str.len() > 1_000_000 {
+            return Err(error::GaggleError::InvalidDatasetPath(
+                "input too long".to_string(),
+            ));
+        }
+        let files: Vec<&str> = files_str
+            .lines()
+            .map(|s| s.trim())
+            .filter(|s| !s.is_empty())
+            .collect();
+        if files.is_empty() {
+            return Err(error::GaggleError::IoError(
+                "no valid files provided".to_string(),
+            ));
+        }
+        let json_val = crate::kaggle::prefetch_files(ds, &files)?;
+        Ok(json_val.to_string())
+    })();
+
+    match result {
+        Ok(json) => string_to_c_string(json),
+        Err(e) => {
+            error::set_last_error(&e);
+            std::ptr::null_mut()
+        }
+    }
+}
+
 pub(crate) fn string_to_c_string(s: String) -> *mut c_char {
     match CString::new(s) {
         Ok(cstring) => cstring.into_raw(),
@@ -999,4 +993,75 @@ mod tests {
         assert!(msg.to_lowercase().contains("json"));
     }
 }
+
+    #[test]
+    fn test_gaggle_prefetch_files() {
+        let dataset_path = CString::new("owner/dataset").unwrap();
+        let file_list = CString::new("file1.csv\nfile2.csv").unwrap();
+
+        unsafe {
+            let result_ptr =
+                super::gaggle_prefetch_files(dataset_path.as_ptr(), file_list.as_ptr());
+            assert!(!result_ptr.is_null());
+
+            let result_str = CStr::from_ptr(result_ptr).to_str().unwrap();
+            // Should be valid JSON object with dataset and files keys
+            assert!(result_str.starts_with('{'));
+            assert!(result_str.contains("\"dataset\""));
+            assert!(result_str.contains("\"files\""));
+
+            // Free the result
+            super::gaggle_free(result_ptr);
+        }
+    }
+
+    #[test]
+    fn test_gaggle_prefetch_files_null_dataset_path() {
+        let file_list = CString::new("file1.csv\nfile2.csv").unwrap();
+
+        unsafe {
+            let result_ptr = super::gaggle_prefetch_files(std::ptr::null(), file_list.as_ptr());
+            assert!(result_ptr.is_null());
+
+            // Error should be set
+            let err_ptr = error::gaggle_last_error();
+            assert!(!err_ptr.is_null());
+            let err_str = CStr::from_ptr(err_ptr).to_str().unwrap();
+            assert!(err_str.to_lowercase().contains("null pointer"));
+        }
+    }
+
+    #[test]
+    fn test_gaggle_prefetch_files_null_file_list() {
+        let dataset_path = CString::new("owner/dataset").unwrap();
+
+        unsafe {
+            let result_ptr = super::gaggle_prefetch_files(dataset_path.as_ptr(), std::ptr::null());
+            assert!(result_ptr.is_null());
+
+            // Error should be set
+            let err_ptr = error::gaggle_last_error();
+            assert!(!err_ptr.is_null());
+            let err_str = CStr::from_ptr(err_ptr).to_str().unwrap();
+            assert!(err_str.to_lowercase().contains("null pointer"));
+        }
+    }
+
+    #[test]
+    fn test_gaggle_prefetch_files_empty_file_list() {
+        let dataset_path = CString::new("owner/dataset").unwrap();
+        let file_list = CString::new("").unwrap();
+
+        unsafe {
+            let result_ptr =
+                super::gaggle_prefetch_files(dataset_path.as_ptr(), file_list.as_ptr());
+            assert!(result_ptr.is_null());
+
+            // Error should be set (since no valid files were given)
+            let err_ptr = error::gaggle_last_error();
+            assert!(!err_ptr.is_null());
+            let err_str = CStr::from_ptr(err_ptr).to_str().unwrap();
+            assert!(err_str.contains("no valid files"));
+        }
+    }
 }
diff --git a/gaggle/src/kaggle/api.rs b/gaggle/src/kaggle/api.rs
index 95cdee8..fd765ea 100644
--- a/gaggle/src/kaggle/api.rs
+++ b/gaggle/src/kaggle/api.rs
@@ -1,13 +1,42 @@
 use crate::error::GaggleError;
 use reqwest::blocking::Client;
+use once_cell::sync::Lazy;
+use parking_lot::Mutex;
 #[cfg(test)]
 use std::cell::RefCell;
 use std::env;
 use std::thread::sleep;
-use std::time::Duration;
+use std::time::{Duration, Instant};
 use tracing::{debug, trace, warn};
 
+/// Optional global rate limiter: enforce a minimum interval between API calls
+static LAST_API_CALL: Lazy<Mutex<Instant>> =
+    Lazy::new(|| Mutex::new(Instant::now() - Duration::from_secs(3600)));
+
+fn min_interval() -> Duration {
+    let ms = env::var("GAGGLE_API_MIN_INTERVAL_MS")
+        .ok()
+        .and_then(|v| v.parse::<u64>().ok())
+        .unwrap_or(0);
+    Duration::from_millis(ms)
+}
+
+fn rate_limit_wait() {
+    let interval = min_interval();
+    if interval.as_millis() == 0 {
+        return;
+    }
+    let mut guard = LAST_API_CALL.lock();
+    let elapsed = guard.elapsed();
+    if elapsed < interval {
+        let sleep_for = interval - elapsed;
+        trace!(?sleep_for, "rate limit sleep before API call");
+        sleep(sleep_for);
+    }
+    *guard = Instant::now();
+}
+
 /// Helper: get API base URL (overridable at runtime via env for testing)
 pub(crate) fn get_api_base() -> String {
     #[cfg(test)]
@@ -54,6 +83,7 @@ where
     for i in 0..max_attempts {
         trace!(attempt = i + 1, max_attempts, "issuing HTTP call");
+        rate_limit_wait();
         match f() {
             Ok(v) => return Ok(v),
             Err(e) => {
@@ -208,4 +238,12 @@ mod tests {
         env::remove_var("GAGGLE_HTTP_RETRY_MAX_DELAY");
         env::remove_var("GAGGLE_HTTP_RETRY_ATTEMPTS");
     }
+
+    #[test]
+    fn test_rate_limit_no_sleep_when_disabled() {
+        env::remove_var("GAGGLE_API_MIN_INTERVAL_MS");
+        let start = Instant::now();
+        rate_limit_wait();
+        assert!(start.elapsed() < Duration::from_millis(5));
+    }
 }
diff --git a/gaggle/src/kaggle/credentials.rs b/gaggle/src/kaggle/credentials.rs
index c8fae7f..996bd41 100644
--- a/gaggle/src/kaggle/credentials.rs
+++ b/gaggle/src/kaggle/credentials.rs
@@ -5,12 +5,21 @@ use std::fs;
 
 static CREDENTIALS: once_cell::sync::Lazy<RwLock<Option<KaggleCredentials>>> =
     once_cell::sync::Lazy::new(|| RwLock::new(None));
 
-#[derive(Debug, Clone)]
+#[derive(Clone)]
 pub struct KaggleCredentials {
     pub username: String,
     pub key: String,
 }
 
+impl std::fmt::Debug for KaggleCredentials {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("KaggleCredentials")
+            .field("username", &self.username)
+            .field("key", &"***REDACTED***")
+            .finish()
+    }
+}
+
 /// Set Kaggle API credentials
 pub fn set_credentials(username: &str, key: &str) -> Result<(), GaggleError> {
     let mut creds = CREDENTIALS.write();
@@ -63,9 +72,9 @@ pub fn get_credentials() -> Result<KaggleCredentials, GaggleError> {
     })?;
     let mode = metadata.permissions().mode();
     if mode & 0o077 != 0 {
-        eprintln!(
-            "Warning: kaggle.json has overly permissive permissions. \
-             It should be readable only by the owner (chmod 600)."
+        tracing::warn!(
+            path = %kaggle_json_path.display(),
+            "kaggle.json has overly permissive permissions; it should be owner-readable only (chmod 600)"
         );
     }
 }
@@ -204,10 +213,13 @@ mod tests {
     fn test_credentials_debug() {
         let creds = KaggleCredentials {
             username: "user".to_string(),
-            key: "key".to_string(),
+            key: "supersecret".to_string(),
         };
         let debug_str = format!("{:?}", creds);
         assert!(debug_str.contains("KaggleCredentials"));
+        assert!(debug_str.contains("user"));
+        assert!(!debug_str.contains("supersecret"));
+        assert!(debug_str.contains("REDACTED"));
     }
 
     #[test]
diff --git a/gaggle/src/kaggle/download.rs b/gaggle/src/kaggle/download.rs
index 960454e..83cbb92 100644
--- a/gaggle/src/kaggle/download.rs
+++ b/gaggle/src/kaggle/download.rs
@@ -10,7 +10,7 @@ use std::time::{Duration, SystemTime};
 
 use super::api::{build_client, get_api_base, with_retries};
 use super::credentials::get_credentials;
-use tracing::debug;
+use tracing::{debug, warn};
 
 /// Track ongoing dataset downloads to prevent concurrent downloads of the same dataset
 static DOWNLOAD_LOCKS: once_cell::sync::Lazy<Mutex<HashMap<String, ()>>> =
@@ -22,6 +22,28 @@ pub struct DatasetFile {
     pub size: u64,
 }
 
+fn list_dataset_files_from_metadata(dataset_path: &str) -> Result<Vec<DatasetFile>, GaggleError> {
+    let meta = super::metadata::get_dataset_metadata(dataset_path)?;
+    let mut out = Vec::new();
+    if let Some(files) = meta.get("files").and_then(|v| v.as_array()) {
+        for f in files {
+            if let Some(name) = f.get("name").and_then(|n| n.as_str()) {
+                // support size keys in different schemas
+                let size = f
+                    .get("totalBytes")
+                    .and_then(|x| x.as_u64())
+                    .or_else(|| f.get("size").and_then(|x| x.as_u64()))
+                    .unwrap_or(0);
+                out.push(DatasetFile {
+                    name: name.to_string(),
+                    size,
+                });
+            }
+        }
+    }
+    Ok(out)
+}
+
 /// Metadata stored in .downloaded marker file
 #[derive(Debug, Serialize, Deserialize)]
 struct CacheMetadata {
@@ -133,6 +155,10 @@ fn download_dataset_version(
     loop {
         let mut locks = DOWNLOAD_LOCKS.lock();
+        // While holding the lock, check marker existence to avoid race
+        if marker_file.exists() {
+            return Ok(cache_dir.clone());
+        }
         if !locks.contains_key(&lock_key) {
             locks.insert(lock_key.clone(), ());
             break;
         }
@@ -152,11 +178,6 @@ fn download_dataset_version(
         }
 
         sleep(Duration::from_millis(poll_ms.max(1)));
-
-        // Check again if download completed while we waited
-        if marker_file.exists() {
-            return Ok(cache_dir);
-        }
     }
 
     // Ensure we clean up the lock when done
@@ -166,7 +187,7 @@ fn download_dataset_version(
 
     // Double-check after acquiring lock
     if marker_file.exists() {
-        return Ok(cache_dir);
+        return Ok(cache_dir.clone());
     }
 
     fs::create_dir_all(&cache_dir)?;
@@ -211,9 +232,20 @@ fn download_dataset_version(
         .map_err(|e| GaggleError::HttpRequestError(e.to_string()))?;
     writer.flush().ok();
 
-    // Extract ZIP - require at least one file extracted
-    let extracted = extract_zip(&zip_path, &cache_dir)?;
+    // Extract ZIP - require at least one file extracted; cleanup on failure
+    let extracted = match extract_zip(&zip_path, &cache_dir) {
+        Ok(n) => n,
+        Err(err) => {
+            // Best-effort cleanup of corrupt zip and partial files
+            let _ = fs::remove_file(&zip_path);
+            let _ = fs::remove_dir_all(&cache_dir);
+            return Err(err);
+        }
+    };
     if extracted == 0 {
+        // Clean up if nothing extracted
+        let _ = fs::remove_file(&zip_path);
+        let _ = fs::remove_dir_all(&cache_dir);
         return Err(GaggleError::ZipError("ZIP contained no files".to_string()));
     }
 
@@ -239,18 +271,100 @@ fn download_dataset_version(
     Ok(cache_dir)
}
 
+/// Download a single file within a Kaggle dataset into the cache without extracting the entire archive
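+///
+/// The URL shape ({base}/datasets/download/{owner}/{dataset}?fileName=...) is the one built
+/// below; it is deliberately easy to mock in tests. The filename is validated against path
+/// traversal before any network or file I/O happens.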
+pub fn download_single_file(dataset_path: &str, filename: &str) -> Result<PathBuf, GaggleError> {
+    // Validate dataset path and filename to prevent traversal
+    let (owner, dataset) = super::parse_dataset_path(dataset_path)?;
+    use std::path::Component;
+    let fname_path = Path::new(filename);
+    if fname_path.is_absolute() {
+        return Err(GaggleError::InvalidDatasetPath(
+            "Absolute filenames are not allowed".to_string(),
+        ));
+    }
+    for comp in fname_path.components() {
+        match comp {
+            Component::ParentDir | Component::RootDir | Component::Prefix(_) => {
+                return Err(GaggleError::InvalidDatasetPath(
+                    "Filename must not contain parent or root components".to_string(),
+                ));
+            }
+            _ => {}
+        }
+    }
+
+    // Offline mode: fail if file isn't already present
+    let base_dir = crate::config::cache_dir_runtime()
+        .join("datasets")
+        .join(&owner)
+        .join(&dataset);
+    let target_path = base_dir.join(fname_path);
+    if crate::config::offline_mode() {
+        if target_path.exists() {
+            return Ok(target_path);
+        }
+        return Err(GaggleError::HttpRequestError(format!(
+            "Offline mode enabled; cannot download '{}' from '{}'.",
+            filename, dataset_path
+        )));
+    }
+
+    // Ensure parent directories exist
+    if let Some(parent) = target_path.parent() {
+        fs::create_dir_all(parent)?;
+    }
+
+    // Build single-file download URL
+    // We use an endpoint shape that is easy to mock in tests and aligns with typical Kaggle CLI patterns
+    let url = format!(
+        "{}/datasets/download/{}/{}?fileName={}",
+        get_api_base(),
+        owner,
+        dataset,
+        urlencoding::encode(filename)
+    );
+
+    let creds = get_credentials()?;
+    debug!(%url, "downloading single file");
+    let client = build_client()?;
+    let mut response = with_retries(|| {
+        client
+            .get(&url)
+            .basic_auth(&creds.username, Some(&creds.key))
+            .send()
+            .map_err(|e| GaggleError::HttpRequestError(e.to_string()))
+    })?;
+
+    if !response.status().is_success() {
+        return Err(GaggleError::HttpRequestError(format!(
+            "Failed to download file '{}': HTTP {}",
+            filename,
+            response.status()
+        )));
+    }
+
+    // Stream to disk; avoid loading whole file into memory
+    let mut outfile = fs::File::create(&target_path)?;
+    response
+        .copy_to(&mut outfile)
+        .map_err(|e| GaggleError::HttpRequestError(e.to_string()))?;
+
+    Ok(target_path)
+}
+
 /// Extract ZIP file
 pub(crate) fn extract_zip(zip_path: &Path, dest_dir: &Path) -> Result<usize, GaggleError> {
     let file = fs::File::open(zip_path)?;
     let mut archive =
         zip::ZipArchive::new(file).map_err(|e| GaggleError::ZipError(e.to_string()))?;
 
-    // ZIP bomb protection: limit total uncompressed size to 10GB
+    // ZIP bomb protection: limit total uncompressed size to 10GB and compression ratio
     const MAX_TOTAL_SIZE: u64 = 10 * 1024 * 1024 * 1024;
+    const MAX_COMPRESSION_RATIO: u64 = 100; // reject entries with >100:1 ratio
     let mut total_size: u64 = 0;
     let mut files_extracted: usize = 0;
 
-    // Ensure destination directory exists and canonicalize it for comparisons
+    // Ensure destination directory exists and canonicalize it once
     fs::create_dir_all(dest_dir)?;
     let canonical_dest = dest_dir.canonicalize().map_err(|e| {
         GaggleError::IoError(format!(
@@ -284,12 +398,18 @@ pub(crate) fn extract_zip(zip_path: &Path, dest_dir: &Path) -> Result<usize, Gag
         let uncompressed = entry.size();
         if total_size > MAX_TOTAL_SIZE {
             return Err(GaggleError::ZipError(format!(
                 "ZIP file too large: uncompressed size exceeds {} GB",
                 MAX_TOTAL_SIZE / (1024 * 1024 * 1024)
             )));
         }
+        let comp_size = entry.compressed_size();
+        if comp_size > 0 {
+            let ratio = uncompressed.saturating_div(comp_size.max(1));
+            if ratio > MAX_COMPRESSION_RATIO {
+                return Err(GaggleError::ZipError(format!(
+                    "Excessive compression ratio ({}:1) for entry {}",
+                    ratio,
+                    rel_path.display()
+                )));
+            }
+        }
 
         // Finally, write the file
         if let Some(p) = outpath.parent() {
@@ -329,15 +462,68 @@ pub(crate) fn extract_zip(zip_path: &Path, dest_dir: &Path) -> Result<usize, Gag
 pub fn list_dataset_files(dataset_path: &str) -> Result<Vec<DatasetFile>, GaggleError> {
+    let (owner, dataset) = super::parse_dataset_path(dataset_path)?;
+    let dataset_dir = crate::config::cache_dir_runtime()
+        .join("datasets")
+        .join(&owner)
+        .join(&dataset);
+
+    // If directory exists and has content, enumerate locally
+    if dataset_dir.exists() {
+        let mut files = Vec::new();
+        for entry in fs::read_dir(&dataset_dir)? {
+            let entry = entry?;
+            let path = entry.path();
+            if path.is_file() {
+                if let Some(file_name) = path.file_name() {
+                    if file_name != ".downloaded" {
+                        let metadata = fs::metadata(&path)?;
+                        if let Some(name) = path.file_name() {
+                            files.push(DatasetFile {
+                                name: name.to_string_lossy().to_string(),
+                                size: metadata.len(),
+                            });
+                        }
+                    }
+                }
+            }
+        }
+        return Ok(files);
+    }
+
+    // Not cached: try remote listing via metadata
+    if !crate::config::offline_mode() {
+        if let Ok(list) = list_dataset_files_from_metadata(dataset_path) {
+            if !list.is_empty() {
+                debug!(
+                    dataset = dataset_path,
+                    count = list.len(),
+                    "listing files from remote metadata"
+                );
+                return Ok(list);
+            } else {
+                debug!(
+                    dataset = dataset_path,
+                    "remote metadata listing empty; will attempt download"
+                );
+            }
+        } else {
+            debug!(
+                dataset = dataset_path,
+                "failed to fetch remote metadata; will attempt download"
+            );
+        }
+    }
+
+    // As a last resort, download and list
     let dataset_dir = download_dataset(dataset_path)?;
     let mut files = Vec::new();
-
     for entry in fs::read_dir(&dataset_dir)? {
         let entry = entry?;
         let path = entry.path();
-
         if path.is_file() {
             if let Some(file_name) = path.file_name() {
                 if file_name != ".downloaded" {
@@ -352,7 +538,6 @@ pub fn list_dataset_files(dataset_path: &str) -> Result<Vec<DatasetFile>, Gaggle
             }
         }
     }
-
     Ok(files)
 }
 
@@ -377,17 +562,44 @@ pub fn get_dataset_file_path(dataset_path: &str, filename: &str) -> Result<PathB
+    // Attempt on-demand single-file fetch first
+    match download_single_file(dataset_path, filename) {
+        Ok(p) => Ok(p),
+        Err(e) => {
+            // In strict on-demand mode, do not fall back to full download
+            if crate::config::strict_on_demand() {
+                debug!(dataset = dataset_path, file = filename, error = %e, "on-demand fetch failed and strict mode enabled; not falling back");
+                return Err(e);
+            }
+            // If single-file download fails and dataset isn't cached, fall back to full dataset download
+            if !dataset_dir.exists()
+                || fs::read_dir(&dataset_dir)
+                    .map(|mut i| i.next().is_none())
+                    .unwrap_or(true)
+            {
+                debug!(dataset = dataset_path, file = filename, error = %e, "on-demand fetch failed; falling back to full dataset download");
+                let dir = download_dataset(dataset_path)?;
+                let p = dir.join(fname_path);
+                if p.exists() {
+                    return Ok(p);
+                }
+            }
+            Err(e)
+        }
+    }
 }
 
 /// Get all cached datasets with their metadata
@@ -423,8 +635,9 @@ fn get_cached_datasets() -> Result<Vec<(PathBuf, CacheMetadata)>, GaggleError> {
                         Ok(metadata) => {
                             datasets.push((dataset_path, metadata));
                         }
-                        Err(_) => {
-                            // Legacy marker without metadata - calculate size
+                        Err(e) => {
+                            // Legacy or invalid marker - calculate size and synthesize metadata
+                            warn!(path = %marker_file.display(), error = %e, "Invalid cache metadata; synthesizing");
                             let size_mb = crate::utils::calculate_dir_size(&dataset_path)
                                 .unwrap_or(0)
                                 .saturating_div(1024 * 1024);
@@ -433,14 +646,47 @@ fn get_cached_datasets() -> Result<Vec<(PathBuf, CacheMetadata)>, GaggleError> {
                                 dataset_entry.file_name().to_string_lossy().to_string();
                             let metadata =
                                 CacheMetadata::new(format!("{}/{}", owner, dataset), size_mb);
+                            // version remains None for synthesized metadata
                             datasets.push((dataset_path, metadata));
                         }
                     }
                 }
-                _ => {
-                    // Empty or unreadable marker - skip
+                Ok(_) => {
+                    // Empty marker - synthesize
+                    warn!(path = %marker_file.display(), "Empty cache metadata; synthesizing");
+                    let size_mb = crate::utils::calculate_dir_size(&dataset_path)
+                        .unwrap_or(0)
+                        .saturating_div(1024 * 1024);
+                    let owner = owner_entry.file_name().to_string_lossy().to_string();
+                    let dataset = dataset_entry.file_name().to_string_lossy().to_string();
+                    let metadata =
+                        CacheMetadata::new(format!("{}/{}", owner, dataset), size_mb);
+                    datasets.push((dataset_path, metadata));
+                }
+                Err(e) => {
+                    warn!(path = %marker_file.display(), error = %e, "Failed reading cache metadata; synthesizing");
+                    let size_mb = crate::utils::calculate_dir_size(&dataset_path)
+                        .unwrap_or(0)
+                        .saturating_div(1024 * 1024);
+                    let owner = owner_entry.file_name().to_string_lossy().to_string();
+                    let dataset = dataset_entry.file_name().to_string_lossy().to_string();
+                    let metadata =
+                        CacheMetadata::new(format!("{}/{}", owner, dataset), size_mb);
+                    datasets.push((dataset_path, metadata));
+                }
             }
+        } else {
+            // No marker (e.g., partial on-demand downloads). Include in accounting.
+            let size_mb = crate::utils::calculate_dir_size(&dataset_path)
+                .unwrap_or(0)
+                .saturating_div(1024 * 1024);
+            // Skip empty directories with zero size
+            if size_mb > 0 {
+                let owner = owner_entry.file_name().to_string_lossy().to_string();
+                let dataset = dataset_entry.file_name().to_string_lossy().to_string();
+                let metadata = CacheMetadata::new(format!("{}/{}", owner, dataset), size_mb);
+                datasets.push((dataset_path, metadata));
+            }
+        }
         }
     }
 }
@@ -479,19 +725,16 @@ fn enforce_cache_limit() -> Result<(), GaggleError> {
 
         // Remove dataset directory
         if let Err(e) = fs::remove_dir_all(&dataset_path) {
-            eprintln!(
-                "Warning: Failed to evict dataset {}: {}",
-                metadata.dataset_path, e
-            );
+            warn!(path = %dataset_path.display(), error = %e, "Failed to evict dataset");
             continue;
         }
 
         total_size_mb = total_size_mb.saturating_sub(metadata.size_mb);
-        eprintln!(
-            "Cache limit enforcement: Evicted {} (age: {}s, size: {}MB)",
-            metadata.dataset_path,
-            metadata.age_seconds(),
-            metadata.size_mb
+        debug!(
+            dataset = %metadata.dataset_path,
+            age_secs = metadata.age_seconds(),
+            size_mb = metadata.size_mb,
+            "Cache eviction: removed dataset to enforce limit"
         );
     }
 
@@ -1069,4 +1312,29 @@ mod tests {
 
         std::env::remove_var("GAGGLE_CACHE_DIR");
     }
+
+    #[test]
+    fn test_partial_cache_counts_and_eviction() {
+        let temp_dir = tempfile::TempDir::new().unwrap();
+        std::env::set_var("GAGGLE_CACHE_DIR", temp_dir.path());
+
+        // Create two partial cached datasets
+        let d1 = temp_dir.path().join("datasets/owner1/ds1");
+        let d2 = temp_dir.path().join("datasets/owner2/ds2");
+        fs::create_dir_all(&d1).unwrap();
+        fs::create_dir_all(&d2).unwrap();
+        fs::write(d1.join("a.bin"), vec![0u8; 2 * 1024 * 1024]).unwrap(); // 2MB
+        fs::write(d2.join("b.bin"), vec![0u8; 2 * 1024 * 1024]).unwrap(); // 2MB
+
+        // Total ~4MB; set limit to 2MB so eviction must occur
+        std::env::set_var("GAGGLE_CACHE_SIZE_LIMIT_MB", "2");
+        enforce_cache_limit_now().unwrap();
+
+        // After eviction, total size must be <= 2MB
+        let total = get_total_cache_size_mb().unwrap();
+        assert!(total <= 2);
+
+        std::env::remove_var("GAGGLE_CACHE_SIZE_LIMIT_MB");
+        std::env::remove_var("GAGGLE_CACHE_DIR");
+    }
 }
diff --git a/gaggle/src/kaggle/metadata.rs b/gaggle/src/kaggle/metadata.rs
index 71a6897..535d122 100644
--- a/gaggle/src/kaggle/metadata.rs
+++ b/gaggle/src/kaggle/metadata.rs
@@ -3,6 +3,9 @@ use serde::{Deserialize, Serialize};
 
 use super::api::{build_client, get_api_base, with_retries};
 use super::credentials::get_credentials;
+use parking_lot::RwLock;
+use std::collections::HashMap;
+use std::time::{Duration, Instant};
 
 #[derive(Debug, Serialize, Deserialize)]
 #[allow(dead_code)]
@@ -14,6 +17,19 @@ pub struct DatasetInfo {
     pub last_updated: String,
 }
 
+/// Simple in-memory cache for dataset metadata with TTL
+static META_CACHE: once_cell::sync::Lazy<RwLock<HashMap<String, (Instant, serde_json::Value)>>> =
+    once_cell::sync::Lazy::new(|| RwLock::new(HashMap::new()));
+
+/// Metadata cache TTL (seconds), configurable via GAGGLE_METADATA_TTL (default 600s)
+fn metadata_ttl() -> Duration {
+    let secs = std::env::var("GAGGLE_METADATA_TTL")
+        .ok()
+        .and_then(|v| v.parse::<u64>().ok())
+        .unwrap_or(600);
+    Duration::from_secs(secs)
+}
+
 /// Get metadata for a specific dataset
 pub fn get_dataset_metadata(dataset_path: &str) -> Result<serde_json::Value, GaggleError> {
     if crate::config::offline_mode() {
@@ -25,6 +41,13 @@ pub fn get_dataset_metadata(dataset_path: &str) -> Result<serde_json::Value, Gag
+    // Serve cached metadata while the entry is still fresh
+    if let Some((fetched_at, value)) = META_CACHE.read().get(dataset_path) {
+        if fetched_at.elapsed() < metadata_ttl() {
+            return Ok(value.clone());
+        }
+    }
+
diff --git a/gaggle/src/kaggle/mod.rs b/gaggle/src/kaggle/mod.rs
--- a/gaggle/src/kaggle/mod.rs
+++ b/gaggle/src/kaggle/mod.rs
-            if v.parse::<u64>().is_err() {
-                return Err(crate::error::GaggleError::InvalidDatasetPath(format!(
-                    "Invalid version number '{}'. Version must be a positive integer.",
-                    v
-                )));
+            // Validate it's a positive integer (>0)
+            match version_str.parse::<u64>() {
+                Ok(n) if n > 0 => Some(version_str.to_string()),
+                _ => {
+                    return Err(crate::error::GaggleError::InvalidDatasetPath(format!(
+                        "Invalid version number '{}'. Version must be a positive integer > 0.",
+                        v
+                    )));
+                }
             }
-            Some(version_str.to_string())
         }
     } else {
         None
@@ -126,6 +124,35 @@ pub fn parse_dataset_path_with_version(
     Ok((owner, dataset, version))
 }
 
+/// Prefetch multiple files within a dataset without downloading the entire archive.
+/// Returns a JSON string with an array of objects: {"name": ..., "status": "ok"|"error", "path"?: ..., "error"?: ...}
+#[allow(dead_code)]
+pub fn prefetch_files(
+    dataset_path: &str,
+    files: &[&str],
+) -> Result<serde_json::Value, crate::error::GaggleError> {
+    let mut results = Vec::with_capacity(files.len());
+    for f in files {
+        match download::get_dataset_file_path(dataset_path, f) {
+            Ok(path) => {
+                results.push(serde_json::json!({
+                    "name": f,
+                    "status": "ok",
+                    "path": path.to_string_lossy(),
+                }));
+            }
+            Err(e) => {
+                results.push(serde_json::json!({
+                    "name": f,
+                    "status": "error",
+                    "error": e.to_string(),
+                }));
+            }
+        }
+    }
+    Ok(serde_json::json!({"dataset": dataset_path, "files": results}))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -311,10 +338,12 @@ mod tests {
     }
 
     #[test]
-    fn test_parse_version_zero() {
-        let (_owner, _dataset, version) =
-            parse_dataset_path_with_version("owner/dataset@0").unwrap();
-        assert_eq!(version, Some("0".to_string()));
+    fn test_parse_version_zero_rejected() {
+        let result = parse_dataset_path_with_version("owner/dataset@0");
+        assert!(result.is_err());
+        if let Err(crate::error::GaggleError::InvalidDatasetPath(msg)) = result {
+            assert!(msg.contains("> 0"));
+        }
     }
 
     #[test]
diff --git a/gaggle/src/lib.rs b/gaggle/src/lib.rs
index 5112c38..10840e9 100644
--- a/gaggle/src/lib.rs
+++ b/gaggle/src/lib.rs
@@ -9,7 +9,8 @@ pub use ffi::{
     gaggle_clear_cache, gaggle_dataset_version_info, gaggle_download_dataset,
     gaggle_enforce_cache_limit, gaggle_free, gaggle_get_cache_info, gaggle_get_dataset_info,
     gaggle_get_file_path, gaggle_get_version, gaggle_is_dataset_current, gaggle_json_each,
-    gaggle_list_files, gaggle_search, gaggle_set_credentials, gaggle_update_dataset,
+    gaggle_list_files, gaggle_prefetch_files, gaggle_search, gaggle_set_credentials,
+    gaggle_update_dataset,
 };
 pub use kaggle::parse_dataset_path;
 pub use kaggle::parse_dataset_path_with_version;
diff --git a/gaggle/tests/mock_http.rs b/gaggle/tests/mock_http.rs
index d7a0b4a..fb5fc94 100644
--- a/gaggle/tests/mock_http.rs
+++ b/gaggle/tests/mock_http.rs
@@ -138,3 +138,170 @@ fn test_download_and_version_with_mock() {
     env::remove_var("GAGGLE_CACHE_DIR");
     env::remove_var("GAGGLE_API_BASE");
 }
+
+#[test]
+#[serial_test::serial]
+fn test_single_file_fetch_on_demand() {
+    gaggle::init_logging();
+    let temp = tempfile::TempDir::new().unwrap();
+    env::set_var("GAGGLE_CACHE_DIR", temp.path());
+
+    let mut server = Server::new();
+    let server_url = server.url();
+    env::set_var("GAGGLE_API_BASE", &server_url);
+
+    // Set credentials
+    let user = CString::new("user").unwrap();
+    let key = CString::new("key").unwrap();
+    unsafe {
+        let _ = gaggle::gaggle_set_credentials(user.as_ptr(), key.as_ptr());
+    }
+
+    // Mock single-file endpoint
+    let _file = server
+        .mock("GET", "/datasets/download/owner/dataset")
+        .match_query(Matcher::UrlEncoded("fileName".into(), "data.csv".into()))
+        .with_status(200)
+        .with_header("content-type", "text/csv")
+        .with_body("a,b\n1,2\n")
+        .create();
+
+    // Act: request file path; should trigger on-demand fetch
+    let ds = CString::new("owner/dataset").unwrap();
+    let fnm = CString::new("data.csv").unwrap();
+    let ptr = unsafe { gaggle::gaggle_get_file_path(ds.as_ptr(), fnm.as_ptr()) };
+    assert!(!ptr.is_null());
+    let path = unsafe {
+        let s = CStr::from_ptr(ptr).to_str().unwrap().to_string();
+        gaggle::gaggle_free(ptr);
+        std::path::PathBuf::from(s)
+    };
+    assert!(path.exists());
+
+    // Ensure that full dataset extraction marker is not required for single-file presence
+    let ds_dir = temp.path().join("datasets/owner/dataset");
+    assert!(ds_dir.join("data.csv").exists());
+    // .downloaded marker may not exist yet (partial cache is allowed)
+
+    env::remove_var("GAGGLE_CACHE_DIR");
+    env::remove_var("GAGGLE_API_BASE");
+}
+
+#[test]
+#[serial_test::serial]
+fn test_strict_on_demand_no_fallback() {
+    gaggle::init_logging();
+    let temp = tempfile::TempDir::new().unwrap();
+    env::set_var("GAGGLE_CACHE_DIR", temp.path());
+    env::set_var("GAGGLE_STRICT_ONDEMAND", "1");
+
+    let mut server = Server::new();
+    let server_url = server.url();
+    env::set_var("GAGGLE_API_BASE", &server_url);
+
+    // Set credentials
+    let user = CString::new("user").unwrap();
+    let key = CString::new("key").unwrap();
+    unsafe {
+        let _ = gaggle::gaggle_set_credentials(user.as_ptr(), key.as_ptr());
+    }
+
+    // Mock single-file endpoint to return 404 (force failure)
+    let _file = server
+        .mock("GET", "/datasets/download/owner/dataset")
+        .match_query(Matcher::UrlEncoded("fileName".into(), "missing.csv".into()))
+        .with_status(404)
+        .with_header("content-type", "text/plain")
+        .with_body("not found")
+        .create();
+
+    // Act: request file path; should fail and not fall back to full download
+    let ds = CString::new("owner/dataset").unwrap();
+    let fnm = CString::new("missing.csv").unwrap();
+    let ptr = unsafe { gaggle::gaggle_get_file_path(ds.as_ptr(), fnm.as_ptr()) };
+    assert!(ptr.is_null());
+    let err_ptr = gaggle::gaggle_last_error();
+    assert!(!err_ptr.is_null());
+    let err = unsafe { CStr::from_ptr(err_ptr) }
+        .to_str()
+        .unwrap()
+        .to_lowercase();
+    assert!(err.contains("http"));
+
+    env::remove_var("GAGGLE_CACHE_DIR");
+    env::remove_var("GAGGLE_STRICT_ONDEMAND");
+    env::remove_var("GAGGLE_API_BASE");
+}
+
+#[test]
+#[serial_test::serial]
+fn test_prefetch_files_mixed_results() {
+    gaggle::init_logging();
+    let temp = tempfile::TempDir::new().unwrap();
+    env::set_var("GAGGLE_CACHE_DIR", temp.path());
+    env::set_var("GAGGLE_STRICT_ONDEMAND", "1");
+
+    let mut server = Server::new();
+    let server_url = server.url();
+    env::set_var("GAGGLE_API_BASE", &server_url);
+
+    // Set credentials
+    let user = CString::new("user").unwrap();
+    let key = CString::new("key").unwrap();
+    unsafe {
+        let _ = gaggle::gaggle_set_credentials(user.as_ptr(), key.as_ptr());
+    }
+
+    // Mock good file
+    let _good = server
+        .mock("GET", "/datasets/download/owner/dataset")
+        .match_query(Matcher::UrlEncoded("fileName".into(), "good.csv".into()))
+        .with_status(200)
+        .with_header("content-type", "text/csv")
+        .with_body("x\n1\n")
+        .create();
+
+    // Mock missing file
+    let _bad = server
+        .mock("GET", "/datasets/download/owner/dataset")
+        .match_query(Matcher::UrlEncoded("fileName".into(), "bad.csv".into()))
+        .with_status(404)
+        .with_body("not found")
+        .create();
+
+    // Call prefetch
+    let ds = CString::new("owner/dataset").unwrap();
+    let list = CString::new("good.csv\nbad.csv").unwrap();
+    let ptr = unsafe { gaggle::gaggle_prefetch_files(ds.as_ptr(), list.as_ptr()) };
+    assert!(!ptr.is_null());
+    let s = unsafe { CStr::from_ptr(ptr) }.to_str().unwrap().to_string();
+    unsafe { gaggle::gaggle_free(ptr) };
+
+    let v: serde_json::Value = serde_json::from_str(&s).unwrap();
+    assert_eq!(v["dataset"].as_str().unwrap(), "owner/dataset");
+    let files = v["files"].as_array().unwrap();
+    assert_eq!(files.len(), 2);
+
+    // Find statuses
+    let mut ok_seen = false;
+    let mut err_seen = false;
+    for f in files {
+        let name = f["name"].as_str().unwrap();
+        let status = f["status"].as_str().unwrap();
+        if name == "good.csv" {
+            assert_eq!(status, "ok");
+            assert!(f["path"].as_str().unwrap().ends_with("good.csv"));
+            ok_seen = true;
+        }
+        if name == "bad.csv" {
+            assert_eq!(status, "error");
+            assert!(!f["error"].as_str().unwrap().is_empty());
+            err_seen = true;
+        }
+    }
+    assert!(ok_seen && err_seen);
+
+    env::remove_var("GAGGLE_CACHE_DIR");
+    env::remove_var("GAGGLE_STRICT_ONDEMAND");
+    env::remove_var("GAGGLE_API_BASE");
+}

From 5f8ca9d37d0f8989d8f57cde5e60f692ec8ccc72 Mon Sep 17 00:00:00 2001
From: Hassan Abedi
Date: Mon, 3 Nov 2025 16:25:49 +0100
Subject: [PATCH 4/4] WIP

---
 docs/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/README.md b/docs/README.md
index e195c84..a81de05 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -20,7 +20,7 @@ The table below includes the information about all SQL functions exposed by Gagg
 
 > [!NOTE]
 > The `gaggle_file_path` function will fetch the file into the cache if it is missing; set `GAGGLE_STRICT_ONDEMAND=1` to prevent fallback to a full dataset download on failures.
-> 
+>
 > Dataset paths must be in the form `owner/dataset` where `owner` is the username and `dataset` is the dataset name on
 > Kaggle.
 > For example: `habedi/flickr-8k-dataset-clean`.
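For reference, the fallback behavior the note above describes is implemented in `gaggle/src/kaggle/download.rs`. A condensed sketch of that decision (the function name `resolve` is illustrative; the other names follow that file, and error handling is simplified):

```rust
use std::path::PathBuf;

// Condensed sketch of the strict on-demand decision in get_dataset_file_path.
fn resolve(dataset_path: &str, filename: &str) -> Result<PathBuf, GaggleError> {
    match download_single_file(dataset_path, filename) {
        Ok(p) => Ok(p),
        // GAGGLE_STRICT_ONDEMAND=1: surface the error instead of falling back
        Err(e) if crate::config::strict_on_demand() => Err(e),
        Err(e) => {
            // Non-strict: fetch the whole dataset, then look for the file again
            let p = download_dataset(dataset_path)?.join(filename);
            if p.exists() { Ok(p) } else { Err(e) }
        }
    }
}
```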