From 59a598f8b46bde6f8597ba4f61c2263d73c0039a Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Tue, 4 Nov 2025 12:45:33 +0100 Subject: [PATCH 1/3] The base commit --- .github/workflows/dist_pipeline.yml | 4 ++++ .github/workflows/lints.yml | 4 ++++ .github/workflows/tests.yml | 4 ++++ README.md | 7 +++---- external/extension-ci-tools | 2 +- gaggle/Cargo.toml | 2 +- gaggle/bindings/gaggle_extension.cpp | 2 +- 7 files changed, 18 insertions(+), 7 deletions(-) diff --git a/.github/workflows/dist_pipeline.yml b/.github/workflows/dist_pipeline.yml index 39dc4c5..ce9edde 100644 --- a/.github/workflows/dist_pipeline.yml +++ b/.github/workflows/dist_pipeline.yml @@ -4,6 +4,10 @@ on: pull_request: branches: - main + paths-ignore: + - '**.md' + - 'docs/**' + - '.github/**' push: tags: - 'v*' diff --git a/.github/workflows/lints.yml b/.github/workflows/lints.yml index 0ea0007..05b7756 100644 --- a/.github/workflows/lints.yml +++ b/.github/workflows/lints.yml @@ -5,6 +5,10 @@ on: pull_request: branches: - main + paths-ignore: + - '**.md' + - 'docs/**' + - '.github/**' permissions: contents: read diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 32fef03..6ff81bf 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -5,6 +5,10 @@ on: pull_request: branches: - main + paths-ignore: + - '**.md' + - 'docs/**' + - '.github/**' push: branches: - main diff --git a/README.md b/README.md index 1a3ae43..51edb1f 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ updates, etc. This workflow can quickly become complex, especially when working with multiple datasets or when datasets are updated frequently. Gaggle tries to help simplify this process by hiding the complexity and letting you work with datasets directly inside -DuckDB that allow you to run fast analytical queries on the data. +DuckDB that allows you to run fast analytical queries on the data. In essence, Gaggle makes DuckDB into a SQL-enabled frontend for Kaggle datasets. @@ -102,8 +102,7 @@ select gaggle_version(); select * from gaggle_ls('habedi/flickr-8k-dataset-clean') limit 5; --- Read a Parquet file from local cache using a prepared statement --- (DuckDB doesn't allow the use of subqueries in function arguments, so we use a prepared statement) +-- Read a Parquet file from the local cache using a prepared statement prepare rp as select * from read_parquet(?) limit 10; execute rp(gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet')); @@ -114,7 +113,7 @@ from 'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet'; -- Optionally, we check cache info select gaggle_cache_info(); --- Check if cached dataset is current (is newest version?) +-- Check if cached dataset is current (is the newest version?) 
select gaggle_is_current('habedi/flickr-8k-dataset-clean'); ``` diff --git a/external/extension-ci-tools b/external/extension-ci-tools index 7b1ba0a..3628eea 160000 --- a/external/extension-ci-tools +++ b/external/extension-ci-tools @@ -1 +1 @@ -Subproject commit 7b1ba0a5651e9fd85cf555f950d5f8bef88adf7e +Subproject commit 3628eea3b7483b75b120a16cae158fe980ca7ea0 diff --git a/gaggle/Cargo.toml b/gaggle/Cargo.toml index 95d70ee..e462f5d 100644 --- a/gaggle/Cargo.toml +++ b/gaggle/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gaggle" -version = "0.1.0-alpha.3" +version = "0.1.0-alpha.4" edition = "2021" publish = false diff --git a/gaggle/bindings/gaggle_extension.cpp b/gaggle/bindings/gaggle_extension.cpp index 4cde97f..3e00583 100644 --- a/gaggle/bindings/gaggle_extension.cpp +++ b/gaggle/bindings/gaggle_extension.cpp @@ -748,7 +748,7 @@ static void LoadInternal(ExtensionLoader &loader) { // Provide out-of-line definitions for the extension class void GaggleExtension::Load(ExtensionLoader &loader) { LoadInternal(loader); } std::string GaggleExtension::Name() { return "gaggle"; } -std::string GaggleExtension::Version() const { return std::string("0.1.0-alpha.3"); } +std::string GaggleExtension::Version() const { return std::string("0.1.0-alpha.4"); } } // namespace duckdb From 1e11558c221b71fc55cfc5e01590b07436f0e4aa Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Mon, 10 Nov 2025 14:40:54 +0100 Subject: [PATCH 2/3] Make `gaggle_ls` support recursive listing (#4) --- README.md | 6 +- docs/README.md | 12 +- docs/examples/e1_core_functionality.sql | 5 + docs/examples/e2_advanced_features.sql | 5 + gaggle/bindings/gaggle_extension.cpp | 273 ++++++++++++++++++++---- 5 files changed, 257 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 51edb1f..205e38c 100644 --- a/README.md +++ b/README.md @@ -97,10 +97,10 @@ make release -- Get extension version select gaggle_version(); --- List files in the dataset +-- List files in the dataset (recursively) -- (Note that if the datasets is not downloaded, it will be downloaded and cached) select * -from gaggle_ls('habedi/flickr-8k-dataset-clean') limit 5; +from gaggle_ls('habedi/flickr-8k-dataset-clean', true) limit 5; -- Read a Parquet file from the local cache using a prepared statement prepare rp as select * from read_parquet(?) limit 10; @@ -110,7 +110,7 @@ execute rp(gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet' select count(*) from 'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet'; --- Optionally, we check cache info +-- Then, we check cache info select gaggle_cache_info(); -- Check if cached dataset is current (is the newest version?) diff --git a/docs/README.md b/docs/README.md index 57394b4..ef19b64 100644 --- a/docs/README.md +++ b/docs/README.md @@ -17,7 +17,7 @@ The table below includes the information about all SQL functions exposed by Gagg | 11 | `gaggle_version_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns version info: `cached_version`, `latest_version`, `is_current`, `is_cached`. | | 12 | `gaggle_json_each(json VARCHAR)` | `VARCHAR` | Expands a JSON object into newline-delimited JSON rows with fields: `key`, `value`, `type`, `path`. Users normally shouldn't use this function. | | 13 | `gaggle_file_path(dataset_path VARCHAR, filename VARCHAR)` | `VARCHAR` | Resolves a specific file's local path inside a downloaded dataset. 
|
-| 14 | `gaggle_ls(dataset_path VARCHAR)` | `TABLE(name VARCHAR, size BIGINT, path VARCHAR)` | Lists files (non-recursively) in the dataset's local directory; `size` is in MB. |
+| 14 | `gaggle_ls(dataset_path VARCHAR[, recursive BOOLEAN])` | `TABLE(name VARCHAR, size BIGINT, path VARCHAR)` | Lists files in the dataset's local directory; non-recursive by default. When `recursive` is `true`, it also walks subdirectories. `path` values are returned as `owner/dataset/...` (not absolute filesystem paths); `size` is in MB. |
 
 > [!NOTE]
 > * The `gaggle_file_path` function will retrieve and cache the file if it is not already downloaded; set
@@ -65,14 +65,22 @@ select gaggle_info('uciml/iris') as dataset_metadata;
 #### Reading Data
 
 ```sql
--- List files as a table
+-- List files as a table (non-recursive)
 select *
 from gaggle_ls('uciml/iris') limit 5;
 
+-- List files as a table (recursive)
+select *
+from gaggle_ls('suganthidevasagayam/social-media-post-of-postpartum-depression', true) limit 10;
+
 -- List files as a JSON array
 select to_json(list(struct_pack(name := name, size := size, path := path))) as files_json
 from gaggle_ls('uciml/iris');
 
+-- Note: returned `path` values are in the form 'owner/dataset/...',
+-- which are meant for use with replacement scans or as identifiers inside the cache;
+-- to get an absolute filesystem path, use `gaggle_file_path(owner_dataset, relative_path)`.
+
 -- Resolve a file path and read it via a prepared statement
 prepare rp as select * from read_parquet(?) limit 10;
 execute rp(gaggle_file_path('owner/dataset', 'file.parquet'));
diff --git a/docs/examples/e1_core_functionality.sql b/docs/examples/e1_core_functionality.sql
index 03178b9..f043cbe 100644
--- a/docs/examples/e1_core_functionality.sql
+++ b/docs/examples/e1_core_functionality.sql
@@ -30,6 +30,7 @@ select gaggle_download('uciml/iris') as download_path;
 -- Section 5: list files (JSON)
 select '## list files (json)';
 
+-- Note: `path` values returned by gaggle_ls are of the form 'owner/dataset/...' (not absolute filesystem paths)
 select to_json(
        list(struct_pack(name := name, size := size, path := path))
        ) as files_json
@@ -39,6 +40,10 @@ from gaggle_ls('uciml/iris');
 select '## list files (table)';
 select * from gaggle_ls('uciml/iris') limit 5;
 
+-- Recursive listing example (walk subdirectories)
+select '## recursive listing example';
+select * from gaggle_ls('suganthidevasagayam/social-media-post-of-postpartum-depression', true) limit 10;
+
 -- Section 6: get dataset metadata
 select '## get dataset metadata';
 select gaggle_info('uciml/iris') as dataset_metadata;
diff --git a/docs/examples/e2_advanced_features.sql b/docs/examples/e2_advanced_features.sql
index 79b77e5..7ecab83 100644
--- a/docs/examples/e2_advanced_features.sql
+++ b/docs/examples/e2_advanced_features.sql
@@ -14,6 +14,7 @@ execute rp(gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'
 -- Section 2: list and process multiple files
 select '## list and process dataset files (json and table)';
 
+-- Note: `path` values returned by gaggle_ls are of the form 'owner/dataset/...' (not absolute filesystem paths)
 with files as (
     select to_json(list(struct_pack(name := name, size := size, path := path))) as files_json
     from gaggle_ls('habedi/flickr-8k-dataset-clean')
@@ -21,6 +22,10 @@ with files as (
 select files_json from files;
 select * from gaggle_ls('habedi/flickr-8k-dataset-clean') limit 5;
 
+-- Recursive listing (example)
+select '## recursive listing example for flickr dataset';
+select * from gaggle_ls('habedi/flickr-8k-dataset-clean', true)
limit 10; + -- Section 2b: use replacement scan for direct reads via `kaggle:` URLs select '## Replacement scan - direct reads via `kaggle:`'; -- Single file read diff --git a/gaggle/bindings/gaggle_extension.cpp b/gaggle/bindings/gaggle_extension.cpp index 3e00583..37c016a 100644 --- a/gaggle/bindings/gaggle_extension.cpp +++ b/gaggle/bindings/gaggle_extension.cpp @@ -516,22 +516,34 @@ KaggleReplacementScan(ClientContext &context, ReplacementScanInput &input, return nullptr; } - // Parse kaggle:owner/dataset[/pattern] + // Parse kaggle:owner/dataset[/pattern...] string kaggle_ref = table_name.substr(7); // Remove "kaggle:" prefix - auto last_slash = kaggle_ref.find_last_of('/'); - if (last_slash == string::npos) { + // Find first and second slash to extract owner/dataset as canonical dataset + auto first_slash = kaggle_ref.find('/'); + if (first_slash == string::npos) { return nullptr; } + auto second_slash = kaggle_ref.find('/', first_slash + 1); - string dataset_path = kaggle_ref.substr(0, last_slash); - string pattern = kaggle_ref.substr(last_slash + 1); + string dataset_path; + string pattern; + if (second_slash == string::npos) { + // No pattern provided; entire ref is owner/dataset + dataset_path = kaggle_ref; + pattern = string(); + } else { + // Canonical dataset is first two segments owner/dataset + dataset_path = kaggle_ref.substr(0, second_slash); + pattern = kaggle_ref.substr(second_slash + 1); // may contain additional slashes + } string func_name = "read_csv_auto"; string local_path; + // Determine wildcard/dir status auto lower_pat = StringUtil::Lower(pattern); - bool has_wildcard = - pattern.find('*') != string::npos || pattern.find('?') != string::npos; + bool has_wildcard = pattern.find('*') != string::npos || + pattern.find('?') != string::npos; bool is_dir = pattern.empty(); auto decide_reader = [](const string &lower_ext) -> string { @@ -547,7 +559,6 @@ KaggleReplacementScan(ClientContext &context, ReplacementScanInput &input, if (StringUtil::EndsWith(lower_ext, ".xlsx")) { return "read_excel"; } - // Default CSV/TSV and others to DuckDB's auto CSV reader return "read_csv_auto"; }; @@ -565,15 +576,22 @@ KaggleReplacementScan(ClientContext &context, ReplacementScanInput &input, string tail = is_dir ? 
string("/*") : (string("/") + pattern); local_path = dir_path + tail; - // Choose reader based on pattern extension if any - func_name = decide_reader(lower_pat); + // Choose reader based on pattern extension if any (use last segment) + string last_segment = pattern; + if (!pattern.empty()) { + auto pos = pattern.find_last_of('/'); + if (pos != string::npos) + last_segment = pattern.substr(pos + 1); + } + func_name = decide_reader(StringUtil::Lower(last_segment)); } else { - // Specific file: resolve exact path + // Specific file or nested path without wildcard: try to resolve exact file + // Use canonical dataset_path (owner/dataset) when calling Rust APIs char *file_path_c = gaggle_get_file_path(dataset_path.c_str(), pattern.c_str()); if (file_path_c == nullptr) { - // Fallback: dataset may have nested paths; attempt a glob match under - // dataset root + // Fallback: dataset may have nested paths; download dataset root and + // inspect filesystem to decide whether pattern is a directory char *dir_c = gaggle_download_dataset(dataset_path.c_str()); if (!dir_c) { throw InvalidInputException( @@ -582,16 +600,31 @@ KaggleReplacementScan(ClientContext &context, ReplacementScanInput &input, } string dir_path(dir_c); gaggle_free(dir_c); - local_path = dir_path + "/" + pattern; - // Keep func_name decision below based on extension + + fs::path candidate = fs::path(dir_path) / fs::path(pattern); + std::error_code ec; + if (fs::exists(candidate, ec) && fs::is_directory(candidate, ec)) { + // If the resolved target is a directory, read all files in it + local_path = candidate.string() + string("/*"); + // Choose reader based on last segment + auto last_seg = candidate.filename().string(); + func_name = decide_reader(StringUtil::Lower(last_seg)); + } else { + // Not a directory: use the candidate path (may be a file or a pattern) + local_path = dir_path + string("/") + pattern; + // Decide reader based on extension of pattern's last segment + auto pos = pattern.find_last_of('/'); + string last_segment = (pos == string::npos) ? 
pattern
                                             : pattern.substr(pos + 1);
+      func_name = decide_reader(StringUtil::Lower(last_segment));
+    }
   } else {
+    // Exact file found
     local_path = string(file_path_c);
     gaggle_free(file_path_c);
+    // Decide reader based on pattern lowercased
+    func_name = decide_reader(StringUtil::Lower(pattern));
   }
-
-  // Decide reader based on extension
-  auto lower_name = StringUtil::Lower(pattern);
-  func_name = decide_reader(lower_name);
 }
 
 // Construct a table function call: func_name(local_path)
@@ -611,14 +644,45 @@ static unique_ptr<FunctionData> GaggleLsBind(ClientContext &context,
                              vector<LogicalType> &return_types, vector<string> &names) {
   auto result = make_uniq();
-  if (input.inputs.size() != 1) {
+
+  // Accept either gaggle_ls(dataset_path) or gaggle_ls(dataset_path, recursive)
+  if (input.inputs.size() < 1 || input.inputs.size() > 2) {
     throw InvalidInputException(
-        "gaggle_ls(dataset_path) expects exactly 1 argument");
+        "gaggle_ls(dataset_path[, recursive]) expects 1 or 2 arguments");
   }
 
   result->dataset_path = input.inputs[0].ToString();
 
-  // Verify that the dataset is downloaded and get directory
-  char *dir_c = gaggle_download_dataset(result->dataset_path.c_str());
+  bool recursive = false;
+  if (input.inputs.size() == 2) {
+    // Second argument is expected to be a boolean constant at bind time
+    const auto &val = input.inputs[1];
+    if (!val.IsNull()) {
+      // Extract boolean constant value
+      recursive = val.GetValue<bool>();
+    }
+  }
+
+  // Canonicalize dataset path: ensure we call downloader with owner/dataset
+  // even if caller provided owner/dataset/... nested path
+  string ds = result->dataset_path;
+  auto first_slash = ds.find('/');
+  if (first_slash == string::npos) {
+    throw InvalidInputException("Invalid dataset path: must be owner/dataset");
+  }
+  auto second_slash = ds.find('/', first_slash + 1);
+  string canonical_ds;
+  string nested_path;
+  if (second_slash == string::npos) {
+    canonical_ds = ds;
+    nested_path = string();
+  } else {
+    canonical_ds = ds.substr(0, second_slash);
+    nested_path = ds.substr(second_slash + 1);
+  }
+
+  // Verify that the dataset is downloaded and get directory for canonical
+  // dataset
+  char *dir_c = gaggle_download_dataset(canonical_ds.c_str());
   if (!dir_c) {
     throw InvalidInputException("Failed to download dataset: " +
                                 GetGaggleError());
@@ -626,25 +690,150 @@ static unique_ptr<FunctionData> GaggleLsBind(ClientContext &context,
   string dir_path(dir_c);
   gaggle_free(dir_c);
 
-  // Enumerate files (non-recursive)
+  fs::path dataset_root(dir_path);
+
   try {
-    for (const auto &entry : fs::directory_iterator(dir_path)) {
-      if (!entry.is_regular_file()) {
-        continue;
-      }
-      auto name = entry.path().filename().string();
-      if (name == ".downloaded") {
-        continue;
+    if (nested_path.empty()) {
+      if (recursive) {
+        // Recursively walk and collect all regular files under dir_path
+        for (const auto &entry : fs::recursive_directory_iterator(dir_path)) {
+          if (!entry.is_regular_file()) {
+            continue;
+          }
+          auto name = entry.path().filename().string();
+          if (name == ".downloaded") {
+            continue;
+          }
+          auto full_path = entry.path().string();
+          // Compute relative path WRT dataset root
+          fs::path rel = fs::path(full_path).lexically_relative(dataset_root);
+          string rel_str;
+          if (rel.empty()) {
+            rel_str = canonical_ds + string("/") + entry.path().filename().string();
+          } else {
+            rel_str = canonical_ds + string("/") + rel.string();
+          }
+          std::error_code ec;
+          auto file_size = entry.file_size(ec);
+          if (ec)
+            continue;
+          int64_t size_mb = static_cast<int64_t>(file_size / (1024 * 1024));
+          result->names.push_back(name);
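+          // rel_str is a cache-relative identifier of the form
+          // 'owner/dataset/<relative path>', not an absolute filesystem path;
+          // gaggle_file_path(dataset_path, filename) resolves it to disk.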
+          result->paths.push_back(rel_str);
+          result->sizes.push_back(size_mb);
+        }
+      } else {
+        // Non-recursive: list top-level files
+        for (const auto &entry : fs::directory_iterator(dir_path)) {
+          if (!entry.is_regular_file()) {
+            continue;
+          }
+          auto name = entry.path().filename().string();
+          if (name == ".downloaded") {
+            continue;
+          }
+          auto full_path = entry.path().string();
+          fs::path rel = fs::path(full_path).lexically_relative(dataset_root);
+          string rel_str;
+          if (rel.empty()) {
+            rel_str = canonical_ds + string("/") + entry.path().filename().string();
+          } else {
+            rel_str = canonical_ds + string("/") + rel.string();
+          }
+          std::error_code ec;
+          auto file_size = entry.file_size(ec);
+          if (ec)
+            continue;
+          int64_t size_mb = static_cast<int64_t>(file_size / (1024 * 1024));
+          result->names.push_back(name);
+          result->paths.push_back(rel_str);
+          result->sizes.push_back(size_mb);
+        }
       }
-    auto full_path = entry.path().string();
+    } else {
+      // Nested path provided: inspect the nested path inside the downloaded
+      // canonical dataset
+      fs::path target = fs::path(dir_path) / fs::path(nested_path);
       std::error_code ec;
-      auto file_size = entry.file_size(ec);
-      if (ec)
-        continue;
-      int64_t size_mb = static_cast<int64_t>(file_size / (1024 * 1024));
-      result->names.push_back(name);
-      result->paths.push_back(full_path);
-      result->sizes.push_back(size_mb);
+      if (!fs::exists(target, ec)) {
+        throw InvalidInputException(
+            string("Requested nested path does not exist: ") +
+            target.string());
+      }
+      if (fs::is_directory(target, ec)) {
+        if (recursive) {
+          for (const auto &entry : fs::recursive_directory_iterator(target)) {
+            if (!entry.is_regular_file()) {
+              continue;
+            }
+            auto name = entry.path().filename().string();
+            if (name == ".downloaded") {
+              continue;
+            }
+            auto full_path = entry.path().string();
+            fs::path rel = fs::path(full_path).lexically_relative(dataset_root);
+            string rel_str;
+            if (rel.empty()) {
+              rel_str = canonical_ds + string("/") + entry.path().filename().string();
+            } else {
+              rel_str = canonical_ds + string("/") + rel.string();
+            }
+            std::error_code ec2;
+            auto file_size = entry.file_size(ec2);
+            if (ec2)
+              continue;
+            int64_t size_mb = static_cast<int64_t>(file_size / (1024 * 1024));
+            result->names.push_back(name);
+            result->paths.push_back(rel_str);
+            result->sizes.push_back(size_mb);
+          }
+        } else {
+          for (const auto &entry : fs::directory_iterator(target)) {
+            if (!entry.is_regular_file()) {
+              continue;
+            }
+            auto name = entry.path().filename().string();
+            if (name == ".downloaded") {
+              continue;
+            }
+            auto full_path = entry.path().string();
+            fs::path rel = fs::path(full_path).lexically_relative(dataset_root);
+            string rel_str;
+            if (rel.empty()) {
+              rel_str = canonical_ds + string("/") + entry.path().filename().string();
+            } else {
+              rel_str = canonical_ds + string("/") + rel.string();
+            }
+            std::error_code ec2;
+            auto file_size = entry.file_size(ec2);
+            if (ec2)
+              continue;
+            int64_t size_mb = static_cast<int64_t>(file_size / (1024 * 1024));
+            result->names.push_back(name);
+            result->paths.push_back(rel_str);
+            result->sizes.push_back(size_mb);
+          }
+        }
+      } else {
+        // Target is a file: return that single file
+        auto name = target.filename().string();
+        std::error_code ec2;
+        auto file_size = fs::file_size(target, ec2);
+        if (!ec2) {
+          int64_t size_mb = static_cast<int64_t>(file_size / (1024 * 1024));
+          // compute relative path
+          fs::path rel = target.lexically_relative(dataset_root);
+          string rel_str;
+          if (rel.empty()) {
+            rel_str = canonical_ds + string("/") + target.filename().string();
+          } else {
+            rel_str =
canonical_ds + string("/") + rel.string(); + } + result->names.push_back(name); + result->paths.push_back(rel_str); + result->sizes.push_back(size_mb); + } + } } } catch (const std::exception &e) { throw InvalidInputException(string("Failed to enumerate files: ") + @@ -738,6 +927,12 @@ static void LoadInternal(ExtensionLoader &loader) { GaggleLsBind, GaggleLsInitGlobal, nullptr); loader.RegisterFunction(ls_fun); + // Also register gaggle_ls(dataset_path, recursive BOOLEAN) + TableFunction ls_fun_recursive( + "gaggle_ls", {LogicalType::VARCHAR, LogicalType::BOOLEAN}, + GaggleLsFunction, GaggleLsBind, GaggleLsInitGlobal, nullptr); + loader.RegisterFunction(ls_fun_recursive); + // Register replacement scan for "kaggle:" prefix via DBConfig auto &db = loader.GetDatabaseInstance(); auto &config = DBConfig::GetConfig(db); From 7e2e70140d539a66e884f2e9c4d045777c9c9edc Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Sat, 15 Nov 2025 19:17:30 +0100 Subject: [PATCH 3/3] Update external dependencies --- external/duckdb | 2 +- external/extension-ci-tools | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/external/duckdb b/external/duckdb index 4c2573a..8521dd4 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 4c2573afaec92b2a7b530e22d9d5e2d98cbfc9d4 +Subproject commit 8521dd47d7f05f454b84f7cfcad6b9e23ec061b5 diff --git a/external/extension-ci-tools b/external/extension-ci-tools index 3628eea..aac9640 160000 --- a/external/extension-ci-tools +++ b/external/extension-ci-tools @@ -1 +1 @@ -Subproject commit 3628eea3b7483b75b120a16cae158fe980ca7ea0 +Subproject commit aac9640615e51d6e7e8b72d4bf023703cfd8e479
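Taken together, the series leaves `gaggle_ls` with two registered signatures and a stable `owner/dataset/...` path convention. A minimal end-to-end sketch, assuming a build with the `0.1.0-alpha.4` extension from these patches loaded and using the same datasets as the patched examples:

```sql
-- Recursive listing via the new optional BOOLEAN argument; `path` values
-- come back as 'owner/dataset/<relative path>' cache identifiers, not
-- absolute filesystem paths.
select name, size, path
from gaggle_ls('habedi/flickr-8k-dataset-clean', true)
limit 10;

-- Resolve a listed file to an absolute path and read it; a prepared
-- statement is used because DuckDB doesn't allow subqueries in function
-- arguments.
prepare rp as select * from read_parquet(?) limit 10;
execute rp(gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'));

-- The replacement scan covers the same ground declaratively: anything after
-- the second slash in a 'kaggle:' URL is treated as a file, directory, or
-- glob inside the canonical owner/dataset.
select count(*)
from 'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet';
```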