4 changes: 4 additions & 0 deletions .github/workflows/dist_pipeline.yml
@@ -4,6 +4,10 @@ on:
   pull_request:
     branches:
       - main
+    paths-ignore:
+      - '**.md'
+      - 'docs/**'
+      - '.github/**'
   push:
     tags:
       - 'v*'
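The same `paths-ignore` block is added to all three workflow files, so pull requests that only touch Markdown files, `docs/`, or the workflow definitions no longer trigger CI. For context, a minimal sketch of how the resulting trigger reads (a hypothetical standalone workflow, not part of this PR):

```yaml
# Hypothetical minimal workflow illustrating the trigger used above.
# A pull request against main that only changes Markdown files, docs/,
# or .github/ will not start this job.
name: example-ci
on:
  pull_request:
    branches:
      - main
    paths-ignore:
      - '**.md'
      - 'docs/**'
      - '.github/**'
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: echo "runs only for non-docs changes"
```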
4 changes: 4 additions & 0 deletions .github/workflows/lints.yml
@@ -5,6 +5,10 @@ on:
   pull_request:
     branches:
       - main
+    paths-ignore:
+      - '**.md'
+      - 'docs/**'
+      - '.github/**'
 
 permissions:
   contents: read
4 changes: 4 additions & 0 deletions .github/workflows/tests.yml
@@ -5,6 +5,10 @@ on:
   pull_request:
     branches:
       - main
+    paths-ignore:
+      - '**.md'
+      - 'docs/**'
+      - '.github/**'
   push:
     branches:
       - main
13 changes: 6 additions & 7 deletions README.md
@@ -30,7 +30,7 @@ updates, etc.
 This workflow can quickly become complex, especially when working with multiple datasets or when datasets are updated
 frequently.
 Gaggle tries to help simplify this process by hiding the complexity and letting you work with datasets directly inside
-DuckDB that allow you to run fast analytical queries on the data.
+DuckDB that allows you to run fast analytical queries on the data.
 
 In essence, Gaggle makes DuckDB into a SQL-enabled frontend for Kaggle datasets.
 
@@ -97,24 +97,23 @@ make release
 -- Get extension version
 select gaggle_version();
 
--- List files in the dataset
+-- List files in the dataset (recursively)
 -- (Note that if the dataset is not downloaded, it will be downloaded and cached)
 select *
-from gaggle_ls('habedi/flickr-8k-dataset-clean') limit 5;
+from gaggle_ls('habedi/flickr-8k-dataset-clean', true) limit 5;
 
--- Read a Parquet file from local cache using a prepared statement
--- (DuckDB doesn't allow the use of subqueries in function arguments, so we use a prepared statement)
+-- Read a Parquet file from the local cache using a prepared statement
 prepare rp as select * from read_parquet(?) limit 10;
 execute rp(gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'));
 
 -- Alternatively, we can use a replacement scan to read directly via `kaggle:` prefix
 select count(*)
 from 'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet';
 
--- Optionally, we check cache info
+-- Then, we check cache info
 select gaggle_cache_info();
 
--- Check if cached dataset is current (is newest version?)
+-- Check if cached dataset is current (is the newest version?)
 select gaggle_is_current('habedi/flickr-8k-dataset-clean');
 ```

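The walkthrough above ends by checking whether the cached copy is current; `gaggle_version_info` (documented in the function table below) exposes the underlying version details as JSON. A small sketch, assuming the same dataset:

```sql
-- Sketch: inspect version details for a cached dataset.
-- gaggle_version_info returns JSON with cached_version, latest_version,
-- is_current, and is_cached (see the function table in docs/README.md).
select gaggle_version_info('habedi/flickr-8k-dataset-clean') as version_info;

-- Pull out a single field with DuckDB's built-in JSON functions.
select json_extract_string(
           gaggle_version_info('habedi/flickr-8k-dataset-clean'),
           '$.is_current'
       ) as is_current;
```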
12 changes: 10 additions & 2 deletions docs/README.md
@@ -17,7 +17,7 @@ The table below includes the information about all SQL functions exposed by Gagg
 | 11 | `gaggle_version_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns version info: `cached_version`, `latest_version`, `is_current`, `is_cached`. |
 | 12 | `gaggle_json_each(json VARCHAR)` | `VARCHAR` | Expands a JSON object into newline-delimited JSON rows with fields: `key`, `value`, `type`, `path`. Users normally shouldn't use this function. |
 | 13 | `gaggle_file_path(dataset_path VARCHAR, filename VARCHAR)` | `VARCHAR` | Resolves a specific file's local path inside a downloaded dataset. |
-| 14 | `gaggle_ls(dataset_path VARCHAR)` | `TABLE(name VARCHAR, size BIGINT, path VARCHAR)` | Lists files (non-recursively) in the dataset's local directory; `size` is in MB. |
+| 14 | `gaggle_ls(dataset_path VARCHAR[, recursive BOOLEAN])` | `TABLE(name VARCHAR, size BIGINT, path VARCHAR)` | Lists files in the dataset's local directory; non-recursive by default. When `recursive=true`, it walks subdirectories. `path` values are returned as `owner/dataset/<relative-path>` (not an absolute filesystem path); `size` is in MB. |
 
 > [!NOTE]
 > * The `gaggle_file_path` function will retrieve and cache the file if it is not already downloaded; set
@@ -65,14 +65,22 @@ select gaggle_info('uciml/iris') as dataset_metadata;
 #### Reading Data
 
 ```sql
--- List files as a table
+-- List files as a table (non-recursive)
 select *
 from gaggle_ls('uciml/iris') limit 5;
 
+-- List files as a table (recursive)
+select *
+from gaggle_ls('suganthidevasagayam/social-media-post-of-postpartum-depression', true) limit 10;
+
 -- List files as a JSON array
 select to_json(list(struct_pack(name := name, size := size, path := path))) as files_json
 from gaggle_ls('uciml/iris');
 
+-- Note: returned `path` values are in the form 'owner/dataset/...',
+-- which are suitable for use with replacement scans or as identifiers inside the cache;
+-- to get an absolute filesystem path use `gaggle_file_path(owner_dataset, relative_path)`.
+
 -- Resolve a file path and read it via a prepared statement
 prepare rp as select * from read_parquet(?) limit 10;
 execute rp(gaggle_file_path('owner/dataset', 'file.parquet'));
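Because `gaggle_ls` returns relative `owner/dataset/...` paths while `gaggle_file_path` resolves absolute ones, the two can be combined to map every file in a dataset to its on-disk location. A sketch, assuming `gaggle_file_path` can be applied per row like any scalar function:

```sql
-- Sketch: resolve the absolute local path for every file in a dataset
-- by applying gaggle_file_path to each row of gaggle_ls output.
-- (Note: gaggle_file_path will download and cache a file that is not
-- yet present, per the note in docs/README.md.)
select name,
       gaggle_file_path('uciml/iris', name) as local_path
from gaggle_ls('uciml/iris');
```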
5 changes: 5 additions & 0 deletions docs/examples/e1_core_functionality.sql
@@ -30,6 +30,7 @@ select gaggle_download('uciml/iris') as download_path;
 
 -- Section 5: list files (JSON)
 select '## list files (json)';
+-- Note: `path` values returned by gaggle_ls are of the form 'owner/dataset/<relative-path>' (not absolute filesystem paths)
 select to_json(
     list(struct_pack(name := name, size := size, path := path))
 ) as files_json
@@ -39,6 +40,10 @@ from gaggle_ls('uciml/iris');
 select '## list files (table)';
 select * from gaggle_ls('uciml/iris') limit 5;
 
+-- Recursive listing example (walk subdirectories)
+select '## recursive listing example';
+select * from gaggle_ls('suganthidevasagayam/social-media-post-of-postpartum-depression', true) limit 10;
+
 -- Section 6: get dataset metadata
 select '## get dataset metadata';
 select gaggle_info('uciml/iris') as dataset_metadata;
5 changes: 5 additions & 0 deletions docs/examples/e2_advanced_features.sql
@@ -14,13 +14,18 @@ execute rp(gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'
 
 -- Section 2: list and process multiple files
 select '## list and process dataset files (json and table)';
+-- Note: `path` values returned by gaggle_ls are of the form 'owner/dataset/<relative-path>' (not absolute filesystem paths)
 with files as (
     select to_json(list(struct_pack(name := name, size := size, path := path))) as files_json
     from gaggle_ls('habedi/flickr-8k-dataset-clean')
 )
 select files_json from files;
 select * from gaggle_ls('habedi/flickr-8k-dataset-clean') limit 5;
 
+-- Recursive listing (example)
+select '## recursive listing example for flickr dataset';
+select * from gaggle_ls('habedi/flickr-8k-dataset-clean', true) limit 10;
+
 -- Section 2b: use replacement scan for direct reads via `kaggle:` URLs
 select '## Replacement scan - direct reads via `kaggle:`';
 -- Single file read
2 changes: 1 addition & 1 deletion external/duckdb
Submodule duckdb updated 1068 files
2 changes: 1 addition & 1 deletion gaggle/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gaggle"
-version = "0.1.0-alpha.3"
+version = "0.1.0-alpha.4"
 edition = "2021"
 publish = false
 