Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
805511f
feat: move cache to shared level and add expunge function and request…
DataLabTechTV Jul 15, 2025
b7c5fd5
chore(deps): add requests cache dep
DataLabTechTV Jul 15, 2025
f412b51
feat: CLI command to expunge/clean cache
DataLabTechTV Jul 15, 2025
8e3c6b8
feat: split ingestion into multiple modules and add dataset templates
DataLabTechTV Jul 15, 2025
bf09fb1
feat: DataCite working downloader
DataLabTechTV Jul 15, 2025
5e2ba51
chore(deps): add tqdm dep for tracking download progress
DataLabTechTV Jul 15, 2025
039e08a
feat: skip cache for downloads and display progress bar
DataLabTechTV Jul 15, 2025
6e2cb9c
feat: complete dataset template for The Atlas of Economic Complexity
DataLabTechTV Jul 15, 2025
39943df
perf: increase chunk size and make sure temp files are cleaned even w…
DataLabTechTV Jul 15, 2025
62c7dff
chore: re-enable requests-cache with streaming
DataLabTechTV Jul 15, 2025
6238484
chore(deps): add humanize to print byte sizes in human-readable format
DataLabTechTV Jul 15, 2025
436391b
feat: support for cache usage statistic printing
DataLabTechTV Jul 15, 2025
f23d777
tests: schema and basic tests for taoec dataset
DataLabTechTV Jul 15, 2025
6e082e3
feat: stage transformations for TAoEC
DataLabTechTV Jul 15, 2025
40a79d7
fix: types and missing null strings
DataLabTechTV Jul 16, 2025
40dee56
chore: configs for analytics mart
DataLabTechTV Jul 16, 2025
c4daafb
fix: add missing schema configs for new econ comp models
DataLabTechTV Jul 16, 2025
c579742
feat: aggregated view for 2020-2023 trade covering recent years
DataLabTechTV Jul 16, 2025
af044f8
feat: rename 2020-2023 to latest 3y and add schema for country-countr…
DataLabTechTV Jul 16, 2025
8599498
feat: logic changed to account for the last 3 years in data instead o…
DataLabTechTV Jul 16, 2025
74f2f4f
chore(deps): bump up kuzu to 0.11.0
DataLabTechTV Jul 16, 2025
499bac0
feat: add model selection CLI option to test cmd
DataLabTechTV Jul 16, 2025
43efc61
fix: remove not null tests where they were not required
DataLabTechTV Jul 16, 2025
d1ef5ce
test: ensure ESI is within a 0..1 range
DataLabTechTV Jul 16, 2025
0ca0346
feat: country-country ESI calculation
DataLabTechTV Jul 16, 2025
635dc72
fix: required aggregation per country and product, disregarding partner
DataLabTechTV Jul 16, 2025
1f2f867
fix: remove repeated country pairs in reverse order
DataLabTechTV Jul 16, 2025
09c3ac7
feat: ingest country classification data
DataLabTechTV Jul 16, 2025
3356e4f
feat: select top 5% ESI country-country relations for edges
DataLabTechTV Jul 16, 2025
cca6d5c
feat: country and product nodes, product-country export and import ed…
DataLabTechTV Jul 16, 2025
3e34e80
chore: add env var for econ comp graph db
DataLabTechTV Jul 16, 2025
0e797ae
chore: rename KuzuDBs to match new single-file format
DataLabTechTV Jul 16, 2025
36f6cf7
chore: upgrade explorer script to work with kuzu 0.11.0
DataLabTechTV Jul 16, 2025
93396df
feat: support for loading econ_comp graph
DataLabTechTV Jul 16, 2025
918f23a
fix: remove inexistent property
DataLabTechTV Jul 16, 2025
398ba70
fix: edges needed to be defined based on node_id, which required thes…
DataLabTechTV Jul 16, 2025
2d26651
fix: remove product parent relationship, as there is no multi-level d…
DataLabTechTV Jul 16, 2025
aa65fcd
docs: fill-in the missing schema models for analytics, and econ_comp …
DataLabTechTV Jul 16, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ S3_BUCKET=lakehouse
S3_INGEST_PREFIX=raw
S3_STAGE_PREFIX=stage
S3_GRAPHS_MART_PREFIX=marts/graphs
S3_ANALYTICS_MART_PREFIX=marts/analytics
S3_EXPORTS_PREFIX=exports
S3_BACKUPS_PREFIX=backups

Expand All @@ -30,11 +31,13 @@ S3_BACKUPS_PREFIX=backups
ENGINE_DB=engine.duckdb
STAGE_DB=stage.sqlite
GRAPHS_MART_DB=marts/graphs.sqlite
ANALYTICS_MART_DB=marts/analytics.sqlite

# KùzuDB configurations
# =====================

MUSIC_TASTE_GRAPH_DB=graphs/music_taste
MUSIC_TASTE_GRAPH_DB=graphs/music_taste.kz
ECON_COMP_GRAPH_DB=graphs/econ_comp.kz

# Ollama configurations
# =====================
Expand Down
52 changes: 48 additions & 4 deletions dlctl/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from export.cli import export
from graph.cli import graph
from ingest.cli import ingest
from shared.cache import cache_usage, expunge_cache
from shared.settings import LOCAL_DIR, MART_DB_VARS, env
from shared.storage import Storage, StoragePrefix

Expand Down Expand Up @@ -185,7 +186,7 @@ def backup_ls(include_all: bool):
help="Model name to transform (can be used multiple times)",
)
@click.option("--debug", is_flag=True, help="Run dbt with the debug flag")
def transform(models: tuple[str], debug: bool):
def transform(models: Optional[tuple[str, ...]], debug: bool):
dbt_handler = DBTHandler(debug=debug)
dbt_handler.run(models)

Expand All @@ -195,9 +196,18 @@ def transform(models: tuple[str], debug: bool):


@dlctl.command(name="test", help="Run data tests")
def test():
dbt_handler = DBTHandler()
dbt_handler.test()
@click.option(
    "--model",
    "-m",
    "models",
    multiple=True,
    type=click.STRING,
    help="Model name to test (can be used multiple times)",
)
@click.option("--debug", is_flag=True, help="Run dbt with the debug flag")
def test(models: Optional[tuple[str, ...]], debug: bool):
    """Run dbt data tests, optionally limited to the selected models.

    Args:
        models: Model names collected from repeated ``--model``/``-m`` options.
        debug: Whether to build the dbt handler with its debug flag enabled.
    """
    dbt_handler = DBTHandler(debug=debug)
    dbt_handler.test(models)


# Documentation
Expand Down Expand Up @@ -243,5 +253,39 @@ def generate_init_sql(path: str):
T.generate_init_sql(path)


# Cache
# =====


@dlctl.group(help="Manage cache (requests, etc.)")
def cache():
    """Command group that the cache subcommands attach to."""


@cache.command(name="clean", help="Expunge cache")
@click.option(
    "-ns",
    "--namespace",
    type=click.Choice(["requests", "huggingface"]),
    help="Limit cache cleaning to a namespace",
)
@click.option(
    "-n",
    "--name",
    type=click.STRING,
    help="Limit cache cleaning to a specific name (namespace required as well)",
)
def cache_clean(namespace: Optional[str], name: Optional[str]):
    """Expunge the cache, optionally narrowed by namespace and name.

    Args:
        namespace: Namespace to restrict cleaning to, or ``None`` for no
            restriction.
        name: Specific name to restrict cleaning to; only valid together
            with ``namespace``.

    Raises:
        click.UsageError: If ``name`` is given without ``namespace``.
    """
    # A name on its own is ambiguous, so reject it before touching the cache.
    name_without_namespace = name is not None and namespace is None
    if name_without_namespace:
        raise click.UsageError("name requires that namespace is set")

    expunge_cache(namespace, name)


@cache.command(name="df", help="Calculate cache usage statistics")
def cache_df():
    """Delegate to ``cache_usage`` to report cache usage statistics."""
    cache_usage()


if __name__ == "__main__":
dlctl()
18 changes: 15 additions & 3 deletions dlctl/dbt_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def mkdirs(self):
def deps(self):
    """Invoke the dbt ``deps`` command with the shared project arguments."""
    command = ["deps"] + self.PROJECT_ARGS
    self.dbt.invoke(command)

def run(self, models: Optional[tuple[str]] = None):
def run(self, models: Optional[tuple[str, ...]] = None):
args = ["run"]
args += self.PROJECT_ARGS

Expand All @@ -71,8 +71,20 @@ def run(self, models: Optional[tuple[str]] = None):
else:
log.warning("{}: {}", r.node.name, r.status)

def test(self):
self.dbt.invoke(["test"] + self.PROJECT_ARGS)
def test(self, models: Optional[tuple[str, ...]] = None):
    """Invoke ``dbt test``, optionally restricted to the given models.

    Args:
        models: Model names to select. When ``None`` or empty, no
            ``--select`` argument is passed to dbt.
    """
    args = ["test"]
    args += self.PROJECT_ARGS

    if self.debug:
        args.append("--debug")

    if models:
        # dbt reads a comma-separated selector as an INTERSECTION and a
        # space-separated one as a UNION. Repeating the model option is
        # meant to test every listed model, so join with spaces.
        args += ["--select", " ".join(models)]

    self.dbt.invoke(args)

def docs_generate(self):
self.dbt.invoke(["docs", "generate"] + self.PROJECT_ARGS)
Expand Down
9 changes: 8 additions & 1 deletion graph/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,14 @@ def load(schema: str, overwrite: bool):

try:
ops = KuzuOps(schema, overwrite=overwrite)
ops.load_music_graph(s3_path)

match schema:
case "music_taste":
ops.load_music_taste(s3_path)
case "econ_comp":
ops.load_econ_comp(s3_path)
case _:
raise click.UsageError(f"{schema}: graph unsupported")
except Exception as e:
log.error(e)

Expand Down
Loading