diff --git a/.env.example b/.env.example index f775df1f..d5613e21 100644 --- a/.env.example +++ b/.env.example @@ -18,6 +18,7 @@ S3_BUCKET=lakehouse S3_INGEST_PREFIX=raw S3_STAGE_PREFIX=stage S3_GRAPHS_MART_PREFIX=marts/graphs +S3_ANALYTICS_MART_PREFIX=marts/analytics S3_EXPORTS_PREFIX=exports S3_BACKUPS_PREFIX=backups @@ -30,11 +31,13 @@ S3_BACKUPS_PREFIX=backups ENGINE_DB=engine.duckdb STAGE_DB=stage.sqlite GRAPHS_MART_DB=marts/graphs.sqlite +ANALYTICS_MART_DB=marts/analytics.sqlite # KùzuDB configurations # ===================== -MUSIC_TASTE_GRAPH_DB=graphs/music_taste +MUSIC_TASTE_GRAPH_DB=graphs/music_taste.kz +ECON_COMP_GRAPH_DB=graphs/econ_comp.kz # Ollama configurations # ===================== diff --git a/dlctl/cli.py b/dlctl/cli.py index c791a837..8b01021a 100644 --- a/dlctl/cli.py +++ b/dlctl/cli.py @@ -13,6 +13,7 @@ from export.cli import export from graph.cli import graph from ingest.cli import ingest +from shared.cache import cache_usage, expunge_cache from shared.settings import LOCAL_DIR, MART_DB_VARS, env from shared.storage import Storage, StoragePrefix @@ -185,7 +186,7 @@ def backup_ls(include_all: bool): help="Model name to transform (can be used multiple times)", ) @click.option("--debug", is_flag=True, help="Run dbt with the debug flag") -def transform(models: tuple[str], debug: bool): +def transform(models: Optional[tuple[str, ...]], debug: bool): dbt_handler = DBTHandler(debug=debug) dbt_handler.run(models) @@ -195,9 +196,18 @@ def transform(models: tuple[str], debug: bool): @dlctl.command(name="test", help="Run data tests") -def test(): - dbt_handler = DBTHandler() - dbt_handler.test() +@click.option( + "--model", + "-m", + "models", + multiple=True, + type=click.STRING, + help="Model name to test (can be used multiple times)", +) +@click.option("--debug", is_flag=True, help="Run dbt with the debug flag") +def test(models: Optional[tuple[str, ...]], debug: bool): + dbt_handler = DBTHandler(debug=debug) + dbt_handler.test(models) # Documentation @@ -243,5 +253,39 @@ def generate_init_sql(path: str): T.generate_init_sql(path) +# Cache +# ===== + + +@dlctl.group(help="Manage cache (requests, etc.)") +def cache(): + pass + + +@cache.command(name="clean", help="Expunge cache") +@click.option( + "-ns", + "--namespace", + type=click.Choice(["requests", "huggingface"]), + help="Limit cache cleaning to a namespace", +) +@click.option( + "-n", + "--name", + type=click.STRING, + help="Limit cache cleaning to a specific name (namespace required as well)", +) +def cache_clean(namespace: Optional[str], name: Optional[str]): + if namespace is None and name is not None: + raise click.UsageError("name requires that namespace is set") + + expunge_cache(namespace, name) + + +@cache.command(name="df", help="Calculate cache usage statistics") +def cache_df(): + cache_usage() + + if __name__ == "__main__": dlctl() diff --git a/dlctl/dbt_handler.py b/dlctl/dbt_handler.py index 2ed332f2..c06709f4 100644 --- a/dlctl/dbt_handler.py +++ b/dlctl/dbt_handler.py @@ -46,7 +46,7 @@ def mkdirs(self): def deps(self): self.dbt.invoke(["deps"] + self.PROJECT_ARGS) - def run(self, models: Optional[tuple[str]] = None): + def run(self, models: Optional[tuple[str, ...]] = None): args = ["run"] args += self.PROJECT_ARGS @@ -71,8 +71,20 @@ def run(self, models: Optional[tuple[str]] = None): else: log.warning("{}: {}", r.node.name, r.status) - def test(self): - self.dbt.invoke(["test"] + self.PROJECT_ARGS) + def test(self, models: Optional[tuple[str, ...]] = None): + args = ["test"] + args += 
self.PROJECT_ARGS + + if self.debug: + args += ["--debug"] + + if models is not None and len(models) > 0: + args += [ + "--select", + ",".join(f"{model}" for model in models), + ] + + self.dbt.invoke(args) def docs_generate(self): self.dbt.invoke(["docs", "generate"] + self.PROJECT_ARGS) diff --git a/graph/cli.py b/graph/cli.py index c5bd2a99..f988ebf7 100644 --- a/graph/cli.py +++ b/graph/cli.py @@ -43,7 +43,14 @@ def load(schema: str, overwrite: bool): try: ops = KuzuOps(schema, overwrite=overwrite) - ops.load_music_graph(s3_path) + + match schema: + case "music_taste": + ops.load_music_taste(s3_path) + case "econ_comp": + ops.load_econ_comp(s3_path) + case _: + raise click.UsageError(f"{schema}: graph unsupported") except Exception as e: log.error(e) diff --git a/graph/ops.py b/graph/ops.py index 6b16ee30..9e495bea 100644 --- a/graph/ops.py +++ b/graph/ops.py @@ -28,17 +28,30 @@ def __init__(self, schema: str, overwrite: bool = False): if os.path.exists(db_path): if overwrite: log.warning(f"Overwriting database: {db_path}") - shutil.rmtree(db_path) + if os.path.isdir(db_path): + shutil.rmtree(db_path) + elif os.path.isfile(db_path): + os.unlink(db_path) db = kuzu.Database(db_path) self.conn = kuzu.Connection(db) self.storage = Storage(prefix=StoragePrefix.EXPORTS) - def _create_music_graph_schema(self): + def _copy_from_s3(self, s3_path: str, query: str, path_var="path"): + with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp: + self.storage.download_file(s3_path, tmp.name) + query = Template(query).substitute({path_var: tmp.name}) + log.debug("Running query: {}", query) + self.conn.execute(query) + + # Graph: music_taste + # ================== + + def _create_music_taste_schema(self): # Nodes # ===== - log.info("Creating music_graph schema for User nodes") + log.info("Creating music_taste graph schema for User nodes") self.conn.execute( """ @@ -52,7 +65,7 @@ def _create_music_graph_schema(self): """ ) - log.info("Creating music_graph schema for Genre nodes") + log.info("Creating music_taste graph schema for Genre nodes") self.conn.execute( """ @@ -64,7 +77,7 @@ def _create_music_graph_schema(self): """ ) - log.info("Creating music_graph schema for Track nodes") + log.info("Creating music_taste graph schema for Track nodes") self.conn.execute( """ @@ -82,13 +95,13 @@ def _create_music_graph_schema(self): # Edges # ===== - log.info("Creating music_graph schema for Friend edges") + log.info("Creating music_taste graph schema for Friend edges") self.conn.execute("CREATE REL TABLE Friend(FROM User TO User, MANY_MANY)") - log.info("Creating music_graph schema for Likes edges") + log.info("Creating music_taste graph schema for Likes edges") self.conn.execute("CREATE REL TABLE Likes(FROM User TO Genre, MANY_MANY)") - log.info("Creating music_graph schema for ListenedTo edges") + log.info("Creating music_taste graph schema for ListenedTo edges") self.conn.execute( """ CREATE REL TABLE ListenedTo( @@ -99,42 +112,35 @@ def _create_music_graph_schema(self): """ ) - log.info("Creating music_graph schema for Tagged edges") + log.info("Creating music_taste graph schema for Tagged edges") self.conn.execute("CREATE REL TABLE Tagged(FROM Track TO Genre, MANY_MANY)") - def _copy_from_s3(self, s3_path: str, query: str, path_var="path"): - with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp: - self.storage.download_file(s3_path, tmp.name) - query = Template(query).substitute({path_var: tmp.name}) - log.debug("Running query: {}", query) - self.conn.execute(query) - - def 
_import_music_graph(self, s3_path: str): + def _import_music_taste(self, s3_path: str): # Nodes # ===== - log.info("Importing music_graph DSN User nodes") + log.info("Importing music_taste DSN User nodes") self._copy_from_s3( f"{s3_path}/nodes/dsn_nodes_users.parquet", "COPY User(node_id, user_id, country, source) FROM '$path'", ) - log.info("Importing music_graph MSDSL User nodes") + log.info("Importing music_taste MSDSL User nodes") self._copy_from_s3( f"{s3_path}/nodes/msdsl_nodes_users.parquet", "COPY User(node_id, user_id, source) FROM '$path'", ) - log.info("Importing music_graph MSDSL Track nodes") + log.info("Importing music_taste MSDSL Track nodes") self._copy_from_s3( f"{s3_path}/nodes/msdsl_nodes_tracks.parquet", "COPY Track(node_id, track_id, name, artist, year) FROM '$path'", ) - log.info("Importing music_graph Genre nodes") + log.info("Importing music_taste Genre nodes") self._copy_from_s3( f"{s3_path}/nodes/nodes_genres.parquet", @@ -144,45 +150,203 @@ def _import_music_graph(self, s3_path: str): # Edges # ===== - log.info("Importing music_graph DSN user-user friend edges") + log.info("Importing music_taste DSN user-user friend edges") self._copy_from_s3( f"{s3_path}/edges/dsn_edges_friendships.parquet", "COPY Friend FROM '$path'", ) - log.info("Importing music_graph DSN user-genre edges") + log.info("Importing music_taste DSN user-genre edges") self._copy_from_s3( f"{s3_path}/edges/dsn_edges_user_genres.parquet", "COPY Likes FROM '$path'", ) - log.info("Importing music_graph MSDSL user-tracks edges") + log.info("Importing music_taste MSDSL user-tracks edges") self._copy_from_s3( f"{s3_path}/edges/msdsl_edges_user_tracks.parquet", "COPY ListenedTo FROM '$path'", ) - log.info("Importing music_graph MSDSL track-genres edges") + log.info("Importing music_taste MSDSL track-genres edges") self._copy_from_s3( f"{s3_path}/edges/msdsl_edges_track_tags.parquet", "COPY Tagged FROM '$path'", ) - def load_music_graph(self, path: str): + def load_music_taste(self, path: str): + try: + self._create_music_taste_schema() + except Exception as e: + log.error("Failed to create schema for music_taste: {}", e) + return + + try: + self._import_music_taste(path) + except Exception as e: + log.error("Failed to import nodes/edges for music_taste: {}", e) + return + + # Graph: econ_comp + # ================ + + def _create_econ_comp_schema(self): + # Nodes + # ===== + + log.info("Creating econ_comp graph schema for Country nodes") + + self.conn.execute( + """ + CREATE NODE TABLE Country ( + node_id INT64, + country_id UINT16, + country_iso3_code STRING, + country_name STRING, + country_name_short STRING, + in_rankings BOOLEAN, + former_country BOOLEAN, + PRIMARY KEY (node_id) + ) + """ + ) + + log.info("Creating econ_comp graph schema for Product nodes") + + self.conn.execute( + """ + CREATE NODE TABLE Product ( + node_id INT64, + product_id UINT16, + product_hs92_code UINT32, + product_level UINT8, + product_name STRING, + product_name_short STRING, + product_id_hierarchy STRING, + show_feasibility BOOLEAN, + natural_resource BOOLEAN, + green_product BOOLEAN, + PRIMARY KEY (node_id) + ) + """ + ) + + # Edges + # ===== + + log.info("Creating econ_comp graph schema for CompetesWith edges") + self.conn.execute( + """ + CREATE REL TABLE CompetesWith( + FROM Country TO Country, + esi DOUBLE, + MANY_MANY + ) + """ + ) + + log.info("Creating econ_comp graph schema for Exports edges") + self.conn.execute( + """ + CREATE REL TABLE Exports( + FROM Country TO Product, + amount_usd INT128, + MANY_MANY + ) + 
""" + ) + + log.info("Creating econ_comp graph schema for Imports edges") + self.conn.execute( + """ + CREATE REL TABLE Imports( + FROM Product TO Country, + amount_usd INT128, + MANY_MANY + ) + """ + ) + + def _import_econ_comp(self, s3_path: str): + # Nodes + # ===== + + log.info("Importing econ_comp Country nodes") + + self._copy_from_s3( + f"{s3_path}/nodes/nodes_countries.parquet", + """ + COPY Country( + node_id, + country_id, + country_iso3_code, + country_name, + country_name_short, + in_rankings, + former_country + ) FROM '$path' + """, + ) + + log.info("Importing econ_comp Product nodes") + + self._copy_from_s3( + f"{s3_path}/nodes/nodes_products.parquet", + """ + COPY Product( + node_id, + product_id, + product_hs92_code, + product_level, + product_name, + product_name_short, + product_id_hierarchy, + show_feasibility, + natural_resource, + green_product + ) FROM '$path' + """, + ) + + # Edges + # ===== + + log.info("Importing econ_comp country-country CompetesWith edges") + + self._copy_from_s3( + f"{s3_path}/edges/edges_competes_with.parquet", + "COPY CompetesWith FROM '$path'", + ) + + log.info("Importing econ_comp country->product Exports edges") + + self._copy_from_s3( + f"{s3_path}/edges/edges_exports.parquet", + "COPY Exports FROM '$path'", + ) + + log.info("Importing econ_comp product->country Imports edges") + + self._copy_from_s3( + f"{s3_path}/edges/edges_imports.parquet", + "COPY Imports FROM '$path'", + ) + + def load_econ_comp(self, path: str): try: - self._create_music_graph_schema() + self._create_econ_comp_schema() except Exception as e: - log.error("Failed to create schema for music_graph: {}", e) + log.error("Failed to create schema for econ_comp graph: {}", e) return try: - self._import_music_graph(path) + self._import_econ_comp(path) except Exception as e: - log.error("Failed to import nodes/edges for music_graph: {}", e) + log.error("Failed to import nodes/edges for econ_comp graph: {}", e) return @property diff --git a/ingest/cli.py b/ingest/cli.py index ab9f4f05..383155d1 100644 --- a/ingest/cli.py +++ b/ingest/cli.py @@ -1,7 +1,15 @@ +from typing import Optional + import click from loguru import logger as log -from ingest.handler import handle_hugging_face, handle_kaggle, handle_standalone +from ingest.handler import ( + handle_hugging_face, + handle_kaggle, + handle_standalone, + handle_template, +) +from ingest.template.base import DatasetTemplateID from shared.storage import Storage, StoragePrefix @@ -10,7 +18,13 @@ def ingest(): pass -@ingest.command(help="Handle ingestion into a dated directory structure") +@ingest.command( + help=""" + Handle ingestion into a dated directory structure. + + Supports Kaggle and Hugging Face URLs for DATASET. 
+ """ +) @click.argument("dataset", type=click.STRING) @click.option( "-m", @@ -18,11 +32,17 @@ def ingest(): is_flag=True, help="Dataset argument will be used to create an empty directory in S3", ) -def dataset(dataset: str, manual: bool): +@click.option("-t", "--template", type=click.Choice(t.value for t in DatasetTemplateID)) +def dataset(dataset: str, manual: Optional[bool], template: Optional[str]): log.info("Running ingestion for: {}", dataset) + if manual and template: + raise click.UsageError("--manual and --template cannot be used together") + if manual: handle_standalone(dataset) + elif template: + handle_template(dataset, DatasetTemplateID(template)) elif dataset.startswith("https://www.kaggle.com/datasets/"): handle_kaggle(dataset_url=dataset) elif dataset.startswith("https://huggingface.co/datasets/"): diff --git a/ingest/fetcher.py b/ingest/fetcher.py new file mode 100644 index 00000000..5e45a844 --- /dev/null +++ b/ingest/fetcher.py @@ -0,0 +1,121 @@ +import os +import tempfile +from pathlib import Path +from urllib.parse import parse_qs, urljoin, urlsplit, urlunsplit + +import requests +from loguru import logger as log +from tqdm import tqdm + +from shared.cache import get_requests_cache_session +from shared.storage import Storage, StoragePrefix + +DATACITE_API_URL = "https://api.datacite.org/" + + +class DataCiteFetcher: + def __init__(self, s3_dir_path: str): + self.s3_dir_path = s3_dir_path + self.storage = Storage(StoragePrefix.INGEST) + self.session = get_requests_cache_session("datacite") + + def to_canonical_doi(self, doi: str) -> str: + rel_path = urlsplit(doi).path.removeprefix("/") + canonical_doi = "/".join(rel_path.split("/")[:3]) + return canonical_doi + + def get_url_from_datacite(self, canonical_doi: str) -> str: + dc_api_url = urljoin(DATACITE_API_URL, f"dois/{canonical_doi}") + + dc_api_resp = self.session.get(dc_api_url) + dc_api_resp.raise_for_status() + + ds_url = dc_api_resp.json()["data"]["attributes"]["url"] + + return ds_url + + def get_files_list(self, ds_url: str) -> list[tuple[int, str]]: + ds_url_parts = urlsplit(ds_url) + ds_persistent_id = parse_qs(ds_url_parts.query)["persistentId"][0] + + ds_api_url = urlunsplit( + ( + ds_url_parts.scheme, + ds_url_parts.netloc, + "/api/datasets/:persistentId", + f"persistentId={ds_persistent_id}", + None, + ) + ) + + ds_api_resp = self.session.get(ds_api_url) + ds_api_resp.raise_for_status() + + ds_files = ds_api_resp.json()["data"]["latestVersion"]["files"] + + files = [] + + for ds_file in ds_files: + if "dataFile" not in ds_file: + continue + + file_id = ds_file["dataFile"]["id"] + filename = ds_file["dataFile"]["filename"] + + files.append((file_id, filename)) + + return files + + def download_file(self, ds_url: str, file_id: int) -> str: + with tempfile.NamedTemporaryFile(delete=False) as tmp: + ds_url_parts = urlsplit(ds_url) + + ds_api_url = urlunsplit( + ( + ds_url_parts.scheme, + ds_url_parts.netloc, + f"/api/access/datafile/{file_id}", + None, + None, + ) + ) + + log.info("Downloading {} to {}", file_id, ds_api_url, tmp.name) + + with self.session.get(ds_api_url, stream=True) as r: + r.raise_for_status() + + total_size = int(r.headers.get("content-length", 0)) + + with ( + open(tmp.name, "wb") as fp, + tqdm( + total=total_size, + unit="B", + unit_scale=True, + unit_divisor=1024, + desc=tmp.name, + ) as pb, + ): + for chunk in r.iter_content(chunk_size=262144): + if chunk is not None: + fp.write(chunk) + pb.update(len(chunk)) + + return tmp.name + + def download(self, doi: str, target: Path): + 
log.info("Processing DOI: {}", doi) + + canonical_doi = self.to_canonical_doi(doi) + ds_url = self.get_url_from_datacite(canonical_doi) + + log.info("Getting files from {}", ds_url) + files = self.get_files_list(ds_url) + + for file_id, filename in files: + try: + tmp_path = self.download_file(ds_url, file_id) + self.storage.upload_file(tmp_path, f"{target}/{filename}") + finally: + os.unlink(tmp_path) diff --git a/ingest/handler.py b/ingest/handler.py index dcf7908f..4e89af5a 100644 --- a/ingest/handler.py +++ b/ingest/handler.py @@ -1,14 +1,14 @@ import shutil -from dataclasses import dataclass -from pathlib import Path -from urllib.parse import urlparse import git import kagglehub as kh from loguru import logger as log -from platformdirs import user_cache_dir from slugify import slugify +from ingest.fetcher import DataCiteFetcher +from ingest.parser import DatasetURL +from ingest.template.base import DataCiteTemplate, DatasetTemplate, DatasetTemplateID +from shared.cache import get_cache_dir from shared.storage import Storage, StoragePrefix @@ -24,35 +24,35 @@ def handle_standalone(dataset: str): log.error("Could not create directory {} for {}: {}", ds_name, dataset, e) -@dataclass -class DatasetURL: - author: str - slug: str - handle: str - name: str - +def handle_template(dataset: str, template_id: DatasetTemplateID): + ds_name = slugify(dataset, separator="_") + template = DatasetTemplate.from_id(template_id) -def parse_dataset_url(dataset_url: str) -> DatasetURL: - url = urlparse(dataset_url) - path = url.path.split("/") + log.info( + "{} template detected, downloading dataset: {}", + template.__class__.__name__, + ds_name, + ) - author = path[-2] - slug = path[-1] - handle = f"{author}/{slug}" - name = slugify(slug, separator="_") + try: + s = Storage(prefix=StoragePrefix.INGEST) + s3_dir_path = s.get_dir(ds_name, dated=True, upload_placeholder=True) + s.upload_manifest(ds_name, latest=s3_dir_path) + except Exception as e: + log.error("Could not create directory {} for {}: {}", ds_name, dataset, e) + return - ds_url = DatasetURL( - author=author, - slug=slug, - handle=handle, - name=name, - ) + match template: + case DataCiteTemplate(): + dcdl = DataCiteFetcher(s3_dir_path) - return ds_url + for source, target, attribution in template: + log.info("Downloading: {}", attribution.replace("\n", " ").strip()) + dcdl.download(doi=source, target=f"{s3_dir_path}/{target}") def handle_kaggle(dataset_url: str): - ds_url = parse_dataset_url(dataset_url) + ds_url = DatasetURL.parse(dataset_url) log.info("Kaggle dataset detected, downloading dataset: {}", ds_url.name) try: @@ -69,18 +69,12 @@ def handle_kaggle(dataset_url: str): ) -def get_cache_dir() -> Path: - cache_dir = Path(user_cache_dir("datalab")) - cache_dir.mkdir(parents=True, exist_ok=True) - return cache_dir - - def handle_hugging_face(dataset_url: str): - ds_url = parse_dataset_url(dataset_url) + ds_url = DatasetURL.parse(dataset_url) log.info("Hugging Face dataset detected, downloading dataset: {}", ds_url.name) try: - hf_ds_path = get_cache_dir() / ds_url.author / ds_url.slug + hf_ds_path = get_cache_dir() / "huggingface" / ds_url.author / ds_url.slug log.info("Fetching {}", dataset_url) diff --git a/ingest/parser.py b/ingest/parser.py new file mode 100644 index 00000000..30388eff --- /dev/null +++ b/ingest/parser.py @@ -0,0 +1,32 @@ +from dataclasses import dataclass +from typing import Self +from urllib.parse import urlparse + +from slugify import slugify + + +@dataclass +class DatasetURL: + author: str + slug: str + handle: 
str + name: str + + @classmethod + def parse(cls, dataset_url: str) -> Self: + url = urlparse(dataset_url) + path = url.path.split("/") + + author = path[-2] + slug = path[-1] + handle = f"{author}/{slug}" + name = slugify(slug, separator="_") + + ds_url = cls( + author=author, + slug=slug, + handle=handle, + name=name, + ) + + return ds_url diff --git a/ingest/template/atlas.py b/ingest/template/atlas.py new file mode 100644 index 00000000..cd1928f0 --- /dev/null +++ b/ingest/template/atlas.py @@ -0,0 +1,55 @@ +from ingest.template.base import DataCiteTemplate, DatasetFileMetadata + + +class TheAtlasOfEconomicComplexityTemplate(DataCiteTemplate): + template = [ + DatasetFileMetadata( + source="https://doi.org/10.7910/DVN/XTAQMC", + target="rankings/", + attribution=""" + The Growth Lab at Harvard University, 2025, "Growth Projections and Complexity Rankings", https://doi.org/10.7910/DVN/XTAQMC, Harvard Dataverse + """, + ), + DatasetFileMetadata( + source="https://doi.org/10.7910/DVN/T4CHWJ", + target="HS92/", + attribution=""" + The Growth Lab at Harvard University, 2025, "International Trade Data (HS92)", https://doi.org/10.7910/DVN/T4CHWJ, Harvard Dataverse + """, + ), + DatasetFileMetadata( + source="https://doi.org/10.7910/DVN/YAVJDF", + target="HS12/", + attribution=""" + The Growth Lab at Harvard University, 2025, "International Trade Data (HS12)", https://doi.org/10.7910/DVN/YAVJDF, Harvard Dataverse + """, + ), + DatasetFileMetadata( + source="https://doi.org/10.7910/DVN/H8SFD2", + target="SITC/", + attribution=""" + The Growth Lab at Harvard University, 2025, "International Trade Data (SITC, Rev. 2)", https://doi.org/10.7910/DVN/H8SFD2, Harvard Dataverse + """, + ), + DatasetFileMetadata( + source="https://doi.org/10.7910/DVN/NDDMSN", + target="services_unilateral/", + attribution=""" + The Growth Lab at Harvard University, 2025, "International Trade Data (Services)", https://doi.org/10.7910/DVN/NDDMSN, Harvard Dataverse + """, + ), + DatasetFileMetadata( + source="https://doi.org/10.7910/DVN/3BAL1O", + target="classifications/", + attribution=""" + The Growth Lab at Harvard University, 2025, "Classifications Data", https://doi.org/10.7910/DVN/3BAL1O, Harvard Dataverse + """, + ), + DatasetFileMetadata( + source="https://doi.org/10.7910/DVN/FCDZBN", + target="product_space/", + attribution=""" + The Growth Lab at Harvard University, 2025, "Product Space Networks", https://doi.org/10.7910/DVN/FCDZBN, Harvard Dataverse + """, + ), + ] diff --git a/ingest/template/base.py b/ingest/template/base.py new file mode 100644 index 00000000..db21f3d4 --- /dev/null +++ b/ingest/template/base.py @@ -0,0 +1,65 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Iterator, Optional, Self + + +class DatasetTemplateID(Enum): + THE_ATLAS_OF_ECONOMIC_COMPLEXITY = "atlas" + + +@dataclass +class DatasetFileMetadata: + def __init__( + self, + source: str, + target: str | Path, + attribution: Optional[str] = None, + ): + self.source = source + + if isinstance(target, Path): + self.target = target + elif isinstance(target, str): + self.target = Path(target) + else: + raise TypeError("Invalid type: target must either be a Path or a str") + + self.attribution = attribution + + def __iter__(self): + yield self.source + yield self.target + yield self.attribution + + +class DatasetTemplate(ABC): + @classmethod + def from_id(cls, template_id: DatasetTemplateID) -> Self: + """Instance child classes based on the 
DatasetTemplateID""" + match template_id: + case DatasetTemplateID.THE_ATLAS_OF_ECONOMIC_COMPLEXITY: + from ingest.template.atlas import TheAtlasOfEconomicComplexityTemplate + + return TheAtlasOfEconomicComplexityTemplate() + + def __iter__(self) -> Iterator[DatasetFileMetadata]: + """Iterate over the dataset files in the template.""" + yield from self.template + + @property + @abstractmethod + def template(self) -> list[DatasetFileMetadata]: + """ + Source and target metadata for dataset files. + + + - The source might be a filename, a URL, a table name, etc. + - The target will be a destination dir Path where the source should be dropped. + - Attribution is used to comply with legal requirements and will be printed whenever the dataset file is accessed. + """ + + +class DataCiteTemplate(DatasetTemplate): + pass diff --git a/pyproject.toml b/pyproject.toml index 5bdb19c1..a5a49133 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,9 @@ dependencies = [ "dbt-duckdb", "environs>=14.2.0", "gitpython>=3.1.44", + "humanize>=4.12.3", "kagglehub>=0.3.12", - "kuzu>=0.10.0", + "kuzu==0.11.0", "langchain>=0.3.26", "langchain-kuzu>=0.4.2", "langchain-ollama>=0.3.3", @@ -30,7 +31,9 @@ dependencies = [ "platformdirs>=4.3.8", "prompt-toolkit>=3.0.51", "python-slugify>=8.0.4", + "requests-cache>=1.2.1", "torch>=2.7.1", + "tqdm>=4.67.1", ] [dependency-groups] diff --git a/scripts/explore_graph.sh b/scripts/explore_graph.sh index cdb918e1..38afce76 100755 --- a/scripts/explore_graph.sh +++ b/scripts/explore_graph.sh @@ -1,6 +1,8 @@ #!/bin/bash -KUZUDB_EXPLORER_VERSION=0.10.0 +KUZUDB_EXPLORER_VERSION=0.11.0 +CONTAINER_NAME=datalab-kuzudb-explorer-1 +PROJECT_NAME=datalab if ! which docker >/dev/null; then echo "docker: not found" @@ -14,19 +16,19 @@ fi cleanup() { trap - SIGINT SIGTERM - echo "==> Stopping docker container for kuzudb-explorer" - docker stop kuzudb-explorer >/dev/null + echo "==> Stopping docker container for $CONTAINER_NAME" + docker stop $CONTAINER_NAME >/dev/null } container_exists() { docker ps -a --format '{{.Names}}' | - awk '$0 == "kuzudb-explorer" { found=1 } END { exit !found }' >/dev/null + awk '$0 == "'$CONTAINER_NAME'" { found=1 } END { exit !found }' >/dev/null } kuzudb_container_mode() { local mode - mode=$(docker inspect kuzudb-explorer \ + mode=$(docker inspect $CONTAINER_NAME \ --format='{{range .Config.Env}}{{println .}}{{end}}' | awk '{ if (match($0, /MODE=(.*)/, m)) print m[1] }') @@ -49,7 +51,7 @@ kuzudb_container_db_path_cmp() { new_db_path=$(readlink -f "$1") fmt='{{range .Mounts}}{{if eq .Destination "/database"}}{{.Source}}{{end}}{{end}}' - cur_db_path=$(docker inspect kuzudb-explorer --format "$fmt") + cur_db_path=$(docker inspect $CONTAINER_NAME --format "$fmt") [ "$cur_db_path" = "$new_db_path" ] } @@ -62,8 +64,8 @@ while [[ $# -gt 0 ]]; do case "$1" in --reset) if container_exists; then - echo "==> Removing existing kuzudb-explorer container" - docker rm -f kuzudb-explorer >/dev/null + echo "==> Removing existing $CONTAINER_NAME container" + docker rm -f $CONTAINER_NAME >/dev/null fi shift @@ -79,33 +81,37 @@ while [[ $# -gt 0 ]]; do done kuzudb_path=$(readlink -f "$1") +kuzudb_file=${kuzudb_path##*/} if container_exists; then current_mode=$(kuzudb_container_mode) if [ "$current_mode" != $MODE ]; then - echo "==> Removing existing $current_mode kuzudb-explorer container" - docker rm -f kuzudb-explorer >/dev/null + echo "==> Removing existing $current_mode $CONTAINER_NAME container" + docker rm -f $CONTAINER_NAME >/dev/null fi if ! 
kuzudb_container_db_path_cmp "$kuzudb_path"; then - echo "==> Removing existing kuzudb-explorer container for $kuzudb_path" - docker rm -f kuzudb-explorer >/dev/null + echo "==> Removing existing $CONTAINER_NAME container for $kuzudb_path" + docker rm -f $CONTAINER_NAME >/dev/null fi fi if container_exists; then - echo "==> Starting existing docker container for kuzudb-explorer..." - docker start kuzudb-explorer >/dev/null + echo "==> Starting existing docker container: $CONTAINER_NAME" + docker start $CONTAINER_NAME >/dev/null else - echo "==> Creating and starting docker container for kuzudb-explorer..." - docker run -d --name kuzudb-explorer \ - -p 8000:8000 -v "${kuzudb_path}:/database" \ - -e MODE=$MODE \ + echo "==> Creating and starting docker container: $CONTAINER_NAME" + docker run -d -p 8000:8000 \ + --name $CONTAINER_NAME \ + --network ${PROJECT_NAME}_default \ + --label com.docker.compose.project=$PROJECT_NAME \ + -v "$kuzudb_path:/database/$kuzudb_file" \ + -e KUZU_FILE="$kuzudb_file" -e MODE=$MODE \ kuzudb/explorer:$KUZUDB_EXPLORER_VERSION >/dev/null fi echo "==> Opening browser at http://localhost:8000..." open http://localhost:8000 -docker logs -n 4 -f kuzudb-explorer +docker logs -n 4 -f $CONTAINER_NAME diff --git a/shared/cache.py b/shared/cache.py new file mode 100644 index 00000000..ddce6b02 --- /dev/null +++ b/shared/cache.py @@ -0,0 +1,64 @@ +import shutil +from pathlib import Path +from typing import Optional + +import humanize +from loguru import logger as log +from platformdirs import user_cache_dir +from requests_cache.session import CachedSession + + +def get_cache_dir() -> Path: + cache_dir = Path(user_cache_dir("datalab")) + cache_dir.mkdir(parents=True, exist_ok=True) + return cache_dir + + +def get_requests_cache_session(name: str) -> CachedSession: + cache_dir = get_cache_dir() / "requests" / name + session = CachedSession(cache_name=cache_dir, backend="filesystem") + return session + + +def expunge_cache(namespace: Optional[str] = None, name: Optional[str] = None): + cache_dir = get_cache_dir() + + match (namespace, name): + case (None, None): + log.info("Cleaning cache completely") + shutil.rmtree(cache_dir) + case (_, None): + log.info("Cleaning cache for {}", namespace) + shutil.rmtree(cache_dir / namespace) + case (None, _): + raise ValueError("name requires namespace to be set") + case _: + log.info("Cleaning cache for {}: {}", namespace, name) + shutil.rmtree(cache_dir / namespace / name) + + +def cache_usage(): + log.info("Calculating cache usage statistics") + + cache_dir = get_cache_dir() + + total_size_bytes = 0 + byte_size_per_dir = {} + + for path in cache_dir.iterdir(): + if path.is_dir(): + dir_name = f"{path.relative_to(cache_dir)}/" + + byte_size_per_dir[dir_name] = sum( + f.stat().st_size for f in path.rglob("*") if f.is_file() + ) + + total_size_bytes += byte_size_per_dir[dir_name] + + elif path.is_file(): + total_size_bytes += path.stat().st_size + + print("Total:", humanize.naturalsize(total_size_bytes)) + + for dir_name, dir_size in byte_size_per_dir.items(): + print(f"\t{dir_name}", humanize.naturalsize(dir_size)) diff --git a/shared/storage.py b/shared/storage.py index df90477e..20c6b335 100644 --- a/shared/storage.py +++ b/shared/storage.py @@ -101,6 +101,11 @@ def get_dir( return s3_path + def upload_file(self, source_path: str, s3_target_path: str): + log.info(f"Uploading {source_path} to {s3_target_path}") + s3_target_prefix = self.from_s3_path(s3_target_path) + self.bucket.upload_file(Filename=source_path, Key=s3_target_prefix) + 
def upload_files( self, source_root: str, diff --git a/transform/dbt_project.yml b/transform/dbt_project.yml index b67b40e1..3bba80ef 100644 --- a/transform/dbt_project.yml +++ b/transform/dbt_project.yml @@ -23,9 +23,17 @@ models: +schema: dsn million_song_dataset_spotify_lastfm: +schema: msdsl + the_atlas_of_economic_complexity: + +schema: taoec marts: +materialized: table graphs: +database: graphs music_taste: +schema: music_taste + econ_comp: + +schema: econ_comp + analytics: + +database: analytics + the_atlas_of_economic_complexity: + +schema: taoec diff --git a/transform/models/marts/analytics/the_atlas_of_economic_complexity/schema.yml b/transform/models/marts/analytics/the_atlas_of_economic_complexity/schema.yml new file mode 100644 index 00000000..daeec53b --- /dev/null +++ b/transform/models/marts/analytics/the_atlas_of_economic_complexity/schema.yml @@ -0,0 +1,108 @@ +version: 2 + +models: + - name: taoec_hs92_ccp_trade_3y_latest + description: >- + Country Trade by Partner and Product (HS92), aggregated for the most recent + 3 years. + columns: + - name: country_id + description: >- + Numerical country ID (internal to Growth Labs) for the reporting country. + This matches the IDs in the countries classification table. + data_tests: + - not_null + - name: country_iso3_code + description: >- + Three digit country code (e.g., USA, PRT) for the reporting country. + data_tests: + - not_null + - name: partner_country_id + description: >- + Numerical country ID (internal to Growth Labs) for the partner country. + This matches the IDs in the countries classification table. + data_tests: + - not_null + - name: partner_iso3_code + description: >- + Three digit country code (e.g., USA, PRT) for the partner country. + data_tests: + - not_null + - name: product_id + description: >- + Numerical product ID (internal to Growth Labs). This matches the IDs in + the HS92 products classification table. + data_tests: + - not_null + - name: product_hs92_code + description: >- + HS92 6-digit product code. HS stands for Harmonized System, and it's a + classification system used in international trade to categorize goods. + - name: since_year + description: Calendar year for the start of the period (inclusive). + data_tests: + - not_null + - name: until_year + description: Calendar year for the end of the period (inclusive). + data_tests: + - not_null + - name: export_value + description: Export value in dollars (USD). + data_tests: + - not_null + - name: import_value + description: Import value in dollars (USD). + data_tests: + - not_null + + - name: taoec_cc_metrics + description: Country to Country metrics and statistics (e.g., ESI). + columns: + - name: country_id_1 + description: >- + Numerical country ID (internal to Growth Labs) for one of the countries (1). + This matches the IDs in the countries classification table. + data_tests: + - not_null + - name: country_iso3_code_1 + description: >- + Three digit country code (e.g., USA, PRT) for one of the countries (1). + data_tests: + - not_null + - name: country_id_2 + description: >- + Numerical country ID (internal to Growth Labs) for one of the countries (2). + This matches the IDs in the countries classification table. + data_tests: + - not_null + - name: country_iso3_code_2 + description: >- + Three digit country code (e.g., USA, PRT) for one of the countries (2). + data_tests: + - not_null + - name: esi + description: Export Similarity Index accounting for destination. 
+ + - name: taoec_competing_countries + description: Competing countries according to the top 5% pairs with highest ESI. + columns: + - name: country_id + description: >- + Numerical country ID (internal to Growth Labs) for a country that appears in + the top 5% pairs with highest ESI. This matches the IDs in the countries classification table. + data_tests: + - not_null + - unique + + - name: taoec_competing_countries_products + description: >- + Products traded by countries appearing the top 5% pairs with highest ESI. + columns: + - name: product_id + description: >- + Numerical product ID (internal to Growth Labs) for a product that is traded + by countries in the top 5% pairs with highest ESI. This matches the IDs in + the HS92 products classification table. + data_tests: + - not_null + - unique diff --git a/transform/models/marts/analytics/the_atlas_of_economic_complexity/taoec_cc_metrics.sql b/transform/models/marts/analytics/the_atlas_of_economic_complexity/taoec_cc_metrics.sql new file mode 100644 index 00000000..1ad70abc --- /dev/null +++ b/transform/models/marts/analytics/the_atlas_of_economic_complexity/taoec_cc_metrics.sql @@ -0,0 +1,41 @@ +{{ config(alias='cc_metrics') }} + +WITH country_exports AS ( + SELECT + country_id, + country_iso3_code, + product_id, + product_hs92_code, + sum(export_value) AS export_value + FROM + {{ ref('taoec_hs92_ccp_trade_3y_latest') }} + GROUP BY + country_id, + country_iso3_code, + product_id, + product_hs92_code +), +shares AS ( + SELECT + country_id, + country_iso3_code, + product_id, + product_hs92_code, + export_value / sum(export_value) OVER (PARTITION BY country_id) AS share + FROM country_exports +) +SELECT + a.country_id AS country_id_1, + a.country_iso3_code AS country_iso3_code_1, + b.country_id AS country_id_2, + b.country_iso3_code AS country_iso3_code_2, + sum(least(a.share, b.share)) AS esi +FROM shares a +JOIN shares b +ON a.product_id = b.product_id +WHERE a.country_id < b.country_id +GROUP BY + a.country_id, + a.country_iso3_code, + b.country_id, + b.country_iso3_code diff --git a/transform/models/marts/analytics/the_atlas_of_economic_complexity/taoec_competing_countries.sql b/transform/models/marts/analytics/the_atlas_of_economic_complexity/taoec_competing_countries.sql new file mode 100644 index 00000000..2ab2e069 --- /dev/null +++ b/transform/models/marts/analytics/the_atlas_of_economic_complexity/taoec_competing_countries.sql @@ -0,0 +1,15 @@ +{{ config(alias='competing_countries', materialized='view') }} + +WITH countries AS ( + SELECT country_id_1, country_id_2 + FROM {{ ref('taoec_cc_metrics') }} + ORDER BY esi DESC + LIMIT 5% +) +SELECT country_id_1 AS country_id +FROM countries + +UNION + +SELECT country_id_2 AS country_id +FROM countries diff --git a/transform/models/marts/analytics/the_atlas_of_economic_complexity/taoec_competing_countries_products.sql b/transform/models/marts/analytics/the_atlas_of_economic_complexity/taoec_competing_countries_products.sql new file mode 100644 index 00000000..bbb99c88 --- /dev/null +++ b/transform/models/marts/analytics/the_atlas_of_economic_complexity/taoec_competing_countries_products.sql @@ -0,0 +1,15 @@ +{{ config(alias='competing_countries_products', materialized='view') }} + +SELECT DISTINCT + product_id +FROM + {{ ref('taoec_hs92_ccp_trade_3y_latest') }} +WHERE + country_id IN ( + SELECT country_id + FROM {{ ref('taoec_competing_countries') }} + ) + OR partner_country_id IN ( + SELECT country_id + FROM {{ ref('taoec_competing_countries') }} + ) diff --git 
a/transform/models/marts/analytics/the_atlas_of_economic_complexity/taoec_hs92_ccp_trade_3y_latest.sql b/transform/models/marts/analytics/the_atlas_of_economic_complexity/taoec_hs92_ccp_trade_3y_latest.sql new file mode 100644 index 00000000..edd11735 --- /dev/null +++ b/transform/models/marts/analytics/the_atlas_of_economic_complexity/taoec_hs92_ccp_trade_3y_latest.sql @@ -0,0 +1,25 @@ +{{ config(alias='hs92_ccp_trade_3y_latest', materialized='view') }} + +SELECT + country_id, + country_iso3_code, + partner_country_id, + partner_iso3_code, + product_id, + product_hs92_code, + min(year) AS since_year, + max(year) AS until_year, + sum(export_value) AS export_value, + sum(import_value) AS import_value +FROM {{ ref('taoec_hs92_ccp_trade') }} +WHERE year >= ( + SELECT max(year) - 3 + FROM stage.taoec.hs92_ccp_trade +) +GROUP BY + country_id, + country_iso3_code, + partner_country_id, + partner_iso3_code, + product_id, + product_hs92_code diff --git a/transform/models/marts/graphs/econ_comp/edges/edges_competes_with.sql b/transform/models/marts/graphs/econ_comp/edges/edges_competes_with.sql new file mode 100644 index 00000000..fbcdd06c --- /dev/null +++ b/transform/models/marts/graphs/econ_comp/edges/edges_competes_with.sql @@ -0,0 +1,15 @@ +SELECT + sn.node_id AS source_id, + tn.node_id AS target_id, + m.esi AS esi + +FROM {{ ref('taoec_cc_metrics') }} AS m + +JOIN {{ ref('nodes_countries') }} AS sn +ON m.country_id_1 = sn.country_id + +JOIN {{ ref('nodes_countries') }} AS tn +ON m.country_id_2 = tn.country_id + +ORDER BY m.esi DESC +LIMIT 5% diff --git a/transform/models/marts/graphs/econ_comp/edges/edges_exports.sql b/transform/models/marts/graphs/econ_comp/edges/edges_exports.sql new file mode 100644 index 00000000..ce67ae44 --- /dev/null +++ b/transform/models/marts/graphs/econ_comp/edges/edges_exports.sql @@ -0,0 +1,17 @@ +SELECT + sn.node_id AS source_id, + tn.node_id AS target_id, + sum(export_value) AS amount_usd +FROM + {{ ref('taoec_hs92_ccp_trade_3y_latest') }} AS t +JOIN + {{ ref('nodes_countries') }} AS sn + ON t.country_id = sn.country_id +JOIN + {{ ref('nodes_products') }} AS tn + ON t.product_id = tn.product_id +WHERE + export_value > 0 +GROUP BY + source_id, + target_id diff --git a/transform/models/marts/graphs/econ_comp/edges/edges_imports.sql b/transform/models/marts/graphs/econ_comp/edges/edges_imports.sql new file mode 100644 index 00000000..12f63dbb --- /dev/null +++ b/transform/models/marts/graphs/econ_comp/edges/edges_imports.sql @@ -0,0 +1,17 @@ +SELECT + sn.node_id AS source_id, + tn.node_id AS target_id, + sum(import_value) AS amount_usd +FROM + {{ ref('taoec_hs92_ccp_trade_3y_latest') }} AS t +JOIN + {{ ref('nodes_products') }} AS sn + ON t.product_id = sn.product_id +JOIN + {{ ref('nodes_countries') }} AS tn + ON t.country_id = tn.country_id +WHERE + import_value > 0 +GROUP BY + source_id, + target_id diff --git a/transform/models/marts/graphs/econ_comp/edges/schema.yml b/transform/models/marts/graphs/econ_comp/edges/schema.yml new file mode 100644 index 00000000..a4eda367 --- /dev/null +++ b/transform/models/marts/graphs/econ_comp/edges/schema.yml @@ -0,0 +1,53 @@ +version: 2 + +models: + - name: edges_competes_with + description: >- + Countries that export similar products and are significantly dependent on those + products as a part of their exports share. + columns: + - name: source_id + description: Node ID for a country. + data_tests: + - not_null + - name: target_id + description: Node ID for another country. 
+ data_tests: + - not_null + - name: esi + description: >- + Export Similarity Index for the two linked countries. Only the top 5% pairs + with highest ESI are considered. + - name: edges_exports + description: Exported products by countries. + columns: + - name: source_id + description: Node ID for a country. + data_tests: + - not_null + - name: target_id + description: Node ID for a product. + data_tests: + - not_null + - name: amount_usd + description: Exported amount in US dollars. + data_tests: + - not_null + - positive_integer + + - name: edges_imports + description: Imported products by countries. + columns: + - name: source_id + description: Node ID for a product. + data_tests: + - not_null + - name: target_id + description: Node ID for a country. + data_tests: + - not_null + - name: amount_usd + description: Imported amount in US dollars. + data_tests: + - not_null + - positive_integer diff --git a/transform/models/marts/graphs/econ_comp/nodes/nodes_countries.sql b/transform/models/marts/graphs/econ_comp/nodes/nodes_countries.sql new file mode 100644 index 00000000..9b79985d --- /dev/null +++ b/transform/models/marts/graphs/econ_comp/nodes/nodes_countries.sql @@ -0,0 +1,15 @@ +SELECT + row_number() OVER () AS node_id, + country_id, + country_iso3_code, + country_name, + country_name_short, + in_rankings, + former_country +FROM + {{ ref('taoec_countries') }} +WHERE + country_id IN ( + SELECT country_id + FROM {{ ref('taoec_competing_countries') }} + ) diff --git a/transform/models/marts/graphs/econ_comp/nodes/nodes_products.sql b/transform/models/marts/graphs/econ_comp/nodes/nodes_products.sql new file mode 100644 index 00000000..e5eaf52c --- /dev/null +++ b/transform/models/marts/graphs/econ_comp/nodes/nodes_products.sql @@ -0,0 +1,23 @@ +WITH node_meta AS ( + SELECT max(node_id) + 1 AS start_node_id + FROM {{ ref('nodes_countries') }} +) +SELECT + n.start_node_id + row_number() OVER () AS node_id, + product_id, + product_hs92_code, + product_level, + product_name, + product_name_short, + product_id_hierarchy, + show_feasibility, + natural_resource, + green_product +FROM + {{ ref('taoec_hs92_products') }}, + node_meta AS n +WHERE + product_id IN ( + SELECT product_id + FROM {{ ref('taoec_competing_countries_products') }} + ) diff --git a/transform/models/marts/graphs/econ_comp/nodes/schema.yml b/transform/models/marts/graphs/econ_comp/nodes/schema.yml new file mode 100644 index 00000000..98e5854b --- /dev/null +++ b/transform/models/marts/graphs/econ_comp/nodes/schema.yml @@ -0,0 +1,95 @@ +version: 2 + +models: + - name: nodes_countries + description: Nodes for countries in the Economic Competition Network. + columns: + - name: node_id + description: Unique node ID for this node in the whole graph. + data_tests: + - not_null + - unique + - name: country_id + description: >- + Numerical country ID (internal to Growth Labs) for a country that appears in + the top 5% pairs with highest ESI. This matches the IDs in the countries classification table. + data_tests: + - not_null + - unique + - name: country_iso3_code + description: >- + Three digit country code (e.g., USA, PRT) for a country that appears in + the top 5% pairs with highest ESI. + data_tests: + - not_null + - unique + - name: country_name + description: >- + Complete official country name (e.g., United Kingdom of Great Britain and + Northern Ireland). + data_tests: + - not_null + - name: country_name_short + description: Short country name (e.g., United Kingdom). 
+ data_tests: + - not_null + - name: in_rankings + description: >- + Flag indicating whether the country is fit to appear in Growth Labs rankings. + data_tests: + - not_null + - name: former_country + description: Flag indicating a country that no longer exists. + data_tests: + - not_null + + - name: nodes_products + description: >- + Nodes for products traded by countries in the Economic Competition Network. + columns: + - name: node_id + description: Unique node ID for this node in the whole graph. + data_tests: + - not_null + - unique + - name: product_id + description: >- + Numerical product ID (internal to Growth Labs) for a product that is traded + by countries that appear in the top 5% pairs with highest ESI. + data_tests: + - not_null + - name: product_hs92_code + description: >- + HS92 product code for a product that is traded by countries that appear in + the top 5% pairs with highest ESI. HS stands for Harmonized System, and it's + a classification system used in international trade to categorize goods. + - name: product_level + description: Granularity of 1, 2, 4 or 6 digits for HS92. + data_tests: + - not_null + - name: product_name + description: >- + Categorical product name (e.g., Musical instruments; parts and accessories of + such articles). + data_tests: + - not_null + - name: product_name_short + description: Readable product name. + data_tests: + - not_null + - name: product_id_hierarchy + description: Dot-separated product ID hierarchy (e.g., 5.180.1593). + data_tests: + - not_null + - name: show_feasibility + description: >- + Reliability regarding analysis feasibility due to trade discrepancies, + natural resource requirements, etc. + data_tests: + - not_null + - name: natural_resource + description: Flags product as a natural resource. + data_tests: + - not_null + - name: green_product + description: Products associated with green energy, or environmentally friendly. diff --git a/transform/models/stage/the_atlas_of_economic_complexity/schema.yml b/transform/models/stage/the_atlas_of_economic_complexity/schema.yml new file mode 100644 index 00000000..28d0ef2b --- /dev/null +++ b/transform/models/stage/the_atlas_of_economic_complexity/schema.yml @@ -0,0 +1,124 @@ + +version: 2 + +models: + - name: taoec_hs92_ccp_trade + description: Country Trade by Partner and Product (HS92), covering only 2020-2023. + columns: + - name: country_id + description: >- + Numerical country ID (internal to Growth Labs) for the reporting country. + This matches the IDs in the countries classification table. + data_tests: + - not_null + - name: country_iso3_code + description: >- + Three digit country code (e.g., USA, PRT) for the reporting country. + data_tests: + - not_null + - name: partner_country_id + description: >- + Numerical country ID (internal to Growth Labs) for the partner country. + This matches the IDs in the countries classification table. + data_tests: + - not_null + - name: partner_iso3_code + description: >- + Three digit country code (e.g., USA, PRT) for the partner country. + data_tests: + - not_null + - name: product_id + description: Numerical product ID (internal to Growth Labs). + data_tests: + - not_null + - name: product_hs92_code + description: >- + HS92 6-digit product code. HS stands for Harmonized System, and it's a + classification system used in international trade to categorize goods. + - name: year + description: Calendar year. + data_tests: + - not_null + - name: export_value + description: Export value in dollars (USD). 
+ data_tests: + - not_null + - name: import_value + description: Import value in dollars (USD). + data_tests: + - not_null + + - name: taoec_hs92_products + columns: + - name: product_id + description: Numerical product ID (internal to Growth Labs). + data_tests: + - not_null + - name: product_hs92_code + description: >- + HS92 product code. HS stands for Harmonized System, and it's a classification + system used in international trade to categorize goods. + - name: product_level + description: Granularity of 1, 2, 4 or 6 digits for HS92. + data_tests: + - not_null + - name: product_name + description: >- + Categorical product name (e.g., Musical instruments; parts and accessories of + such articles). + data_tests: + - not_null + - name: product_name_short + description: Readable product name. + data_tests: + - not_null + - name: product_parent_id + description: >- + Numerical product ID (internal to Growth Labs) of the parent product category. + - name: product_id_hierarchy + description: Dot-separated product ID hierarchy (e.g., 5.180.1593). + data_tests: + - not_null + - name: show_feasibility + description: >- + Reliability regarding analysis feasibility due to trade discrepancies, + natural resource requirements, etc. + data_tests: + - not_null + - name: natural_resource + description: Flags product as a natural resource. + data_tests: + - not_null + - name: green_product + description: Products associated with green energy, or environmentally friendly. + + - name: taoec_countries + columns: + - name: country_id + description: Numerical country ID (internal to Growth Labs). + data_tests: + - not_null + - name: country_iso3_code + description: >- + Three digit country code (e.g., USA, PRT) for the reporting country. + data_tests: + - not_null + - name: country_name + description: >- + Complete official country name (e.g., United Kingdom of Great Britain and + Northern Ireland). + data_tests: + - not_null + - name: country_name_short + description: Short country name (e.g., United Kingdom). + data_tests: + - not_null + - name: in_rankings + description: >- + Flag indicating whether the country is fit to appear in Growth Labs rankings. + data_tests: + - not_null + - name: former_country + description: Flag indicating a country that no longer exists. 
+        data_tests:
+          - not_null
diff --git a/transform/models/stage/the_atlas_of_economic_complexity/taoec_countries.sql b/transform/models/stage/the_atlas_of_economic_complexity/taoec_countries.sql
new file mode 100644
index 00000000..70098358
--- /dev/null
+++ b/transform/models/stage/the_atlas_of_economic_complexity/taoec_countries.sql
@@ -0,0 +1,24 @@
+{{ config(alias='countries') }}
+
+SELECT
+    country_id,
+    country_iso3_code,
+    country_name,
+    country_name_short,
+    in_rankings,
+    former_country
+FROM read_csv(
+    '{{ env_var("RAW__THE_ATLAS_OF_ECONOMIC_COMPLEXITY__CLASSIFICATIONS__LOCATION_COUNTRY") }}',
+    delim = ',',
+    quote = '"',
+    escape = '"',
+    header = true,
+    columns = {
+        country_id: USMALLINT,
+        country_iso3_code: VARCHAR,
+        country_name: VARCHAR,
+        country_name_short: VARCHAR,
+        in_rankings: BOOLEAN,
+        former_country: BOOLEAN
+    }
+)
diff --git a/transform/models/stage/the_atlas_of_economic_complexity/taoec_hs92_ccp_trade.sql b/transform/models/stage/the_atlas_of_economic_complexity/taoec_hs92_ccp_trade.sql
new file mode 100644
index 00000000..da7ec314
--- /dev/null
+++ b/transform/models/stage/the_atlas_of_economic_complexity/taoec_hs92_ccp_trade.sql
@@ -0,0 +1,31 @@
+{{ config(alias='hs92_ccp_trade') }}
+
+SELECT
+    country_id,
+    country_iso3_code,
+    partner_country_id,
+    partner_iso3_code,
+    product_id,
+    product_hs92_code,
+    year,
+    export_value,
+    import_value
+FROM read_csv(
+    '{{ env_var("RAW__THE_ATLAS_OF_ECONOMIC_COMPLEXITY__HS92__HS92_COUNTRY_COUNTRY_PRODUCT_YEAR_6_2020_2023") }}',
+    delim = ',',
+    quote = '"',
+    escape = '"',
+    header = true,
+    nullstr = ['XXXXXX'],
+    columns = {
+        country_id: USMALLINT,
+        country_iso3_code: VARCHAR,
+        partner_country_id: USMALLINT,
+        partner_iso3_code: VARCHAR,
+        product_id: USMALLINT,
+        product_hs92_code: UINTEGER,
+        year: USMALLINT,
+        export_value: BIGINT,
+        import_value: BIGINT
+    }
+)
diff --git a/transform/models/stage/the_atlas_of_economic_complexity/taoec_hs92_products.sql b/transform/models/stage/the_atlas_of_economic_complexity/taoec_hs92_products.sql
new file mode 100644
index 00000000..01719f51
--- /dev/null
+++ b/transform/models/stage/the_atlas_of_economic_complexity/taoec_hs92_products.sql
@@ -0,0 +1,33 @@
+{{ config(alias='hs92_products') }}
+
+SELECT
+    product_id,
+    product_hs92_code,
+    product_level,
+    product_name,
+    product_name_short,
+    product_parent_id,
+    product_id_hierarchy,
+    show_feasibility,
+    natural_resource,
+    green_product
+FROM read_csv(
+    '{{ env_var("RAW__THE_ATLAS_OF_ECONOMIC_COMPLEXITY__CLASSIFICATIONS__PRODUCT_HS92") }}',
+    delim = ',',
+    quote = '"',
+    escape = '"',
+    header = true,
+    nullstr = ['XXXX', 'XXXXXX', '', '9999AA'],
+    columns = {
+        product_id: USMALLINT,
+        product_hs92_code: UINTEGER,
+        product_level: UTINYINT,
+        product_name: VARCHAR,
+        product_name_short: VARCHAR,
+        product_parent_id: USMALLINT,
+        product_id_hierarchy: VARCHAR,
+        show_feasibility: BOOLEAN,
+        natural_resource: BOOLEAN,
+        green_product: BOOLEAN
+    }
+)
diff --git a/transform/profiles.yml b/transform/profiles.yml
index b9925b5a..40ecc2ae 100644
--- a/transform/profiles.yml
+++ b/transform/profiles.yml
@@ -27,4 +27,10 @@ transform:
           options:
             data_path: >
               s3://{{ env_var('S3_BUCKET') }}/{{ env_var('S3_GRAPHS_MART_PREFIX') }}
+        - path: >
+            ducklake:sqlite:{{ env_var('LOCAL_DIR') }}/{{ env_var('ANALYTICS_MART_DB') }}
+          alias: analytics
+          options:
+            data_path: >
+              s3://{{ env_var('S3_BUCKET') }}/{{ env_var('S3_ANALYTICS_MART_PREFIX') }}
   target: lakehouse
diff --git a/transform/tests/analytics_taoec_cc_metrics_esi_range.sql b/transform/tests/analytics_taoec_cc_metrics_esi_range.sql
new file mode 100644
index 00000000..2a9b14da
--- /dev/null
+++ b/transform/tests/analytics_taoec_cc_metrics_esi_range.sql
@@ -0,0 +1,3 @@
+SELECT *
+FROM {{ ref('taoec_cc_metrics') }}
+WHERE esi < 0 OR esi > 1
diff --git a/uv.lock b/uv.lock
index 323dd31d..a3879b52 100644
--- a/uv.lock
+++ b/uv.lock
@@ -164,6 +164,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ef/6e/2a2e414e48ab2f7fbb78ba13fc4400ac88f6ae6febe8f792329be5167618/botocore_stubs-1.38.29-py3-none-any.whl", hash = "sha256:85c1852bc08cb6db25e26c8f07fec7e35433537a7a07f0bb91953f4689e262f5", size = 65628, upload-time = "2025-06-03T20:18:41.421Z" },
 ]
 
+[[package]]
+name = "cattrs"
+version = "25.1.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "attrs" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/57/2b/561d78f488dcc303da4639e02021311728fb7fda8006dd2835550cddd9ed/cattrs-25.1.1.tar.gz", hash = "sha256:c914b734e0f2d59e5b720d145ee010f1fd9a13ee93900922a2f3f9d593b8382c", size = 435016, upload-time = "2025-06-04T20:27:15.44Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/b0/215274ef0d835bbc1056392a367646648b6084e39d489099959aefcca2af/cattrs-25.1.1-py3-none-any.whl", hash = "sha256:1b40b2d3402af7be79a7e7e097a9b4cd16d4c06e6d526644b0b26a063a1cc064", size = 69386, upload-time = "2025-06-04T20:27:13.969Z" },
+]
+
 [[package]]
 name = "certifi"
 version = "2025.4.26"
@@ -273,6 +286,7 @@ dependencies = [
     { name = "dbt-duckdb" },
     { name = "environs" },
     { name = "gitpython" },
+    { name = "humanize" },
     { name = "kagglehub" },
     { name = "kuzu" },
     { name = "langchain" },
@@ -286,7 +300,9 @@
     { name = "platformdirs" },
     { name = "prompt-toolkit" },
     { name = "python-slugify" },
+    { name = "requests-cache" },
     { name = "torch" },
+    { name = "tqdm" },
 ]
 
 [package.dev-dependencies]
@@ -306,8 +322,9 @@ requires-dist = [
     { name = "dbt-duckdb", git = "https://github.com/duckdb/dbt-duckdb.git?rev=afc39991158c0f719e5e57469ab466cfb63fbb8c" },
     { name = "environs", specifier = ">=14.2.0" },
     { name = "gitpython", specifier = ">=3.1.44" },
+    { name = "humanize", specifier = ">=4.12.3" },
     { name = "kagglehub", specifier = ">=0.3.12" },
-    { name = "kuzu", specifier = ">=0.10.0" },
+    { name = "kuzu", specifier = "==0.11.0" },
     { name = "langchain", specifier = ">=0.3.26" },
     { name = "langchain-kuzu", specifier = ">=0.4.2" },
     { name = "langchain-ollama", specifier = ">=0.3.3" },
@@ -319,7 +336,9 @@
     { name = "platformdirs", specifier = ">=4.3.8" },
     { name = "prompt-toolkit", specifier = ">=3.0.51" },
     { name = "python-slugify", specifier = ">=8.0.4" },
+    { name = "requests-cache", specifier = ">=1.2.1" },
     { name = "torch", specifier = ">=2.7.1" },
+    { name = "tqdm", specifier = ">=4.67.1" },
 ]
 
 [package.metadata.requires-dev]
@@ -630,6 +649,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
 ]
 
+[[package]]
+name = "humanize"
+version = "4.12.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/22/d1/bbc4d251187a43f69844f7fd8941426549bbe4723e8ff0a7441796b0789f/humanize-4.12.3.tar.gz", hash = "sha256:8430be3a615106fdfceb0b2c1b41c4c98c6b0fc5cc59663a5539b111dd325fb0", size = 80514, upload-time = "2025-04-30T11:51:07.98Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a0/1e/62a2ec3104394a2975a2629eec89276ede9dbe717092f6966fcf963e1bf0/humanize-4.12.3-py3-none-any.whl", hash = "sha256:2cbf6370af06568fa6d2da77c86edb7886f3160ecd19ee1ffef07979efc597f6", size = 128487, upload-time = "2025-04-30T11:51:06.468Z" },
+]
+
 [[package]]
 name = "idna"
 version = "3.10"
@@ -767,17 +795,19 @@ wheels = [
 
 [[package]]
 name = "kuzu"
-version = "0.10.0"
+version = "0.11.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/3c/48/ebd5b564df0474ec7df294a6209d2453086f2ef0cef930608b176c73a40a/kuzu-0.10.0.tar.gz", hash = "sha256:7c7f975d053749295eaacf7acaccbd345bf852264f5ed25323001bbd9d408f48", size = 4854770, upload-time = "2025-05-08T18:23:11.497Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/d8/7c/d2c9355054a67a79ec0cc516b3fad68d970245a1a6f5173eaa2bf94d1782/kuzu-0.11.0.tar.gz", hash = "sha256:34b9fe2d9f94421585f921cb0513bd584842a5705ae757c09fd075e23acb42d7", size = 4897335, upload-time = "2025-07-13T18:37:37.009Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/21/7c/767460c87e4ec70d6ca630172336677fe23698b99afd52178961a5901dee/kuzu-0.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8146efda377a1661368b607d23aaee4e46a624f764f79a2ee98540fe6c325cc9", size = 3687910, upload-time = "2025-05-08T18:22:32.837Z" },
-    { url = "https://files.pythonhosted.org/packages/47/70/6b5a6329183446761527df23977133aef26370dd1f8d511652af1d989275/kuzu-0.10.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:06a4bf8d9057100a2df576adbe1d50d683cd799d8a97f7fe2cbdd013bf6e439b", size = 4151510, upload-time = "2025-05-08T18:22:34.796Z" },
-    { url = "https://files.pythonhosted.org/packages/65/26/5478eeefe08e04f7087be4659f2b513faa1fad5d0a362abdd86b2e133d90/kuzu-0.10.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:85fa3af5283632dbc8b98b2146f3fd7fbcdf93ce3430b74fe22df48a09e88540", size = 6097521, upload-time = "2025-05-08T18:22:36.471Z" },
-    { url = "https://files.pythonhosted.org/packages/99/47/f7e9fd9d31823453d4857e589b9cbf58907c7ee5443e61c4ad80011a8f34/kuzu-0.10.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f01f7f3be037b6800dd9cdbbc682ba8449a1fb230769341004e890a87541e48", size = 6892802, upload-time = "2025-05-08T18:22:38.668Z" },
-    { url = "https://files.pythonhosted.org/packages/09/1b/a061123be5919088b88603e5ec07243ea8b680c3c05aee5ab563c1ba9d8b/kuzu-0.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:7a92b0c3047e4e77a227a2e2126864c002e9789d8e3aa8b96d8f61df6a35d09d", size = 4189828, upload-time = "2025-05-08T18:22:40.542Z" },
-    { url = "https://files.pythonhosted.org/packages/30/e7/3a25f52c1a6c5ffb2aa90e81f4d0a0f9cf84897fc024ddb07242bf578887/kuzu-0.10.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:57ed7e0054ac3f2fef4a9c5875f77c15b32ef16ea40d8cc129c492edcce21900", size = 6101063, upload-time = "2025-05-08T18:22:42.159Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/de/469d26653805a47b7fa67b77741e7a2271d1316c531463640cd39d063db0/kuzu-0.10.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:964ff28e3540a55e1c865eb8748de6a9385d391b07358ff069c257df3fe76dd0", size = 6897320, upload-time = "2025-05-08T18:22:44.048Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/22/b1577470c1e142272cc3646cd68ec13dc06b68bfe26869c1339e3ba8a1b0/kuzu-0.11.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d3b928a6646aad0a4284a07918140761f70626e936976c7bc9a1504395029353", size = 3693508, upload-time = "2025-07-13T18:37:02.4Z" },
+    { url = "https://files.pythonhosted.org/packages/af/7c/c97de999c782860bff2a223d07afaa71c9ae4e0a214a1d7c3db866cf9157/kuzu-0.11.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:5a995172d99e961fe2ff073722a447d335dca608d566fc924520f1bfea4f97cf", size = 4095016, upload-time = "2025-07-13T18:37:03.742Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/df/c9d63b4a3835b944d042add771bdfbaca5bd61a1490b78492e4e299c948f/kuzu-0.11.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:836af97ba5159a59e55cb336869f45987d74d9875bd97caae31af5244f8b99e8", size = 6201752, upload-time = "2025-07-13T18:37:05.756Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/8d/55226444b7607d81299e3ff1d47ae4ad76149c0fd266ae7fe04eab52060e/kuzu-0.11.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7ee8559686eac9f874d125708f9a83f1dca09bb165e5b838c6c0ad521cce68ee", size = 6979587, upload-time = "2025-07-13T18:37:07.468Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/19/1e19851f7229953cd696df9983b953dcc2c0cc1f0ae81e02be9eddd2b379/kuzu-0.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:7ae94e8add6b5cc25f3cf2a38a07f3c4a4acb9b636078be8a53ac3e8f736d6ba", size = 4289847, upload-time = "2025-07-13T18:37:09.08Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/2a/f4579d9b7a8dd205bfc1af89596ed3cbcfea3c0bdf14206083fea509c545/kuzu-0.11.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3667b430de2efbc96e45878e460851d1aa8aa94be96fa5d4d82186f19a95889a", size = 6204963, upload-time = "2025-07-13T18:37:10.637Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/bd/a827d5eff7a7abd577841bbe71f8df485501ca8f0250ddbe29c7edf67e6e/kuzu-0.11.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4162d80861e606f4d82d6e559fc11c0d7efa7725a6dc811c61bcd266a2963705", size = 6982953, upload-time = "2025-07-13T18:37:12.429Z" },
+    { url = "https://files.pythonhosted.org/packages/03/19/6d41056e2d429ddb19396d992dee5f7804cdb3bee160d53c3cbf97c0f251/kuzu-0.11.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7da89fb506be064ebb7d3954f9ffb6e9c0f9ef9c10f37be59a347a0bc48efd28", size = 6202100, upload-time = "2025-07-13T18:37:14.156Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/a7/13585d872b65263da8e83c77100914fbaafe91fea11160151a61cf111e03/kuzu-0.11.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b17cc92a925073a3bbd65e05af59a9c0c931e1573755d7ad340705059d849af7", size = 6205072, upload-time = "2025-07-13T18:37:15.907Z" },
 ]
 
 [[package]]
@@ -1590,6 +1620,23 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928, upload-time = "2024-05-29T15:37:47.027Z" },
 ]
 
+[[package]]
+name = "requests-cache"
+version = "1.2.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "attrs" },
+    { name = "cattrs" },
+    { name = "platformdirs" },
+    { name = "requests" },
+    { name = "url-normalize" },
+    { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/1a/be/7b2a95a9e7a7c3e774e43d067c51244e61dea8b120ae2deff7089a93fb2b/requests_cache-1.2.1.tar.gz", hash = "sha256:68abc986fdc5b8d0911318fbb5f7c80eebcd4d01bfacc6685ecf8876052511d1", size = 3018209, upload-time = "2024-06-18T17:18:03.774Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4e/2e/8f4051119f460cfc786aa91f212165bb6e643283b533db572d7b33952bd2/requests_cache-1.2.1-py3-none-any.whl", hash = "sha256:1285151cddf5331067baa82598afe2d47c7495a1334bfe7a7d329b43e9fd3603", size = 61425, upload-time = "2024-06-18T17:17:45Z" },
+]
+
 [[package]]
 name = "requests-toolbelt"
 version = "1.0.0"
@@ -1900,6 +1947,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
 ]
 
+[[package]]
+name = "url-normalize"
+version = "2.2.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "idna" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/80/31/febb777441e5fcdaacb4522316bf2a527c44551430a4873b052d545e3279/url_normalize-2.2.1.tar.gz", hash = "sha256:74a540a3b6eba1d95bdc610c24f2c0141639f3ba903501e61a52a8730247ff37", size = 18846, upload-time = "2025-04-26T20:37:58.553Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/bc/d9/5ec15501b675f7bc07c5d16aa70d8d778b12375686b6efd47656efdc67cd/url_normalize-2.2.1-py3-none-any.whl", hash = "sha256:3deb687587dc91f7b25c9ae5162ffc0f057ae85d22b1e15cf5698311247f567b", size = 14728, upload-time = "2025-04-26T20:37:57.217Z" },
+]
+
 [[package]]
 name = "urllib3"
 version = "2.4.0"