In [None]:
import os
import sys
from pathlib import Path

# The notebook is expected to run from "notebooks/", so the repository root
# is taken to be one directory above the current working directory.
root_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Make the repository root importable. The membership check keeps this cell
# idempotent: re-running it does not stack duplicate entries on sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# With the root on sys.path, project modules resolve by their package path.
from pipeline.utils.duckdb_wrapper import DuckDBWrapper


In [None]:
# Initialize the DuckDBWrapper. Called with no arguments this is an in-memory
# DuckDB instance; to connect directly to a DuckDB file, pass its path instead,
# e.g. con = DuckDBWrapper("path/to/file.duckdb").
con = DuckDBWrapper()

In [None]:
# Register the parquet files as queryable objects in this notebook, driven by
# the dataset name lists that live in pipeline/datasets.py.
import sys
from pathlib import Path

nb_dir = Path.cwd().resolve()  # .../notebooks
repo_dir = nb_dir.parent  # repo root, i.e. the directory that contains `pipeline/`

# Put the repo root at the front of sys.path unless it is already listed.
if str(repo_dir) not in sys.path:
    sys.path.insert(0, str(repo_dir))

from pipeline.datasets import (
    PARTITIONED_ASSETS_NAMES,  # names of the partitioned assets
    SINGLE_FILE_ASSETS_NAMES,  # names of the non-partitioned assets
)

base_path = "data/opendata"

# Non-partitioned assets: matched with a flat "*.parquet" wildcard.
con.bulk_register_data(
    repo_root=repo_dir,
    base_path=base_path,
    table_names=SINGLE_FILE_ASSETS_NAMES,
    wildcard="*.parquet",
    as_table=False,
    show_tables=False,
)

# Partitioned assets: the wildcard walks the year=/month= directory layout.
con.bulk_register_partitioned_data(
    repo_root=repo_dir,
    base_path=base_path,
    table_names=PARTITIONED_ASSETS_NAMES,
    wildcard="year=*/month=*/*.parquet",
    as_table=False,
    show_tables=True,
)

In [None]:
# Pull up to 20,000 rows of the hourly ridership data and print the result.
# The SQL is a plain (non-f) string: nothing is interpolated into it, so any
# literal braces added to the query later will not be treated as placeholders.
query = """

SELECT * from mta_subway_hourly_ridership limit 20000

"""

result = con.run_query(query)

print(result)


In [None]:
# For a better-looking table, pass show_results=True. It is recommended to cap
# the LIMIT at about 50 rows when doing so.

# Plain (non-f) string: the query has no placeholders to interpolate.
query = """

SELECT * from mta_subway_hourly_ridership limit 10
"""

result = con.run_query(query, show_results=True)


In [None]:
# Show the tables currently registered on the connection.
con.show_tables()


In [None]:
# Show the schema of a specific table (here: the hourly subway ridership data).
con.show_schema("mta_subway_hourly_ridership")

In [None]:
# Sample up to 100 rows where year=2024, print them, then export them to CSV.
# Plain (non-f) string: the query has no placeholders to interpolate.
query = """

SELECT * from mta_subway_hourly_ridership where year=2024 limit 100 

"""

result = con.run_query(query)

print(result)

# Export destination: <repo root>/data/exports, where the repo root is taken
# to be the parent of the current working directory. Adjust this if the
# notebook is run from somewhere other than "notebooks/".
# NOTE: this rebinds `base_path`, which earlier held the input data directory.
repo_root = Path.cwd().resolve().parent
base_path = repo_root / "data/exports"
file_name = "mta_subway_hourly_ridership_data_sample"
file_type = "csv"

# Export the query result to CSV.
con.export(result, file_type=file_type, base_path=base_path, file_name=file_name)