# Bulk Data - Ingesting data for an identifier to local filesystem

In [None]:
# Import required dependencies
import os
from datetime import datetime, timezone
from urllib.parse import urlparse, parse_qs

from carbonarc import CarbonArcClient

# Pull the API token from the process environment (None when it is not set)
API_AUTH_TOKEN = os.environ.get("API_AUTH_TOKEN")

# Instantiate the Carbon Arc API client with that token
client = CarbonArcClient(API_AUTH_TOKEN)

In [None]:
# Identifier of the dataset whose bulk-data manifest is requested in the cells below
data_identifier = "CA0005"

In [None]:
# Helper function to create local path from URL
def create_local_path(url: str, base_dir: str = "./data") -> tuple:
    """Map a bulk-data file URL to a local directory and parquet file path.

    The last two segments of the URL path are taken as ``<table_name>`` and
    ``<filename>``. Every query parameter becomes a ``key=value``
    sub-directory (only the first value of a repeated key is used), in the
    order the parameters appear in the URL. The resulting layout is::

        <base_dir>/<table_name>/<key=value>/.../<filename>.parquet

    Args:
        url: File URL whose path contains at least two segments.
        base_dir: Root directory under which the local path is built.

    Returns:
        A ``(local_dir, local_path)`` tuple of strings: the directory that
        should contain the file, and the full path of the file itself.

    Raises:
        ValueError: If the URL path has fewer than two segments, so no table
            name can be derived from it.
    """
    parsed = urlparse(url)

    # Table name is the second-to-last path segment, filename the last one.
    path_parts = parsed.path.strip("/").split("/")
    if len(path_parts) < 2:
        # Fail with context instead of a bare IndexError from path_parts[-2].
        raise ValueError(
            f"Cannot derive table name and filename from URL path "
            f"{parsed.path!r}: expected at least two path segments"
        )
    table_name, filename = path_parts[-2], path_parts[-1]

    # parse_qs maps each key to a list of values; keep the first of each.
    query_params = parse_qs(parsed.query)
    param_dirs = [f"{key}={value[0]}" for key, value in query_params.items()]

    # Construct full local path
    local_dir = os.path.join(base_dir, table_name, *param_dirs)
    local_path = os.path.join(local_dir, f"{filename}.parquet")

    return local_dir, local_path

def download_manifest_files(manifest):
    """Download every file listed in a bulk-data manifest to local disk.

    Each entry's ``"url"`` is mapped to a local path with
    ``create_local_path``. Entries whose target file already exists are
    skipped, so re-running only fetches what is missing. Downloads go through
    the module-level ``client``.

    Args:
        manifest: Mapping with an optional ``"files"`` list; every item in
            that list must provide a ``"url"`` key.
    """
    # ``or []`` also covers a manifest whose "files" value is explicitly None.
    manifest_files = manifest.get("files") or []
    print(f"Downloading {len(manifest_files)} files")
    for file_info in manifest_files:
        manifest_file_url = file_info["url"]
        local_dir, local_path = create_local_path(manifest_file_url)
        # skip download if file exists
        if os.path.exists(local_path):
            print(f"File '{local_path}' exists. Skipping download.")
            continue

        # make sure local_dir exists
        os.makedirs(local_dir, exist_ok=True)
        print(f"Downloading {manifest_file_url} to: {local_path}")

        # Write to a sibling ".part" file and rename only after the download
        # returns. Otherwise an interrupted download would leave a truncated
        # file at local_path that the exists-check above skips on every
        # later run.
        partial_path = f"{local_path}.part"
        try:
            client.data.download_bulk_data_to_file(manifest_file_url, partial_path)
            os.replace(partial_path, local_path)
        finally:
            # After a successful rename the partial file is gone; anything
            # still here is the remainder of a failed or interrupted download.
            if os.path.exists(partial_path):
                os.remove(partial_path)

In [None]:
# Download the full history for the given data identifier.
# created_since=None passes no cutoff timestamp (presumably this returns every
# available file — confirm against the SDK documentation).
manifest = client.data.get_bulk_data_manifest(data_identifier, created_since=None)
# Header line only; the manifest contents themselves are not printed.
print(f"\nManifest for {data_identifier}:")
download_manifest_files(manifest)

In [None]:
# Download only the files created since the last ingestion.
# NOTE: the current UTC time is used here as a stand-in. Replace it with the
# timestamp recorded for your previous ingestion run; a cutoff of "now" will
# presumably match no files.
# datetime.now(timezone.utc) replaces datetime.utcnow(), which is deprecated
# since Python 3.12; the formatted string is identical.
last_ingest_time = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S')
print(last_ingest_time)

manifest = client.data.get_bulk_data_manifest(data_identifier, created_since=last_ingest_time)

# Header line only; the manifest contents themselves are not printed.
print(f"\nManifest for {data_identifier}:")
download_manifest_files(manifest)

### Manifest file entry structure

Each item in the manifest's `files` list has the following structure:

```
{ 'url': 'link',
'format': 'parquet',
'records': 1000,
'size_bytes': 123456789,
'modification_time': '2025-04-15T23:04:44',
'price': 123.45, }