# Goal

* Approximate the number of times that each arc-VCA dataset has been downloaded


In [24]:
from google.cloud import monitoring
from google.cloud import storage
import datetime

In [25]:
# Analysis settings shared by the cells below
project_id = 'c-tc-429521'  # GCP project ID passed to get_bucket_egress
egress_cost_per_tb = 120    # dollars per TB
# Whole days elapsed between 2025-02-25 and now (local clock); used as the look-back window
days_ago = (datetime.datetime.now() - datetime.datetime(2025, 2, 25)).days

In [26]:
# Convert to readable format
def human_readable_size(size_bytes):
    """
    Format a byte count using binary (1024-based) units.

    Args:
        size_bytes (int | float): Size in bytes

    Returns:
        str: "0B" for zero; otherwise the scaled value with two decimals
             followed by a space and the unit (e.g. "1.50 KB")
    """
    if size_bytes == 0:
        return "0B"

    units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    last_idx = len(units) - 1

    scaled = size_bytes
    unit_idx = 0
    while True:
        # Stop once the value fits the current unit or the largest unit is reached
        if not (scaled >= 1024 and unit_idx < last_idx):
            break
        scaled = scaled / 1024
        unit_idx += 1

    unit = units[unit_idx]
    return f"{scaled:.2f} {unit}"

In [27]:
def get_bucket_egress(project_id, bucket_name, days_ago=30):
    """
    Sum the bytes sent from a GCS bucket over the last `days_ago` days.

    Queries Cloud Monitoring for the
    `storage.googleapis.com/network/sent_bytes_count` metric of the given
    bucket and adds up every data point that is returned.

    Args:
        project_id (str): GCP project ID, used to build the `projects/<id>` name
        bucket_name (str): Name of the GCS bucket to filter on
        days_ago (int): Length of the look-back window in days, ending now

    Returns:
        int | float | None: Total of all point values (0 if no points are
        returned), or None if a point carries a value that is neither an
        int64 nor a double.
    """
    client = monitoring.MetricServiceClient()
    project_name = f"projects/{project_id}"

    # Define the time interval (last N days).
    # Use a timezone-aware UTC datetime: calling .timestamp() on the naive
    # result of utcnow() interprets it as *local* time, which shifts the
    # window by the machine's UTC offset (utcnow() is also deprecated).
    now = datetime.datetime.now(datetime.timezone.utc)
    seconds = int(now.timestamp())
    start_seconds = seconds - (days_ago * 24 * 60 * 60)

    # Create the time interval using the correct structure
    interval = monitoring.TimeInterval(
        start_time={"seconds": start_seconds},
        end_time={"seconds": seconds},
    )

    # Define the filter string directly
    filter_str = (
        'metric.type="storage.googleapis.com/network/sent_bytes_count"'
        ' AND resource.type="gcs_bucket"'
        f' AND resource.labels.bucket_name="{bucket_name}"'
    )

    # Create the request
    request = monitoring.ListTimeSeriesRequest(
        name=project_name,
        filter=filter_str,
        interval=interval,
    )

    # Execute the query and aggregate the results
    total_bytes = 0
    for time_series in client.list_time_series(request):
        for point in time_series.points:
            # hasattr() is True for every declared field of the message, so it
            # cannot tell which member of the `value` oneof is populated; ask
            # the underlying protobuf instead.
            value_kind = monitoring.TypedValue.pb(point.value).WhichOneof("value")
            if value_kind is None:
                # No value set on this point: nothing to add
                continue
            if value_kind == "int64_value":
                total_bytes += point.value.int64_value
            elif value_kind == "double_value":
                total_bytes += point.value.double_value
            else:
                print(f"Unknown value type: {type(point.value)}")
                return None

    return total_bytes


In [28]:
def get_folder_size(bucket_name, prefix):
    """
    Add up the sizes of all objects stored under a folder-like prefix of a
    GCS bucket.

    Args:
        bucket_name (str): Name of the GCS bucket
        prefix (str): Folder prefix to measure (e.g., "folder/subfolder/");
            a trailing slash is appended when a non-empty prefix lacks one

    Returns:
        int: Total size in bytes
        int: Number of files
    """
    # Normalise a non-empty prefix so that it always ends with "/"
    folder = prefix
    if folder and not folder.endswith('/'):
        folder = folder + '/'

    # Look up the bucket with a fresh storage client
    bucket = storage.Client().get_bucket(bucket_name)

    # Gather the size of every object under the prefix, leaving out an object
    # whose name is exactly the prefix (the directory marker, if present)
    object_sizes = [
        blob.size
        for blob in bucket.list_blobs(prefix=folder)
        if blob.name != folder
    ]

    return sum(object_sizes), len(object_sizes)

## Tahoe100

In [29]:
# Total egress from the arc-ctc-tahoe100 bucket over the look-back window
total_egress_bytes = get_bucket_egress(project_id, "arc-ctc-tahoe100", days_ago=days_ago)
total_egress_gb = total_egress_bytes / (1024**3)  # bytes -> GB (1024-based)
print(f"Total egress: {human_readable_size(total_egress_bytes)}")

  now = datetime.datetime.utcnow()


Total egress: 108.32 TB


In [30]:
# Size of everything under the 2025-02-25/ folder of the arc-ctc-tahoe100 bucket
total_size_bytes, num_files = get_folder_size('arc-ctc-tahoe100', '2025-02-25')
total_size_gb = total_size_bytes / (1024**3)  # bytes -> GB (1024-based)
print(f"Total size: {human_readable_size(total_size_bytes)}")

Total size: 315.75 GB


In [31]:
# Approximate download count: total egress divided by the size of the dataset.
# This treats all egress as if it were complete downloads of the folder.
num_downloads = total_egress_gb / total_size_gb
print(f"Number of downloads: {num_downloads:.1f}")

Number of downloads: 351.3


### Costs

In [32]:
# Multiply total egress (converted from GB to TB) by the egress cost per TB
total_cost = total_egress_gb / 1024 * egress_cost_per_tb
print(f"Total cost: ${total_cost:.2f}")

Total cost: $12998.60


In [33]:
# Average cost per day over the look-back window
total_cost_per_day = total_cost / days_ago
print(f"Total cost per day: ${total_cost_per_day:.2f}")

Total cost per day: $154.75


## scBaseCount (Camp)

In [34]:
# Total egress for the arc-ctc-scbasecamp bucket over the look-back window
total_egress_bytes = get_bucket_egress(project_id, "arc-ctc-scbasecamp", days_ago=days_ago)
total_egress_gb = total_egress_bytes / (1024**3)  # bytes -> GB (1024-based)
print(f"Total egress: {human_readable_size(total_egress_bytes)}")

  now = datetime.datetime.utcnow()


Total egress: 392.13 TB


In [35]:
# Combined egress for the arc-ctc-scbasecamp and arc-scbasecount buckets.
# The total is recomputed from both buckets (instead of `+=` onto the value
# left by the previous cell) so that re-running this cell does not double-count.
total_egress_bytes = sum(
    get_bucket_egress(project_id, bucket_name, days_ago=days_ago)
    for bucket_name in ("arc-ctc-scbasecamp", "arc-scbasecount")
)
total_egress_gb = total_egress_bytes / (1024**3)  # bytes -> GB (1024-based)
print(f"Total egress: {human_readable_size(total_egress_bytes)}")

  now = datetime.datetime.utcnow()


Total egress: 394.25 TB


In [36]:
# Total folder size: everything under the 2025-02-25/ folder of the arc-scbasecount bucket
total_size_bytes, num_files = get_folder_size('arc-scbasecount', '2025-02-25')
total_size_gb = total_size_bytes / (1024**3)  # bytes -> GB (1024-based)
print(f"Total size: {human_readable_size(total_size_bytes)}")

Total size: 8.10 TB


In [37]:
# Approximate download count: total egress divided by the size of the dataset.
# This treats all egress as if it were complete downloads of the folder.
num_downloads = total_egress_gb / total_size_gb
print(f"Number of downloads: {num_downloads:.1f}")

Number of downloads: 48.7


In [40]:
# egress divided by size of the dataset => assuming dataset is 1 of the 5 feature types,
# i.e. each download fetches one fifth of the folder
# (presumably the feature types are of similar size -- TODO confirm)
num_downloads = total_egress_gb / (total_size_gb / 5)
print(f"Number of downloads: {num_downloads:.1f}")

Number of downloads: 243.4


### Costs

In [38]:
# Multiply total egress (converted from GB to TB) by the egress cost per TB
total_cost = total_egress_gb / 1024 * egress_cost_per_tb
print(f"Total cost: ${total_cost:.2f}")

Total cost: $47310.30


In [39]:
# Average cost per day over the look-back window
total_cost_per_day = total_cost / days_ago
print(f"Total cost per day: ${total_cost_per_day:.2f}")

Total cost per day: $563.22


# session info

In [125]:
# List the packages installed in the active conda environment
!conda list

# packages in environment at /home/nickyoungblut/miniforge3/envs/tiledb:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
aiobotocore               2.19.0                   pypi_0    pypi
aiohappyeyeballs          2.4.4              pyhd8ed1ab_1    conda-forge
aiohttp                   3.11.11         py312h178313f_0    conda-forge
aioitertools              0.12.0                   pypi_0    pypi
aiosignal                 1.3.2              pyhd8ed1ab_0    conda-forge
anndata                   0.11.3             pyhd8ed1ab_0    conda-forge
array-api-compat          1.10.0             pyhd8ed1ab_0    conda-forge
asttokens                 3.0.0              pyhd8ed1ab_1    conda-forge
attrs                     25.1.0             pyh71513ae_0    conda-forge
aws-c-auth                0.9.0                h66f1c83_6    conda-fo