In [None]:
import tempfile

import anndata as ad
import boto3
import nibabel
import numpy as np
import pandas as pd
from botocore import UNSIGNED
from botocore.client import Config

# S3 location of the data, i.e.
# https://allen-brain-cell-atlas.s3-us-west-2.amazonaws.com
s3_region_name = 'us-west-2'
s3_bucket_name = 'allen-brain-cell-atlas'

# Brain label and release version strings; both are interpolated into the
# S3 key prefixes used to locate the data files.
BRAIN_LABEL = 'C57BL6J-638850'
CURRENT_VERSION = '20230830'

def _download_from_s3(client, bucket, key_prefix, filename, local_dir):
    """Download one object from S3 into a local directory.

    Args:
        client: S3 client exposing ``download_file(bucket, key, path)``.
        bucket: Name of the S3 bucket.
        key_prefix: Key prefix ("directory") of the object, ending in ``/``.
        filename: Object name under ``key_prefix``; also used as the name of
            the local file.
        local_dir: Existing local directory to download into.

    Returns:
        The local path of the downloaded file.
    """
    local_path = f'{local_dir}/{filename}'
    client.download_file(bucket, key_prefix + filename, local_path)
    return local_path


# Use a temporary directory to load in files from the S3 bucket.
# The directory and everything in it are deleted when the `with` block exits,
# so every file is read into an in-memory object before leaving the block.
with tempfile.TemporaryDirectory() as temp_dir:
    # initialize S3 client with unsigned credentials because the bucket is public
    s3_client = boto3.client('s3', region_name=s3_region_name,
                             config=Config(signature_version=UNSIGNED))

    # Load raw counts as AnnData obj from .h5ad file
    counts_raw_file = f'{BRAIN_LABEL}-raw.h5ad'
    counts_dir = f'expression_matrices/MERFISH-{BRAIN_LABEL}/{CURRENT_VERSION}/'
    counts_temp_path = _download_from_s3(s3_client, s3_bucket_name,
                                         counts_dir, counts_raw_file, temp_dir)
    counts_raw_adata = ad.read_h5ad(counts_temp_path)

    # Load metadata as pandas DataFrames from CSV files
    # e.g. 'metadata/MERFISH-C57BL6J-638850-CCF/20230830/views/cell_metadata_with_parcellation_annotation.csv'
    metadata_MERFISH_CCF_dir = f'metadata/MERFISH-{BRAIN_LABEL}-CCF/{CURRENT_VERSION}/views/'
    metadata_MERFISH_dir = f'metadata/MERFISH-{BRAIN_LABEL}/{CURRENT_VERSION}/'
    metadata_CCF_dir = 'metadata/Allen-CCF-2020/20230630/'

    # cell metadata
    cell_metadata_file = 'cell_metadata_with_parcellation_annotation.csv'
    cell_md_temp_path = _download_from_s3(s3_client, s3_bucket_name,
                                          metadata_MERFISH_CCF_dir,
                                          cell_metadata_file, temp_dir)
    cell_md_df = pd.read_csv(cell_md_temp_path)

    # gene metadata
    gene_metadata_file = 'gene.csv'
    gene_md_temp_path = _download_from_s3(s3_client, s3_bucket_name,
                                          metadata_MERFISH_dir,
                                          gene_metadata_file, temp_dir)
    gene_md_df = pd.read_csv(gene_md_temp_path)

    # CCF terms metadata
    ccf_terms_file = 'parcellation_to_parcellation_term_membership.csv'
    ccf_terms_temp_path = _download_from_s3(s3_client, s3_bucket_name,
                                            metadata_CCF_dir,
                                            ccf_terms_file, temp_dir)
    ccf_terms_df = pd.read_csv(ccf_terms_temp_path)

    # Load CCF labeled image volumes
    ccf_annotation_dir = f'image_volumes/MERFISH-{BRAIN_LABEL}-CCF/20230630/'
    ccf_annotation_file = 'resampled_annotation.nii.gz'
    ccf_labels_temp_path = _download_from_s3(s3_client, s3_bucket_name,
                                             ccf_annotation_dir,
                                             ccf_annotation_file, temp_dir)
    img = nibabel.load(ccf_labels_temp_path)
    # Materialize the voxel data as an ndarray while the file still exists.
    # NOTE: `img` still refers to the temporary file, which is removed when
    # this block exits; use `ccf_imdata` for the voxel data afterwards.
    ccf_imdata = np.array(img.dataobj)