# Goals

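* Update the h5ad file paths stored in the `file_path` column of each `sample_metadata.parquet` file so that they point to the `arc-scbasecount` bucket instead of `arc-ctc-scbasecamp`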

# Var

In [28]:
# GCS bucket name and the path inside the bucket that is searched for metadata files
gcp_bucket = "arc-scbasecount"
gcp_bucket_path = "2025-02-25/metadata"
# feature-type names
feature_types = [
    "Gene",
    "GeneFull",
    "GeneFull_Ex50pAS",
    "GeneFull_ExonOverIntron",
    "Velocyto",
]

# Init

In [29]:
import os
import pandas as pd
import scanpy as sc
import pyarrow.dataset as ds
import gcsfs

In [30]:
# initialize GCS file system for reading data from GCS
# (constructed with no arguments; presumably relies on default credentials — TODO confirm)
# NOTE: `fs` is also passed to `to_parquet` later in this notebook, so it is used for writing as well
fs = gcsfs.GCSFileSystem()

# List available files

In [51]:
# helper function to list files
def get_file_table(gcs_base_path: str, target: str=None, endswith: str=None, filesystem=None):
    """
    List the files below a base path and return them as a table.

    Args:
        gcs_base_path: Base path to search; everything matched by
            ``<gcs_base_path>/**`` is considered (a trailing "/" is ignored).
        target: If given, keep only paths whose basename equals this name.
            Takes precedence over ``endswith``.
        endswith: If given (and ``target`` is not), keep only paths that end
            with this suffix.
        filesystem: Object providing ``glob(pattern)``. Defaults to the
            notebook-level ``fs``.
    Returns:
        pandas DataFrame with the columns ``organism`` (name of the directory
        that directly contains the file) and ``file_path``. When neither
        ``target`` nor ``endswith`` is given, every globbed path is returned.
    """
    if filesystem is None:
        filesystem = fs
    files = filesystem.glob("/".join([gcs_base_path.rstrip("/"), "**"]))
    if target:
        files = [f for f in files if os.path.basename(f) == target]
    elif endswith:
        files = [f for f in files if f.endswith(endswith)]
    # no filter given: keep every path (this case used to raise a TypeError)
    records = []
    for f in files:
        parts = f.split("/")
        # the parent directory name is used as the organism label;
        # a path without any "/" has no parent directory
        parent = parts[-2] if len(parts) > 1 else None
        records.append((parent, f))
    return pd.DataFrame(records, columns=["organism", "file_path"])

In [52]:
# build the gs:// location that is searched for metadata files
gcs_path = "gs://" + "/".join([gcp_bucket, gcp_bucket_path])
gcs_path

'gs://arc-scbasecount/2025-02-25/metadata'

In [54]:
# table of every file named sample_metadata.parquet found below gcs_path
meta_files = get_file_table(gcs_path, target='sample_metadata.parquet')
meta_files

Unnamed: 0,organism,file_path
0,Arabidopsis_thaliana,arc-scbasecount/2025-02-25/metadata/Gene/Arabi...
1,Bos_taurus,arc-scbasecount/2025-02-25/metadata/Gene/Bos_t...
2,Caenorhabditis_elegans,arc-scbasecount/2025-02-25/metadata/Gene/Caeno...
3,Callithrix_jacchus,arc-scbasecount/2025-02-25/metadata/Gene/Calli...
4,Danio_rerio,arc-scbasecount/2025-02-25/metadata/Gene/Danio...
...,...,...
100,Pan_troglodytes,arc-scbasecount/2025-02-25/metadata/Velocyto/P...
101,Schistosoma_mansoni,arc-scbasecount/2025-02-25/metadata/Velocyto/S...
102,Solanum_lycopersicum,arc-scbasecount/2025-02-25/metadata/Velocyto/S...
103,Sus_scrofa,arc-scbasecount/2025-02-25/metadata/Velocyto/S...


# Updating file paths

In [64]:
# Rewrite every metadata parquet file in place, swapping the old bucket name
# for the new one in its "file_path" column.
for parquet_file in meta_files["file_path"]:
    print(parquet_file)
    # read the parquet file
    sample_metadata = ds.dataset(parquet_file, filesystem=fs, format="parquet").to_table().to_pandas()
    # update the file path; the bucket name is a plain literal, so it is
    # matched literally rather than compiled as a regular expression
    sample_metadata["file_path"] = sample_metadata["file_path"].str.replace("arc-ctc-scbasecamp", "arc-scbasecount", regex=False)
    # write the updated metadata to the parquet file (gzip); this overwrites
    # the file that was just read
    sample_metadata.to_parquet(parquet_file, filesystem=fs, index=False, compression="gzip")

arc-scbasecount/2025-02-25/metadata/Gene/Arabidopsis_thaliana/sample_metadata.parquet
arc-scbasecount/2025-02-25/metadata/Gene/Bos_taurus/sample_metadata.parquet
arc-scbasecount/2025-02-25/metadata/Gene/Caenorhabditis_elegans/sample_metadata.parquet
arc-scbasecount/2025-02-25/metadata/Gene/Callithrix_jacchus/sample_metadata.parquet
arc-scbasecount/2025-02-25/metadata/Gene/Danio_rerio/sample_metadata.parquet
arc-scbasecount/2025-02-25/metadata/Gene/Drosophila_melanogaster/sample_metadata.parquet
arc-scbasecount/2025-02-25/metadata/Gene/Equus_caballus/sample_metadata.parquet
arc-scbasecount/2025-02-25/metadata/Gene/Gallus_gallus/sample_metadata.parquet
arc-scbasecount/2025-02-25/metadata/Gene/Gorilla_gorilla/sample_metadata.parquet
arc-scbasecount/2025-02-25/metadata/Gene/Heterocephalus_glaber/sample_metadata.parquet
arc-scbasecount/2025-02-25/metadata/Gene/Homo_sapiens/sample_metadata.parquet
arc-scbasecount/2025-02-25/metadata/Gene/Macaca_mulatta/sample_metadata.parquet
arc-scbasecount

## Check the results

In [65]:
# pick the first listed metadata file whose organism is Arabidopsis_thaliana
parquet_file = meta_files.loc[
    meta_files["organism"] == "Arabidopsis_thaliana", "file_path"
].iloc[0]
parquet_file

'arc-scbasecount/2025-02-25/metadata/Gene/Arabidopsis_thaliana/sample_metadata.parquet'

In [66]:
# load the selected parquet file and show its first rows
sample_metadata = (
    ds.dataset(parquet_file, filesystem=fs, format="parquet")
    .to_table()
    .to_pandas()
)
sample_metadata.head()

Unnamed: 0,entrez_id,srx_accession,feature_type,file_path,obs_count,lib_prep,tech_10x,cell_prep,organism,tissue,disease,perturbation,cell_line,czi_collection_id,czi_collection_name
0,24123125,SRX17302366,GeneFull_Ex50pAS,gs://arc-scbasecount/2025-02-25/h5ad/Gene/Arab...,9036,10x_Genomics,3_prime_gex,single_cell,Arabidopsis thaliana,other,not specified,"BL (Brassinolide), 100nM, 0.5 hours post-treat...",WT Col-0,,
1,24123140,SRX17302381,GeneFull_Ex50pAS,gs://arc-scbasecount/2025-02-25/h5ad/Gene/Arab...,14317,10x_Genomics,3_prime_gex,single_cell,Arabidopsis thaliana,other,not specified,"control treatment, age: 7 days",WT Col-0,,
2,24123142,SRX17302383,GeneFull_Ex50pAS,gs://arc-scbasecount/2025-02-25/h5ad/Gene/Arab...,20075,10x_Genomics,3_prime_gex,single_cell,Arabidopsis thaliana,other,unsure,control,unsure,,
3,26626960,SRX19366049,GeneFull_Ex50pAS,gs://arc-scbasecount/2025-02-25/h5ad/Gene/Arab...,7539,10x_Genomics,3_prime_gex,single_cell,Arabidopsis thaliana,other,unsure,mock treatment (control group),not applicable,,
4,26626958,SRX19366047,GeneFull_Ex50pAS,gs://arc-scbasecount/2025-02-25/h5ad/Gene/Arab...,7703,10x_Genomics,3_prime_gex,single_cell,Arabidopsis thaliana,other,none,mock treatment; 2 µM RALF1 peptide for 2 hours,none,,


In [68]:
# pick the last metadata file in the listing
parquet_file = meta_files["file_path"].iloc[-1]
parquet_file

'arc-scbasecount/2025-02-25/metadata/Velocyto/Zea_mays/sample_metadata.parquet'

In [69]:
# load the selected parquet file and show its first rows
sample_metadata = (
    ds.dataset(parquet_file, filesystem=fs, format="parquet")
    .to_table()
    .to_pandas()
)
sample_metadata.head()

Unnamed: 0,entrez_id,srx_accession,feature_type,file_path,obs_count,lib_prep,tech_10x,cell_prep,organism,tissue,disease,perturbation,cell_line,czi_collection_id,czi_collection_name
0,26649277,SRX19383767,Velocyto,gs://arc-scbasecount/2025-02-25/h5ad/Velocyto/...,1894,10x_Genomics,3_prime_gex,single_nucleus,Zea mays,other,Not specified,Not specified,"Nuclei, Strain: B73, Age: 10 days old",,
1,26649278,SRX19383768,Velocyto,gs://arc-scbasecount/2025-02-25/h5ad/Velocyto/...,2043,10x_Genomics,3_prime_gex,single_nucleus,Zea mays,other,Not specified,Not specified,Nuclei,,
2,26649276,SRX19383766,Velocyto,gs://arc-scbasecount/2025-02-25/h5ad/Velocyto/...,4680,10x_Genomics,3_prime_gex,single_cell,Zea mays,other,Not specified,Not specified,"B73, 10 days old, Nuclei",,
3,26222704,SRX19052017,Velocyto,gs://arc-scbasecount/2025-02-25/h5ad/Velocyto/...,9105,10x_Genomics,3_prime_gex,single_cell,Zea mays,other,unsure,unsure,other,,
4,21466486,SRX15010906,Velocyto,gs://arc-scbasecount/2025-02-25/h5ad/Velocyto/...,5087,10x_Genomics,3_prime_gex,single_cell,Zea mays,other,ulcerative colitis,infliximab treatment,not applicable,,


# Session info

In [71]:
# list the packages installed in the active conda environment (record of the software versions used)
!conda list

# packages in environment at /home/nickyoungblut/miniforge3/envs/tiledb:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
aiobotocore               2.19.0                   pypi_0    pypi
aiohappyeyeballs          2.4.4              pyhd8ed1ab_1    conda-forge
aiohttp                   3.11.11         py312h178313f_0    conda-forge
aioitertools              0.12.0                   pypi_0    pypi
aiosignal                 1.3.2              pyhd8ed1ab_0    conda-forge
anndata                   0.11.3             pyhd8ed1ab_0    conda-forge
array-api-compat          1.10.0             pyhd8ed1ab_0    conda-forge
asttokens                 3.0.0              pyhd8ed1ab_1    conda-forge
attrs                     25.1.0             pyh71513ae_0    conda-forge
aws-c-auth                0.8.1                h205f482_0    conda-fo