In [None]:
import caveclient
import pandas as pd
import numpy as np
import os
import tqdm
import standard_transform

from joblib import Parallel, delayed
from tqdm_joblib import tqdm_joblib


# V1DD streamline transform from standard_transform; in the cells shown here it
# is only referenced from commented-out code.
trafo_streamline = standard_transform.datasets.v1dd_streamline_nm()

# Show every column, and up to 1000 rows, when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 1000)

# Materialization version: pins the CAVE client below and is embedded in every
# output file name written by this notebook.
mat_version = 1169

HOME = os.path.expanduser("~")

# NOTE: adjust for your system
# data_dir = f"{HOME}/SWDB_2025_Connectomics/data/"
data_dir = f"{HOME}/code/swdb/SWDB_2025_Connectomics/data/{mat_version}"

# All later cells write into data_dir (which ends in a per-version subfolder);
# create it up front so the first save does not fail with FileNotFoundError.
os.makedirs(data_dir, exist_ok=True)

client = caveclient.CAVEclient(
    "v1dd",
    server_address="https://global.em.brain.allentech.org",
    version=mat_version,
)

## Proofread information


In [2]:
# Rows of the soma-model neuron table; used below to restrict the dendrite list.
neuron_soma_df = client.materialize.query_table("neurons_soma_model")

# Proofreading rows whose axon strategy is "axon_fully_extended".
axon_proof_df = client.materialize.query_table(
    "proofreading_status_and_strategy",
    filter_in_dict=dict(strategy_axon=["axon_fully_extended"]),
)

# The "single_somas" view, with positions split into x/y/z columns.
single_soma_df = client.materialize.query_view(
    "single_somas", desired_resolution=[1, 1, 1], split_positions=True
)

axon_proof_root_ids = np.array(axon_proof_df["pt_root_id"])

# Dendrite list: single-soma root ids that also appear in the neuron table.
single_soma_root_ids = np.array(single_soma_df["pt_root_id"])
is_neuron = np.isin(single_soma_root_ids, neuron_soma_df["pt_root_id"])
dendrite_proof_root_ids = single_soma_root_ids[is_neuron]

In [3]:
# Persist both root-id lists as .npy files inside data_dir.
axon_list_path = f"{data_dir}/proofread_axon_list_{mat_version}.npy"
dendrite_list_path = f"{data_dir}/proofread_dendrite_list_{mat_version}.npy"

np.save(axon_list_path, axon_proof_root_ids)
np.save(dendrite_list_path, dendrite_proof_root_ids)

## Cell information


In [4]:
# Cell-type table; "classification_system" is renamed to "cell_type_coarse".
cell_type_df = client.materialize.query_table(
    "cell_type_snds", desired_resolution=[1, 1, 1], split_positions=True
).rename(columns={"classification_system": "cell_type_coarse"})

# Drop rows whose cell_type label starts with one of these prefixes.
# str.startswith accepts a tuple, so one call covers all three.
excluded_prefixes = ("L23_", "L1_inh", "L6_6b")
cell_type_df = cell_type_df[
    ~cell_type_df["cell_type"].map(lambda x: x.startswith(excluded_prefixes))
].copy()
# .copy(): a later cell assigns columns on this filtered frame; without an
# explicit copy pandas emits SettingWithCopyWarning for those assignments.

nuc_df = client.materialize.query_view(
    "nucleus_alternative_lookup", split_positions=True, desired_resolution=[1, 1, 1]
)

# reference_point = np.array([900_000.0, 700_000.0, 300_000.0])
# single_soma_df[["pt_position_x_trafo", "pt_position_y_trafo", "pt_position_z_trafo"]] = trafo_streamline.radial_points(reference_point, single_soma_df[["pt_position_x", "pt_position_y", "pt_position_z"]]) * 1000

In [5]:
def map_cell_type(ct_str):
    """Shorten a cell-type label.

    Labels starting with "L" keep their first two "_"-separated fields, joined
    with "-" (e.g. "L3_IT_x" -> "L3-IT"). Every other label is cut down to its
    first three characters.
    """
    if not ct_str.startswith("L"):
        return ct_str[:3]

    leading_fields = ct_str.split("_")[:2]
    return "-".join(leading_fields)


# Replace the detailed labels with the short form produced by map_cell_type.
cell_type_df["cell_type"] = cell_type_df["cell_type"].map(map_cell_type)

# Single-letter class codes; any value other than "inh"/"exc" raises KeyError.
coarse_codes = {"inh": "I", "exc": "E"}
cell_type_df["cell_type_coarse"] = cell_type_df["cell_type_coarse"].map(
    lambda label: coarse_codes[label]
)

In [6]:
# Columns carried over from each side of the join.
nucleus_columns = [
    "id",
    "pt_position_x",
    "pt_position_y",
    "pt_position_z",
    "pt_root_id",
    "volume",
]
cell_type_columns = ["pt_root_id", "cell_type_coarse", "cell_type"]

# Left join on pt_root_id: every nucleus row is kept, and rows without a
# matching cell-type entry get NaN in the two cell-type columns.
soma_ct_df = nuc_df[nucleus_columns].merge(
    cell_type_df[cell_type_columns],
    how="left",
    on="pt_root_id",
)

In [7]:
# Display the merged nucleus / cell-type table (pandas truncates the preview).
soma_ct_df

Unnamed: 0,id,pt_position_x,pt_position_y,pt_position_z,pt_root_id,volume,cell_type_coarse,cell_type
0,228132,632828,749849,738270,864691132737039043,458.464831,,
1,543247,1304922,977915,83880,864691132730839988,73.345940,,
2,203262,624680,531094,283770,864691132654552792,338.276613,E,L3-IT
3,350562,894573,478559,163530,864691132773514104,326.965400,E,L2-IT
4,718122,1729859,674111,781200,864691132774106773,333.888647,,
...,...,...,...,...,...,...,...,...
207450,527607,1262940,628094,734445,864691132639606383,100.547645,,
207451,168582,491518,1057067,92070,864691133042980384,369.919126,,
207452,29422,302330,415005,81855,0,285.031368,,
207453,422767,1065603,538932,36405,864691132851361283,394.724290,,


In [8]:
# Write the merged table to a zstd-compressed feather file in data_dir.
soma_ct_path = f"{data_dir}/soma_and_cell_type_{mat_version}.feather"
soma_ct_df.to_feather(soma_ct_path, compression="zstd")

## Synapse information


In [9]:
# Final, de-duplicated synapse table plus one intermediate cache per query side.
syn_path = f"{data_dir}/syn_df_all_to_proofread_to_all_{mat_version}.feather"
pre_syn_path = f"{data_dir}/pre_syn_df_all_to_proofread_to_all_{mat_version}.feather"
post_syn_path = (
    f"{data_dir}/post_syn_df_all_to_proofread_to_all_{mat_version}.feather"
)

# Columns removed from every downloaded chunk before it is stored.
SYN_DROP_COLUMNS = [
    "created",
    "valid",
    "superceded_id",
    "pre_pt_supervoxel_id",
    "post_pt_supervoxel_id",
]
# Target number of root ids per query.
SYN_ROOT_IDS_PER_QUERY = 40
# Upper bound asserted on the size of each processed chunk.
# NOTE(review): presumably guards against a server-side row limit silently
# truncating a query -- confirm the actual limit.
SYN_MAX_ROWS_PER_QUERY = 500_000


def _query_synapses(root_id_column, root_ids):
    """Download all synapses whose `root_id_column` is in `root_ids`.

    The ids are split into chunks of roughly SYN_ROOT_IDS_PER_QUERY ids and
    queried one chunk at a time from the "synapses_v1dd" table. Rows where the
    pre- and post-synaptic root id are identical are removed, and the columns
    in SYN_DROP_COLUMNS are dropped.

    Parameters
    ----------
    root_id_column : str
        Column to filter on ("pre_pt_root_id" or "post_pt_root_id").
    root_ids : array-like
        Root ids to query for.

    Returns
    -------
    pd.DataFrame
        All chunks concatenated, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If `root_ids` is empty.
    AssertionError
        If a processed chunk has SYN_MAX_ROWS_PER_QUERY rows or more.
    """
    if len(root_ids) == 0:
        raise ValueError(f"no root ids given for {root_id_column} query")

    # max(1, ...) keeps np.array_split valid when there are fewer ids than
    # SYN_ROOT_IDS_PER_QUERY (floor division would yield 0 sections).
    n_chunks = max(1, len(root_ids) // SYN_ROOT_IDS_PER_QUERY)

    chunks = []
    for root_id_chunk in tqdm.tqdm(np.array_split(root_ids, n_chunks)):
        chunk = client.materialize.query_table(
            "synapses_v1dd",
            filter_in_dict={root_id_column: root_id_chunk},
            desired_resolution=[1, 1, 1],
            split_positions=True,
        )
        # Remove synapses from a root id onto itself.
        chunk = chunk[chunk["pre_pt_root_id"] != chunk["post_pt_root_id"]]
        chunk = chunk.drop(columns=SYN_DROP_COLUMNS)
        assert len(chunk) < SYN_MAX_ROWS_PER_QUERY, (
            f"{root_id_column} chunk returned {len(chunk)} rows; "
            "use fewer root ids per query"
        )
        chunks.append(chunk)

    # ignore_index gives a default RangeIndex, which every pandas version can
    # serialize to feather.
    return pd.concat(chunks, ignore_index=True)


def _load_or_query_synapses(cache_path, root_id_column, root_ids):
    """Return the synapse table cached at `cache_path`, downloading it if absent.

    A freshly downloaded table is written to `cache_path` (zstd-compressed
    feather) before it is returned.
    """
    if os.path.exists(cache_path):
        return pd.read_feather(cache_path)

    side_df = _query_synapses(root_id_column, root_ids)
    side_df.to_feather(cache_path, compression="zstd")
    return side_df


if os.path.exists(syn_path):
    syn_df = pd.read_feather(syn_path)
else:
    # Each side is cached under its own path. syn_path is written only once the
    # combined table is complete, so an interrupted run can never leave a
    # partial file that a later run would load as the full table.
    pre_syn_df = _load_or_query_synapses(
        pre_syn_path, "pre_pt_root_id", axon_proof_root_ids
    )
    # NOTE(review): the post-synaptic query also uses axon_proof_root_ids (not
    # dendrite_proof_root_ids) -- kept as in the original; confirm intended.
    post_syn_df = _load_or_query_synapses(
        post_syn_path, "post_pt_root_id", axon_proof_root_ids
    )

    # A synapse between two listed cells is returned by both queries; keep one.
    syn_df = pd.concat([pre_syn_df, post_syn_df])
    syn_df = syn_df.drop_duplicates("id", keep="first").reset_index(drop=True)
    syn_df.to_feather(syn_path, compression="zstd")

In [10]:
# Display the combined synapse table (pandas truncates the preview).
syn_df

Unnamed: 0,id,pre_pt_position_x,pre_pt_position_y,pre_pt_position_z,post_pt_position_x,post_pt_position_y,post_pt_position_z,ctr_pt_position_x,ctr_pt_position_y,ctr_pt_position_z,size,pre_pt_root_id,post_pt_root_id
0,354386968,758200.5,802316.1,304380.0,757861.0,802558.6,304650.0,757967.7,802597.4,304380.0,240,864691132536286810,864691132734919083
1,220616943,574501.9,337249.6,258570.0,574152.7,337016.8,258570.0,574337.0,336900.4,258570.0,420,864691132558380553,864691132828255906
2,119675985,444260.0,602544.6,3285.0,443988.4,602311.8,3555.0,444182.4,602370.0,3780.0,3637,864691132572564252,864691132654028028
3,378070488,792063.2,514342.5,183735.0,792664.6,514284.3,183915.0,792412.4,514294.0,183735.0,3056,864691132572190492,864691132606767301
4,560121937,1054031.1,711388.3,323010.0,1054234.8,711417.4,323100.0,1054147.5,711368.9,323010.0,440,864691132558474249,864691132640932240
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8190070,418692287,844967.0,719390.8,245880.0,844520.8,718964.0,246240.0,844889.4,719352.0,246150.0,294,864691132720378721,864691132831272057
8190071,450289242,902575.3,759607.0,153045.0,902914.8,760043.5,152460.0,902895.4,759742.8,152730.0,537,864691130711898993,864691132831272057
8190072,466283966,904689.9,773167.6,330120.0,904893.6,773652.6,330030.0,904738.4,773487.7,329805.0,4893,864691132638495307,864691132831272057
8190073,449903160,897172.4,751934.3,237060.0,896755.3,752322.3,237375.0,897085.1,752167.1,237105.0,1856,864691132626349628,864691132831272057


In [None]:
# Feather file used to cache the synapse labels downloaded below.
syn_label_path = (
    f"{data_dir}/syn_label_df_all_to_proofread_to_all_{mat_version}.feather"
)


def get_syn_label_chunk(syn_id_chunk) -> pd.DataFrame:
    """Fetch the predicted target tag for one chunk of synapse ids.

    Queries "synapse_target_predictions_ssa" for rows whose `target_id` is in
    `syn_id_chunk`, selecting only the `target_id` and `tag` columns, and
    returns the result with `target_id` renamed to `id`.
    """
    query_kwargs = dict(
        filter_in_dict={"target_id": syn_id_chunk},
        log_warning=False,
        merge_reference=False,
        select_columns=["target_id", "tag"],
    )
    labels = client.materialize.query_table(
        "synapse_target_predictions_ssa", **query_kwargs
    )
    return labels.rename(columns={"target_id": "id"})


if not os.path.exists(syn_label_path):
    # No cache yet: download the labels for every synapse id in parallel.
    syn_ids = syn_df["id"].values
    chunk_size = 25_000
    n_chunks = 1 + len(syn_ids) // chunk_size

    syn_id_chunks = np.array_split(syn_ids, n_chunks)
    label_jobs = [delayed(get_syn_label_chunk)(ids) for ids in syn_id_chunks]

    # tqdm_joblib reports joblib's progress; n_jobs=-2 uses all CPUs but one.
    with tqdm_joblib(desc="Pulling synapse labels", total=len(syn_id_chunks)):
        syn_label_chunks: list[pd.DataFrame] = Parallel(n_jobs=-2)(label_jobs)

    syn_label_df: pd.DataFrame = pd.concat(syn_label_chunks).set_index("id")
    syn_label_df.to_feather(syn_label_path, compression="zstd")
else:
    # Reuse the labels cached by an earlier run.
    syn_label_df = pd.read_feather(syn_label_path)

syn_label_df

Pulling synapse labels: 100%|██████████| 328/328 [00:30<00:00, 10.82it/s]
