In [2]:
import pandas as pd
import caveclient 
import numpy as np
import sys
import os
from deltalake import DeltaTable, write_deltalake
import pyarrow as pa
from pathlib import Path

In [3]:
# Put the capsule's source directory on the import path
# (presumably where connects_common_connectivity lives — confirm).
sys.path.append("/root/capsule/src/")

In [4]:
from connects_common_connectivity.arrow_utils import build_arrow_schema, models_to_table, attach_linkml_metadata


In [5]:
# Connect to the CAVE datastack, set the client's materialization version,
# and pull the nucleus lookup view.
version = 1412
client = caveclient.CAVEclient(
    'minnie65_phase3_v1',
    auth_token=os.environ['CUSTOM_KEY'],
)
client.materialize.version = version
nuc_df = client.materialize.query_view('nucleus_detection_lookup_v1')

# Disabled on purpose: the one-off steps that wrote
# ../data/minnie1412/minnie_features.parquet (feature table left-joined to nucleus ids).
# df = pd.read_feather('../data/minnie1412/joint_clustering_feat_df_minnie.feather')
# dfm=pd.merge(df, nuc_df[['id', 'pt_root_id']], left_on='root_id', right_on='pt_root_id', how='left')
# dfm.to_parquet('../data/minnie1412/minnie_features.parquet')

In [20]:
# Load the per-cell feature table from the cached parquet file.
dfm = pd.read_parquet('../data/minnie1412/minnie_features.parquet')

In [21]:
from connects_common_connectivity.models import DataSet, Modality, DataItem, DataItemDataSetAssociation

In [22]:
# Catalog record describing this collection; ds.id is reused later as the
# dataset_id of every DataItem association.
ds = DataSet(
    project_id="minnie65",
    id='minnie65_v1412_csm_cluster',
    name="Minnie65 v1412 CSM Dendrite Ultrastructure Collection",
    modality="ELECTRON_MICROSCOPY",
    publication="none",
)

In [23]:
# Echo the DataSet model so its repr is shown as the cell output.
ds

DataSet(project_id='minnie65', id='minnie65_v1412_csm_cluster', name='Minnie65 v1412 CSM Dendrite Ultrastructure Collection', publication='none', modality='ELECTRON_MICROSCOPY')

In [24]:
# Convert the single DataSet model to an Arrow table using the schema built
# from the model class, then attach the LinkML class metadata.
schema = build_arrow_schema(DataSet)
table = attach_linkml_metadata(
    models_to_table([ds], schema),
    linkml_class="DataSet",  # version auto-populated
)



In [25]:
# Append the DataSet table to its Delta table, partitioned by project.
# NOTE: mode="append" is not idempotent — re-running this cell writes the same rows again.
PATH = "../results/dataset/"
write_deltalake(
    PATH,
    table,
    partition_by=["project_id"],
    mode="append",
)

In [76]:
# Fetch the nucleus lookup view at materialization 1412 and drop rows whose
# pt_root_id is 0 (presumably nuclei without a segmented object — TODO confirm).
nuc_df = client.materialize.query_view('nucleus_detection_lookup_v1', materialization_version=1412)
nuc_df.query('pt_root_id!=0', inplace=True)

# One DataItem per nucleus: id is the nucleus id, name is its root id.
# itertuples() replaces iterrows() because iterrows() builds a Series per row
# and coerces the row to a single dtype; on an all-numeric frame that would
# turn the 64-bit ids into floats and corrupt their string form.
# itertuples() keeps each column's own dtype and is considerably faster.
data_items = [
    DataItem(
        id=str(row.id),
        name=str(row.pt_root_id),
        project_id="minnie65",
    )
    for row in nuc_df.itertuples(index=False)
]

In [26]:
# data_items=[DataItem(project_id="minnie65",
#          id = str(row.id),
#          name= str(row.root_id)) for idx, row in dfm.iterrows()]

In [77]:
# Arrow table of all DataItem models, tagged with its LinkML class.
schema = build_arrow_schema(DataItem)
table = attach_linkml_metadata(
    models_to_table(data_items, schema),
    linkml_class="DataItem",  # version auto-populated
)



In [78]:
# Append the DataItem table to its Delta table, partitioned by project.
# NOTE: mode="append" is not idempotent — re-running this cell writes the same rows again.
PATH = "../results/dataitem/"
write_deltalake(
    PATH,
    table,
    partition_by=["project_id"],
    mode="append",
)

In [29]:
# Link every DataItem to the dataset record `ds`, one association per item.
data_item_associations = []
for item in data_items:
    data_item_associations.append(
        DataItemDataSetAssociation(
            project_id='minnie65',
            dataset_id=ds.id,
            dataitem_id=item.id,
        )
    )

In [30]:
# Arrow table of the item-to-dataset associations, tagged with its LinkML class.
schema = build_arrow_schema(DataItemDataSetAssociation)
table = attach_linkml_metadata(
    models_to_table(data_item_associations, schema),
    linkml_class="DataItemDataSetAssociation",  # version auto-populated
)



In [31]:
# Append the association table to its Delta table, partitioned by project.
# NOTE: mode="append" is not idempotent — re-running this cell writes the same rows again.
PATH = "../results/dataitem_dataset_association/"
write_deltalake(
    PATH,
    table,
    partition_by=["project_id"],
    mode="append",
)

In [32]:
import polars as pl

# Read back the two Delta tables just written and join them, to check that
# the associations of this dataset resolve to DataItem rows.
DATASET_ID = "minnie65_v1412_csm_cluster"

assoc_df = pl.read_delta("../results/dataitem_dataset_association/")
items_df = pl.read_delta("../results/dataitem/")

# Both conditions must hold for a row to be kept (same as two chained filters).
in_dataset = (pl.col("project_id") == "minnie65") & (pl.col("dataset_id") == DATASET_ID)
result = assoc_df.filter(in_dataset).join(
    items_df,
    left_on="dataitem_id",
    right_on="id",
    how="inner",
)

print(result)

shape: (35_795, 6)
┌─────────────┬─────────────────┬────────────┬─────────────────┬─────────────────┬─────────────────┐
│ dataitem_id ┆ dataset_id      ┆ project_id ┆ name            ┆ neuroglancer_li ┆ project_id_righ │
│ ---         ┆ ---             ┆ ---        ┆ ---             ┆ nk              ┆ t               │
│ str         ┆ str             ┆ str        ┆ str             ┆ ---             ┆ ---             │
│             ┆                 ┆            ┆                 ┆ str             ┆ str             │
╞═════════════╪═════════════════╪════════════╪═════════════════╪═════════════════╪═════════════════╡
│ 485509      ┆ minnie65_v1412_ ┆ minnie65   ┆ 864691136740606 ┆ null            ┆ minnie65        │
│             ┆ csm_cluster     ┆            ┆ 812             ┆                 ┆                 │
│ 263203      ┆ minnie65_v1412_ ┆ minnie65   ┆ 864691136210678 ┆ null            ┆ minnie65        │
│             ┆ csm_cluster     ┆            ┆ 204             ┆        

In [33]:
# Load the feature-definition table: one row per feature with its
# description, unit, data_type and optional range bounds.
units_df = pd.read_csv('../data/minnie1412/minnie_cell_features.csv')

In [34]:
# Display the feature-definition table.
units_df

Unnamed: 0,id,description,unit,data_type,range_min,range_max
0,nucleus_volume_um,Nucleus volume,MICRONS_CUBED,<f4,0.0,
1,nucleus_area_um,Nucleus surface area,MICRONS_SQUARE,<f4,0.0,
2,nuclear_area_to_volume_ratio,Nucleus surface area to volume ratio,MICRONS_INVERSE,<f4,0.0,
3,nuclear_folding_area_um,Area of nucleus in an infolding (see Elabbady ...,MICRONS_SQUARE,<f4,0.0,
4,fraction_nuclear_folding,Fraction of nucleus in an infolding,RATIO,<f4,0.0,1.0
...,...,...,...,...,...,...
77,branch_svd3,SVD loading dendritic path length vs distance ...,RATIO,<f4,,
78,branch_svd4,SVD loading dendritic path length vs distance ...,RATIO,<f4,,
79,ego_count_pca0,PC loading synapse depth relative to soma comp...,RATIO,<f4,,
80,ego_count_pca1,PC loading synapse depth relative to soma comp...,RATIO,<f4,,


In [35]:
from connects_common_connectivity.models import CellFeatureDefinition


In [36]:
def _optional_float(value):
    """Return ``value`` as a float, or None when the CSV cell was empty (NaN)."""
    return None if pd.isna(value) else float(value)


# Build one CellFeatureDefinition per row of units_df.
# Empty range_min / range_max cells are read by pandas as NaN; they are passed
# as None so a missing bound is stored as null rather than as the float NaN.
# NOTE(review): assumes range_min/range_max are optional on
# CellFeatureDefinition — confirm against the model.
fds = []
for idx, row in units_df.iterrows():
    fd = CellFeatureDefinition(
        id=str(row['id']),
        description=str(row['description']),
        unit=str(row['unit']),
        data_type=str(row['data_type']),
        range_min=_optional_float(row['range_min']),
        range_max=_optional_float(row['range_max']),
    )
    fds.append(fd)


In [37]:
# Arrow table of all feature definitions, tagged with its LinkML class.
schema = build_arrow_schema(CellFeatureDefinition)
table = attach_linkml_metadata(
    models_to_table(fds, schema),
    linkml_class="CellFeatureDefinition",  # version auto-populated
)



In [38]:
# Append the feature definitions to their Delta table (no partitioning).
# NOTE: mode="append" is not idempotent — re-running this cell writes the same rows again.
PATH = "../results/cellfeaturedefinition/"
write_deltalake(
    PATH,
    table,
    mode="append",
)

In [39]:
from connects_common_connectivity.models import CellFeatureSet


In [40]:
# Feature-set record grouping every definition in `fds`.
# The description and extraction_method strings are persisted to the data
# lake, so spelling matters: "Schneider-Mizell", "derived" and "Aggregated"
# were previously misspelled.
cfs = CellFeatureSet(
    id='csm_cluster_features',
    description=(
        "Cell features used for clustering in the Allen Institute's large scale EM projects. "
        'Contains features from Elabbady et al 2025, Schneider-Mizell et al 2025, and '
        'some more recent features derived from spine detection from Ben Pedigo. '
        'Feature set developed by Casey Schneider-Mizell.  '
        'Tries to take a synapse centric morphological approach with features '
        'describing how synapse densities are distributed across the dendritic arbors.'
    ),
    feature_definition_ids=[fd.id for fd in fds],
    extraction_method='Aggregated and computed via https://github.com/AllenInstitute/em_skeleton_feature_extraction.'
)
    

In [41]:
# Serialize the single CellFeatureSet record and append it to its Delta table.
schema = build_arrow_schema(CellFeatureSet)
table = attach_linkml_metadata(
    models_to_table([cfs], schema),
    linkml_class="CellFeatureSet",  # version auto-populated
)

# NOTE: mode="append" is not idempotent — re-running this cell writes the same row again.
PATH = "../results/cellfeatureset/"
write_deltalake(
    PATH,
    table,
    mode="append",
)


In [42]:
from connects_common_connectivity.arrow_utils import build_cell_feature_matrix_schema
# Build the Arrow schema for the feature matrix from the feature set and its
# definitions; "id" is the column that identifies each cell.
schema = build_cell_feature_matrix_schema(cfs, fds, cell_index_column="id")

In [43]:
# Start from the feature table without root_id / pt_root_id_y / valence, then
# add the two partition columns and a string-typed cell id.
df = dfm.drop(columns=['root_id', 'pt_root_id_y', 'valence'])
df['project_id'] = 'minnie65'
df['feature_set_id'] = 'csm_cluster_features'
df['id'] = dfm['id'].astype('string')

# Cast each feature column according to its definition. data_type values look
# like '<f4', so character 1 is the kind code: 'f' columns become float32,
# 'i' columns become int32, and any other kind is left as it is.
kind_to_dtype = {'f': 'float32', 'i': 'int32'}
for cfd in fds:
    target_dtype = kind_to_dtype.get(cfd.data_type[1])
    if target_dtype is not None:
        df[cfd.id] = df[cfd.id].astype(target_dtype)


In [44]:
# Convert the prepared frame to Arrow using the feature-matrix schema; the
# pandas index is not written as a column.
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)

In [45]:
# Append the feature matrix, partitioned by project and by feature set.
# NOTE: mode="append" is not idempotent — re-running this cell writes the same rows again.
PATH = "../results/cellfeatures/"
write_deltalake(
    PATH,
    table,
    partition_by=["project_id", "feature_set_id"],
    mode="append",
)

In [46]:
# Read the table back from PATH (whatever PATH was last set to) with polars.
feat_df = pl.read_delta(PATH)

In [47]:
# Display the feature matrix that was read back.
feat_df

id,nucleus_volume_um,nucleus_area_um,nuclear_area_to_volume_ratio,nuclear_folding_area_um,fraction_nuclear_folding,nucleus_to_soma_ratio,soma_volume_um,soma_area_um,soma_to_nucleus_center_dist,soma_area_to_volume_ratio,soma_synapse_density_um,tip_len_dist_dendrite_p75,tip_tort_dendrite_p75,num_syn_dendrite,num_syn_soma,path_length_dendrite,radial_extent_dendrite,syn_dist_distribution_dendrite_p50,syn_size_distribution_soma_p50,syn_size_distribution_dendrite_p50,syn_size_distribution_dendrite_p5,syn_size_distribution_dendrite_p95,syn_size_dendrite_cv,syn_depth_dist_p1,syn_depth_dist_p99,syn_depth_extent,median_density,radius_dist,area_factor,dendrite_length_binned_0,dendrite_length_binned_1,dendrite_length_binned_2,dendrite_length_binned_3,syn_dist_distribution_dendrite_spine_p50,syn_dist_distribution_dendrite_shaft_p50,dend_spine_shaft_offset,…,median_density_shaft,syn_spine_shaft_ratio_dendrite,num_spine_syn_dendrite,num_shaft_syn_dendrite,num_spine_syn_soma,syn_count_dist_binned_shaft_0,syn_count_dist_binned_shaft_1,syn_count_dist_binned_shaft_2,syn_count_dist_binned_shaft_3,syn_count_dist_binned_spine_0,syn_count_dist_binned_spine_1,syn_count_dist_binned_spine_2,syn_count_dist_binned_spine_3,syn_count_dist_binned_ratio_0,syn_count_dist_binned_ratio_1,syn_count_dist_binned_ratio_2,syn_count_dist_binned_ratio_3,syn_count_pca0,syn_count_pca1,syn_count_pca2,syn_count_pca3,syn_count_pca4,syn_count_pca5,syn_count_pca6,syn_count_pca7,syn_count_pca8,syn_count_pca9,branch_svd0,branch_svd1,branch_svd2,branch_svd3,branch_svd4,ego_count_pca0,ego_count_pca1,ego_count_pca2,project_id,feature_set_id
str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i32,i32,f32,f32,f32,i32,i32,i32,i32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,str,str
"""485509""",344.116638,269.338379,0.782695,4.873741e6,0.017687,0.309179,1112.999878,1202.533447,537.42511,1.080443,0.15817,147.846939,1.264484,6768,117,4849.900391,227.368088,88.470299,3532,4340,596,28359,1.170447,27.544989,173.108597,145.563599,1.422885,385.890778,2.670925,818.647766,1723.804077,1067.032959,438.294983,88.60463,90.456306,-1.851673,…,0.414144,1.269837,4718,1956,0,371,721,407,209,606,2213,1268,405,0.706394,1.616585,1.637051,0.95109,5.812237,0.119504,-0.61887,-0.880462,-0.127526,-0.166496,0.165302,-0.123148,1.384805,-0.181554,62.978085,0.950874,0.729539,-1.328438,4.192588,1.960875,0.805681,-0.739578,"""minnie65""","""csm_cluster_features"""
"""263203""",254.538391,250.433167,0.983872,3.075656e7,0.123807,0.397382,640.53772,934.029358,361.333221,1.458196,0.119974,152.7453,1.301633,2036,45,2394.644043,115.675705,74.979202,4564,4618,737,21603,0.944691,23.207253,425.879395,402.672119,0.860583,319.731171,2.512924,508.080261,855.880737,473.234985,117.005539,76.613541,74.566284,2.047251,…,0.189176,1.760139,1547,456,2,133,165,78,18,339,729,340,70,1.343302,2.136713,2.109847,1.90182,-1.764601,-2.52387,-0.272709,-2.189014,-1.03418,-0.030833,0.031632,-0.05569,0.326676,-2.280065,31.198191,-4.262416,0.754856,1.729803,1.534398,0.588246,-0.777625,-0.096427,"""minnie65""","""csm_cluster_features"""
"""456177""",338.026459,298.603607,0.883373,3.5204028e7,0.124294,0.366174,923.129578,1017.687988,365.622009,1.102432,0.189111,153.671768,1.35095,4380,124,4331.236816,107.554108,86.69693,4408,5460,695,26492,0.97058,14.459391,330.694427,316.235046,1.02033,333.71936,2.624826,639.515625,1594.599854,1216.418457,168.38266,88.55098,86.423515,2.127462,…,0.27645,1.333254,3068,1217,0,246,446,278,56,406,1462,951,101,0.720518,1.710583,1.770696,0.839535,-0.915223,-1.138865,-0.62281,4.627585,-0.271961,-0.14354,0.484421,-0.112422,0.98503,-0.476574,56.717365,4.266992,9.26533,-0.329062,7.591257,-0.729798,-0.946995,0.546816,"""minnie65""","""csm_cluster_features"""
"""461339""",328.602386,300.213623,0.913608,3.8599868e7,0.126401,0.324381,1013.014954,1051.013306,1082.797363,1.03751,0.093517,163.220428,1.238037,2672,52,2813.501953,116.800041,83.578728,3458,5186,574,18920,0.865209,59.640705,553.47467,493.833954,0.915874,327.661102,2.319643,523.42218,890.910522,719.018127,118.758301,80.014793,95.96003,-15.945234,…,0.309888,0.995062,1749,877,0,189,266,233,50,404,707,491,61,1.091923,1.40691,1.07215,0.281771,-0.318475,-1.321457,4.860217,-0.743879,-0.431872,-0.034319,-0.755491,0.012028,-0.744178,2.989701,34.661217,0.517283,3.532224,3.231579,-0.251822,0.552461,-0.534918,-0.287912,"""minnie65""","""csm_cluster_features"""
"""302377""",292.694,327.72168,1.119673,6.713424e7,0.197033,0.349798,836.752563,1107.547852,977.260864,1.323626,0.139707,142.371948,1.279469,2383,77,3074.820801,118.841194,72.394661,3844,4848,532,21244,0.922376,273.156494,523.057495,249.901016,0.759468,286.268036,2.255821,688.406616,1245.724365,727.305237,105.033318,73.741653,74.023186,-0.281531,…,0.27603,0.815887,1483,842,0,243,348,189,31,397,687,352,44,0.705887,0.979182,0.893669,0.491853,-0.563825,-1.615024,5.870802,-0.696228,-0.383073,0.175306,-0.907597,0.045243,-0.985779,3.231719,45.571911,-3.164315,4.887801,2.438147,0.188189,-0.621269,-0.94124,-0.753779,"""minnie65""","""csm_cluster_features"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""256280""",338.222168,273.305908,0.808066,4.731331e6,0.014629,0.357534,945.987122,1132.804077,330.315338,1.197484,0.174231,142.346664,1.262041,5617,94,4996.428223,171.832535,79.3311,2844,4504,583,24702,1.047025,19.659378,181.790573,162.131195,1.08333,331.814056,2.663466,1008.819702,2076.951416,931.284668,409.198792,80.43734,77.527237,2.910104,…,0.292757,1.49547,4093,1451,0,372,545,261,150,676,2248,890,213,0.85998,2.042311,1.765859,0.503062,5.116732,-0.12661,-0.943557,-1.171136,-0.152127,0.089928,1.819459,-0.085567,0.630853,-0.062365,68.597374,-7.284147,1.391263,-5.12009,-2.106417,0.527054,1.737964,-1.021066,"""minnie65""","""csm_cluster_features"""
"""258113""",341.24054,289.302826,0.847797,2.3207194e7,0.07973,0.338805,1007.189941,1126.204468,455.18277,1.118165,0.158707,153.30571,1.312715,4468,100,4619.354492,101.683472,86.536369,4448,5102,585,27182,1.02501,30.442608,280.330597,249.888,1.001496,330.485291,2.55391,907.731934,1793.58606,1158.996216,453.717865,88.418129,84.297691,4.120444,…,0.245242,1.571383,3277,1102,0,289,378,275,125,424,1620,1006,204,0.55141,2.096614,1.867324,0.7022,-0.7926,-0.67776,-0.796247,2.996905,-0.291545,0.187134,4.478372,-0.056041,0.631604,-0.436205,64.209297,0.124744,4.125066,1.052152,-2.153912,-1.061345,-0.573624,0.390256,"""minnie65""","""csm_cluster_features"""
"""258355""",223.770218,202.311554,0.904104,791314.375,0.003746,0.421774,530.5448,621.939514,539.227661,1.172266,0.148173,204.468369,1.274882,1373,60,2574.078369,148.92598,90.121712,2900,4508,484,25402,1.100713,26.57056,351.1763,324.605743,0.510034,267.108795,2.067558,360.722748,802.0802,718.004517,343.821869,92.95932,92.357422,0.6019,…,0.200037,0.409331,756,569,1,114,185,152,67,104,331,239,62,-0.131245,0.835881,0.649503,-0.110183,-0.717033,-1.31488,-0.788657,3.889642,-0.307721,-0.093898,0.718085,-0.103261,0.254278,-0.510237,31.755663,6.499439,-0.238466,-1.643386,-0.245814,-1.356785,-0.264673,1.984516,"""minnie65""","""csm_cluster_features"""
"""256602""",244.462021,228.91745,0.936413,9.46886e6,0.039617,0.388277,629.606934,775.752686,445.404053,1.232122,0.13029,151.957291,1.35199,2344,69,3207.395508,118.277939,86.771233,3276,5040,564,21248,0.939229,29.61202,260.218689,230.606674,0.745805,294.417572,2.356135,593.553711,1118.317261,885.582092,255.265305,88.005402,88.005402,0.0,…,0.182526,1.485769,1685,601,1,137,198,150,69,228,804,553,81,0.730679,2.01622,1.875337,0.228269,0.619391,-0.625096,-1.1293,0.977087,-0.26526,0.420049,5.232856,-0.023793,0.455531,-0.20996,43.433788,3.230959,1.529497,3.024427,-0.886315,-1.313265,0.020434,0.008615,"""minnie65""","""csm_cluster_features"""


In [48]:
# Re-create the CAVE client for the same datastack, replacing the earlier `client`.
# NOTE(review): no materialize.version is assigned on this new client here, so
# queries made before it is set again presumably use the client's default
# version — confirm that is intended.
client = caveclient.CAVEclient('minnie65_phase3_v1', auth_token=os.environ['CUSTOM_KEY'])

In [49]:
# Query the cell-type annotation table; later cells read its target_id,
# classification_system and cell_type columns.
ct_df=client.materialize.query_table('cell_type_multifeature_v1')

In [50]:
# Distinct cell_type labels within each classification system, in the order
# value_counts() returns them.
excitatory_types = list(
    ct_df.query('classification_system == "excitatory"').cell_type.value_counts().index
)
inhibitory_types = list(
    ct_df.query('classification_system == "inhibitory"').cell_type.value_counts().index
)

In [51]:

from connects_common_connectivity.models import Cluster # now picks up the refreshed definition

# Three-level hierarchy: neuron (level 0) -> excitatory / inhibitory (level 1)
# -> individual cell types (level 2).
nrn_cluster = Cluster(
    project_id='minnie65',
    id='neuron',
    level=0,
    children=['excitatory', 'inhibitory'],
)
inh_cluster = Cluster(
    project_id='minnie65',
    id='inhibitory',
    level=1,
    parent=nrn_cluster.id,
    children=inhibitory_types,
)
exc_cluster = Cluster(
    project_id='minnie65',
    id='excitatory',
    level=1,
    parent=nrn_cluster.id,
    children=excitatory_types,
)

# Leaf clusters: inhibitory types first, then excitatory types, each pointing
# back to its level-1 parent.
leaf_clusters = [
    Cluster(id=ctype, parent=parent.id, level=2, project_id='minnie65')
    for parent, ctypes in (
        (inh_cluster, inhibitory_types),
        (exc_cluster, excitatory_types),
    )
    for ctype in ctypes
]
clusters = [nrn_cluster, inh_cluster, exc_cluster, *leaf_clusters]

In [52]:
# Arrow table of the cluster hierarchy, tagged with its LinkML class.
schema = build_arrow_schema(Cluster)
table = attach_linkml_metadata(
    models_to_table(clusters, schema),
    linkml_class="Cluster",  # version auto-populated
)


In [53]:
# Display the Arrow table (schema plus a preview of each column).
table

pyarrow.Table
project_id: string not null
id: string not null
parent: string
children: list<item: string>
  child 0, item: string
level: int64
score: double
distance_to_parent: double
----
project_id: [["minnie65","minnie65","minnie65","minnie65","minnie65",...,"minnie65","minnie65","minnie65","minnie65","minnie65"]]
id: [["neuron","inhibitory","excitatory","PTC","DTC",...,"L5IT","L6IT","L5ET","L5NP","L6SP"]]
parent: [[null,"neuron","neuron","inhibitory","inhibitory",...,"excitatory","excitatory","excitatory","excitatory","excitatory"]]
children: [[["excitatory","inhibitory"],["PTC","DTC","ITC","STC"],...,null,null]]
level: [[0,1,1,2,2,...,2,2,2,2,2]]
score: [[null,null,null,null,null,...,null,null,null,null,null]]
distance_to_parent: [[null,null,null,null,null,...,null,null,null,null,null]]

In [54]:
# Append the cluster table to its Delta table, partitioned by project.
# NOTE: mode="append" is not idempotent — re-running this cell writes the same rows again.
PATH = "../results/cluster/"
write_deltalake(
    PATH,
    table,
    partition_by=["project_id"],
    mode="append",
)

In [55]:
import importlib, connects_common_connectivity.models as _m
# Re-execute the models module in the running kernel before importing
# ClusterMembership from it.
# NOTE(review): names imported from this module before the reload (e.g.
# DataSet, Cluster) still refer to the pre-reload class objects.
importlib.reload(_m)
from connects_common_connectivity.models import ClusterMembership


In [56]:
# All items are neurons, so every row of ct_df yields three memberships, one
# per hierarchy level: the root 'neuron' cluster (probability 1.0), its
# classification system, and its cell type.
# itertuples() replaces iterrows(): iterrows() coerces each row to a single
# dtype (a risk for integer ids on an all-numeric frame) and is much slower.
cms = []
for row in ct_df.itertuples(index=False):
    item_id = str(row.target_id)
    cms.extend([
        ClusterMembership(
            item=item_id,
            cluster='neuron',
            probability=1.0,
            project_id='minnie65',
        ),
        ClusterMembership(
            item=item_id,
            cluster=row.classification_system,
            project_id='minnie65',
        ),
        ClusterMembership(
            item=item_id,
            cluster=row.cell_type,
            project_id='minnie65',
        ),
    ])

In [57]:
# Arrow table of all cluster memberships, tagged with its LinkML class.
schema = build_arrow_schema(ClusterMembership)
table = attach_linkml_metadata(
    models_to_table(cms, schema),
    linkml_class="ClusterMembership",  # version auto-populated
)


In [58]:
# Append the memberships to their Delta table, partitioned by project.
# NOTE: mode="append" is not idempotent — re-running this cell writes the same rows again.
PATH = "../results/clustermembership/"
write_deltalake(
    PATH,
    table,
    partition_by=["project_id"],
    mode="append",
)

In [59]:
from connects_common_connectivity.models import CellCellMeasurementMatrix


In [60]:
# Load the connection table. header=None: the file is read as having no header
# row, so column names are assigned separately afterwards.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt — the load
# appears slow; confirm the cell completes on a fresh run.
conn_df = pd.read_csv('../data/minnie1412/public_minnie65_phase3_v1_v1412_connections_with_nuclei.csv.gz', header=None)


KeyboardInterrupt: 

In [None]:
# Name the six positional columns of the header-less connections file.
columns = [
    'pre_pt_root_id',
    'post_pt_root_id',
    'n_syn',
    'sum_size',
    'pre_nuc_id',
    'post_nuc_id',
]
conn_df.columns = columns

In [61]:
# Preview the first rows of the connection table.
conn_df.head()

Unnamed: 0,pre_pt_root_id,post_pt_root_id,n_syn,sum_size,pre_nuc_id,post_nuc_id
43176584,864691135136899865,864691134884743162,1,5984,337175,304043
43181492,864691135360346200,864691134884756730,1,5920,330167,339142
43181499,864691135373601736,864691134884756730,1,3852,273595,339142
43183398,864691135928570068,864691134884756730,1,11504,301121,339142
43183978,864691135373601736,864691134884759546,2,6244,273595,205051


In [10]:
# Drop, in place, every connection where either side has a nucleus id of -1
# (presumably the marker for "no nucleus matched" — TODO confirm).
conn_df.query('pre_nuc_id!=-1 and post_nuc_id!=-1', inplace=True)

In [11]:
# Set the client's materialization version to 1412 for the queries that follow.
client.materialize.version = 1412

In [12]:
# Query the proofreading status table; its status_axon and pt_root_id columns
# are used to filter the connections.
prf_df = client.materialize.query_table('proofreading_status_and_strategy')

In [13]:
# Keep only connections whose presynaptic root id belongs to a cell with
# status_axon == "t" in the proofreading table.
prf_axons = prf_df.query('status_axon=="t"')
has_proofread_axon = conn_df.pre_pt_root_id.isin(prf_axons.pt_root_id)
conn_df = conn_df[has_proofread_axon]

In [14]:
# Preview the first rows of the filtered connection table.
conn_df.head()

Unnamed: 0,pre_pt_root_id,post_pt_root_id,n_syn,sum_size,pre_nuc_id,post_nuc_id
43176584,864691135136899865,864691134884743162,1,5984,337175,304043
43181492,864691135360346200,864691134884756730,1,5920,330167,339142
43181499,864691135373601736,864691134884756730,1,3852,273595,339142
43183398,864691135928570068,864691134884756730,1,11504,301121,339142
43183978,864691135373601736,864691134884759546,2,6244,273595,205051


In [15]:
from connects_common_connectivity.models import CellCellConnectivityLong, SynapticMeasurementType, Unit


In [16]:
# One long-format connectivity record per row of conn_df: the row's index
# label is the record id, the nucleus ids identify the two cells, and the
# value is the synapse count.
# itertuples() replaces iterrows(): iterrows() builds a Series per row (very
# slow on a large table) and coerces the row to one dtype, which would turn
# the integer ids into floats if any column were float.
cccls = [
    CellCellConnectivityLong(
        id=str(row.Index),
        presynaptic_cell=str(row.pre_nuc_id),
        postsynaptic_cell=str(row.post_nuc_id),
        measurement_type=SynapticMeasurementType.SYNAPSE_COUNT,
        value=row.n_syn,
        unit=Unit.COUNT,
        project_id='minnie65',
    )
    for row in conn_df.itertuples()
]


In [64]:
# Arrow table of the long-format connectivity records, tagged with its LinkML class.
schema = build_arrow_schema(CellCellConnectivityLong)
table = attach_linkml_metadata(
    models_to_table(cccls, schema),
    linkml_class="CellCellConnectivityLong",  # version auto-populated
)


In [65]:
# Display the Arrow table (schema plus a preview of each column).
table

pyarrow.Table
project_id: string not null
id: string not null
description: string
presynaptic_cell: string
postsynaptic_cell: string
measurement_type: string
modality: string
value: double not null
unit: string not null
----
project_id: [["minnie65","minnie65","minnie65","minnie65","minnie65",...,"minnie65","minnie65","minnie65","minnie65","minnie65"]]
id: [["43176584","43181492","43181499","43183398","43183978",...,"303199984","303200956","303201438","303201480","303202316"]]
description: [[null,null,null,null,null,...,null,null,null,null,null]]
presynaptic_cell: [["337175","330167","273595","301121","273595",...,"497103","463716","490761","363525","520364"]]
postsynaptic_cell: [["304043","339142","339142","339142","205051",...,"422139","422139","422139","422139","422139"]]
measurement_type: [["SYNAPSE_COUNT","SYNAPSE_COUNT","SYNAPSE_COUNT","SYNAPSE_COUNT","SYNAPSE_COUNT",...,"SYNAPSE_COUNT","SYNAPSE_COUNT","SYNAPSE_COUNT","SYNAPSE_COUNT","SYNAPSE_COUNT"]]
modality: [[null,null,null,n

In [19]:
# Append the connectivity records, partitioned by project and measurement type.
# NOTE: mode="append" is not idempotent — re-running this cell writes the same rows again.
PATH = "../results/cellcellconnectivitylong/"
write_deltalake(
    PATH,
    table,
    partition_by=["project_id", "measurement_type"],
    mode="append",
)

In [None]:
# Pivot conn_df into a matrix whose rows are pre_nuc_id and whose columns are
# post_nuc_id. The cell values are sum_size (not n_syn); pairs with no
# connection are filled with 0.
# aggfunc='first' keeps the first row found for each (pre, post) pair instead
# of combining duplicates.
# NOTE(review): pivot_table returns an ordinary dense DataFrame, not a sparse
# matrix — confirm whether a sparse conversion was intended.
syn_conn_matrix = conn_df.pivot_table(index='pre_nuc_id', columns='post_nuc_id', aggfunc='first', values='sum_size', fill_value=0)

In [141]:
# Display the pivoted pre x post matrix.
syn_conn_matrix

post_nuc_id,20639,20995,21033,21398,21767,22378,23152,26342,26345,26656,...,762482,763930,764132,764275,764300,764488,766128,766554,767708,768156
pre_nuc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
161736,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
164494,0,5676,0,6592,4264,3728,3056,1244,0,0,...,0,0,0,0,0,0,0,0,0,0
188569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
188961,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
189149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
614445,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
614654,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
616159,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
617832,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,id,volume,pt_root_id,orig_root_id,pt_supervoxel_id,pt_position,pt_position_lookup
1,373879,229.045043,864691136090135607,864691136090135607,96218056992431305,"[228816, 239776, 19593]","[228816, 239776, 19593]"
3,201858,93.753836,864691135373893678,864691135373893678,84955554103121097,"[146848, 213600, 26267]","[146848, 213600, 26267]"
4,600774,135.189791,864691135682378744,0,111493022281121981,"[339120, 276112, 19442]","[339520, 276480, 19506]"
5,408486,103.686144,864691135194387242,864691135194387242,98470475952865044,"[245024, 244416, 25074]","[245024, 244416, 25074]"
7,598774,31.230034,864691135741608653,864691135741608653,110718553912730154,"[334096, 273472, 20713]","[334064, 273328, 20701]"
...,...,...,...,...,...,...,...
144113,232979,779.511235,864691135496010384,864691135496010384,85094504683662856,"[147744, 200000, 24666]","[147744, 200000, 24666]"
144115,598753,792.030249,864691135743752909,864691135743752909,110506897924421202,"[332576, 269216, 20733]","[332576, 269216, 20733]"
144116,111162,800.065782,864691134912248365,864691134912248365,79244553336437996,"[105344, 130400, 26721]","[105344, 130400, 26721]"
144118,528334,896.589660,864691135968943973,864691135968943973,105853763977769997,"[298608, 203488, 15267]","[298608, 203488, 15267]"


In [67]:

# Re-read the DataItem Delta table with polars.
items_df = pl.read_delta("../results/dataitem/")


In [68]:
# Display the DataItem table that was read back.
items_df

id,name,neuroglancer_link,project_id
str,str,str,str
"""485509""","""864691136740606812""",,"""minnie65"""
"""263203""","""864691136210678204""",,"""minnie65"""
"""456177""","""864691134965388575""",,"""minnie65"""
"""461339""","""864691135591398411""",,"""minnie65"""
"""302377""","""864691136053189107""",,"""minnie65"""
…,…,…,…
"""256280""","""864691135938775556""",,"""minnie65"""
"""258113""","""864691135801170018""",,"""minnie65"""
"""258355""","""864691136967855566""",,"""minnie65"""
"""256602""","""864691135919756208""",,"""minnie65"""


In [None]:
# from connects_common_connectivity.models import CellCellMeasurementMatrix
# ccmm = CellCellMeasurementMatrix(
#     id='minnie65_v1412_synapse_connection_matrix',
#     project_id='minnie65',
#     modality="ELECTRON_MICROSCOPY",
#     presynaptic_index=nuc_df[nuc_df.id.isin(syn_conn_matrix.index)].id.astype('string').tolist(),
#     postsynaptic_index=nuc_df[nuc_df.id.isin(syn_conn_matrix.columns)].id.astype('string').tolist(),
#     measurement_type='SUM_ANATOMICAL_SIZE',
#     unit='ARBITRARY_UNIT',
#     values='file://./datalake/connectivity_values/project_id=minnie65/minnie65_v1412_synapse_connection_matrix.zarr'
# )

In [146]:
# CellCellMeasurementMatrix?

[31mInit signature:[39m
CellCellMeasurementMatrix(
    *,
    project_id: str,
    id: str,
    description: Optional[str] = [38;5;28;01mNone[39;00m,
    presynaptic_index: Optional[list[str]] = [38;5;28;01mNone[39;00m,
    postsynaptic_index: Optional[list[str]] = [38;5;28;01mNone[39;00m,
    measurement_type: Optional[connects_common_connectivity.models.SynapticMeasurementType] = [38;5;28;01mNone[39;00m,
    modality: Optional[connects_common_connectivity.models.Modality] = [38;5;28;01mNone[39;00m,
    values: str,
    unit: connects_common_connectivity.models.Unit,
) -> [38;5;28;01mNone[39;00m
[31mDocstring:[39m      Aggregated projection measurements for a cohort (e.g., all cells) for a single measurement type.
[31mInit docstring:[39m
Create a new model by parsing and validating input data from keyword arguments.

Raises [`ValidationError`][pydantic_core.ValidationError] if the input data cannot be
validated to form a valid model.

`self` is explicitly positional-