In [1]:
from pathlib import Path
# import json

import pandas as pd
import anndata as ad
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

# Load ABC Atlas data

In [2]:
# Identifiers interpolated into every data path used by the loading cells:
# the MERFISH brain specimen ID and the version sub-directory name.
brain_id, version = 'C57BL6J-638850', '20230830'

# Root directory under which expression_matrices/ and metadata/ are resolved.
abc_root = Path("/data/abc_atlas/")

In [3]:
# Open the log2 expression matrix for the selected brain/version.
# backed='r' is passed so the .h5ad file is opened in read-only backed mode
# rather than being read fully into memory.
log2_h5ad_path = abc_root / f"expression_matrices/MERFISH-{brain_id}/{version}/{brain_id}-log2.h5ad"
adata_log2 = ad.read_h5ad(log2_h5ad_path, backed='r')

In [4]:
# Load the gene metadata table (gene.csv), using the first CSV column as the index.
# NOTE(review): the dtype entry for 'cell_label' looks copied from the
# cell-metadata cell — confirm whether gene.csv actually has that column.
gene_md_path = abc_root / f'metadata/MERFISH-{brain_id}/{version}/gene.csv'
gene_md_df = pd.read_csv(gene_md_path, index_col=0, dtype={'cell_label': str})

In [5]:
# Load the per-cell metadata table (cell_metadata.csv), using the first CSV
# column as the index and parsing 'cell_label' as a string.
cell_md_path = abc_root / f'metadata/MERFISH-{brain_id}/{version}/cell_metadata.csv'
cell_md_df = pd.read_csv(cell_md_path, index_col=0, dtype={'cell_label': str})

In [6]:
# Load the "views" variant of the cell metadata that carries cluster
# annotations, using the first CSV column as the index and parsing
# 'cell_label' as a string.
cluster_md_path = (
    abc_root
    / f'metadata/MERFISH-{brain_id}/{version}/views/cell_metadata_with_cluster_annotation.csv'
)
cluster_md_df = pd.read_csv(cluster_md_path, index_col=0, dtype={'cell_label': str})

In [7]:
# Load the "views" variant of the cell metadata that carries parcellation
# annotations. Note the directory differs from the other metadata cells:
# it lives under MERFISH-{brain_id}-CCF rather than MERFISH-{brain_id}.
ccf_md_path = (
    abc_root
    / f'metadata/MERFISH-{brain_id}-CCF/{version}/views/cell_metadata_with_parcellation_annotation.csv'
)
ccf_md_df = pd.read_csv(ccf_md_path, index_col=0, dtype={'cell_label': str})

# # flip y coords so the sections don't display upside down
# flip_y=True
# if flip_y:
#     ccf_md_df[['y_section', 'y_reconstructed']] *= -1

# What's in each metadata file?

## expression matrices (log2 & raw) h5ad

- 4,334,174 cells (~4.3 million)

In [8]:
# Print the observation count of the expression matrix, then end the cell on
# the object itself so the notebook renders its summary repr.
n_obs_report = f'{adata_log2.n_obs=}'
print(n_obs_report)
adata_log2

## gene.csv metadata

In [9]:
# Print the number of rows in the gene metadata table, then display the table
# as the cell's rich output.
gene_rows_report = f'{len(gene_md_df)=}'
print(gene_rows_report)
gene_md_df

## cell_metadata.csv

In [14]:
# Print the number of rows in the cell metadata table, then preview only the
# first five rows as the cell's rich output.
cell_rows_report = f'{len(cell_md_df)=}'
print(cell_rows_report)
cell_md_df.head(n=5)

In [19]:
# Histogram (100 bins) of the 'average_correlation_score' column of the cell
# metadata, with the cell count per bin on the y axis.
corr_scores = cell_md_df['average_correlation_score'].values
fig, ax = plt.subplots()
ax.hist(corr_scores, bins=100)
ax.set(xlabel='average_correlation_score', ylabel='# of cells')
plt.show()

In [12]:
# Print the number of rows in the parcellation-annotated cell metadata, then
# display the table as the cell's rich output.
ccf_rows_report = f'{len(ccf_md_df)=}'
print(ccf_rows_report)
ccf_md_df