In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell execution (picks up edits to the project package
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
import anndata as ad
import pandas as pd
import numpy as np

import scanpy as sc

from thalamus_merfish_analysis import ccf_plots as cplots
from thalamus_merfish_analysis import abc_load as abc

import matplotlib.pyplot as plt
import seaborn as sns
import colorcet as cc

%matplotlib inline

## Load abc atlas data

In [3]:
# ABC atlas release used throughout this notebook
version = '20230830'

# Loader options, gathered in one place so they are easy to scan and tweak
load_options = dict(
    transform='raw',        # will manually norm+log2 later
    subset_to_TH_ZI=True,
    with_metadata=True,
    flip_y=True,
    round_z=True,
    # NOTE(review): the previous comment on this flag read "need colors to come
    # with", which contradicts the value False; the taxonomy palettes are
    # fetched separately via abc.get_taxonomy_palette further down -- confirm.
    with_colors=False,
)
adata_th_zi = abc.load_adata_thalamus(version=version, **load_options)

In [6]:
# Display the summary repr of the freshly loaded AnnData object
adata_th_zi

In [24]:
# Sanity check: the CCF coordinate columns must be present in obs, since they
# are copied into obsm['ccf_spatial'] later in the notebook.
required_ccf_cols = {'x_ccf', 'y_ccf', 'z_ccf'}
assert_message = 'cell_by_gene obs columns must match format output by sis.spot_table.cell_by_gene_anndata()'
assert required_ccf_cols.issubset(adata_th_zi.obs.columns), assert_message

In [9]:
# Preview the first three rows of the per-cell metadata
adata_th_zi.obs.head(3)

In [12]:
# Display the per-gene (var) table
adata_th_zi.var

In [7]:
# Display the obs index (the cell identifiers used as row labels)
adata_th_zi.obs.index

In [4]:
# Spatial-filter settings (`realigned` is also read by the column-selection cell below)
filter_buffer = 0  # 5  (add a buffer here if wanted)
realigned = False

# Class-level filter, with the non-neuronal and midbrain filters switched on
adata_th_zi_neurons = abc.filter_by_class_thalamus(
    adata_th_zi,
    filter_nonneuronal=True,
    filter_midbrain=True,
)

# Filter to thalamus boundaries: the helper receives a copy of obs, and the
# index of the frame it returns is used to subset the AnnData object.
obs_filtered = abc.filter_by_thalamus_coords(
    adata_th_zi_neurons.obs.copy(),
    realigned=realigned,
    buffer=filter_buffer,
)
adata_th_zi_neurons = adata_th_zi_neurons[obs_filtered.index].copy()

In [5]:
# Keep every gene whose name does not contain 'Blank'
gene_list = list(filter(lambda name: 'Blank' not in name, adata_th_zi_neurons.var_names))
adata_th_zi_neurons = adata_th_zi_neurons[:, gene_list]

In [6]:
# adata_th_zi_neurons_left = adata_th_zi_neurons[adata_th_zi_neurons.obs['left_hemisphere']]

In [7]:
# Working copy used for the rest of the notebook, so the filtered object
# `adata_th_zi_neurons` itself is not modified by later cells.
# (The left-hemisphere-only variant below is currently disabled.)
# adata = adata_th_zi_neurons_left.copy()
adata = adata_th_zi_neurons.copy()
adata

In [8]:
# Choose which obs columns supply the parcellation label and the coordinates,
# depending on whether the realigned data are being used.
ccf_label, coords = (
    ('parcellation_substructure_realigned', 'section') if realigned
    else ('parcellation_substructure', 'reconstructed')
)

x_col = f'x_{coords}'
y_col = f'y_{coords}'
section_col = f'z_{coords}'

# unique section positions, in ascending order
sections_all = sorted(adata.obs[section_col].unique())

In [9]:
# Build 'x_cirro'/'y_cirro': the in-plane coordinates of every section, offset
# so that the sections tile a grid and can all be viewed side by side.

# start from the original in-plane coordinates (re-copied on every run, so
# re-executing this cell does not accumulate shifts)
adata.obs['x_cirro'] = adata.obs[x_col].copy()
adata.obs['y_cirro'] = adata.obs[y_col].copy()

# grid layout parameters
x_shift = 7    # offset added to x per grid column
y_shift = -5   # offset added to y per grid row (negative: each row decreases y)
n_cols = 4     # sections per row

# Sections are placed in descending order of `section_col`.
# divmod gives the grid position for any number of sections; the previous
# version only started a new row at i == n_cols, 2*n_cols and 3*n_cols, so a
# 5th row (more than 16 sections) was never wrapped.
for i, sec in enumerate(reversed(sections_all)):
    grid_row, grid_col = divmod(i, n_cols)
    mask = (adata.obs[section_col] == sec)
    adata.obs.loc[mask, 'x_cirro'] += x_shift * grid_col
    adata.obs.loc[mask, 'y_cirro'] += y_shift * grid_row

In [10]:
# Store the coordinate sets in obsm as plain numpy arrays, one entry per
# coordinate system (obsm key -> obs columns it is built from).
obsm_coord_columns = {
    'cirro_spatial': ['x_cirro', 'y_cirro'],
    'ccf_spatial': ['x_ccf', 'y_ccf', 'z_ccf'],
}
for obsm_key, coord_cols in obsm_coord_columns.items():
    adata.obsm[obsm_key] = adata.obs[coord_cols].to_numpy()

In [11]:
# Quick visual check of the tiled section layout (tiny markers, equal aspect)
fig, ax = plt.subplots()
ax.scatter(adata.obs['x_cirro'], adata.obs['y_cirro'], s=0.001)
ax.axis('equal')
plt.show()

## Make color lists for .uns

.uns\["cluster_colors"\] should be a list of colors as hex strings (e.g. #D1C9BA), in the same order as .obs.cluster.cat.categories

The same applies to .uns\["subclass_colors"\], which must match the order of the categories of .obs.subclass

In [12]:
# Put the categories of each taxonomy level into sorted order. It is unclear
# whether the order matters downstream, but a sorted order makes it easier to
# check that the color lists built afterwards line up with the categories.
for level in ['class', 'subclass', 'supertype', 'cluster']:
    sorted_categories = sorted(adata.obs[level].cat.categories)
    adata.obs[level] = adata.obs[level].cat.reorder_categories(sorted_categories,
                                                               ordered=False)

In [13]:
# For each taxonomy level, store a color list in .uns whose order follows
# .cat.categories of the matching obs column, so colors and categories line up.
taxonomy_levels = ['class', 'subclass', 'supertype', 'cluster']
palettes = {level: abc.get_taxonomy_palette(level) for level in taxonomy_levels}

for level in taxonomy_levels:
    level_palette = palettes[level]
    adata.uns[f'{level}_colors'] = [level_palette[category]
                                    for category in adata.obs[level].cat.categories]

In [14]:
# Inspect .uns to confirm the *_colors entries were stored
adata.uns

## Generate UMAP, tSNE, etc.

### on raw

In [15]:
# PCA pre-processing
# Runs on adata.X as loaded (transform='raw'); the PCA plot is a quick visual check.
sc.pp.pca(adata)
sc.pl.pca(adata)

In [16]:
# more pre-processing
# Neighborhood graph with scanpy defaults; run before sc.tl.umap below
sc.pp.neighbors(adata)

In [17]:
# UMAP
# Compute the embedding with scanpy defaults, then plot it as a quick check
sc.tl.umap(adata)
sc.pl.umap(adata)

In [18]:
# tSNE - takes much longer than UMAP to run
# Compute the embedding with scanpy defaults, then plot it as a quick check
sc.tl.tsne(adata)
sc.pl.tsne(adata)

In [19]:
# Inspect which embeddings / coordinate arrays are now stored in obsm
adata.obsm

### on log2

In [20]:
# log transform gene expr for dimensionality reduction:
# scale each cell (row) to counts per million, then take log2(1 + CPM).
adata_log2 = adata.copy()

counts_dense = adata.X.toarray()                        # densify once
counts_per_cell = np.sum(counts_dense, axis=1, keepdims=True)
# NOTE(review): a cell with zero total counts would give NaN here -- presumably
# none survive upstream filtering; confirm.
cpm = counts_dense * 1e6 / counts_per_cell
adata_log2.X = np.asarray(np.log2(1 + cpm))

In [21]:
# PCA pre-processing
# Same steps as for the raw object, now on the log2-transformed copy
sc.pp.pca(adata_log2)
sc.pl.pca(adata_log2)

# more pre-processing
# Neighborhood graph with scanpy defaults; run before sc.tl.umap below
sc.pp.neighbors(adata_log2)

In [22]:
# UMAP
# Compute on the log2-transformed object, then plot as a quick check
sc.tl.umap(adata_log2)
sc.pl.umap(adata_log2)

In [23]:
# tSNE - takes much longer than UMAP to run
# Compute on the log2-transformed object, then plot as a quick check
sc.tl.tsne(adata_log2)
sc.pl.tsne(adata_log2)

In [24]:
# Inspect which embeddings / coordinate arrays are stored on the log2 object
adata_log2.obsm

### copy raw results over to adata_log2

In [38]:
# Carry the embeddings computed on the raw object over to the log2 object,
# stored under a '_raw' suffix so they sit next to the log2-based embeddings.
for embedding_key in ('X_umap', 'X_tsne'):
    adata_log2.obsm[f'{embedding_key}_raw'] = adata.obsm[embedding_key]

## Load & save SpaGCN domains results

In [26]:
# Load the SpaGCN domain results.
# Temporarily a static file in '/code/resources' until there is a reproducible
# run setup for the spagcn capsule.
spagcn_domains_df_all = pd.read_parquet('/code/resources/spagcn_predicated_domains.parquet')

# The domain labels are imported as int64; convert them to string categories
# and give the column the name it will carry in obs.
spagcn_domains_df = (
    spagcn_domains_df_all[['res1pt4']]
    .astype('str')
    .astype('category')
    .rename(columns={'res1pt4': 'SpaGCN_domains'})
)

# Attach the domains to obs by cell label. Any 'SpaGCN_domains' column left by
# a previous run of this cell is dropped first, so the cell can be re-executed
# without the join failing on an overlapping column name.
adata_log2.obs = (
    adata_log2.obs
    .drop(columns='SpaGCN_domains', errors='ignore')
    .join(spagcn_domains_df, on='cell_label')
)

# Cells without a SpaGCN result get an explicit 'no data' category, which
# add_categories appends as the LAST category (the palette cell relies on this).
adata_log2.obs['SpaGCN_domains'] = (
    adata_log2.obs['SpaGCN_domains']
    .cat.add_categories('no data')
    .fillna('no data')
)

adata_log2.obs['SpaGCN_domains']

In [27]:
# One glasbey color per SpaGCN domain category
spg_domain_cats = adata_log2.obs['SpaGCN_domains'].cat.categories
print(f'{spg_domain_cats=}')

n_domain_colors = len(spg_domain_cats)
spg_palette_sns = sns.color_palette(cc.glasbey, n_colors=n_domain_colors)

# set the 'no data' category color to white so it doesn't show up in cirro
# (this overwrites the LAST color, i.e. it assumes 'no data' is the last category)
white_rgb = (1.0, 1.0, 1.0)
spg_palette_sns[-1] = white_rgb

# need RGB dict for sns plotting to check colors
palette_dict_sns = {category: rgb for category, rgb in zip(spg_domain_cats, spg_palette_sns)}

# need hex strings for cirro
spg_palette_cirro = [hex_color for hex_color in spg_palette_sns.as_hex()]
print(spg_palette_cirro)

In [28]:
# Visual check of the SpaGCN domain colors on the tiled cirro layout
fig, ax = plt.subplots(figsize=(20, 15))
sns.scatterplot(
    data=adata_log2.obs,
    x='x_cirro',
    y='y_cirro',
    hue='SpaGCN_domains',
    palette=palette_dict_sns,
    s=10,
    linewidth=0,
    legend=False,
    ax=ax,
)
ax.axis('equal')

In [29]:
# Store the palette under '<obs column>_colors' (plural), the same naming used
# for the class/subclass/supertype/cluster palettes in this notebook. The
# previous singular key did not follow that convention, so the palette would
# not be associated with obs['SpaGCN_domains'].
adata_log2.uns['SpaGCN_domains_colors'] = spg_palette_cirro

# Keep the original singular key as well: a later cell reads
# uns['SpaGCN_domains_color'] when building the log2CPV object.
adata_log2.uns['SpaGCN_domains_color'] = spg_palette_cirro

## Load & save NSF results

In [30]:
# Read the AnnData object holding the NSF results from its zarr store
# (only its obs columns are used below)
adata_nsf = ad.read_zarr("/root/capsule/data/nsf_2000_adata/nsf_2000_adata.zarr")

In [31]:
# NSF obs columns to carry over: the total plus the 30 columns nsf0 ... nsf29
n_nsf_columns = 30
nsf_cols = ['nsf_tot'] + [f'nsf{idx}' for idx in range(n_nsf_columns)]

nsf_df = adata_nsf.obs[nsf_cols].copy()
nsf_df

In [32]:
# Attach the NSF columns to obs by cell label. NSF columns left by a previous
# run of this cell are dropped first, so the cell can be re-executed without
# the join failing on overlapping column names.
adata_log2.obs = (
    adata_log2.obs
    .drop(columns=list(nsf_df.columns), errors='ignore')
    .join(nsf_df, on='cell_label')
)
adata_log2.obs.head(3)

In [33]:
# Cells with no match in nsf_df have NaN after the join; set those to 0
adata_log2.obs[nsf_cols] = adata_log2.obs[nsf_cols].fillna(0)

In [34]:
# Preview obs after the NSF columns were added and filled
adata_log2.obs.head(5)

## Clean up obs

In [35]:
# obs columns that are not wanted in the saved file. The cirro and CCF
# coordinates were already copied into obsm ('cirro_spatial', 'ccf_spatial').
cols_to_remove = ['parcellation_division', 'parcellation_index', 
                  # 'parcellation_structure','parcellation_substructure', 
                  'x_ccf', 'y_ccf', 'z_ccf', 'x_cirro', 'y_cirro']

# Reassign instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
adata_log2.obs = adata_log2.obs.drop(columns=cols_to_remove, errors='ignore')

## Save as h5ad

In [36]:
# Final check of the object's contents before writing it to disk
adata_log2

In [39]:
# Save the log2(CPM+1) object (with NSF + SpaGCN annotations) as gzip-compressed h5ad
adata_log2.write('/results/wmb_abc_atlas_v20230830_th_nsf_spagcn_for_cirro_log2CPM.h5ad', compression="gzip")

In [40]:
# NOTE(review): `adata` never received the SpaGCN / NSF obs columns (those were
# joined onto adata_log2 only), and this same path is written again further
# down by adata_raw.write(...), which overwrites this file -- this write looks
# redundant; confirm before removing.
adata.write('/results/wmb_abc_atlas_v20230830_th_nsf_spagcn_for_cirro_raw.h5ad', compression="gzip")

In [41]:
# Start the "raw" export from a copy of adata_log2, so it inherits that
# object's obs, obsm and uns; its X is replaced in the next cell.
adata_raw = adata_log2.copy()

In [42]:
# Swap in the untransformed X from `adata` (adata_log2 was created as a copy of
# `adata`, so the two objects share cell and gene order).
adata_raw.X = adata.X.copy()

In [43]:
# Save the raw-X object with the full annotations (same path as the earlier
# adata.write(...) call, so this replaces that file)
adata_raw.write('/results/wmb_abc_atlas_v20230830_th_nsf_spagcn_for_cirro_raw.h5ad', compression="gzip")

In [46]:
# Reload the same dataset, this time asking the loader for its own 'log2'
# transform (rather than the manual log2 CPM computed earlier), then apply the
# same class / spatial / Blank-gene filtering as for the raw object.
# NOTE(review): with_metadata=False here while the filters below are the ones
# used on the metadata-bearing object above -- presumably the loader still
# provides the obs columns they need; confirm.
adata_MKlog2X = abc.load_adata_thalamus(
    version=version,
    transform='log2',
    subset_to_TH_ZI=True,
    with_metadata=False,
    flip_y=True,
    round_z=True,
    with_colors=False,
)

# class-level filter, with the non-neuronal and midbrain filters switched on
adata_MKlog2X_th_zi_neurons = abc.filter_by_class_thalamus(
    adata_MKlog2X,
    filter_nonneuronal=True,
    filter_midbrain=True,
)

# filter to thalamus boundaries (add a buffer here if wanted)
filter_buffer = 0  # 5
realigned = False
obs_filtered_MKlog2X = abc.filter_by_thalamus_coords(
    adata_MKlog2X_th_zi_neurons.obs.copy(),
    realigned=realigned,
    buffer=filter_buffer,
)
adata_MKlog2X_th_zi_neurons = adata_MKlog2X_th_zi_neurons[obs_filtered_MKlog2X.index].copy()

# keep every gene whose name does not contain 'Blank'
gene_list = list(filter(lambda name: 'Blank' not in name,
                        adata_MKlog2X_th_zi_neurons.var_names))
adata_MKlog2X_th_zi_neurons = adata_MKlog2X_th_zi_neurons[:, gene_list]

adata_MKlog2X_th_zi_neurons

In [47]:
# Build the log2CPV export: annotations (obs/obsm/uns) from adata_log2, with X
# taken from the loader-transformed object.
adata_log2CPV = adata_log2.copy()

# Select rows/columns by name rather than relying on position: both objects
# went through the same filters, but a plain `.X` assignment only checks the
# shape, so a differing cell or gene order would silently mislabel the values.
# A cell or gene missing from the reloaded object raises a KeyError here.
adata_log2CPV.X = adata_MKlog2X_th_zi_neurons[
    adata_log2CPV.obs_names, adata_log2CPV.var_names
].X.copy()

In [49]:
# Use the shorter column name 'spagcn' in this export
spagcn_column_renames = {'SpaGCN_domains': 'spagcn'}
adata_log2CPV.obs = adata_log2CPV.obs.rename(columns=spagcn_column_renames)

In [51]:
# The obs column is now called 'spagcn', so expose its palette under the
# matching '<column>_colors' key (the original entry is left in place).
spagcn_palette = adata_log2CPV.uns['SpaGCN_domains_color']
adata_log2CPV.uns['spagcn_colors'] = spagcn_palette

In [52]:
# Save the loader-transformed (log2) object as gzip-compressed h5ad
adata_log2CPV.write('/results/wmb_abc_atlas_v20230830_th_nsf_spagcn_for_cirro_log2CPV.h5ad', compression="gzip")