In [1]:
import anndata as ad
import pandas as pd
import shapely
import shapely.plotting as splot
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

sns.set_style('white')

In [2]:
import warnings
# Suppress UserWarning and FutureWarning messages for every cell that runs after this one.
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

In [3]:
# Load the annotated AnnData object from the h5ad file (path is relative to the notebook directory).
adata = ad.read_h5ad("../data/merfish_638850_AIT17.custom_CCF_annotated_TH_ZI_only_2023-05-04_00-00-00/atlas_brain_638850_AIT17_custom_CCF_annotated_TH_ZI_only.h5ad")
# Bare expression: display the AnnData summary as the cell output.
adata

In [4]:
# nice to have everything for plotting in the obs dataframe:
# copy the two columns of obsm['spatial_cirro'] into obs as 'cirro_x' / 'cirro_y'
# (these column names are what the polygon and plotting helpers below read).
adata.obs['cirro_x'] = adata.obsm['spatial_cirro'][:,0]
adata.obs['cirro_y'] = adata.obsm['spatial_cirro'][:,1]

# define and view CCF polygons

## calc polygons

In [111]:
import hdbscan

from collections import defaultdict
from shapely.ops import unary_union
from sklearn.cluster import OPTICS, DBSCAN, k_means

def poly_from_points(X, min_points=0, allow_holes=False):
    """Build a concave-hull polygon around a set of 2D points.

    Parameters
    ----------
    X : array with one row per point (row count is read from ``X.shape[0]``).
    min_points : minimum number of rows required; fewer rows returns None.
    allow_holes : forwarded to ``shapely.concave_hull``.

    Returns
    -------
    shapely.Polygon, or None when there are too few points or the hull is
    not exactly a Polygon (e.g. a point or line for degenerate input).
    """
    n_rows = X.shape[0]
    if n_rows < min_points:
        return None
    cloud = shapely.multipoints(X)
    hull = shapely.concave_hull(cloud, ratio=0.3, allow_holes=allow_holes)
    # exact type check: anything other than a plain Polygon is rejected
    return hull if type(hull) is shapely.Polygon else None

In [105]:
# CCF acronyms used as the thalamus / ZI subset throughout the notebook:
# get_ccf_polygons uses midline splitting (not clustering) for these names,
# and ccf_polygons_th_zi is built by indexing ccf_polygons with this list.
ccf_th_names = ['AD', 'AMd', 'AMv', 'AV', 'CL', 'CM', 'Eth', 'FF', 'IAD', 'IAM', 'IGL', 'IMD', 'IntG', 'LD', 'LGd-co', 'LGd-ip', 'LGd-sh', 
                'LGv', 'LH', 'LP', 'MD', 'MGd', 'MGm', 'MGv', 'MH', 'PCN', 'PF', 'PIL', 'PO', 'POL', 'PP', 'PR', 'PT', 'PVT', 'PoT', 
                'RE', 'RH', 'RT', 'SGN', 'SMT', 'SPA', 'SPFp', 'SubG', 'VAL', 'VM', 'VPL', 'VPLpc', 'VPM', 'VPMpc', 'Xi', 'ZI']

In [112]:
def split_cells_clustering(X):
    """Split points into spatial groups using HDBSCAN (min_samples=10).

    Returns a list with one array of rows from ``X`` per cluster label;
    points labelled -1 (HDBSCAN noise) are dropped.
    """
    labels = hdbscan.HDBSCAN(min_samples=10).fit_predict(X)
    groups = []
    for label in set(labels):
        if label == -1:
            continue
        groups.append(X[labels == label, :])
    return groups

def split_cells_midline(X, midline_gap=100):
    """Split points into left/right groups when a clear gap exists at the midline.

    A 2-cluster k-means is fit and the midline is taken as the mean x of the
    two centroids. If every point lies further than ``midline_gap`` from that
    x, the points are returned as ``[left, right]``; otherwise as ``[X]``.
    """
    centers = k_means(X, 2, n_init=5)[0]
    mid_x = np.mean(centers, 0)[0]
    xs = X[:, 0]
    gap_is_clear = np.min(np.abs(xs - mid_x)) > midline_gap
    if not gap_is_clear:
        return [X]
    return [X[xs < mid_x], X[xs > mid_x]]

def get_polygon_from_obs(df, min_points=50, split_clusters=True, midline_gap=100):
    """Build one (possibly multi-part) polygon from the cells in ``df``.

    Parameters
    ----------
    df : DataFrame with 'cirro_x' and 'cirro_y' columns.
    min_points : polygons are only built when there are MORE than this many rows.
    split_clusters : if True, split the points with ``split_cells_clustering``
        (holes not allowed in the hulls); if False, split with
        ``split_cells_midline`` (holes allowed).
    midline_gap : forwarded to ``split_cells_midline`` when ``split_clusters``
        is False.

    Returns
    -------
    The union of the per-group hulls, or None when there are too few points
    or no group produced a usable polygon.
    """
    X = df[['cirro_x','cirro_y']].values
    if X.shape[0] <= min_points:
        return None
    if split_clusters:
        groups = split_cells_clustering(X)
    else:
        groups = split_cells_midline(X, midline_gap=midline_gap)
    # poly_from_points returns None for degenerate groups; drop those so the
    # union never silently collapses to an empty geometry.
    polys = [poly_from_points(x, allow_holes=not split_clusters) for x in groups]
    polys = [p for p in polys if p is not None]
    if not polys:
        return None
    merged = unary_union(polys)
    return None if merged.is_empty else merged


def get_ccf_polygons(data, min_points=50, midline_gap=100):
    """Compute polygons for every (CCF_acronym, section) group in ``data``.

    Names listed in the notebook-level ``ccf_th_names`` are split at the
    midline; every other name is split by spatial clustering.

    Returns
    -------
    defaultdict(dict) mapping ``name -> {section: polygon}``; groups for
    which no polygon could be built are omitted.
    """
    ccf_polygons = defaultdict(dict)
    # observed=True avoids iterating empty category combinations (they would
    # be skipped anyway because they have no rows).
    for (name, section), df in data.groupby(['CCF_acronym', 'section'], observed=True):
        poly = get_polygon_from_obs(df, min_points, split_clusters=name not in ccf_th_names,
                                    midline_gap=midline_gap)
        if poly is not None:
            ccf_polygons[name][section] = poly
    return ccf_polygons

In [113]:
# Compute polygons for every (CCF_acronym, section) group in adata.obs, using the default thresholds.
ccf_polygons = get_ccf_polygons(adata.obs)

In [114]:
# Restrict the polygons to the names in ccf_th_names.
# ccf_polygons is a defaultdict, so a name with no polygons yields an empty dict rather than a KeyError.
ccf_polygons_th_zi = {x: ccf_polygons[x] for x in ccf_th_names}

## view all slices with polygons and labeled points

In [132]:
def plot_shape(poly, **kwargs):
    """Draw a shapely geometry with a black outline via ``splot.plot_polygon``.

    Parameters
    ----------
    poly : shapely geometry; a GeometryCollection is drawn member by member.
    **kwargs : forwarded to ``splot.plot_polygon`` (e.g. color, alpha, ax, label).

    Returns
    -------
    The value returned by the last ``plot_polygon`` call, or None when the
    geometry is empty and nothing was drawn.
    """
    edgecolor='black'
    # An empty geometry (e.g. an empty GeometryCollection from a union of
    # nothing) has nothing to draw; previously this left `patch` unbound.
    if poly.is_empty:
        return None
    patch = None
    if type(poly) is shapely.GeometryCollection:
        for subpoly in poly.geoms:
            if subpoly.is_empty:
                continue
            patch = splot.plot_polygon(subpoly, add_points=False, edgecolor=edgecolor, **kwargs)
    else:
        patch = splot.plot_polygon(poly, add_points=False, edgecolor=edgecolor, **kwargs)
    return patch

import colorcet as cc
def plot_ccf_section(ccf_polygons, section, highlight=[], palette=None, labels=True, bg_shapes=True, ax=None):
    """Draw the CCF polygons of one section.

    Parameters
    ----------
    ccf_polygons : mapping ``name -> {section: polygon}``.
    section : key of the section to draw.
    highlight : names drawn at alpha 0.4, or the string 'all' for every name.
    palette : None (glasbey colors per name), 'bw' (uniform grey '#BBBBBB'),
        or a mapping ``name -> color``.
    labels : if True, each shape is labelled with its name.
    bg_shapes : if True, non-highlighted names are drawn at alpha 0.1.
    ax : axes forwarded to ``plot_shape``.

    Returns
    -------
    list of the values returned by ``plot_shape``, background shapes first.
    """
    ccf_names = ccf_polygons.keys()
    if palette is None:
        colors = sns.color_palette(cc.glasbey, n_colors=len(ccf_names))
        palette = dict(zip(ccf_names, colors))
    elif palette=='bw':
        palette = dict.fromkeys(ccf_names, '#BBBBBB')
    if highlight=='all':
        highlight = ccf_names

    def draw(name, alpha):
        # one shape for `name` in this section, labelled only when requested
        return plot_shape(ccf_polygons[name][section], color=palette[name], ax=ax,
                          alpha=alpha, label=name if labels else None)

    patches = []
    if bg_shapes:
        patches += [draw(name, 0.1) for name in ccf_names
                    if section in ccf_polygons[name] and name not in highlight]
    patches += [draw(name, 0.4) for name in highlight
                if section in ccf_polygons[name]]
    return patches

def plot_ccf_overlay(obs, ccf_polygons, sections=None, point_hue='CCF_acronym', legend='cells', min_group_count=10, highlight=[], 
                     outlines_only=False, bg_cells=None, bg_shapes=True, axes=False):
    """Show one figure per section with CCF polygons and cells overlaid.

    Parameters
    ----------
    obs : DataFrame with 'section', 'cirro_x', 'cirro_y' and ``point_hue`` columns.
    ccf_polygons : mapping ``name -> {section: polygon}``.
    sections : sections to plot; None plots every section present in ``obs``.
        When given, polygons are restricted to names present in at least one
        of these sections.
    point_hue : obs column used to color the cells.
    legend : 'cells', 'ccf', 'both', or a falsy value for no legend.
    min_group_count : hue groups with at most this many cells overall are
        dropped; sections with fewer cells than this are skipped.
    highlight : names to highlight (forwarded to ``plot_ccf_section``).
    outlines_only : if True, shapes are drawn in uniform grey ('bw' palette).
    bg_cells : optional DataFrame of cells drawn in grey behind the main points.
    bg_shapes : forwarded to ``plot_ccf_section``.
    axes : if False, ticks and spines are removed.
    """
    obs = obs.copy()
    if sections is None:
        sections = obs['section'].unique()
    else:
        # explicit set test works the same for lists, arrays and categoricals
        wanted_sections = set(sections)
        ccf_polygons = {name: polys for name, polys in ccf_polygons.items()
                        if not wanted_sections.isdisjoint(polys.keys())}
    ccf_names = ccf_polygons.keys()
    ccf_palette = dict(zip(ccf_names, sns.color_palette(cc.glasbey, n_colors=len(ccf_names))))
    shape_palette = 'bw' if outlines_only else ccf_palette

    # string rep allows adding 'other'
    obs[point_hue] = obs[point_hue].astype(str)
    point_group_names = obs[point_hue].value_counts().loc[lambda x: x>min_group_count].index
    obs = obs.loc[lambda df: df[point_hue].isin(point_group_names)]

    if point_hue == 'CCF_acronym':
        # Copy the dict palette, never shape_palette: with outlines_only=True
        # that is the string 'bw', which has no .copy().
        point_palette = dict(ccf_palette)
        extra_names = point_group_names.difference(ccf_names)
        extra_palette = dict(zip(extra_names, sns.color_palette(cc.glasbey, n_colors=len(point_group_names))[-len(extra_names):]))
        point_palette.update(extra_palette)
    else:
        point_palette = dict(zip(point_group_names, sns.color_palette(cc.glasbey, n_colors=len(point_group_names))))
    point_palette.update(other='grey')

    for section in sections:
        secdata = obs.loc[lambda df: (df['section']==section)].copy()
        if len(secdata) < min_group_count:
            continue
        print(section)
        fig, ax = plt.subplots(figsize=(8,4))

        plot_ccf_section(ccf_polygons, section, highlight=highlight, palette=shape_palette, bg_shapes=bg_shapes,
                         labels=legend in ['ccf', 'both'], ax=ax)

        if bg_cells is not None:
            sns.scatterplot(data=bg_cells.loc[lambda df: (df['section']==section)], x='cirro_x', y='cirro_y',
                            c='grey', s=2, alpha=0.5, ax=ax)
        # lump small groups if legend list is too long
        sec_group_counts = secdata[point_hue].value_counts(ascending=True)
        if len(sec_group_counts) > 10:
            point_groups_section = sec_group_counts.loc[lambda x: x>min_group_count].index
            secdata.loc[lambda df: ~df[point_hue].isin(point_groups_section), point_hue] = 'other'
        secdata[point_hue] = pd.Categorical(secdata[point_hue])

        sns.scatterplot(data=secdata, x='cirro_x', y='cirro_y', hue=point_hue, s=2, palette=point_palette,
                        legend=legend in ['cells', 'both'], ax=ax)
        if legend:
            ax.legend(ncols=2, loc='upper center', bbox_to_anchor=(0.5, 0))
        ax.axis('image')
        if not axes:
            sns.despine(ax=ax, left=True, bottom=True)
            ax.set_xticks([])
            ax.set_yticks([])
        ax.set_xlabel('')
        ax.set_ylabel('')
        plt.show()
            

In [115]:
# Plot every section with all CCF polygons and cells colored by CCF_acronym (the function defaults).
plot_ccf_overlay(adata.obs, ccf_polygons)

### define thalamus subset (requires atlas download)

This section can be skipped now: its result (the list of thalamus / ZI structure names) is saved above as `ccf_th_names`.

In [11]:
# import urllib.request, json 
# with urllib.request.urlopen("http://api.brain-map.org/api/v2/structure_graph_download/1.json") as url:
#     atlas = json.load(url)['msg']

In [12]:
from bg_atlasapi.bg_atlas import BrainGlobeAtlas
# Load the 'allen_mouse_100um' atlas; brainglobe_dir is a hard-coded absolute path on the original machine.
atlas = BrainGlobeAtlas("allen_mouse_100um", brainglobe_dir='/scratch/brainglobe/')

In [13]:
# Keep structures whose atlas 'structure_id_path' contains id 549 (but does not end in 549, i.e. not 549 itself),
# or whose path contains id 797. Overwrites the ccf_polygons_th_zi built from ccf_th_names.
ccf_polygons_th_zi = {x: ccf_polygons[x] for x in ccf_polygons.keys() 
                      #549: thalamus, 797: ZI
                      if (549 in atlas.structures[x]['structure_id_path'] 
                          and not 549==atlas.structures[x]['structure_id_path'][-1])
                      or 797 in atlas.structures[x]['structure_id_path']
                     }

In [14]:
# Number of structures in the thalamus / ZI subset.
len(ccf_polygons_th_zi)

## view thalamus subset with subclass labels

In [116]:
# may need to tweak clustering params on fine 
# Passes an empty obs selection (loc[[], :]) so only the polygons and the grey background cells are drawn.
# NOTE: adata_neuronal is defined in a later cell (the division_id_label subset); run that cell first.
plot_ccf_overlay(adata.obs.loc[[],:], ccf_polygons_th_zi, sections=adata.obs['section'].unique(), highlight='all', 
                 bg_cells=adata_neuronal.obs, bg_shapes=True, min_group_count=0, legend=False)

In [118]:
# Neuronal cells colored by 'subclass_id_label' over grey outlines of the thalamus / ZI polygons, no legend.
# NOTE: adata_neuronal is defined in a later cell; run that cell first.
plot_ccf_overlay(adata_neuronal.obs, ccf_polygons_th_zi, highlight=ccf_th_names, point_hue='subclass_id_label', legend=False, outlines_only=True)

# Investigate prong 1 cluster-nucleus groups

In [20]:
# Table indexed by its first column; get_obs_from_annotated_clusters looks up rows by nucleus name
# and reads the "annotated clusters" column.
nuclei_df = pd.read_csv("../resources/Prong 1 Vitessce links by nucleus.csv", index_col=0)

In [82]:
def get_obs_from_annotated_clusters(name, adata, nuclei_df):
    """Select the cells whose cluster is annotated for a given nucleus.

    The "annotated clusters" entry of row ``name`` in ``nuclei_df`` is split
    on ', ' and matched against the first four characters of
    ``adata.obs['cluster_label']``.

    Returns the matching rows of ``adata.obs``.
    """
    annotated = nuclei_df.loc[name, "annotated clusters"]
    cluster_ids = annotated.split(', ')
    label_prefix = adata.obs['cluster_label'].str[:4]
    return adata.obs.loc[label_prefix.isin(cluster_ids)]

def plot_nucleus_cluster_comparison_slices(obs, ccf_polygons, nuclei, bg_cells=None, bg_shapes=True, legend='cells', **kwargs):
    """Overlay the given cells on the highlighted nuclei, section by section.

    Plots every section that either contains a polygon for one of ``nuclei``
    or has more than 10 cells in ``obs``, in sorted order. Cells are colored
    by 'cluster_label'. ``nuclei`` may be a single name or a list of names;
    extra keyword arguments are forwarded to ``plot_ccf_overlay``.
    """
    if type(nuclei) is str:
        nuclei = [nuclei]
    cells_per_section = obs['section'].value_counts()
    sections_points = cells_per_section[cells_per_section > 10].index
    sections_nuclei = set.union(*(set(ccf_polygons[x].keys()) for x in nuclei))
    sections = sorted(sections_nuclei.union(sections_points))
    plot_ccf_overlay(
        obs, ccf_polygons, sections,
        point_hue='cluster_label', legend=legend, highlight=nuclei,
        bg_cells=bg_cells, bg_shapes=bg_shapes, **kwargs,
    )

In [22]:
# Subset to cells whose division_id_label is one of the three listed neuronal divisions (copied, not a view).
# NOTE: earlier plotting cells already reference adata_neuronal, so this cell must run before them.
adata_neuronal = adata[adata.obs["division_id_label"].isin(["3 PAL-sAMY-TH-HY-MB-HB neuronal","2 Subpallium GABAergic","4 CBX-MOB-other neuronal"])].copy()
# cluster_id as string to simplify querying
adata_neuronal.obs['cluster_id'] = pd.Categorical(adata_neuronal.obs['cluster_id'].astype(int).astype(str))

## VM

In [23]:
# obs = get_obs_from_annotated_clusters('VM', adata, nuclei_df)
# Cells of the single cluster '1142 TH Prkcd Grin2c Glut_13', shown against the VM and VAL polygons
# with all neuronal cells in grey behind them.
obs = adata_neuronal.obs.query("cluster_label=='1142 TH Prkcd Grin2c Glut_13'")
plot_nucleus_cluster_comparison_slices(obs, ccf_polygons_th_zi, nuclei=['VM', 'VAL'], legend='both', bg_cells=adata_neuronal.obs, bg_shapes=False)

In [24]:
# Cluster composition of the supertype 'TH Prkcd Grin2c Glut_13': show clusters with more than 5 cells.
obs = adata_neuronal.obs.query("supertype_label=='TH Prkcd Grin2c Glut_13'")
obs['cluster_label'].value_counts().loc[lambda x: x>5]

In [92]:
# remove 1140? annotated as VPMpc
# Select cells whose cluster_label starts with one of these 4-character ids (from the full adata, not the neuronal subset).
clusters = ['1138', '1141', '1142']
obs = adata.obs.loc[lambda df: df['cluster_label'].str[:4].isin(clusters)]
plot_nucleus_cluster_comparison_slices(obs, ccf_polygons_th_zi, nuclei=['VM'], legend='cells', bg_cells=adata_neuronal.obs, bg_shapes=False, outlines_only=True)

### diff exp

In [26]:
import scanpy as sc

In [27]:
# Wilcoxon differential expression of cluster 1142 against cluster 1138, ranked by absolute score.
# cluster_id values are strings here because of the cast done when adata_neuronal was created.
sc.tl.rank_genes_groups(adata_neuronal, 'cluster_id', groups=['1142'], reference='1138', method='wilcoxon', rankby_abs=True)
sc.pl.rank_genes_groups(adata_neuronal, n_genes=20)

In [28]:
# Display the AnnData summary (now including the rank_genes_groups result stored by scanpy).
adata_neuronal

In [120]:
def plot_expression_ccf(adata_neuronal, section, gene, polygons, nuclei=[], bg_shapes=False, axes=False):
    """Scatter one gene's expression for a section over grey CCF outlines.

    Parameters
    ----------
    adata_neuronal : AnnData with 'section' in obs and 'spatial_cirro' in obsm.
    section : section identifier, compared as a string in the obs query.
    gene : variable name used to index the expression matrix.
    polygons : mapping ``name -> {section: polygon}`` to draw (previously this
        argument was ignored in favour of the global ``ccf_polygons_th_zi``).
    nuclei : names to highlight.
    bg_shapes : forwarded to ``plot_ccf_section``.
    axes : if False, axis ticks are removed.
    """
    subset = adata_neuronal[adata_neuronal.obs.query(f"section=='{section}'").index]
    fig, ax = plt.subplots(figsize=(8,4))
    plot_ccf_section(polygons, section, highlight=nuclei, bg_shapes=bg_shapes, ax=ax, palette='bw')

    x, y = subset.obsm['spatial_cirro'].T
    expr = subset[:,gene].X
    # accept both sparse and dense expression matrices
    c = (expr.toarray() if hasattr(expr, 'toarray') else np.asarray(expr)).squeeze()
    im = ax.scatter(x=x, y=y, c=c, s=2, cmap='magma')
    # tie the colorbar to this scatter explicitly rather than the implicit current image
    fig.colorbar(im, ax=ax, label="log2(CPM+1)")
    ax.axis('image')
    ax.set_title(gene)
    if not axes:
        # sns.despine(left=True, bottom=True)
        ax.set_xticks([])
        ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    plt.show()

In [133]:
# Calb1 expression on section 1199651048 with VM and VAL highlighted.
section='1199651048'

gene = "Calb1"
nuclei=['VM', 'VAL']
plot_expression_ccf(adata_neuronal, section, gene, ccf_polygons, nuclei=nuclei, bg_shapes=False)

In [134]:
# Necab1 expression on section 1199651048 with VM and VAL highlighted.
section='1199651048'

gene = "Necab1"
nuclei=['VM', 'VAL']
plot_expression_ccf(adata_neuronal, section, gene, ccf_polygons, nuclei=nuclei, bg_shapes=False)

In [31]:
# Wilcoxon test (tie-corrected) for supertype 'TH Prkcd Grin2c Glut_13'; no explicit reference group is given.
sc.tl.rank_genes_groups(adata_neuronal, 'supertype_label', groups=['TH Prkcd Grin2c Glut_13'], method='wilcoxon', tie_correct=True)
sc.pl.rank_genes_groups(adata_neuronal, n_genes=20)

In [32]:
# Wilcoxon test (tie-corrected) for cluster 1142; no explicit reference group is given.
sc.tl.rank_genes_groups(adata_neuronal, 'cluster_id', groups=['1142'], method='wilcoxon', tie_correct=True)
sc.pl.rank_genes_groups(adata_neuronal, n_genes=20)

In [135]:
# ISH diff ex search
# https://mouse.brain-map.org/gene/show/87708

# Galnt18 expression on section 1199651048 with VM and VAL highlighted.
section='1199651048'

gene = "Galnt18"
nuclei=['VM', 'VAL']
plot_expression_ccf(adata_neuronal, section, gene, ccf_polygons, nuclei=nuclei, bg_shapes=False)

In [136]:
# Same plot on section 1199651057; `gene` and `nuclei` keep whatever values the previously run cell set.
section='1199651057'
plot_expression_ccf(adata_neuronal, section, gene, ccf_polygons, nuclei=nuclei, bg_shapes=False)

In [137]:

# Stxbp6 expression with VM and VAL highlighted.
# NOTE: `section` is not set here; it keeps the value left by the previously run cell.
gene = "Stxbp6"
nuclei=['VM', 'VAL']
plot_expression_ccf(adata_neuronal, section, gene, ccf_polygons, nuclei=nuclei, bg_shapes=False)

In [138]:
# Spon1 expression on section 1199651048 with VM and VAL highlighted.
section='1199651048'

gene = "Spon1"
nuclei=['VM', 'VAL']
plot_expression_ccf(adata_neuronal, section, gene, ccf_polygons, nuclei=nuclei, bg_shapes=False)

In [139]:
# Same plot on section 1199651057; `gene` and `nuclei` keep whatever values the previously run cell set.
section='1199651057'
plot_expression_ccf(adata_neuronal, section, gene, ccf_polygons, nuclei=nuclei, bg_shapes=False)

### outlining VM / composition

In [39]:
import geopandas as gp
# One point geometry per cell in adata_neuronal (all sections), built from obsm['spatial_cirro'] and indexed by cell name.
neuron_points = gp.GeoSeries.from_xy(*adata_neuronal.obsm['spatial_cirro'].T, index=adata_neuronal.obs_names)

In [40]:
# Cells of cluster 1142 on section 1199651048, taken from the full adata
# (cluster_id is compared to the integer 1142 here, unlike the string ids in adata_neuronal).
adata_vm = adata[adata.obs.query("section=='1199651048' & cluster_id==1142").index].copy()
# df aliases adata_vm.obs, so the columns added below are stored on adata_vm itself.
df = adata_vm.obs
X = df[['cirro_x','cirro_y']].values
df['spatial_cluster'] = hdbscan.HDBSCAN(min_samples=10).fit_predict(X)
# The two most frequent spatial cluster labels are treated as the 'core'.
# NOTE(review): the HDBSCAN noise label -1 could be among the two most frequent — confirm.
core_clusters = df['spatial_cluster'].value_counts().index[:2]
df['spatial_group'] = df['spatial_cluster'].apply(lambda x: 'core' if x in core_clusters else 'other')

In [41]:
# Number of cells in the 'core' vs 'other' spatial groups.
df['spatial_group'].value_counts()

In [42]:
# Scatter of the selected cells colored by their HDBSCAN spatial cluster label.
sns.scatterplot(data=df,  x='cirro_x', y='cirro_y', hue='spatial_cluster', s=4,  legend=True, palette='tab10')

In [43]:
# Outline the 'core' cells with a polygon, then flag every neuronal cell whose point intersects it.
# NOTE(review): neuron_points covers all cells in adata_neuronal, not only section 1199651048 — confirm this is intended.
nucleus = get_polygon_from_obs(df.query("spatial_group=='core'"))
subset = neuron_points.geometry.intersects(nucleus)
adata_neuronal.obs['in_nucleus'] = subset.values
adata_nucleus = adata_neuronal[subset]

In [44]:
# sc.pl.embedding(adata_nucleus, "spatial_cirro", color="supertype_label")
# Cells inside the outlined polygon, plotted in 'spatial_cirro' coordinates and colored by cluster_label.
sc.pl.embedding(adata_nucleus, "spatial_cirro", color="cluster_label", s=20)

In [45]:
# For supertype 'TH Prkcd Grin2c Glut_13' on section 1199651048: per cluster_label, count cells inside vs outside the polygon.
adata_neuronal.obs.query("section=='1199651048' & supertype_label=='TH Prkcd Grin2c Glut_13'").groupby(['cluster_label'])['in_nucleus'].apply(lambda x: x.value_counts())

In [46]:
# need to include VAL types ??

In [47]:
# Wilcoxon differential expression between the 'core' and 'other' spatial groups of adata_vm, ranked by absolute score.
sc.tl.rank_genes_groups(adata_vm, 'spatial_group', groups=['core'], reference='other', method='wilcoxon', rankby_abs=True)
sc.pl.rank_genes_groups(adata_vm, groups=['core'], n_genes=20)

## anterior

In [76]:
# Anterior nuclei on section 1199651060: an empty obs selection is passed, so only the polygons
# and the grey background cells are drawn.
nuclei=['AD', 'AV', 'AMd', 'AMv']
section = '1199651060'
# plot_ccf_section(ccf_polygons_th_zi, section, highlight=nuclei, )
plot_ccf_overlay(adata.obs.loc[[],:], ccf_polygons_th_zi, sections=[section], highlight=nuclei, legend='both', bg_cells=adata_neuronal.obs, bg_shapes=True, min_group_count=0)

In [87]:
# Cluster ids 1169-1171 and 1095-1098 as strings, matched against the first four characters of cluster_label.
# `nuclei` keeps the value set by the previously run cell.
clusters = [str(x) for x in list(range(1169, 1171+1)) + list(range(1095, 1098+1))]
obs = adata.obs.loc[lambda df: df['cluster_label'].str[:4].isin(clusters)]

plot_nucleus_cluster_comparison_slices(obs, ccf_polygons_th_zi, nuclei=nuclei, legend='cells', bg_cells=adata_neuronal.obs, bg_shapes=False, outlines_only=True)

In [49]:
# plot_nucleus_cluster_comparison_slices('AD', adata, nuclei_df, ccf_polygons_th_zi)
# Cells from the clusters annotated for AD in nuclei_df, shown against the AD polygons.
obs = get_obs_from_annotated_clusters('AD', adata, nuclei_df)
plot_nucleus_cluster_comparison_slices(obs, ccf_polygons_th_zi, nuclei='AD')

In [50]:

# plot_nucleus_cluster_comparison_slices('AD', adata, nuclei_df, ccf_polygons_th_zi)
# NOTE: this cell repeats the AD comparison of the preceding cell with identical code.
obs = get_obs_from_annotated_clusters('AD', adata, nuclei_df)
plot_nucleus_cluster_comparison_slices(obs, ccf_polygons_th_zi, nuclei='AD')

### diff exp

In [51]:
# Wilcoxon test for subclass 'AV Col27a1 Glut', ranked by absolute score; no explicit reference group is given.
sc.tl.rank_genes_groups(adata_neuronal, 'subclass_label', groups=['AV Col27a1 Glut'], method='wilcoxon', rankby_abs=True)
sc.pl.rank_genes_groups(adata_neuronal, n_genes=20)

In [140]:
# Col27a1 expression on section 1199651060 with the anterior nuclei highlighted.
section='1199651060'

gene = "Col27a1"
nuclei=['AD', 'AV', 'AMd', 'AMv']
plot_expression_ccf(adata_neuronal, section, gene, ccf_polygons, nuclei=nuclei, bg_shapes=False)

In [53]:
# Wilcoxon test for subclass 'AD Serpinb7 Glut', ranked by absolute score; no explicit reference group is given.
sc.tl.rank_genes_groups(adata_neuronal, 'subclass_label', groups=['AD Serpinb7 Glut'], method='wilcoxon', rankby_abs=True)
sc.pl.rank_genes_groups(adata_neuronal, n_genes=20)

In [141]:
# Syndig1l expression on section 1199651060 with the anterior nuclei highlighted.
section='1199651060'

gene = "Syndig1l"
nuclei=['AD', 'AV', 'AMd', 'AMv']
plot_expression_ccf(adata_neuronal, section, gene, ccf_polygons, nuclei=nuclei, bg_shapes=False)

In [55]:
# Wilcoxon test for supertype 'TH Prkcd Grin2c Glut_9', ranked by absolute score; no explicit reference group is given.
sc.tl.rank_genes_groups(adata_neuronal, 'supertype_label', groups=['TH Prkcd Grin2c Glut_9'], method='wilcoxon', rankby_abs=True)
sc.pl.rank_genes_groups(adata_neuronal, n_genes=20)

In [142]:
# Cbln1 expression on section 1199651060 with the anterior nuclei highlighted.
section='1199651060'

gene = "Cbln1"
nuclei=['AD', 'AV', 'AMd', 'AMv']
plot_expression_ccf(adata_neuronal, section, gene, ccf_polygons, nuclei=nuclei, bg_shapes=False)

In [146]:
# Flag cells whose supertype_label contains 'Glut_9', 'AD' or 'AV' (plain substring matches).
# The boolean is stored as the strings 'True' / 'False' so it can be used as a grouping column.
adata_neuronal.obs['anterior_type'] = (adata_neuronal.obs['supertype_label'].str.contains('Glut_9') |
                                       adata_neuronal.obs['supertype_label'].str.contains('AD') |
                                       adata_neuronal.obs['supertype_label'].str.contains('AV')).astype(str)

In [147]:
# Wilcoxon test across the 'anterior_type' groups, ranked by absolute score.
sc.tl.rank_genes_groups(adata_neuronal, 'anterior_type', method='wilcoxon', rankby_abs=True)
sc.pl.rank_genes_groups(adata_neuronal, n_genes=20)

In [148]:
# C1ql3 expression on section 1199651060 with the anterior nuclei highlighted.
section='1199651060'

gene = "C1ql3"
nuclei=['AD', 'AV', 'AMd', 'AMv']
plot_expression_ccf(adata_neuronal, section, gene, ccf_polygons, nuclei=nuclei, bg_shapes=False)

In [149]:
# Shox2 expression on section 1199651060 with the anterior nuclei highlighted.
section='1199651060'

gene = "Shox2"
nuclei=['AD', 'AV', 'AMd', 'AMv']
plot_expression_ccf(adata_neuronal, section, gene, ccf_polygons, nuclei=nuclei, bg_shapes=False)

## MD

In [65]:
# Cells from the clusters annotated for MD, shown against the MD polygons with neuronal cells in grey behind.
obs = get_obs_from_annotated_clusters('MD', adata, nuclei_df)
plot_nucleus_cluster_comparison_slices(obs, ccf_polygons_th_zi, 'MD', bg_shapes=False, bg_cells=adata_neuronal.obs)

## other

In [58]:
# Cells from the clusters annotated for PT, shown against the PT polygons.
obs = get_obs_from_annotated_clusters('PT', adata, nuclei_df)
plot_nucleus_cluster_comparison_slices(obs, ccf_polygons_th_zi, nuclei='PT', bg_cells=adata_neuronal.obs, bg_shapes=False)

In [59]:
# Neuronal cells whose supertype_label contains 'PVT-PT' (engine='python' is needed for .str inside query),
# shown against the PT and PVT polygons.
obs = adata_neuronal.obs.query("supertype_label.str.contains('PVT-PT')", engine='python')
plot_nucleus_cluster_comparison_slices(obs, ccf_polygons_th_zi, nuclei=['PT','PVT'], bg_cells=adata_neuronal.obs, bg_shapes=False, legend='both')

In [60]:
# Cells from the clusters annotated for PVT, shown against the PT and PVT polygons.
obs = get_obs_from_annotated_clusters('PVT', adata, nuclei_df)
plot_nucleus_cluster_comparison_slices(obs, ccf_polygons_th_zi, nuclei=['PT','PVT'], bg_cells=adata_neuronal.obs, bg_shapes=False, legend='both')

In [61]:
# Cells from the clusters annotated for 'LGd', shown against the three LGd sub-region polygons.
obs = get_obs_from_annotated_clusters('LGd', adata, nuclei_df)
plot_nucleus_cluster_comparison_slices(obs, ccf_polygons_th_zi, nuclei=['LGd-co', 'LGd-ip', 'LGd-sh'], bg_cells=adata_neuronal.obs, bg_shapes=False)

In [62]:
# Cells from the clusters annotated for SMT, shown against the SMT polygons.
obs = get_obs_from_annotated_clusters('SMT', adata, nuclei_df)
plot_nucleus_cluster_comparison_slices(obs, ccf_polygons_th_zi, nuclei='SMT', bg_cells=adata_neuronal.obs, bg_shapes=False)

In [63]:
# Cells from the clusters annotated for LD, shown against the LD polygons.
obs = get_obs_from_annotated_clusters('LD', adata, nuclei_df)
plot_nucleus_cluster_comparison_slices(obs, ccf_polygons_th_zi, nuclei='LD', bg_cells=adata_neuronal.obs, bg_shapes=False)

# OLD: clustering tests

In [None]:
# Old test: plot every section with whatever `ccf_polygons` currently holds.
plot_ccf_overlay(adata.obs, ccf_polygons)

In [7]:
# Old test: rebuild polygons for a single section using OPTICS clustering instead of HDBSCAN.
# NOTE: this overwrites the notebook-level `ccf_polygons` with results for section 1199651024 only.
data = adata.obs.query("section=='1199651024'")

ccf_polygons = defaultdict(dict)  
for (name, section), df in data.groupby(['CCF_acronym', 'section']):
    X = df[['cirro_x','cirro_y']].values
    if X.shape[0] > 100:
        clusters = OPTICS(min_cluster_size=50, min_samples=10, n_jobs=2).fit_predict(X)
        # label -1 is skipped; poly_from_points may return None for a cluster
        results = [poly_from_points(X[clusters==i, :]) for i in set(clusters) if not i==-1]
        ccf_polygons[name][section] = unary_union(results)
        
        

In [20]:
# Old test: HDBSCAN spatial clusters for CCF_acronym 'MB' on section 1199651033 (reassigns the notebook-level `df`).
df = adata.obs.query("section=='1199651033' & CCF_acronym=='MB'").copy()
X = df[['cirro_x','cirro_y']].values
df['spatial_cluster'] = hdbscan.HDBSCAN(min_samples=10).fit_predict(X)
sns.scatterplot(data=df,  x='cirro_x', y='cirro_y', hue='spatial_cluster', s=2,  legend=True, palette='tab10')

In [17]:
# Old test: same cells as the HDBSCAN test, clustered with OPTICS (xi=0.01, max_eps=1000) for comparison.
df = adata.obs.query("section=='1199651033' & CCF_acronym=='MB'").copy()
X = df[['cirro_x','cirro_y']].values
df['spatial_cluster'] = OPTICS(min_samples=10, xi=0.01, max_eps=1000, predecessor_correction=False, n_jobs=4).fit_predict(X)
sns.scatterplot(data=df,  x='cirro_x', y='cirro_y', hue='spatial_cluster', s=2,  legend=True, palette='tab10')