In [1]:
import anndata as ad
import pandas as pd
import scanpy as sc
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn as sns

In [25]:
# Load sequencing data
# Both WMB-10Xv3 log2 matrices (TH and HY) are opened with backed='r', i.e.
# file-backed and read-only, rather than being read fully into memory here.
ad_th, ad_hy = (
    ad.read_h5ad(h5ad_path, backed='r')
    for h5ad_path in (
        "/data/allen-brain-cell-atlas/expression_matrices/WMB-10Xv3/20230630/WMB-10Xv3-TH-log2.h5ad",
        "/data/allen-brain-cell-atlas/expression_matrices/WMB-10Xv3/20230630/WMB-10Xv3-HY-log2.h5ad",
    )
)

In [36]:
# Per-cell metadata table; its first CSV column becomes the index, which is
# what the later `ad_joined.obs.join(cells_df, ...)` aligns on.
cells_df = pd.read_csv("/data/allen-brain-cell-atlas/metadata/WMB-10X/20230630/cell_metadata.csv", index_col=0)
# Preview only the first rows instead of dumping the whole table.
cells_df.head()

In [33]:
# Pivoted cluster-annotation table; its first CSV column becomes the index,
# which the later obs join matches against the `cluster_alias` column.
clusters_df = pd.read_csv("/data/allen-brain-cell-atlas/metadata/WMB-taxonomy/20230630/views/cluster_to_cluster_annotation_membership_pivoted.csv", index_col=0)
# Preview only the first rows instead of dumping the whole table.
clusters_df.head()

In [40]:
# Materialise each backed matrix in memory, then stack them (TH first, HY second).
ad_joined = ad.concat([backed.to_memory() for backed in (ad_th, ad_hy)])

In [42]:
# Attach per-cell metadata (index-aligned; clashing column names from cells_df
# get the '_md' suffix), then the cluster annotations via obs['cluster_alias'].
# NOTE(review): this overwrites ad_joined.obs with the joined frame, so running
# the cell a second time joins onto already-joined columns — confirm it is only
# executed once per kernel.
ad_joined.obs = (
    ad_joined.obs
    .join(cells_df, rsuffix='_md')
    .join(clusters_df, on='cluster_alias')
)

## all thalamus subclasses

In [44]:
# Subclass names (without any numeric ID prefix) used to select cells below.
subclasses = [
    'TH Prkcd Grin2c Glut',
    'RT ZI Gnb3 Gaba',
    'ZI Pax6 Gaba',
    'MH Tac2 Glut',
    'PF Fzd5 Glut',
    'PVT-PT Ntrk1 Glut',
    'RE-Xi Nox4 Glut',
    'CM-IAD-CL-PCN Glut',
    'SPA-SPFm-SPFp-POL-PIL-PoT Glut',
    'AV Col27a1 Glut',
    'LGv-SPFp-SPFm Gata3 Gaba',
    'LH Pou4f1 Sox1 Glut',
    'AD Serpinb7 Glut',
    'MG-POL-SGN Glut',
    'LGv Otx2 Gaba',
]

In [49]:
# Keep only cells whose subclass is in `subclasses`. The obs labels carry a
# leading numeric ID followed by whitespace; strip it with a regex instead of
# a fixed 4-character slice so the match does not depend on the ID width.
# Missing subclass values stay missing and therefore never match.
adata_neuronal = ad_joined[
    ad_joined.obs["subclass"]
    .str.replace(r"^\d+\s+", "", regex=True)
    .isin(subclasses)
]
adata_neuronal.shape

In [50]:
# Sanity check: number of selected cells per `region_of_interest_acronym` value.
adata_neuronal.obs['region_of_interest_acronym'].value_counts()

In [51]:
# Sanity check: number of selected cells per subclass label.
adata_neuronal.obs["subclass"].value_counts()

In [92]:
from importlib import reload

# `reload` needs an already-imported module object. Import `dprime` here so
# this cell also works on a fresh kernel, where the later `import dprime`
# cell has not run yet (previously this raised NameError under Run All).
import dprime

# Pick up any edits made to dprime.py since it was first imported.
reload(dprime)

In [93]:
# `dprime` supplies the d' estimators and `distance` supplies squareform,
# both used by tx_dprime below.
import dprime
import scipy.spatial.distance as distance
from diskcache import Cache

# Disk-backed cache handle.
# NOTE(review): the only visible reference to `cache` is the commented-out
# `@cache.memoize()` decorator on tx_dprime — confirm it is still needed.
cache = Cache("/scratch/cache")

# @cache.memoize()
def tx_dprime(type_label, features=None, type_list=None, n_folds=5, r=3, zero_inflated=True, n_subsample=1000, **kwargs):
    """Compute a pairwise d' (distinctness) table between cell types.

    The input is the module-level ``data`` object (AnnData-like: row-indexable,
    with ``.obs`` and ``.X``). From every group of ``data.obs[type_label]``,
    ``n_subsample`` cells are drawn with replacement, and the resulting
    expression matrix is handed to one of the estimators in ``dprime``.

    Parameters
    ----------
    type_label : str
        Column of ``data.obs`` holding the type label of each cell.
    features : sequence, optional
        Variables (columns) to restrict the matrix to; all are used if None.
    type_list : sequence, optional
        Types to compare; defaults to the unique labels of the sampled cells.
    n_folds, r : int
        Forwarded unchanged to the ``dprime`` estimator.
    zero_inflated : bool
        If True use ``dprime.zinb_dprime``, otherwise
        ``dprime.negative_binomial_dprime``.
    n_subsample : int
        Number of cells sampled (with replacement) per group.
    **kwargs
        Extra keyword arguments forwarded to the estimator.

    Returns
    -------
    pandas.DataFrame
        Square table indexed and labelled by ``type_list`` with the absolute
        d' of each pair and zeros on the diagonal.
    """
    # `data` is only read here. It is deliberately never rebound: the earlier
    # version declared it global and then assigned the expression matrix to
    # it, which replaced the notebook-level AnnData and broke any later call.
    sampled_index = data.obs.groupby(type_label).sample(n_subsample, replace=True).index
    adata = data[sampled_index]
    type_labels = adata.obs[type_label]
    expression = adata.X if features is None else adata[:, features].X
    if type_list is None:
        type_list = type_labels.unique()

    # d-prime calculation
    estimator = dprime.zinb_dprime if zero_inflated else dprime.negative_binomial_dprime
    dprime_results = estimator(
        expression, type_list, type_labels, n_folds=n_folds, r=r, **kwargs)

    # NOTE(review): assumes the estimator returns one entry per pair, iterated
    # in the condensed (upper-triangle, row-major) order squareform expects —
    # confirm against dprime.py.
    dprime_mat = distance.squareform(
        [np.abs(dprime_results[k]["dprime"]) for k in dprime_results])
    return pd.DataFrame(dprime_mat, index=type_list, columns=type_list)


In [94]:
# tx_dprime takes its input from the module-level `data`, so bind the
# selected cells to that name before calling it.
data = adata_neuronal
# zero_inflated=False selects the negative-binomial estimator; cells are
# grouped by the `subclass` column of obs.
result = tx_dprime(type_label='subclass', zero_inflated=False, r=1)

In [96]:
# Persist the d' table; the "load and plot" section reads this file back.
result.to_csv("resources/th_subclass_dprime.csv")

## load and plot

In [62]:
# Reload the saved d' table (first CSV column is the row index) so the plots
# below can be produced without recomputing it.
result = pd.read_csv("resources/th_subclass_dprime.csv", index_col=0)

In [95]:
# d' heatmap in the table's stored row/column order, drawn on an explicit
# Axes with square cells ('image' axis mode).
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(
    result,
    ax=ax,
    cmap='viridis_r',
    vmin=0,
    vmax=5,
    cbar=True,
    cbar_kws={"label": "distinctness d'"},
)
ax.axis('image')

In [99]:
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

# Square d' table -> condensed pairwise-distance vector, as linkage expects.
X = squareform(result.values)
# Single-linkage hierarchical clustering on those distances.
Z = hierarchy.linkage(X, method='single')
# Leaf order taken from an optimally leaf-ordered copy of the tree. `Z` itself
# is left as returned by linkage.
order = hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(Z, X))

In [100]:
# Clustered heatmap with the same precomputed linkage on rows and columns.
# NOTE(review): `Z` is the plain single-linkage tree, not the optimally
# leaf-ordered one used to derive `order`, so the row order here may differ
# from the `order`-based heatmaps — confirm that is intended.
sns.clustermap(result, row_linkage=Z, col_linkage=Z, cmap='viridis_r', vmin=0, vmax=5, cbar=True, cbar_kws=dict(label="distinctness d'"))
# plt.axis('image')

In [65]:
# d' heatmap with rows and columns arranged by the optimal leaf order.
# The table is `result`; `dprime` is the imported helper module and has no
# `.iloc`, so indexing it raised AttributeError.
plt.figure(figsize=(7,7))
sns.heatmap(result.iloc[order, order], cmap='viridis_r', vmin=0, vmax=5, cbar=True, cbar_kws=dict(label="distinctness d'"))
plt.axis('image')

In [31]:
# Row labels that the annotated heatmap renders in bold.
bold = [
    'AD Serpinb7 Glut',
    'AV Col27a1 Glut',
    'TH Prkcd Grin2c Glut 9 (AM)',
    'TH Prkcd Grin2c Glut 13 (VM/VAL)',
    'TH Prkcd Grin2c Glut 10 (MD)',
]

In [34]:
# Rebuild the single-linkage tree and its optimal leaf order from the
# condensed distance vector `X`.
Z = hierarchy.linkage(X, method='single')
order = hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(Z, X))

# The d' table is `result`. `dprime` is the imported helper module, so
# `dprime.iloc` / `.shape` / `.index` raised AttributeError.
plt.figure(figsize=(7,6))
sns.heatmap(result.iloc[order, order], cmap='viridis_r', vmin=0, vmax=4, cbar=True, cbar_kws=dict(label="distinctness d'"))
plt.axis('image')

ax = plt.gca()
from matplotlib.patches import Rectangle
args = dict(linewidth=1.5, edgecolor='red', facecolor='none')
k = result.shape[0]
# Red outlines spanning the full width of selected rows.
# NOTE(review): the row positions (4, 10, 18-20) are hard-coded and are only
# meaningful for one particular matrix and leaf order — confirm they match
# the table being plotted, or derive them from the positions of `bold` labels.
boxes = [
    ax.add_patch(Rectangle((0,4), k, 1, **args)),
    ax.add_patch(Rectangle((0,10), k, 1, **args)),
    ax.add_patch(Rectangle((0,18), k, 3, **args)),
]
labels = result.index[order]
# Bold labels go through mathtext, where plain spaces are dropped, so each
# space is escaped as "\ " (raw string avoids the invalid-escape warning).
labels = [r"$\mathbf{" + x.replace(' ', r'\ ') + "}$" if x in bold else x for x in labels]
ax.set_yticklabels(labels)
# Column labels would duplicate the row labels, so hide them.
ax.tick_params(labelbottom=False)
plt.show()

In [51]:
# The two supertype labels compared in the tx_dprime calls below.
types = [
    "277 TH Prkcd Grin2c Glut_3",
    "280 TH Prkcd Grin2c Glut_6",
]

In [30]:
# tx_dprime reads its input from the module-level `data` and takes the
# grouping column as `type_label`; it has no positional data argument and no
# `cluster_label` parameter, so the old call form did not match its signature.
# NOTE(review): `seqData` is not defined in any visible cell — confirm where
# it is loaded.
data = seqData
facs_dprime = tx_dprime(type_label='supertype_id_label', type_list=types, zero_inflated=False, r=1)

In [31]:
# Display the d' table computed for the two supertypes in `types`.
facs_dprime

In [54]:
# higher r / lower dispersion
# tx_dprime reads its input from the module-level `data` and takes the
# grouping column as `type_label` (there is no `cluster_label` parameter).
# NOTE(review): `seqData` is not defined in any visible cell — confirm.
data = seqData
tx_dprime(type_label='supertype_id_label', type_list=types, zero_inflated=False, r=10)

In [32]:
# Same comparison restricted to the first 10,000 variables of seqData.
# tx_dprime reads its input from the module-level `data` and takes the
# grouping column as `type_label` (there is no `cluster_label` parameter).
# NOTE(review): `seqData` is not defined in any visible cell — confirm.
data = seqData
tx_dprime(type_label='supertype_id_label', features=seqData.var_names[:10000], type_list=types, zero_inflated=False, r=1)

In [41]:
# Flag the top 10,000 highly variable genes; the next cell reads the result
# from the `highly_variable` column of seqData.var.
# NOTE(review): scanpy documents flavor='seurat_v3' as expecting raw count
# data — confirm seqData.X is not log-transformed.
sc.pp.highly_variable_genes(seqData, flavor='seurat_v3', n_top_genes=10000)

In [42]:
# Same comparison restricted to the genes flagged `highly_variable` in
# seqData.var.
# tx_dprime reads its input from the module-level `data` and takes the
# grouping column as `type_label` (there is no `cluster_label` parameter).
# NOTE(review): `seqData` is not defined in any visible cell — confirm.
data = seqData
tx_dprime(type_label='supertype_id_label', features=seqData.var.query('highly_variable').index, type_list=types, zero_inflated=False, r=1)

In [43]:
# Alternative HVG selection using scanpy's experimental function. With
# inplace=False the per-gene table is returned as `hvg` (queried on its
# `highly_variable` column in the next cell).
# NOTE(review): scanpy documents this function as expecting raw count data —
# confirm seqData.X is not log-transformed.
hvg = sc.experimental.pp.highly_variable_genes(seqData, n_top_genes=10000, inplace=False)

In [45]:
# Same comparison restricted to the genes flagged `highly_variable` in `hvg`.
# tx_dprime reads its input from the module-level `data` and takes the
# grouping column as `type_label` (there is no `cluster_label` parameter).
# NOTE(review): `seqData` is not defined in any visible cell — confirm.
data = seqData
tx_dprime(type_label='supertype_id_label', features=hvg.query('highly_variable').index, type_list=types, zero_inflated=False, r=1)