In [1]:
import pandas as pd
import numpy as np
import os
import sys
from collections import Counter
import gget
import scipy

from Bio import SeqIO
from Bio.KEGG import REST
from Bio.KEGG.KGML import KGML_parser

import scanpy as sc
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns

# Marker genes

In [2]:
def getGenes(pdf, cellType, ui_upper=None):
    """Return the marker gene symbols listed for one cell type.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Marker table with 'cell type', 'ubiquitousness index' and
        'official gene symbol' columns.
    cellType : str
        Value of the 'cell type' column to select (exact match).
    ui_upper : float, optional
        When given, keep only rows whose 'ubiquitousness index' is strictly
        below this value. No filtering on the index when None.

    Returns
    -------
    list
        The 'official gene symbol' values of the selected rows, in table order.
    """
    keep = pdf['cell type'] == cellType

    if ui_upper is not None:
        keep = keep & (pdf['ubiquitousness index'] < ui_upper)

    return pdf.loc[keep, 'official gene symbol'].to_list()


# Marker table used by getGenes (tab-separated, read straight from the .gz file).
pdfPath = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/panglaodb/pandb.tsv.gz"
pandDf = pd.read_csv(pdfPath, sep="\t")

# Marker symbols for the two cell types compared throughout the notebook.
fb = getGenes(pandDf, 'Fibroblasts')
hp = getGenes(pandDf, 'Hematopoietic stem cells')

for list_label, symbols in (("FB", fb), ("HP", hp)):
    print(f"{list_label} genes: {len(symbols)}")

# A count of 2 means the symbol is present in both lists (assuming neither list
# repeats a symbol). The recorded output shows four such genes, not three.
print(Counter(fb + hp).most_common(5))


# NOTE(review): `adata` is not created by any cell above this point, and the
# recorded output of this cell is "NameError: name 'adata' is not defined".
# The experiment AnnData must be loaded before these lookups can run.
gene_name_col = adata.var['gene_name']

# Var index labels of the genes whose name is a marker of each cell type.
marker_genes = {
    'Fibroblast' : adata.var[gene_name_col.isin(fb)].index,
    'HSC' : adata.var[gene_name_col.isin(hp)].index,
}

# Two-way lookup between var index labels and gene names; when a key repeats,
# the last occurrence wins.
id2name = dict(zip(adata.var.index.values, gene_name_col.values))
name2id = dict(zip(gene_name_col.values, adata.var.index.values))

FB genes: 179
HP genes: 88
[('THY1', 2), ('CD44', 2), ('EGR1', 2), ('RUNX1', 2), ('IL1R1', 1)]


NameError: name 'adata' is not defined

# Distance to target

In [None]:
# Load the reference dataset that the cell-type signatures are built from.
fpath = "/nfs/turbo/umms-indikar/shared/projects/adaptive_sampling/data/tabula_sapiens_filtered.h5ad"
sdf = sc.read_h5ad(fpath)

# Scale every cell to a total of 1e6, then apply log1p. Both calls modify
# `sdf` in place, so re-running them without reloading the file would
# transform the data a second time.
sc.pp.normalize_total(sdf, target_sum=1e6)
sc.pp.log1p(sdf)

# Bare expression: display the AnnData summary as the cell output.
sdf

In [None]:
# Short labels for the two cell ontology classes this notebook compares.
cellmap = dict([
    ('fibroblast', 'FB'),
    ('hematopoietic stem cell', 'HSC'),
])

# Classes that are not keys of `cellmap` become NaN in the new column.
sdf.obs['celltype'] = sdf.obs['cell_ontology_class'].map(cellmap)

# Number of cells per (tissue, label) pair; rows with a NaN label are not counted.
sdf.obs.value_counts(subset=['organ_tissue', 'celltype'])

In [None]:
# get the overlapping gene sets
#
# Gene names present in both the experiment (`adata`) and the reference (`sdf`).
# Missing names are dropped first: np.intersect1d sorts its inputs, and a NaN
# mixed with string names cannot be ordered (TypeError).
our_genes = adata.var['gene_name'].dropna().unique()
ts_genes = sdf.var['gene_symbol'].dropna().unique()

print(f"{our_genes.shape=}")
print(f"{ts_genes.shape=}")

# Sorted, de-duplicated names shared by both datasets.
common_genes = np.intersect1d(our_genes, ts_genes)
print(f"{common_genes.shape=}")

In [None]:
# generate aggregate signatures
#
# For every mapped cell type, average the reference expression over its cells
# (restricted to the genes in `common_genes`), plot the per-gene mean, then plot
# the FB - HSC difference and label the genes with the largest differences.

exp_genes = adata.var['gene_name'].to_list()

cellmap = {
    'fibroblast' : 'FB',
    'hematopoietic stem cell' : 'HSC',
}

sdf.obs['celltype'] = sdf.obs['cell_ontology_class'].map(cellmap)

signatures = {}
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 8, 4

# Rows 0 and 1 hold one signature each, row 2 the difference.
fig, axs = plt.subplots(3, 1)

# The gene selection only depends on `sdf.var`, which is the same for every
# cell-type subset, so it is computed once instead of once per iteration.
# One var row is kept per gene symbol.
gene_list = sdf.var.drop_duplicates(subset='gene_symbol')
v_genes = gene_list[gene_list['gene_symbol'].isin(common_genes)].index

# Classes missing from `cellmap` are NaN in 'celltype'. A NaN label matches no
# row, which would yield an all-NaN signature under a NaN key and use up one of
# the signature axes, so only the mapped labels are iterated.
celltypes = sdf.obs['celltype'].dropna().unique()

# NOTE(review): `colors` is not defined in any visible cell; it must exist in
# the kernel before this cell runs.
for i, celltype in enumerate(celltypes):
    # Subset cells and genes in one step; no copy of the full-width subset needed.
    X = sdf[sdf.obs['celltype'] == celltype, v_genes].X
    sig = np.mean(X, axis=0)
    print(X.shape, sig.shape)
    signatures[celltype] = sig.copy()

    # NOTE(review): `sdf` was log1p-transformed after normalisation when it was
    # loaded, so the 'TPM' axis label describes log-scale values - confirm.
    axs[i].plot(sig.T, c=colors[i])
    axs[i].set_xticks([])
    axs[i].set_ylabel('TPM')
    axs[i].set_title(f"{celltype} Signature (Tabula Sapiens)")


# Per-gene difference between the two signatures, flattened to 1-D.
diff = np.ravel(signatures['FB'] - signatures['HSC'])

axs[2].plot(diff, c='grey', zorder=1)
axs[2].set_xticks([])
axs[2].set_ylabel('TPM')
axs[2].set_title("Difference")

# Positions of the genes whose absolute difference exceeds 4.5.
diff_ind = np.where(np.abs(diff) > 4.5)

print(f"{diff.shape=} {diff_ind[0].shape=}")

# np.where returns a 1-tuple; pass the position array itself as x.
axs[2].scatter(diff_ind[0],
               diff[diff_ind[0]],
               zorder=3,
               c='r',
               ec='k',
               lw=0.5,
               s=10,)

axs[2].axhline(y=0, lw=1, zorder=2, c='k')
genes  = v_genes[diff_ind[0]]

# Label every highlighted point with its var index label, shifted right by
# `buff` x-units so the text does not sit on the marker.
for x, y, val in zip(diff_ind[0], diff[diff_ind[0]], genes):
    buff = 100
    ha = 'left'
    axs[2].text(x+buff, y, str(val),
                ha=ha,
                fontsize=4,
                fontweight='bold')

plt.tight_layout()
sns.despine()

# Global differences

In [None]:
# Distance of every experiment cell to each reference signature, computed over
# all shared genes, shown as one histogram per cluster.
metric = 'euclidean'

# The signature columns follow `v_genes` (reference var order). The experiment
# columns must be taken in that same gene order, otherwise cdist compares
# unrelated genes position by position. Build a gene-name -> var-label lookup
# for `adata` (first label wins when a name repeats) and query it in signature
# order. This is loop-invariant, so it is done once.
sig_symbols = sdf.var.loc[v_genes, 'gene_symbol'].to_numpy()
name_to_id = pd.Series(adata.var.index.to_numpy(),
                       index=adata.var['gene_name'].to_numpy())
name_to_id = name_to_id[~name_to_id.index.duplicated(keep='first')]
v_gene_ids = name_to_id.loc[sig_symbols].to_numpy()

frames = []

for i, (cluster, group) in enumerate(adata.obs.groupby('clusters')):
    print(f"C{int(cluster)+1} {group.shape=}")
    # Cells of this cluster, genes in signature order.
    X = adata[group.index, v_gene_ids].X

    for cluster_comp, sig in signatures.items():
        # One distance per cell against the single signature row.
        d = scipy.spatial.distance.cdist(X, sig.reshape(1, -1), metric=metric)
        d = pd.DataFrame(d)
        d.columns = [metric]
        d['expression'] = f'C{int(cluster)+1}'
        d['signature'] = f'{cluster_comp}'

        frames.append(d)

# Every per-pair frame is indexed from 0; a fresh index avoids duplicate labels
# in the combined table (the cosine-distance cell does the same).
res = pd.concat(frames, ignore_index=True)


plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 7, 3

# One panel per cluster; the layout assumes exactly two clusters.
fig, axs = plt.subplots(1, 2)

# NOTE(review): `colors` is not defined in any visible cell; it must exist in
# the kernel before this cell runs.
for i, (cluster, group) in enumerate(res.groupby('expression')):
    print(i, cluster)

    sns.histplot(data=group,
                 x=metric,
                 bins=50,
                 kde=True,
                 ax=axs[i],
                 palette=colors,
                 hue='signature')

    axs[i].set_xlabel(f'{metric.title()} Distance')
    # axs[i].set_xlabel(f'Jensen-Shannon Divergence')
    axs[i].set_ylabel('n Cells')
    axs[i].set_title(f"{cluster} Cells")

    if i == 0:
        # Drop the legend seaborn attached to the first panel; the second
        # panel carries the shared one.
        legend = axs[i].get_legend()
        if legend is not None:
            legend.remove()
    else:
        sns.move_legend(axs[i],
                        title="Signature",
                        loc='upper right',
                        frameon=False)

plt.tight_layout()
sns.despine()

# Local distances

In [None]:
# get overlapping gene sets
#
# Restrict the comparison to the union of the fibroblast and HSC marker lists.
# A set is kept next to the list so the membership tests below are hash
# lookups instead of a scan of the whole list for every gene.
marker_set = set(fb + hp)
marker_genes = list(marker_set)
print(f"{len(marker_genes)=}")


# get the overlapping gene sets
our_genes = [x for x in adata.var['gene_name'].to_list() if x in marker_set]
ts_genes = [x for x in sdf.var['gene_symbol'].to_list() if x in marker_set]

print(f"{len(our_genes)=}")
print(f"{len(ts_genes)=}")

# Sorted, de-duplicated marker names found in both datasets.
common_genes = np.intersect1d(our_genes, ts_genes)
print(f"{common_genes.shape=}")
common_genes


In [None]:
# generate aggregate signatures
#
# Same procedure as the global signatures, but `common_genes` now only holds
# the marker genes shared by both datasets: average the reference expression
# per mapped cell type, plot it, then plot and annotate the FB - HSC difference.

exp_genes = adata.var['gene_name'].to_list()

cellmap = {
    'fibroblast' : 'FB',
    'hematopoietic stem cell' : 'HSC',
}

sdf.obs['celltype'] = sdf.obs['cell_ontology_class'].map(cellmap)

signatures = {}
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 8, 4

# Rows 0 and 1 hold one signature each, row 2 the difference.
fig, axs = plt.subplots(3, 1)

# The gene selection only depends on `sdf.var`, which is the same for every
# cell-type subset, so it is computed once instead of once per iteration.
# One var row is kept per gene symbol.
gene_list = sdf.var.drop_duplicates(subset='gene_symbol')
v_genes = gene_list[gene_list['gene_symbol'].isin(common_genes)].index

# Classes missing from `cellmap` are NaN in 'celltype'. A NaN label matches no
# row, which would yield an all-NaN signature under a NaN key and use up one of
# the signature axes, so only the mapped labels are iterated.
celltypes = sdf.obs['celltype'].dropna().unique()

# NOTE(review): `colors` is not defined in any visible cell; it must exist in
# the kernel before this cell runs.
for i, celltype in enumerate(celltypes):
    # Subset cells and genes in one step; no copy of the full-width subset needed.
    X = sdf[sdf.obs['celltype'] == celltype, v_genes].X
    sig = np.mean(X, axis=0)
    print(X.shape, sig.shape)
    signatures[celltype] = sig.copy()

    # NOTE(review): `sdf` was log1p-transformed after normalisation when it was
    # loaded, so the 'TPM' axis label describes log-scale values - confirm.
    axs[i].plot(sig.T, c=colors[i])
    axs[i].set_xticks([])
    axs[i].set_ylabel('TPM')
    axs[i].set_title(f"{celltype} Signature (Tabula Sapiens)")

# Per-gene difference between the two signatures, flattened to 1-D.
diff = np.ravel(signatures['FB'] - signatures['HSC'])

axs[2].plot(diff, c='grey', zorder=1)
axs[2].set_ylabel('TPM')
axs[2].set_title("Difference")

# Positions of the genes whose absolute difference exceeds 4.5.
diff_ind = np.where(np.abs(diff) > 4.5)

print(f"{diff.shape=} {diff_ind[0].shape=}")

# np.where returns a 1-tuple; pass the position array itself as x.
axs[2].scatter(diff_ind[0],
               diff[diff_ind[0]],
               zorder=3,
               c='r',
               ec='k',
               lw=0.5,
               s=10,)

axs[2].axhline(y=0, lw=1, zorder=2, c='k')


genes  = v_genes[diff_ind[0]]
print(genes)

# Label every highlighted point with its var index label, shifted right by
# `buff` x-units so the text does not sit on the marker.
for x, y, val in zip(diff_ind[0], diff[diff_ind[0]], genes):
    buff = 1
    ha = 'left'

    axs[2].text(x+buff, y, str(val),
                ha=ha,
                fontsize=4,
                fontweight='bold')

plt.tight_layout()
sns.despine()

In [None]:
# Distance of every experiment cell to each marker-gene signature, shown as one
# histogram per cluster with the per-signature mean drawn as a vertical line.
metric = 'cosine'

colors2 = ['g', 'r']

# The signature columns follow `v_genes` (reference var order). The experiment
# columns must be taken in that same gene order, otherwise cdist compares
# unrelated genes position by position. Build a gene-name -> var-label lookup
# for `adata` (first label wins when a name repeats) and query it in signature
# order. This is loop-invariant, so it is done once.
sig_symbols = sdf.var.loc[v_genes, 'gene_symbol'].to_numpy()
name_to_id = pd.Series(adata.var.index.to_numpy(),
                       index=adata.var['gene_name'].to_numpy())
name_to_id = name_to_id[~name_to_id.index.duplicated(keep='first')]
v_gene_ids = name_to_id.loc[sig_symbols].to_numpy()

frames = []

for i, (cluster, group) in enumerate(adata.obs.groupby('clusters')):
    # Cells of this cluster, genes in signature order.
    X = adata[group.index, v_gene_ids].X

    for cluster_comp, sig in signatures.items():
        print(f"C{int(cluster)+1} vs. {cluster_comp}")
        print(f"{X.shape=} {sig.shape=}")
        # One distance per cell against the single signature row.
        d = scipy.spatial.distance.cdist(X, sig.reshape(1, -1), metric=metric)
        d = pd.DataFrame(d)
        d.columns = ['distance']
        d['expression'] = f'C{int(cluster)+1}'
        d['signature'] = f'{cluster_comp}'
        frames.append(d)

res = pd.concat(frames, ignore_index=True)

# Mean distance per (cluster, signature); infinities are treated as missing.
# 'distance' is selected explicitly so the mean never touches a text column.
t = res.replace([np.inf, -np.inf], np.nan)
gx = t.groupby(['expression', 'signature'], dropna=True)[['distance']].mean()
print(gx)

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 7, 3

# One panel per cluster; the layout assumes exactly two clusters.
fig, axs = plt.subplots(1, 2)

# NOTE(review): `colors` is not defined in any visible cell; it must exist in
# the kernel before this cell runs.
for i, (cluster, group) in enumerate(res.groupby('expression')):
    sns.histplot(data=group,
                 x='distance',
                 bins=50,
                 # kde=True,
                 ax=axs[i],
                 palette=colors,
                 hue='signature')

    # Mean line per signature. groupby returns the signatures in sorted key
    # order and `colors2` is matched to them by position.
    # NOTE(review): confirm these line colours agree with the `colors` palette
    # used for the histogram hues.
    t = group.replace([np.inf, -np.inf], np.nan)
    means = t.groupby(['signature'])['distance'].mean()
    for j, xc in enumerate(means):
        axs[i].axvline(x=xc, c=colors2[j], lw=2, alpha=0.6)

    axs[i].set_xlabel(f'{metric.title()} Distance')
    # axs[i].set_xlabel(f'Jensen-Shannon Divergence')
    axs[i].set_ylabel('n Cells')
    axs[i].set_title(f"{cluster} Cells")
    # axs[i].set_xscale('log')


    if i == 0:
        # Drop the legend seaborn attached to the first panel; the second
        # panel carries the shared one.
        legend = axs[i].get_legend()
        if legend is not None:
            legend.remove()
    else:
        sns.move_legend(axs[i],
                        title="Signature",
                        loc='upper left',
                        frameon=False)

plt.tight_layout()
sns.despine()

In [None]:
# Bar chart of the mean distance per (cluster, signature) pair.
buffer = 0.01

# Infinities are treated as missing. 'distance' is selected explicitly so the
# mean still works when `res` carries extra text columns (the boxplot cell
# derives a 'sig_names' column from it).
t = res.replace([np.inf, -np.inf], np.nan)
pdx = t.groupby(['expression', 'signature'], dropna=True)[['distance']].mean()
pdx = pdx.reset_index()
print(pdx)

# Display names used in the legend.
nameMap = {
    'FB' : 'Fibroblast',
    'HSC' : 'HSC'
}

pdx['sig_names'] = pdx['signature'].map(nameMap)

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 2, 2.5
# NOTE(review): `colors` is not defined in any visible cell; it must exist in
# the kernel before this cell runs.
sns.barplot(data=pdx,
            x='expression',
            y='distance',
            palette=colors,
            ec='k',
            hue='sig_names')

plt.ylabel(f'{metric.title()} Distance')
plt.xlabel('')
# Zoom the y axis onto the range of the means (the axis does not start at 0).
plt.ylim([pdx['distance'].min()-buffer, pdx['distance'].max()+buffer])

# Place the legend outside the narrow axes.
sns.move_legend(plt.gca(),
                title="Signature",
                loc='upper right',
                bbox_to_anchor=(1.8, 1.05),
                frameon=False)

sns.despine()

In [None]:
# Bar chart of the mean distance to the fibroblast signature only.
buffer = 0.01

# Infinities are treated as missing. 'distance' is selected explicitly so the
# mean still works when `res` carries extra text columns (the boxplot cell
# derives a 'sig_names' column from it).
t = res.replace([np.inf, -np.inf], np.nan)
pdx = t.groupby(['expression', 'signature'], dropna=True)[['distance']].mean()
pdx = pdx.reset_index()
print(pdx)

# Display names for the signatures.
nameMap = {
    'FB' : 'Fibroblast',
    'HSC' : 'HSC'
}

pdx['sig_names'] = pdx['signature'].map(nameMap)
pdx = pdx[pdx['signature'] == 'FB']

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 2, 2.5
# All bars share one colour, so `color` is used; `palette` is meant to be
# paired with a `hue` variable.
sns.barplot(data=pdx,
            x='expression',
            y='distance',
            color='#BEE0B4',
            ec='k',)

plt.ylabel(f'{metric.title()} Distance')
plt.xlabel('')
# Zoom the y axis onto the range of the means (the axis does not start at 0).
plt.ylim([pdx['distance'].min()-buffer, pdx['distance'].max()+buffer])
# '#BEE0B4', '#EBAFAF'

sns.despine()

In [None]:
# Box plot of the per-cell distances, grouped by cluster and signature.
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 2, 2.5

# Display names used in the legend.
nameMap = {
    'FB' : 'Fibroblast',
    'HSC' : 'HSC'
}

# Add the display names on a derived frame rather than on `res` itself, so
# re-running the cells that aggregate `res` keeps seeing the same columns.
res_named = res.assign(sig_names=res['signature'].map(nameMap))

# NOTE(review): `colors` is not defined in any visible cell; it must exist in
# the kernel before this cell runs.
sns.boxplot(data=res_named,
            x='expression',
            y='distance',
            hue='sig_names',
            palette=colors,
            showfliers=False)

plt.ylabel(f'{metric.title()} Distance')
plt.xlabel('')
# plt.gca().set_xticklabels(['Fibroblast', 'HSC'])

# Place the legend outside the narrow axes.
sns.move_legend(plt.gca(),
                title="Signature",
                loc='upper right',
                bbox_to_anchor=(1.8, 1.05),
                frameon=False)

sns.despine()