# Dependencies and files

## Imports

In [None]:
import squidpy as sq
import warnings
import scanpy as sc
import anndata as an
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.signal import medfilt2d, wiener, convolve2d

import time
from tqdm.notebook import tqdm
import requests

sc.settings.set_figure_params(dpi=80)
#sc.set_figure_params(facecolor="white", figsize=(8, 8))
warnings.simplefilter(action='ignore', category=FutureWarning)
sc.settings.verbosity = 3

# Global variables

In [None]:
layers = ['L1', 'L2', 'L3', 'L4', 'L5', "L6", 'WM']

In [None]:
cluster_color = ['red', 'yellow', 'orange', 'green', 'purple', 'blue', 'olive', 'pink', 'brown']

## Load files

### DE results

## Load normalized pseudobulks

In [None]:
pb_dir_path = '../../data/pseudobulks/'

In [None]:
os.listdir(pb_dir_path)

In [None]:
adata_pb = sc.read_h5ad(pb_dir_path + 'pb_mammals_filtered.h5ad')
#adata_pb_norm_mean.var = anova_res
adata_pb

In [None]:
sc.pp.normalize_total(adata_pb, target_sum=1e4)
sc.pp.log1p(adata_pb)

In [None]:
# Scale each sample's pseudobulks separately with sc.pp.scale, then join the
# per-sample results back into a single AnnData object.
sample_ids = adata_pb.obs['sample_id']
scaled_adata_list = [
    sc.pp.scale(adata_pb[sample_ids == sample], copy=True)
    for sample in sample_ids.unique()
]

adata_pb_norm = an.concat(scaled_adata_list, merge='same', uns_merge="unique")
adata_pb_norm

## Species-specific genes

In [None]:
from statsmodels.stats.multitest import multipletests
import pandas as pd

# edgeR result tables, one per pairwise species comparison.
res_dict = {
    'human-chimp': pd.read_csv('../results/edgeR_human_chimp.csv', index_col=0),
    'human-macaque': pd.read_csv('../results/edgeR_human_macaque.csv', index_col=0),
    'chimp-macaque': pd.read_csv('../results/edgeR_chimp_macaque.csv', index_col=0),
}

# Attach Benjamini-Hochberg adjusted p-values to every table in place.
for df in res_dict.values():
    _, p_adjusted, *_ = multipletests(df['PValue'], method='fdr_bh')
    df['p_val_adj'] = p_adjusted

res_dict['human-chimp'].head()

In [None]:
def sign_genes(df):
    """Return the index labels of the rows of `df` with `p_val_adj` below 0.05."""
    significant = df.p_val_adj < 0.05
    return df[significant].index


# Significant genes for every pairwise comparison in `res_dict`.
sign_genes_dict = {}
for key, df in res_dict.items():
    sign_genes_dict[key] = sign_genes(df)
sign_genes_dict

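Let us define human-specific genes as $(HC \cap HM) \setminus CM$, where $HC$, $HM$ and $CM$ are the sets of genes significant in the human–chimp, human–macaque and chimp–macaque comparisons, respectively.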

In [None]:
hs_genes = sign_genes_dict['human-chimp'].intersection(sign_genes_dict['human-macaque']).difference(sign_genes_dict['chimp-macaque'])
hs_genes, hs_genes.size

In [None]:
background_genes = res_dict['human-chimp'].index.tolist()
len(background_genes)

In [None]:
# Save the background gene list as one comma-separated line.
with open('results/background_genes.txt', 'w', newline='') as f:
    # The return value of write() (a character count) is not needed.
    f.write(','.join(background_genes))

## Helpful functions

In [None]:
def apply_filter(adata, value_columns, sample_column, method='median', size=3, key='_filtered'):
    """Spatially smooth columns of ``adata.obs`` on a per-sample 2D grid.

    For every sample, the coordinates in ``adata.obsm['spatial']`` are centred,
    rotated by 45 degrees and binned into cells of 200 coordinate units. Each
    value column is laid out on that grid (cells without an observation hold
    0), filtered, and read back at the observation positions. The filtered
    values are stored in ``adata.obs`` under ``<column name> + key``.

    Parameters
    ----------
    adata
        Object exposing ``obs`` (a DataFrame) and ``obsm['spatial']``
        (two coordinate columns per observation).
    value_columns
        Columns of ``adata.obs`` to filter.
        NOTE(review): presumably a list of column-name strings — confirm.
    sample_column
        Name of the ``adata.obs`` column identifying the sample; every sample
        is filtered independently.
    method : {'median', 'wiener', 'mean'}
        Filter to apply: ``scipy.signal.medfilt2d``, ``scipy.signal.wiener``
        or a box (uniform) kernel applied with ``scipy.signal.convolve2d``.
    size : int
        Window size of the filter. For ``'median'`` it must be odd
        (requirement of ``medfilt2d``).
    key : str
        Suffix appended to each value column name to form the output column.

    Returns
    -------
    None
        ``adata.obs`` is modified in place.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported filters.
    """
    # filter method
    if method == 'median':
        filter_func = medfilt2d
        # honour the requested window size (it used to be fixed at 3)
        kwargs = {'kernel_size': size}
    elif method == 'wiener':
        filter_func = wiener
        kwargs = {'mysize': size}
    elif method == 'mean':
        # normalised box kernel: every output cell is the window average
        filter_matrix = np.ones((size, size)) / size ** 2
        filter_func = convolve2d
        kwargs = {'in2': filter_matrix, 'mode': 'same'}
    else:
        raise ValueError("Invalid filter method. Choose 'median', 'wiener' or 'mean'")

    # rotation matrix (45 degrees)
    A = 1 / np.sqrt(2) * np.array([[1, -1], [1, 1]])

    # get sample list
    sample_list = adata.obs[sample_column].unique().to_list()
    results_list = list()

    for sample in tqdm(sample_list):
        # sub-sample data
        obs_mask = (adata.obs[sample_column] == sample)
        raw_value = adata.obs.loc[obs_mask, value_columns].copy()
        coordinates = adata.obsm['spatial'][obs_mask.values]

        # result df template: same index as the sample, suffixed column names
        result_df = raw_value.copy()
        result_df.columns = result_df.columns + key

        # center data on the bounding-box midpoint to perform rotation
        x_max, y_max = coordinates.max(axis=0)
        x_min, y_min = coordinates.min(axis=0)

        x_center = x_min + (x_max - x_min) / 2
        y_center = y_min + (y_max - y_min) / 2

        centered_coord = coordinates - np.array([x_center, y_center])

        # rotate data
        rotate_coord = centered_coord @ A

        # convert coordinates to grid coordinates (bins of 200 units),
        # shifted so that the smallest bin index is 0 on both axes
        floor_divide = np.floor_divide(rotate_coord, 200)
        x_max, y_max = floor_divide.max(axis=0)
        x_min, y_min = floor_divide.min(axis=0)
        real_coord = floor_divide - np.array([x_min, y_min])

        # create 2d matrix template for the data: every (x, y) grid cell
        x = np.arange(0, x_max - x_min + 1)
        y = np.arange(0, y_max - y_min + 1)
        coord = np.array(np.meshgrid(x, y)).T.reshape(-1, 2)

        # create 2d representation of the data; cells without an observation stay 0
        df = pd.DataFrame(coord, columns=['x', 'y']).set_index(['x', 'y'])
        df[value_columns] = 0
        df.loc[real_coord.tolist(), value_columns] = raw_value.values

        for value in df.columns.to_list():
            # grid as a dense matrix: rows are x bins, columns are y bins
            X = df.reset_index().pivot(index='x', columns='y', values=value).values
            X_filt = filter_func(X, **kwargs)
            # read the filtered grid back at the observation positions
            result_df[value + key] = pd.DataFrame(X_filt).T.unstack().loc[real_coord.tolist()].values
        results_list.append(result_df)

    # concat data and attach the filtered columns to adata.obs
    filtered_df = pd.concat(results_list, axis=0)
    adata.obs[filtered_df.columns] = filtered_df

In [None]:
def p_val_group(value):
    """Map a p-value to its significance-bin label.

    Bins are checked from the largest threshold down; the first threshold
    that `value` reaches decides the label. Anything that reaches no
    threshold (values below 0.001, and values such as NaN for which every
    comparison is false) is labelled 'p < 0.001'.
    """
    lower_bounds = (
        (0.05, 'No significance'),
        (0.01, 'p < 0.05'),
        (0.001, 'p < 0.01'),
    )
    for bound, group in lower_bounds:
        if value >= bound:
            return group
    return 'p < 0.001'

In [None]:
color_discrete_map = {'No significance': 'darkblue',
                      'p < 0.05': 'purple',
                      'p < 0.01': 'darkorange',
                      'p < 0.001': 'yellow'}

In [None]:
class GeneCluster:
    """Gene set with a background list, registered with the Enrichr speedrichr API.

    Creating an instance uploads the gene list and the background list (two
    HTTP POST requests) and keeps the ids returned by the service. `enrich`
    then requests a background-corrected enrichment for one gene-set library
    and stores the resulting table in `enrichment_res`.
    """

    # Root URL shared by every request made by this class.
    BASE_URL = "https://maayanlab.cloud/speedrichr"

    # Description uploaded with the gene list when `descr` is empty.
    DEFAULT_DESCRIPTION = "sample gene set with background"

    # Column names assigned, in order, to the rows returned by
    # /api/backgroundenrich.
    RESULT_COLUMNS = ['Rank', 'Term', 'p-val', 'Z-score', 'Combined score',
                      'Overlapping genes', 'Adjusted P-value', 'Old p-value',
                      'Old adjusted p-value']

    def __init__(self, genes, descr, label, background_genes, **kwargs):
        """Store the inputs and register both gene lists with the service.

        Parameters
        ----------
        genes
            Iterable of gene-name strings forming the cluster.
        descr
            Description uploaded together with the gene list.
        label
            Cluster label; copied into the `cluster_label` column of results.
        background_genes
            Iterable of gene-name strings used as the enrichment background.
        **kwargs
            Accepted for call compatibility and ignored.

        Raises
        ------
        RuntimeError
            If either upload request is answered with a non-OK status.
        """
        self.label = label
        self.genes = genes
        self.descr = descr
        self.background_genes = background_genes
        self.userlist_id = self._get_userlist_id()
        self.background_id = self._get_background_id()
        # gene-set library name -> DataFrame of enrichment results
        self.enrichment_res = dict()

    def _post(self, endpoint, **request_kwargs):
        """POST to `endpoint` under BASE_URL and return the decoded JSON body."""
        res = requests.post(self.BASE_URL + endpoint, **request_kwargs)
        if not res.ok:
            raise RuntimeError(
                f'Enrichr request to {endpoint} failed (HTTP {res.status_code})'
            )
        return res.json()

    def _get_userlist_id(self):
        """Upload the gene list and return the `userListId` from the response."""
        # Send the caller's description instead of a fixed placeholder.
        description = self.descr or self.DEFAULT_DESCRIPTION
        userlist_response = self._post(
            '/api/addList',
            files=dict(
                list=(None, '\n'.join(self.genes)),
                description=(None, description),
            ),
        )
        return userlist_response['userListId']

    def _get_background_id(self):
        """Upload the background list and return the `backgroundid` from the response."""
        background_response = self._post(
            '/api/addbackground',
            data=dict(background='\n'.join(self.background_genes)),
        )
        return background_response['backgroundid']

    @classmethod
    def _results_to_frame(cls, rows, label):
        """Build the result table from the raw rows of one library.

        Adds `num_overlap_genes` (length of 'Overlapping genes'),
        `neg_log10(p_adj)` and `cluster_label` (set to `label`).
        """
        results = pd.DataFrame(rows, columns=cls.RESULT_COLUMNS)
        results['Term'] = results['Term'].astype(str)
        results['num_overlap_genes'] = results['Overlapping genes'].apply(len)
        results['neg_log10(p_adj)'] = -np.log10(results['Adjusted P-value'])
        results['cluster_label'] = label
        return results

    def enrich(self, gene_set_library):
        """Run enrichment against `gene_set_library` and cache the result.

        The table is stored in `self.enrichment_res[gene_set_library]`.

        Raises
        ------
        RuntimeError
            If the request is answered with a non-OK status.
        """
        data = self._post(
            '/api/backgroundenrich',
            data=dict(
                userListId=self.userlist_id,
                backgroundid=self.background_id,
                backgroundType=gene_set_library,
            ),
        )
        self.enrichment_res[gene_set_library] = self._results_to_frame(
            data[gene_set_library], self.label
        )

    @staticmethod
    def enrich_geneclusters(geneclusters, gene_set_library):
        """Call `enrich(gene_set_library)` on every cluster in `geneclusters`."""
        for gc in geneclusters:
            gc.enrich(gene_set_library)

# Clustering

In [None]:
df = pd.DataFrame(adata_pb_norm.X, index=adata_pb_norm.obs_names, columns=adata_pb_norm.var_names)

In [None]:
# Per species: sort sample names by their last two characters, then list the
# names containing 'L' first and the names containing 'WM' after them.
sample_order = {}
for cond in ['human', 'chimp', 'macaque']:
    samples = df.index[df.index.str.contains(cond)]
    order = sorted(samples.tolist(), key=lambda name: name[-2:])
    order_laminar = [name for name in order if 'L' in name]
    order_wm = [name for name in order if 'WM' in name]
    sample_order[cond] = order_laminar + order_wm

# All samples: human, then chimp, then macaque.
sample_order['all'] = [
    name
    for species in ('human', 'chimp', 'macaque')
    for name in sample_order[species]
]
df = df.loc[sample_order['all']]
df_hs = df[hs_genes].copy()

## Hierarchical clustering

In [None]:
sns.set(font_scale=0.8)
cluster_grid = sns.clustermap(df_hs, figsize=(12, 10), cmap="RdBu_r", center=0, metric='cosine', annot_kws={"size": 2}, row_cluster=False)

In [None]:
from sklearn.metrics.pairwise import pairwise_kernels
fig, ax = plt.subplots(1, 2, figsize=(20, 10))

gene_order = df_hs.columns[cluster_grid.dendrogram_col.reordered_ind]
sns.heatmap(df_hs[gene_order], cmap="RdBu_r", center=0, cbar=True, ax=ax[0])

aff_matrix = pairwise_kernels(df_hs.T, metric='cosine')
aff_df = pd.DataFrame(aff_matrix, index=df_hs.columns, columns=df_hs.columns)
sns.heatmap(aff_df.loc[gene_order, gene_order], cmap="RdBu_r", center=0, cbar=True, ax=ax[1], vmin=-1, vmax=1)

In [None]:
sns.clustermap(df_hs.loc[sample_order['human']], figsize=(12, 4), cmap="RdBu_r", center=0, metric='cosine', annot_kws={"size": 2}, row_cluster=False)

In [None]:
# Two side-by-side heatmaps for the human samples only: expression of the
# human-specific genes (left) and gene-gene cosine similarity (right).
from sklearn.metrics.pairwise import pairwise_kernels
fig, ax = plt.subplots(1, 2, figsize=(20, 10))

df_sub = df_hs.loc[sample_order['human']]
# NOTE(review): `cluster_grid` is the clustermap computed on all samples; the
# human-only clustermap drawn above is not stored in a variable. The gene
# order used here is therefore the all-sample dendrogram order — confirm
# that this is intended.
gene_order = df_sub.columns[cluster_grid.dendrogram_col.reordered_ind]
sns.heatmap(df_sub[gene_order], cmap="RdBu_r", center=0, cbar=True, ax=ax[0])

# Cosine similarity between genes, computed over the human samples.
aff_matrix_h = pairwise_kernels(df_sub.T, metric='cosine')
aff_h_df = pd.DataFrame(aff_matrix_h, index=df_sub.columns, columns=df_sub.columns)
sns.heatmap(aff_h_df.loc[gene_order, gene_order], cmap="RdBu_r", center=0, cbar=True, ax=ax[1], vmin=-1, vmax=1)

## Spectral clustering

In [None]:
from sklearn.cluster import SpectralClustering
from sklearn.metrics.pairwise import pairwise_kernels
#aff_matrix = (pairwise_kernels(df_hs.T, metric='cosine') + 1) / 2
n_clusters = 7
clustering = SpectralClustering(n_clusters=n_clusters, affinity='nearest_neighbors', random_state=4)
#clustering = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', random_state=4, assign_labels='cluster_qr')
#clustering.fit(aff_matrix)
#clustering.fit(df_hs.T[sample_order['human']])
clustering.fit(df_hs.T)

In [None]:
# Cluster label of every human-specific gene, sorted by label so that genes
# of the same cluster are contiguous.
labels = pd.Series(clustering.labels_, index=hs_genes).sort_values()
# Gene order used by the plots below.
# NOTE(review): `labels` is already sorted, so this second sort looks
# redundant; `labels.index` may be what was meant — confirm.
order = labels.sort_values().index

In [None]:
labels.to_csv(f'./results/human_clusters_edgeR_{n_clusters}.csv')

In [None]:
cluster_color = ['red', 'yellow', 'orange', 'green', 'purple', 'blue', 'olive', 'pink', 'brown', 'cyan', 'pink', 'violet', 'crimson']

In [None]:
colors = cluster_color[:n_clusters]
clusters = np.arange(n_clusters)

In [None]:
# Heatmap of the human-specific genes (columns in `order`) with two
# annotation bars: cluster membership on top, species on the left.
# NOTE(review): `bbox_artist` does not appear to be used in this cell.
from matplotlib.patches import bbox_artist
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib import ticker

mpl.rcdefaults()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df_hs[order], ax=ax, cmap="RdBu_r", center=0, cbar=True)
#ax.set_xticks([])
ax.set_yticks([])


# Top bar: one segment per cluster, its width proportional to the cluster size.
divider = make_axes_locatable(ax)
cax = divider.append_axes('top', size='2%', pad=0.05)
cmap = plt.get_cmap('Set3')

# Segment boundaries are the cumulative cluster sizes, starting at 0.
cluster_size = labels.value_counts().loc[clusters].values
cluster_pos = cluster_size.cumsum()
bounds = [0] + list(cluster_pos)
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
fig.colorbar(
    mpl.cm.ScalarMappable(cmap=cmap, norm=norm),
    cax=cax,
    ticks=bounds,
    orientation='horizontal',
    spacing='proportional'
)
# Put one tick, labelled with the cluster id, at the centre of each segment.
cax.xaxis.set_major_locator(ticker.FixedLocator(cluster_pos - cluster_size / 2))
cax.xaxis.set_major_formatter(ticker.FixedFormatter(clusters))
cax.xaxis.tick_top()

# Left bar (vertical): one segment per species, sized by its sample count.
cax = divider.append_axes('left', size='2%', pad=0.05)

# Colours, counts and tick labels are all given in reversed species order.
# NOTE(review): presumably because the vertical bar runs bottom-to-top while
# the heatmap rows run top-to-bottom — confirm.
cmap = mpl.colors.ListedColormap(['tab:red', 'tab:orange', 'tab:green'][::-1])
cond_size = adata_pb_norm.obs.condition.value_counts().loc[['human', 'chimp', 'macaque'][::-1]].values
cond_pos = cond_size.cumsum()
bounds = [0] + list(cond_pos)
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

fig.colorbar(
    mpl.cm.ScalarMappable(cmap=cmap, norm=norm),
    cax=cax,
    ticks=bounds,
    orientation='vertical',
    spacing='proportional'
)

# NOTE(review): the tick offset here is size / 3, whereas the cluster bar
# uses size / 2 (segment centre) — confirm this is intended.
cax.yaxis.set_major_locator(ticker.FixedLocator(cond_pos - cond_size / 3))
cax.yaxis.set_major_formatter(ticker.FixedFormatter(['human', 'chimp', 'macaque'][::-1]))
cax.yaxis.tick_left()
cax.set_yticklabels(cax.get_yticklabels(), rotation=90, fontsize=12)

In [None]:
# Gene names of every cluster, keyed by cluster label.
gene_sets = {
    label: labels[labels == label].index.to_list()
    for label in labels.unique()
}

for group, genes in gene_sets.items():
    print(f'{group}: {genes}')

In [None]:
from scipy.cluster.hierarchy import average, leaves_list
from scipy.spatial.distance import pdist

fig, ax = plt.subplots(1, 2, figsize=(20, 10))
aff_matrix = pairwise_kernels(df_hs.T, metric='cosine')
aff_df = pd.DataFrame(aff_matrix, index=df_hs.columns, columns=df_hs.columns)
sns.heatmap(aff_df.loc[order, order], cmap="RdBu_r", center=0, cbar=True, ax=ax[0], vmin=-1, vmax=1)

max_order = []

for label in range(n_clusters):
    genes = labels[labels == label].index
    linkage = average(pdist(df_hs[genes].T.values, 'cosine'))
    ind_order = leaves_list(linkage)
    gene_order = genes[ind_order]
    max_order += gene_order.to_list()

sns.heatmap(aff_df.loc[max_order, max_order], cmap="RdBu_r", center=0, cbar=True, ax=ax[1], vmin=-1, vmax=1)

## Cluster profiles

In [None]:
df_annotation = df_hs.copy()
df_annotation['layer'] = adata_pb_norm.obs.layer
df_annotation['condition'] = adata_pb_norm.obs.condition

In [None]:
layer_mean = df_annotation.groupby(['condition', 'layer']).mean()
layer_mean.head()

In [None]:
# fig, axes = plt.subplots(2, 2,  figsize=(14, 3.5 * 2), gridspec_kw={'hspace': 0.4})
# for gene, ax in zip(layer_mean.columns, axes.flatten()):
#     (
#             layer_mean[gene]
#             .reorder_levels(['layer', 'condition'])
#             .unstack().loc[layers]
#             .plot
#             .line(color={'human': 'tab:red', 'chimp': 'tab:orange', 'macaque': 'tab:green'}, ax=ax, marker='.')
#     )
#     ax.grid(False)

In [None]:
mpl.rcdefaults()
fig, axes = plt.subplots((n_clusters + 1) // 2, 2,  figsize=(14, 3.5 * n_clusters // 2), gridspec_kw={'hspace': 0.4})
for label, ax in zip(labels.unique(), axes.flatten()):
    genes = labels[labels == label].index.tolist()

    (
        layer_mean[genes]
        .mean(axis=1)
        .reorder_levels(['layer', 'condition'])
        .unstack().loc[layers]
        .plot
        .line(color={'human': 'tab:red', 'chimp': 'tab:orange', 'macaque': 'tab:green'}, ax=ax, marker='.')
    )
    
    ax.legend(fontsize=10, loc="upper left", bbox_to_anchor=(1, 1), prop={"size":12})
    if label % 2 == 0:
        ax.get_legend().remove()
    ax.set_title(f'Cluster {label}', fontsize=16)
    ax.grid(False)
    #ax.hlines(0, xmin=0, xmax=6, colors='gray', linestyles='dashed')

In [None]:
grid = np.linspace(1, 7, 400)
conditions = ['human', 'chimp', 'macaque']
cond_colors = ['tab:red', 'tab:orange', 'tab:green']

In [None]:
df_cont = df_annotation.copy()
df_cont['layer'] = df_cont.layer.map(dict(zip(sorted(df_cont.layer.unique()), range(1, 8)))).astype(int)

In [None]:
from scipy.interpolate import UnivariateSpline

# For every species and gene, fit a cubic UnivariateSpline of expression
# against the numeric layer and evaluate it on `grid`. The result is one
# block of `grid.size` rows per species, concatenated into `df_spline`.
df_list = []

for specie in conditions:
    # The species subset and its x-values are identical for every gene,
    # so they are built once per species rather than once per gene.
    subs_index = df_cont[df_cont.condition == specie].index
    targ_dataset = df_cont.loc[subs_index].sort_values(by='layer')
    x = targ_dataset['layer'].values

    # Fresh buffer per species: rows are grid points, columns are genes.
    array = np.zeros((grid.size, hs_genes.size))
    for i, gene in enumerate(hs_genes.to_list()):
        # fit spline
        y = targ_dataset[gene].values
        spl = UnivariateSpline(x, y, k=3)
        array[:, i] = spl(grid)

    df_specie = pd.DataFrame(array, columns=hs_genes)
    df_specie['condition'] = specie
    df_list.append(df_specie)
df_spline = pd.concat(df_list)

In [None]:
mpl.rcdefaults()
fig, axes = plt.subplots((n_clusters + 1) // 2, 2,  figsize=(14, 3.5 * n_clusters // 2), gridspec_kw={'hspace': 0.4})
for label, ax in zip(labels.unique(), axes.flatten()):
    genes = labels[labels == label].index.tolist()
    # plot the splines
    genes = labels[labels == label].index.tolist()
    df_cluster_spline = df_spline[genes].mean(axis=1).to_frame()
    df_cluster_spline['condition'] = df_spline['condition']
    df_cluster_spline = df_cluster_spline.pivot(columns='condition', values=0)
    df_cluster_spline.index = grid
    df_cluster_spline.plot.line(color={'human': 'tab:red', 'chimp': 'tab:orange', 'macaque': 'tab:green'}, ax=ax)

    # plot the dots
    # df_layers = layer_mean[genes].mean(axis=1).reorder_levels(['layer', 'condition']).unstack().loc[layers][conditions]
    # df_layers.index = np.arange(1, 8)
    # ax.set_prop_cycle(color=['tab:red', 'tab:orange', 'tab:green'])
    # ax.plot(df_layers, 'o', alpha=0.7)
    # ax.legend(conditions)
    
    
    ax.legend(fontsize=10, loc="upper left", bbox_to_anchor=(1, 1), prop={"size":12})
    ax.xaxis.set_major_locator(ticker.FixedLocator(np.arange(1, 8)))
    ax.xaxis.set_major_formatter(ticker.FixedFormatter(layers))
    
    if label % 2 == 0:
        ax.get_legend().remove()
    ax.set_title(f'Cluster {label}', fontsize=16)
    ax.grid(False)
    #ax.hlines(0, xmin=1, xmax=6, colors='gray', linestyles='dashed')

# 4 Species

In [None]:
pb_dir_path = '../../data/pseudobulks/'
adata_pb_4spe = sc.read_h5ad(pb_dir_path + '4spe_pb_mean_filt_nor_mean.h5ad')
#adata_pb_norm_mean.var = anova_res
adata_pb_4spe

In [None]:
df_4spe = pd.DataFrame(adata_pb_4spe.X, index=adata_pb_4spe.obs_names, columns=adata_pb_4spe.var_names)

In [None]:
hs_genes_inter = df_4spe.columns.intersection(hs_genes)
hs_genes_inter.size

In [None]:
sample_order = dict()
for cond in ['human', 'chimp', 'macaque', 'treeshrew']:
    samples = df_4spe.loc[df_4spe.index.str.contains(cond)].index
    order_s = sorted(samples.to_list(), key=lambda x: x[:2])
    order_laminar = list(filter(lambda x: 'L' in x, order_s))
    order_wm = list(filter(lambda x: 'WM' in x, order_s))
    sample_order[cond] = order_wm + order_laminar
sample_order['all'] = sample_order['human'] + sample_order['chimp'] + sample_order['macaque'] + sample_order['treeshrew']
df_4spe = df_4spe.loc[sample_order['all']]
df_4spe_hs = df_4spe[hs_genes_inter]

In [None]:
order_4spe = order.intersection(hs_genes_inter)

In [None]:
from matplotlib.patches import bbox_artist
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib import ticker

# Heatmap of the human-specific genes present in the 4-species data
# (columns in `order_4spe`), with a cluster-membership bar on top.
mpl.rcdefaults()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df_4spe_hs[order_4spe], ax=ax, cmap="RdBu_r", vmax=0.5, vmin=-0.5, center=0)
divider = make_axes_locatable(ax)
cax = divider.append_axes('top', size='2%', pad=0.05)

# Size the segments from the genes that are actually plotted. Counting all
# of `labels` misaligns the bar whenever some genes are missing from the
# 4-species data.
plotted_counts = labels.loc[order_4spe].value_counts().reindex(clusters, fill_value=0)
# Drop clusters with no plotted genes: BoundaryNorm needs increasing edges.
plotted_counts = plotted_counts[plotted_counts > 0]
shown_clusters = plotted_counts.index.to_numpy()
cluster_size = plotted_counts.values

# Keep each cluster's own colour even if some clusters were dropped.
cmap = mpl.colors.ListedColormap([colors[cluster] for cluster in shown_clusters])

cluster_pos = cluster_size.cumsum()
bounds = [0] + list(cluster_pos)
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
fig.colorbar(
    mpl.cm.ScalarMappable(cmap=cmap, norm=norm),
    cax=cax,
    ticks=bounds,
    orientation='horizontal',
    spacing='proportional'
)
# One tick, labelled with the cluster id, at the centre of each segment.
cax.xaxis.set_major_locator(ticker.FixedLocator(cluster_pos - cluster_size / 2))
cax.xaxis.set_major_formatter(ticker.FixedFormatter(shown_clusters))
cax.xaxis.tick_top()

## Cluster profiles

In [None]:
df_annotation = df_4spe_hs.copy()
df_annotation['layer'] = adata_pb_4spe.obs.layer
df_annotation['condition'] = adata_pb_4spe.obs.condition

In [None]:
layer_mean = df_annotation.groupby(['condition', 'layer']).mean()
layer_mean.head()

In [None]:
fig, axes = plt.subplots((n_clusters + 1) // 2, 2,  figsize=(14, 3.5 * n_clusters // 2), gridspec_kw={'hspace': 0.4})
sns.set(font_scale=1)
for label, ax in zip(labels.unique(), axes.flatten()):
    genes = labels[labels == label].index.intersection(hs_genes_inter).tolist()

    (
        layer_mean[genes]
        .mean(axis=1)
        .reorder_levels(['layer', 'condition'])
        .unstack().loc[layers]
        .plot
        .line(color={'human': 'tab:red', 'chimp': 'tab:orange', 'macaque': 'tab:green', 'treeshrew': 'tab:blue'}, ax=ax, marker='.')
    )
    ax.legend(fontsize=10)
    ax.set_title(f'Cluster {label}', fontsize=16)
    ax.grid(False)
    ax.hlines(0, xmin=0, xmax=6, colors='gray', linestyles='dashed')

fig.suptitle('Mean averaging for profiles', fontsize=20)

# Functional analysis

In [None]:
import gseapy
#gseapy.get_library_name()

In [None]:
enr_bg = gseapy.enrichr(gene_list=hs_genes.to_list(),
                        gene_sets=['GO_Biological_Process_2021', 'Azimuth_Cell_Types_2021', 'SynGO_2022', 'KEGG_2021_Human'],
                        outdir=None, # don't write to disk
                        background=background_genes
                       ).results

In [None]:
enr_bg[enr_bg['Adjusted P-value'] < 0.05].index.size

In [None]:
# categorical scatterplot
try:
    ax = gseapy.dotplot(enr_bg,
                column="Adjusted P-value",
                x='Gene_set', # set x axis, so you could do a multi-sample/library comparison
                size=3,
                top_term=6,
                figsize=(5,7),
                title = "Enrichment",
                xticklabels_rot=15, # rotate xtick labels
                show_ring=True, # set to False to remove outer ring
                marker='o',
                )

    ax.tick_params(axis='both', which='major', labelsize=10)
except Exception as e:
    # Best-effort plot: report the problem and let the notebook continue.
    # (`except e:` referenced an undefined name and raised NameError.)
    print(e)

In [None]:
gene_clust = list()
enrich_results = list()

for label in tqdm(sorted(labels.unique())):
    genes = labels[labels == label].index.tolist()
    descr = f'Genes for spectral clustering ({n_clusters} clusters) of human-specific genes. Cluster label: {label}'
    gene_clust.append(GeneCluster(genes, descr, label, background_genes))
    time.sleep(1)

## GO_Biological_Process_2023

In [None]:
db_name = 'GO_Biological_Process_2023'
GeneCluster.enrich_geneclusters(gene_clust,db_name )

In [None]:
# Stack the per-cluster result tables for `db_name` and index them by term.
enrich_res = pd.concat([gc.enrichment_res[db_name] for gc in gene_clust])
enrich_res = enrich_res.set_index(['Term'])
# Prefix the labels so they read as text on the plot axis
# (the prefix was misspelled 'culster').
enrich_res['cluster_label'] = 'cluster' + enrich_res['cluster_label'].astype(str)
enrich_res.head()

In [None]:
terms = enrich_res[(enrich_res['Adjusted P-value'] < 0.05) & (enrich_res.Rank < 10)].index.unique()
terms.size

In [None]:
from pandas.api.types import CategoricalDtype
enrich_res_plot = enrich_res.loc[terms].copy()
cat_type = CategoricalDtype(categories=['No significance', 'p < 0.05', 'p < 0.01', 'p < 0.001'], ordered=True)
enrich_res_plot['p-value'] = enrich_res_plot['Adjusted P-value'].apply(p_val_group).astype(cat_type)
enrich_results.append(enrich_res_plot)

In [None]:
mpl.rcdefaults()
fig = plt.figure(figsize=(7, 10), dpi=100)

ax = sns.scatterplot(
    data=enrich_res_plot.reset_index(),
    x='cluster_label',
    y='Term',
    size='num_overlap_genes',
    sizes=(20, 250),
    hue='p-value',
    palette=color_discrete_map
)

ax.tick_params(labelsize=10)
ax.legend(fontsize=15)
ax.set_xlabel("Cluster label",fontsize=15)
ax.set_ylabel("", fontsize=10)
ax.grid(False)
ax.set_title(db_name)

sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

x_ticks = ax.get_xticklabels()
x_ticks_pos = {text_obj.get_text():text_obj.get_position()[0] for text_obj in x_ticks}

y_ticks = ax.get_yticklabels()
y_ticks_pos = {text_obj.get_text():text_obj.get_position()[1] for text_obj in y_ticks}

text_df = enrich_res_plot.reset_index().copy()
for i in range(text_df.shape[0]):
    text_df_sub = text_df.iloc[i, :].to_dict()
    x=x_ticks_pos[str(text_df_sub['cluster_label'])]
    y=y_ticks_pos[text_df_sub['Term']]
    text = ','.join(text_df_sub['Overlapping genes'])
    #ax.annotate(text, (x, y), xycoords='data', xytext=(x + 0.1, y - 0.1), textcoords='data', arrowprops={'arrowstyle': '-'})
    ax.text(x + 0.05, y - 0.1, text, fontsize=7)

## GO_Molecular_Function_2023

In [None]:
db_name = 'GO_Molecular_Function_2023'
GeneCluster.enrich_geneclusters(gene_clust,db_name )

In [None]:
enrich_res = pd.concat([gc.enrichment_res[db_name] for gc in gene_clust])
enrich_res = enrich_res.set_index(['Term'])
enrich_res.head()

In [None]:
terms = enrich_res[(enrich_res['Adjusted P-value'] < 0.05) & (enrich_res.Rank < 20)].index.unique()
terms.size

In [None]:
enrich_res[enrich_res.cluster_label == 5]

In [None]:
from pandas.api.types import CategoricalDtype
enrich_res_plot = enrich_res.loc[terms].copy()
cat_type = CategoricalDtype(categories=['No significance', 'p < 0.05', 'p < 0.01', 'p < 0.001'], ordered=True)
enrich_res_plot['p-value'] = enrich_res_plot['Adjusted P-value'].apply(p_val_group).astype(cat_type)
enrich_results.append(enrich_res_plot)

In [None]:
mpl.rcdefaults()
fig = plt.figure(figsize=(7, 10), dpi=100)

ax = sns.scatterplot(
    data=enrich_res_plot.reset_index(),
    x='cluster_label',
    y='Term',
    size='num_overlap_genes',
    sizes=(20, 250),
    hue='p-value',
    palette=color_discrete_map
)

ax.tick_params(labelsize=10)
ax.legend(fontsize=15)
ax.set_xlabel("Cluster label",fontsize=15)
ax.set_ylabel("", fontsize=10)
ax.grid(False)
ax.set_title(db_name)

sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

## GO_Cellular_Component_2023

In [None]:
db_name = 'GO_Cellular_Component_2023'
GeneCluster.enrich_geneclusters(gene_clust,db_name )

In [None]:
enrich_res = pd.concat([gc.enrichment_res[db_name] for gc in gene_clust])
enrich_res = enrich_res.set_index(['Term'])
enrich_res.head()

In [None]:
terms = enrich_res[(enrich_res['Adjusted P-value'] < 0.05) & (enrich_res.Rank < 20)].index.unique()
terms.size

In [None]:
enrich_res[enrich_res.cluster_label == 5]

In [None]:
from pandas.api.types import CategoricalDtype
enrich_res_plot = enrich_res.loc[terms].copy()
cat_type = CategoricalDtype(categories=['No significance', 'p < 0.05', 'p < 0.01', 'p < 0.001'], ordered=True)
enrich_res_plot['p-value'] = enrich_res_plot['Adjusted P-value'].apply(p_val_group).astype(cat_type)
enrich_results.append(enrich_res_plot)

In [None]:
mpl.rcdefaults()
fig = plt.figure(figsize=(7, 10), dpi=100)

ax = sns.scatterplot(
    data=enrich_res_plot.reset_index(),
    x='cluster_label',
    y='Term',
    size='num_overlap_genes',
    sizes=(20, 250),
    hue='p-value',
    palette=color_discrete_map
)

ax.tick_params(labelsize=10)
ax.legend(fontsize=15)
ax.set_xlabel("Cluster label",fontsize=15)
ax.set_ylabel("", fontsize=10)
ax.grid(False)
ax.set_title(db_name)

sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

## WikiPathway_2023_Human

In [None]:
db_name = 'WikiPathway_2023_Human'
GeneCluster.enrich_geneclusters(gene_clust,db_name )

In [None]:
enrich_res = pd.concat([gc.enrichment_res[db_name] for gc in gene_clust])
enrich_res = enrich_res.set_index(['Term'])
enrich_res.head()

In [None]:
terms = enrich_res[(enrich_res['Adjusted P-value'] < 0.05) & (enrich_res.Rank < 20)].index.unique()
terms.size

In [None]:
enrich_res[enrich_res.cluster_label == 5]

In [None]:
from pandas.api.types import CategoricalDtype
enrich_res_plot = enrich_res.loc[terms].copy()
cat_type = CategoricalDtype(categories=['No significance', 'p < 0.05', 'p < 0.01', 'p < 0.001'], ordered=True)
enrich_res_plot['p-value'] = enrich_res_plot['Adjusted P-value'].apply(p_val_group).astype(cat_type)
enrich_results.append(enrich_res_plot)

In [None]:
mpl.rcdefaults()
fig = plt.figure(figsize=(7, 10), dpi=100)

ax = sns.scatterplot(
    data=enrich_res_plot.reset_index(),
    x='cluster_label',
    y='Term',
    size='num_overlap_genes',
    sizes=(20, 250),
    hue='p-value',
    palette=color_discrete_map
)

ax.tick_params(labelsize=10)
ax.legend(fontsize=15)
ax.set_xlabel("Cluster label",fontsize=15)
ax.set_ylabel("", fontsize=10)
ax.grid(False)
ax.set_title(db_name)

sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

## Azimuth Cell Types 2021


In [None]:
db_name = 'Azimuth_Cell_Types_2021'
GeneCluster.enrich_geneclusters(gene_clust, db_name)

In [None]:
enrich_res = pd.concat([gc.enrichment_res[db_name] for gc in gene_clust])
enrich_res = enrich_res.set_index(['Term'])
enrich_res.head()

In [None]:
terms = enrich_res[(enrich_res['Adjusted P-value'] < 0.05) & (enrich_res.Rank < 10)].index.unique()
terms.size

In [None]:
from pandas.api.types import CategoricalDtype
enrich_res_plot = enrich_res.loc[terms].copy()
cat_type = CategoricalDtype(categories=['No significance', 'p < 0.05', 'p < 0.01', 'p < 0.001'], ordered=True)
enrich_res_plot['p-value'] = enrich_res_plot['Adjusted P-value'].apply(p_val_group).astype(cat_type)
enrich_results.append(enrich_res_plot)

In [None]:
mpl.rcdefaults()
fig = plt.figure(figsize=(7, 10), dpi=120)

ax = sns.scatterplot(
    data=enrich_res_plot.reset_index(),
    x='cluster_label',
    y='Term',
    size='num_overlap_genes',
    sizes=(20, 250),
    hue='p-value',
    palette=color_discrete_map
)

ax.tick_params(labelsize=10)
ax.legend(fontsize=15)
ax.set_xlabel("Cluster label",fontsize=15)
ax.set_ylabel("", fontsize=10)
ax.grid(False)
ax.set_title(db_name)

sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

## Azimuth 2023


In [None]:
db_name = 'Azimuth_2023'
GeneCluster.enrich_geneclusters(gene_clust, db_name)

In [None]:
enrich_res = pd.concat([gc.enrichment_res[db_name] for gc in gene_clust])
enrich_res = enrich_res.set_index(['Term'])
enrich_res.head()

In [None]:
terms = enrich_res[(enrich_res['Adjusted P-value'] < 0.05) & (enrich_res.Rank < 10)].index.unique()
terms.size

In [None]:
from pandas.api.types import CategoricalDtype
enrich_res_plot = enrich_res.loc[terms].copy()
cat_type = CategoricalDtype(categories=['No significance', 'p < 0.05', 'p < 0.01', 'p < 0.001'], ordered=True)
enrich_res_plot['p-value'] = enrich_res_plot['Adjusted P-value'].apply(p_val_group).astype(cat_type)
enrich_results.append(enrich_res_plot)

In [None]:
mpl.rcdefaults()
fig = plt.figure(figsize=(7, 10), dpi=120)

ax = sns.scatterplot(
    data=enrich_res_plot.reset_index(),
    x='cluster_label',
    y='Term',
    size='num_overlap_genes',
    sizes=(20, 250),
    hue='p-value',
    palette=color_discrete_map
)

ax.tick_params(labelsize=10)
ax.legend(fontsize=15)
ax.set_xlabel("Cluster label",fontsize=15)
ax.set_ylabel("", fontsize=10)
ax.grid(False)
ax.set_title(db_name)

sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

## Allen_Brain_Atlas_10x_scRNA_2021


In [None]:
db_name = 'Allen_Brain_Atlas_10x_scRNA_2021'
GeneCluster.enrich_geneclusters(gene_clust, db_name)

In [None]:
enrich_res = pd.concat([gc.enrichment_res[db_name] for gc in gene_clust])
enrich_res = enrich_res.set_index(['Term'])
enrich_res.head()

In [None]:
terms = enrich_res[(enrich_res['Adjusted P-value'] < 0.05) & (enrich_res.Rank < 10)].index.unique()
terms.size

In [None]:
from pandas.api.types import CategoricalDtype
enrich_res_plot = enrich_res.loc[terms].copy()
cat_type = CategoricalDtype(categories=['No significance', 'p < 0.05', 'p < 0.01', 'p < 0.001'], ordered=True)
enrich_res_plot['p-value'] = enrich_res_plot['Adjusted P-value'].apply(p_val_group).astype(cat_type)
enrich_results.append(enrich_res_plot)

In [None]:
mpl.rcdefaults()
fig = plt.figure(figsize=(7, 10), dpi=120)

ax = sns.scatterplot(
    data=enrich_res_plot.reset_index(),
    x='cluster_label',
    y='Term',
    size='num_overlap_genes',
    sizes=(20, 250),
    hue='p-value',
    palette=color_discrete_map
)

ax.tick_params(labelsize=10)
ax.legend(fontsize=15)
ax.set_xlabel("Cluster label",fontsize=15)
ax.set_ylabel("", fontsize=10)
ax.grid(False)
ax.set_title(db_name)

sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

## Reactome_2022

In [None]:
db_name = 'Reactome_2022'
GeneCluster.enrich_geneclusters(gene_clust, db_name)

In [None]:
enrich_res = pd.concat([gc.enrichment_res[db_name] for gc in gene_clust])
enrich_res = enrich_res.set_index(['Term'])
enrich_res.head()

In [None]:
terms = enrich_res[(enrich_res['Adjusted P-value'] < 0.05)].index.unique()
terms.size

In [None]:
from pandas.api.types import CategoricalDtype

# Ordered significance bins used for the plot legend.
# NOTE(review): p_val_group is assumed to return exactly these labels; anything else becomes NaN.
cat_type = CategoricalDtype(
    categories=['No significance', 'p < 0.05', 'p < 0.01', 'p < 0.001'],
    ordered=True,
)

# Keep every row of the selected terms and attach its significance bin.
enrich_res_plot = enrich_res.loc[terms].copy()
enrich_res_plot['p-value'] = (
    enrich_res_plot['Adjusted P-value']
    .apply(p_val_group)
    .astype(cat_type)
)
enrich_results.append(enrich_res_plot)

In [None]:
# Dot plot of the selected terms per gene cluster: dot size is the number of
# overlapping genes, dot colour is the adjusted p-value bin.
# Reset rcParams first, as the sibling enrichment plots do, so this figure does not
# depend on which cell happened to run before it.
mpl.rcdefaults()
fig = plt.figure(figsize=(7, 10), dpi=120)

ax = sns.scatterplot(
    data=enrich_res_plot.reset_index(),
    x='cluster_label',
    y='Term',
    size='num_overlap_genes',
    sizes=(20, 250),
    hue='p-value',
    palette=color_discrete_map
)

ax.tick_params(labelsize=10)
# Rebuild the legend with a larger font before it is moved outside the axes.
ax.legend(fontsize=15)
ax.set_xlabel("Cluster label", fontsize=15)
ax.set_ylabel("", fontsize=10)
ax.grid(False)
ax.set_title(db_name)

sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

## SynGO_2022

In [None]:
# Select the SynGO 2022 gene-set library and run enrichment on all gene clusters.
# NOTE(review): enrich_geneclusters presumably stores its output in each cluster's
# `enrichment_res[db_name]` (that is what the next cell reads) — confirm in GeneCluster.
db_name = 'SynGO_2022'
GeneCluster.enrich_geneclusters(gene_clust, db_name)

In [None]:
# Stack the per-cluster enrichment tables for the current library and index rows by term.
enrich_res = pd.concat(
    [cluster.enrichment_res[db_name] for cluster in gene_clust]
).set_index(['Term'])
enrich_res.head()

In [None]:
# Inspect the enrichment rows that belong to cluster 2.
enrich_res.loc[enrich_res['cluster_label'] == 2]

In [None]:
# Terms that reach adjusted p < 0.05 in at least one row (no rank cut-off for this library).
terms = enrich_res.loc[enrich_res['Adjusted P-value'] < 0.05].index.unique()
terms.size

In [None]:
from pandas.api.types import CategoricalDtype

# Ordered significance bins used for the plot legend.
# NOTE(review): p_val_group is assumed to return exactly these labels; anything else becomes NaN.
cat_type = CategoricalDtype(
    categories=['No significance', 'p < 0.05', 'p < 0.01', 'p < 0.001'],
    ordered=True,
)

# Keep every row of the selected terms and attach its significance bin.
enrich_res_plot = enrich_res.loc[terms].copy()
enrich_res_plot['p-value'] = (
    enrich_res_plot['Adjusted P-value']
    .apply(p_val_group)
    .astype(cat_type)
)
enrich_results.append(enrich_res_plot)

In [None]:
# Dot plot of the selected terms per gene cluster: dot size is the number of
# overlapping genes, dot colour is the adjusted p-value bin.
mpl.rcdefaults()
fig = plt.figure(figsize=(7, 10), dpi=120)

ax = sns.scatterplot(
    data=enrich_res_plot.reset_index(),
    x='cluster_label', y='Term',
    hue='p-value', palette=color_discrete_map,
    size='num_overlap_genes', sizes=(20, 250),
)

# Axis cosmetics
ax.set_title(db_name)
ax.set_xlabel("Cluster label", fontsize=15)
ax.set_ylabel("", fontsize=10)
ax.tick_params(labelsize=10)
ax.grid(False)

# Rebuild the legend with a larger font, then move it outside the axes.
ax.legend(fontsize=15)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

## MGI_Mammalian_Phenotype_Level_4_2021

In [None]:
# Select the MGI Mammalian Phenotype (level 4, 2021) library and run enrichment on all gene clusters.
# NOTE(review): enrich_geneclusters presumably stores its output in each cluster's
# `enrichment_res[db_name]` (that is what the next cell reads) — confirm in GeneCluster.
db_name = 'MGI_Mammalian_Phenotype_Level_4_2021'
GeneCluster.enrich_geneclusters(gene_clust, db_name)

In [None]:
# Stack the per-cluster enrichment tables for the current library and index rows by term.
enrich_res = pd.concat(
    [cluster.enrichment_res[db_name] for cluster in gene_clust]
).set_index(['Term'])
enrich_res.head()

In [None]:
# Terms that are significant (adjusted p < 0.05) with Rank below 10 in at least one row.
terms = enrich_res.loc[
    (enrich_res['Rank'] < 10) & (enrich_res['Adjusted P-value'] < 0.05)
].index.unique()
terms.size

In [None]:
from pandas.api.types import CategoricalDtype

# Ordered significance bins used for the plot legend.
# NOTE(review): p_val_group is assumed to return exactly these labels; anything else becomes NaN.
cat_type = CategoricalDtype(
    categories=['No significance', 'p < 0.05', 'p < 0.01', 'p < 0.001'],
    ordered=True,
)

# Keep every row of the selected terms and attach its significance bin.
enrich_res_plot = enrich_res.loc[terms].copy()
enrich_res_plot['p-value'] = (
    enrich_res_plot['Adjusted P-value']
    .apply(p_val_group)
    .astype(cat_type)
)
enrich_results.append(enrich_res_plot)

In [None]:
# Dot plot of the selected terms per gene cluster: dot size is the number of
# overlapping genes, dot colour is the adjusted p-value bin.
# Reset rcParams first, as the sibling enrichment plots do, so this figure does not
# depend on which cell happened to run before it.
mpl.rcdefaults()
fig = plt.figure(figsize=(7, 10), dpi=120)

ax = sns.scatterplot(
    data=enrich_res_plot.reset_index(),
    x='cluster_label',
    y='Term',
    size='num_overlap_genes',
    sizes=(20, 250),
    hue='p-value',
    palette=color_discrete_map
)

ax.tick_params(labelsize=10)
# Rebuild the legend with a larger font before it is moved outside the axes.
ax.legend(fontsize=15)
ax.set_xlabel("Cluster label", fontsize=15)
ax.set_ylabel("", fontsize=10)
ax.grid(False)
ax.set_title(db_name)

sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

## KEGG_2021_Human

In [None]:
# Select the KEGG 2021 Human gene-set library and run enrichment on all gene clusters.
# NOTE(review): enrich_geneclusters presumably stores its output in each cluster's
# `enrichment_res[db_name]` (that is what the next cell reads) — confirm in GeneCluster.
db_name = 'KEGG_2021_Human'
GeneCluster.enrich_geneclusters(gene_clust, db_name)

In [None]:
# Stack the per-cluster enrichment tables for the current library and index rows by term.
enrich_res = pd.concat(
    [cluster.enrichment_res[db_name] for cluster in gene_clust]
).set_index(['Term'])
enrich_res.head()

In [None]:
# Terms that reach adjusted p < 0.05 in at least one row (no rank cut-off for this library).
terms = enrich_res.loc[enrich_res['Adjusted P-value'] < 0.05].index.unique()
terms.size

In [None]:
from pandas.api.types import CategoricalDtype

# Ordered significance bins used for the plot legend.
# NOTE(review): p_val_group is assumed to return exactly these labels; anything else becomes NaN.
cat_type = CategoricalDtype(
    categories=['No significance', 'p < 0.05', 'p < 0.01', 'p < 0.001'],
    ordered=True,
)

# Keep every row of the selected terms and attach its significance bin.
enrich_res_plot = enrich_res.loc[terms].copy()
enrich_res_plot['p-value'] = (
    enrich_res_plot['Adjusted P-value']
    .apply(p_val_group)
    .astype(cat_type)
)
enrich_results.append(enrich_res_plot)

In [None]:
# Dot plot of the selected terms per gene cluster: dot size is the number of
# overlapping genes, dot colour is the adjusted p-value bin.
mpl.rcdefaults()
fig = plt.figure(figsize=(7, 10), dpi=120)

ax = sns.scatterplot(
    data=enrich_res_plot.reset_index(),
    x='cluster_label', y='Term',
    hue='p-value', palette=color_discrete_map,
    size='num_overlap_genes', sizes=(20, 250),
)

# Axis cosmetics
ax.set_title(db_name)
ax.set_xlabel("Cluster label", fontsize=15)
ax.set_ylabel("", fontsize=10)
ax.tick_params(labelsize=10)
ax.grid(False)

# Rebuild the legend with a larger font, then move it outside the axes.
ax.legend(fontsize=15)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

## UK_Biobank_GWAS_v1

In [None]:
# Select the UK Biobank GWAS v1 gene-set library and run enrichment on all gene clusters.
# NOTE(review): enrich_geneclusters presumably stores its output in each cluster's
# `enrichment_res[db_name]` (that is what the next cell reads) — confirm in GeneCluster.
db_name = 'UK_Biobank_GWAS_v1'
GeneCluster.enrich_geneclusters(gene_clust, db_name)

In [None]:
# Stack the per-cluster enrichment tables for the current library and index rows by term.
enrich_res = pd.concat(
    [cluster.enrichment_res[db_name] for cluster in gene_clust]
).set_index(['Term'])
enrich_res.head()

In [None]:
# Terms that are significant (adjusted p < 0.05) with Rank below 10 in at least one row.
terms = enrich_res.loc[
    (enrich_res['Rank'] < 10) & (enrich_res['Adjusted P-value'] < 0.05)
].index.unique()
terms.size

In [None]:
from pandas.api.types import CategoricalDtype

# Ordered significance bins used for the plot legend.
# NOTE(review): p_val_group is assumed to return exactly these labels; anything else becomes NaN.
cat_type = CategoricalDtype(
    categories=['No significance', 'p < 0.05', 'p < 0.01', 'p < 0.001'],
    ordered=True,
)

# Keep every row of the selected terms and attach its significance bin.
enrich_res_plot = enrich_res.loc[terms].copy()
enrich_res_plot['p-value'] = (
    enrich_res_plot['Adjusted P-value']
    .apply(p_val_group)
    .astype(cat_type)
)
enrich_results.append(enrich_res_plot)

In [None]:
# Dot plot of the selected terms per gene cluster: dot size is the number of
# overlapping genes, dot colour is the adjusted p-value bin.
mpl.rcdefaults()
fig = plt.figure(figsize=(7, 10), dpi=120)

ax = sns.scatterplot(
    data=enrich_res_plot.reset_index(),
    x='cluster_label', y='Term',
    hue='p-value', palette=color_discrete_map,
    size='num_overlap_genes', sizes=(20, 250),
)

# Axis cosmetics
ax.set_title(db_name)
ax.set_xlabel("Cluster label", fontsize=15)
ax.set_ylabel("", fontsize=10)
ax.tick_params(labelsize=10)
ax.grid(False)

# Rebuild the legend with a larger font, then move it outside the axes.
ax.legend(fontsize=15)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

# Profile visualization

In [None]:
specie = 'human'

# Load the raw spatial data for this species and de-duplicate observation names.
adata_sp = sc.read_h5ad(f'../../data/raw_adata/{specie}.h5ad')
adata_sp.obs_names_make_unique()

# Normalise each observation to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(adata_sp, target_sum=1e4)
sc.pp.log1p(adata_sp)

# Keep only the clustered genes.
# NOTE(review): `labels` is presumably a Series of cluster ids indexed by gene name — confirm.
adata_sp = adata_sp[:, labels.index]

# Scale genes within each sample separately, then stitch the samples back together.
scaled_adata_list = [
    sc.pp.scale(adata_sp[adata_sp.obs['sample_id'] == sample], copy=True)
    for sample in adata_sp.obs['sample_id'].unique()
]
scaled_adata = an.concat(scaled_adata_list, merge='same', uns_merge="unique")

# Mean scaled expression of each gene cluster, stored per observation as obs['Cluster<id>'].
for cluster in labels.unique():
    genes = labels[labels == cluster].index
    # np.asarray(...).ravel() keeps the column 1-D even if X.mean returns an (n, 1) matrix.
    scaled_adata.obs[f'Cluster{cluster}'] = np.asarray(scaled_adata[:, genes].X.mean(axis=1)).ravel()

In [None]:
# Grid of spatial maps: one column per cluster score, with 'label' in the last column.
fig, axes = plt.subplots(5, 7, figsize=(25, 15), gridspec_kw={'wspace': 0.5}, dpi=120)
# Only the unfiltered cluster-score columns: later cells plot '<name>_filtered' columns,
# and matching those too on a re-run would index past the last grid column.
names = [
    col for col in scaled_adata.obs.columns
    if 'Cluster' in col and not col.endswith('_filtered')
]
cmap = mpl.cm.coolwarm
# NOTE(review): these per-cluster limits are never passed to a norm in this cell — confirm
# whether they were meant to be used as vmin/vmax.
norm_dict = {
    'Cluster0': {'vmin': -0.3, 'vmax': 0.4},
    'Cluster1': {'vmin': -0.4, 'vmax': 0.35},
    'Cluster2': {'vmin': -0.6, 'vmax': 0.3},
    'Cluster3': {'vmin': -0.6, 'vmax': 0.5},
    'Cluster4': {'vmin': -0.4, 'vmax': 0.75},
    'Cluster5': {'vmin': -0.5, 'vmax': 0.5}
    }

# Define the norm here so the cell does not depend on a `norm` left over from another cell.
norm = mpl.colors.TwoSlopeNorm(vcenter=0)
sq.pl.spatial_scatter(scaled_adata,
                        color='label',
                        library_key='sample_id', ncols=5, img=True, size=1.3, cmap=cmap, norm=norm, fig=fig, ax=axes[:, -1])

for k, cluster in enumerate(names):
    # Fresh diverging norm centred on zero for each cluster column.
    norm = mpl.colors.TwoSlopeNorm(vcenter=0)
    sq.pl.spatial_scatter(scaled_adata,
                        color=cluster,
                        library_key='sample_id', ncols=5, img=True, size=1.3, cmap=cmap, norm=norm, fig=fig, ax=axes[:, k])

In [None]:
# Smooth the six cluster scores per sample with the 'wiener' method.
# NOTE(review): no `size` is passed here, unlike the other apply_filter calls — confirm the default.
apply_filter(
    scaled_adata,
    [f'Cluster{idx}' for idx in range(6)],
    'sample_id',
    method='wiener',
)

In [None]:
# Grid of spatial maps: one column per filtered cluster score, with 'label' in the last column.
fig, axes = plt.subplots(5, 7, figsize=(25, 15), gridspec_kw={'wspace': 0.5}, dpi=120)
names = scaled_adata.obs.columns[scaled_adata.obs.columns.str.contains('Cluster')].to_list()
cmap = mpl.cm.coolwarm
# The keys of this dict drive the loop below.
# NOTE(review): the vmin/vmax values are never passed to a norm — confirm whether they should be.
norm_dict = {
    'Cluster0': {'vmin': -0.3, 'vmax': 0.4},
    'Cluster1': {'vmin': -0.4, 'vmax': 0.25},
    'Cluster2': {'vmin': -0.6, 'vmax': 0.2},
    'Cluster3': {'vmin': -0.6, 'vmax': 0.3},
    'Cluster4': {'vmin': -0.4, 'vmax': 0.9},
    'Cluster5': {'vmin': -0.5, 'vmax': 0.4}
    }

# Define the norm here so the cell does not depend on a `norm` left over from another cell.
norm = mpl.colors.TwoSlopeNorm(vcenter=0)
sq.pl.spatial_scatter(scaled_adata,
                        color='label',
                        library_key='sample_id', ncols=5, img=True, size=1.3, cmap=cmap, norm=norm, fig=fig, ax=axes[:, -1])

for k, cluster in enumerate(norm_dict):
    # Fresh diverging norm centred on zero for each cluster column.
    norm = mpl.colors.TwoSlopeNorm(vcenter=0)
    sq.pl.spatial_scatter(scaled_adata,
                        color=cluster + '_filtered',
                        library_key='sample_id', ncols=5, img=True, size=1.3, cmap=cmap, norm=norm, fig=fig, ax=axes[:, k])

In [None]:
# Overview of the filtered cluster scores plus the 'label' column.
cmap = mpl.cm.coolwarm
norm = mpl.colors.TwoSlopeNorm(vcenter=0)

# Pass the zero-centred norm: it was built above but not handed to the plot, unlike
# the sibling overview cells.
sq.pl.spatial_scatter(scaled_adata,
                        color=['Cluster0_filtered', 'Cluster1_filtered', 'Cluster2_filtered', 'Cluster3_filtered', 'Cluster4_filtered', 'Cluster5_filtered', 'label'],
                        library_key='sample_id', ncols=7, img=True, size=1.3, cmap='coolwarm', norm=norm)

In [None]:
# Smooth the six cluster scores per sample with the 'median' method, size 5.
apply_filter(
    scaled_adata,
    [f'Cluster{idx}' for idx in range(6)],
    'sample_id',
    method='median',
    size=5,
)

In [None]:
# Overview of the filtered cluster scores plus the 'label' column, using a diverging
# colour scale centred on zero.
norm = mpl.colors.TwoSlopeNorm(vcenter=0)
cmap = mpl.cm.coolwarm

sq.pl.spatial_scatter(
    scaled_adata,
    color=[f'Cluster{idx}_filtered' for idx in range(6)] + ['label'],
    library_key='sample_id',
    ncols=7,
    img=True,
    size=1.3,
    cmap='coolwarm',
    norm=norm,
)

In [None]:
# Smooth the six cluster scores per sample with the 'mean' method, size 5.
apply_filter(
    scaled_adata,
    [f'Cluster{idx}' for idx in range(6)],
    'sample_id',
    method='mean',
    size=5,
)

In [None]:
# Overview of the filtered cluster scores plus the 'label' column, using a diverging
# colour scale centred on zero.
norm = mpl.colors.TwoSlopeNorm(vcenter=0)
cmap = mpl.cm.coolwarm

sq.pl.spatial_scatter(
    scaled_adata,
    color=[f'Cluster{idx}_filtered' for idx in range(6)] + ['label'],
    library_key='sample_id',
    ncols=7,
    img=True,
    size=1.3,
    cmap='coolwarm',
    norm=norm,
)

In [None]:
specie = 'macaque'

# Load the raw spatial data for this species and de-duplicate observation names.
adata_sp = sc.read_h5ad(f'../../data/raw_adata/{specie}.h5ad')
adata_sp.obs_names_make_unique()

# Normalise each observation to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(adata_sp, target_sum=1e4)
sc.pp.log1p(adata_sp)

# Keep only the clustered genes (the same gene set as used for the human data).
# NOTE(review): `labels` is presumably a Series of cluster ids indexed by gene name — confirm.
adata_sp = adata_sp[:, labels.index]

# Scale genes within each sample separately, then stitch the samples back together.
scaled_adata_list = [
    sc.pp.scale(adata_sp[adata_sp.obs['sample_id'] == sample], copy=True)
    for sample in adata_sp.obs['sample_id'].unique()
]
scaled_adata = an.concat(scaled_adata_list, merge='same', uns_merge="unique")

# Mean scaled expression of each gene cluster, stored per observation as obs['Cluster<id>'].
for cluster in labels.unique():
    genes = labels[labels == cluster].index
    # np.asarray(...).ravel() keeps the column 1-D even if X.mean returns an (n, 1) matrix.
    scaled_adata.obs[f'Cluster{cluster}'] = np.asarray(scaled_adata[:, genes].X.mean(axis=1)).ravel()

In [None]:
# Grid of spatial maps: one column per cluster score, with 'label' in the last column.
fig, axes = plt.subplots(4, 7, figsize=(25, 12), gridspec_kw={'wspace': 0.4, 'hspace': 0.2}, dpi=100)
# Only the unfiltered cluster-score columns: later cells plot '<name>_filtered' columns,
# and matching those too on a re-run would index past the last grid column.
names = [
    col for col in scaled_adata.obs.columns
    if 'Cluster' in col and not col.endswith('_filtered')
]
cmap = mpl.cm.coolwarm
# NOTE(review): these per-cluster limits are never passed to a norm in this cell — confirm
# whether they were meant to be used as vmin/vmax.
norm_dict = {
    'Cluster0': {'vmin': -0.4, 'vmax': 0.4},
    'Cluster1': {'vmin': -0.4, 'vmax': 0.3},
    'Cluster2': {'vmin': -0.5, 'vmax': 0.8},
    'Cluster3': {'vmin': -0.6, 'vmax': 0.5},
    'Cluster4': {'vmin': -0.5, 'vmax': 0.7},
    'Cluster5': {'vmin': -0.5, 'vmax': 0.5}
    }

# Define the norm here so the cell does not depend on a `norm` left over from another cell.
norm = mpl.colors.TwoSlopeNorm(vcenter=0)
# ncols=4 matches the four-row grid and the other macaque calls (this one previously passed 5).
sq.pl.spatial_scatter(scaled_adata,
                        color='label',
                        library_key='sample_id', ncols=4, img=True, size=1.3, cmap=cmap, norm=norm, fig=fig, ax=axes[:, -1])

for k, cluster in enumerate(names):
    # Fresh diverging norm centred on zero for each cluster column.
    norm = mpl.colors.TwoSlopeNorm(vcenter=0)
    sq.pl.spatial_scatter(scaled_adata,
                        color=cluster,
                        library_key='sample_id', ncols=4, img=True, size=1.3, cmap=cmap, norm=norm, fig=fig, ax=axes[:, k])

In [None]:
# Smooth the six cluster scores per sample with the 'wiener' method, size 5.
apply_filter(
    scaled_adata,
    [f'Cluster{idx}' for idx in range(6)],
    'sample_id',
    method='wiener',
    size=5,
)

In [None]:
# Grid of spatial maps: one column per filtered cluster score, with 'label' in the last column.
fig, axes = plt.subplots(4, 7, figsize=(25, 15), gridspec_kw={'wspace': 0.5}, dpi=120)
names = scaled_adata.obs.columns[scaled_adata.obs.columns.str.contains('Cluster')].to_list()
cmap = mpl.cm.coolwarm
# The keys of this dict drive the loop below.
# NOTE(review): the vmin/vmax values are never passed to a norm — confirm whether they should be.
norm_dict = {
    'Cluster0': {'vmin': -0.4, 'vmax': 0.3},
    'Cluster1': {'vmin': -0.4, 'vmax': 0.3},
    'Cluster2': {'vmin': -0.5, 'vmax': 0.7},
    'Cluster3': {'vmin': -0.6, 'vmax': 0.4},
    'Cluster4': {'vmin': -0.4, 'vmax': 0.6},
    'Cluster5': {'vmin': -0.5, 'vmax': 0.4}
    }

# Define the norm here so the cell does not depend on a `norm` left over from another cell.
norm = mpl.colors.TwoSlopeNorm(vcenter=0)
sq.pl.spatial_scatter(scaled_adata,
                        color='label',
                        library_key='sample_id', ncols=4, img=True, size=1.3, cmap=cmap, norm=norm, fig=fig, ax=axes[:, -1])

for k, cluster in enumerate(norm_dict):
    # Fresh diverging norm centred on zero for each cluster column.
    norm = mpl.colors.TwoSlopeNorm(vcenter=0)
    sq.pl.spatial_scatter(scaled_adata,
                        color=cluster + '_filtered',
                        library_key='sample_id', ncols=4, img=True, size=1.3, cmap=cmap, norm=norm, fig=fig, ax=axes[:, k])

In [None]:
# Overview of the filtered cluster scores plus the 'label' column, using a diverging
# colour scale centred on zero.
norm = mpl.colors.TwoSlopeNorm(vcenter=0)
cmap = mpl.cm.coolwarm

sq.pl.spatial_scatter(
    scaled_adata,
    color=[f'Cluster{idx}_filtered' for idx in range(6)] + ['label'],
    library_key='sample_id',
    ncols=7,
    img=True,
    size=1.3,
    cmap='coolwarm',
    norm=norm,
)

In [None]:
# Smooth the six cluster scores per sample with the 'median' method, size 5.
apply_filter(
    scaled_adata,
    [f'Cluster{idx}' for idx in range(6)],
    'sample_id',
    method='median',
    size=5,
)

In [None]:
# Overview of the filtered cluster scores plus the 'label' column, using a diverging
# colour scale centred on zero.
norm = mpl.colors.TwoSlopeNorm(vcenter=0)
cmap = mpl.cm.coolwarm

sq.pl.spatial_scatter(
    scaled_adata,
    color=[f'Cluster{idx}_filtered' for idx in range(6)] + ['label'],
    library_key='sample_id',
    ncols=7,
    img=True,
    size=1.3,
    cmap='coolwarm',
    norm=norm,
)

In [None]:
# Smooth the six cluster scores per sample with the 'mean' method, size 5.
apply_filter(
    scaled_adata,
    [f'Cluster{idx}' for idx in range(6)],
    'sample_id',
    method='mean',
    size=5,
)

In [None]:
# Overview of the filtered cluster scores plus the 'label' column, using a diverging
# colour scale centred on zero.
norm = mpl.colors.TwoSlopeNorm(vcenter=0)
cmap = mpl.cm.coolwarm

sq.pl.spatial_scatter(
    scaled_adata,
    color=[f'Cluster{idx}_filtered' for idx in range(6)] + ['label'],
    library_key='sample_id',
    ncols=7,
    img=True,
    size=1.3,
    cmap='coolwarm',
    norm=norm,
)