### Notebook for the functional comparison of CMC genotypes with `expimap`

#### Environment: scArches

- **Developed by:** Carlos Talavera-López Ph.D
- **Modified by:** Alexandra Cirnu
- **Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**
- **Date of creation:** 240216
- **Date of modification:** 240222

### Import required modules

In [None]:
import gdown
import torch
import anndata
import warnings
import numpy as np
import scipy as sp
import scanpy as sc
import pandas as pd
import scarches as sca
import matplotlib.pyplot as plt

### Set up working environment

In [None]:
# Scanpy session setup: verbosity level 3, a printout of package versions,
# and shared figure defaults used by the plots in this notebook.
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(
    dpi=180,
    dpi_save=300,
    color_map='magma_r',
    format='svg',
    vector_friendly=True,
)

In [None]:
# Ignore every Python warning from here on.
warnings.simplefilter(action = 'ignore')
# Inline figure backend: white figure face colour and 'retina' figure format.
%config InlineBackend.print_figure_kwargs = {'facecolor' : "w"}
%config InlineBackend.figure_format = 'retina'
# Set torch's float32 matmul precision setting to 'medium'.
torch.set_float32_matmul_precision('medium')

In [None]:
def X_is_raw(adata):
    """Return whether every per-column sum of ``adata.X`` is a whole number.

    The column sums are compared against their integer-cast copy; judging by
    the name, this serves as a quick check that ``adata.X`` holds raw counts.
    Only the sums are inspected, not the individual entries.
    """
    column_totals = adata.X.sum(axis=0)
    return np.array_equal(column_totals.astype(int), column_totals)

### Read in the whole dataset and split it into the control conditions `Pkp2_Ctr_PBS` / `Ttn_Ctr_noninf` (reference) and all other conditions (query)

In [None]:
# Load the merged AnnData object from disk; the bare name on the last line
# makes the notebook display its summary.
ACM = sc.read_h5ad(
    '/Users/alex/data/ACM_cardiac_leuco/processed_merged/ACM_6mpi_scANVI_cellstates_macrophages_ac240223.raw.h5ad'
)
ACM

In [None]:
# When the expression matrix is stored sparse, replace it with a dense
# float32 array; an already-dense matrix is left untouched.
if sp.sparse.issparse(ACM.X):
    ACM.X = ACM.X.toarray().astype(np.float32)

In [None]:
# Number of cells per value of the 'condition' column in the full dataset.
ACM.obs['condition'].value_counts()

In [None]:
# Reference set: cells whose 'condition' is one of the two control conditions.
# `.copy()` materialises the subset as an independent AnnData rather than a
# view of `ACM`: later cells modify `Ctr` in place (annotations, variable
# subsetting, a new layer), and warnings are silenced in this notebook, so
# an implicit view-to-copy conversion would otherwise happen unnoticed.
Ctr = ACM[ACM.obs['condition'].isin(['Pkp2_Ctr_PBS', 'Ttn_Ctr_noninf'])].copy()
Ctr

In [None]:
# Query set: every cell whose 'condition' is NOT one of the two control
# conditions used for the reference.
Affected = ACM[
    ~ACM.obs['condition'].isin(['Pkp2_Ctr_PBS', 'Ttn_Ctr_noninf'])
]
Affected

### Read the Reactome annotations

In [None]:
# Download the Reactome gene-set file from Google Drive and save it as
# 'reactome.gmt' in the working directory (progress output enabled).
url = 'https://drive.google.com/uc?id=1136LntaVr92G1MphGeMVcmpE0AqcqM6c'
output = 'reactome.gmt'
gdown.download(url=url, output=output, quiet=False)

### Prepare reference data with ReactomeDB pathways

In [None]:
# Annotate the reference with the gene sets from 'reactome.gmt'.
# Later cells read `Ctr.varm['I']` and `Ctr.uns['terms']` after this call.
sca.utils.add_annotations(
    Ctr,
    'reactome.gmt',
    min_genes=6,
    clean=True,
)

- Remove all genes that are present in the data but absent in ReactomeDB

In [None]:
# Keep only variables whose row in `Ctr.varm['I']` has a positive sum,
# i.e. genes that belong to at least one annotation term.
Ctr._inplace_subset_var(Ctr.varm['I'].sum(axis=1) > 0)

### Calculate HVGs

In [None]:
# Snapshot the reference before HVG subsetting, and store the current
# matrix as the 'counts' layer that the HVG selection reads from.
ref_raw = Ctr.copy()
Ctr.layers['counts'] = Ctr.X.copy()

# Select 3000 highly variable genes (seurat_v3 flavour, batched by 'sample')
# and subset `Ctr` to them in place.
sc.pp.highly_variable_genes(
    Ctr,
    layer="counts",
    batch_key="sample",
    flavor="seurat_v3",
    n_top_genes=3000,
    span=1,
    subset=True,
)
Ctr

In [None]:
# Run the `X_is_raw` check on the HVG-subset reference object.
X_is_raw(Ctr)

- Filter out all annotations (terms) with 12 or fewer genes (only terms with more than 12 genes are kept).

In [None]:
# Boolean mask over terms: True where the column sum of `Ctr.varm['I']`
# exceeds 12. The same mask trims both the matrix columns and the term names.
select_terms = Ctr.varm['I'].sum(axis=0) > 12
Ctr.varm['I'] = Ctr.varm['I'][:, select_terms]
Ctr.uns['terms'] = np.array(Ctr.uns['terms'])[select_terms].tolist()

- Filter out genes not present in any of the terms after selection of HVGs.

In [None]:
# After the term filter, drop variables whose row in `Ctr.varm['I']` no
# longer has a positive sum (genes left without any retained term).
Ctr._inplace_subset_var(Ctr.varm['I'].sum(axis=1) > 0)

### Create expiMap model and train it on reference dataset

In [None]:
# Build the expiMap model on the reference data, using 'condition' as the
# condition key, three hidden layers of 256 units and the 'nb' reconstruction loss.
intr_cvae = sca.models.EXPIMAP(
    adata=Ctr,
    condition_key='condition',
    recon_loss='nb',
    hidden_layer_sizes=[256, 256, 256],
)

In [None]:
# Value passed as `alpha` to `intr_cvae.train` in the reference training cell.
ALPHA = 0.7

In [None]:
# Early-stopping configuration handed to the trainer: the monitored metric is
# "val_unweighted_loss"; the remaining keys set the stopping patience and the
# learning-rate reduction schedule.
early_stopping_kwargs = dict(
    early_stopping_metric="val_unweighted_loss",
    threshold=0,
    patience=50,
    reduce_lr=True,
    lr_patience=13,
    lr_factor=0.1,
)

# Train the reference model for up to 400 epochs with a fixed seed.
intr_cvae.train(
    n_epochs=400,
    seed=1712,
    alpha=ALPHA,
    alpha_kl=0.5,
    alpha_epoch_anneal=100,
    weight_decay=0.,
    use_early_stopping=True,
    early_stopping_kwargs=early_stopping_kwargs,
    monitor_only_val=False,
)

In [None]:
# Passed as `mean` to every `get_latent` call in this notebook.
MEAN = False

In [None]:
# Store the reference model's latent representation (active terms only)
# under `Ctr.obsm['X_cvae']`.
Ctr.obsm['X_cvae'] = intr_cvae.get_latent(
    mean=MEAN,
    only_active=True,
)

### Plot latent space of the reference dataset

In [None]:
# Neighbour graph on the 'X_cvae' latent representation, then a UMAP
# embedding with a fixed random state.
sc.pp.neighbors(
    Ctr,
    use_rep="X_cvae",
    n_neighbors=50,
    metric='minkowski',
)
sc.tl.umap(
    Ctr,
    min_dist=0.3,
    spread=4,
    random_state=1712,
)

In [None]:
# Reference UMAP coloured by 'C_scANVI', 'donor' and 'condition'.
sc.pl.umap(
    Ctr,
    color=['C_scANVI', 'donor', 'condition'],
    ncols=4,
    size=3,
    legend_fontsize=5,
    frameon=False,
)

### Read and format query dataset

In [None]:
# Restrict the query cells to the variables kept in the reference (same
# names, same order) and copy the result into an independent object.
Affected = Affected[:, Ctr.var_names].copy()
Affected

In [None]:
# Give the query object the same term list as the reference.
Affected.uns['terms'] = Ctr.uns['terms']

### Initializing the model for query training

In [None]:
# Create the query model from the query data and the trained reference model.
q_intr_cvae = sca.models.EXPIMAP.load_query_data(Affected, intr_cvae)

In [None]:
# Train the query model for up to 400 epochs with early stopping, using the
# same seed as the reference training.
q_intr_cvae.train(
    n_epochs=400,
    seed=1712,
    alpha_kl=0.1,
    alpha_epoch_anneal=100,
    weight_decay=0.,
    use_early_stopping=True,
)

### Get latent representation of reference + query dataset

In [None]:
# Stack reference and query cells into one object; 'batch_join' records which
# input each cell came from, and only `uns` entries identical in both inputs
# are kept.
# NOTE(review): `AnnData.concatenate` appears to be deprecated in recent
# anndata releases in favour of `anndata.concat` — confirm before upgrading.
All_Cond = sc.AnnData.concatenate(
    Ctr,
    Affected,
    batch_key='batch_join',
    uns_merge='same',
)
All_Cond

In [None]:
# Latent representation (active terms only) of all cells from the query
# model, stored under `All_Cond.obsm['X_cvae']`.
All_Cond.obsm['X_cvae'] = q_intr_cvae.get_latent(
    All_Cond.X,
    All_Cond.obs['condition'],
    mean=MEAN,
    only_active=True,
)

In [None]:
# Neighbour graph on the joint 'X_cvae' representation, then a UMAP
# embedding with a fixed random state.
sc.pp.neighbors(
    All_Cond,
    use_rep="X_cvae",
    n_neighbors=20,
    metric='minkowski',
)
sc.tl.umap(
    All_Cond,
    min_dist=0.5,
    spread=6,
    random_state=1712,
)

In [None]:
# NOTE(review): this cell is fully commented out; it refers to an `obs.region`
# column and a 'condition_joint' column — confirm whether it is still needed.
#All_Cond.obs['condition_joint'] = All_Cond.obs.region.astype(str)
#All_Cond.obs['condition_joint'][All_Cond.obs['condition_joint'].astype(str)=='nan']='condition'

In [None]:
# Joint UMAP, one panel per row, coloured by cell state, donor, genotype,
# infection and condition.
sc.pl.umap(
    All_Cond,
    color=['C_scANVI', 'donor', 'genotype', 'infection', 'condition'],
    ncols=1,
    size=3,
    legend_fontsize=5,
    frameon=False,
)

### Calculate directions of upregulation for each latent score and store them in `All_Cond.uns['directions']`

In [None]:
# Compute latent directions on the combined data; a later cell reads
# `All_Cond.uns['directions']`.
q_intr_cvae.latent_directions(adata = All_Cond)

### Do gene set enrichment test for condition in reference + query using Bayes Factors.

In [None]:
# Default matplotlib figure size for the plots that follow.
plt.rcParams['figure.figsize'] = [16, 20]

In [None]:
# Latent enrichment per 'condition' group against 'Pkp2_Ctr_PBS', then plot
# the absolute Bayes factors.
# NOTE(review): the comparison is given as a one-element list here, while the
# sibling enrichment cells pass a bare string.
q_intr_cvae.latent_enrich(
    groups='condition',
    comparison=['Pkp2_Ctr_PBS'],
    use_directions=True,
    adata=All_Cond,
)
fig = sca.plotting.plot_abs_bfs(
    All_Cond,
    yt_step=0.3,
    scale_y=1.5,
    fontsize=6,
)

In [None]:
# Latent enrichment per 'condition' group against 'Pkp2_HetKO_PBS', then plot
# the absolute Bayes factors.
q_intr_cvae.latent_enrich(
    groups='condition',
    comparison='Pkp2_HetKO_PBS',
    use_directions=True,
    adata=All_Cond,
)
fig = sca.plotting.plot_abs_bfs(
    All_Cond,
    yt_step=0.3,
    scale_y=1.5,
    fontsize=6,
)

In [None]:
# Latent enrichment per 'condition' group against 'Pkp2_Ctr_MCMV', then plot
# the absolute Bayes factors.
q_intr_cvae.latent_enrich(
    groups='condition',
    comparison='Pkp2_Ctr_MCMV',
    use_directions=True,
    adata=All_Cond,
)
fig = sca.plotting.plot_abs_bfs(
    All_Cond,
    yt_step=0.3,
    scale_y=1.5,
    fontsize=6,
)

In [None]:
# Latent enrichment per 'condition' group against 'Pkp2_HetKO_MCMV', then plot
# the absolute Bayes factors.
q_intr_cvae.latent_enrich(
    groups='condition',
    comparison='Pkp2_HetKO_MCMV',
    use_directions=True,
    adata=All_Cond,
)
fig = sca.plotting.plot_abs_bfs(
    All_Cond,
    yt_step=0.3,
    scale_y=1.5,
    fontsize=6,
)

In [None]:
# Latent enrichment per 'condition' group against 'Ttn_Ctr_noninf', then plot
# the absolute Bayes factors.
q_intr_cvae.latent_enrich(
    groups='condition',
    comparison='Ttn_Ctr_noninf',
    use_directions=True,
    adata=All_Cond,
)
fig = sca.plotting.plot_abs_bfs(
    All_Cond,
    yt_step=0.3,
    scale_y=1.5,
    fontsize=6,
)

In [None]:
# Latent enrichment per 'condition' group against 'Ttn_HetKO_noninf', then
# plot the absolute Bayes factors.
q_intr_cvae.latent_enrich(
    groups='condition',
    comparison='Ttn_HetKO_noninf',
    use_directions=True,
    adata=All_Cond,
)
fig = sca.plotting.plot_abs_bfs(
    All_Cond,
    yt_step=0.3,
    scale_y=1.5,
    fontsize=6,
)

In [None]:
# Latent enrichment per 'condition' group against 'Ttn_Ctr_MCMV', then plot
# the absolute Bayes factors.
q_intr_cvae.latent_enrich(
    groups='condition',
    comparison='Ttn_Ctr_MCMV',
    use_directions=True,
    adata=All_Cond,
)
fig = sca.plotting.plot_abs_bfs(
    All_Cond,
    yt_step=0.3,
    scale_y=1.5,
    fontsize=6,
)

In [None]:
# Latent enrichment per 'condition' group against 'Ttn_HetKO_MCMV', then plot
# the absolute Bayes factors.
q_intr_cvae.latent_enrich(
    groups='condition',
    comparison='Ttn_HetKO_MCMV',
    use_directions=True,
    adata=All_Cond,
)
fig = sca.plotting.plot_abs_bfs(
    All_Cond,
    yt_step=0.3,
    scale_y=1.5,
    fontsize=6,
)

### Plot the latent variables for query + reference corresponding to the annotations 

In [None]:
# Terms to plot, and the position of each one in the model's term list.
# A name missing from `terms` raises ValueError.
terms = All_Cond.uns['terms']
select_terms = [
    'NEURONAL_SYSTEM',
    'INNATE_IMMUNE_SYSTEM',
    'GPCR_DOWNSTREAM_SIGNALING',
    'APOPTOTIC_EXECUTION_PHASE',
    'CELL_CYCLE_MITOTIC',
    'L1CAM_INTERACTIONS',
    'MEMBRANE_TRAFFICKING',
    'METABOLISM_OF_CARBOHYDRATES',
    'SIGNALING_BY_NOTCH',
    'METABOLISM_OF_NUCLEOTIDES',
    'TRANSLATION',
]
idx = list(map(terms.index, select_terms))

In [None]:
# Latent scores for all cells, multiplied by the stored directions, restricted
# to the columns of the selected terms (`idx` follows the order of `select_terms`).
latents = (
    q_intr_cvae.get_latent(All_Cond.X, All_Cond.obs['condition'], mean=MEAN)
    * All_Cond.uns['directions']
)[:, idx]

# One obs column per selected term. Iterating over `select_terms` keeps the
# column names and the column positions in `latents` in sync by construction,
# instead of maintaining a hand-written name/index pair per term.
for column, term in enumerate(select_terms):
    All_Cond.obs[term] = latents[:, column]

In [None]:
plt.rcParams['figure.figsize'] = [5,5]

# Term-vs-term scatter plots of the latent scores: the three pairs are drawn
# first coloured by 'condition', then again coloured by 'C_scANVI'.
term_pairs = (
    ('NEURONAL_SYSTEM', 'INNATE_IMMUNE_SYSTEM'),
    ('METABOLISM_OF_NUCLEOTIDES', 'CELL_CYCLE_MITOTIC'),
    ('APOPTOTIC_EXECUTION_PHASE', 'INNATE_IMMUNE_SYSTEM'),
)
for color_key in ('condition', 'C_scANVI'):
    for x_term, y_term in term_pairs:
        sc.pl.scatter(All_Cond, x=x_term, y=y_term, color=color_key, size=15)