# SageNet Batch Integration Mouse Organogenesis Imputed

- **Creator**: Sebastian Birk (<sebastian.birk@helmholtz-munich.de>).
- **Affiliation:** Helmholtz Munich, Institute of Computational Biology (ICB), Talavera-López Lab
- **Date of Creation:** 30.01.2023
- **Date of Last Modification:** 30.01.2023

## 1. Setup

### 1.1 Import Libraries

In [18]:
import copy
import os
import random
from datetime import datetime

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scarches as sca
import scanpy as sc
import scipy.sparse as sp
import squidpy as sq
import torch
import torch_geometric.data as geo_dt
from scarches.models.sagenet.utils import glasso

### 1.2 Define Parameters

In [14]:
## Dataset
dataset = "seqfish_mouse_organogenesis"
batch1 = "embryo1"
batch2 = "embryo2"
batch3 = "embryo3"
n_neighbors = 4 # 12
reference_removed_cell_type = "Presomitic mesoderm"

spatial_key = "spatial"

## Others
random_seed = 42
load_timestamp = None

### 1.3 Run Notebook Setup

In [15]:
sc.set_figure_params(figsize=(6, 6))
sns.set_style("whitegrid", {'axes.grid' : False})

NameError: name 'sns' is not defined

In [4]:
# Ignore future warnings and user warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

NameError: name 'warnings' is not defined

In [5]:
# Get time of notebook execution for timestamping saved artifacts
now = datetime.now()
current_timestamp = now.strftime("%d%m%Y_%H%M%S")

### 1.4 Configure Paths and Create Directories

In [8]:
# Define paths
data_folder_path = "../../datasets/srt_data/gold/"
figure_folder_path = f"../../figures/method_benchmarking/{dataset}/sagenet"

# Create required directories
os.makedirs(figure_folder_path, exist_ok=True)
os.makedirs(model_artifacts_folder_path, exist_ok=True)

## 2. Data

### 2.1 Load Data

In [9]:
adata = ad.read_h5ad(f"{data_folder_path}/{dataset}_imputed.h5ad")

In [10]:
# Keep only highly variable genes
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=4000,
    batch_key="batch",
    subset=True)

adata.X = data.X.toarray()

  hvg = hvg.append(missing_hvg, ignore_index=True)
  hvg = hvg.append(missing_hvg, ignore_index=True)
  hvg = hvg.append(missing_hvg, ignore_index=True)


In [11]:
adata_batch1 = adata[adata.obs["batch"] == "embryo1"]
adata_batch2 = adata[adata.obs["batch"] == "embryo2"]
adata_batch3 = adata[adata.obs["batch"] == "embryo3"]

adata_batch_list = [adata_batch1,
                    adata_batch2,
                    adata_batch3]

### 2.2 Artificially Remove Cell Type from Embryo 1 for Recovery by Query (Embryo 3)

In [12]:
# Artificially remove cell type from reference for recovery by query
adata_batch_list[0] = adata_batch_list[0][adata_batch_list[0].obs["celltype_mapped_refined"] != reference_removed_cell_type]

### 2.3 Compute Spatial Neighbor Graphs

In [16]:
for i in range(len(adata_batch_list)):
    # Compute (separate) spatial neighborhood
    sq.gr.spatial_neighbors(adata_batch_list[i],
                            coord_type="generic",
                            spatial_key=spatial_key,
                            n_neighs=n_neighbors)
    # Make adjacency matrix symmetric
    adata_batch_list[i].obsp["spatial_connectivities"] = adata_batch_list[i].obsp["spatial_connectivities"].maximum(
        adata_batch_list[i].obsp["spatial_connectivities"].T)

### 2.4 Combine Data for Integration

In [19]:
adata_integration = ad.concat(adata_batch_list, join="inner")

# Combine spatial neighborhood graphs as disconnected components
connectivities_extension_batch1 = sp.csr_matrix((adata_batch_list[0].shape[0],
                                                 (adata_batch_list[1].shape[0] +
                                                  adata_batch_list[2].shape[0])))
connectivities_extension_batch2_before = sp.csr_matrix((adata_batch_list[1].shape[0],
                                                        adata_batch_list[0].shape[0]))
connectivities_extension_batch2_after = sp.csr_matrix((adata_batch_list[1].shape[0],
                                                       (adata_batch_list[2].shape[0])))
connectivities_extension_batch3 = sp.csr_matrix((adata_batch_list[2].shape[0],
                                                 (adata_batch_list[0].shape[0] +
                                                  adata_batch_list[1].shape[0])))

connectivities_batch1 = sp.hstack((adata_batch_list[0].obsp["spatial_connectivities"],
                                   connectivities_extension_batch1))
connectivities_batch2 = sp.hstack((connectivities_extension_batch2_before,
                                   adata_batch_list[1].obsp["spatial_connectivities"],
                                   connectivities_extension_batch2_after))
connectivities_batch3 = sp.hstack((connectivities_extension_batch3,
                                   adata_batch_list[2].obsp["spatial_connectivities"]))

connectivities = sp.vstack((connectivities_batch1,
                            connectivities_batch2,
                            connectivities_batch3))

adata_integration.obsp["spatial_connectivities"] = connectivities

In [None]:
# Use GPU if available
if torch.cuda.is_available():
  dev = "cuda:0"
else:
  dev = "cpu"
device = torch.device(dev)
print(device)

# Estimate gene interaction network
glasso(adata_integration, [0.25, 0.5]) # the adjacency matrix of the built graph is added under 'adata.varm["adj"]'
    
# Compute spatial partitoning with 3 different resolutions to capture different granularities
sc.tl.leiden(adata_integration,
             resolution=.05,
             random_state=random_seed,
             key_added="leiden_0.05",
             adjacency=adata.obsp["spatial_connectivities"]) # the partitioning is added under 'adata.obs["leiden_0.05"]'
sc.tl.leiden(adata_integration,
             resolution=.1,
             random_state=random_seed,
             key_added="leiden_0.1",
             adjacency=adata.obsp["spatial_connectivities"]) # the partitioning is added under 'adata.obs["leiden_0.1"]'
sc.tl.leiden(adata_integration,
             resolution=.5,
             random_state=random_seed,
             key_added="leiden_0.5",
             adjacency=adata.obsp["spatial_connectivities"]) # the partitioning is added under 'adata.obs["leiden_0.5"]'
sc.pl.spatial(adata_integration,
              color=["leiden_0.05", "leiden_0.1", "leiden_0.5"],
              frameon=False,
              ncols=3,
              spot_size=.1,
              title=["leiden_0.05", "leiden_0.1", "leiden_0.5"],
              legend_loc=None)

# Define model object
sg_obj = sca.models.sagenet(device=device)

sg_obj.train(adata_integration,
             comm_columns=["leiden_0.05", "leiden_0.1", "leiden_0.5"],
             tag="integrated",
             epochs=15,
             verbose = False,
             importance=True)

cuda:0


  precision_[indices != idx, idx] = -precision_[idx, idx] * coefs
  precision_[idx, indices != idx] = -precision_[idx, idx] * coefs


In [None]:
    sg_obj.load_query_data(adata)
    
    # Use SageNet cell-cell-distances for UMAP generation
    sc.pp.neighbors(adata_one_shot, use_rep="dist_map")
    sc.tl.umap(adata_one_shot)
    fig = sc.pl.umap(adata_one_shot,
                     color=[cell_type_key],
                     title="Latent Space with Cell Types: SageNet",
                     return_fig=True)
    fig.savefig(f"{figure_folder_path}/latent_sagenet_cell_types_run_{run_number}_{current_timestamp}.png",
                bbox_inches="tight")
    
    # Compute latent Leiden clustering
    sc.tl.leiden(adata=adata,
                 resolution=leiden_resolution,
                 random_state=random_seed,
                 key_added=f"latent_sagenet_leiden_{str(leiden_resolution)}")
    
    # Create subplot of latent Leiden cluster annotations in physical and latent space
    fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(6, 12))
    title = fig.suptitle(t="Latent and Physical Space with Leiden Clusters: SageNet")
    sc.pl.umap(adata=adata,
               color=[f"latent_sagenet_leiden_{str(leiden_resolution)}"],
               title=f"Latent Space with Leiden Clusters",
               ax=axs[0],
               show=False)
    sc.pl.spatial(adata=adata,
                  color=[f"latent_sagenet_leiden_{str(leiden_resolution)}"],
                  spot_size=0.03,
                  title=f"Physical Space with Leiden Clusters",
                  ax=axs[1],
                  show=False)

    # Create and position shared legend
    handles, labels = axs[0].get_legend_handles_labels()
    lgd = fig.legend(handles, labels, bbox_to_anchor=(1.25, 0.9185))
    axs[0].get_legend().remove()
    axs[1].get_legend().remove()

    # Adjust, save and display plot
    plt.subplots_adjust(wspace=0, hspace=0.2)
    fig.savefig(f"{figure_folder_path}/latent_physical_comparison_sagenet_leiden_run_{run_number}_{current_timestamp}.png",
                bbox_extra_artists=(lgd, title),
                bbox_inches="tight")
    plt.show()
    
    # Use UMAP embedding of cell-cell distances as latent features
    adata_original.obsm[latent_key + f"_run{run_number}"] = adata.obsm["X_umap"]