# Data Simulations

- **Creator:** Sebastian Birk (<sebastian.birk@helmholtz-munich.de>)
- **Affiliation:** Helmholtz Munich, Institute of AI for Health (AIH), Talavera-López Lab
- **Date of Creation:** 24.05.2024
- **Date of Last Modification:** 17.06.2024

## 1. Setup

### 1.1 Import Libraries

In [1]:
import os
import random
import warnings

import numpy as np
import pandas as pd
import scanpy as sc
import squidpy as sq

from nichecompass.utils import (add_gps_from_gp_dict_to_adata,
                                compute_communication_gp_network,
                                visualize_communication_gp_network,
                                create_new_color_dict,
                                extract_gp_dict_from_mebocost_es_interactions,
                                extract_gp_dict_from_nichenet_lrt_interactions,
                                extract_gp_dict_from_omnipath_lr_interactions,
                                filter_and_combine_gp_dict_gps,
                                generate_enriched_gp_info_plots)

  File "/home/aih/sebastian.birk/miniconda3/envs/nichecompass/lib/python3.9/site-packages/urllib3/connectionpool.py", line 715, in urlopen
    httplib_response = self._make_request(
  File "/home/aih/sebastian.birk/miniconda3/envs/nichecompass/lib/python3.9/site-packages/urllib3/connectionpool.py", line 404, in _make_request
    self._validate_conn(conn)
  File "/home/aih/sebastian.birk/miniconda3/envs/nichecompass/lib/python3.9/site-packages/urllib3/connectionpool.py", line 1058, in _validate_conn
    conn.connect()
  File "/home/aih/sebastian.birk/miniconda3/envs/nichecompass/lib/python3.9/site-packages/urllib3/connection.py", line 419, in connect
    self.sock = ssl_wrap_socket(
  File "/home/aih/sebastian.birk/miniconda3/envs/nichecompass/lib/python3.9/site-packages/urllib3/util/ssl_.py", line 449, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(
  File "/home/aih/sebastian.birk/miniconda3/envs/nichecompass/lib/python3.9/site-packages/urllib3/util/ssl_.py", line 493, in _ss

### 1.2 Run Notebook Setup

In [2]:
# Silence library warnings to keep the notebook output readable.
warnings.filterwarnings("ignore")
# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Seed both random number generators used by the simulation cells below
# (`random.sample` and `np.random.choice`) so that running the notebook
# top to bottom reproduces the same simulated gene programs.
random_seed = 0
random.seed(random_seed)
np.random.seed(random_seed)

### 1.3 Configure Paths and Create Directories

In [3]:
# Species tag; it is interpolated into several resource file names below.
species = "mouse"

# Root folders of the datasets used in this notebook.
ga_data_folder_path = "../datasets/ga_data"
gp_data_folder_path = "../datasets/gp_data"
so_data_folder_path = "../datasets/srt_data"

# Prior-knowledge resource files used to build the gene programs (GPs).
omnipath_lr_network_file_path = f"{gp_data_folder_path}/omnipath_lr_network.csv"
collectri_tf_network_file_path = f"{gp_data_folder_path}/collectri_tf_network_{species}.csv"
nichenet_lr_network_file_path = f"{gp_data_folder_path}/nichenet_lr_network_v2_{species}.csv"
nichenet_ligand_target_matrix_file_path = f"{gp_data_folder_path}/nichenet_ligand_target_matrix_v2_{species}.csv"
mebocost_enzyme_sensor_interactions_folder_path = f"{gp_data_folder_path}/metabolite_enzyme_sensor_gps"
gene_orthologs_mapping_file_path = f"{ga_data_folder_path}/human_mouse_gene_orthologs.csv"

# Create the output folder for the simulated GP tables up front: the
# `to_csv` calls further down write into it and fail if it does not exist.
os.makedirs(f"{gp_data_folder_path}/simulations", exist_ok=True)

## 2. Simulate Data

### 2.1 Simulate Gene Interactions Based on Prior Knowledge Gene Programs (GPs)

- Here we retrieve GPs based on three databases of prior knowledge of inter- and intracellular interaction pathways:
    - OmniPath (Ligand-Receptor GPs)
    - MEBOCOST (Enzyme-Sensor GPs)
    - NicheNet (Combined Interaction GPs)
- These GPs are also used as prior GPs supplied to NicheCompass. We will use these to artificially upregulate some programs and check whether this can be recovered by NicheCompass.

In [4]:
# Labels of the four simulated cell types ("Celltype1" ... "Celltype4").
cell_types = [f"Celltype{type_idx}" for type_idx in (1, 2, 3, 4)]

# Candidate fold changes for a GP and the probability of drawing each one
# (same order in both lists).
fold_changes = [1, 3, 5, 10]
fold_changes_probs = [0.7] + [0.1] * 3

In [5]:
# Retrieve OmniPath GPs (source: ligand genes; target: receptor genes)
omnipath_gp_dict = extract_gp_dict_from_omnipath_lr_interactions(
    species=species,
    min_curation_effort=0,
    load_from_disk=True,
    save_to_disk=False,
    lr_network_file_path=omnipath_lr_network_file_path,
    gene_orthologs_mapping_file_path=gene_orthologs_mapping_file_path,
    plot_gp_gene_count_distributions=False)

# One row per GP; the GP names (dictionary keys) are not carried over.
omnipath_gp_df = pd.DataFrame(omnipath_gp_dict.values())
n_omnipath_gps = len(omnipath_gp_df)

# Assign a random pair of cell types to every GP.
for cell_type_col in ("celltypeA", "celltypeB"):
    omnipath_gp_df[cell_type_col] = np.random.choice(cell_types, size=n_omnipath_gps)

# Store the gene lists as comma-separated strings.
for gene_col in ("sources", "targets"):
    omnipath_gp_df[gene_col] = omnipath_gp_df[gene_col].apply(
        lambda genes: ",".join(map(str, genes)))

# Draw one fold change per GP.
omnipath_gp_df["fold_change"] = np.random.choice(
    fold_changes, size=n_omnipath_gps, p=fold_changes_probs)

omnipath_gp_df = omnipath_gp_df.drop(columns=["sources_categories", "targets_categories"])
omnipath_gp_df.to_csv(f"{gp_data_folder_path}/simulations/omnipath_gps_simulations.csv", index=False)

In [6]:
# Retrieve MEBOCOST GPs (source: enzyme genes; target: sensor genes)
mebocost_gp_dict = extract_gp_dict_from_mebocost_es_interactions(
    dir_path=mebocost_enzyme_sensor_interactions_folder_path,
    species=species,
    plot_gp_gene_count_distributions=False)

# One row per GP; the GP names (dictionary keys) are not carried over.
mebocost_gp_df = pd.DataFrame(mebocost_gp_dict.values())
n_mebocost_gps = len(mebocost_gp_df)

# Assign a random pair of cell types to every GP.
for cell_type_col in ("celltypeA", "celltypeB"):
    mebocost_gp_df[cell_type_col] = np.random.choice(cell_types, size=n_mebocost_gps)

# Keep at most 10 randomly chosen source and target genes per GP and store
# them as comma-separated strings.
for gene_col in ("sources", "targets"):
    mebocost_gp_df[gene_col] = mebocost_gp_df[gene_col].apply(
        lambda genes: ",".join(map(
            str,
            np.random.choice(genes, size=min(10, len(genes)), replace=False))))

# Draw one fold change per GP.
mebocost_gp_df["fold_change"] = np.random.choice(
    fold_changes, size=n_mebocost_gps, p=fold_changes_probs)

mebocost_gp_df = mebocost_gp_df.drop(columns=["sources_categories", "targets_categories"])
mebocost_gp_df.to_csv(f"{gp_data_folder_path}/simulations/mebocost_gps_simulations.csv", index=False)

In [7]:
# Retrieve NicheNet GPs (source: ligand genes; target: receptor genes, target genes)
nichenet_gp_dict = extract_gp_dict_from_nichenet_lrt_interactions(
    species=species,
    version="v2",
    keep_target_genes_ratio=1.,
    max_n_target_genes_per_gp=250,
    load_from_disk=True,
    save_to_disk=False,
    lr_network_file_path=nichenet_lr_network_file_path,
    ligand_target_matrix_file_path=nichenet_ligand_target_matrix_file_path,
    gene_orthologs_mapping_file_path=gene_orthologs_mapping_file_path,
    plot_gp_gene_count_distributions=False)

# One row per GP; the GP names (dictionary keys) are not carried over.
nichenet_gp_df = pd.DataFrame(nichenet_gp_dict.values())
n_nichenet_gps = len(nichenet_gp_df)

# Assign a random pair of cell types to every GP.
for cell_type_col in ("celltypeA", "celltypeB"):
    nichenet_gp_df[cell_type_col] = np.random.choice(cell_types, size=n_nichenet_gps)

# Source genes are kept in full and stored as a comma-separated string.
nichenet_gp_df["sources"] = nichenet_gp_df["sources"].apply(
    lambda genes: ",".join(map(str, genes)))

# Target genes are reduced to at most 10 randomly chosen genes per GP
# before being stored as a comma-separated string.
nichenet_gp_df["targets"] = nichenet_gp_df["targets"].apply(
    lambda genes: ",".join(map(
        str,
        np.random.choice(genes, size=min(10, len(genes)), replace=False))))

# Draw one fold change per GP.
nichenet_gp_df["fold_change"] = np.random.choice(
    fold_changes, size=n_nichenet_gps, p=fold_changes_probs)

nichenet_gp_df = nichenet_gp_df.drop(columns=["sources_categories", "targets_categories"])
nichenet_gp_df.to_csv(f"{gp_data_folder_path}/simulations/nichenet_gps_simulations.csv", index=False)

In [8]:
# Merge the three GP dictionaries into one for model training. On duplicate
# GP names the later dictionary wins (OmniPath < MEBOCOST < NicheNet).
combined_gp_dict = {
    **omnipath_gp_dict,
    **mebocost_gp_dict,
    **nichenet_gp_dict,
}

In [9]:
# Filter and combine GPs to avoid overlaps
gp_filter_kwargs = {
    "gp_filter_mode": "subset",
    "combine_overlap_gps": True,
    "overlap_thresh_source_genes": 0.9,
    "overlap_thresh_target_genes": 0.9,
    "overlap_thresh_genes": 0.9,
}
combined_new_gp_dict = filter_and_combine_gp_dict_gps(
    gp_dict=combined_gp_dict,
    **gp_filter_kwargs)

# Report how many GPs the filtering and combining step removed.
n_gps_before = len(combined_gp_dict)
n_gps_after = len(combined_new_gp_dict)
print(f"Number of gene programs before filtering and combining: {n_gps_before}.")
print(f"Number of gene programs after filtering and combining: {n_gps_after}.")

Number of gene programs before filtering and combining: 2324.
Number of gene programs after filtering and combining: 1818.


In [10]:
# Retrieve combined GPs (one row per GP; GP names are not carried over)
combined_gp_df = pd.DataFrame(combined_new_gp_dict.values())
n_combined_gps = len(combined_gp_df)

# Draw the index (1-4) of cell type A for every GP. For each GP, list the
# candidate regions with the region of the same index first, followed by
# the remaining three.
sampled_idx = np.random.choice([1, 2, 3, 4], size=n_combined_gps)
cell_type_region_choices = [
    [idx] + list(set(range(1, 5)) - {idx}) for idx in sampled_idx]

combined_gp_df["celltypeA"] = [f"Celltype{idx}" for idx in sampled_idx]
# The matching region is picked with probability 0.7, each other region
# with 0.1; this should be aligned with 'region_celltype_df' from simulation.
combined_gp_df["regionA"] = [
    f"Region{np.random.choice(region_choices, p=[0.7, 0.1, 0.1, 0.1])}"
    for region_choices in cell_type_region_choices]
combined_gp_df["celltypeB"] = np.random.choice(cell_types, size=n_combined_gps)

# Keep at most 10 randomly chosen source and target genes per GP and store
# them as comma-separated strings.
for gene_col in ("sources", "targets"):
    combined_gp_df[gene_col] = combined_gp_df[gene_col].apply(
        lambda genes: ",".join(map(
            str,
            np.random.choice(genes, size=min(10, len(genes)), replace=False))))

# Draw one fold change per GP.
combined_gp_df["fold_change"] = np.random.choice(
    fold_changes, size=n_combined_gps, p=fold_changes_probs)

combined_gp_df = combined_gp_df.drop(columns=["sources_categories", "targets_categories"])
combined_gp_df.to_csv(f"{gp_data_folder_path}/simulations/combined_gps_simulations.csv", index=False)

In [13]:
# Count the distinct genes left in the (subsampled) combined GP dataframe.
# The gene columns hold comma-separated strings, so split them first.
combined_gp_source_genes = combined_gp_df["sources"].apply(
    lambda gene_str: gene_str.split(",")).values
combined_gp_target_genes = combined_gp_df["targets"].apply(
    lambda gene_str: gene_str.split(",")).values
unique_target_genes = {
    gene for gp_genes in combined_gp_target_genes for gene in gp_genes}
unique_source_genes = {
    gene for gp_genes in combined_gp_source_genes for gene in gp_genes}
unique_prior_genes = unique_target_genes.union(unique_source_genes)
print(len(unique_prior_genes))  # 4501 in the recorded run

4501


In [19]:
# Get total number of unique genes across all filtered and combined GPs
# (full gene lists, before any subsampling to 10 genes per GP).
all_genes_df = pd.DataFrame(combined_new_gp_dict.values())
all_source_genes = all_genes_df["sources"]
all_target_genes = all_genes_df["targets"]
# Each set is built from the column of the same name (the two were
# previously swapped; the union below is unaffected).
unique_source_genes = {gene for genes in all_source_genes for gene in genes}
unique_target_genes = {gene for genes in all_target_genes for gene in genes}
unique_genes = unique_target_genes | unique_source_genes
print(len(unique_genes))  # 13327 in the recorded run

13327


In [26]:
# Simulate de-novo genes and GPs.
# Create one synthetic de-novo gene for every gene of the full GP dictionary
# that is not among the genes kept in the combined GP dataframe.
unique_denovo_genes = [
    f"denovo_gene{i}" for i in range(len(unique_genes) - len(unique_prior_genes))]
# De-novo GPs are sampled from de-novo genes only; prior genes are not mixed in.
all_genes = unique_denovo_genes
n_denovo_gps = 182
# Every de-novo GP gets 1 source gene and 10 distinct target genes.
de_novo_gp_df = pd.DataFrame({
    "sources": [random.sample(all_genes, 1) for _ in range(n_denovo_gps)],
    "targets": [random.sample(all_genes, 10) for _ in range(n_denovo_gps)]})
de_novo_gp_df["sources"] = de_novo_gp_df["sources"].apply(lambda x: ",".join(map(str, x)))
de_novo_gp_df["targets"] = de_novo_gp_df["targets"].apply(lambda x: ",".join(map(str, x)))

# Draw the index (1-4) of cell type A for every GP. For each GP, list the
# candidate regions with the region of the same index first.
sampled_idx = np.random.choice([1, 2, 3, 4], size=len(de_novo_gp_df))
cell_type_region_choices = [[idx] + list(set(range(1, 5)) - set([idx])) for idx in sampled_idx]
de_novo_gp_df["celltypeA"] = ["Celltype" + str(idx) for idx in sampled_idx]
# The matching region is picked with probability 0.7, each other region
# with 0.1; this should be aligned with 'region_celltype_df' from simulation.
de_novo_gp_df["regionA"] = [
    "Region" + str(np.random.choice(region_choices, p=[0.7, 0.1, 0.1, 0.1]))
    for region_choices in cell_type_region_choices]
de_novo_gp_df["celltypeB"] = np.random.choice(cell_types, size=len(de_novo_gp_df))
de_novo_gp_df["fold_change"] = np.random.choice(fold_changes, size=len(de_novo_gp_df), p=fold_changes_probs)

# Stack prior and de-novo GPs. `ignore_index=True` gives the result a fresh
# 0..n-1 index instead of repeating the row labels of both inputs; the CSV
# is unchanged because it is written without the index.
all_gp_df = pd.concat([combined_gp_df, de_novo_gp_df], ignore_index=True)
all_gp_df.to_csv(f"{gp_data_folder_path}/simulations/combined_denovo_gps_simulations.csv", index=False)

In [27]:
# Count the distinct genes across prior and de-novo GPs. The gene columns
# hold comma-separated strings, so split them first.
all_gp_source_genes = all_gp_df["sources"].apply(
    lambda gene_str: gene_str.split(",")).values
all_gp_target_genes = all_gp_df["targets"].apply(
    lambda gene_str: gene_str.split(",")).values
unique_target_genes = {
    gene for gp_genes in all_gp_target_genes for gene in gp_genes}
unique_source_genes = {
    gene for gp_genes in all_gp_source_genes for gene in gp_genes}
unique_all_genes = unique_target_genes.union(unique_source_genes)
print(len(unique_all_genes))  # 6290 in the recorded run

6290


In [42]:
# Create different GP fold change scenarios.
# Each scenario maps to the probabilities of drawing the corresponding entry
# of `fold_changes`; one CSV is written per scenario.
fold_change_scenarios = {
    "weak": [0.7, 0.1, 0.1, 0.1],
    "medium": [0.9, 1/30, 1/30, 1/30],
    "strong": [0.95, 1/60, 1/60, 1/60],
}
# The loop variable is deliberately NOT called `fold_changes_probs`: reusing
# that name would overwrite the notebook-level default probabilities, so
# re-running an earlier cell afterwards would silently use the last scenario.
for fold_change_descriptor, scenario_probs in fold_change_scenarios.items():
    # Redraw the fold change of every GP (prior and de-novo) for this scenario.
    all_gp_df["fold_change"] = np.random.choice(fold_changes, size=len(all_gp_df), p=scenario_probs)
    all_gp_df.to_csv(f"{gp_data_folder_path}/simulations/combined_denovo_gps_simulations_{fold_change_descriptor}_fc.csv", index=False)

### 2.2 Simulate Data Based on Gene Interactions

Run '../scripts/data_simulation/data_simulation.R' with simulated gene interactions as input.

### 2.3 Explore Simulated Data

In [None]:
# Load the first simulated reference dataset and expose the x/y columns of
# `.obs` as coordinates under `.obsm["spatial"]`, which the plots below use.
ref_gps_1_file_path = f"{so_data_folder_path}/simulations/simulated_ref_gps_1.h5ad"
adata_ref_gps_1 = sc.read_h5ad(ref_gps_1_file_path)
spatial_coords = adata_ref_gps_1.obs[["x", "y"]].values
adata_ref_gps_1.obsm["spatial"] = spatial_coords

In [None]:
# Visualize cell types in tissue
sq.pl.spatial_scatter(
    adata_ref_gps_1,
    shape=None,
    color="cell_type",
)

In [None]:
# Visualize region labels in tissue
sq.pl.spatial_scatter(
    adata_ref_gps_1,
    shape=None,
    color="region_label",
)

In [None]:
# Show the combined GPs that were assigned a fold change of 10.
has_fold_change_10 = combined_gp_df["fold_change"] == 10
combined_gp_df.loc[has_fold_change_10]

In [None]:
# Spatial scatter of the reference dataset colored by `Cxcl10`.
sq.pl.spatial_scatter(
    adata_ref_gps_1,
    shape=None,
    color="Cxcl10",
)

In [None]:
# Reduce sparsity for cell interaction genes? E.g. add counts of 1?
# Different number of genes and locations


In [None]:
# Load the simulated "free" dataset from the current working directory and
# expose the x/y columns of `.obs` as coordinates under `.obsm["spatial"]`.
adata_free = sc.read_h5ad("./simulated_free_1.h5ad")
free_spatial_coords = adata_free.obs[["x", "y"]].values
adata_free.obsm["spatial"] = free_spatial_coords

In [None]:
# Visualize cell types in tissue
sq.pl.spatial_scatter(
    adata_free,
    shape=None,
    color="cell_type",
)

In [None]:
# Visualize region labels in tissue
sq.pl.spatial_scatter(
    adata_free,
    shape=None,
    color="region_label",
)

In [None]:
# Display the unstructured annotations (`.uns`) stored with the "free" dataset.
adata_free.uns

In [None]:
# Spatial scatter of the "free" dataset colored by `EFNA1`.
sq.pl.spatial_scatter(
    adata_free,
    shape=None,
    color="EFNA1",
)

In [None]:
# Spatial scatter of the "free" dataset colored by `EPHA1`.
sq.pl.spatial_scatter(
    adata_free,
    shape=None,
    color="EPHA1",
)

In [None]:
# NOTE(review): `adata_ref` is never assigned anywhere in this notebook, so
# this cell and the plots that follow raise a NameError on a fresh run. The
# only reference dataset loaded above is `adata_ref_gps_1`, so it is aliased
# here -- confirm that this is the object the cells below are meant to use.
adata_ref = adata_ref_gps_1

# Display the unstructured annotations (`.uns`) stored with the reference dataset.
adata_ref.uns

In [None]:
# Spatial scatter of the reference dataset colored by `Bdnf`.
sq.pl.spatial_scatter(
    adata_ref,
    shape=None,
    color="Bdnf",
)

In [None]:
# Spatial scatter of the reference dataset colored by `Drd4`.
sq.pl.spatial_scatter(
    adata_ref,
    shape=None,
    color="Drd4",
)

In [None]:
# Spatial scatter of the reference dataset colored by `Fyn`.
sq.pl.spatial_scatter(
    adata_ref,
    shape=None,
    color="Fyn",
)

In [None]:
# Spatial scatter of the reference dataset colored by `Mapk7`.
sq.pl.spatial_scatter(
    adata_ref,
    shape=None,
    color="Mapk7",
)