In [None]:
import os
import anndata
import anndata as ad
from pathlib import Path
import os
import pandas as pd
import numpy as np
import pypath.inputs.biomart
import torch
from bokeh.transform import transform
from tqdm import tqdm
from scipy.stats import norm
from typing import Tuple

# Loading data

In [None]:
# Read the AnnData file "mouse1_slice153_x_hat_s.h5ad" from
# <parent of the current working directory>/data/processed.
x_hat_s = anndata.read_h5ad(Path(os.getcwd()).parents[0] / "data" / "processed" / "mouse1_slice153_x_hat_s.h5ad")

# Method
## Calculate Ligand and Receptor score

For each ligand (receptor) _g_ in cluster _k_, we compute a score $S(g,k)\in [0,1]$ that measures how high the observed average expression level $\bar{X}_{g}^{k}$ of the ligand (receptor) is compared with the average expression levels observable by chance for random genes in the same cluster _k_. The distribution of average gene expression levels observable by chance was obtained, using a permutation approach, as follows:
i) randomly permuting row/genes in matrix $X^k$ independently for each column/cell;
ii) computing the average genes expression levels in such shuffled version of $X^{k}$;
iii) iterating steps (i) and (ii) multiple times.

In [None]:
import liana as li
# Select liana's "mouseconsensus" ligand-receptor resource.
lr_df = li.resource.select_resource("mouseconsensus")
# Lower-case the names (presumably to match lower-case gene names in the
# expression data — TODO confirm).
lr_df["ligand"]=lr_df["ligand"].str.lower()
lr_df["receptor"]=lr_df["receptor"].str.lower()

## Calculate Ligand and Receptor Pair Score

## Define utility functions to evaluate the intercellular scores

In [None]:
def compute_gene_cluster_stats(adata: ad.AnnData, cluster_key="@label", verbose: int = 0) -> ad.AnnData:
    """Score every gene in every cluster against the cluster-wide expression values.

    For each category of ``adata.obs[cluster_key]`` the mean (``mu``) and standard
    deviation (``sd``) over *all* entries of that cluster's expression matrix are
    computed. The per-gene mean within the cluster is then mapped through a normal
    CDF with location ``mu`` and scale ``sd / sqrt(n_cells)``. Genes whose cluster
    mean is exactly zero are given a score of 0.

    Parameters
    ----------
    adata
        Annotated data; ``adata.X`` is reduced with ``np.mean`` / ``np.std``.
    cluster_key
        Column of ``adata.obs`` holding the cluster labels.
    verbose
        1 or 2 prints the cluster currently being processed; 0 is silent.

    Returns
    -------
    The same ``adata`` object, with one ``score_<cluster>`` column per cluster
    added to ``adata.var`` (the input is modified in place).
    """
    # Coerce to categorical so plain string/object label columns work as well.
    clusters = adata.obs[cluster_key].astype("category").cat.categories
    for cluster in clusters:
        # The former test `verbose == 1 | verbose == 2` parsed as the chained
        # comparison `verbose == (1 | verbose) == 2`, which is never true.
        if verbose in (1, 2):
            print(f"\nProcessing cluster: {cluster}")
        cell_mask = adata.obs[cluster_key] == cluster
        cluster_data = adata[cell_mask].X

        # Cluster-wide reference statistics over every (cell, gene) entry.
        mu = np.mean(cluster_data, axis=None)
        sd = np.std(cluster_data, axis=None)
        n_cells = cluster_data.shape[0]

        gene_means = np.mean(cluster_data, axis=0)
        # Standard error of a mean over n_cells values is used as the scale.
        gene_scores = norm.cdf(gene_means, loc=mu, scale=sd / np.sqrt(n_cells))
        # A gene with zero mean in the cluster always scores 0.
        gene_scores[gene_means == 0] = 0
        adata.var[f'score_{cluster}'] = gene_scores

    return adata

In [None]:
def prepare_lr_data(adata: ad.AnnData, lr_pairs_df: pd.DataFrame, verbose: int = 0) -> Tuple[ad.AnnData, pd.DataFrame]:
    """Restrict an expression object and a ligand-receptor table to mutually usable entries.

    Ligand / receptor names containing ``"_"`` are treated as complexes whose
    components are the ``"_"``-separated parts. A pair is kept only when every
    ligand component and every receptor component is found in ``adata.var_names``.

    Parameters
    ----------
    adata
        Annotated data whose ``var_names`` are matched against component names.
    lr_pairs_df
        Table with string columns ``"ligand"`` and ``"receptor"``. It is not
        modified; the annotations are added to a copy.
    verbose
        Any truthy value prints how many genes and pairs survive the filtering.

    Returns
    -------
    (adata, lr_pairs_df)
        A copy of ``adata`` restricted to the matched genes, and a filtered copy
        of the pair table with the columns ``is_ligand_complex``,
        ``ligand_complex_components``, ``is_receptor_complex``,
        ``receptor_complex_components`` and ``is_interaction_complex`` added.
    """
    # Work on a copy so the caller's table does not silently gain columns.
    lr_pairs_df = lr_pairs_df.copy()

    # First, identify complex molecules
    lr_pairs_df["is_ligand_complex"] = lr_pairs_df["ligand"].str.contains("_")
    lr_pairs_df["ligand_complex_components"] = lr_pairs_df["ligand"].str.split("_")
    lr_pairs_df["is_receptor_complex"] = lr_pairs_df["receptor"].str.contains("_")
    lr_pairs_df["receptor_complex_components"] = lr_pairs_df["receptor"].str.split("_")
    lr_pairs_df["is_interaction_complex"] = lr_pairs_df["is_ligand_complex"] | lr_pairs_df["is_receptor_complex"]

    # Get all unique components
    ligands = set(lr_pairs_df["ligand_complex_components"].explode())
    receptors = set(lr_pairs_df["receptor_complex_components"].explode())

    # Find which components exist in adata
    valid_ligands = ligands.intersection(adata.var_names)
    valid_receptors = receptors.intersection(adata.var_names)

    # Keep only pairs where all components exist in adata. Building the mask from
    # a list (instead of a row-wise apply) also yields a proper boolean Series
    # when the table is empty.
    valid_pairs_mask = pd.Series(
        [
            all(comp in valid_ligands for comp in ligand_parts)
            and all(comp in valid_receptors for comp in receptor_parts)
            for ligand_parts, receptor_parts in zip(
                lr_pairs_df["ligand_complex_components"],
                lr_pairs_df["receptor_complex_components"],
            )
        ],
        index=lr_pairs_df.index,
        dtype=bool,
    )

    lr_pairs_df = lr_pairs_df[valid_pairs_mask].copy()

    # Keep every gene that is a ligand or receptor component found in adata.
    # Note this also retains components belonging to pairs dropped above.
    all_valid_genes = valid_ligands.union(valid_receptors)
    adata = adata[:, adata.var_names.isin(all_valid_genes)].copy()

    if verbose:
        print(f"Filtered from {len(ligands.union(receptors))} to {len(all_valid_genes)} total genes")
        print(f"Filtered from {len(valid_pairs_mask)} to {valid_pairs_mask.sum()} valid L-R pairs")

    return adata, lr_pairs_df

In [None]:
def compute_lr_scores(adata: ad.AnnData, lr_pairs_df: pd.DataFrame, cluster_key="@label", verbose: int = 0) -> pd.DataFrame:
    """Score every (ligand, receptor, source cluster, target cluster) combination.

    The ligand score is read from ``adata.var["score_<source>"]`` and the receptor
    score from ``adata.var["score_<target>"]``. For a complex (name containing
    ``"_"``) the score is the geometric mean of its components' scores. The
    interaction score is the minimum of the ligand and receptor scores.

    Parameters
    ----------
    adata
        Annotated data whose ``var`` already holds the ``score_<cluster>`` columns.
    lr_pairs_df
        Table with string columns ``"ligand"`` and ``"receptor"``.
    cluster_key
        Column of ``adata.obs`` holding the cluster labels.
    verbose
        2 shows a progress bar; any other value is silent.

    Returns
    -------
    DataFrame indexed by ``"<ligand>&<receptor>&<source>&<target>"`` with the columns
    ``ligand``, ``receptor``, ``source``, ``target``, ``is_ligand_complex``,
    ``is_receptor_complex``, ``ligand_score``, ``receptor_score`` and ``score``.
    """
    # Cluster labels are kept as strings, as they appear in the interaction ids.
    clusters = [str(c) for c in adata.obs[cluster_key].astype("category").cat.categories]
    pairs = list(zip(lr_pairs_df["ligand"], lr_pairs_df["receptor"]))

    def compute_complex_score(adata, complex_molecule: str, cluster_score_key: str) -> float:
        # Geometric mean of the component scores (exp of the mean log-score).
        components = complex_molecule.split("_")
        component_scores = adata.var.loc[components, cluster_score_key].values
        return np.exp(np.mean(np.log(component_scores)))

    # A molecule's score in a cluster does not depend on its partner, so each
    # (molecule, cluster) value is looked up once and reused.
    score_cache = {}

    def molecule_score(molecule: str, cluster: str) -> float:
        key = (molecule, cluster)
        if key not in score_cache:
            score_key = f"score_{cluster}"
            if "_" in molecule:
                score_cache[key] = compute_complex_score(adata, molecule, score_key)
            else:
                score_cache[key] = adata.var.loc[molecule, score_key]
        return score_cache[key]

    # Rows are built from the tuples directly; no id string is split again, so
    # an "&" inside a name cannot corrupt the columns.
    combinations = [
        (ligand, receptor, source, target)
        for ligand, receptor in pairs
        for source in clusters
        for target in clusters
    ]

    interaction_ids = []
    records = []
    for ligand, receptor, source, target in tqdm(combinations, total=len(combinations), disable=verbose != 2, desc="Calculating L-R scores"):
        interaction_ids.append(f"{ligand}&{receptor}&{source}&{target}")
        records.append((
            ligand,
            receptor,
            source,
            target,
            "_" in ligand,
            "_" in receptor,
            molecule_score(ligand, source),
            molecule_score(receptor, target),
        ))

    # One-shot construction avoids a per-row .loc write on a string index.
    lr_scores = pd.DataFrame(
        records,
        columns=['ligand', 'receptor', 'source', 'target',
                 'is_ligand_complex', 'is_receptor_complex',
                 'ligand_score', 'receptor_score'],
        index=interaction_ids,
    )
    # Keep the score columns numeric even when there are no rows.
    lr_scores = lr_scores.astype({"ligand_score": float, "receptor_score": float})
    lr_scores["score"] = np.minimum(lr_scores["ligand_score"], lr_scores["receptor_score"])

    return lr_scores

In [None]:
# Work on a copy so x_hat_s itself is left as loaded.
data = x_hat_s.copy()
# Restrict genes and L-R pairs to mutually usable ones; complex_info is the filtered pair table.
data, complex_info = prepare_lr_data(data, lr_df)
# Add one "score_<cluster>" column per "subclass" category to data.var.
data = compute_gene_cluster_stats(data, cluster_key='subclass', verbose=2)

## Calculate Intercellular scores

In [None]:
# Load the cached interaction scores when available; otherwise compute them from
# the filtered dataset (`data`) and cache the result as CSV.
lr_scores_path = Path(os.getcwd()).parents[0] / "data" / "processed" / "mouse1_slice153_lr_inter_scores.csv"
try:
    lr_scores = pd.read_csv(lr_scores_path, index_col=0)
except FileNotFoundError:
    # The scored object in this notebook is `data`; `adata` is not defined at
    # notebook level and raised NameError whenever the cache was missing.
    lr_scores = compute_lr_scores(data, complex_info, cluster_key='subclass', verbose=2)
    lr_scores.to_csv(lr_scores_path)

In [None]:
# Show the 10 highest-scoring L-R pairs from source cluster "Astro" to target cluster "L2/3 IT"
source = "Astro"
target = "L2/3 IT"
df = lr_scores.query(f"source == '{source}' & target == '{target}'")
df.sort_values(by="score", ascending=False).head(10)

In [None]:
import matplotlib.pyplot as plt
import squidpy as sq
# One panel per (cluster, gene) pair: features[i] is drawn on the cells of groups[i].
fig, axs = plt.subplots(ncols = 2)
axs = axs.flatten()
# Store the (x, y) centroids as an (n_cells, 2) array under obsm["spatial"]
# (presumably the key squidpy's spatial_scatter reads by default — TODO confirm).
data.obsm["spatial"] = np.array([(x,y) for x,y in zip(data.obs["centroid_x"], data.obs["centroid_y"])])
features = ["gnai2","cnr1"]
groups = [source, target]
for i,ax in enumerate(axs):
    sq.pl.spatial_scatter(
        data[(data.obs["subclass"] == groups[i]),:],
        color = features[i],
        shape=None,
        size=10,
        ax=ax,
    )

In [None]:
import seaborn as sns
# Ligand score vs. receptor score for the selected source -> target pair,
# coloured by the combined interaction score.
subset_lr_scores = lr_scores.query(f"source == '{source}' & target == '{target}'")
sns.relplot(subset_lr_scores, kind="scatter", x="ligand_score", y="receptor_score", row="source", col="target", hue="score", palette="viridis", aspect=1.5,
            facet_kws=dict(margin_titles=True),)

# Let's implement a score that incorporates a distance metric


In [None]:
from torch.utils.data import DataLoader, Dataset
import torch

In [None]:
# Keep only the cells that belong to the source or the target cluster.
subdata = data[(data.obs["subclass"] == source) | (data.obs["subclass"] == target),:]
top_k_interactions = 200
print(lr_df.shape)
# Rank the source -> target L-R pairs by score and keep the top_k_interactions best ones.
potential_interactions = lr_scores.query(f"source == '{source}' & target == '{target}'").loc[:,["ligand","receptor","score"]].sort_values(by="score",ascending=False).head(top_k_interactions)
print(potential_interactions)

In [None]:
n_spatial_locations = subdata.shape[0]
n_potential_interactions = potential_interactions.shape[0]
# Dense float32 tensor of zeros with shape (locations, locations, interactions);
# its size grows quadratically with the number of cells.
x_lr = torch.zeros((n_spatial_locations, n_spatial_locations, n_potential_interactions), dtype=torch.float32)

In [None]:
# Scratch cell: tensor indexing, concatenation and matrix products in torch.
t = torch.ones(3, 4, 5, dtype=torch.float32)
print(f"Shape: {t.size()}")
print(f"First row 2D Matrix: {t[0,:,:]}")
# Zero out position 0 along the second dimension, for every slice.
t[:, 0] = 0
# Concatenate the tensor with itself along dim 1.
t1 = torch.cat((t, t), 1)
# Three equivalent ways of multiplying the first feature plane by its transpose:
# the method form, the @ operator, and torch.matmul writing into an existing tensor.
y1 = t[..., 0].matmul(t[..., 0].T)
y2 = t[..., 0] @ t[..., 0].T
y3 = torch.rand_like(y1)
torch.matmul(t[..., 0], t[..., 0].T, out=y3)

In [None]:
# Display the selected candidate interactions.
potential_interactions

In [None]:
# Placeholder; single_lr is reassigned later when the score is actually computed.
single_lr = torch.zeros((n_spatial_locations, n_spatial_locations), dtype=torch.float32)
which_interaction_index = 0
# Expression of the chosen ligand / receptor gene across the cells of subdata.
# NOTE(review): torch.from_numpy needs a numpy array, so this assumes .X is dense — confirm.
ligand_mask = torch.from_numpy(subdata[:, potential_interactions.iloc[which_interaction_index]["ligand"]].X)
receptor_mask = torch.from_numpy(subdata[:, potential_interactions.iloc[which_interaction_index]["receptor"]].X)
spatial_position_matrix = torch.tensor(subdata.obsm["spatial"], dtype=torch.float32)
print(spatial_position_matrix.shape)
# Pairwise Euclidean (p=2) distances between all cell positions.
dist_matrix = torch.cdist(spatial_position_matrix, spatial_position_matrix, p=2)
print(dist_matrix.shape)
# TODO: find a way to display these distances in a 2D plot


In [None]:
# Ligand expression times receptor expression for every ordered cell pair
# (an outer product if both masks are (n_cells, 1) columns — TODO confirm shapes),
# divided by the pairwise distance; the 1e-5 offset avoids division by zero.
single_lr = torch.div(ligand_mask @ receptor_mask.T, dist_matrix + 1e-5)
# set the diagonal to zero
single_lr = single_lr.fill_diagonal_(0)


In [None]:
# Distribution of all pairwise scores, followed by their 95th percentile.
plt.boxplot(single_lr.numpy().flatten())
plt.show()
print(np.quantile(single_lr.numpy().flatten(), 0.95))

In [None]:
import networkx as nx
# Keep only scores strictly above the 95th percentile; all other entries become 0.
mask = single_lr > np.quantile(single_lr.numpy().flatten(), 0.95)
masked_single_lr = single_lr * mask
# Build a graph from the thresholded matrix and draw it with each cell at its spatial position.
g = nx.from_numpy_array(masked_single_lr.numpy())
pos = [(x,y) for x,y in zip(spatial_position_matrix[:,0].numpy(), spatial_position_matrix[:,1].numpy())]
nx.draw(g, pos, node_size=10, node_color="blue", alpha=0.5)

# Try to scale up

In [None]:
# Number of top-ranked interactions to process in this trial run.
try_top_interactions = 10
# From here on the positions and distances cover every cell of `data`, not only `subdata`.
spatial_position_matrix = torch.tensor(data.obsm["spatial"], dtype=torch.float32)
dist_matrix = torch.cdist(spatial_position_matrix, spatial_position_matrix, p=2)

# first iteration
# Interaction 0 is handled outside the loop to initialise cells_inter_scores.
ligand_mask = torch.from_numpy(data[:, potential_interactions.iloc[0]["ligand"]].X.astype(np.float32))
receptor_mask = torch.from_numpy(data[:, potential_interactions.iloc[0]["receptor"]].X.astype(np.float32))
single_lr = torch.div(ligand_mask @ receptor_mask.T, dist_matrix + 1e-5)
single_lr = single_lr.fill_diagonal_(0)
# Keep only entries strictly above the 95th percentile of this interaction's scores.
mask = single_lr > np.quantile(single_lr.numpy().flatten(), 0.95)
masked_single_lr = single_lr * mask
cells_inter_scores = masked_single_lr.to_sparse()
for i,k in tqdm(enumerate(potential_interactions.index)):
    # Index 0 was processed above; stop once try_top_interactions have been handled.
    if i == 0:
        continue
    if i >= try_top_interactions:
        break
    ligand_mask = torch.from_numpy(data[:, potential_interactions.iloc[i]["ligand"]].X.astype(np.float32))
    receptor_mask = torch.from_numpy(data[:, potential_interactions.iloc[i]["receptor"]].X.astype(np.float32))
    single_lr = torch.div(ligand_mask @ receptor_mask.T, dist_matrix + 1e-5)
    single_lr = single_lr.fill_diagonal_(0)
    mask = single_lr > np.quantile(single_lr.numpy().flatten(), 0.95)
    masked_single_lr = single_lr * mask
    # Append this interaction's matrix along the third (depth) dimension.
    # NOTE(review): confirm torch.dstack accepts sparse tensors in the torch version in use.
    cells_inter_scores = torch.dstack([cells_inter_scores, masked_single_lr.to_sparse()])
print(type(cells_inter_scores), cells_inter_scores.shape)
print(cells_inter_scores)


In [None]:
which_interaction_index = 0
# learn how to slice a sparse tensor, which is not supported, but we could index it (?)
plt.figure(figsize=(40, 40))

# Create the graph from the adjacency matrix of the chosen interaction
# (the sparse stack is densified first so that it can be sliced).
adj = cells_inter_scores.to_dense()[:,:,which_interaction_index].numpy()
g = nx.from_numpy_array(adj)

# Get positions from spatial matrix
pos = [(x,y) for x,y in zip(spatial_position_matrix[:,0].numpy(), spatial_position_matrix[:,1].numpy())]

# Get edge weights from adjacency matrix for edge colors/transparency
edge_weights = nx.get_edge_attributes(g, 'weight')
if not edge_weights:  # If weights not in graph attributes, get from adjacency matrix
    edge_weights = {(i,j): adj[i,j] for i,j in g.edges()}

# Normalize weights to use as alpha values; a graph without edges gets no alphas
# (max() of an empty sequence would raise).
if edge_weights:
    max_weight = max(edge_weights.values())
    edge_alphas = [edge_weights[edge]/max_weight for edge in g.edges()]
else:
    edge_alphas = []

# Get node colors based on subclass
# Get unique subclasses
unique_subclasses = data.obs["subclass"].unique()

# Get Tableau color palette with enough colors for all subclasses
tableau_colors = sns.color_palette("tab20", n_colors=len(unique_subclasses))

# Create dictionary mapping subclasses to RGB colors
color_map = dict(zip(unique_subclasses, tableau_colors))

# Get node colors using the color map (node i corresponds to row i of data.obs)
node_colors = [color_map[data.obs["subclass"].iloc[i]] for i in range(len(g.nodes()))]

# Draw nodes
nx.draw_networkx_nodes(g, pos,
                      node_size=20,
                      node_color=node_colors,  # Color nodes by subclass
                      alpha=0.5)
# Draw edges with doubled, clipped transparency. `2*edge_alphas` on a Python list
# repeats the list instead of doubling its values, so scale element-wise and
# keep every alpha inside [0, 1].
if edge_alphas:
    nx.draw_networkx_edges(g, pos,
                          edge_color='gray',
                          width=1,
                          alpha=[float(np.clip(2 * alpha, 0.0, 1.0)) for alpha in edge_alphas])


plt.show()



In [None]:
from lightning import LightningDataModule
# The dataset is expected to be something like x_lr
# we will consider for each interaction a different image of n_locations x n_locations
class CustomInterScoreDataset(LightningDataModule):
    """Holds the interaction annotations read from a CSV file.

    NOTE(review): the class implements the ``__len__`` / ``__getitem__`` protocol
    of a map-style dataset but derives from ``LightningDataModule``; confirm
    which base class is intended. ``__getitem__`` is still a stub.
    """

    def __init__(self, annotations_file):
        """Read the annotations table from ``annotations_file`` (a CSV path)."""
        # Initialise the base class before attaching our own state.
        super().__init__()
        self.interaction_labels = pd.read_csv(annotations_file)

    def __len__(self):
        """Return the number of rows in the annotations table."""
        return len(self.interaction_labels)

    def __getitem__(self, idx):
        """Not implemented yet; currently returns ``None`` for every index."""
        pass





# Investigating LRTFTG interactions

In [None]:
import nichecompass as nc
# Extract the NicheNet ligand-receptor-target gene programs for mouse through nichecompass,
# passing the local human/mouse ortholog mapping file and skipping the gene-count plots.
lrt_interactions = nc.utils.extract_gp_dict_from_nichenet_lrt_interactions(
    species="mouse",
    gene_orthologs_mapping_file_path=Path(os.getcwd()).parents[0] / "data" / "raw" / "human_mouse_gene_orthologs.csv",
    plot_gp_gene_count_distributions=False,
)
# One row per key of the returned dictionary.
lrt_df = pd.DataFrame.from_dict(lrt_interactions, orient='index')

In [None]:
# Import nichenet from gr.csv and lr_sig.csv
gr_df = pd.read_csv(Path(os.getcwd()).parents[0] / "data" / "raw" / "gr.csv")
# Label every row "<from>&<to>".
gr_df.index = gr_df["from"] + "&" + gr_df["to"]

In [None]:
# Display the table read from gr.csv.
gr_df

In [None]:
# Distribution and range of the "weight" column of gr_df.
plt.boxplot(gr_df["weight"])
print(gr_df["weight"].min(), gr_df["weight"].max())

In [None]:
# Read lr_sig.csv and label every row "<from>&<to>".
lr_sig_df = pd.read_csv(Path(os.getcwd()).parents[0] / "data" / "raw" / "lr_sig.csv")
lr_sig_df.index = lr_sig_df["from"] + "&" + lr_sig_df["to"]

In [None]:
# Display the table read from lr_sig.csv.
lr_sig_df

In [None]:
# Distribution and range of the "weight" column of lr_sig_df.
plt.boxplot(lr_sig_df["weight"])
print(lr_sig_df["weight"].min(), lr_sig_df["weight"].max())

In [None]:
print(gr_df.info())
print(lr_sig_df.info())
print(f"Number of unique senders in grn: {len(gr_df["from"].unique())}/{len(gr_df["from"])} which is the {len(gr_df["from"].unique())/len(gr_df["from"])*100:.2f}%")
print(f"Number of unique receivers in grn: {len(gr_df["to"].unique())}/{len(gr_df["to"])} which is the {len(gr_df["to"].unique())/len(gr_df["to"])*100:.2f}%")
print(f"Number of unique ligands in lr_sig: {len(lr_sig_df["from"].unique())}/{len(lr_sig_df["from"])} which is the {len(lr_sig_df["from"].unique())/len(lr_sig_df["from"])*100:.2f}%")
print(f"Number of unique receptors in lr_sig: {len(lr_sig_df["to"].unique())}/{len(lr_sig_df["to"])} which is the {len(lr_sig_df["to"].unique())/len(lr_sig_df["to"])*100:.2f}%")
print(f"Number of intersection between target genes in grn and ligands in lr: {len(set(gr_df["to"].unique()).intersection(set(lr_sig_df["from"].unique())))}/" \
      + f"{len(set(gr_df["to"].unique()))} which is the {len(set(gr_df["to"].unique()).intersection(set(lr_sig_df["from"].unique())))/len(set(gr_df["to"].unique()))*100:.2f}%")
print(f"Number of intersection between senders in grn and ligands in lr: {len(set(gr_df["from"].unique()).intersection(set(lr_sig_df["from"].unique())))}/" \
      + f"{len(set(gr_df["from"].unique()))} which is the {len(set(gr_df["from"].unique()).intersection(set(lr_sig_df["from"].unique())))/len(set(gr_df["from"].unique()))*100:.2f}%")
print(f"Number of intersection between target genes in grn and receptors in lr: {len(set(gr_df["to"].unique()).intersection(set(lr_sig_df["to"].unique())))}/" \
      + f"{len(set(gr_df["to"].unique()))} which is the {len(set(gr_df["to"].unique()).intersection(set(lr_sig_df["to"].unique())))/len(set(gr_df["to"].unique()))*100:.2f}%")

# How many senders are not receivers
print(f"Number of senders that are not target genes in grn: {len(set(gr_df["from"].unique()).difference(set(gr_df["to"].unique())))}")
print(f"Number of target genes that are not senders in grn: {len(set(gr_df["to"].unique()).difference(set(gr_df["from"].unique())))}")
print(f"Number of ligands that are not receptors in lr: {len(set(lr_sig_df["from"].unique()).difference(set(lr_sig_df["to"].unique())))}")
print(f"Number of receptors that are not ligands in lr: {len(set(lr_sig_df["to"].unique()).difference(set(lr_sig_df["from"].unique())))}")

# How many intersections do i have with the lr from consensus mouse omnipath
print(f"Number of ligands that are reported in an interaction in omnipath and present in dataset: {
len(set(lr_sig_df["from"].str.lower().unique()).intersection(lr_df["ligand"].unique()))
}/{
len(lr_sig_df["from"].unique())} which is the {len(set(lr_sig_df["from"].str.lower().unique()).intersection(lr_df["ligand"].unique()))/len(lr_sig_df["from"].unique())*100:.2f}%")
print(f"Number of receptors that are reported in an interaction in omnipath and present in dataset: {len(set(lr_sig_df["to"].str.lower().unique()).intersection(lr_df["receptor"].unique()))}/" \
      + f"{len(lr_df["receptor"].unique())} which is the {len(set(lr_df["receptor"].unique()).intersection(lr_sig_df["to"].str.lower().unique()))/len(lr_df["receptor"].unique())*100:.2f}%")

In [None]:
# Compute element of venn diagram
senders_grn = set(gr_df["from"])
receivers_grn = set(gr_df["to"])
senders_lr = set(lr_sig_df["from"])
receivers_lr = set(lr_sig_df["to"])
# Core region: names present in all four sets. The previous expression intersected
# receivers_lr twice and left receivers_grn out, which made region g always empty.
i = senders_grn.intersection(receivers_grn).intersection(senders_lr).intersection(receivers_lr)
# Triple overlaps, without the core.
h = senders_lr.intersection(receivers_lr).intersection(receivers_grn) - i
f = senders_grn.intersection(senders_lr).intersection(receivers_grn) - i
e = senders_grn.intersection(receivers_grn).intersection(receivers_lr) - i
g = senders_grn.intersection(senders_lr).intersection(receivers_lr) - i
# Pairwise overlaps, without the triple overlaps and the core.
a = senders_grn.intersection(receivers_grn) - f - e - i
b = senders_grn.intersection(senders_lr) - f - g - i
c = senders_lr.intersection(receivers_lr) - g - i - h
d = receivers_lr.intersection(receivers_grn) - e - i - h
# Names of each set that fall in none of the regions above.
num_isolated_senders_grn = len(senders_grn - a - b - f - g - e - i)
num_isolated_senders_lr = len(senders_lr - b - c - f - g - h - i)
num_isolated_receivers_grn = len(receivers_grn - a - d - f - e - i - h)
num_isolated_receivers_lr = len(receivers_lr - c - d - g - h - i - e)

print(len(senders_grn), len(senders_lr), len(receivers_grn), len(receivers_lr))
print(num_isolated_senders_grn, num_isolated_senders_lr, num_isolated_receivers_grn, num_isolated_receivers_lr)
print([len(a), len(b), len(c), len(d), len(e), len(f), len(g), len(h), len(i)])

In [None]:
# Histogram of how many rows of gr_df share the same "to" value.
plt.hist(gr_df["to"].value_counts().values, bins=20)

In [None]:
# Directed sender -> receiver graph of the gene-regulatory table.
# The source node is taken from the "from" column: gr_df.index holds the
# "<from>&<to>" row labels, which are not gene names.
g = nx.DiGraph([(sender, receiver) for sender, receiver in zip(gr_df["from"], gr_df["to"])])
print(g)

In [None]:
# Directed ligand -> receptor graph built from the consensus resource.
g = nx.DiGraph()
g.add_edges_from(zip(lr_df["ligand"], lr_df["receptor"]))
print(g)
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(40, 40))
# Node size grows with degree: 10 units per incident edge.
node_sizes = [10 * degree for _, degree in g.degree()]
nx.draw(g, pos=nx.spring_layout(g), ax=axs, with_labels=True, font_size=8, node_size=node_sizes)

# Let's work with omnipath

In [None]:
from pypath import omnipath
from pypath import core
# Inspect the datasets attribute exposed by pypath's omnipath database object.
op = omnipath.db.datasets
print(op)

In [None]:
# Build the pypath Network from the "omnipath" resources, requesting its data frame.
omni_net = core.network.Network(resources="omnipath", make_df=True)

In [None]:
# NOTE(review): the Network was already created with make_df=True, so this call
# is presumably redundant — confirm.
omni_net.make_df()

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is not wrapped in print().
omni_net.df.info()
# Export the network table so later cells can reload it from disk.
omni_net.df.to_csv(Path(os.getcwd()).parents[0] / "data" / "raw" / "omni_net_human.csv")

In [None]:
# Reload the exported table; from here on omni_net is a pandas DataFrame, not a Network.
# NOTE(review): the CSV was written with its index, so reading it without index_col
# presumably adds an unnamed index column — confirm.
omni_net = pd.read_csv(Path(os.getcwd()).parents[0] / "data" / "raw" / "omni_net_human.csv")

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is not wrapped in print().
omni_net.info()

In [None]:
import pypath

In [None]:
# Query pypath's BioMart homology input for human -> mouse.
biomart_homology = pypath.inputs.biomart.biomart_homology(source_organism="human", target_organism="mouse")

In [None]:
# Human -> mouse orthology translator.
# NOTE(review): only `pypath` and `pypath.inputs.biomart` are imported in this notebook;
# confirm that `pypath.utils.orthology` is reachable without its own import.
biomart_homologene_class = pypath.utils.orthology.HomologeneOrthology(
    target="mouse", source="human",
)

In [None]:
# Display the reloaded network table.
omni_net

In [None]:
# Translate the identifiers in columns "id_a" and "id_b" with the human -> mouse orthology object.
translated_omni_net = biomart_homologene_class.translate_df(omni_net, cols=["id_a","id_b"])

In [None]:
# Display the translated network table.
translated_omni_net

In [None]:
# Distinct identifiers on each side of the translated network, and their union.
id_a_set, id_b_set = (
    set(translated_omni_net[column].unique()) for column in ("id_a", "id_b")
)
all_known_genes = id_a_set | id_b_set



In [None]:
# Dump the identifier set to a text file. The content is the Python string form of
# the set (NOTE(review): confirm this is the format the downstream ID-mapping step expects).
# Leaving the `with` block closes the file, so no explicit close() is needed.
with (Path(os.getcwd()).parents[0] / "data" / "raw" / "omni_network_swissprot_mouse_ids.txt").open("w") as f:
    f.write(str(all_known_genes))

In [None]:
# Read the ID-mapping table and index it by its "query" column.
map_uniprot_to_gene_symbol = pd.read_csv(Path(os.getcwd()).parents[0] / "data" / "raw" / "idmap.csv")
print(map_uniprot_to_gene_symbol.shape)
map_uniprot_to_gene_symbol.index = map_uniprot_to_gene_symbol["query"]

In [None]:
# Share of the network identifiers that appear as a "query" in the mapping table.
print(f"How many genes have we mapped: {len(set(map_uniprot_to_gene_symbol["query"]).intersection(all_known_genes))/len(all_known_genes)*100:.2f}%")

In [None]:
# Replace the identifiers in "id_a" / "id_b" with the "symbol" looked up by "query";
# identifiers missing from the mapping become NaN.
# NOTE: this overwrites the columns in place, so re-running the cell maps the
# already-mapped symbols again.
translated_omni_net["id_a"] = translated_omni_net["id_a"].map(map_uniprot_to_gene_symbol["symbol"].to_dict())
translated_omni_net["id_b"] = translated_omni_net["id_b"].map(map_uniprot_to_gene_symbol["symbol"].to_dict())

In [None]:
map_from_human_to_mouse_ortologs = pd.read_csv(Path(os.getcwd()).parents[0] / "data" / "raw" / "human_mouse_gene_orthologs.csv")
print(map_from_human_to_mouse_ortologs)
# omni_net is the DataFrame reloaded from omni_net_human.csv, so its columns are
# accessed directly; it has no `.df` attribute any more.
print(omni_net["id_a"].unique())
# Number of "Gene name" entries of the ortholog table that occur in column "id_b".
print(len(set(map_from_human_to_mouse_ortologs["Gene name"]).intersection(omni_net["id_b"])))

Omnipath can translate between a large variety of gene, protein, miRNA and small molecule IDs.

Omnipath can translate homologous genes, finding orthologs between two organisms.

Omnipath contains the following:
- omnipath (activity flow, enzyme-substrate, lr interactions)
- curated (PPI network)
- complex
- annotations (HUGE database)
- intercell (requires annotations)
- tf_target
- dorothea (collectri is newer)
- small_molecule
- tf_mirna
- mirna_mrna
- lncrna_mrna
- enz_sub


In [None]:
# Count the rows per value of the "effect" column.
translated_omni_net.effect.value_counts()

In [None]:
# Share of gene names in the full dataset (x_hat_s) and in the filtered dataset (data)
# that occur, lower-cased, as "id_a" / "id_b" in the translated network.
print(f"How many sources of omni_net are present in x hat s: " + \
      f"{len(set(translated_omni_net['id_a'].str.lower()).intersection(x_hat_s.var_names))/len(x_hat_s.var_names)*100:.2f}% of x hat s")
print(f"How many receivers of omni_net are present in database: " + \
      f"{len(set(translated_omni_net['id_b'].str.lower()).intersection(x_hat_s.var_names))/len(x_hat_s.var_names)*100:.2f}% of x hat s")
print(f"How many source of omni_net are present in x f: " + \
      f"{len(set(translated_omni_net["id_a"].str.lower()).intersection(data.var_names))/len(data.var_names)*100:.2f}% of x f")
print(f"How many receivers of omni_net are present in x f: " + \
      f"{len(set(translated_omni_net["id_b"].str.lower()).intersection(data.var_names))/len(data.var_names)*100:.2f}% of x f")

In [None]:
# Overlaps between the network's source names (id_a), its target names (id_b)
# and the genes of the filtered dataset; every comparison uses lower-cased names.
omni_sources = set(translated_omni_net["id_a"].str.lower())
omni_targets = set(translated_omni_net["id_b"].str.lower())
# d: names that are source, target and dataset gene at once.
d = omni_sources.intersection(omni_targets).intersection(data.var_names)
# a: names that are both source and target, outside d. The id_b side was
# previously compared without lower-casing, unlike every other line here.
a = omni_sources.intersection(omni_targets) - d
b = omni_sources.intersection(data.var_names) - d
c = omni_targets.intersection(data.var_names) - d
print(len(d), len(a), len(b), len(c))
# Remainders: dataset genes, sources and targets outside the regions above.
print(len(data.var_names) - len(b) - len(d) - len(c))
print(len(omni_sources) - len(a) - len(d) - len(b))
print(len(omni_targets) - len(a) - len(d) - len(c))

In [None]:
# Overlaps between the network's source names (id_a), its target names (id_b)
# and the genes of the full dataset; every comparison uses lower-cased names.
omni_sources = set(translated_omni_net["id_a"].str.lower())
omni_targets = set(translated_omni_net["id_b"].str.lower())
# d: names that are source, target and dataset gene at once.
d = omni_sources.intersection(omni_targets).intersection(x_hat_s.var_names)
# a: names that are both source and target, outside d. The id_b side was
# previously compared without lower-casing, unlike every other line here.
a = omni_sources.intersection(omni_targets) - d
b = omni_sources.intersection(x_hat_s.var_names) - d
c = omni_targets.intersection(x_hat_s.var_names) - d
print(len(d), len(a), len(b), len(c))
# Remainders: dataset genes, sources and targets outside the regions above.
print(len(x_hat_s.var_names) - len(b) - len(d) - len(c))
print(len(omni_sources) - len(a) - len(d) - len(b))
print(len(omni_targets) - len(a) - len(d) - len(c))

In [None]:
# Lower-cased names for the four roles being compared.
omni_sources = translated_omni_net["id_a"].str.lower()
omni_targets = translated_omni_net["id_b"].str.lower()
grn_regulators = gr_df["from"].str.lower()
grn_targets = gr_df["to"].str.lower()

# Names present in all four roles.
a = set(omni_sources).intersection(omni_targets).intersection(grn_regulators).intersection(grn_targets)

# Triple overlaps, without the four-way core.
g = set(omni_sources).intersection(omni_targets).intersection(grn_regulators) - a
h = set(omni_sources).intersection(grn_regulators).intersection(grn_targets) - a
i = set(omni_targets).intersection(grn_regulators).intersection(grn_targets) - a
f = set(omni_sources).intersection(omni_targets).intersection(grn_targets) - a

# Pairwise overlaps, without the triple overlaps and the core.
b = set(omni_sources).intersection(omni_targets) - g - f - a
c = set(omni_sources).intersection(grn_regulators) - g - h - a
d = set(grn_regulators).intersection(grn_targets) - h - i - a
e = set(omni_targets).intersection(grn_targets) - f - i - a

print(len(a), len(b), len(c), len(d), len(e), len(f), len(g), len(h), len(i))

In [None]:
# Names of each role that fall in none of the overlap regions a..i.
num_isolated_id_a = len(set(translated_omni_net["id_a"].str.lower()) -c-b-g-h-a-f)
num_isolated_id_b = len(set(translated_omni_net["id_b"].str.lower()) -b-e-g-f-a-i)
num_isolated_regulators = len(set(gr_df["from"].str.lower())-c-g-a-h-i-d)
num_isolated_regulated = len(set(gr_df["to"].str.lower())-e-f-a-h-i-d)
print(num_isolated_id_a, num_isolated_id_b, num_isolated_regulators, num_isolated_regulated)
# Total number of distinct lower-cased names per role, for reference.
print(len(set(translated_omni_net["id_a"].str.lower())),
      len(set(translated_omni_net["id_b"].str.lower())),
      len(set(gr_df["from"].str.lower())),
      len(set(gr_df["to"].str.lower())))

In [None]:
# Display the ligand-receptor resource table.
lr_df