# Approach 2

In [1]:
import shutil


# Check if this gets executed in google colab.
# If so, then we need to install pip packages and clone the repo and data
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True


if IN_COLAB:
    !git clone https://github.com/DavidWild02/BachelorThesis-ML-Gene-Interactions.git
    %cd ./BachelorThesis-ML-Gene-Interactions

    # copy data from drive folder.
    from google.colab import drive
    drive.mount('/content/drive')
    shutil.copytree("/content/drive/MyDrive/DavidWildBachelorthesis/data", "./data", dirs_exist_ok=True)

# install packages manually, because IPython and others cannot be installed, because that crashes the runtime
!pip install matplotlib numpy pandas seaborn anndata scanpy torch scikit-learn scipy tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import scanpy as sc
import pandas as pd
import numpy as np
import numpy.typing as npt
import torch
from IPython.display import display

np.random.seed(42) # set seed to make reproducible outputs

In [3]:
adata = sc.read_h5ad("./data/subdom_processed.h5ad")
adata

AnnData object with n_obs × n_vars = 2671 × 13144
    obs: 'n_genes_by_counts', 'total_counts', 'clusters', 'ct_num_exp_genes', 'ct_score', 'ct_pseudotime'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'ct_gene_corr', 'ct_correlates'
    uns: 'clusters_colors', 'clusters_sizes', 'ct_params', 'hvg', 'leiden', 'log1p', 'neighbors', 'paga', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'Ms', 'Mu', 'log_tpm', 'spliced', 'tpm', 'unspliced'
    obsp: 'connectivities', 'distances'

In [4]:
df_grn = pd.read_csv("./data/transcription_factor_adjacency_matrix.csv", index_col="gene_ids")
df_grn = df_grn.transpose()
df_grn.head()

gene_ids,SUB2.g1,SUB2.g2,SUB2.g11,SUB2.g16,SUB2.g18,SUB2.g21,SUB2.g22,SUB2.g26,SUB2.g27,SUB2.g31,...,SUB2.g12978,SUB2.g12979,SUB2.g12980,SUB2.g13052,SUB2.g13056,SUB2.g13090,SUB2.g13093,SUB2.g13119,SUB2.g13123,SUB2.g13134
SUB2.g8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SUB2.g231,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
SUB2.g755,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
SUB2.g902,1,1,1,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
SUB2.g924,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,1


## Approach 2.1 - Similarity between relative changes of means

In [5]:
def mmc(x: npt.ArrayLike, y: npt.ArrayLike) -> npt.ArrayLike:
    """
    Calculates the maximum mean change.
    """
    eta = 0.1e-10 # Used for avoiding divisions through zero
    return (y - x)/(np.maximum(x, y) + eta)

def score_similarity_relative_change(x: npt.ArrayLike, y: npt.ArrayLike) -> npt.ArrayLike:
    correlation_direction = np.sign(x * y)
    change_absolute_value = mmc(np.abs(x), np.abs(y))
    similarity_magnitude = 1 - np.abs(change_absolute_value)
    return correlation_direction * similarity_magnitude

def calculate_mean_change_similarity_matrix(cluster_a: npt.ArrayLike, cluster_b: npt.ArrayLike):
    mean_cluster_a = np.mean(cluster_a, axis=0)
    mean_cluster_b = np.mean(cluster_b, axis=0)
    relative_changes = mmc(mean_cluster_a, mean_cluster_b)
    n = relative_changes.shape[0]
    x = np.tile(relative_changes, (n, 1))
    similarity_matrix = score_similarity_relative_change(x, x.T)
    return similarity_matrix


In [None]:
from data_utils import extract_samples_of_cell_cluster

cluster_a_id = "7" # stem cells
cluster_b_id = "0" # transition cells
cluster_a = extract_samples_of_cell_cluster(adata, cluster_a_id)
cluster_b = extract_samples_of_cell_cluster(adata, cluster_b_id)

# Filter out only those genes used in the GRN. Avoid unnecessary computation
gene_indices = pd.unique(df_grn.index.append(df_grn.columns))
cluster_a = cluster_a.loc[:, gene_indices]
cluster_b = cluster_b.loc[:, gene_indices]

similarity_matrix = calculate_mean_change_similarity_matrix(cluster_a.values, cluster_b.values)
df_similarity_mean_change = pd.DataFrame(similarity_matrix, index=gene_indices, columns=gene_indices)
display(df_similarity_mean_change.head())

Unnamed: 0,SUB2.g8,SUB2.g231,SUB2.g755,SUB2.g902,SUB2.g924,SUB2.g940,SUB2.g1013,SUB2.g1081,SUB2.g1118,SUB2.g1198,...,SUB2.g12978,SUB2.g12979,SUB2.g12980,SUB2.g13052,SUB2.g13056,SUB2.g13090,SUB2.g13093,SUB2.g13119,SUB2.g13123,SUB2.g13134
SUB2.g8,1.0,-0.247332,-0.362465,0.445421,-0.870464,-0.517489,0.320999,-0.023565,0.353407,0.404589,...,0.143572,0.0,0.0,0.654117,0.612221,0.195182,0.790806,0.334613,0.246729,0.227296
SUB2.g231,-0.247332,1.0,0.682361,-0.110167,0.215293,0.477946,-0.079393,0.005828,-0.087409,-0.100068,...,-0.580485,0.0,0.0,-0.378115,-0.151422,-0.789151,-0.195592,-0.739159,-0.997561,-0.918993
SUB2.g755,-0.362465,0.682361,1.0,-0.161449,0.315512,0.70043,-0.116351,0.008542,-0.128098,-0.146649,...,-0.3961,0.0,0.0,-0.554128,-0.221909,-0.538486,-0.286639,-0.923159,-0.680697,-0.627085
SUB2.g902,0.445421,-0.110167,-0.161449,1.0,-0.511705,-0.2305,0.720665,-0.052905,0.793424,0.90833,...,0.06395,0.0,0.0,0.291357,0.727548,0.086938,0.563249,0.149043,0.109898,0.101242
SUB2.g924,-0.870464,0.215293,0.315512,-0.511705,1.0,0.450455,-0.368768,0.027072,-0.405999,-0.464797,...,-0.124974,0.0,0.0,-0.569385,-0.703328,-0.169899,-0.908488,-0.291268,-0.214768,-0.197853


In [7]:
refined_grn_mean_change = pd.DataFrame(np.zeros(df_grn.shape), index=df_grn.index, columns=df_grn.columns)
mask = df_grn == 1
refined_grn_mean_change[mask] = df_similarity_mean_change.loc[df_grn.index, df_grn.columns][mask]
display(refined_grn_mean_change.head())

gene_ids,SUB2.g1,SUB2.g2,SUB2.g11,SUB2.g16,SUB2.g18,SUB2.g21,SUB2.g22,SUB2.g26,SUB2.g27,SUB2.g31,...,SUB2.g12978,SUB2.g12979,SUB2.g12980,SUB2.g13052,SUB2.g13056,SUB2.g13090,SUB2.g13093,SUB2.g13119,SUB2.g13123,SUB2.g13134
SUB2.g8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SUB2.g231,0.0,0.0,0.0,0.0,0.0,0.166565,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.151422,-0.789151,0.0,0.0,0.0,0.0
SUB2.g755,0.0,0.0,-0.766945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.554128,0.0,0.0,0.0,0.0,0.0,0.0
SUB2.g902,0.052816,0.1951,0.21051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.291357,0.727548,0.0,0.0,0.0,0.0,0.0
SUB2.g924,0.0,0.0,-0.411389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.169899,0.0,-0.291268,-0.214768,-0.197853


## Approach 2.2 - Maximum mean difference discrepancy

In [None]:
from tqdm import tqdm

def mean_kernel_matrix(x, y, sigma = 1., device=None):
    """
        Uses the radial basis function kernel
    """
    n, d = x.shape
    m, _ = y.shape

    denominator = 2 * sigma ** 2

    # cannot use vectorization here, because it needs to much space (97 GB)
    summed_kernels = torch.zeros((d, d), device=device)
    for i in tqdm(range(n), desc="Calculating mean kernel matrix..."):
        x_i = x[i, :, None]
        for j in range(m):
            y_j = y[j, None, :]
            distance_matrix = (x_i - y_j) ** 2
            kernel_matrix = torch.exp(-distance_matrix / denominator)
            summed_kernels += kernel_matrix
    return summed_kernels / (n * m)


def calculate_mmdd_similarity_matrix(cluster_a, cluster_b, sigma = 1.):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if isinstance(cluster_a, np.ndarray):
        cluster_a = torch.from_numpy(cluster_a).float()
    if isinstance(cluster_b, np.ndarray):
        cluster_b = torch.from_numpy(cluster_b).float()
    cluster_a = cluster_a.to(device)
    cluster_b = cluster_b.to(device)

    k_aa = mean_kernel_matrix(cluster_a, cluster_a, sigma=sigma, device=device)
    k_bb = mean_kernel_matrix(cluster_b, cluster_b, sigma=sigma, device=device)
    k_ab = mean_kernel_matrix(cluster_a, cluster_b, sigma=sigma, device=device)
    k_ba = k_ab.T

    k_diag = torch.diag(k_bb) - 2 * torch.diag(k_ba) + torch.diag(k_aa)

    squared_mmdd_matrix = (
        k_diag[:, None] + k_diag[None, :]
        - 2. * k_bb - 2. * k_aa
        + 2. * k_ab + 2. * k_ba
    )

    UPPER_BOUND_KERNEL = 1 # For RBF kernel the upper bound is 1
    UPPER_BOUND_MMDD = 8 * UPPER_BOUND_KERNEL # used as normalization factor, so that result is between -1 and +1
    correlation_directions = torch.sign(squared_mmdd_matrix)
    mmdd_matrix = torch.sqrt(torch.abs(squared_mmdd_matrix) / UPPER_BOUND_MMDD )

    similarity_matrix = correlation_directions * (1. - mmdd_matrix)

    return similarity_matrix.cpu().numpy()

In [9]:
# Only use a subset of the samples from the clusters, because else it takes ages to compute
cluster_a_samples = np.random.permutation(cluster_a.values)[:100]
cluster_b_samples = np.random.permutation(cluster_b.values)[:100]

similarity_matrix = calculate_mmdd_similarity_matrix(cluster_a_samples, cluster_b_samples)
df_similarity_mmdd = pd.DataFrame(similarity_matrix, index=gene_indices, columns=gene_indices)
display(df_similarity_mmdd.head())

Calculating mean kernel matrix...: 100%|██████████| 100/100 [24:37<00:00, 14.77s/it]
Calculating mean kernel matrix...: 100%|██████████| 100/100 [20:42<00:00, 12.42s/it]
Calculating mean kernel matrix...: 100%|██████████| 100/100 [20:17<00:00, 12.17s/it]
  correlation_directions = np.sign(squared_mmdd_matrix)
  mmdd_matrix = np.abs(squared_mmdd_matrix) // 2


Unnamed: 0,SUB2.g8,SUB2.g231,SUB2.g755,SUB2.g902,SUB2.g924,SUB2.g940,SUB2.g1013,SUB2.g1081,SUB2.g1118,SUB2.g1198,...,SUB2.g12978,SUB2.g12979,SUB2.g12980,SUB2.g13052,SUB2.g13056,SUB2.g13090,SUB2.g13093,SUB2.g13119,SUB2.g13123,SUB2.g13134
SUB2.g8,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
SUB2.g231,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
SUB2.g755,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
SUB2.g902,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
SUB2.g924,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
refined_grn_mmdd = pd.DataFrame(np.zeros(df_grn.shape), index=df_grn.index, columns=df_grn.columns)
mask = df_grn == 1
refined_grn_mmdd[mask] = df_similarity_mmdd.loc[df_grn.index, df_grn.columns][mask]
display(refined_grn_mmdd.head())

gene_ids,SUB2.g1,SUB2.g2,SUB2.g11,SUB2.g16,SUB2.g18,SUB2.g21,SUB2.g22,SUB2.g26,SUB2.g27,SUB2.g31,...,SUB2.g12978,SUB2.g12979,SUB2.g12980,SUB2.g13052,SUB2.g13056,SUB2.g13090,SUB2.g13093,SUB2.g13119,SUB2.g13123,SUB2.g13134
SUB2.g8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SUB2.g231,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
SUB2.g755,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
SUB2.g902,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
SUB2.g924,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0
