# Approach 2

In [1]:
import setup # sets up packages and environment in colab




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [20]:
import scanpy as sc
import pandas as pd
import numpy as np
import numpy.typing as npt
import torch
from IPython.display import display

np.random.seed(42) # set seed to make reproducible outputs

In [3]:
# Load the AnnData object from the h5ad file; the bare last expression
# makes the notebook display its summary.
adata = sc.read_h5ad("./data/subdom_processed.h5ad")
adata

AnnData object with n_obs × n_vars = 2671 × 13144
    obs: 'n_genes_by_counts', 'total_counts', 'clusters', 'ct_num_exp_genes', 'ct_score', 'ct_pseudotime'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'ct_gene_corr', 'ct_correlates'
    uns: 'clusters_colors', 'clusters_sizes', 'ct_params', 'hvg', 'leiden', 'log1p', 'neighbors', 'paga', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'Ms', 'Mu', 'log_tpm', 'spliced', 'tpm', 'unspliced'
    obsp: 'connectivities', 'distances'

In [4]:
# Read the adjacency matrix CSV, using its "gene_ids" column as the row index.
df_grn = pd.read_csv("./data/transcription_factor_adjacency_matrix.csv", index_col="gene_ids")
# Swap rows and columns of the matrix.
# NOTE(review): presumably the rows are transcription factors and the columns
# their target genes after this transposition - confirm against the CSV.
df_grn = df_grn.transpose()
df_grn.head()

gene_ids,SUB2.g1,SUB2.g2,SUB2.g11,SUB2.g16,SUB2.g18,SUB2.g21,SUB2.g22,SUB2.g26,SUB2.g27,SUB2.g31,...,SUB2.g12978,SUB2.g12979,SUB2.g12980,SUB2.g13052,SUB2.g13056,SUB2.g13090,SUB2.g13093,SUB2.g13119,SUB2.g13123,SUB2.g13134
SUB2.g8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SUB2.g231,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
SUB2.g755,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
SUB2.g902,1,1,1,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
SUB2.g924,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,1


## Approach 2.1 - Similarity between relative changes of means

In [5]:
def mmc(x: npt.ArrayLike, y: npt.ArrayLike) -> npt.ArrayLike:
    """
    Calculates the maximum mean change from ``x`` to ``y``.

    Element-wise, this is the difference ``y - x`` divided by the larger of
    the two values ``max(x, y)`` (plus a tiny offset).

    Args:
        x: Start values.
        y: End values; must broadcast against ``x``.

    Returns:
        The element-wise change ``(y - x) / (max(x, y) + offset)``.
    """
    zero_guard = 1e-11  # Tiny offset used for avoiding divisions through zero
    difference = y - x
    larger_value = np.maximum(x, y)
    return difference / (larger_value + zero_guard)

def score_similarity_relative_change(x: npt.ArrayLike, y: npt.ArrayLike) -> npt.ArrayLike:
    """
    Scores how similar two relative changes are, element-wise.

    The magnitude part is ``1 - |mmc(|x|, |y|)|``, i.e. it is 1 when the
    absolute values agree and shrinks as they diverge. The sign part is
    ``sign(x * y)``: positive when both changes point in the same direction,
    negative when they oppose, and 0 when either change is 0.

    Args:
        x: First set of relative changes.
        y: Second set of relative changes; must broadcast against ``x``.

    Returns:
        The signed similarity ``sign(x * y) * (1 - |mmc(|x|, |y|)|)``.
    """
    magnitude_change = mmc(np.abs(x), np.abs(y))
    magnitude_similarity = 1 - np.abs(magnitude_change)
    direction = np.sign(x * y)
    return direction * magnitude_similarity

def calculate_mean_change_similarity_matrix(cluster_a: npt.ArrayLike, cluster_b: npt.ArrayLike) -> np.ndarray:
    """
    Builds the pairwise similarity matrix of the per-column relative mean changes.

    For every column the mean over the rows is taken in both clusters and the
    relative change between the two means is computed with ``mmc``. Every
    pair of columns is then scored with ``score_similarity_relative_change``.

    Args:
        cluster_a: 2-D array; rows are averaged, one mean per column.
        cluster_b: 2-D array with the same number of columns as ``cluster_a``.

    Returns:
        Square matrix with one row and one column per input column; entry
        ``[i, j]`` is the score for the relative changes of columns ``j`` and ``i``.
    """
    mean_cluster_a = np.mean(cluster_a, axis=0)
    mean_cluster_b = np.mean(cluster_b, axis=0)
    relative_changes = mmc(mean_cluster_a, mean_cluster_b)

    # Broadcasting a (1, n) view against an (n, 1) view pairs up every two
    # columns without first materializing an (n, n) tiled copy of the vector.
    changes_as_row = relative_changes[None, :]
    changes_as_column = relative_changes[:, None]
    return score_similarity_relative_change(changes_as_row, changes_as_column)


In [None]:
from data_utils import extract_samples_of_cell_cluster

# Labels of the two cell clusters that are compared.
cluster_a_id = "7" # stem cells
cluster_b_id = "0" # transition cells
# NOTE(review): extract_samples_of_cell_cluster lives in data_utils and is not
# shown here; its result is used below like a DataFrame (.values, .columns).
cluster_a = extract_samples_of_cell_cluster(adata, cluster_a_id)
cluster_b = extract_samples_of_cell_cluster(adata, cluster_b_id)

# Pairwise similarity of the relative mean changes between the two clusters.
similarity_matrix = calculate_mean_change_similarity_matrix(cluster_a.values, cluster_b.values)
# Label both axes of the square matrix with cluster_a's column names.
df_similarity = pd.DataFrame(similarity_matrix, index=cluster_a.columns, columns=cluster_a.columns)
display(df_similarity.head())

Unnamed: 0,SUB2.g1,SUB2.g2,SUB2.g3,SUB2.g4,SUB2.g5,SUB2.g6,SUB2.g7,SUB2.g8,SUB2.g9,SUB2.g10,...,SUB2.g13135,SUB2.g13136,SUB2.g13137,SUB2.g13138,SUB2.g13139,SUB2.g13140,SUB2.g13141,SUB2.g13142,SUB2.g13143,SUB2.g13144
SUB2.g1,1.0,0.270713,-0.731464,-0.385749,0.257416,0.469152,0.139703,0.118576,0.51649,0.23664,...,0.0,0.0,0.0,0.186994,0.0,0.0,0.0,0.0,-0.138283,0.113126
SUB2.g2,0.270713,1.0,-0.370097,-0.104427,0.950881,0.577026,0.516057,0.438013,0.52414,0.874135,...,0.0,0.0,0.0,0.690746,0.0,0.0,0.0,0.0,-0.51081,0.417883
SUB2.g3,-0.731464,-0.370097,1.0,0.282162,-0.351919,-0.641387,-0.190992,-0.162107,-0.706104,-0.323515,...,0.0,0.0,0.0,-0.255643,0.0,0.0,0.0,0.0,0.189049,-0.154657
SUB2.g4,-0.385749,-0.104427,0.282162,1.0,-0.099298,-0.180975,-0.053891,-0.04574,-0.199236,-0.091284,...,0.0,0.0,0.0,-0.072133,0.0,0.0,0.0,0.0,0.053343,-0.043638
SUB2.g5,0.257416,0.950881,-0.351919,-0.099298,1.0,0.548684,0.542715,0.460639,0.498395,0.919289,...,0.0,0.0,0.0,0.726427,0.0,0.0,0.0,0.0,-0.537197,0.439469


In [9]:
# Start from an all-zero frame with the same shape and labels as df_grn.
refined_grn = pd.DataFrame(np.zeros(df_grn.shape), index=df_grn.index, columns=df_grn.columns)
# Boolean mask of the entries that equal 1 in df_grn.
mask = df_grn == 1
# Where the mask is True, copy in the similarity score looked up by the same
# row/column labels; all other entries stay 0.
# NOTE: this reads whatever df_similarity is currently bound to; the name is
# reassigned by the Approach 2.2 cell further down.
refined_grn[mask] = df_similarity.loc[df_grn.index, df_grn.columns][mask]
display(refined_grn.head())

gene_ids,SUB2.g1,SUB2.g2,SUB2.g11,SUB2.g16,SUB2.g18,SUB2.g21,SUB2.g22,SUB2.g26,SUB2.g27,SUB2.g31,...,SUB2.g12978,SUB2.g12979,SUB2.g12980,SUB2.g13052,SUB2.g13056,SUB2.g13090,SUB2.g13093,SUB2.g13119,SUB2.g13123,SUB2.g13134
SUB2.g8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SUB2.g231,0.0,0.0,0.0,0.0,0.0,0.166565,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.151422,-0.789151,0.0,0.0,0.0,0.0
SUB2.g755,0.0,0.0,-0.766945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.554128,0.0,0.0,0.0,0.0,0.0,0.0
SUB2.g902,0.052816,0.1951,0.21051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.291357,0.727548,0.0,0.0,0.0,0.0,0.0
SUB2.g924,0.0,0.0,-0.411389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.169899,0.0,-0.291268,-0.214768,-0.197853


## Approach 2.2 - Maximum mean difference discrepancy

In [None]:
def mean_kernel_matrix(x, y, sigma = 1., device=None):
    """
    Averages the pairwise Gaussian kernel matrices over all sample pairs.

    For every pair of rows ``x[i]`` and ``y[j]`` a ``(d, d)`` matrix with the
    entries ``exp(-(x[i, p] - y[j, q]) ** 2 / (2 * sigma ** 2))`` is built.
    The result is the mean of these matrices over all ``n * m`` pairs.

    Args:
        x: 2-D tensor of shape ``(n, d)``.
        y: 2-D tensor of shape ``(m, d)``.
        sigma: Bandwidth of the Gaussian kernel; must be non-zero.
        device: Device on which the computation runs. Defaults to the device
            of ``x``. ``x`` and ``y`` are moved there if necessary.

    Returns:
        Tensor of shape ``(d, d)`` located on the chosen device.

    Raises:
        ValueError: If an input is not 2-D, the column counts differ, an
            input has no rows, or ``sigma`` is 0.
    """
    if x.dim() != 2 or y.dim() != 2:
        raise ValueError("x and y must both be 2-D (samples x features)")
    n, d = x.shape
    m, d_y = y.shape
    if d != d_y:
        # Without this check a single-column y would silently broadcast.
        raise ValueError(f"x and y must have the same number of features, got {d} and {d_y}")
    if n == 0 or m == 0:
        # Averaging over zero pairs would yield 0 / 0 = NaN.
        raise ValueError("x and y must each contain at least one sample")
    if sigma == 0:
        raise ValueError("sigma must be non-zero")

    # Keep the accumulator and both inputs on one device; falling back to the
    # default device would clash with inputs that already live elsewhere.
    if device is None:
        device = x.device
    x = x.to(device)
    y = y.to(device)

    denominator = 2 * sigma ** 2

    # cannot vectorize over the sample pairs here, because it needs too much space (97 GB)
    summed_kernels = torch.zeros((d, d), device=device)

    for i in range(n):
        x_i = x[i, :, None]  # column vector, shape (d, 1)
        for j in range(m):
            y_j = y[j, None, :]  # row vector, shape (1, d)
            distance_matrix = (x_i - y_j) ** 2
            summed_kernels += torch.exp(-distance_matrix / denominator)
    return summed_kernels / (n * m)

            
def calculate_mmdd_matrix(cluster_a, cluster_b, device=None):
    """
    Calculates the pairwise maximum mean difference discrepancy between columns.

    Writing ``<A_p, B_q>`` for the mean kernel value between column ``p`` of
    ``cluster_a`` and column ``q`` of ``cluster_b`` (as returned by
    ``mean_kernel_matrix``), the change of column ``p`` is ``D_p = A_p - B_p``
    and entry ``[p, q]`` of the result is::

        ||D_p - D_q||^2 = ||D_p||^2 + ||D_q||^2 - 2 * <D_p, D_q>
        <D_p, D_q>      = k_aa[p, q] - k_ab[p, q] - k_ba[p, q] + k_bb[p, q]

    so the matrix is symmetric with a zero diagonal.

    NOTE(review): the previous version used ``+ 2 * k_bb`` and ``- 2 * k_ab``,
    which does not match this expansion (it is neither symmetric nor zero on
    the diagonal). This assumes ``||D_p - D_q||^2`` is the intended quantity -
    confirm against the method's derivation.

    Args:
        cluster_a: 2-D numpy array or tensor (rows x columns). Numpy input is
            converted to a float32 tensor.
        cluster_b: 2-D numpy array or tensor with the same number of columns.
        device: Optional device the tensors are moved to before computing.

    Returns:
        Numpy array of shape ``(columns, columns)``.
    """
    if isinstance(cluster_a, np.ndarray):
        cluster_a = torch.from_numpy(cluster_a).float()
    if isinstance(cluster_b, np.ndarray):
        cluster_b = torch.from_numpy(cluster_b).float()
    cluster_a = cluster_a.to(device)
    cluster_b = cluster_b.to(device)

    k_aa = mean_kernel_matrix(cluster_a, cluster_a, device=device)
    k_bb = mean_kernel_matrix(cluster_b, cluster_b, device=device)
    k_ab = mean_kernel_matrix(cluster_a, cluster_b, device=device)
    k_ba = k_ab.T

    # ||D_p||^2 for every column p
    k_diag = torch.diag(k_aa) - 2 * torch.diag(k_ab) + torch.diag(k_bb)

    # ||D_p||^2 + ||D_q||^2 - 2 * <D_p, D_q>
    result_matrix = (
        k_diag[:, None] + k_diag[None, :]
        - 2 * k_aa - 2 * k_bb
        + 2 * k_ab + 2 * k_ba
    )
    return result_matrix.cpu().numpy()

def calculate_mmdd_similarity_matrix(cluster_a: npt.ArrayLike, cluster_b: npt.ArrayLike, device=None):
    """
    Turns the pairwise MMDD matrix into a signed similarity matrix.

    The magnitude is ``1 - mmdd / (4 * C)`` and the sign of entry ``[p, q]``
    is the product of the signs of the relative mean changes of columns ``p``
    and ``q`` between the two clusters.

    Args:
        cluster_a: 2-D array (rows x columns).
        cluster_b: 2-D array with the same number of columns as ``cluster_a``.
        device: Optional device forwarded to ``calculate_mmdd_matrix``.

    Returns:
        Array of shape ``(columns, columns)``.
    """
    mmdd_matrix = calculate_mmdd_matrix(cluster_a, cluster_b, device=device)

    # Average over the rows (axis=0) to get one mean per column, so that the
    # sign matrix has the same (columns, columns) shape as the MMDD matrix.
    mean_cluster_a = np.mean(cluster_a, axis=0)
    mean_cluster_b = np.mean(cluster_b, axis=0)
    relative_changes = mmc(mean_cluster_a, mean_cluster_b)

    # Outer product of the signs gives the pairwise direction matrix; a plain
    # ``@`` on the 1-D vector would collapse to a single scalar instead.
    change_signs = np.sign(relative_changes)
    correlation_directions = np.outer(change_signs, change_signs)

    C = 1 # The gaussian kernel has an upper bound C of 1
    # Same value as the former ``8 // 2 * C``.
    # NOTE(review): presumably 8 * C bounds the MMDD, so dividing by 4 * C
    # maps the magnitude into [-1, 1] - confirm.
    normalization_factor = 1 / (4 * C)
    return correlation_directions * (1 - normalization_factor * mmdd_matrix)

In [19]:
# MMDD-based similarity for the same two clusters, wrapped in a DataFrame that
# is labelled with cluster_a's column names on both axes.
# NOTE: this rebinds similarity_matrix and df_similarity, replacing the
# Approach 2.1 results that were stored under the same names.
similarity_matrix = calculate_mmdd_similarity_matrix(cluster_a.values, cluster_b.values)
df_similarity = pd.DataFrame(similarity_matrix, index=cluster_a.columns, columns=cluster_a.columns)
display(df_similarity.head())

0
1


KeyboardInterrupt: 