# Approach 2

In [1]:
import setup # sets up packages and environment in colab




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
import scanpy as sc
import pandas as pd
import numpy as np
import numpy.typing as npt
from IPython.display import display

np.random.seed(42) # seed NumPy's legacy global RNG so np.random draws are reproducible across runs

In [None]:
# Load the AnnData object stored in ./data/subdom_processed.h5ad.
# The bare `adata` on the last line makes the notebook display its summary.
adata = sc.read_h5ad("./data/subdom_processed.h5ad")
adata

AnnData object with n_obs × n_vars = 2671 × 13144
    obs: 'n_genes_by_counts', 'total_counts', 'clusters', 'ct_num_exp_genes', 'ct_score', 'ct_pseudotime'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'ct_gene_corr', 'ct_correlates'
    uns: 'clusters_colors', 'clusters_sizes', 'ct_params', 'hvg', 'leiden', 'log1p', 'neighbors', 'paga', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'Ms', 'Mu', 'log_tpm', 'spliced', 'tpm', 'unspliced'
    obsp: 'connectivities', 'distances'

In [4]:
# Read the adjacency matrix CSV (indexed by its "gene_ids" column) and transpose
# it in the same expression, so the CSV's columns become the rows of `df_grn`.
df_grn = pd.read_csv(
    "./data/transcription_factor_adjacency_matrix.csv",
    index_col="gene_ids",
).T
df_grn.head()

gene_ids,SUB2.g1,SUB2.g2,SUB2.g11,SUB2.g16,SUB2.g18,SUB2.g21,SUB2.g22,SUB2.g26,SUB2.g27,SUB2.g31,...,SUB2.g12978,SUB2.g12979,SUB2.g12980,SUB2.g13052,SUB2.g13056,SUB2.g13090,SUB2.g13093,SUB2.g13119,SUB2.g13123,SUB2.g13134
SUB2.g8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SUB2.g231,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
SUB2.g755,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
SUB2.g902,1,1,1,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
SUB2.g924,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,1


## Approach 2.1 - Similarity between relative changes of means

In [None]:
def mmc(x: npt.ArrayLike, y: npt.ArrayLike) -> npt.ArrayLike:
    """
    Calculates the maximum mean change.

    The change from ``x`` to ``y`` is expressed relative to the larger of the
    two values, element-wise: ``(y - x) / max(x, y)``. For non-negative inputs
    the result lies in ``[-1, 1]``.

    Parameters
    ----------
    x, y
        Scalars or array-likes that broadcast against each other.

    Returns
    -------
    The element-wise relative change. Wherever ``x == y`` the result is ``0``
    (no change), including the ``0 / 0`` case that would otherwise be NaN.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    difference = y - x
    # 0 / 0 raises an "invalid value" warning and yields NaN; those entries are
    # overwritten below, so the warning is only noise here.
    with np.errstate(invalid="ignore"):
        ratio = difference / np.maximum(x, y)
    # Equal inputs mean "no change", regardless of the denominator.
    # `[()]` turns a 0-d result back into a scalar and leaves arrays untouched.
    return np.where(difference == 0, 0.0, ratio)[()]

def score_similarity_relative_change(x: npt.ArrayLike, y: npt.ArrayLike) -> npt.ArrayLike:
    """
    Scores, element-wise, how similar two relative changes are.

    The sign of the score is the sign of ``x * y`` (positive when both changes
    have the same sign, negative when they differ). The magnitude is
    ``1 - |mmc(|x|, |y|)|``: 1 when the absolute values are equal and 0 when
    one of them is zero.

    Parameters
    ----------
    x, y
        Scalars or array-likes that broadcast against each other.

    Returns
    -------
    The signed similarity score. It is ``0`` wherever ``x * y`` is zero,
    including ``x == y == 0``.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    correlation_direction = np.sign(x * y)
    # mmc may hit 0 / 0 when both values are zero; those entries are replaced
    # below, so silence the resulting "invalid value" warning.
    with np.errstate(invalid="ignore"):
        change_absolute_value = mmc(np.abs(x), np.abs(y))
    similarity_magnitude = 1 - np.abs(change_absolute_value)
    # With no direction there is no similarity to report. Setting the score to
    # 0 explicitly avoids 0 * NaN = NaN when both inputs are zero.
    return np.where(
        correlation_direction == 0,
        0.0,
        correlation_direction * similarity_magnitude,
    )[()]

def calculate_mean_change_similarity_matrix(cluster_a: npt.ArrayLike, cluster_b: npt.ArrayLike):
    """
    Builds the pairwise similarity matrix of relative mean changes.

    Each row of ``cluster_a`` and ``cluster_b`` is reduced to its mean
    (``axis=1``), the relative change between the two row means is computed
    with ``mmc``, and every pair of changes is scored with
    ``score_similarity_relative_change``.

    NOTE(review): averaging along axis 1 treats rows as the entities being
    compared, whereas ``calculate_mmdd_matrix`` in this notebook indexes genes
    along axis 1 -- confirm which orientation callers pass.

    Returns
    -------
    For 1-D row means of length ``n``, an ``(n, n)`` array whose entry
    ``[i, j]`` scores change ``j`` against change ``i``.
    """
    changes = mmc(np.mean(cluster_a, axis=1), np.mean(cluster_b, axis=1))
    size = changes.shape[0]
    # Every row of `tiled` is a copy of `changes`, so tiled[i, j] == changes[j]
    # and tiled.T[i, j] == changes[i]; together they enumerate all pairs.
    tiled = np.tile(changes, (size, 1))
    return score_similarity_relative_change(tiled, tiled.T)


## Approach 2.2 - Maximum mean difference discrepancy

In [None]:
FloatScalar = np.floating | float

# TODO: vectorize this to make it faster

def gaussian_kernel_scalar(x: FloatScalar, y: FloatScalar, sigma: FloatScalar = 1.) -> float:
    numerator = (x - y) ** 2
    denominator = 2 * sigma ** 2
    return np.exp(- numerator / denominator)

def mean_kernel(x: npt.ArrayLike, y: npt.ArrayLike, sigma: float = 1.) -> float:
    """
    Mean Gaussian kernel value over all pairs ``(x[i], y[j])``.

    Computes ``1 / (n * m) * sum_i sum_j k(x[i], y[j])`` where ``k`` is
    ``gaussian_kernel_scalar`` and ``n``, ``m`` are the lengths of ``x`` and
    ``y``.

    Parameters
    ----------
    x, y
        1-D samples; they may have different lengths.
    sigma
        Kernel width forwarded to ``gaussian_kernel_scalar`` (default 1, the
        value that was previously always used).

    Returns
    -------
    The mean kernel value as a floating point scalar.

    Raises
    ------
    ValueError
        If ``x`` or ``y`` is empty (the mean over zero pairs is undefined).
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if x.shape[0] == 0 or y.shape[0] == 0:
        raise ValueError("mean_kernel requires non-empty samples")

    # Broadcasting an (n, 1) column against a (1, m) row evaluates the kernel
    # for every pair in one call instead of n * m Python-level calls. This
    # materialises an (n, m) temporary array.
    pairwise_kernel = gaussian_kernel_scalar(x[:, np.newaxis], y[np.newaxis, :], sigma)
    return np.mean(pairwise_kernel)

def calculate_mmdd_matrix(cluster_a: npt.ArrayLike, cluster_b: npt.ArrayLike):
    """
    Maximum mean difference discrepancy

    For every ordered pair of genes ``(i, j)`` -- genes are the columns of
    ``cluster_a`` and ``cluster_b`` -- combines ``mean_kernel`` values of the
    two genes' samples in both clusters:

        within(i) + within(j)
        + 2 k(x_b, y_b) - 2 k(x_a, y_a)
        + 2 k(x_b, y_a) - 2 k(x_a, y_b)

    where ``within(g) = k(g_a, g_a) - 2 k(g_a, g_b) + k(g_b, g_b)``, ``x`` is
    gene ``i``, ``y`` is gene ``j`` and ``k`` is ``mean_kernel``.

    NOTE(review): for a symmetric kernel the last two cross terms change sign
    when ``i`` and ``j`` are swapped, so the matrix is not symmetric -- confirm
    that this is intended.

    Parameters
    ----------
    cluster_a, cluster_b
        2-D arrays with one column per gene; both need the same number of
        columns.

    Returns
    -------
    A ``(num_genes, num_genes)`` array of the values described above.
    """
    num_genes = cluster_a.shape[1]

    # The within-gene term depends on a single gene, so it is computed once per
    # gene instead of once per gene pair.
    within_gene = np.empty(num_genes)
    for g in range(num_genes):
        g_a = cluster_a[:, g]
        g_b = cluster_b[:, g]
        within_gene[g] = mean_kernel(g_a, g_a) - 2*mean_kernel(g_a, g_b) + mean_kernel(g_b, g_b)

    similarity_matrix = np.zeros((num_genes, num_genes))
    for i in range(num_genes):
        x_a = cluster_a[:, i]
        x_b = cluster_b[:, i]
        for j in range(num_genes):
            y_a = cluster_a[:, j]
            y_b = cluster_b[:, j]

            similarity_matrix[i, j] = (
                within_gene[i] + within_gene[j]
                + 2*mean_kernel(x_b, y_b) - 2*mean_kernel(x_a, y_a)
                + 2*mean_kernel(x_b, y_a) - 2*mean_kernel(x_a, y_b)
            )

    # The matrix was previously built but never returned, so callers got None.
    return similarity_matrix
            
    
def calculate_mmdd_similarity_matrix(cluster_a: npt.ArrayLike, cluster_b: npt.ArrayLike):
    """
    Turns the MMDD matrix into a signed gene-by-gene similarity matrix.

    The magnitude is ``1 - mmdd / (4 * C)`` and the sign is the sign of the
    product of the two genes' relative mean changes (``mmc`` of the per-gene
    means in ``cluster_a`` and ``cluster_b``).

    Parameters
    ----------
    cluster_a, cluster_b
        2-D arrays with one column per gene, the same layout that
        ``calculate_mmdd_matrix`` indexes.

    Returns
    -------
    A ``(num_genes, num_genes)`` array of signed similarities.
    """
    mmdd_matrix = calculate_mmdd_matrix(cluster_a, cluster_b)

    # Genes are columns, so per-gene means are taken over the rows (axis=0).
    # axis=1 would average each row instead and not line up with mmdd_matrix.
    mean_cluster_a = np.mean(cluster_a, axis=0)
    mean_cluster_b = np.mean(cluster_b, axis=0)
    relative_changes = mmc(mean_cluster_a, mean_cluster_b)
    # The outer product gives one sign per gene pair; `@` on 1-D arrays is a
    # dot product and would collapse everything into a single scalar.
    correlation_directions = np.sign(np.outer(relative_changes, relative_changes))

    C = 1 # The gaussian kernel has an upper bound C of 1
    # NOTE(review): this evaluates to 1 / (4 * C); confirm that 4 * C is the
    # intended upper bound of the MMDD values.
    normalization_factor = 1 / (8 // 2 * C)
    return correlation_directions * (1 - normalization_factor * mmdd_matrix)