In [None]:
import os
import torch
import pandas as pd
import scanpy as sc

In [None]:
import SpatialGlue

In [None]:
from scipy.spatial import cKDTree
from scipy.stats import mode
def cKD_refine_label(coords, labels, k):
    """Smooth labels by majority vote over each spot's k nearest spatial neighbours.

    Parameters
    ----------
    coords : numpy.ndarray of shape (n_spots, n_dims)
        Coordinates used to build the KD-tree.
    labels : numpy.ndarray of shape (n_spots,)
        Current label of every spot. It is not modified.
    k : int
        Number of neighbours (the spot itself excluded) that vote. Values
        larger than ``n_spots - 1`` are clamped to ``n_spots - 1``.

    Returns
    -------
    numpy.ndarray
        Copy of ``labels`` where each entry is replaced by the most common
        label among that spot's neighbours (ties are resolved by
        ``scipy.stats.mode``). If fewer than one neighbour is available the
        labels are returned unchanged.
    """
    n_spots = len(coords)
    new_labels = labels.copy()

    # cKDTree.query pads missing neighbours with the out-of-range index
    # ``n_spots``; clamping k keeps every returned index valid.
    k = min(k, n_spots - 1)
    if k < 1:
        return new_labels

    # Step 1: build the KD-tree on a private copy of the coordinates.
    tree = cKDTree(coords.copy())

    # Step 2: query k + 1 neighbours because the closest hit is the spot itself,
    # then drop that first column.
    _, neighbors = tree.query(coords, k=k + 1)
    neighbors = neighbors[:, 1:]

    # Step 3: reassign each spot to the most common label among its neighbours.
    # Votes always read the original labels, never the partially updated ones.
    for i, nbrs in enumerate(neighbors):
        new_labels[i] = mode(labels[nbrs], keepdims=True).mode[0]
    return new_labels

In [None]:
pip install torch

In [None]:
from numpy.random import default_rng
import scanpy as sc
# import squidpy as sq
from anndata import AnnData
import scipy
# sc.logging.print_header()
from sklearn.metrics.cluster import adjusted_rand_score
import numpy as np
import pandas as pd
import seaborn as sns
import os
import torch
import pandas as pd
import scanpy as sc
from sklearn import metrics
import multiprocessing as mp
from sklearn.metrics.cluster import normalized_mutual_info_score, homogeneity_score, completeness_score
from numpy import genfromtxt

In [None]:
# Directory and common filename prefix of the input tables. The directory can be
# overridden with the SCDESIGN3_DATA_DIR environment variable; the default is
# the original hard-coded location.
# Alternative dataset prefix noted previously: 'CBMCs_nCells_10201_nGenes_100_'
DATA_DIR = os.environ.get('SCDESIGN3_DATA_DIR', '/Users/melancholy/Desktop/SpatialGlue/scDesign3')
DATA_PREFIX = 'Amplify_CBMCs_nCells_1992_nGenes_100_'

# RNA counts, ADT (protein) counts and per-spot metadata, one CSV each.
RNA_counts = pd.read_csv(os.path.join(DATA_DIR, DATA_PREFIX + 'RNA.csv'), delimiter=',')
ADT_counts = pd.read_csv(os.path.join(DATA_DIR, DATA_PREFIX + 'ADT.csv'), delimiter=',')
metadata = pd.read_csv(os.path.join(DATA_DIR, DATA_PREFIX + 'metadata.csv'), delimiter=',')

In [None]:
# Create one AnnData object per modality. Both use columns 2-3 of `metadata`
# as coordinates and share the same synthetic observation names, so the three
# tables must have one row per spot in the same order.
n_spots = len(metadata)
if not (len(RNA_counts) == len(ADT_counts) == n_spots):
    raise ValueError(
        f"Row count mismatch: RNA={len(RNA_counts)}, ADT={len(ADT_counts)}, metadata={n_spots}"
    )
obs_names = [f"Cell_{i:d}" for i in range(n_spots)]

adata_omics1 = AnnData(
    RNA_counts.values,
    obsm={"spatial": metadata.iloc[:, 2:4].values},
    uns={"spatial": metadata.iloc[:, 2:4].values},
)
adata_omics1.obs_names = obs_names
adata_omics1.var_names = list(RNA_counts.columns.values)

adata_omics2 = AnnData(
    ADT_counts.values,
    obsm={"spatial": metadata.iloc[:, 2:4].values},
    uns={"spatial": metadata.iloc[:, 2:4].values},
)
# Names come from the shared list instead of being sized from adata_omics1.
adata_omics2.obs_names = obs_names
adata_omics2.var_names = list(ADT_counts.columns.values)

In [None]:
# Display the RNA AnnData object as the cell output (quick sanity check).
adata_omics1

## SpatialGlue

In [None]:
# Specify data type; this value is passed as `datatype` to
# construct_neighbor_graph and Train_SpatialGlue in later cells.
data_type = '10x'
# Fix random seed
# NOTE(review): fix_seed presumably seeds every RNG SpatialGlue relies on - confirm.
from SpatialGlue.preprocess import fix_seed
random_seed = 2022
fix_seed(random_seed)

In [None]:
from SpatialGlue.preprocess import clr_normalize_each_cell, pca

In [None]:
# NOTE(review): with min_cells=0 this presumably removes no features - confirm
# whether the call is only kept for the per-feature statistics it records.
sc.pp.filter_genes(adata_omics2, min_cells=0)
# Index the protein object by the RNA object's observation names and take a
# real copy rather than a view.
adata_omics2 = adata_omics2[adata_omics1.obs_names].copy()

In [None]:
# Protein: normalise with SpatialGlue's clr_normalize_each_cell helper, then
# store a PCA embedding with (number of protein features - 1) components
# under obsm['feat'].
adata_omics2 = clr_normalize_each_cell(adata_omics2)
adata_omics2.obsm['feat'] = pca(adata_omics2, n_comps=adata_omics2.n_vars-1)

In [None]:
from scipy.sparse import issparse, csr_matrix
# Check the data and convert it if necessary: store the protein matrix as a
# CSR sparse matrix unless it already is sparse.
if not issparse(adata_omics2.X):
    adata_omics2.X = csr_matrix(adata_omics2.X)

In [None]:
# RNA: run scanpy's gene and cell filters with zero thresholds.
# NOTE(review): thresholds of 0 presumably drop nothing - confirm the calls are
# wanted only for the QC columns they add.
sc.pp.filter_genes(adata_omics1, min_cells=0)
sc.pp.filter_cells(adata_omics1, min_genes=0)

In [None]:
# Highly-variable-gene selection is disabled; all genes are kept.
# sc.pp.highly_variable_genes(adata_omics1, flavor="seurat_v3", n_top_genes=500)
# Scale each cell to a total of 1e4 counts, then apply log1p (both in place).
sc.pp.normalize_total(adata_omics1, target_sum=1e4)
sc.pp.log1p(adata_omics1)

In [None]:
# RNA PCA embedding; the number of components is taken from the protein
# object (n_vars - 1) so both modalities get 'feat' matrices of equal width.
adata_omics1.obsm['feat'] = pca(adata_omics1, n_comps=adata_omics2.n_vars-1)

In [None]:
import pandas as pd

# Keep only the observations present in both modalities. Intersecting the
# observation names directly avoids materialising the expression matrices as
# DataFrames, and indexing both objects with the same ordered name list
# guarantees that row i refers to the same spot in each of them.
common_samples = adata_omics1.obs_names.intersection(adata_omics2.obs_names, sort=False)
# .copy() turns the subsets into real objects; later cells write into .obsm,
# which would otherwise hit AnnData's implicit view-to-copy conversion.
adata_omics1 = adata_omics1[common_samples].copy()
adata_omics2 = adata_omics2[common_samples].copy()

In [None]:
from SpatialGlue.preprocess import construct_neighbor_graph
# Build SpatialGlue's neighbour-graph input from both modalities; the result is
# what Train_SpatialGlue receives in a later cell.
data = construct_neighbor_graph(adata_omics1, adata_omics2, datatype=data_type)

In [None]:
from SpatialGlue.SpatialGlue_pyG import Train_SpatialGlue

In [None]:
import torch

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Instantiate the SpatialGlue trainer on the graph data and the selected device.
model = Train_SpatialGlue(data, datatype=data_type, device=device)

In [None]:
# Train the model; `output` is indexed by key in the next cell
# ('emb_latent_omics1', 'emb_latent_omics2', 'SpatialGlue', 'alpha', ...).
output = model.train()

In [None]:
# Work on a copy of the RNA object and attach every SpatialGlue output to it
# under an obsm key of the same name.
adata = adata_omics1.copy()
for _key in (
    'emb_latent_omics1',
    'emb_latent_omics2',
    'SpatialGlue',
    'alpha',
    'alpha_omics1',
    'alpha_omics2',
):
    adata.obsm[_key] = output[_key]

In [None]:
# Ground-truth labels from the metadata table and the number of distinct values.
# NOTE(review): later cells score only entries with label >= 0; if negative
# labels mark unannotated spots they are still counted as a cluster here - confirm.
true_labels = np.array(metadata['label'])
n_clusters = len(set(true_labels))
n_clusters

In [None]:
# we set 'mclust' as clustering tool by default. Users can also select 'leiden' and 'louvain'
# Cluster the 'SpatialGlue' embedding into n_clusters groups; the labels are
# read back from adata.obs['SpatialGlue'] in the following cells.
# NOTE(review): start/end/increment presumably define a resolution search used
# only by leiden/louvain - confirm against SpatialGlue.utils.clustering.
from SpatialGlue.utils import clustering
tool = 'mclust' # mclust, leiden, and louvain
clustering(adata, key='SpatialGlue', add_key='SpatialGlue', n_clusters=n_clusters, method=tool, use_pca=True, start=0.9, end=1.1, increment=0.02)

In [None]:
# ARI and NMI of the SpatialGlue clusters against the ground truth, computed on
# all spots (no label >= 0 filtering here).
print(adjusted_rand_score(true_labels, adata.obs['SpatialGlue']))
print(normalized_mutual_info_score(true_labels, adata.obs['SpatialGlue']))

## GraphBGM

In [None]:
from sklearn.decomposition import PCA
# from GraphST.utils import refine_label
from sklearn.preprocessing import StandardScaler
# from GraphST.utils import mclust_R
import numpy as np
from numpy import dot, array
from sklearn.cross_decomposition import CCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import kneighbors_graph
from sklearn.mixture import BayesianGaussianMixture
from mvlearn.datasets import load_UCImultifeature
from mvlearn.embed import GCCA
# gcca = GCCA(n_components = 20)
# Xs_latents = gcca.fit_transform(Xs)

from mvlearn.plotting import crossviews_plot
from mvlearn.embed import DCCA
from mvlearn.embed import MCCA
from mvlearn.embed import KMCCA

In [None]:
# Spatially smooth the RNA features: build a k-nearest-neighbour graph on the
# spot coordinates (4 neighbours, self excluded) and multiply it with the
# feature matrix, so each row becomes a weighted sum of its neighbours' rows.
embedding = adata_omics1.obsm['feat']
n_neighbors = 4
connectivity = kneighbors_graph(adata_omics1.obsm['spatial'], n_neighbors=n_neighbors, include_self=False)
# make connectivity symmetric (average of the graph and its transpose)
connectivity = 0.5 * (connectivity + connectivity.T)
embedding_RNA = connectivity.dot(embedding)
adata_omics1.obsm['spatial_RNAfeat'] = embedding_RNA

# Same smoothing for the protein features, reusing the graph built from the
# RNA object's coordinates; the result is also stored on adata_omics1.
embedding = adata_omics2.obsm['feat']
embedding_Pro = connectivity.dot(embedding)
adata_omics1.obsm['spatial_Profeat'] = embedding_Pro

# Standardize the data (one scaler per modality)
scaler_a = StandardScaler()
scaler_b = StandardScaler()

data_a_train = scaler_a.fit_transform(embedding_RNA)
data_b_train = scaler_b.fit_transform(embedding_Pro)

# Define and train the CCA model
# NOTE(review): `cca` is fitted but not used by any later cell visible in this
# notebook - confirm whether it is still needed.
n_components = 5  # Number of canonical components
cca = CCA(n_components=n_components)
cca.fit(data_a_train, data_b_train)

In [None]:
### SOTA
from sklearn.mixture import BayesianGaussianMixture
from mvlearn.datasets import load_UCImultifeature
from mvlearn.embed import GCCA

from mvlearn.plotting import crossviews_plot
from mvlearn.embed import DCCA
from mvlearn.embed import MCCA
from mvlearn.embed import KMCCA

# Recompute the ground-truth labels and cluster count so this cell does not
# depend on the SpatialGlue section having been run.
true_labels = np.array(metadata['label'])
n_clusters = len(set(true_labels))

# Two views (RNA, protein), each shifted by its global minimum so all entries
# are >= 0, then embedded jointly with kernel multiview CCA (polynomial kernel).
Xs = [data_a_train-data_a_train.min(), data_b_train-data_b_train.min()] # multiview data
mcca = KMCCA(n_components = 20, kernel = 'poly', regs = 1)
mcca.fit(Xs)

# Xs_latents is indexed as [view, :, :]; the two views' latent matrices are
# concatenated column-wise. The obsm key says 'emb_pca' although the embedding
# comes from KMCCA.
Xs_latents = mcca.transform(Xs)
adata_omics1.obsm['emb_pca'] = np.concatenate((Xs_latents[0,:,:], Xs_latents[1,:,:]), axis=1)

In [None]:
# GraphBGM: cluster the fused embedding with a Bayesian Gaussian mixture.
gmm = BayesianGaussianMixture(
    n_components=n_clusters,
    covariance_type='full',
    random_state=0,
    init_params='random_from_data',
    n_init=5,
    max_iter=1000,
)

# Fit the mixture, assign every spot to a component and keep the raw
# assignment on the AnnData object.
gmm.fit(adata_omics1.obsm['emb_pca'])
cluster_labels = gmm.predict(adata_omics1.obsm['emb_pca'])
adata_omics1.obs['domain'] = cluster_labels

# Post-processing: majority vote over the 45 nearest spatial neighbours.
refine_cluster_labels = cKD_refine_label(
    np.array(adata_omics1.obsm['spatial']), cluster_labels, k=45
)

# Scores are computed on spots whose true label is >= 0, in the order
# ARI, NMI, homogeneity, completeness. To score the unrefined assignment
# ('GBM - no post-preprocessing'), run the same loop with `cluster_labels`.
print('GBM - with post-preprocessing')
for _score in (
    adjusted_rand_score,
    normalized_mutual_info_score,
    homogeneity_score,
    completeness_score,
):
    print(_score(true_labels[true_labels >= 0], refine_cluster_labels[true_labels >= 0]))

In [None]:
# SpatialGlue scores on spots whose true label is >= 0, printed in the order
# ARI, NMI, homogeneity, completeness.
print('SpatialGlue')
for _score in (
    adjusted_rand_score,
    normalized_mutual_info_score,
    homogeneity_score,
    completeness_score,
):
    print(_score(true_labels[true_labels >= 0], adata.obs['SpatialGlue'][true_labels >= 0]))

In [None]:
# Score both methods on the spots whose true label is >= 0.
# NOTE(review): negative labels presumably mark unannotated spots - confirm.
labeled_mask = true_labels >= 0
labels_true = true_labels[labeled_mask]
labels_bgm = refine_cluster_labels[labeled_mask]
labels_sg = adata.obs['SpatialGlue'][labeled_mask]

ari_bgm = adjusted_rand_score(labels_true, labels_bgm)
nmi_bgm = normalized_mutual_info_score(labels_true, labels_bgm)
hom_bgm = homogeneity_score(labels_true, labels_bgm)
com_bgm = completeness_score(labels_true, labels_bgm)

ari_sg = adjusted_rand_score(labels_true, labels_sg)
nmi_sg = normalized_mutual_info_score(labels_true, labels_sg)
hom_sg = homogeneity_score(labels_true, labels_sg)
com_sg = completeness_score(labels_true, labels_sg)

# Named `metrics_table` rather than `data`: `data` already holds the SpatialGlue
# graph input, and rebinding it would break a re-run of the training cell.
metrics_table = {
    "Method": ["GraphBGM", "SpatialGlue"],
    "homogeneity": [hom_bgm, hom_sg],
    "completeness": [com_bgm, com_sg],
    "NMI": [nmi_bgm, nmi_sg],
    "ARI": [ari_bgm, ari_sg]
}

df = pd.DataFrame(metrics_table)

# Save as a CSV file (relative to the current working directory).
output_path = "sim1992_metrics_results.csv"
df.to_csv(output_path, index=False)


In [None]:
import pandas as pd
from sklearn.metrics import normalized_mutual_info_score, homogeneity_score, completeness_score, adjusted_rand_score

# === File paths ===
# NOTE(review): both paths are absolute and machine-specific.
input_csv = "/Users/melancholy/Desktop/Graph BG-MM/CBMCs_nCells_5024_nGenes_1000__GraphBGUnimodal.csv"
output_csv = "/Users/melancholy/Desktop/SpatialGlue/run_GraphBGM-multiModals/Simulated_data_csv/sim5024_1000_metrics_results.csv"

# === Read data ===
# The file is read without a header row, so columns are addressed by position.
df = pd.read_csv(input_csv, header=None)

# Extract the labels from the columns. In the cells below, column 3 is used as
# the reference labelling and columns 1 and 2 are scored against it as the
# "RNA" and "protein" results respectively.
label1 = df.iloc[:, 0]  # column 1
label2 = df.iloc[:, 1]  # column 2
label3 = df.iloc[:, 2]  # column 3

# === Metric helper ===
def compute_metrics(true_labels, pred_labels):
    """Score a predicted labelling against a reference labelling.

    Returns a 4-tuple in the order (homogeneity, completeness, NMI, ARI).
    """
    scorers = (
        homogeneity_score,
        completeness_score,
        normalized_mutual_info_score,
        adjusted_rand_score,
    )
    return tuple(scorer(true_labels, pred_labels) for scorer in scorers)

# === Compute ===
# label3 is the reference; label1 / label2 are the labellings being scored.
metrics_rna = compute_metrics(label3, label1)
metrics_protein = compute_metrics(label3, label2)

# === Build the results dataframe ===
# One row per method; the column order matches compute_metrics' return order.
results = pd.DataFrame([
    ["RNA", *metrics_rna],
    ["protein", *metrics_protein]
], columns=["Method", "homogeneity", "completeness", "NMI", "ARI"])

# === Read the previous results file (if any) and merge ===
# On a first run the file does not exist yet; in that case the new rows are
# written on their own instead of failing with FileNotFoundError.
# NOTE: re-running this cell appends the same rows to the file again.
if os.path.exists(output_csv):
    df_out = pd.concat([pd.read_csv(output_csv), results], ignore_index=True)
else:
    df_out = results

# === Write back to the CSV file ===
df_out.to_csv(output_csv, index=False)

print("Đã gộp kết quả vào:", output_csv)
