In [1]:
pip install scanpy

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install gudhi

Note: you may need to restart the kernel to use updated packages.


In [1]:
import gc
import scanit
import torch
import torch
import random
import scanpy as sc
import pandas as pd
import anndata
import numpy as np
from scipy import sparse
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
from sklearn.cluster import SpectralClustering, KMeans
import matplotlib.pyplot as plt
# import stlearn as st
from pathlib import Path

In [44]:
# Load sample 151510 and display its per-spot 'Region' annotation.
adata = sc.read_h5ad('/Users/melancholy/Desktop/Maynard dataset/151510.h5ad')
adata.obs['Region']

AAACAAGTATCTCCCA-1    Layer1
AAACACCAATAACTGC-1    Layer5
AAACAGAGCGACTCCT-1    Layer3
AAACAGCTTTCAGAAG-1    Layer4
AAACAGGGTCTATATT-1    Layer4
                       ...  
TTGTTTCACATCCAGG-1    Layer4
TTGTTTCATTAGTCTA-1    Layer5
TTGTTTCCATACAACT-1    Layer3
TTGTTTGTATTACACG-1    Layer6
TTGTTTGTGTAAATTC-1    Layer3
Name: Region, Length: 4634, dtype: category
Categories (7, object): ['Layer1', 'Layer2', 'Layer3', 'Layer4', 'Layer5', 'Layer6', 'WM']

In [46]:
import scanpy as sc
import pandas as pd
import numpy as np
from sklearn.metrics import normalized_mutual_info_score, homogeneity_score, completeness_score
import scanit
import os

# Path configuration. The defaults are the original local paths; set the
# SCANIT_DATA_DIR / SCANIT_RESULT_DIR environment variables to run the
# notebook on another machine without editing this cell.
datadir = os.environ.get('SCANIT_DATA_DIR', '/Users/melancholy/Desktop/Maynard dataset')
result_path = os.environ.get('SCANIT_RESULT_DIR', '/Users/melancholy/Desktop')

# Sample IDs to process (12 files): 151507-151510 and 151669-151676.
sample_ids = [str(sid) for sid in (*range(151507, 151511), *range(151669, 151677))]

# Find a Leiden resolution that yields the desired number of clusters
def res_search_fixed_clus(adata, fixed_clus_count, increment=0.02):
    """Search for a Leiden resolution giving ``fixed_clus_count`` clusters.

    Resolutions from just below 5 down to 0.15 (step ``increment``) are tried
    from highest to lowest; ``sc.tl.leiden`` is run with ``random_state=0`` for
    each and the number of distinct labels in ``adata.obs['leiden']`` is
    compared with the target.

    Parameters
    ----------
    adata
        Object passed to ``sc.tl.leiden``; its ``obs['leiden']`` column is
        overwritten on every trial.
    fixed_clus_count
        Desired number of clusters.
    increment
        Step between candidate resolutions; must be positive.

    Returns
    -------
    The first (highest) resolution that produces exactly ``fixed_clus_count``
    clusters. If no candidate matches exactly, the resolution whose cluster
    count is closest to the target is returned instead (the highest such
    resolution on ties) and a ``UserWarning`` is emitted.

    Raises
    ------
    ValueError
        If ``increment`` is not positive.
    """
    import warnings

    if increment <= 0:
        raise ValueError(f"increment must be positive, got {increment!r}")

    best_res = None
    best_gap = None
    for res in sorted(np.arange(0.15, 5, increment), reverse=True):
        sc.tl.leiden(adata, random_state=0, resolution=res)
        n_found = len(adata.obs['leiden'].unique())
        if n_found == fixed_clus_count:
            return res
        # Remember the closest miss so a failed search does not silently
        # fall through to the lowest resolution of the grid.
        gap = abs(n_found - fixed_clus_count)
        if best_gap is None or gap < best_gap:
            best_res, best_gap = res, gap

    warnings.warn(
        f"No resolution produced exactly {fixed_clus_count} clusters; "
        f"using resolution {best_res:.2f} (cluster count off by {best_gap}).",
        stacklevel=2,
    )
    # Re-run so adata.obs['leiden'] corresponds to the returned resolution.
    sc.tl.leiden(adata, random_state=0, resolution=best_res)
    return best_res

# Per-sample metric records, turned into a DataFrame after the loop
metrics_list = []

# Loop over every sample file
for i, sid in enumerate(sample_ids):
    file_path = os.path.join(datadir, f'{sid}.h5ad')
    print(f'\n--- Processing file: {sid}.h5ad ---')

    # Load the data
    adata = sc.read_h5ad(file_path)

    # Without ground-truth labels no metric can be computed, so skip the
    # sample before the expensive embedding step rather than after it.
    if 'Region' not in adata.obs.columns:
        print(f"⚠️  File {sid}.h5ad thiếu nhãn 'Region'. Bỏ qua.")
        continue

    new_data = adata.copy()   # untouched copy that receives the predictions and is saved
    adata_sp = adata.copy()   # working copy that is normalized and embedded

    # Preprocessing
    sc.pp.normalize_total(adata_sp)
    sc.pp.log1p(adata_sp)
    sc.pp.scale(adata_sp)

    # SCAN-IT embedding
    scanit.tl.spatial_graph(adata_sp, method='alpha shape', alpha_n_layer=2, knn_n_neighbors=5)
    scanit.tl.spatial_representation(
        adata_sp,
        n_h=30, n_epoch=2000, lr=0.001, device='cpu',
        n_consensus=1, projection='mds',
        python_seed=0, torch_seed=0, numpy_seed=0
    )

    # Target number of clusters: 5 for samples 151669-151672, 7 otherwise
    n_clusters = 5 if sid in ['151669', '151670', '151671', '151672'] else 7

    # Clustering on the SCAN-IT representation
    sc.pp.neighbors(adata_sp, use_rep='X_scanit', n_neighbors=15)
    eval_resolution = res_search_fixed_clus(adata_sp, n_clusters)
    # Same seed as the resolution search, stated explicitly
    sc.tl.leiden(adata_sp, key_added="scanit_leiden", resolution=eval_resolution, random_state=0)

    # Compute NMI, HOM, COM on labeled spots only. Restricting dropna to
    # 'Region' keeps spots that merely have NaN in an unrelated obs column.
    obs_df = adata_sp.obs.dropna(subset=['Region'])
    labels_true = obs_df['Region']
    labels_pred = obs_df['scanit_leiden']

    nmi = normalized_mutual_info_score(labels_true, labels_pred)
    hom = homogeneity_score(labels_true, labels_pred)
    com = completeness_score(labels_true, labels_pred)

    metrics_list.append({
        'File': f'{sid}.h5ad',
        'NMI': nmi,
        'HOM': hom,
        'COM': com
    })

    # Attach the predicted labels and the metrics to the untouched copy and save it
    new_data.obs[f'pred_{i+1}'] = adata_sp.obs['scanit_leiden']
    new_data.uns['nmi'] = nmi
    new_data.uns['hom'] = hom
    new_data.uns['com'] = com

    out_path = os.path.join(result_path, f'SCAN-IT_{sid}.h5ad')
    new_data.write(out_path)
    print(f"✅ Done: {sid}.h5ad | NMI = {nmi:.4f}, HOM = {hom:.4f}, COM = {com:.4f} | Saved to: {out_path}")

# Print the NMI/HOM/COM table and save it as CSV
df_metrics = pd.DataFrame(metrics_list)
print("\n=== Tổng hợp chỉ số NMI / HOM / COM ===")
print(df_metrics)

csv_out = os.path.join(result_path, 'NMI_summary.csv')
df_metrics.to_csv(csv_out, index=False)
print(f"\n📄 Đã lưu NMI tổng hợp vào: {csv_out}")



--- Processing file: 151507.h5ad ---
Epoch: 000, Loss: 1.3982
Epoch: 500, Loss: 0.0009
Epoch: 1000, Loss: 0.0004
Epoch: 1500, Loss: 0.0001
Epoch: 1999, Loss: 0.0000
✅ Done: 151507.h5ad | NMI = 0.6379, HOM = 0.5916, COM = 0.6919 | Saved to: /Users/melancholy/Desktop/SCAN-IT_151507.h5ad

--- Processing file: 151508.h5ad ---
Epoch: 000, Loss: 1.4016
Epoch: 500, Loss: 0.0005
Epoch: 1000, Loss: 0.0001
Epoch: 1500, Loss: 0.0000
Epoch: 1999, Loss: 0.0000
✅ Done: 151508.h5ad | NMI = 0.5564, HOM = 0.5727, COM = 0.5410 | Saved to: /Users/melancholy/Desktop/SCAN-IT_151508.h5ad

--- Processing file: 151509.h5ad ---
Epoch: 000, Loss: 1.4059
Epoch: 500, Loss: 0.0006
Epoch: 1000, Loss: 0.0000
Epoch: 1500, Loss: 0.0003
Epoch: 1999, Loss: 0.0001
✅ Done: 151509.h5ad | NMI = 0.6498, HOM = 0.6936, COM = 0.6111 | Saved to: /Users/melancholy/Desktop/SCAN-IT_151509.h5ad

--- Processing file: 151510.h5ad ---
Epoch: 000, Loss: 1.4030
Epoch: 500, Loss: 0.0004
Epoch: 1000, Loss: 0.0002
Epoch: 1500, Loss: 0.0002

In [53]:
import pandas as pd

# Load the metrics of the other methods. DataName is cast to str so it has the
# same dtype as the SCAN-IT sample names below; otherwise the concatenated
# column would mix integers and strings.
df_others = pd.read_csv(
    "/Users/melancholy/Desktop/results section b and c/result_metrics_10xVisium.csv"
).astype({"DataName": str})

# Load the SCAN-IT metrics, align the column name with df_others, strip the
# '.h5ad' suffix from the sample names and tag the rows with the method name.
# NOTE(review): the SCAN-IT loop cell writes 'NMI_summary.csv' under
# result_path, while this cell reads 'metric_summary.csv' — confirm that this
# is the intended file.
df_scanit = (
    pd.read_csv("/Users/melancholy/Desktop/metric_summary.csv")
    .rename(columns={"File": "DataName"})
    .assign(
        DataName=lambda df: df["DataName"].astype(str).str.replace(".h5ad", "", regex=False),
        Method="SCAN-IT",
    )
)

# Stack both tables and keep only the shared columns, in a fixed order
df_concat = pd.concat([df_others, df_scanit], ignore_index=True)
df_concat = df_concat[["DataName", "Method", "NMI", "HOM", "COM"]]

# Save the combined table and display it
df_concat.to_csv("/Users/melancholy/Desktop/metrics_all_methods.csv", index=False)
df_concat

Unnamed: 0,DataName,Method,NMI,HOM,COM
0,151669,GraphBG,0.675974,0.740235,0.621979
1,151669,GraphST,0.649642,0.609027,0.696061
2,151508,GraphBG,0.637682,0.637907,0.637457
3,151508,GraphST,0.643654,0.644672,0.642640
4,151509,GraphBG,0.691697,0.714134,0.670628
...,...,...,...,...,...
391,151672,SCAN-IT,0.641018,0.724257,0.574941
392,151673,SCAN-IT,0.521138,0.509963,0.532814
393,151674,SCAN-IT,0.565233,0.591542,0.541165
394,151675,SCAN-IT,0.549594,0.550267,0.548922
