In [1]:
from src.ot_annotator2 import OTAnnotator
import time
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
from hyperopt import hp
import stlearn as st
import sys
import warnings
import os
from sklearn.metrics import confusion_matrix
# from sklearn.metrics import f1_score
# Silence every Python warning for the rest of the session (this also hides
# NumPy RuntimeWarnings such as divide-by-zero raised further down).
warnings.filterwarnings("ignore")
# Seed NumPy's global RNG once; split_data() draws its noise from this stream,
# so results depend on the order in which cells are executed after this point.
np.random.seed(42) 

2025-01-06 10:13:43.930780: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-01-06 10:13:44.107688: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-01-06 10:13:44.107752: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2025-01-06 10:13:45.132039: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-

In [2]:
# Directory that receives the log files and result CSVs written by the
# benchmark cells below; created on first use.
out_folder = "tab_out3"
if os.path.exists(out_folder):
    print(f"Folder already exists: {out_folder}")
else:
    os.makedirs(out_folder, exist_ok=True)


Folder already exists: tab_out3


In [3]:
# input data: annotated data object loaded from a local .h5ad file.
# Later cells read obs["slice_id"] (to split slices) and obs["subclass"]
# (the reference annotation) from it.
dataset = sc.read("data/merfish.h5ad")

In [4]:
# create target-reference from the 64 slices
def split_data(dataset,slice, noise ,nb_gene):
    """Hold one slice out as a noisy target and keep the other slices as reference.

    Parameters
    ----------
    dataset
        AnnData-like object exposing ``obs["slice_id"]``, row indexing with a
        boolean mask, ``.copy()`` and a 2-D ``.X`` matrix (rows = observations).
    slice
        Value matched against ``dataset.obs["slice_id"]``. The name shadows the
        built-in ``slice`` but is kept so existing callers keep working.
    noise
        Half-width of the uniform interval ``[-noise, noise]`` from which one
        relative perturbation per observation is drawn (NumPy global RNG).
    nb_gene
        Unused by this function; accepted for backward compatibility.

    Returns
    -------
    (ref, atlas)
        ``ref`` is a copy of every observation whose slice id differs from
        ``slice``; ``atlas`` is a copy of the held-out slice whose rows of ``X``
        were each multiplied by ``1 + beta`` with ``beta ~ U(-noise, noise)``.
        ``dataset`` itself is not modified.
    """
    # Local import: only needed to detect sparse expression matrices.
    from scipy import sparse

    atlas = dataset[dataset.obs["slice_id"] == slice, :].copy()

    # One multiplicative noise factor per observation (row of X).
    beta_values = np.random.uniform(-noise, noise, size=atlas.X.shape[0])
    row_scale = (1 + beta_values)[:, None]

    if sparse.issparse(atlas.X):
        # `*` on SciPy sparse matrices is a matrix product, so use the
        # element-wise `.multiply` to scale rows and stay sparse.
        atlas.X = atlas.X.multiply(row_scale).tocsr()
    else:
        # Broadcasting over columns; same values as (X.T * (1 + beta)).T.
        atlas.X = atlas.X * row_scale

    ref = dataset[dataset.obs["slice_id"] != slice, :].copy()
    return ref, atlas
def ingest(adata, adata_ref,key_ref,ngh=15):
    """Label ``adata`` by projecting it onto a reference with ``sc.tl.ingest``.

    Parameters
    ----------
    adata
        Target object; must already hold the original labels in
        ``obs[key_ref + '_org']``. ``sc.tl.ingest`` is called on it directly.
    adata_ref
        Reference object carrying ``obs[key_ref]``. PCA and a neighbour graph
        are computed on it directly via ``sc.pp.pca`` / ``sc.pp.neighbors``.
    key_ref
        Name of the ``obs`` column to transfer from reference to target.
    ngh
        ``n_neighbors`` used for the neighbour graph of the filtered reference.

    Returns
    -------
    (original_labels, transferred_labels)
        ``adata.obs[key_ref + '_org']`` and ``adata.obs[key_ref]`` after ingest.
    """
    sc.pp.pca(adata_ref)
    sc.pp.neighbors(adata_ref)

    # Number of stored neighbour distances per reference cell; cells with two
    # or fewer entries are dropped before the graph is rebuilt.
    neighbours_per_cell = np.array(list(map(len, adata_ref.obsp['distances'].tolil().rows)))
    # Explicit copy: the following calls write into the subset, which must not
    # be a view of `adata_ref`.
    adata_ref_subset = adata_ref[np.where(neighbours_per_cell > 2)[0]].copy()

    sc.pp.neighbors(adata_ref_subset, n_neighbors=ngh)
    sc.tl.umap(adata_ref_subset)
    sc.tl.ingest(adata, adata_ref_subset, obs=key_ref)

    # Return the transferred column named by `key_ref` (previously hard-coded
    # to 'subclass', which was only correct for that one key).
    return adata.obs[key_ref + '_org'], adata.obs[key_ref]

def calculate_f1_scores(predictions, true_labels):
    """Return the unweighted mean of the per-class F1 scores (macro F1).

    Parameters
    ----------
    predictions
        Predicted label per observation.
    true_labels
        Ground-truth label per observation, aligned with ``predictions``.

    Returns
    -------
    float
        Mean F1 over every class present in either input. A class whose F1 is
        undefined (0/0, e.g. never predicted or no true positives) counts as 0.
    """
    conf_matrix = confusion_matrix(true_labels, predictions)
    true_positives = np.diag(conf_matrix)

    # 0/0 is expected for classes missing from one side; silence it locally
    # instead of relying on a process-wide warning filter. The resulting NaNs
    # are mapped to 0 just below.
    with np.errstate(divide="ignore", invalid="ignore"):
        precision = true_positives / conf_matrix.sum(axis=0)  # column sums = predicted counts
        recall = true_positives / conf_matrix.sum(axis=1)     # row sums = true counts
        f1_scores = 2 * (precision * recall) / (precision + recall)

    f1_scores = np.nan_to_num(f1_scores)
    mean_f1_score = np.mean(f1_scores)
    return mean_f1_score


In [5]:
# hyperOpt params space
# param_space = {
#             "reg": hp.loguniform("reg", np.log(0.001), np.log(0.5)),
#             "reg_m_kl_1": hp.loguniform("reg_m_kl_1", np.log(0.00001), np.log(10)), #hp.uniform("reg_m_kl_1", 0, 500),
#             "reg_m_kl_2":hp.loguniform("reg_m_kl_2", np.log(0.00001), np.log(10)),# hp.uniform("reg_m_kl_2", 0, 500),
#             "method": hp.choice("method", ['sinkhorn']),
#             "reg_type": hp.choice("reg_type", ['entropy','kl'])
#         }
# Active hyperopt search space handed to OTAnnotator. How each entry is
# interpreted is decided by OTAnnotator, which is not shown in this notebook.
param_space = dict(
    # Log-uniform draw between 1e-4 and 1.
    reg=hp.loguniform("reg", np.log(1e-4), np.log(1)),
    # Either +inf or a log-uniform draw between 1e-4 and 10.
    reg_m_kl_1=hp.choice(
        "reg_m_kl_1",
        [float("inf"), hp.loguniform("reg_m_kl_1_", np.log(1e-4), np.log(10))],
    ),
    reg_m_kl_2=hp.choice(
        "reg_m_kl_2",
        [float("inf"), hp.loguniform("reg_m_kl_2_", np.log(1e-4), np.log(10))],
    ),
    # Single-option choices: kept as hp.choice so the space layout is stable.
    method=hp.choice("method", ["sinkhorn"]),
    reg_type=hp.choice("reg_type", ["kl"]),
)


# Cell Annotation (per cell)

In [6]:
# Sorted distinct slice identifiers; each one is held out once as the target.
slices = np.unique(dataset.obs["slice_id"])

In [7]:
# Leave-one-slice-out benchmark of OTAnnotator with mapping strategy 'all'.
# Progress is written to a log file; one row per (nb_gene, slice) is collected
# in `results` and saved as CSV once the whole sweep has finished.

# Parameters
task = 'CellAnnotation_MERFISH'
way = 'all'
key_tar = 'leiden'
key_ref = 'subclass'
nb_cluster = 10
op_iter = 100
metric = 'cosine'
results = []

# Remember the stream that is active right now (under Jupyter this is the
# kernel's stream, not sys.__stdout__) so it can be restored afterwards.
previous_stdout = sys.stdout
log_file = open(f"{out_folder}/{task}_verbose_mapit_{way}_{key_ref}.log", "w")
sys.stdout = log_file  # Redirect print to the log file
try:
    print(f"""
        ##Configuration:
        Task = {task}
        mapping_strategy = {way}
        distance_metric = {metric}
        reference_annotation = {key_ref}
        target_clustering = {key_tar}
        HyperOPT_iterations = {op_iter}
        """)

    # Iterate over different numbers of genes in common
    for nb_gene in [50, 100, 150, 200]:
        print(f"\n*** Number of genes in common: {nb_gene} ***")

        for slice_id in slices:
            print(f"\n*** Processing slice: {slice_id} ***")

            # 1. Split data into reference and target
            print("1/ Splitting data...")
            adata_ref, adata = split_data(dataset, slice_id, 0.25, nb_gene)
            gene_interest = adata_ref.var_names[:nb_gene]
            # Keep the ground truth under '<key_ref>_org' and blank the column
            # that the annotator is expected to predict.
            adata.obs[key_ref + '_org'] = adata.obs[key_ref]
            adata.obs[key_ref] = None

            # Initialize annotator
            start_time = time.time()
            annotator = OTAnnotator(
                adata, adata_ref, gene_interest,
                param_space=param_space, key_ref=key_ref, key_tar=key_tar, way=way
            )

            print("1.1/ Subclustering...")
            annotator.subcluster(nb_cluster=nb_cluster)

            print("1.2/ Annotating...")
            annotator.annotate(op_iter=op_iter, metric=metric)
            print(annotator.best_params)
            end_time = time.time()
            print(f"Execution time: {end_time - start_time:.2f} seconds")

            # 2. Evaluate predictions
            print("2/ Evaluating predictions...")
            predicted_labels = annotator.adata.obs['predicted_annotation']
            true_labels = annotator.adata.obs[key_ref + '_org']

            # Compute accuracy and F1 score
            accuracy = (true_labels.astype(str) == predicted_labels.astype(str)).mean()
            print(f"Accuracy: {accuracy:.4f}")

            # Keyword arguments so each series lands in the matching parameter
            # (they were previously passed positionally in swapped order).
            f1 = calculate_f1_scores(predictions=predicted_labels, true_labels=true_labels)
            print(f"F1-Score: {f1:.4f}")

            # Optional scanpy-ingest baseline (disabled):
            # print("3/ Ingest predictions (ingest)...")
            # start_time = time.time()
            # true_labels_ingest, predicted_labels_ingest = ingest(
            #     annotator.adata[:, gene_interest],
            #     annotator.adata_ref[:, gene_interest],
            #     'subclass'
            # )
            # end_time = time.time()
            # print(f"Execution time (ingest): {end_time - start_time:.2f} seconds")
            #
            # accuracy_ingest = (true_labels_ingest.astype(str) == predicted_labels_ingest.astype(str)).mean()
            # print(f"Accuracy_ingest: {accuracy_ingest:.4f}")
            #
            # f1_ingest = calculate_f1_scores(true_labels_ingest, predicted_labels_ingest)
            # print(f"F1-Score_ingest: {f1_ingest:.4f}")

            # Append results to list
            results.append({
                'Nb_Gene': nb_gene,
                'Slice_ID': slice_id,
                'Accuracy': accuracy,
                'F1_Score': f1,
                # 'Accuracy_ingest': accuracy_ingest,
                # 'F1_Score_ingest': f1_ingest
            })

            # Clean up
            del annotator

    # Save results to a single dataframe
    results_df = pd.DataFrame(results)
    results_df.to_csv(f"{out_folder}/{task}_mapit_{way}_{key_ref}.csv")
    print("\nEnded successfully.")
finally:
    # Runs on success, on errors and on KeyboardInterrupt: give the notebook
    # its output stream back and flush/close the log. If the sweep was cut
    # short, the rows gathered so far are still available in `results`.
    sys.stdout = previous_stdout
    log_file.close()

































In [8]:

# Leave-one-slice-out benchmark of OTAnnotator with mapping strategy 'mean'.
# Progress is written to a log file; one row per (nb_gene, slice) is collected
# in `results` and saved as CSV once the whole sweep has finished.
# NOTE(review): split_data() draws noise from NumPy's global RNG, which is not
# re-seeded here, so the perturbed targets depend on what ran before this cell.

task = 'CellAnnotation_MERFISH'
way = 'mean'
key_tar = 'leiden'
key_ref = 'subclass'
nb_cluster = 10
op_iter = 100
metric = 'cosine'
results = []

# Remember the stream that is active right now (under Jupyter this is the
# kernel's stream, not sys.__stdout__) so it can be restored afterwards.
previous_stdout = sys.stdout
log_file = open(f"{out_folder}/{task}_verbose_mapit_{way}_{key_ref}.log", "w")
sys.stdout = log_file  # Redirect print to the log file
try:
    print(f"""
        ##Configuration:
        Task = {task}
        mapping_strategy = {way}
        distance_metric = {metric}
        reference_annotation = {key_ref}
        target_clustering = {key_tar}
        HyperOPT_iterations = {op_iter}
        """)

    # Iterate over different numbers of genes in common
    for nb_gene in [50, 100, 150, 200]:
        print(f"\n*** Number of genes in common: {nb_gene} ***")

        for slice_id in slices:
            print(f"\n*** Processing slice: {slice_id} ***")

            # 1. Split data into reference and target
            print("1/ Splitting data...")
            adata_ref, adata = split_data(dataset, slice_id, 0.25, nb_gene)
            gene_interest = adata_ref.var_names[:nb_gene]
            # Keep the ground truth under '<key_ref>_org' and blank the column
            # that the annotator is expected to predict.
            adata.obs[key_ref + '_org'] = adata.obs[key_ref]
            adata.obs[key_ref] = None

            # Initialize annotator
            start_time = time.time()
            annotator = OTAnnotator(
                adata, adata_ref, gene_interest,
                param_space=param_space, key_ref=key_ref, key_tar=key_tar, way=way
            )

            print("1.1/ Subclustering...")
            annotator.subcluster(nb_cluster=nb_cluster)

            print("1.2/ Annotating...")
            annotator.annotate(op_iter=op_iter, metric=metric)
            end_time = time.time()
            print(f"Execution time: {end_time - start_time:.2f} seconds")
            print(annotator.best_params)

            # 2. Evaluate predictions
            print("2/ Evaluating predictions...")
            predicted_labels = annotator.adata.obs['predicted_annotation']
            true_labels = annotator.adata.obs[key_ref + '_org']

            # Compute accuracy and F1 score
            accuracy = (true_labels.astype(str) == predicted_labels.astype(str)).mean()
            print(f"Accuracy: {accuracy:.4f}")

            # Keyword arguments so each series lands in the matching parameter
            # (they were previously passed positionally in swapped order).
            f1 = calculate_f1_scores(predictions=predicted_labels, true_labels=true_labels)
            print(f"F1-Score: {f1:.4f}")

            # Append results to list
            results.append({
                'Nb_Gene': nb_gene,
                'Slice_ID': slice_id,
                'Accuracy': accuracy,
                'F1_Score': f1,
            })

            # Clean up
            del annotator

    # Save results to a single dataframe
    results_df = pd.DataFrame(results)
    results_df.to_csv(f"{out_folder}/{task}_mapit_{way}_{key_ref}.csv")
    print("\nEnded successfully.")
finally:
    # Runs on success, on errors and on KeyboardInterrupt: give the notebook
    # its output stream back and flush/close the log. If the sweep was cut
    # short, the rows gathered so far are still available in `results`.
    sys.stdout = previous_stdout
    log_file.close()



KeyboardInterrupt: 