In [1]:
import os
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import re

sc.settings.verbosity = 0

import GenKI as gk
from GenKI.preprocesing import build_adata
from GenKI.dataLoader import DataLoader
from GenKI.train import VGAE_trainer
from GenKI import utils

import stringdb
import networkx as nx

from scipy.sparse import csr_matrix
from scipy.sparse import issparse

  from .autonotebook import tqdm as notebook_tqdm
2025-06-26 13:07:36,399	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-06-26 13:07:36.871530: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-26 13:07:36.920508: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-26 13:07:36.935633: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-26 13:07:36.967803: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instruc

In [2]:
# Load the dataset from an .h5ad file expected in the current working directory
# (relative path; the file name suggests Visium RNA data — adjust if it lives elsewhere).
adata = sc.read("VisiumRNABC_v3.h5ad")

In [None]:
# # Load your deconvolution matrix
# import pandas as pd

# deconv_df = pd.read_excel("Cell_types_Probabilities.xlsx", index_col=0)

# # Optional: check alignment between adata.obs_names and deconv_df.index
# matching_barcodes = adata.obs_names.intersection(deconv_df.index)
# deconv_df = deconv_df.loc[matching_barcodes]

# # Add the matrix to adata.obsm
# adata.obsm["deconvolution"] = deconv_df.values

# # Store the column names (cell types)
# adata.uns["deconvolution_celltypes"] = deconv_df.columns.tolist()

In [None]:
# adata.uns["deconvolution_celltypes"]

In [None]:
# adata.obsm["deconvolution"]

In [3]:
# Display the AnnData summary (dimensions and the available obs/var/uns/obsm/layers keys).
adata

AnnData object with n_obs × n_vars = 4011 × 500
    obs: 'in_tissue', 'array_row', 'array_col', 'spot_id', 'region', 'leiden'
    var: 'gene_ids', 'feature_types', 'genome', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'spatial', 'spatialdata_attrs'
    obsm: 'X_pca', 'spatial'
    varm: 'PCs'
    layers: 'raw_layer'
    obsp: 'connectivities', 'distances'

In [4]:
import scanpy as sc
import anndata as ad

# Standardize the data (zero mean, unit variance).
# The return value is not captured, so this relies on scanpy updating `adata`
# in place: every later cell sees the scaled matrix in adata.X.
sc.pp.scale(adata, zero_center=True)

In [None]:
# ### Optional check: confirm that a candidate KO gene (here TP53) is present in this dataset before running a digital KO.
# gene_of_interest = "TP53"

# # Verify that the gene of interest is part of the rownames in adata.var
# if gene_of_interest in adata.var.index:
#     print(f"The gene {gene_of_interest} is present in the rownames of adata.var.")
# else:
#     print(f"The gene {gene_of_interest} is not present in the rownames of adata.var.")

In [5]:
# Prepare adata for GenKI.
# Keep a copy of the current (already scaled) matrix in its own layer first.
adata.layers["norm"] = adata.X.copy()

# GenKI expects adata.X to be normalised/scaled AND stored as a sparse matrix,
# so convert it only when it is still dense.
if issparse(adata.X):
    print("adata.X is already a sparse matrix.")
else:
    adata.X = csr_matrix(adata.X)
    print("Converted adata.X to a sparse matrix.")

Converted adata.X to a sparse matrix.


In [7]:

combined_results = []
#all_genes_of_interest = ['PGR', 'TACSTD2', 'KRT14','KRT17','CXCL9','CXCL10','CXCL13']
all_genes_of_interest = ['AREG', 'BCL11A', 'EPAS1']
num_GenKI_KO_responsive_genes = 30  # number of top-ranked KO-responsive genes kept per KO gene
num_shap = 5  # number of interactions kept per source gene and per direction

# Training settings handed to VGAE_trainer; identical for every KO gene,
# so they are defined once instead of on every loop iteration.
hyperparams = {
    "epochs": 100,
    "lr": 7e-4,
    "beta": 1e-4,
    "seed": 8096,
}
log_dir = None

# Folder for the per-KO-gene CSVs. It must be the same folder that the CSV path
# below points into (the previous code created 'Top_Responsive_Genes' but wrote
# into 'Top{N}_Responsive_Genes', which fails when that folder does not exist).
top_genes_dir = f'Top{num_GenKI_KO_responsive_genes}_Responsive_Genes'
os.makedirs(top_genes_dir, exist_ok=True)

# Feature-to-feature importance table. It does not depend on the KO gene, so it
# is read once up front; a missing file now fails before any model is trained.
file_path = 'feature_feature_importance.csv'  # Replace with your file path
data = pd.read_csv(file_path)


def _top_interactions_per_source(direction_data, n_top):
    """Return up to ``n_top`` highest-'Value' interactions per 'Source'.

    Parameters
    ----------
    direction_data : pandas.DataFrame
        Rows of a single 'Direction', with at least the columns
        'Source', 'Target' and 'Value'.
    n_top : int
        Maximum number of rows kept per 'Source'.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by 'Source' (ascending) and 'Value' (descending), without
        self-interactions (rows where 'Target' equals 'Source').
    """
    # Best row of every (Source, Target) pair, then the n_top best pairs per source.
    top = (
        direction_data
        .sort_values(by='Value', ascending=False)
        .drop_duplicates(subset=['Source', 'Target'])
        .groupby('Source')
        .head(n_top)
    )

    # Self-interactions are dropped from the top list ...
    top_cleaned = top[top['Target'] != top['Source']]

    # ... and each source gets one replacement candidate. All self-interaction
    # rows are excluded here, not only the ones that made it into `top`;
    # otherwise a lower-valued duplicate of the same self pair could be re-added.
    candidates = (
        direction_data[direction_data['Target'] != direction_data['Source']]
        .sort_values(by='Value', ascending=False)
    )
    # NOTE(review): as in the original code, targets already selected for ANY
    # source are skipped, not just those of the same source — confirm intent.
    replacements = (
        candidates[~candidates['Target'].isin(top_cleaned['Target'])]
        .groupby('Source')
        .head(1)  # rows are sorted by Value, so this is the best remaining row per source
    )

    return (
        pd.concat([top_cleaned, replacements])
        .sort_values(by=['Source', 'Value'], ascending=[True, False])
        .groupby('Source')
        .head(n_top)
    )


for gene_of_interest in all_genes_of_interest:
    print(f"\nProcessing gene: {gene_of_interest}")

    data_wrapper = DataLoader(
        adata,  # adata object
        target_gene=[gene_of_interest],  # KO gene name
        target_cell=None,  # obsname for cell type, if none use all
        # adata.obs has no "ident" column in this notebook; presumably ignored
        # while target_cell is None — TODO confirm against GenKI.
        obs_label="ident",
        GRN_file_dir="GRNs",  # folder name for GRNs
        rebuild_GRN=False,  # whether build GRN by pcNet
        pcNet_name="Visium500_breast_cancer_example",  # GRN file name
        verbose=True,  # whether verbose
        n_cpus=20,  # multiprocessing
    )

    data_wt = data_wrapper.load_data()
    data_ko = data_wrapper.load_kodata()

    # A fresh model is trained on the WT data for every KO gene.
    sensei = VGAE_trainer(
        data_wt,
        epochs=hyperparams["epochs"],
        lr=hyperparams["lr"],
        log_dir=log_dir,
        beta=hyperparams["beta"],
        seed=hyperparams["seed"],
        verbose=False,
    )

    sensei.train()
    #sensei.save_model('Visium500_breast_cancer_model_example')

    # Latent variables for the KO and the WT data from the same trained model
    z_mu_ko, z_std_ko = sensei.get_latent_vars(data_ko)
    z_mu_wt, z_std_wt = sensei.get_latent_vars(data_wt)

    # Per-gene distance between the WT and KO latent distributions
    dis = utils.get_distance(z_mu_ko, z_std_ko, z_mu_wt, z_std_wt, by="KL")

    # Ranked list of responsive genes (also read by later cells as `res_raw`)
    res_raw = utils.get_generank(data_wt, dis, rank=True)

    # Keep and store the top-N responsive genes
    top_genes = res_raw.head(num_GenKI_KO_responsive_genes)
    print(f"Top {num_GenKI_KO_responsive_genes} KO Responsive Genes for {gene_of_interest}:\n{top_genes}")

    top_genes_path = f'{top_genes_dir}/Top{num_GenKI_KO_responsive_genes}_Responsive_Genes_{gene_of_interest}.csv'
    top_genes.to_csv(top_genes_path)

    # Top genes for the current KO gene (the printed ranking shows the KO gene itself at rank 1)
    genki_list = top_genes.index.tolist()

    # Restrict the importance table to interactions whose source is a responsive gene
    filtered_data = data[data['Source'].isin(genki_list)]

    # Same selection procedure for both directions
    final_top_rna_protein = _top_interactions_per_source(
        filtered_data[filtered_data['Direction'] == 'RNA -> Protein'], num_shap
    )
    final_top_rna_niche = _top_interactions_per_source(
        filtered_data[filtered_data['Direction'] == 'RNA -> Niche'], num_shap
    )

    # Combine both directions and tag the rows with the KO gene
    combined_df = pd.concat([final_top_rna_protein, final_top_rna_niche])
    combined_df['KO_Gene'] = gene_of_interest

    combined_results.append(combined_df)

# Stack the per-KO-gene tables into one frame
all_combined_df = pd.concat(combined_results)

# Write everything to a single sheet of one Excel workbook
output_path = 'Top_RNA_Niche_Protein_Interactions_UnitedNet.xlsx'
all_combined_df.to_excel(output_path, sheet_name='Top Interactions', index=False)

print(f"\nResults have been saved to {output_path}")

## Downstream processing: reload the saved interactions from disk

interactions_df = pd.read_excel(output_path, sheet_name='Top Interactions')

# Every distinct name that occurs as a Target or a Source (targets first),
# as a plain Python list
genes = (
    pd.concat([interactions_df["Target"], interactions_df["Source"]])
    .unique()
    .tolist()
)

import requests

species = 9606  # NCBI taxonomy ID for human

# Step 1: Resolve the names through STRING; names STRING does not know are dropped
string_ids_df = stringdb.get_string_ids(genes)
string_ids = string_ids_df.queryItem.tolist()

# Step 2: Get interaction network
def get_interactions(string_ids, species):
    """Query the STRING `network` endpoint and return its JSON as a DataFrame.

    Parameters
    ----------
    string_ids : list of str
        Identifiers (gene names or STRING IDs) to query.
    species : int
        NCBI taxonomy ID.

    Returns
    -------
    pandas.DataFrame
        One row per reported interaction; has no columns when STRING returns
        an empty list.

    Raises
    ------
    requests.HTTPError
        If STRING answers with an error status.
    """
    url = "https://string-db.org/api/json/network"
    params = {
        # STRING separates identifiers with a carriage return. requests
        # percent-encodes the value itself (-> %0D); joining with the literal
        # text "%0d" would be encoded a second time (-> %250d).
        "identifiers": "\r".join(string_ids),
        "species": species,
    }
    r = requests.get(url, params=params, timeout=60)
    r.raise_for_status()  # fail loudly instead of parsing an error page
    return pd.DataFrame(r.json())

# Step 3: Format output
interactions_string_df = get_interactions(string_ids, species)

if interactions_string_df.empty:
    # No interaction reported: keep the expected columns so the merge still works.
    df_edges = pd.DataFrame({
        "Source": pd.Series(dtype=object),
        "Target": pd.Series(dtype=object),
        "combined_score": pd.Series(dtype=float),
    })
else:
    edges_forward = (
        interactions_string_df[["preferredName_A", "preferredName_B", "score"]]
        .rename(columns={
            "preferredName_A": "Source",
            "preferredName_B": "Target",
            "score": "combined_score",
        })
    )
    # STRING associations are undirected and each pair is reported in one
    # orientation only, so add the mirrored pairs before the direction-sensitive merge.
    edges_reverse = edges_forward.rename(columns={"Source": "Target", "Target": "Source"})
    df_edges = (
        pd.concat([edges_forward, edges_reverse], ignore_index=True)
        .drop_duplicates(subset=["Source", "Target"])  # one score per pair, so the merge cannot multiply rows
    )

# Merge: left join from the original interactions (keep every row, add combined_score if available)
df_merged = interactions_df.merge(df_edges, on=["Target", "Source"], how="left")

#interaction_df = df_merged.dropna(subset=['combined_score'])

# Save the STRING-annotated table. The previous code wrote `all_combined_df`
# here, i.e. a copy of the first workbook without the STRING scores.
output_path = 'Top_RNA_Niche_Protein_Interactions_UnitedNet_StringDB_filtered.xlsx'

with pd.ExcelWriter(output_path) as writer:
    df_merged.to_excel(writer, sheet_name='Top Interactions', index=False)

print(f"\nResults have been saved to {output_path}")


interactions_df = pd.read_excel('Top_RNA_Niche_Protein_Interactions_UnitedNet_StringDB_filtered.xlsx', sheet_name='Top Interactions')

# `all_genes_of_interest` is the list defined at the top of this cell; it is
# deliberately not redefined here so the two loops cannot drift apart.
print(f"Genes of interest: {all_genes_of_interest}")

# Build targets list per KO gene as a dictionary
targets_dict = {
    ko_gene: interactions_df.loc[interactions_df['KO_Gene'] == ko_gene, 'Target'].tolist()
    for ko_gene in all_genes_of_interest
}

# Process each gene of interest
for gene_of_interest in all_genes_of_interest:
    print(f"\nProcessing gene of interest: {gene_of_interest}")

    # Get the targets for this KO gene
    targets = targets_dict[gene_of_interest]
    print(f"Targets from Excel for {gene_of_interest}: {len(targets)} genes")

    # Responsive genes saved by the KO loop; stays empty when the CSV is missing
    responsive_genes = []

    file_name = f'Top{num_GenKI_KO_responsive_genes}_Responsive_Genes/Top{num_GenKI_KO_responsive_genes}_Responsive_Genes_{gene_of_interest}.csv'
    if os.path.exists(file_name):
        df_top_genes = pd.read_csv(file_name, index_col=0)
        responsive_genes = df_top_genes.index.tolist()
        print(f"Loaded {len(responsive_genes)} responsive genes from {file_name}.")
    else:
        print(f"File {file_name} does not exist. Skipping Top responsive genes.")

    # Combine targets + responsive_genes
    combined_list = targets + responsive_genes
    print(f"Total combined genes before removing duplicates: {len(combined_list)}.")

    # Remove duplicates while keeping first-seen order. A plain set() would make
    # the order of the output file change from run to run (string hashing is
    # randomised per interpreter session).
    combined_list = list(dict.fromkeys(combined_list))
    print(f"Total combined genes after removing duplicates: {len(combined_list)}.")

    # Save to a file, one gene per line
    output_file = f'Combined_Genes_List_{gene_of_interest}.txt'
    with open(output_file, 'w') as f:
        f.writelines(f"{gene}\n" for gene in combined_list)

    print(f"Combined list saved to {output_file}.")



Processing gene: AREG
use all the cells (4011) in adata
loading GRN from "GRNs/Visium500_breast_cancer_example.npz"
init completed

Top 30 KO Responsive Genes for AREG:
                dis  rank
AREG      90.400504     1
IL1RN      0.003106     2
S100A8     0.003032     3
CHI3L1     0.002981     4
CPB1       0.002867     5
S100A9     0.002743     6
MMP12      0.002689     7
SOCS3      0.002624     8
GOLM1      0.002549     9
IFIT2      0.002467    10
MMP7       0.002395    11
HERC5      0.002385    12
TTC39A     0.002378    13
SEMA3C     0.002317    14
PARD6B     0.002253    15
DHRS2      0.002244    16
CA12       0.002189    17
GABRP      0.002183    18
CXCL11     0.002154    19
AGR2       0.002127    20
PTHLH      0.002097    21
IGFBP3     0.002066    22
REPS2      0.002037    23
APOBEC3A   0.002023    24
SYNPO2     0.002019    25
CXCL14     0.001973    26
HECTD2     0.001960    27
FCRL2      0.001957    28
SPP1       0.001948    29
IGFBP5     0.001947    30

Processing gene: BCL11A

In [None]:
# Pathway enrichment (STRING) for the genes most affected by the digital KO.
# NOTE: `res_raw` is whatever the KO loop left behind, i.e. the ranking of the
# LAST gene in `all_genes_of_interest`. Positions 1-9 are used (ranks 2-10);
# position 0 is skipped — in the printed ranking it is the knocked-out gene itself.
genes = list(res_raw.index[1:10])

string_ids = stringdb.get_string_ids(genes, species=9606)   # human is 'species=9606'

# Perform pathway enrichment
enrichment_df = stringdb.get_enrichment(string_ids.queryItem, species=9606)

# Most significant terms first: the smallest FDR is the strongest enrichment.
# (Sorting in descending order would pick the 10 least significant terms.)
sorted_df = enrichment_df.sort_values(by='fdr', ascending=True)

# Select the top 10 pathways to visualise
top_10 = sorted_df.head(10)
top_10_pathways = top_10['description']

print("Top 10 Pathways:")
print(top_10_pathways)

# Plot -log10(FDR) per pathway: longer bar = more significant
plt.figure(figsize=(10, 6))
plt.barh(top_10['description'], -np.log10(top_10['fdr'].astype(float)))
plt.gca().invert_yaxis()  # barh draws the first row at the bottom; put the best term on top
plt.xlabel('-log10(FDR)')
plt.ylabel('Description')
plt.title('Top 10 Pathways: Negative Log10 FDR values by Description')
plt.show()

In [None]:
# Visualise the STRING network of the queried genes with NetworkX.
# Uses `string_ids` (the DataFrame returned by stringdb.get_string_ids) from the
# enrichment cell; only interactions with a score of at least 400 are requested.
network_df = stringdb.get_network(string_ids.queryItem, species=9606, required_score=400)

# One undirected edge per reported interaction, weighted by the STRING score
G = nx.Graph()
for _, row in network_df.iterrows():
    G.add_edge(row['preferredName_A'], row['preferredName_B'], weight=row['score'])

# Draw the graph. spring_layout starts from random positions, so a fixed seed
# is needed for the figure to look the same on every run.
pos = nx.spring_layout(G, seed=42)  # positions for all nodes
nx.draw(G, pos, with_labels=True, node_size=700, node_color='lightblue', font_size=10)

# Draw edge labels with weights
edge_labels = nx.get_edge_attributes(G, 'weight')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

plt.show()

#