# GenKI ko DBit-seq




## set up

In [1]:
# Install GenKI and torch_geometric as a prerequisite
! pip install git+https://github.com/yjgeno/GenKI.git
! pip install torch_geometric

# Install STRINGdb and Networkx for network biology models
! pip install stringdb
! pip install networkx pandas

Collecting git+https://github.com/yjgeno/GenKI.git
  Cloning https://github.com/yjgeno/GenKI.git to /tmp/pip-req-build-hxg0bhcj
  Running command git clone --filter=blob:none --quiet https://github.com/yjgeno/GenKI.git /tmp/pip-req-build-hxg0bhcj
  Resolved https://github.com/yjgeno/GenKI.git to commit 6d69789b89859eda75ac75dfb1cc00ef190ada41
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
# Restart kernel
import os
os.kill(os.getpid(), 9)

In [2]:
import os
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc

sc.settings.verbosity = 0

In [3]:
import GenKI as gk
from GenKI.preprocesing import build_adata
from GenKI.dataLoader import DataLoader
from GenKI.train import VGAE_trainer
from GenKI import utils

%load_ext autoreload
%autoreload 2

In [4]:
import stringdb
import networkx as nx

In [5]:
# Libraries for data wrangling on the adata object
from scipy.sparse import csr_matrix
from scipy.sparse import issparse

In [6]:
# Download the DBiT-seq dataset suitable for this workshop
!gdown  1D3AIjshsAiIOktmFrvMgqKO1UKiQSPbu

# Assemble adata object for single-cell analysis
adata = sc.read("DBiTseq_UnitedNet.h5ad")
adata

Downloading...
From: https://drive.google.com/uc?id=1D3AIjshsAiIOktmFrvMgqKO1UKiQSPbu
To: /content/DBiTseq_UnitedNet.h5ad
100% 2.36M/2.36M [00:00<00:00, 16.3MB/s]


AnnData object with n_obs × n_vars = 936 × 568
    obs: 'id', 'array_row', 'array_col', 'cell_type', 'batch', 'imagecol', 'imagerow', 'label', 'sample'
    var: 'gene_ids-0', 'gene_ids-1'
    obsm: 'spatial'

## KO one gene example: Six2

In [7]:
# Let's commence with creating a digital KO of Six2 expression in the DBiT-seq mouse embryo dataset.
gene_of_interest = "Six2"

# Verify that the gene of interest is part of the rownames in adata.var
if gene_of_interest in adata.var.index:
    print(f"The gene {gene_of_interest} is present in the rownames of adata.var.")
else:
    print(f"The gene {gene_of_interest} is not present in the rownames of adata.var.")

The gene Six2 is present in the rownames of adata.var.


In [8]:
# adata pre-processing to prepare for input in the GenKI tool
adata.layers["norm"] = adata.X.copy()

# The adata.X should be normalised-scaled AND in sparse matrix format!
if not issparse(adata.X):
    sparse_matrix = csr_matrix(adata.X)
    adata.X = sparse_matrix
    print("Converted adata.X to a sparse matrix.")
else:
    print("adata.X is already a sparse matrix.")

Converted adata.X to a sparse matrix.


In [9]:
# load data

data_wrapper =  DataLoader(
                adata, # adata object
                target_gene = [gene_of_interest], # KO gene name
                target_cell = None, # obsname for cell type, if none use all
                obs_label = "ident", # colname for genes
                GRN_file_dir = "GRNs", # folder name for GRNs
                rebuild_GRN = True, # whether build GRN by pcNet
                pcNet_name = "DBiTseq_example", # GRN file name
                verbose = True, # whether verbose
                n_cpus = 8, # multiprocessing
                )

data_wt = data_wrapper.load_data()
data_ko = data_wrapper.load_kodata()

use all the cells (936) in adata
build GRN


2024-12-05 15:16:06,035	INFO worker.py:1821 -- Started a local Ray instance.


ray init, using 8 CPUs
execution time of making pcNet: 224.69 s
GRN has been built and saved in "GRNs/DBiTseq_example.npz"
init completed



In [10]:
#Changed the hyperparameters to better adapt to the spatial dataset in hand

hyperparams = {
    "epochs": 300,  # Increased epochs for more training
    "lr": 5e-2,  # Adjusted learning rate
    "beta": 5e-4,  # Increased beta for stronger regularization
    "seed": 8096  # Trying a different seed
}


log_dir = None

sensei = VGAE_trainer(
    data_wt,
    epochs=hyperparams["epochs"],
    lr=hyperparams["lr"],
    log_dir=log_dir,
    beta=hyperparams["beta"],
    seed=hyperparams["seed"],
    verbose=False,
)

In [11]:
# %%timeit
sensei.train()

In [12]:
sensei.save_model('model_example_DBiTseq')

save model parameters to model/model_example_DBiTseq.th


In [13]:
# get distance between wt and ko

z_mu_wt, z_std_wt = sensei.get_latent_vars(data_wt)
z_mu_ko, z_std_ko = sensei.get_latent_vars(data_ko)
dis = gk.utils.get_distance(z_mu_ko, z_std_ko, z_mu_wt, z_std_wt, by="KL")
print(dis.shape)

(568,)


In [14]:
# raw ranked gene list

res_raw = utils.get_generank(data_wt, dis, rank=True)
res_raw.head(10)

Unnamed: 0,dis,rank
Six2,8423.716324,1
Col3a1,16.065773,2
Col1a2,8.421597,3
Vcan,6.678489,4
Gpc3,6.598462,5
Lgals1,5.790285,6
Fbn2,5.741572,7
Rbp1,5.73398,8
Col4a1,5.731743,9
Cxcl12,5.60206,10


# Filter the uncharacterised genes in adata


In [15]:
import re

# Get the list of gene names
gene_names = adata.var.index.tolist()

# Define a regex pattern to match gene names that end with 'Rik'
pattern = r'.*Rik$'

# Identify genes to filter out
genes_to_filter = [gene for gene in gene_names if re.match(pattern, gene)]

print(f"Number of genes to filter out: {len(genes_to_filter)}")
print("Genes to filter out:")
print(genes_to_filter)

# Now filter the adata object to exclude these genes
adata_filtered = adata[:, ~adata.var.index.isin(genes_to_filter)].copy()

print(f"Number of genes after filtering: {adata_filtered.n_vars}")

Number of genes to filter out: 6
Genes to filter out:
['1810037I17Rik', '2410002F23Rik', '5730559C18Rik', 'C130093G08Rik', 'C530008M17Rik', 'D930015E06Rik']
Number of genes after filtering: 562


# SHAP value

In [16]:
# Loop the 10 responsive genes from Six2 KO as a example

import pandas as pd

# Load the data from CSV file
file_path = '/content/feature_feature_importance.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Automatically generate the list of genes of interest from the previous GenKI results (Six2 example)
# 'res_raw' is the DataFrame with genes ranked by 'dis' and 'rank'
N = 10
genki_list = res_raw.head(N).index.tolist()

print("Genes of interest:", genki_list)

# Filter data for the genes of interest
filtered_data = data[data['Source'].isin(genki_list)]

# Separate data for RNA -> Niche and RNA -> Protein directions
rna_niche_data = filtered_data[filtered_data['Direction'] == 'RNA -> Niche']
rna_protein_data = filtered_data[filtered_data['Direction'] == 'RNA -> Protein']

# Find the top 3 unique RNA -> Niche and RNA -> Protein interactions with the highest 'Value'
top_rna_niche = (
    rna_niche_data
    .sort_values(by='Value', ascending=False)
    .drop_duplicates(subset=['Source', 'Target'])
    .groupby('Source')
    .head(3)
)

top_rna_protein = (
    rna_protein_data
    .sort_values(by='Value', ascending=False)
    .drop_duplicates(subset=['Source', 'Target'])
    .groupby('Source')
    .head(3)
)

# Identify cases where the 'Target' value is the same as the 'Source' value
rna_protein_duplicates = top_rna_protein[top_rna_protein['Target'] == top_rna_protein['Source']]
rna_niche_duplicates = top_rna_niche[top_rna_niche['Target'] == top_rna_niche['Source']]

# Remove duplicate entries from the original top lists
top_rna_protein_cleaned = top_rna_protein[~(top_rna_protein['Target'] == top_rna_protein['Source'])]
top_rna_niche_cleaned = top_rna_niche[~(top_rna_niche['Target'] == top_rna_niche['Source'])]

# Find the next highest entries for those with matching 'Target' and 'Source'
next_rna_protein = (
    rna_protein_data
    .loc[~rna_protein_data.index.isin(rna_protein_duplicates.index)]
    .sort_values(by='Value', ascending=False)
)

next_rna_protein_add = (
    next_rna_protein
    .groupby('Source')
    .apply(lambda x: x[~x['Target'].isin(top_rna_protein_cleaned['Target'])].head(1))
    .reset_index(drop=True)
)

next_rna_niche = (
    rna_niche_data
    .loc[~rna_niche_data.index.isin(rna_niche_duplicates.index)]
    .sort_values(by='Value', ascending=False)
)

next_rna_niche_add = (
    next_rna_niche
    .groupby('Source')
    .apply(lambda x: x[~x['Target'].isin(top_rna_niche_cleaned['Target'])].head(1))
    .reset_index(drop=True)
)

# Combine the original top lists with the added entries
final_top_rna_protein = (
    pd.concat([top_rna_protein_cleaned, next_rna_protein_add])
    .sort_values(by=['Source', 'Value'], ascending=[True, False])
    .groupby('Source')
    .head(3)
)

final_top_rna_niche = (
    pd.concat([top_rna_niche_cleaned, next_rna_niche_add])
    .sort_values(by=['Source', 'Value'], ascending=[True, False])
    .groupby('Source')
    .head(3)
)

# Combine all results into a single DataFrame
combined_df = pd.concat([final_top_rna_protein, final_top_rna_niche])

# Save the results to a single sheet in an Excel file
output_path = './Top_RNA_Niche_Protein_Interactions_Six2_Sheet.xlsx'  # Replace with your desired output path

with pd.ExcelWriter(output_path) as writer:
    combined_df.to_excel(writer, sheet_name='Top Interactions', index=False)

print(f"Results have been saved to {output_path}")

Genes of interest: ['Six2', 'Col3a1', 'Col1a2', 'Vcan', 'Gpc3', 'Lgals1', 'Fbn2', 'Rbp1', 'Col4a1', 'Cxcl12']
Results have been saved to ./Top_RNA_Niche_Protein_Interactions_Six2_Sheet.xlsx


In [None]:
filtered_data["Source"].unique()

array(['Six2', 'Vcan', 'Fbn2'], dtype=object)

# Loop KO the first three genes and get their Top_RNA_Niche_Protein_Interactions

In [17]:
import pandas as pd
import os

# Create a directory to save the top interactions if it doesn't exist
os.makedirs('Top_Interactions', exist_ok=True)

# Get the first 3 genes in adata_filtered.var.index
genes_of_interest = adata_filtered.var.index[:3].tolist()
print(f"Genes to knock out: {genes_of_interest}")

# Select a dummy gene not in genes_of_interest
all_genes = list(adata_filtered.var.index)
dummy_gene = next(g for g in all_genes if g not in genes_of_interest)
print(f"Using dummy gene for WT data: {dummy_gene}")

# Initialize DataLoader for WT data
data_wrapper_wt = DataLoader(
    adata_filtered,
    target_gene=[dummy_gene],  # Use the dummy gene
    target_cell=None,
    obs_label="cell_type",  # Adjust if necessary
    GRN_file_dir="GRNs",
    rebuild_GRN=True,  # Build the GRN
    pcNet_name="DBiTseq_example",
    verbose=False,  # Set to False to reduce output
    n_cpus=8,
)

# Load WT data (GRN will be built here)
data_wt = data_wrapper_wt.load_data()

# Set hyperparameters
hyperparams = {
    "epochs": 300,
    "lr": 5e-2,
    "beta": 5e-4,
    "seed": 8096,
}

# Train the model on WT data
sensei = VGAE_trainer(
    data_wt,
    epochs=hyperparams["epochs"],
    lr=hyperparams["lr"],
    beta=hyperparams["beta"],
    seed=hyperparams["seed"],
    verbose=False,  # Set to False to reduce output
)

sensei.train()

# Get latent variables for WT data
z_mu_wt, z_std_wt = sensei.get_latent_vars(data_wt)

# Initialize a list to store combined results
combined_results = []

for gene_of_interest in genes_of_interest:
    print(f"\nProcessing gene: {gene_of_interest}")

    # Initialize DataLoader for KO data
    data_wrapper_ko = DataLoader(
        adata_filtered,
        target_gene=[gene_of_interest],
        target_cell=None,
        obs_label="cell_type",
        GRN_file_dir="GRNs",
        rebuild_GRN=False,  # Use the existing GRN
        pcNet_name="DBiTseq_example",
        verbose=False,  # Set to False to reduce output
        n_cpus=8,
    )

    # Load KO data for the gene
    data_ko = data_wrapper_ko.load_kodata()

    # Get latent variables for KO data
    z_mu_ko, z_std_ko = sensei.get_latent_vars(data_ko)

    # Calculate the distance between WT and KO data
    dis = gk.utils.get_distance(z_mu_ko, z_std_ko, z_mu_wt, z_std_wt, by="KL")

    # Get the ranked list of responsive genes
    res_raw = utils.get_generank(data_wt, dis, rank=True)

    # Store the top 10 responsive genes
    top_genes = res_raw.head(10)
    print(f"Top 10 KO Responsive Genes for {gene_of_interest}:\n{top_genes}")

    # Extract the top genes for the current KO gene
    genki_list = top_genes.index.tolist()

    # Load the data from CSV file
    file_path = '/content/feature_feature_importance.csv'  # Replace with your file path
    data = pd.read_csv(file_path)

    # Filter data for the genes of interest
    filtered_data = data[data['Source'].isin(genki_list)]

    # Separate data for RNA -> Niche and RNA -> Protein directions
    rna_niche_data = filtered_data[filtered_data['Direction'] == 'RNA -> Niche']
    rna_protein_data = filtered_data[filtered_data['Direction'] == 'RNA -> Protein']

    # Find the top 3 unique RNA -> Niche and RNA -> Protein interactions with the highest 'Value'
    top_rna_niche = (
        rna_niche_data
        .sort_values(by='Value', ascending=False)
        .drop_duplicates(subset=['Source', 'Target'])
        .groupby('Source')
        .head(3)
    )

    top_rna_protein = (
        rna_protein_data
        .sort_values(by='Value', ascending=False)
        .drop_duplicates(subset=['Source', 'Target'])
        .groupby('Source')
        .head(3)
    )

    # Identify cases where the 'Target' value is the same as the 'Source' value
    rna_protein_duplicates = top_rna_protein[top_rna_protein['Target'] == top_rna_protein['Source']]
    rna_niche_duplicates = top_rna_niche[top_rna_niche['Target'] == top_rna_niche['Source']]

    # Remove duplicate entries from the original top lists
    top_rna_protein_cleaned = top_rna_protein[~(top_rna_protein['Target'] == top_rna_protein['Source'])]
    top_rna_niche_cleaned = top_rna_niche[~(top_rna_niche['Target'] == top_rna_niche['Source'])]

    # Find the next highest entries for those with matching 'Target' and 'Source'
    next_rna_protein = (
        rna_protein_data
        .loc[~rna_protein_data.index.isin(rna_protein_duplicates.index)]
        .sort_values(by='Value', ascending=False)
    )

    next_rna_protein_add = (
        next_rna_protein
        .groupby('Source')
        .apply(lambda x: x[~x['Target'].isin(top_rna_protein_cleaned['Target'])].head(1))
        .reset_index(drop=True)
    )

    next_rna_niche = (
        rna_niche_data
        .loc[~rna_niche_data.index.isin(rna_niche_duplicates.index)]
        .sort_values(by='Value', ascending=False)
    )

    next_rna_niche_add = (
        next_rna_niche
        .groupby('Source')
        .apply(lambda x: x[~x['Target'].isin(top_rna_niche_cleaned['Target'])].head(1))
        .reset_index(drop=True)
    )

    # Combine the original top lists with the added entries
    final_top_rna_protein = (
        pd.concat([top_rna_protein_cleaned, next_rna_protein_add])
        .sort_values(by=['Source', 'Value'], ascending=[True, False])
        .groupby('Source')
        .head(3)
    )

    final_top_rna_niche = (
        pd.concat([top_rna_niche_cleaned, next_rna_niche_add])
        .sort_values(by=['Source', 'Value'], ascending=[True, False])
        .groupby('Source')
        .head(3)
    )

    # Combine all results into a single DataFrame
    combined_df = pd.concat([final_top_rna_protein, final_top_rna_niche])

    # Add a column to indicate the KO gene
    combined_df['KO_Gene'] = gene_of_interest

    # Append to the list
    combined_results.append(combined_df)

# Concatenate all results
all_combined_df = pd.concat(combined_results)

# Save the combined results to a single sheet in an Excel file
output_path = './Top_RNA_Niche_Protein_Interactions_first3_Sheet.xlsx'

with pd.ExcelWriter(output_path) as writer:
    all_combined_df.to_excel(writer, sheet_name='Top Interactions', index=False)

print(f"\nResults have been saved to {output_path}")

Genes to knock out: ['Aagab', 'Aars2', 'Acat1']
Using dummy gene for WT data: Acta2


2024-12-05 15:22:36,507	INFO worker.py:1821 -- Started a local Ray instance.


ray init, using 8 CPUs

Processing gene: Aagab
Top 10 KO Responsive Genes for Aagab:
             dis  rank
Pak3    8.384550     1
Aagab   0.971121     2
Lhx2    0.002852     3
Ptprz1  0.002830     4
Foxg1   0.002344     5
Ednrb   0.002206     6
Glra1   0.002037     7
Nr2e1   0.001805     8
Col3a1  0.001749     9
Hoxa5   0.001715    10

Processing gene: Aars2
Top 10 KO Responsive Genes for Aars2:
             dis  rank
Aars2   0.882645     1
Pds5a   0.000000     2
Pcm1    0.000000     3
Pdgfa   0.000000     4
Pdhb    0.000000     5
Pdlim3  0.000000     6
Pdlim7  0.000000     7
Pdpn    0.000000     8
Phf21b  0.000000     9
Plcd1   0.000000    10

Processing gene: Acat1
Top 10 KO Responsive Genes for Acat1:
             dis  rank
Acat1   0.315872     1
Aagab   0.000000     2
Pds5a   0.000000     3
Pcm1    0.000000     4
Pdgfa   0.000000     5
Pdhb    0.000000     6
Pdlim3  0.000000     7
Pdlim7  0.000000     8
Pdpn    0.000000     9
Phf21b  0.000000    10

Results have been saved to ./To

In [20]:
os.makedirs('Top10_Responsive_Genes', exist_ok=True)

for gene_of_interest in genes_of_interest:
  top_genes_path = f'Top10_Responsive_Genes/Top10_Responsive_Genes_{gene_of_interest}.csv'
  top_genes.to_csv(top_genes_path)
  print(f"Top 10 responsive genes for {gene_of_interest} saved to {top_genes_path}")

Top 10 responsive genes for Aagab saved to Top10_Responsive_Genes/Top10_Responsive_Genes_Aagab.csv
Top 10 responsive genes for Aars2 saved to Top10_Responsive_Genes/Top10_Responsive_Genes_Aars2.csv
Top 10 responsive genes for Acat1 saved to Top10_Responsive_Genes/Top10_Responsive_Genes_Acat1.csv


# Make the python list combine with Protein and KO res Genes


In [29]:
import pandas as pd
import os

interactions_df = pd.read_excel('Top_RNA_Niche_Protein_Interactions_first3_Sheet.xlsx', sheet_name='Top Interactions')
targets = interactions_df['Target'].tolist()
print(f"Extracted {len(targets)} targets from the Excel file.")

genes_of_interest = adata_filtered.var.index[:3].tolist()
print(f"Genes of interest: {genes_of_interest}")

all_responsive_genes = []

for gene_of_interest in genes_of_interest:
    file_name = f'Top10_Responsive_Genes/Top10_Responsive_Genes_{gene_of_interest}.csv'
    if os.path.exists(file_name):
        df_top_genes = pd.read_csv(file_name, index_col=0)
        responsive_genes = df_top_genes.index.tolist()
        all_responsive_genes.extend(responsive_genes)
        print(f"Added {len(responsive_genes)} responsive genes from {file_name}.")
    else:
        print(f"File {file_name} does not exist.")

combined_list = targets + all_responsive_genes
print(f"Total combined genes before removing duplicates: {len(combined_list)}.")

# Remove duplicates
combined_list = list(set(combined_list))
print(f"Total combined genes after removing duplicates: {len(combined_list)}.")

# Save it to a file
output_file = 'Combined_Genes_List.txt'
with open(output_file, 'w') as f:
    for gene in combined_list:
        f.write(f"{gene}\n")
print(f"Combined list saved to {output_file}.")

Extracted 72 targets from the Excel file.
Genes of interest: ['Aagab', 'Aars2', 'Acat1']
Added 10 responsive genes from Top10_Responsive_Genes/Top10_Responsive_Genes_Aagab.csv.
Added 10 responsive genes from Top10_Responsive_Genes/Top10_Responsive_Genes_Aars2.csv.
Added 10 responsive genes from Top10_Responsive_Genes/Top10_Responsive_Genes_Acat1.csv.
Total combined genes before removing duplicates: 102.
Total combined genes after removing duplicates: 49.
Combined list saved to Combined_Genes_List.txt.
