In [1]:
import scanpy as sc
import scgpt
from beeline import GENIE3, GRNBoost2, PIDC

ModuleNotFoundError: No module named 'scgpt'

In [None]:
import copy
import json
import os
from pathlib import Path
import sys
import warnings

import torch
from anndata import AnnData
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pandas as pd
import tqdm
import gseapy as gp

from torchtext.vocab import Vocab
from torchtext._torchtext import (
    Vocab as VocabPybind,
)

sys.path.insert(0, "../")
import scgpt as scg
from scgpt.tasks import GeneEmbedding
from scgpt.tokenizer.gene_tokenizer import GeneVocab
from scgpt.model import TransformerModel
from scgpt.preprocess import Preprocessor
from scgpt.utils import set_seed

In [None]:
# Silence every Python-level warning for the rest of the session.
warnings.filterwarnings("ignore")
# Export KMP_WARNINGS=off for this process and its children
# (presumably read by the OpenMP runtime -- confirm it is still needed).
os.environ["KMP_WARNINGS"] = "off"

In [None]:
# Fix the random seed first so everything below is reproducible.
set_seed(42)

# Special tokens that must be present in the gene vocabulary.
pad_token = "<pad>"
special_tokens = [pad_token, "<cls>", "<eoc>"]

# Sizes used by the preprocessing cell: number of highly variable genes to
# keep and number of expression bins. The model's input bin count mirrors
# the binning value.
n_hvg = 1200
n_bins = 51
n_input_bins = n_bins

# Sentinel values; pad_value is handed to the model constructor further down.
mask_value, pad_value = -1, -2

In [None]:
# NOTE: despite its name, data_dir points at the .h5ad file itself.
data_dir = Path("Immune_ALL_human.h5ad")
# cache=True is forwarded to scanpy's reader unchanged.
adata = sc.read(str(data_dir), cache=True)

In [None]:
# Directory holding the pretrained checkpoint and its companion files.
model_dir = Path("scGPT_bc")
# Model arguments, weights and gene vocabulary all live inside model_dir.
model_config_file, model_file, vocab_file = (
    model_dir / file_name
    for file_name in ("args.json", "best_model.pt", "vocab.json")
)

In [None]:
# Load the gene vocabulary shipped with the checkpoint, then make sure every
# special token is present, appending only the ones that are missing.
vocab = GeneVocab.from_file(vocab_file)
for token in special_tokens:
    if token in vocab:
        continue
    vocab.append_token(token)

In [None]:
# Read the arguments the checkpoint was trained with from its JSON config.
with open(model_config_file, "r") as config_fp:
    model_configs = json.load(config_fp)

resume_message = (
    f"Resume model from {model_file}, the model args will override the "
    f"config {model_config_file}."
)
print(resume_message)

In [None]:
# Pull the architecture hyper-parameters out of the checkpoint config.
# Note the config key is "nheads" while the local name is nhead.
embsize, nhead, d_hid, nlayers, n_layers_cls = (
    model_configs[config_key]
    for config_key in ("embsize", "nheads", "d_hid", "nlayers", "n_layers_cls")
)

# Token -> index mapping of the vocabulary.
gene2idx = vocab.get_stoi()

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ntokens = len(vocab)  # size of vocabulary
model = TransformerModel(
    ntokens,
    embsize,
    nhead,
    d_hid,
    nlayers,
    vocab=vocab,
    pad_value=pad_value,
    n_input_bins=n_input_bins,
)

# Read the checkpoint exactly once and map it onto the CPU, so weights saved
# on a GPU can still be restored on a CPU-only machine. The model itself is
# moved to `device` at the end of the cell.
pretrained_dict = torch.load(model_file, map_location="cpu")

try:
    model.load_state_dict(pretrained_dict)
    print(f"Loading all model params from {model_file}")
except RuntimeError:
    # A strict load raises RuntimeError when keys or tensor shapes disagree.
    # In that case keep only the parameters that exist in the model with a
    # matching shape. Other exceptions (e.g. KeyboardInterrupt) propagate.
    model_dict = model.state_dict()
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items()
        if k in model_dict and v.shape == model_dict[k].shape
    }
    for k, v in pretrained_dict.items():
        print(f"Loading params {k} with shape {v.shape}")
    # Merge and load once, after reporting, rather than once per parameter.
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)

model.to(device)

In [None]:
# Step 2: GRN inference methods (imported from `beeline`) to run on each input
methods = [
    GENIE3,
    GRNBoost2,
    PIDC,
]

In [None]:
# The data is treated as already normalised; the preprocessing cell reads
# this flag to decide on log1p and on the HVG flavour.
data_is_raw = False
# Name of the obs column that holds the batch label.
ori_batch_col = "batch"
# Expose the annotation under a "celltype" column, stored as plain strings.
adata.obs["celltype"] = adata.obs["final_annotation"].astype(str)

In [None]:
# Preprocess the data following the scGPT data pre-processing pipeline.
# Options are collected first so each step can be read (and tuned) on its own.
preprocessor_options = dict(
    use_key="X",  # key of the data to use as raw input
    filter_gene_by_counts=3,  # step 1: gene filter threshold
    filter_cell_by_counts=False,  # step 2: cell filtering is switched off
    normalize_total=1e4,  # step 3: target sum for normalisation
    result_normed_key="X_normed",  # layer key for the normalised data
    log1p=data_is_raw,  # step 4: log1p only when the input is raw
    result_log1p_key="X_log1p",
    subset_hvg=n_hvg,  # step 5: number of highly variable genes to keep
    hvg_flavor="seurat_v3" if data_is_raw else "cell_ranger",
    binning=n_bins,  # step 6: number of expression bins
    result_binned_key="X_binned",  # layer key for the binned data
)
preprocessor = Preprocessor(**preprocessor_options)
# Applied to adata directly; the batch column is passed as batch_key.
preprocessor(adata, batch_key="batch")

In [None]:
# Infer one GRN per method from the scGPT embeddings stored in adata.obsm.
# No visible cell of this notebook writes "X_gpt", so fail early with an
# actionable message instead of an opaque KeyError from the lookup below.
# NOTE(review): obsm is indexed by observations (cells); if *gene*
# embeddings are intended they may need to come from elsewhere -- confirm.
if "X_gpt" not in adata.obsm:
    raise KeyError(
        'adata.obsm["X_gpt"] is missing: compute and store the scGPT '
        "embeddings before running GRN inference on them."
    )
grns_from_embeddings = [method.fit(adata.obsm["X_gpt"]) for method in methods]

# Infer one GRN per method directly from the expression matrix.
grns_from_expression = [method.fit(adata.X) for method in methods]

In [None]:
import networkx as nx

def load_gtn(file):
    """
    Read a GraphML file and return the graph it describes.

    Parameters
    ----------
    file : str
        Path of the GraphML file to read.

    Returns
    -------
    nx.Graph
        The graph object produced by ``nx.read_graphml``.
    """
    return nx.read_graphml(file)

In [None]:
def _as_edge_set(network):
    """Return the (source, target) pairs of ``network`` as a set."""
    if hasattr(network, "itertuples"):
        # Table-like input (e.g. a DataFrame): first two columns are the edge.
        rows = network.itertuples(index=False, name=None)
    elif hasattr(network, "edges"):
        # Graph-like input: ``edges`` may be a method/view or a plain attribute.
        edges = network.edges
        rows = edges() if callable(edges) else edges
    else:
        # Anything else is treated as an iterable of edge tuples.
        rows = network
    return {(row[0], row[1]) for row in rows}


def evaluate_grn(model, data):
    """Score an inferred regulatory network against a reference network.

    NOTE(review): the original definition had no body, so this edge-overlap
    metric is a proposed completion -- confirm it matches the intended
    evaluation before relying on the numbers.

    Parameters
    ----------
    model : graph-like, table-like, or iterable of tuples
        The inferred network. Accepted forms: an object with ``edges``
        (such as a networkx graph), an object with ``itertuples`` whose first
        two columns are source and target, or an iterable of tuples whose
        first two items are source and target (extra items are ignored).
    data : same forms as ``model``
        The reference (ground-truth) network.

    Returns
    -------
    dict
        ``true_positives``, ``n_predicted``, ``n_reference`` (ints) and
        ``precision``, ``recall``, ``f1`` (floats). Ratios with an empty
        denominator are reported as 0.0.

    Notes
    -----
    Edges are compared as ordered pairs, so ``(a, b)`` and ``(b, a)`` are
    different edges.
    """
    predicted_edges = _as_edge_set(model)
    reference_edges = _as_edge_set(data)
    true_positives = len(predicted_edges & reference_edges)

    precision = true_positives / len(predicted_edges) if predicted_edges else 0.0
    recall = true_positives / len(reference_edges) if reference_edges else 0.0
    denominator = precision + recall
    f1 = 2 * precision * recall / denominator if denominator else 0.0

    return {
        "true_positives": true_positives,
        "n_predicted": len(predicted_edges),
        "n_reference": len(reference_edges),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
# Load the reference network. load_gtn returns a single graph, so every
# inferred GRN is scored against that same graph. The previous zip() paired
# each GRN with one item obtained by iterating the graph (not the graph
# itself) and stopped at the shorter of the two sequences.
# NOTE(review): load_gtn parses GraphML -- confirm "ground_truth.txt" is in
# that format.
ground_truth_grns = load_gtn("ground_truth.txt")

# Evaluate GRNs inferred from gene embeddings (one result per method, in the
# same order as `methods`). Results are kept instead of being discarded.
embedding_grn_scores = [
    evaluate_grn(grn, ground_truth_grns) for grn in grns_from_embeddings
]

# Evaluate GRNs inferred from gene expression data
expression_grn_scores = [
    evaluate_grn(grn, ground_truth_grns) for grn in grns_from_expression
]