## Install Dependencies

In [1]:
# --- Installs ---

!pip install -U -q scgpt "torch<=2.2.2" "numpy<2" "umap-learn<0.5.7"
!pip install -q wandb louvain faiss-cpu
!pip install -q scanpy

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m831.7/831.7 kB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m119.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [1]:
import scgpt as scg
print("scGPT loaded successfully!", scg.__version__)



scGPT loaded successfully! 0.2.4


In [2]:
# --- Imports ---

import scanpy as sc
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import warnings
from scipy.stats import mode
from pathlib import Path

# extra dependency for similarity search
try:
    import faiss

    faiss_imported = True
    print("faiss imported!")
    print(faiss.__version__)

except ImportError:
    faiss_imported = False
    print(
        "faiss not installed! We highly recommend installing it for fast similarity search."
    )
    print("To install it, see https://github.com/facebookresearch/faiss/wiki/Installing-Faiss")

warnings.filterwarnings("ignore", category=ResourceWarning)
warnings.filterwarnings("ignore", category=ImportWarning)

faiss imported!
1.12.0


## Load Data

In [3]:
# --- Load data ---

ref_adata = sc.read_h5ad("/content/drive/MyDrive/projects/scGPT-MAFLD/data/ref.h5ad")
query_adata = sc.read_h5ad("/content/drive/MyDrive/projects/scGPT-MAFLD/data/MacParland.h5ad")

# Load the cell annotation file
labels = pd.read_csv(
    "/content/drive/MyDrive/projects/scGPT-MAFLD/data/GSE115469_CellClusterType.txt",
    sep="\t",
    index_col=0
)

# Set model directory
model_dir = "/content/drive/MyDrive/projects/scGPT-MAFLD/scGPT_human/"

## Add Ground Truth Labels

In [4]:
# --- Add cell annotations to query_adata ---

# Fix query cell names to match annotation
query_adata.obs_names = query_adata.obs_names.str.replace("^Nash_", "", regex=True)

# Find common cells between query and annotation
common_cells = query_adata.obs_names.intersection(labels.index)
print(f"Number of matching cells: {len(common_cells)}")

# Subset query to only those cells and add true labels
query_adata = query_adata[common_cells, :].copy()
query_adata.obs['celltype_true'] = labels.loc[query_adata.obs_names, 'CellType']

# Verify
print(query_adata.obs[['celltype_true']].head())

Number of matching cells: 8310
                                        celltype_true
P1TLH_AAACCTGAGCAGCCTC_1         Central_venous_LSECs
P1TLH_AAACCTGTCCTCATTA_1               Cholangiocytes
P1TLH_AAACCTGTCTAAGCCA_1         Central_venous_LSECs
P1TLH_AAACGGGAGTAGGCCA_1  Non-inflammatory_Macrophage
P1TLH_AAACGGGGTTCGGGCT_1           alpha-beta_T_Cells


In [5]:
print(query_adata)

AnnData object with n_obs × n_vars = 8310 × 1500
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'celltype_true'
    var: 'vf_vst_counts_mean', 'vf_vst_counts_variance', 'vf_vst_counts_variance.expected', 'vf_vst_counts_variance.standardized', 'vf_vst_counts_variable', 'vf_vst_counts_rank', 'var.features', 'var.features.rank'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'


In [6]:
print(ref_adata)

AnnData object with n_obs × n_vars = 79492 × 3000
    obs: 'celltype.l1', 'celltype.l2', 'ori.index', 'nCount_refAssay', 'nFeature_refAssay'
    var: 'features'
    obsm: 'X_refDR'
    varm: 'REFDR'


In [7]:
# Check if you can access raw counts from reference
if 'raw' in ref_adata.__dict__:
    ref_raw = ref_adata.raw.to_adata()
else:
    # If no raw data, you might need to work with what you have
    print("No raw data available in reference")

No raw data available in reference


## Generate Embeddings

In [None]:
# --- Generate query embeddings ---

%%time

# Set gene column
gene_col = "var.features"

# Generate embeddings
query_embed_adata = scg.tasks.embed_data(
    query_adata,
    model_dir,
    gene_col=gene_col,
    device="cuda",
    use_fast_transformer=False,
)

scGPT - INFO - match 1353/1500 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 130/130 [01:23<00:00,  1.55it/s]
  adata.obsm["X_scGPT"] = cell_embeddings


CPU times: user 1min 13s, sys: 1.27 s, total: 1min 15s
Wall time: 1min 31s


In [None]:
print(query_embed_adata)

AnnData object with n_obs × n_vars = 8310 × 1353
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'celltype_true'
    var: 'vf_vst_counts_mean', 'vf_vst_counts_variance', 'vf_vst_counts_variance.expected', 'vf_vst_counts_variance.standardized', 'vf_vst_counts_variable', 'vf_vst_counts_rank', 'var.features', 'var.features.rank', 'id_in_vocab'
    obsm: 'X_pca', 'X_umap', 'X_scGPT'
    varm: 'PCs'


## Mask Cell Types on Query Cells and Mark Reference vs Query Cells

In [None]:
# Column in reference to use as cell type labels
cell_type_key = "celltype.l2"

# Choose embedding keys
query_embed_key = "X_scGPT"
ref_embed_key = "X_refDR"

# Add a unified embedding key before concatenation
query_embed_adata.obsm["X_emb"] = query_embed_adata.obsm[query_embed_key].copy()
ref_adata.obsm["X_emb"] = ref_adata.obsm[ref_embed_key].copy()

# Concatenate query and reference embeddings
adata_concat = query_embed_adata.concatenate(
    ref_adata,
    batch_key="dataset",
    uns_merge="unique",
    join="outer"
)

# Mark reference vs query
adata_concat.obs["is_ref"] = (
    ["Query"] * query_embed_adata.n_obs + ["Reference"] * ref_adata.n_obs
)
adata_concat.obs["is_ref"] = adata_concat.obs["is_ref"].astype("category")

# Make cell type categorical and add new category
adata_concat.obs[cell_type_key] = adata_concat.obs[cell_type_key].astype("category")
adata_concat.obs[cell_type_key] = adata_concat.obs[cell_type_key].cat.add_categories(["To be predicted"])

# Mask query cell types
adata_concat.obs[cell_type_key][: query_embed_adata.n_obs] = "To be predicted"

  adata_concat = query_embed_adata.concatenate(
  out = concat(
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_concat.obs[cell_type_key][: query_embed_adata.n_obs] = "To be predicted"


In [None]:
print(adata_concat)

AnnData object with n_obs × n_vars = 87802 × 3837
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'celltype_true', 'celltype.l1', 'celltype.l2', 'ori.index', 'nCount_refAssay', 'nFeature_refAssay', 'dataset', 'is_ref'
    var: 'vf_vst_counts_mean-0', 'vf_vst_counts_variance-0', 'vf_vst_counts_variance.expected-0', 'vf_vst_counts_variance.standardized-0', 'vf_vst_counts_variable-0', 'vf_vst_counts_rank-0', 'var.features-0', 'var.features.rank-0', 'id_in_vocab-0', 'features-1'
    obsm: 'X_pca', 'X_umap', 'X_scGPT', 'X_emb', 'X_refDR'


In [None]:
# --- Save files for post-processing UMAPs ---

# Save concatenated file
for col in adata_concat.obs.columns:
    if str(adata_concat.obs[col].dtype) == "category":
        adata_concat.obs[col] = adata_concat.obs[col].astype(str)

adata_concat.write('/content/drive/MyDrive/projects/scGPT-MAFLD/data/query_ref_concat.h5ad')

# Save query embeddings file
query_embed_adata.write('/content/drive/MyDrive/projects/scGPT-MAFLD/data/query_embed.h5ad')

In [None]:
adata_check = sc.read('/content/drive/MyDrive/projects/scGPT-MAFLD/data/query_ref_concat.h5ad')
print(adata_check)

AnnData object with n_obs × n_vars = 87802 × 3837
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'celltype_true', 'celltype.l1', 'celltype.l2', 'ori.index', 'nCount_refAssay', 'nFeature_refAssay', 'dataset', 'is_ref'
    var: 'vf_vst_counts_mean-0', 'vf_vst_counts_variance-0', 'vf_vst_counts_variance.expected-0', 'vf_vst_counts_variance.standardized-0', 'vf_vst_counts_variable-0', 'vf_vst_counts_rank-0', 'var.features-0', 'var.features.rank-0', 'id_in_vocab-0', 'features-1'
    obsm: 'X_emb', 'X_pca', 'X_refDR', 'X_scGPT', 'X_umap'


In [None]:
query_embed_check = sc.read('/content/drive/MyDrive/projects/scGPT-MAFLD/data/query_embed.h5ad')
print(query_embed_check)

AnnData object with n_obs × n_vars = 8310 × 1353
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'celltype_true'
    var: 'vf_vst_counts_mean', 'vf_vst_counts_variance', 'vf_vst_counts_variance.expected', 'vf_vst_counts_variance.standardized', 'vf_vst_counts_variable', 'vf_vst_counts_rank', 'var.features', 'var.features.rank', 'id_in_vocab'
    obsm: 'X_emb', 'X_pca', 'X_scGPT', 'X_umap'
    varm: 'PCs'


## Calculate Performance Metrics