In [1]:
import seaborn as sns
import pandas as pd 
import numpy as np
from scipy.spatial.distance import squareform, pdist
import matplotlib.pyplot as plt
import torch
import anndata as an
import scanpy as sc
import umap
import gc

from datasets import Dataset, load_from_disk
from datasets import load_dataset
from geneformer import EmbExtractor

sns.set_style('white')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [23]:
# Filesystem locations used by the rest of the notebook. These are absolute,
# cluster-specific paths; edit them here when running elsewhere.

# Alternative checkpoint kept for reference (judging by its path, a fine-tuned
# cell-classifier model — confirm before switching to it):
# model = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/geneformer/fine_tuned_models/geneformer-6L-30M_CellClassifier_cardiomyopathies_220224"
# Model directory handed to `embex.extract_embs` as its first argument.
model = "/nfs/turbo/umms-indikar/shared/projects/geneformer/geneformer-12L-30M/"
# Dataset read by `load_from_disk` and also handed to `embex.extract_embs`.
data_path = "/scratch/indikar_root/indikar1/cstansbu/HSC/geneformer_inputs/iHSC.dataset"
# Third argument of `embex.extract_embs` (presumably the output directory — confirm).
outpath = "/scratch/indikar_root/indikar1/cstansbu/geneformer"

In [18]:
def load_and_subset_data(data_path: str, num_cells: int = 100) -> pd.DataFrame:
    """Load a dataset from disk and return its first ``num_cells`` rows as a DataFrame.

    Args:
        data_path (str): Path handed to ``load_from_disk``.
        num_cells (int, optional): Number of leading rows (cells) to keep (default: 100).

    Returns:
        pd.DataFrame: The first ``num_cells`` rows of the dataset.

    Raises:
        ValueError: If ``num_cells`` is negative or exceeds the dataset length.
    """
    # Reject nonsensical sizes before paying for the load; a negative value
    # would otherwise quietly produce an empty selection.
    if num_cells < 0:
        raise ValueError(f"num_cells must be non-negative, got {num_cells}")

    data = load_from_disk(data_path)
    if num_cells > len(data):
        raise ValueError(f"Requested subset size ({num_cells}) exceeds dataset length ({len(data)})")

    # A `range` describes the contiguous prefix directly; there is no need to
    # materialise a Python list holding every index first.
    return data.select(range(num_cells)).to_pandas()

# Example usage: load the first 54346 rows.
# NOTE(review): 54346 is presumably the full dataset size — confirm;
# load_and_subset_data raises ValueError if it exceeds the dataset length.
df = load_and_subset_data(data_path, num_cells=54346)

# Distinct labels present in the `cell_type` column.
print(df['cell_type'].unique())

# Preview the first rows.
df.head()

['iHSC' 'LinNegCD34lowCD164high' 'HSC' 'LinNegCD34PosCD164Pos' 'MPP' 'MLP'
 'FB' 'MKP']


Unnamed: 0,input_ids,cell_id,cell_type,n_counts,dataset,length
0,"[17610, 10632, 3717, 15803, 8008, 1864, 8659, ...",AAACCCAAGGTTACCT_iHSC,iHSC,6558.0,iHSC,2048
1,"[19925, 4387, 11310, 19823, 16979, 1806, 5346,...",AAACCCAAGTTGAAGT_iHSC,iHSC,5488.0,iHSC,2048
2,"[1078, 3546, 17321, 587, 2815, 11814, 8430, 82...",AAACCCAAGTTGTCGT_iHSC,iHSC,4330.0,iHSC,2048
3,"[14192, 8674, 16790, 11523, 2044, 8654, 6995, ...",AAACCCACAGAAGCGT_iHSC,iHSC,3442.0,iHSC,2048
4,"[17126, 4895, 10601, 1362, 3537, 19999, 12030,...",AAACCCACAGGAGGTT_iHSC,iHSC,14427.0,iHSC,2048


In [19]:
# Count the distinct token ids that occur across every cell's `input_ids`.
# Accumulate straight into a set: concatenating every sequence into one flat
# list first keeps all tokens of all cells in memory only to deduplicate them
# afterwards.
all_genes = set()

for sentence in df['input_ids'].values:
    all_genes.update(sentence)

len(all_genes)

18780

In [None]:
# Deliberate stop marker: a bare `break` outside a loop is a SyntaxError, so
# this cell fails and (presumably by design) halts "Run All" before the cells below.
break

In [None]:
# print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [24]:
# Extract embeddings with Geneformer's EmbExtractor.
# NOTE: the recorded output of this cell is a RuntimeError about a missing
# NVIDIA driver, i.e. it was last run on a machine without a usable GPU.

# Release cached, unused GPU memory held by PyTorch before loading the model.
torch.cuda.empty_cache()

# Passed as `max_ncells` below (presumably an upper bound on the cells embedded — confirm).
n_cells = 100
# 0 for last layer, -1 for second to last
layer = 0 

# initiate EmbExtractor
embex = EmbExtractor(model_type="Pretrained",
                     num_classes=0,
                     max_ncells=n_cells,
                     emb_mode='cell',
                     emb_layer=layer,
                     # Dataset columns requested alongside the embedding; later cells
                     # colour plots by them via `adata.obs`.
                     emb_label=["cell_type", "dataset", "n_counts", "length"],
                     forward_batch_size=30,
                     nproc=16,
                      )

# extracts embedding from input data
# "test" is presumably the output file prefix — TODO confirm against
# EmbExtractor.extract_embs.
embs = embex.extract_embs(model,
                          data_path,
                          outpath,
                          "test")


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
# Preview the first rows of the extracted embeddings.
embs.head()

In [None]:
# break

In [None]:
def embedding_to_adata(df: pd.DataFrame, n_dim: int = None) -> an.AnnData:
    """Converts a Pandas DataFrame with an embedding to an AnnData object.

    Columns whose label consists solely of digits (e.g. ``0``, ``1``, ...) are
    treated as embedding dimensions and become ``X``; every other column is
    treated as per-cell metadata and becomes ``obs``.

    Args:
        df: The input DataFrame with digit-labelled embedding columns and optional metadata columns.
        n_dim: The number of leading embedding dimensions to keep. If None, all dimensions are kept.

    Returns:
        The converted AnnData object, with observations named ``C0, C1, ...``
        and variables named ``D0, D1, ...``.

    Raises:
        ValueError: If `n_dim` exceeds the number of embedding columns in the DataFrame.
    """

    # Digit-only labels mark the embedding columns; everything else is metadata.
    is_embedding = df.columns.astype(str).str.isdigit()
    embedding_df = df.loc[:, is_embedding]

    if n_dim is not None:
        # Validate against the embedding columns only: counting metadata
        # columns as well would let an oversized n_dim slip through and be
        # silently truncated by the slice below.
        if n_dim > embedding_df.shape[1]:
            raise ValueError(f"n_dim ({n_dim}) exceeds available dimensions ({embedding_df.shape[1]})")
        embedding_df = embedding_df.iloc[:, :n_dim]

    cell_index = pd.Index([f"C{x}" for x in range(df.shape[0])], name='obs_names')
    var_index = pd.Index([f"D{x}" for x in range(embedding_df.shape[1])], name='var_names')

    # Work on a copy so later writes to `adata.obs` never reach the caller's
    # frame, and put the cell names on the frame itself: assigning `obs`
    # takes the observation names from the assigned frame's index, which
    # would otherwise discard names set on the AnnData beforehand.
    metadata_df = df.loc[:, ~is_embedding].copy()
    metadata_df.index = cell_index

    adata = an.AnnData(embedding_df.to_numpy())
    adata.obs = metadata_df
    adata.var_names = var_index
    return adata

    
# Build the AnnData from the extracted embeddings, keeping the first 50
# embedding dimensions, and display its summary.
adata = embedding_to_adata(embs, n_dim=50)
adata

# PCA of embeddings

In [None]:
# PCA -> neighbour graph -> UMAP on the embedding matrix.
sc.tl.pca(adata)
# Name the PCA representation explicitly. With `use_rep` left unset scanpy
# chooses between `.X` and `X_pca` by comparing `n_vars` with its N_PCS
# setting (default 50), so for a 50-dimensional matrix the PCA computed above
# may be silently ignored.
sc.pp.neighbors(adata, use_rep='X_pca')
sc.tl.umap(adata, min_dist=0.1)
sc.pl.umap(
    adata,
    color=["cell_type", "dataset"],
    ncols=1,
    # Smaller point size to prevent overlap
    size=30,
)

In [None]:
# Deliberate stop marker: a bare `break` outside a loop is a SyntaxError, so
# this cell fails and (presumably by design) halts "Run All" before the cells below.
break

# Neighbor Graph (no PCA of embeddings)

In [None]:
# Build the neighbour graph directly on `adata.X` (use_rep='X'), i.e. without
# going through a PCA representation, then recompute and plot the UMAP.
sc.pp.neighbors(adata, use_rep='X')
sc.tl.umap(adata, min_dist=0.1)
# One panel per `adata.obs` column listed in `color`, stacked in a single column.
sc.pl.umap(
    adata,
    color=["cell_type", "dataset", "n_counts", "length"],
    ncols=1,
    size=30,
)

In [None]:
# Deliberate stop marker: a bare `break` outside a loop is a SyntaxError, so
# this cell fails and (presumably by design) halts "Run All" before the cells below.
break

In [None]:
# Independent 2-D UMAP of the embedding matrix using umap-learn directly.
# Fix the seed so the layout is reproducible across kernel restarts; without
# `random_state` every run produces a different embedding.
reducer = umap.UMAP(random_state=0)
embedding = reducer.fit_transform(adata.X)

# Keep the coordinates next to the metadata so they can be plotted straight
# from `adata.obs`.
adata.obs['UMAP 1'] = embedding[:, 0]
adata.obs['UMAP 2'] = embedding[:, 1]

adata

In [None]:
# Distinct values of the `dataset` column, which is used as the plot hue below.
adata.obs['dataset'].unique()

In [None]:
# Figure defaults must be in place before seaborn creates the figure.
plt.rcParams.update({
    'figure.dpi': 300,
    'figure.figsize': (5, 5),
})

# Scatter of the umap-learn coordinates stored in `adata.obs`, coloured by dataset.
ax = sns.scatterplot(
    data=adata.obs,
    x='UMAP 1',
    y='UMAP 2',
    hue='dataset',
    s=5,
    alpha=0.9,
    ec='none',
    lw=0.1,
)

# UMAP axes carry no interpretable units, so hide the ticks.
ax.set_xticks([])
ax.set_yticks([])

# Park the legend outside the plotting area.
sns.move_legend(ax, loc='upper right', bbox_to_anchor=(1.7, 1))

ax.set_aspect('equal')

# Neighbor Graph (no PCA of embeddings)

In [None]:
# Neighbour graph built on the embedding matrix itself (no PCA), stated
# explicitly via `use_rep='X'` rather than left to scanpy's automatic choice.
sc.pp.neighbors(adata, use_rep='X')
sc.tl.umap(adata)
sc.pl.umap(
    adata,
    color=["cell_type", "dataset"],
    # The keyword is `ncols`; `n_cols` is not a plotting parameter and would
    # be forwarded as an unknown keyword argument.
    ncols=1,
    # Smaller point size to prevent overlap
    size=10,
)

In [None]:
# Deliberate stop marker: a bare `break` outside a loop is a SyntaxError, so
# this cell fails and (presumably by design) halts "Run All" before the cells below.
break

In [None]:
# Integer column labels of `embs` to plot against each other.
comp_1, comp_2 = 4, 5

# Raw embedding dimensions, one point per row of `embs`, coloured by dataset.
ax = sns.scatterplot(
    data=embs,
    x=comp_1,
    y=comp_2,
    hue='dataset',
    s=5,
    alpha=0.9,
    ec='none',
    lw=0.1,
)

# Hide the tick marks on both axes.
ax.set_xticks([])
ax.set_yticks([])

# Park the legend outside the plotting area.
sns.move_legend(ax, loc='upper right', bbox_to_anchor=(1.7, 1))

ax.set_aspect('equal')

In [None]:
# Reduce to 5 principal components and build the graph on them.
sc.tl.pca(adata, n_comps=5)
# Name the PCA representation explicitly. With `use_rep` left unset scanpy
# chooses between `.X` and `X_pca` by comparing `n_vars` with its N_PCS
# setting (default 50), so the 5 components computed above may be ignored.
sc.pp.neighbors(adata, use_rep='X_pca')
sc.tl.umap(adata)
sc.pl.umap(
    adata,
    color="cell_type",
    # Smaller point size to prevent overlap
    size=10,
)

In [None]:
# Deliberate stop marker: a bare `break` outside a loop is a SyntaxError, so
# this cell fails and (presumably by design) halts "Run All" before the cells below.
break

In [None]:
# Leiden clustering of `adata` (presumably on the neighbour graph computed by
# an earlier `sc.pp.neighbors` call — confirm), limited to 2 iterations, then
# colour the existing UMAP by the resulting `leiden` labels.
sc.tl.leiden(adata, n_iterations=2)
sc.pl.umap(adata, color=["leiden"])

In [None]:
# Deliberate stop marker: a bare `break` outside a loop is a SyntaxError, so
# this cell fails and (presumably by design) halts "Run All" before the cells below.
break

In [None]:
# Columns 0 and 1 of `embs` plotted against each other, coloured by dataset.
ax = sns.scatterplot(data=embs, x=0, y=1, hue='dataset', s=10)

# Park the legend outside the plotting area.
sns.move_legend(ax, loc='upper right', bbox_to_anchor=(1.7, 1))

ax.set_aspect('equal')

In [None]:
# Deliberate stop marker: a bare `break` outside a loop is a SyntaxError, so
# this cell fails and (presumably by design) halts "Run All" before the cells below.
break

In [None]:
# PCA -> 21-nearest-neighbour graph -> UMAP.
sc.tl.pca(adata)
# Name the PCA representation explicitly. With `use_rep` left unset scanpy
# chooses between `.X` and `X_pca` by comparing `n_vars` with its N_PCS
# setting (default 50), so the PCA computed above may be silently ignored.
sc.pp.neighbors(adata, n_neighbors=21, use_rep='X_pca')
sc.tl.umap(adata, min_dist=0.2)

adata

In [None]:
# Plot the current UMAP coloured by the `dataset` column of `adata.obs`.
sc.pl.umap(adata, 
           color=['dataset'])

In [None]:
# Deliberate stop marker: a bare `break` outside a loop is a SyntaxError, so
# this cell fails and (presumably by design) halts "Run All" before the cells below.
break

In [None]:
# NOTE(review): this rebinds `df`, which earlier in the notebook holds the
# dataset frame returned by load_and_subset_data; from here on it is a copy of
# the embeddings instead.
df = embs.copy()

# Columns 0 and 2 of the embeddings plotted against each other, coloured by
# dataset with the Set1 palette.
ax = sns.scatterplot(
    data=df,
    x=0,
    y=2,
    hue='dataset',
    palette="Set1",
    s=10,
    alpha=0.6,
)

# Park the legend outside the plotting area.
sns.move_legend(ax, loc='upper right', bbox_to_anchor=(1.4, 1))
