In [1]:
import sys
import seaborn as sns
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

import torch
import anndata as an
import scanpy as sc
import os
import gc
from importlib import reload

from datasets import Dataset, load_from_disk
from datasets import load_dataset

# local imports
# Put the relative scripts directory first on the module search path before
# importing `geneformer_utils`; the insert has to run ahead of the import.
# NOTE(review): the path is relative, so this presumably relies on the kernel's
# working directory being the notebook's own folder — confirm.
sys.path.insert(0, '../../scripts/')
import geneformer_utils as gtu

# Select seaborn's 'white' style for plotting.
sns.set_style('white')
# Ask PyTorch to release cached GPU memory that is not currently in use.
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the cell-type CSV and list its distinct standardized labels.
# NOTE(review): `ctypes` is also the name of a standard-library module; the
# variable name is kept unchanged because later cells may reference it.
fpath = "ihsc_cell_types.csv"
ctypes = pd.read_csv(fpath)

# A notebook cell renders only its final expression, so this is the single
# displayed result: the unique labels in alphabetical order.
sorted(ctypes['standardized_cell_type'].unique())

['B Cell',
 'Common Lymphoid Progenitor',
 'Common Myeloid Progenitor',
 'Dendritic Cell',
 'Endothelial Cell',
 'Erythrocyte',
 'Fibroblast',
 'Granulocyte',
 'Granulocyte-Macrophage Progenitor',
 'HSC',
 'Macrophage',
 'Mast Cell',
 'Megakaryocyte-Erythroid Progenitor',
 'Monocyte',
 'Multi-Lymphoid Progenitor',
 'Multipotent Progenitor',
 'NK Cell',
 'NK T Cell',
 'Neutrophil',
 'Plasma Cell',
 'Smooth Muscle Cell',
 'T Cell',
 'iHSC']

# Data Summary

In [3]:
# Path of the saved dataset to load.
# NOTE(review): this is a hard-coded absolute path, so the cell only runs on a
# machine where that location exists — consider a configurable data directory.
fpath = "/scratch/indikar_root/indikar1/shared_data/geneformer/fine_tune/hsc.dataset"

# Load the dataset from disk with `datasets.load_from_disk` and display its
# summary (features and row count) as the cell output.
data = load_from_disk(fpath)
data

Dataset({
    features: ['input_ids', 'cell_type', 'dataset', 'length', 'ignore', 'standardized_cell_type', 'broad_type', '__index_level_0__'],
    num_rows: 214715
})

In [4]:
# Convert the loaded dataset into a pandas DataFrame for the summaries below,
# then display its (rows, columns) shape.
df = data.to_pandas()
df.shape

(214715, 8)

In [10]:
# Tally the rows per standardized cell type (most frequent first) and print the
# tally as a LaTeX table with columns `cell_type` and `count`.
counts = (
    df['standardized_cell_type']
    .value_counts()
    .rename_axis('cell_type')
    .reset_index(name='count')
)
latex_table = counts.to_latex(index=False)
print(latex_table)

\begin{tabular}{lr}
\toprule
cell_type & count \\
\midrule
T Cell & 31324 \\
Monocyte & 24568 \\
Megakaryocyte-Erythroid Progenitor & 22542 \\
HSC & 20090 \\
Fibroblast & 15308 \\
B Cell & 13630 \\
Neutrophil & 11807 \\
Erythrocyte & 10635 \\
Multipotent Progenitor & 9394 \\
iHSC & 8574 \\
NK Cell & 8508 \\
Common Myeloid Progenitor & 7186 \\
Macrophage & 6861 \\
Endothelial Cell & 5494 \\
Granulocyte-Macrophage Progenitor & 4893 \\
Common Lymphoid Progenitor & 3640 \\
Smooth Muscle Cell & 3091 \\
Dendritic Cell & 2709 \\
Plasma Cell & 2051 \\
Multi-Lymphoid Progenitor & 928 \\
Granulocyte & 855 \\
NK T Cell & 518 \\
Mast Cell & 109 \\
\bottomrule
\end{tabular}



In [11]:
# Tally the rows per source dataset (most frequent first) and print the tally
# as a LaTeX table.
# The label column is named after what is being counted, so the table header
# reads `dataset` rather than `cell_type`.
counts = (
    df['dataset']
    .value_counts()
    .rename_axis('dataset')        # name the label column explicitly
    .reset_index(name='count')     # name the tally column explicitly
)
print(counts.to_latex(index=False))

\begin{tabular}{lr}
\toprule
cell_type & count \\
\midrule
TS_Blood & 49344 \\
weng_young2_all & 28948 \\
weng_young1_all_t2 & 25252 \\
TS_Fat & 18769 \\
weng_young1_all_t1 & 18420 \\
weng_old2_BMMC_HSPC & 17011 \\
TS_Vasculature & 14818 \\
weng_old1_BMMC_HSPC & 12781 \\
TS_Bone_Marrow & 11676 \\
iHSC & 8574 \\
pellin & 5419 \\
weng_young2_HSC & 3703 \\
\bottomrule
\end{tabular}



In [14]:
# For rows labelled 'HSC', count how many come from each source dataset.
is_hsc = df['standardized_cell_type'] == 'HSC'
df.loc[is_hsc, 'dataset'].value_counts()

dataset
weng_young2_all        4996
weng_young1_all_t2     4845
weng_young1_all_t1     4510
weng_young2_HSC        3243
pellin                 1282
weng_old2_BMMC_HSPC     934
weng_old1_BMMC_HSPC     280
Name: count, dtype: int64