In [1]:
import pandas as pd
import os
import sys
import anndata as an
from pathlib import Path

In [2]:
dpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/pellin_2019/raw_files/"

def process_pellin_data(fpath):
    """
    Reads a tab-separated CSV file, processes it, and returns an AnnData object.

    Args:
        fpath (str): Path to the CSV file.

    Returns:
        anndata.AnnData: The processed AnnData object.
    """

    df = pd.read_csv(fpath, sep='\t')

    # Transpose and extract headers efficiently
    df = df.set_index(df.columns[0]).T.reset_index(names='Barcode')

    # Create 'barcode_library' and set index
    dataset_name = Path(fpath).stem.split("_")[1].split(".")[0]
    df['dataset'] = dataset_name
    
    df['cell_id'] =  df['dataset'] + "_" + df['Barcode'] + "_" + df['Library'].astype(str)
    df = df.set_index('cell_id')

    # Build observation annotation with dataset extraction
    obs = df[['Barcode', 'Library', 'dataset']].copy()

    # Drop unnecessary columns
    df = df.drop(columns=['Barcode', 'Library', 'dataset'])

    # Build variable annotation
    var = pd.DataFrame({
        'var_name': df.columns,
        'gene_name': df.columns,
    }).set_index('var_name')

    # Create AnnData object
    adata = an.AnnData(X=df.to_numpy(), obs=obs, var=var)

    return adata
    

adata_list = []

for f in os.listdir(dpath):
    fpath = f"{dpath}{f}"
    adata = process_pellin_data(fpath)
    print(adata)
    adata_list.append(adata.copy())

len(adata_list)

AnnData object with n_obs × n_vars = 592 × 25464
    obs: 'Barcode', 'Library', 'dataset'
    var: 'gene_name'
AnnData object with n_obs × n_vars = 4266 × 24719
    obs: 'Barcode', 'Library', 'dataset'
    var: 'gene_name'
AnnData object with n_obs × n_vars = 1282 × 25464
    obs: 'Barcode', 'Library', 'dataset'
    var: 'gene_name'
AnnData object with n_obs × n_vars = 1576 × 25464
    obs: 'Barcode', 'Library', 'dataset'
    var: 'gene_name'
AnnData object with n_obs × n_vars = 1211 × 25464
    obs: 'Barcode', 'Library', 'dataset'
    var: 'gene_name'
AnnData object with n_obs × n_vars = 6343 × 24719
    obs: 'Barcode', 'Library', 'dataset'
    var: 'gene_name'
AnnData object with n_obs × n_vars = 4434 × 24719
    obs: 'Barcode', 'Library', 'dataset'
    var: 'gene_name'
AnnData object with n_obs × n_vars = 215 × 25464
    obs: 'Barcode', 'Library', 'dataset'
    var: 'gene_name'
AnnData object with n_obs × n_vars = 1012 × 25464
    obs: 'Barcode', 'Library', 'dataset'
    var: 'gene_

11

In [3]:
main_data = an.concat(adata_list)
main_data

AnnData object with n_obs × n_vars = 21412 × 20582
    obs: 'Barcode', 'Library', 'dataset'

In [6]:
# write the output
outpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/pellin_2019/pellin.anndata.h5ad"
main_data.write(outpath)
main_data

AnnData object with n_obs × n_vars = 21412 × 20582
    obs: 'Barcode', 'Library', 'dataset'

In [7]:
main_data.obs['dataset'].value_counts()

LinNegCD34PosCD164Pos     6343
LinNegCD34NegCD164high    4434
LinNegCD34lowCD164high    4266
CMP                       1576
HSC                       1282
MEP                       1211
GMP                       1012
PreBNK                     592
LinNegCD34NegCD164low      358
MPP                        215
MLP                        123
Name: dataset, dtype: int64