In [1]:
import numpy as np
import pandas as pd
import gget
import glob
import networkx as nx
import os
import seaborn as sns
import gget
from textwrap import fill
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
import scipy
from scipy.sparse import csr_matrix 

# locals
import utils as ut
import plotting as plt2

sc.settings.verbosity = 3  # scanpy logging level; per the scanpy docs, 3 = errors, warnings, info and hints

In [2]:
fpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/gomes_2018/GSE51025_HDF_Day2_15_25_UCB_RawCounts.txt.gz"

# Read the tab-separated raw count table, label the unnamed first column as
# 'gene_name', index by it, and transpose so that rows are samples and
# columns are genes.
df = (
    pd.read_csv(fpath, sep='\t')
    .rename(columns={'Unnamed: 0': 'gene_name'})
    .set_index('gene_name')
    .T
)
print(f"{df.shape=}")

df.head()

df.shape=(286, 56269)


gene_name,DDX11L1,WASH7P,MIR6859-1,MIR1302-2,FAM138A,OR4G4P,OR4G11P,OR4F5,RP11-34P13.7,RP11-34P13.8,...,MT-ND4,MT-TH,MT-TS2,MT-TL2,MT-ND5,MT-ND6,MT-TE,MT-CYB,MT-TT,MT-TP
HDF_1,0,1,0,0,0,0,0,0,0,0,...,9586,0,2,0,1532,491,2,10708,1,113
HDF_10,0,0,0,0,0,0,0,0,0,0,...,22813,0,0,0,4124,2898,6,19987,0,55
HDF_11,0,0,0,0,0,0,0,0,0,0,...,16820,0,5,0,3967,1854,19,28461,5,157
HDF_12,0,0,0,0,0,0,0,0,0,0,...,53329,4,46,0,7796,2488,69,46653,3,113
HDF_13,0,0,0,0,0,0,0,0,0,0,...,17070,1,5,0,5651,1538,17,18673,1,191


In [3]:
def df_to_anndata(df):
    """
    Wraps the values of a pandas DataFrame in an AnnData object.

    The frame's values are stored as a CSR sparse matrix. Row labels
    (``df.index``) become the observation names and column labels
    (``df.columns``) become the variable names.

    Args:
        df: The pandas DataFrame to convert (rows = observations,
            columns = variables).

    Returns:
        An AnnData object whose X is a CSR matrix of ``df.values``.
    """
    # Sparsify first so the AnnData object is constructed around the CSR matrix.
    sparse_values = csr_matrix(df.values)
    converted = an.AnnData(sparse_values)

    # Carry the frame's axis labels over to the AnnData axes.
    converted.var_names = df.columns
    converted.obs_names = df.index

    return converted


# Build the AnnData object from the samples x genes count table loaded above.
adata = df_to_anndata(df)
adata

AnnData object with n_obs × n_vars = 286 × 56269

In [4]:
# Sample names are of the form '<cell_type>_<cell_num>' (e.g. 'HDF_1').
# Split each name once and reuse the pieces for both columns; cell_num is
# kept as the string token taken from the name.
name_parts = [sample_name.split("_") for sample_name in df.index]

obs = pd.DataFrame(
    {
        'cell_type': [parts[0] for parts in name_parts],
        'cell_num': [parts[1] for parts in name_parts],
    },
    index=df.index,
)
print(f"{obs.shape=}")
obs.head()

obs.shape=(286, 2)


Unnamed: 0,cell_type,cell_num
HDF_1,HDF,1
HDF_10,HDF,10
HDF_11,HDF,11
HDF_12,HDF,12
HDF_13,HDF,13


In [5]:
# Gene-level annotation table: no columns yet, only the gene names as index.
var = pd.DataFrame(index=df.columns)

print(f"{var.shape=}")
var.head()

var.shape=(56269, 0)


DDX11L1
WASH7P
MIR6859-1
MIR1302-2
FAM138A


In [6]:
# Attach the per-gene (var) and per-sample (obs) annotation tables.
adata.var = var
adata.obs = obs

adata

AnnData object with n_obs × n_vars = 286 × 56269
    obs: 'cell_type', 'cell_num'

# Assign layers

In [7]:
# Store a separate CSR copy of X under the 'raw_counts' layer.
adata.layers['raw_counts'] = csr_matrix(adata.X, copy=True)

adata

AnnData object with n_obs × n_vars = 286 × 56269
    obs: 'cell_type', 'cell_num'
    layers: 'raw_counts'

In [8]:
out_path = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/gomes_2018/gomes.h5ad"

# Persist the annotated object to disk in .h5ad format.
adata.write_h5ad(out_path)

adata


... storing 'cell_type' as categorical
... storing 'cell_num' as categorical


AnnData object with n_obs × n_vars = 286 × 56269
    obs: 'cell_type', 'cell_num'
    layers: 'raw_counts'