In [13]:
import os
import seaborn as sns
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import anndata as an
import scanpy as sc

In [42]:
# Load the gene annotation table and build lookups between gene names and
# gene IDs.
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/references/geneTable.csv"
gdf = pd.read_csv(fpath, low_memory=False)

# Keep one row per (gene_id, gene_name) pair and drop rows without a gene name.
gdf = gdf[['gene_id', 'gene_name']].drop_duplicates()
gdf = gdf[gdf['gene_name'].notna()]

# gene_name -> gene_id. If a name appears on several rows, the last row wins.
name2id = dict(zip(gdf['gene_name'].values, gdf['gene_id'].values))
# gene_id -> gene_name. The keys must be the IDs here; zipping in the same
# order as name2id would yield a second name -> id mapping instead.
id2name = dict(zip(gdf['gene_id'].values, gdf['gene_name'].values))

gdf.head()

Unnamed: 0,gene_id,gene_name
0,ENSG00000160072,ATAD3B
111,ENSG00000225972,MTND1P23
114,ENSG00000198744,MTCO3P12
117,ENSG00000279928,DDX11L17
129,ENSG00000142611,PRDM16


In [56]:
# Read raw expression tables from `dpath` and wrap each one in an AnnData
# object whose variables are indexed by the gene ID looked up in `name2id`.
dpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/zheng_2019/raw_data/"

adata_list = []

# os.listdir returns entries in arbitrary order; sorting makes the order in
# which files are processed (and appended) reproducible.
for f in sorted(os.listdir(dpath)):

    # Only this single file is processed; every other entry is skipped.
    if f != 'GSM3993420_CS10_Body_rawdata.txt.gz':
        continue

    fpath = os.path.join(dpath, f)

    label = f.replace(".txt.gz", "")
    print(f"Working... {label}")
    # Space-delimited table, transposed so the file's row labels become columns.
    # NOTE(review): presumably the file is genes x cells, making this
    # cells x genes -- confirm against the raw data.
    df = pd.read_csv(fpath, sep=' ').T

    # Keep only columns whose gene name is present in the gene table.
    # name2id is keyed by exactly those names, so a dict lookup replaces a
    # full array scan per column.
    keep_cols = [x for x in df.columns if x in name2id]
    df = df[keep_cols]

    # Per-gene annotation: gene name plus its mapped ID, indexed by the ID.
    var = pd.DataFrame(df.columns, columns=['gene_name'])
    var['ensembl_id'] = var['gene_name'].map(name2id)
    var = var.set_index('ensembl_id')

    # Relabel the expression columns with the same IDs so they line up with
    # the index of `var`.
    df.columns = [name2id[x] for x in df.columns]

    adata = an.AnnData(df)
    adata.var = var
    # Tag every observation with the file it came from.
    adata.obs['data_label'] = label

    adata_list.append(adata)



Working... GSM3993420_CS10_Body_rawdata


In [58]:
# Combine the per-file AnnData objects into a single object.
# join="inner" is anndata's default and is only spelled out here.
main_data = an.concat(adata_list, join="inner")
main_data

AnnData object with n_obs × n_vars = 6666 × 23575
    obs: 'data_label'

In [66]:
# Split every observation name on "_" and count how often each leading
# piece occurs.
name_parts = main_data.obs.index.str.split("_").tolist()
d = pd.DataFrame(name_parts)
d[0].value_counts()

0
CS10    6666
Name: count, dtype: int64

In [53]:
# Preview the first five rows of the observation table.
# NOTE(review): this cell's execution count (53) precedes the loading cell's
# (56), and its saved output shows a data_label from a different file --
# re-run to refresh the output.
main_data.obs.head(n=5)

Unnamed: 0,data_label
S.20190419.EC_FKDL190735971.1a_sc1,GSM3993425_CS14_DA_UMI_raw
S.20190419.EC_FKDL190735971.1a_sc2,GSM3993425_CS14_DA_UMI_raw
S.20190419.EC_FKDL190735971.1a_sc3,GSM3993425_CS14_DA_UMI_raw
S.20190419.EC_FKDL190735971.1a_sc4,GSM3993425_CS14_DA_UMI_raw
S.20190419.EC_FKDL190735971.1a_sc5,GSM3993425_CS14_DA_UMI_raw


In [55]:
# Inspect how the observation names break apart on a literal "." character.
main_data.obs.index.str.split(".", regex=False)

Index([ ['S', '20190419', 'EC_FKDL190735971', '1a_sc1'],
        ['S', '20190419', 'EC_FKDL190735971', '1a_sc2'],
        ['S', '20190419', 'EC_FKDL190735971', '1a_sc3'],
        ['S', '20190419', 'EC_FKDL190735971', '1a_sc4'],
        ['S', '20190419', 'EC_FKDL190735971', '1a_sc5'],
        ['S', '20190419', 'EC_FKDL190735971', '1a_sc6'],
        ['S', '20190419', 'EC_FKDL190735971', '1a_sc7'],
        ['S', '20190419', 'EC_FKDL190735971', '1a_sc8'],
        ['S', '20190419', 'EC_FKDL190735971', '1a_sc9'],
       ['S', '20190419', 'EC_FKDL190735971', '1a_sc10'],
       ...
                           ['CS13_DA_TTGCCGTAGGAGTCTG'],
                           ['CS13_DA_TTGCCGTTCTGCGACG'],
                           ['CS13_DA_TTGGAACGTAGAGCTG'],
                           ['CS13_DA_TTGGAACGTTTAGCTG'],
                           ['CS13_DA_TTGGCAAGTAATCGTC'],
                           ['CS13_DA_TTGGCAATCTTGCCGT'],
                           ['CS13_DA_TTTATGCGTCCAAGTT'],
                    