In [1]:
import os
import anndata as an
import scanpy as sc
import pandas as pd
from scipy import sparse

In [2]:
# Gzipped UMI count CSV for GSE118258.
count_path = "/nfs/turbo/umms-indikar/shared/projects/geneformer/data/xing_2020_reprogramming_data/GSE118258_UMI.csv.gz"

# Read the full table into memory; the gene identifiers arrive as an
# ordinary column named 'Unnamed: 0' (no index column is requested here).
X = pd.read_csv(filepath_or_buffer=count_path)
# Preview the first five rows.
X.head(n=5)

Unnamed: 0.1,Unnamed: 0,AAACCTGAGGAGTTGC-1,AAACCTGAGTACTTGC-1,AAACCTGAGTATGACA-1,AAACCTGCACCATGTA-1,AAACCTGCACCTTGTC-1,AAACCTGCACGGTGTC-1,AAACCTGCAGGACGTA-1,AAACCTGCATTGGTAC-1,AAACCTGTCAAAGACA-1,...,TTTCCTCTCGCACTCT-6,TTTCCTCTCTCCTATA-6,TTTGCGCCACCAGTTA-6,TTTGCGCGTCTGATTG-6,TTTGGTTCAATAACGA-6,TTTGGTTGTTACGACT-6,TTTGGTTTCTGTCTCG-6,TTTGGTTTCTTCGGTC-6,TTTGTCACAATGTAAG-6,TTTGTCATCCAAGTAC-6
0,ENSG00000243485,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ENSG00000237613,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ENSG00000186092,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ENSG00000238009,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ENSG00000239945,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# update columns and rows
# Turn the raw table (one row per gene, one column per cell barcode) into a
# frame with one row per cell and one column per gene id.
#
# `rename` and `set_index` each return a new DataFrame, so `X` is left
# untouched without an explicit `X.copy()`; dropping that copy avoids holding
# an extra full-size duplicate of the dense count table in memory.
Xt = (
    X.rename(columns={'Unnamed: 0': 'gene_id'})  # label the gene-id column
    .set_index('gene_id')                        # gene ids become the row index
    .T                                           # transpose: rows = cells, columns = genes
)
Xt.shape

(32138, 32738)

In [4]:
# build anndata
# Convert the dense cell-by-gene table to CSR first (sparse storage for
# efficiency), then wrap it in an AnnData container.
counts_csr = sparse.csr_matrix(Xt.to_numpy())
adata = an.AnnData(counts_csr)

# Label both axes from the transposed frame: rows carry the cell barcodes,
# columns carry the gene ids.
adata.obs_names = Xt.index
adata.var_names = Xt.columns
adata

AnnData object with n_obs × n_vars = 32138 × 32738

In [5]:
# load the metadata
obs_path = "/nfs/turbo/umms-indikar/shared/projects/geneformer/data/xing_2020_reprogramming_data/GSE118258_Annotation.txt.gz"
obs = pd.read_csv(obs_path, sep='\t')
# Keep 'Cell ID' as a regular column and also use a copy of it as the index.
obs['cell_id'] = obs['Cell ID'].copy()
obs = obs.set_index('cell_id')

# Assigning a DataFrame to `adata.obs` is positional: row i of the frame is
# attached to row i of the count matrix and the frame's index replaces
# `adata.obs_names`. Nothing guarantees the annotation file lists cells in the
# same order as the count table, so validate and align on the barcode first;
# otherwise a differently ordered file would silently mislabel every cell.
if not obs.index.is_unique:
    raise ValueError(
        "duplicate 'Cell ID' values in the annotation file; "
        "cannot align metadata to cells unambiguously"
    )

missing_cells = adata.obs_names.difference(obs.index)
if len(missing_cells) > 0:
    raise ValueError(
        f"{len(missing_cells)} cells in the count matrix have no annotation row, "
        f"e.g. {list(missing_cells[:5])}"
    )

# Reorder the annotation rows to the matrix's cell order (annotation rows for
# cells not present in the matrix are dropped) and keep the 'cell_id' index name.
adata.obs = obs.loc[adata.obs_names].rename_axis('cell_id')
adata

AnnData object with n_obs × n_vars = 32138 × 32738
    obs: 'Cell ID', 'Time-point'

In [6]:
# load the gene names

gene_path = "/scratch/indikar_root/indikar1/shared_data/geneformer/resources/token_mapping.csv"
gene_map = pd.read_csv(gene_path)

# One row per gene id, in the same order as the columns of the count table.
var = pd.DataFrame({'gene_id' : Xt.columns})

# Left-join the annotation columns onto the gene ids. A left merge keeps the
# left-hand order, and `validate='many_to_one'` makes pandas raise immediately
# if the mapping file lists a gene_id more than once; without it, duplicate
# mapping rows would multiply rows here and break the gene order/length.
var = pd.merge(
    var,
    gene_map[['gene_id', 'gene_name', 'gene_biotype', 'scenic_tf']],
    how='left',
    on='gene_id',
    validate='many_to_one',
)

# Keep the id both as a regular column ('ensembl_id') and as the index.
var['ensembl_id'] = var['gene_id'].copy()
var = var.set_index('gene_id')
# Cast every column to str.
# NOTE(review): genes with no match in the mapping file come out of the left
# merge as NaN, which this cast turns into the literal string 'nan'.
var = var.astype(str)

# `adata.var = ...` is positional, so confirm the annotation rows are in the
# same gene order as the matrix columns before attaching them.
assert var.index.equals(adata.var_names), "gene annotations are not aligned with adata.var_names"

adata.var = var
adata

AnnData object with n_obs × n_vars = 32138 × 32738
    obs: 'Cell ID', 'Time-point'
    var: 'gene_name', 'gene_biotype', 'scenic_tf', 'ensembl_id'

In [7]:
# Destination for the assembled AnnData object.
outpath = "/nfs/turbo/umms-indikar/shared/projects/geneformer/data/xing_2020_reprogramming_data/fib_reprogramming.h5ad"

# Write the object to disk, then print a completion marker.
adata.write(outpath)
print("done")

done
