In [1]:
import pandas as pd
import numpy as np
import os
import sys

import anndata as ad
import scanpy as sc

In [2]:
# Load the gene annotation table and reduce it to one row per
# protein-coding gene name.
fpath = '/scratch/indikar_root/indikar1/cstansbu/sc-iHSC/references/geneTable.csv'
gf = pd.read_csv(fpath, low_memory=False)
print(f"{gf.shape=}")

# subset just the major genes: gene-level records that are protein coding
is_coding_gene = (gf['Feature'] == 'gene') & (gf['gene_biotype'] == 'protein_coding')
gf = gf[is_coding_gene].reset_index(drop=True)
print(f"{gf.shape=}")

# annotation columns carried forward
keep_cols = ['gene_id', 'gene_name', 'Chromosome', 'Start', 'End', 'Strand']

# drop rows without a gene name, then keep the first row seen for each name
gf = (
    gf.loc[:, keep_cols]
    .dropna(subset=['gene_name'])
    .drop_duplicates(subset='gene_name')
)
gf.head()

gf.shape=(3371244, 26)
gf.shape=(20023, 26)


Unnamed: 0,gene_id,gene_name,Chromosome,Start,End,Strand
0,ENSG00000160072,ATAD3B,1,1471764,1497848,+
1,ENSG00000142611,PRDM16,1,3069167,3438621,+
2,ENSG00000157933,SKI,1,2228318,2310213,+
3,ENSG00000142655,PEX14,1,10472287,10630758,+
4,ENSG00000149527,PLCH2,1,2425979,2505532,+


In [3]:
# Load the raw count table and restrict it, and the gene table, to the
# gene ids they have in common.
# NOTE: this cell rebinds `gf` with 'gene_id' as its index, so re-running it
# without first re-running the gene-table cell fails on gf['gene_id'].
fpath = '/scratch/indikar_root/indikar1/cstansbu/sc-iHSC/counts/counts.txt'

df = pd.read_csv(fpath, sep='\t', low_memory=False)
print(f"{df.shape=}")

# name the two unnamed leading columns, then discard the chromosome column
df = (
    df.rename(columns={'Unnamed: 0' : 'gene_id', 'Unnamed: 1' : 'chromosome'})
    .drop(columns='chromosome')
)

# filter out non-PT genes
in_gene_table = df['gene_id'].isin(gf['gene_id'])
df = df.loc[in_gene_table].set_index('gene_id')

# make sure that the GTF has the same set
# (this matches the set of gene ids only; the row order of gf is not changed here)
in_counts = gf['gene_id'].isin(df.index)
gf = gf.loc[in_counts].set_index('gene_id')

print(f"{df.shape=} {gf.shape=}")

##TRANSPOSE
# after this, rows are the former count-table columns and columns are gene ids
df = df.T

df.head()

df.shape=(61865, 8526)
df.shape=(19393, 8524) gf.shape=(19393, 5)


gene_id,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSG00000288825,ENSG00000288859,ENSG00000288864,ENSG00000288867,ENSG00000288920,ENSG00000289549,ENSG00000289604,ENSG00000289716,ENSG00000289721,ENSG00000289746
AAACCCAAGGTTACCT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCCAAGTTGAAGT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCCAAGTTGTCGT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCCACAGAAGCGT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCCACAGGAGGTT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# create the anndata object

# Reorder the gene annotations so their rows follow the columns of the count
# matrix. Assigning a DataFrame to `adata.var` takes over that frame's index as
# `var_names` without reordering, so an unaligned `gf` would label the count
# columns with the wrong gene ids / names.
# `.loc` raises KeyError if a count column has no annotation row.
gene_annotations = gf.loc[df.columns]
if len(gene_annotations) != df.shape[1]:
    # more rows than columns means gf carries repeated gene_id labels
    raise ValueError(
        f"gene annotations do not map 1:1 onto count columns: "
        f"{len(gene_annotations)} rows for {df.shape[1]} genes"
    )

adata = ad.AnnData(df.to_numpy(), dtype='int64')
adata.obs_names = df.index
adata.var_names = df.columns
adata.var = gene_annotations

# per-row sum of the count matrix, indexed by the row labels of df
# NOTE(review): this is the summed count per row, not the number of genes
# detected; the column name 'n_genes' is kept so downstream code keeps working.
obs = df.sum(axis=1).reset_index(drop=False)
obs.columns = ['cell_id', 'n_genes']
obs = obs.set_index('cell_id')

adata.obs = obs

adata

AnnData object with n_obs × n_vars = 8524 × 19393
    obs: 'n_genes'
    var: 'gene_name', 'Chromosome', 'Start', 'End', 'Strand'

In [5]:
# preview the first five rows of the per-observation annotations
adata.obs.head(n=5)

Unnamed: 0_level_0,n_genes
cell_id,Unnamed: 1_level_1
AAACCCAAGGTTACCT,352
AAACCCAAGTTGAAGT,276
AAACCCAAGTTGTCGT,225
AAACCCACAGAAGCGT,194
AAACCCACAGGAGGTT,787
