In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import scanpy as sc
import anndata as an
import scipy.io
import h5py

In [2]:
fpath = "/nfs/turbo/umms-indikar/shared/projects/HWG/gene_index.csv"
genes = pd.read_csv(fpath)
print(f"{genes.shape=}")
genes.columns = ['gene_id', 'gene_name', 'transcription_factor']
genes['transcription_factor'] = genes['transcription_factor'].map({1 : True, 0 : False})

for c in genes.columns:
    genes[c] = genes[c].astype(str)

genes['gene_name'] = genes['gene_name'].str.strip()
genes = genes.set_index('gene_id')
genes.head()

genes.shape=(19198, 3)


Unnamed: 0_level_0,gene_name,transcription_factor
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000000003,TSPAN6,False
ENSG00000000005,TNMD,False
ENSG00000000419,DPM1,False
ENSG00000000457,SCYL3,False
ENSG00000000460,C1orf112,False


In [3]:
genes.dtypes

gene_name               object
transcription_factor    object
dtype: object

In [4]:
hwg_path = "/nfs/turbo/umms-indikar/shared/projects/HWG/HWG.mat"
data = scipy.io.loadmat(hwg_path) 
A = data['HWG'][0][0][1]

# create an andata object
hwg = an.AnnData(A)
hwg.var = genes
hwg.obs = genes

hwg.X = hwg.X.astype(bool).astype(int)
hwg.var_names = hwg.var['gene_name'].astype(str).values
hwg.obs_names = hwg.obs['gene_name'].astype(str).values
hwg.var_names_make_unique()
hwg.obs_names_make_unique()

# annotate master regulators
hwg.obs['self_loop'] = np.where(np.diag(hwg.X.todense()) > 0, True, False)
hwg.var['self_loop'] = np.where(np.diag(hwg.X.todense()) > 0, True, False)

hwg.obs['master_regulator'] = hwg.obs['self_loop'] & hwg.obs['transcription_factor']
hwg.var['master_regulator'] = hwg.var['self_loop'] & hwg.var['transcription_factor']

# a few aliases
hwg.obs['MR'] = hwg.obs['master_regulator']
hwg.var['MR'] = hwg.var['master_regulator']

hwg.obs['TF'] = hwg.obs['transcription_factor']
hwg.var['TF'] = hwg.var['transcription_factor']
        
out_path = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/HWG/HWG.scanpy.h5ad"
hwg.write(out_path)
hwg

AnnData object with n_obs × n_vars = 19198 × 19198
    obs: 'gene_name', 'transcription_factor', 'self_loop', 'master_regulator', 'MR', 'TF'
    var: 'gene_name', 'transcription_factor', 'self_loop', 'master_regulator', 'MR', 'TF'

In [5]:
hwg.obs['master_regulator'].value_counts()

master_regulator
False    18424
True       774
Name: count, dtype: int64

In [6]:
hwg.obs['transcription_factor'].value_counts()

transcription_factor
False    17579
True      1619
Name: count, dtype: int64