In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import scanpy as sc
import anndata as an
from scipy.sparse import csr_matrix
import scipy.io
import h5py
from scipy.sparse.linalg import eigsh

# Load a GTF

In [2]:
# buld a gene look up table

fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/references/geneTable.csv"

columns = [
    'gene_id',
    'gene_name',
    'Feature',
    'gene_biotype',
    'Chromosome',
    'Start',
    'End',
]

gdf = pd.read_csv(fpath, usecols=columns)
gdf = gdf[gdf['Feature'] == 'gene']
gdf = gdf.drop_duplicates()
gdf = gdf[gdf['gene_name'].notna()]
print(f"{gdf.shape=}")
gdf.head()

  gdf = pd.read_csv(fpath, usecols=columns)


gdf.shape=(41407, 7)


Unnamed: 0,Chromosome,Feature,Start,End,gene_id,gene_name,gene_biotype
0,1,gene,1471764,1497848,ENSG00000160072,ATAD3B,protein_coding
111,1,gene,629061,629433,ENSG00000225972,MTND1P23,unprocessed_pseudogene
114,1,gene,634375,634922,ENSG00000198744,MTCO3P12,unprocessed_pseudogene
117,1,gene,182695,184174,ENSG00000279928,DDX11L17,unprocessed_pseudogene
129,1,gene,3069167,3438621,ENSG00000142611,PRDM16,protein_coding


# Load SCENIC TF list

In [3]:
filepath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/scenic_resources/500bp_up_100bp_down_B.csv"
sdf = pd.read_csv(filepath)
sdf = sdf.rename(columns={"Unnamed: 0": "gene_name"})
sdf.head()

scenic_transcription_factors = sdf.columns.to_list()
print(len(scenic_transcription_factors))

1606


# Load TRRUST

In [4]:
# add regulator information
fpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/TRRUST/trrust_rawdata.human.tsv"
rdf = pd.read_csv(fpath, sep="\t", header=None)
rdf.columns = ['source', 'target', 'class', 'evidence']
print(f"{rdf.shape=}")

class_map = {
    'Unknown' : 0, 
    'Activation' : 1,
    'Repression' : -1,
}

rdf['type'] = rdf['class'].map(class_map)

print(f"{rdf['source'].nunique()=}")
print(f"{rdf['target'].nunique()=}")

rdf.head()

rdf.shape=(9396, 4)
rdf['source'].nunique()=795
rdf['target'].nunique()=2492


Unnamed: 0,source,target,class,evidence,type
0,AATF,BAX,Repression,22909821,-1
1,AATF,CDKN1A,Unknown,17157788,0
2,AATF,KLK3,Unknown,23146908,0
3,AATF,MYC,Activation,20549547,1
4,AATF,TP53,Unknown,17157788,0


# Load gene index

In [5]:
fpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/HWG/gene_index.csv"
genes = pd.read_csv(fpath)

genes = genes.rename(columns={
    'Row' : 'gene_id',
    'GeneName' : 'gene_name',
    'Chromosome' : 'chrom',
    'Gene_start_bp' : 'start',
    'Gene_end_bp' : 'end',
    'Source' : 'source',
    'IsTF' : 'tf',
    'IsSelfConnected' : 'mr',
    'HWGclass' : 'gene_class',
})

genes['gene_name'] = genes['gene_name'].str.strip()

genes['is_scenic_tf'] = genes['gene_name'].isin(scenic_transcription_factors).astype(int)

# merge the metadata
genes = pd.merge(genes, gdf[['gene_id', 'gene_biotype']], 
                 how='left',
                 left_on='gene_id',
                 right_on='gene_id', 
                )

print(f"{genes.shape=}")
genes = genes.set_index('gene_name')
genes.head()

genes.shape=(18771, 11)


Unnamed: 0_level_0,gene_id,chrom,start,end,source,tf,mr,gene_class,is_scenic_tf,gene_biotype
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
OR4F5,ENSG00000186092,1,65419,71585,HGNC Symbol,0,0,g3,0,protein_coding
OR4F29,ENSG00000284733,1,450703,451697,HGNC Symbol,0,0,g3,0,protein_coding
OR4F16,ENSG00000284662,1,685679,686673,HGNC Symbol,0,0,g3,0,protein_coding
SAMD11,ENSG00000187634,1,923928,944581,HGNC Symbol,0,0,g3,0,protein_coding
NOC2L,ENSG00000188976,1,944203,959309,HGNC Symbol,0,0,g3,0,protein_coding


In [6]:
genes['gene_class'].value_counts()

gene_class
g3    17170
g2      916
g1      685
Name: count, dtype: int64

In [7]:
genes['is_scenic_tf'].value_counts()

is_scenic_tf
0    17203
1     1568
Name: count, dtype: int64

# load the HWG

In [8]:
hwg_path = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/HWG/hwg.csv"

X = pd.read_csv(hwg_path, low_memory=False)
X = X.rename(columns={
    'Row' : 'gene_name',
})

X = X.set_index('gene_name')
X = X.astype(int)
print(f"{X.shape=}")
X.head()

X.shape=(18771, 18771)


Unnamed: 0_level_0,OR4F5,OR4F29,OR4F16,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,ISG15,...,MT-CO2,MT-ATP8,MT-ATP6,MT-CO3,MT-ND3,MT-ND4L,MT-ND4,MT-ND5,MT-ND6,MT-CYB
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F29,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F16,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SAMD11,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NOC2L,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# test for symmetry
np.allclose(X.to_numpy(), X.to_numpy().T)

True

In [10]:
# compute quick summary stats
total_vals = X.to_numpy().sum()
sparsity = 100 - (X.to_numpy().sum() / X.to_numpy().size) * 100
print(f"{total_vals=}")
print(f"{sparsity=:.3f}")

total_vals=1092721
sparsity=99.690


# Build the HWG

In [11]:
hwg = an.AnnData(csr_matrix(X.to_numpy()))
hwg.var = genes.copy()
hwg.obs = genes.copy()

hwg.var_names = X.columns.copy()
hwg.obs_names = X.index.copy()

# add some metadata 
hwg.uns['TRRUST'] = rdf.copy()
hwg.uns['SCENIC+'] = sdf.copy()
hwg.uns['scenic_transcription_factors'] = scenic_transcription_factors
hwg.uns['transcription_factors'] = genes[(genes['tf'] == 1) & (genes.index.notna())].index.to_list()
hwg.uns['master_regulators'] = genes[(genes['mr'] == 1) & (genes.index.notna())].index.to_list()

# add some eignevalues
k = 10
which = "LM"
eigenvalues, eigenvectors = eigsh(hwg.X.asfptype(), k=k, which=which) 
hwg.uns['n_eigenvalues'] = k
hwg.uns['which_eigenvalues'] = which
hwg.uns['eigenvalues'] = eigenvalues
hwg.uns['eigenvectors'] = eigenvectors

out_path = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/HWG/HWG.scanpy.h5ad"
hwg.write(out_path)

hwg

AnnData object with n_obs × n_vars = 18771 × 18771
    obs: 'gene_id', 'chrom', 'start', 'end', 'source', 'tf', 'mr', 'gene_class', 'is_scenic_tf', 'gene_biotype'
    var: 'gene_id', 'chrom', 'start', 'end', 'source', 'tf', 'mr', 'gene_class', 'is_scenic_tf', 'gene_biotype'
    uns: 'TRRUST', 'SCENIC+', 'scenic_transcription_factors', 'transcription_factors', 'master_regulators', 'n_eigenvalues', 'which_eigenvalues', 'eigenvalues', 'eigenvectors'

In [12]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

# Archive

In [None]:
# fpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/HWG/gene_index.csv"
# genes = pd.read_csv(fpath)


# columns = [
#     'Stable ID',
#     'Transcription Factor',
# ]

# genes = pd.read_csv(fpath, usecols=columns)
# genes.columns = ['gene_id', 'tf']

# # merge the metadata
# genes = pd.merge(genes, gdf, 
#                  how='left',
#                  left_on='gene_id',
#                  right_on='gene_id', 
#                 )

# print(f"{genes.shape=}")
# genes.head()


# genes = genes.rename(columns={
#     'Chromosome' : 'chrom',
#     'Feature' : 'feature',
#     'Start' : 'start',
#     'End' : 'end',
# })
# genes.head()

# genes['gene_name'] = genes['gene_name'].str.strip()

# # typing
# genes['gene_name'] = genes['gene_name'].astype(str)
# genes['chrom'] = genes['chrom'].astype(str)
# genes['feature'] = genes['feature'].astype(str)
# genes['gene_biotype'] = genes['gene_biotype'].astype(str)

# genes = genes.set_index('gene_name')
# print(f"{genes.shape=}")
# genes.head()

In [None]:
# hwg_path = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/HWG/hwg.csv"

# X = pd.read_csv(hwg_path, header=None)
# print(f"{X.shape=}")

In [None]:
# X.head()

# Build the HWG

In [None]:
# hwg = an.AnnData(csr_matrix(X.to_numpy()))
# hwg.var = genes.copy()
# hwg.obs = genes.copy()

# hwg.var_names_make_unique()
# hwg.obs_names_make_unique()

# # add some metadata 
# hwg.uns['transcription_factors'] = genes[(genes['tf'] == 1) & (genes.index.notna())].index.to_list()
# hwg.uns['master_regulators'] = genes[(genes['mr'] == 1) & (genes.index.notna())].index.to_list()

# # add some eignevalues
# k = 10
# which = "LM"
# eigenvalues, eigenvectors = eigsh(hwg.X.asfptype(), k=k, which=which) 
# hwg.uns['n_eigenvalues'] = k
# hwg.uns['which_eigenvalues'] = which
# hwg.uns['eigenvalues'] = eigenvalues
# hwg.uns['eigenvectors'] = eigenvectors

# out_path = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/HWG/HWG.scanpy.h5ad"
# hwg.write(out_path)

# hwg

In [None]:
# np.allclose(hwg.X.todense(), hwg.X.todense().T)

In [None]:
# # compute quick summary stats
# total_vals = hwg.X.sum()
# sparsity = 100 - (hwg.X.sum() / hwg.X.todense().size) * 100
# print(f"{total_vals=}")
# print(f"{sparsity=:.3f}")

In [None]:
break

# Archive 2

In [None]:
# hwg_path = "/nfs/turbo/umms-indikar/shared/projects/HWG/HWG.mat"
# data = scipy.io.loadmat(hwg_path) 
# A = data['HWG'][0][0][1]

# # create an andata object
# hwg = an.AnnData(A)
# hwg.var = genes[['gene_name']].copy()
# hwg.obs = genes

# hwg.X = hwg.X.astype(bool).astype(int)
# hwg.var_names = hwg.var['gene_name'].astype(str).values
# hwg.obs_names = hwg.obs['gene_name'].astype(str).values
# hwg.var_names_make_unique()
# hwg.obs_names_make_unique()

# # annotate master regulators
# hwg.obs['self_loop'] = np.where(np.diag(hwg.X.todense()) > 0, True, False)
# hwg.obs['master_regulator'] = hwg.obs['self_loop'] & hwg.obs['transcription_factor']

# # a few aliases
# hwg.obs['MR'] = hwg.obs['master_regulator']
# hwg.obs['TF'] = hwg.obs['transcription_factor']

# # a few metadata columns
# hwg.obs['degree'] = hwg.to_df().sum(axis=1).values


# # a few unstructured lists
# hwg.uns['transcription_factors'] = list(hwg.obs[hwg.obs['TF']].index)
# hwg.uns['master_regulators'] = list(hwg.obs[hwg.obs['MR']].index)

# k = 25
# which = "LM"
# eigenvalues, eigenvectors = eigsh(hwg.X.asfptype(), k=k, which=which) 
# hwg.uns['n_eigenvalues'] = k
# hwg.uns['which_eigenvalues'] = which
# hwg.uns['eigenvalues'] = eigenvalues
# hwg.uns['eigenvectors'] = eigenvectors
        
# out_path = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/HWG/HWG.scanpy.h5ad"
# # hwg.write(out_path)
# hwg

In [None]:
# hwg.obs['master_regulator'].value_counts()

In [None]:
# hwg.obs['transcription_factor'].value_counts()

In [None]:
# np.allclose(hwg.X.todense(), hwg.X.todense().T)

In [None]:
# hwg.X.max()

In [None]:
# np.triu(hwg.X.todense()).sum()

In [None]:
# np.tril(hwg.X.todense()).sum()

In [None]:
break

In [None]:
# print(hwg.X[11050, 17134])
# print(hwg.X[17134, 11050])

In [None]:
# row, col = hwg.X.nonzero()
# idx_pairs = list(zip(row, col))
# idx_pairs_transposed = list(zip(col, row))  # Transpose indices

# asymmetric_pairs = set(idx_pairs) - set(idx_pairs_transposed)
# print(asymmetric_pairs)  # {(0, 2)}  since A[0, 2] != A[2, 0]