In [1]:
import os
import sys
import pandas as pd
import numpy as np

import anndata as ad
import scanpy as sc

# PangloaDB

In [2]:
pangloa_path = "/nfs/turbo/umms-indikar/shared/projects/spatial_transcriptomics/data/panglaodb/pandb.tsv.gz"
pdf = pd.read_csv(pangloa_path, sep="\t")

pdf = pdf[pdf['species'].str.contains('Hs')]

cell_types = [
    'Fibroblasts',
    'Hematopoietic stem cells',
    "Erythroblasts",
    "Megakaryocytes",
    "Myeloid-derived suppressor cells",
    "B cells memory",
    "B cells naive",
    "Plasma cells",
    "Monocytes",
    "Macrophages",
    "Dendritic cells",
    "Plasmacytoid dendritic cells",
    "Neutrophils",
    "Eosinophils",
    "Basophils",
    "Reticulocytes",
    "Stromal cells",
    "Osteoblasts",
    "Endothelial cells",
]

columns = [
    'official gene symbol',
    'cell type',
    'ubiquitousness index',
    'gene type',
    'organ',
    'sensitivity_human',
    'specificity_human',
]

pdf = pdf[pdf['cell type'].isin(cell_types)]
pdf = pdf[columns]

pdf.columns = [
    'gene_name',
    'cell_type',
    'ubiquitousness_index',
    'gene_type',
    'organ',
    'sensitivity_human',
    'specificity_human'
]


output_path = "../config/gene_annotations/panglaodb.csv"
pdf.to_csv(output_path, index=False)
pdf.head()

Unnamed: 0,gene_name,cell_type,ubiquitousness_index,gene_type,organ,sensitivity_human,specificity_human
531,CD38,B cells memory,0.016,protein-coding gene,Immune system,0.0,0.014098
532,CD80,B cells memory,0.0,protein-coding gene,Immune system,0.0,0.0
533,CD84,B cells memory,0.023,protein-coding gene,Immune system,0.0,0.009399
534,CD86,B cells memory,0.026,protein-coding gene,Immune system,0.0,0.012218
535,NT5E,B cells memory,0.005,protein-coding gene,Immune system,0.0,0.007206


# SCENIC transcription factors

In [3]:
fpath = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/scenic/tf_lists/allTFs_hg38.txt"

genes = [x.strip() for x in open(fpath)]
print(f"{len(genes)=}")

df = pd.DataFrame({'gene_name' : genes})
df['gene_type'] = 'transcription_factor'

output_path = "../config/gene_annotations/scenic_transcription_factors.csv"
df.to_csv(output_path, index=False)
df.head()

len(genes)=1892


Unnamed: 0,gene_name,gene_type
0,ZNF354C,transcription_factor
1,KLF12,transcription_factor
2,ZNF143,transcription_factor
3,ZIC2,transcription_factor
4,ZNF274,transcription_factor


# Gene Ontology

In [4]:
dpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/GO_annotations/"

hdf = []

for f in os.listdir(dpath):
    if 'neuronal' in f:
        continue
        
    fpath = f"{dpath}{f}"
    tmp = pd.read_csv(fpath, sep='\t')
    hdf.append(tmp)

hdf = pd.concat(hdf)
print(hdf['GO NAME'].value_counts())

columns = [
    'SYMBOL',
    'GO NAME',
]

hdf = hdf[columns]
hdf.columns = [
    'gene_name',
    'go_biological_process',
    
]

output_path = "../config/gene_annotations/go_annotations.csv"
hdf.to_csv(output_path, index=False)
hdf.head()

hematopoietic stem cell proliferation      32
hematopoietic stem cell homeostasis        28
hematopoietic stem cell differentiation    25
Name: GO NAME, dtype: int64


Unnamed: 0,gene_name,go_biological_process
0,LMBR1L,hematopoietic stem cell differentiation
1,CHD2,hematopoietic stem cell differentiation
2,UFL1,hematopoietic stem cell differentiation
3,UFL1,hematopoietic stem cell differentiation
4,TP53,hematopoietic stem cell differentiation


# Tabula Sapeins differentially expressed genes

In [5]:
fpath = "/nfs/turbo/umms-indikar/shared/projects/adaptive_sampling/data/tabula_sapiens_filtered.h5ad"
adata = sc.read_h5ad(fpath)

adata

AnnData object with n_obs × n_vars = 27346 × 58870
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender', 'celltype', 'record_id', 'cell_id'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: '_scvi', '_training_mode', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_colors', 'donor_method_colors', 'hvg', 'log1p', 'method_colors', 'neighbors', 'organ_tissue_colors', 'sex_colors', 'tissue_colors', 'umap'
    obsm: 'X_pca', 'X_scvi', 'X_scvi_umap', 'X_umap'
    layers: 'decontXcounts', 'raw_counts'
    obsp: 'connectivities', 'distances'

In [6]:
adata.obs['cell_ontology_class'].value_counts()

fibroblast                 26586
hematopoietic stem cell      760
Name: cell_ontology_class, dtype: int64

In [7]:
key = 'cell_ontology_class'

sc.tl.rank_genes_groups(adata, 
                        groupby=key,
                        method='wilcoxon',
                        key_added=key,
                        pts=True,
                        corr_method='benjamini-hochberg')

deg = sc.get.rank_genes_groups_df(adata, 
                                  group=None,
                                  key=key)

deg.head()



  foldchanges = (self.expm1_func(mean_group) + 1e-9) / (
  self.expm1_func(mean_rest) + 1e-9
  foldchanges = (self.expm1_func(mean_group) + 1e-9) / (
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  foldchanges = (self.expm1_func(mean_group) + 1e-9) / (
  self.expm1_func(mean_rest) + 1e-9
  foldchanges = (self.expm1_func(mean_group) + 1e-9) / (


Unnamed: 0,group,names,scores,logfoldchanges,pvals,pvals_adj,pct_nz_group,pct_nz_reference
0,fibroblast,DCN,46.815006,874.562866,0.0,0.0,0.994396,0.003947
1,fibroblast,MGP,46.606716,445.559448,0.0,0.0,0.990032,0.002632
2,fibroblast,C1S,46.314835,182.270279,0.0,0.0,0.985406,0.001316
3,fibroblast,COL6A2,46.147636,50.370811,0.0,0.0,0.982322,0.046053
4,fibroblast,C1R,46.06068,153.544693,0.0,0.0,0.979576,0.022368


In [8]:
def filter_dataframe(df, lfc_thresh=2, pval_adj_thresh=0.05, pct_nz_thresh=0.5):
  """
  Filters a DataFrame based on logfoldchanges, pvals_adj, and pct_nz_group.

  Args:
      df (pandas.DataFrame): The DataFrame to filter.
      lfc_thresh (float, optional): Minimum absolute value of logfoldchange. Defaults to 2.
      pval_adj_thresh (float, optional): Maximum adjusted p-value. Defaults to 0.05.
      pct_nz_thresh (float, optional): Minimum proportion of non-zero values in a group. Defaults to 0.5.

  Returns:
      pandas.DataFrame: The filtered DataFrame.
  """

  return df[
      (df['logfoldchanges'].abs() >= lfc_thresh) &
      (df['pvals_adj'] <= pval_adj_thresh) &
      (df['pct_nz_group'] >= pct_nz_thresh)
  ]

sig = filter_dataframe(deg, lfc_thresh=2, pval_adj_thresh=0.05, pct_nz_thresh=0.5)
print(f"{sig.shape=}")

columns = [
    'group',
    'names', 
    'logfoldchanges',
    'pvals_adj',
    'pct_nz_group',
]

sig = sig[columns]

sig.columns = [
    'cell_type',
    'gene_name',
    'logfoldchanges',
    'pvals_adj',
    'pct_nz_group',
]

output_path = "../config/gene_annotations/tabula_sapiens_deg.csv"
sig.to_csv(output_path, index=False)

sig.head()

sig.shape=(3215, 8)


Unnamed: 0,cell_type,gene_name,logfoldchanges,pvals_adj,pct_nz_group
0,fibroblast,DCN,874.562866,0.0,0.994396
1,fibroblast,MGP,445.559448,0.0,0.990032
2,fibroblast,C1S,182.270279,0.0,0.985406
3,fibroblast,COL6A2,50.370811,0.0,0.982322
4,fibroblast,C1R,153.544693,0.0,0.979576
