In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc

In [2]:
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/geneformer_adata/merged.anndata.h5ad"

adata = sc.read_h5ad(fpath)
adata

AnnData object with n_obs × n_vars = 166691 × 19309
    obs: 'n_genes', 'doublet_score', 'predicted_doublet', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts', 'cell_type', 'Barcode', 'Library', 'dataset', 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender', 'celltype', 'record_id', 'cell_id', 'nCount_RNA', 'nFeature_RNA', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_SCT', 'nFeature_SCT', 'SCT.weight', 'ATAC.weight', 'seurat_clusters', 'STD.CellType', 'STD_Cat', 'STD_Cat2', 'Sample', 'HLF', 'CRHBP', 'CD34', 'MitoCoverage', 'ClonalGroup', 'Sig.HSC1', 'Sig.Prog1', 'Sig.EarlyE1', 'Sig.LateE1', 'Sig.ProMono1', 'Sig.Mono1', 'Sig.ncMono1', 'Sig.cDC1', 'Sig.pDC1', 'Sig.ProB1', 'Sig.PreB1', 'Sig.B1', 'Sig.Plasma1', 'Sig.T1', 'Sig.CTL1', 'Sig.NK1', 'meanCov', 'ClonalGroup.Prob', 'wsnn_res.0.8', 'Origin.Seurat'
    var: 'gene_name', 'ensemble_id'

In [3]:
keep = [
    # 'LinNegCD34lowCD164high',
    'HSC',
    # 'LinNegCD34PosCD164Pos',
    'FB',
    'Refined.HSC',
]

idx = adata.obs[adata.obs['cell_type'].isin(keep)].index

pdf = adata[idx, :].copy()
pdf.obs['broad_type'] = np.where(pdf.obs['cell_type'] == 'FB', 'Fibroblast', 'HSC')
pdf


AnnData object with n_obs × n_vars = 47643 × 19309
    obs: 'n_genes', 'doublet_score', 'predicted_doublet', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts', 'cell_type', 'Barcode', 'Library', 'dataset', 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender', 'celltype', 'record_id', 'cell_id', 'nCount_RNA', 'nFeature_RNA', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_SCT', 'nFeature_SCT', 'SCT.weight', 'ATAC.weight', 'seurat_clusters', 'STD.CellType', 'STD_Cat', 'STD_Cat2', 'Sample', 'HLF', 'CRHBP', 'CD34', 'MitoCoverage', 'ClonalGroup', 'Sig.HSC1', 'Sig.Prog1', 'Sig.EarlyE1', 'Sig.LateE1', 'Sig.ProMono1', 'Sig.Mono1', 'Sig.ncMono1', 'Sig.cDC1', 'Sig.pDC1', 'Sig.ProB1', 'Sig.PreB1', 'Sig.B1', 'Sig.Plasma1', 'Sig.T1', 'Sig.CTL1', 'Sig.NK1', 'meanCov', 'ClonalGroup.Prob', 'wsnn_res.0.8', 'Origin.Seurat', 'broad_type'
    var: 'gene_name', 'ensembl

In [4]:
pdf.obs['broad_type'].value_counts()

broad_type
Fibroblast    26586
HSC           21057
Name: count, dtype: int64

In [5]:
sc.pp.filter_genes(pdf, min_counts=3)
sc.pp.filter_cells(pdf, min_counts=100)

sc.pp.normalize_total(pdf)
sc.pp.log1p(pdf)

sc.pp.combat(pdf, key='dataset')

# handle negatives
pdf.X = np.where(pdf.X < 0, 0, pdf.X)

sc.pp.highly_variable_genes(pdf)

print('done')


  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()


done


In [6]:
sc.tl.rank_genes_groups(pdf, 
                        groupby="broad_type",
                        method='wilcoxon',
                        corr_method='benjamini-hochberg',
                        pts=True,
                        key_added='deg',
                       )

print('Done')

Done


In [7]:
deg = sc.get.rank_genes_groups_df(pdf, 
                                  group=None,
                                  key='deg')

deg.head()

Unnamed: 0,group,names,scores,logfoldchanges,pvals,pvals_adj,pct_nz_group,pct_nz_reference
0,Fibroblast,TRPV5,180.885025,-0.583331,0.0,0.0,1.0,0.450966
1,Fibroblast,CAMKV,180.885025,-0.585088,0.0,0.0,1.0,0.096785
2,Fibroblast,SPACA7,180.867188,-0.609371,0.0,0.0,1.0,0.096832
3,Fibroblast,PLPPR1,180.84935,-0.578272,0.0,0.0,1.0,0.09688
4,Fibroblast,CACNG1,180.84935,-0.616132,0.0,0.0,1.0,0.25089


In [8]:
sig = deg.copy()

pvals_adj = 0.05
logfoldchanges = 0.0
pct_nz_group = 0.9

sig = sig[~sig['names'].str.startswith("RP")]
sig = sig[~sig['names'].str.startswith("MT")]
sig = sig[sig['pvals_adj'] < pvals_adj]
sig = sig[sig['logfoldchanges'] > logfoldchanges]
sig = sig[sig['pct_nz_group'] > pct_nz_group]

print(sig['group'].value_counts())

sig.head()

group
HSC           540
Fibroblast    129
Name: count, dtype: int64


Unnamed: 0,group,names,scores,logfoldchanges,pvals,pvals_adj,pct_nz_group,pct_nz_reference
3241,Fibroblast,CFD,83.714836,0.288471,0.0,0.0,0.957421,0.965617
3567,Fibroblast,C1S,67.851257,0.210598,0.0,0.0,0.983262,0.963907
3605,Fibroblast,SERPING1,65.787346,0.201826,0.0,0.0,0.986196,0.969369
3659,Fibroblast,GSN,62.587425,0.230001,0.0,0.0,0.986722,0.97502
3724,Fibroblast,C1R,58.892143,0.212016,0.0,0.0,0.974122,0.963955


In [9]:
out = sig.copy()

out.columns = ['cell_type', 'gene_name', 'scores', 'logfoldchanges', 'pvals', 'pvals_adj', 'pct_nz_group', 'pct_nz_reference']

outpath = "../config/gene_annotations/HSC_vs_FB_pure.csv"
out.to_csv(outpath, index=False)
out.head()

Unnamed: 0,cell_type,gene_name,scores,logfoldchanges,pvals,pvals_adj,pct_nz_group,pct_nz_reference
3241,Fibroblast,CFD,83.714836,0.288471,0.0,0.0,0.957421,0.965617
3567,Fibroblast,C1S,67.851257,0.210598,0.0,0.0,0.983262,0.963907
3605,Fibroblast,SERPING1,65.787346,0.201826,0.0,0.0,0.986196,0.969369
3659,Fibroblast,GSN,62.587425,0.230001,0.0,0.0,0.986722,0.97502
3724,Fibroblast,C1R,58.892143,0.212016,0.0,0.0,0.974122,0.963955


In [10]:
out.tail(15)

Unnamed: 0,cell_type,gene_name,scores,logfoldchanges,pvals,pvals_adj,pct_nz_group,pct_nz_reference
33026,HSC,SERPINA9,-93.808411,0.388746,0.0,0.0,0.939403,1.0
33027,HSC,OR2Y1,-93.826248,0.388919,0.0,0.0,0.939355,1.0
33028,HSC,MAGEB3,-93.826248,0.388924,0.0,0.0,0.939355,1.0
33035,HSC,TNP2,-93.844078,0.389097,0.0,0.0,0.939308,1.0
33036,HSC,OTX2,-93.861908,0.389251,0.0,0.0,0.93926,1.0
33037,HSC,SNCB,-93.861908,0.389255,0.0,0.0,0.93926,1.0
33038,HSC,CCDC105,-93.861908,0.421033,0.0,0.0,0.923683,1.0
33040,HSC,OR10J5,-93.861908,0.389258,0.0,0.0,0.93926,1.0
33042,HSC,OR4Q3,-93.861908,0.38925,0.0,0.0,0.93926,1.0
33043,HSC,IRGC,-93.861908,0.389248,0.0,0.0,0.93926,1.0


In [11]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
marker_genes = sig.groupby('group')['names'].agg(list).to_dict()
marker_genes.keys()

In [None]:
our = adata[adata.obs['dataset'] == "iHSC", :]
our

In [None]:
sc.pp.filter_genes(our, min_counts=3)
sc.pp.filter_cells(our, min_counts=100)

sc.pp.normalize_total(our)
sc.pp.log1p(our)
sc.pp.highly_variable_genes(our)
sc.tl.pca(our)
sc.pp.neighbors(our)
sc.tl.umap(our)

In [None]:
# score the gene sets
def min_max(v):
    return (v - v.min()) / (v.max() - v.min())

def filter_genes(gene_list, our):
    return [x for x in gene_list if x in pdf.var['gene_name'].values]

for k, v in marker_genes.items():
    
    gene_names = filter_genes(v, our)
    
    sc.tl.score_genes(our,
                      gene_list=gene_names, 
                      ctrl_size=len(gene_names),
                      n_bins=11,
                      score_name=k)
    
    # normalize the scores
    our.obs[k] = min_max(our.obs[k])
    

sc.pl.umap(our, 
       s=80,
       cmap='inferno',
       alpha=0.5,
       color=list(marker_genes.keys()))