In [1]:
import os
import sys
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import scanpy.external as sce
import scipy
import time
import sklearn
from importlib import reload
from scipy.spatial.distance import cdist

# local imports
import utils as ut
import plotting as plt2

sc.settings.verbosity = 3 

# Load the data

In [2]:
fpath = "/scratch/indikar_root/indikar1/shared_data/scanvi_models/raw_anndata/adata.h5ad"

adata = sc.read_h5ad(fpath)
adata.X = adata.layers['counts'].copy()

keep_types = [
    'Fib',
    'iHSC',
    'HSC',
]

adata = adata[adata.obs['standard_cell_type'].isin(keep_types), :].copy()
sc.logging.print_memory_usage()

MIN_GENES = 1000
MIN_CELLS = 100

sc.pp.filter_cells(adata, min_genes=MIN_GENES)
sc.pp.filter_genes(adata, min_cells=MIN_CELLS)

sc.logging.print_memory_usage()
adata

Memory usage: current 12.25 GB, difference +12.25 GB
filtered out 2342 cells that have less than 1000 genes expressed
filtered out 1141 genes that are detected in less than 100 cells
Memory usage: current 14.03 GB, difference +1.78 GB


AnnData object with n_obs × n_vars = 61115 × 17726
    obs: 'n_genes', 'dataset', 'n_genes_by_counts', 'total_counts', 'obs_index', 'cell_type', 'standard_cell_type'
    var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'gene_id', 'token_id', 'gene_biotype', 'Chromosome', 'Start', 'End', 'n_cells'
    layers: 'counts'

In [3]:
adata.obs['standard_cell_type'].value_counts()

standard_cell_type
Fib     33710
HSC     19274
iHSC     8131
Name: count, dtype: int64

# Pseudo Bulk

In [4]:
aggdata = sc.get.aggregate(
    adata,
    by='standard_cell_type',
    func='mean',
    layer='counts',
    axis='obs',
)

aggdata.X = aggdata.layers['mean'].copy()

sc.pp.normalize_total(
    aggdata, 
    target_sum=1e6,
)

sc.pp.log1p(
    aggdata
)

aggdata

normalizing counts per cell
    finished (0:00:00)


AnnData object with n_obs × n_vars = 3 × 17726
    obs: 'standard_cell_type'
    var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'gene_id', 'token_id', 'gene_biotype', 'Chromosome', 'Start', 'End', 'n_cells'
    uns: 'log1p'
    layers: 'mean'

In [15]:
df = aggdata.to_df().T

df['fib_vs_hsc'] = np.log1p(df['Fib']) - np.log1p(df['HSC'])
df = df.sort_values(by='fib_vs_hsc', ascending=False)

df.head(25)

Unnamed: 0_level_0,Fib,HSC,iHSC,fib_vs_hsc
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DCN,9.607837,0.022212,2.471106,2.339624
PLA2G2A,9.249505,0.0,2.311118,2.327229
APOD,8.919829,0.006397,3.464443,2.288159
MGP,8.914555,0.034687,3.790796,2.259905
GPX3,9.248089,0.080128,2.339083,2.250011
ADH1B,8.069215,0.006397,2.673731,2.198509
LUM,7.669123,0.003204,0.351103,2.156569
C3,8.314963,0.080128,4.206537,2.154542
EFEMP1,7.386736,0.0,0.169809,2.126651
COL1A2,7.394848,0.019069,0.0,2.108728


### df.tail(25)

In [6]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
aggdata.to_df().T


In [None]:
break

# Load SCENIC

In [None]:
fpath = "../resources/scenic.500b_up_100bp_down.csv"
df = pd.read_csv(fpath)
df = df.rename(columns={'Unnamed: 0' : 'gene_name'})
df = df.set_index('gene_name')
print(f"{df.shape=}")

# filter for var
df = df[df.index.isin(adata.var_names)]
tf_list = [x for x in df.columns if x in adata.var_names]
df = df[tf_list]

print(f"{df.shape=}")

df.head()

# aggregate by cell type

In [None]:
aggdata = sc.get.aggregate(
    adata,
    by='standard_cell_type',
    func='sum',
    layer='counts',
    axis='obs',
)

aggdata