In [1]:
import os
import pandas as pd
import scanpy as sc
from anndata import AnnData
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from matplotlib.pyplot import rc_context
from matplotlib import cm
from collections import defaultdict
import json
from itertools import chain

In [2]:
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['font.sans-serif'] = ['Arial']
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
sns.set_theme(style="ticks", rc=custom_params)

# matplotlib.rcParams['figure.figsize'] = [5, 5]
# verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
sc.logging.print_header()
sc.set_figure_params(dpi=300)
%matplotlib inline

2024-10-15 16:22:39.895580: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


scanpy==1.9.6 anndata==0.8.0 umap==0.5.5 numpy==1.23.5 scipy==1.11.4 pandas==1.5.3 scikit-learn==1.4.0 statsmodels==0.14.0 igraph==0.10.3 pynndescent==0.5.8


In [2]:
wd = '/mnt/data/hong/2022/DHJ1_human_obesity_placenta/'

In [3]:
os.chdir(wd)

In [4]:
ad_clean = sc.read_h5ad('output/10x_h5/h5ad/ad_v2.h5ad')

In [86]:
def function_sets(cell_type,function_set, function_name, mode):
    ## read in the data
    # ad_zscore = sc.pp.scale(ad_clean[ad_clean.obs.final_celltypes==cell_type], max_value=10, zero_center=True, layer = 'log_norm', copy = True)
    ad = ad_clean[ad_clean.obs.final_celltypes==cell_type]
    df = pd.read_csv(f'output/DEGs/final_negbinom_all/score_tsv/{cell_type}.tsv', sep='\t')
    ## order df by score
    df.sort_values('score' , ascending=False, inplace=True)
    common_set = df.query("convergence=='high'&p<0.05")['Unnamed: 0']
    divergent_set = df.query("convergence=='low'&p<0.05")['Unnamed: 0']
    common_function = [element for element in common_set if element in function_set]
    divergent_function = [element for element in divergent_set if element in function_set]
    print(len(common_function))
    print(len(divergent_function))
    if mode == 'common':
        function2plot = common_function
    if mode == 'divergent':
        function2plot = divergent_function
    elif mode=='both':
        function2plot = common_function+divergent_function
    if len(function2plot)>0:
        print(function2plot)
        function_name = function_name.replace('/', '_')
        sc.pl.dotplot(ad, function2plot, groupby='group', dendrogram=False, layer = 'log_norm', show=False, swap_axes=True, cmap='YlOrRd', standard_scale='var', save=f'fig2_{cell_type}_{function_name}_standard.pdf', figsize=(2.5, len(function2plot)/2), use_raw=False, categories_order=['Normal_AGA', 'Obese_AGA', 'Obese_LGA'], dot_max
=1, dot_min=0.1)

    # sc.pl.stacked_violin(ad_clean[ad_clean.obs.final_celltypes==cell_type], list(function2plot), groupby='group', dendrogram=False, standard_scale='var', show=False, swap_axes=True, cmap='YlOrRd',save=f'fig2_{cell_type}_{function_name}_violin.pdf', figsize=(2.5, len(function2plot)/2), use_raw=False, categories_order=['Normal_AGA', 'Obese_AGA', 'Obese_LGA'])

In [24]:
genesets = pd.read_csv('data/Placenta_genesets.tsv', sep='\t')
transporters = set(genesets.query("geneset!='secretome'")['genesymbol'])
secretome = set(genesets.query("geneset=='secretome'")['genesymbol'])
tnfa = set(pd.read_csv('data/TNFa.tsv', sep='\t')['SYMBOL'])
hypoxia = set(pd.read_csv('data/hypoxia.tsv', sep='\t')['SYMBOL'])

In [26]:
apical = set(pd.read_csv('data/HALLMARK_APICAL_JUNCTION.v2024.1.Hs.tsv', sep='\t').iloc[16, 1].split(','))
myc = set(pd.read_csv('data/HALLMARK_MYC_TARGETS_V2.v2024.1.Hs.tsv', sep='\t').iloc[16, 1].split(','))
mitotic = set(pd.read_csv('data/HALLMARK_MITOTIC_SPINDLE.v2024.1.Hs.tsv', sep='\t').iloc[16, 1].split(','))
g2m = set(pd.read_csv('data/HALLMARK_G2M_CHECKPOINT.v2024.1.Hs.tsv', sep='\t').iloc[16, 1].split(','))
adipo = set(pd.read_csv('data/HALLMARK_ADIPOGENESIS.v2024.1.Hs.tsv', sep='\t').iloc[16, 1].split(','))
il2 = set(pd.read_csv('data/HALLMARK_IL2_STAT5_SIGNALING.v2024.1.Hs.tsv', sep='\t').iloc[16, 1].split(','))
heme = set(pd.read_csv('data/HALLMARK_HEME_METABOLISM.v2024.1.Hs.tsv', sep='\t').iloc[16, 1].split(','))

In [88]:
hallmark = pd.read_csv('data/MSigDB_Hallmark_2020.tsv', sep='\t', names=['set_name', 'genes'])
kegg = pd.read_csv('data/KEGG_2021_Human.tsv', sep='\t', names=['set_name', 'genes'])
go = pd.read_csv('data/GO_Biological_Process_2023.tsv', sep='\t', names=['set_name', 'genes'])

In [82]:

function_sets('CTB', function_set, 'Apical Junction', 'common')
function_sets('CTB', function_set,'Mitotic Spindle', 'common')
function_sets('CTB', function_set,'G2-M Checkpoint', 'common')
function_sets('CTB',function_set, 'Adipogenesis', 'common')
function_sets('CTB', function_set,'IL-2/STAT5 Signaling', 'common')
function_sets('CTB',function_set, 'heme Metabolism', 'common')
function_sets('CTB', function_set,'Hypoxia', 'common')
function_sets('CTB',function_set, 'Myc Targets V2', 'divergent')

1
0
['ITGB4']
2
0
['SORBS2', 'DST']


  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')
  dot_ax.scatter(x, y, **kwds)


1
0
['SLC38A1']


  pl.savefig(filename, dpi=dpi, bbox_inches='tight')
  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')


0
0
2
0
['PRKCH', 'RABGAP1L']
2
0
['FOXO3', 'TFDP2']


  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')
  dot_ax.scatter(x, y, **kwds)


2
0
['FOXO3', 'RBPJ']


  pl.savefig(filename, dpi=dpi, bbox_inches='tight')
  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')


0
0


In [87]:
set_name = "PI3K-Akt signaling pathway"
function_set = get_set_genes(kegg, set_name)
function_sets('CTB',function_set, set_name, 'common')

12
2
['GHR', 'MET', 'HSP90AA1', 'ITGB4', 'CSH1', 'FN1', 'CSF3R', 'FOXO3', 'LAMB1', 'PRLR', 'ERBB3', 'PDGFD']


  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')


In [89]:
set_name = "Positive Regulation Of MAPK Cascade (GO:0043410)"
function_set = get_set_genes(go, set_name) + ['MAPK4']
function_sets('CTB',function_set, set_name, 'divergent')

2
5
['WWC1', 'PRKCZ', 'MAPK4', 'ERBB4', 'KSR1']


  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')


In [76]:
def get_set_genes(set_df, set_name):
    gene_str= set_df.loc[set_df.set_name==set_name, 'genes'].to_list()
    gene_list = gene_str[0].split(',')
    return gene_list


In [72]:
function_sets('CTB', ctb_sets, 'ctb_sets', mode='common')

NameError: name 'ctb_sets' is not defined

In [16]:
function_sets('CTB', apical, 'apical_junction', mode='common')
function_sets('CTB', myc, 'myc_targets', mode='divergent')

['ITGB4']
[]


  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')
  norm = cell_h * nrows / sum(self._row_height_ratios)
  cell_heights = [r * norm for r in self._row_height_ratios]
  dot_ax.scatter(x, y, **kwds)
  dot_ax.set_ylim(dot_color.shape[0], 0)
  norm = cell_h * nrows / sum(self._row_height_ratios)
  cell_heights = [r * norm for r in self._row_height_ratios]
  norm = cell_h * nrows / sum(self._row_height_ratios)
  cell_heights = [r * norm for r in self._row_height_ratios]
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values


[]

  func(*args, **kwargs)
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values


Error in callback <function _draw_all_if_interactive at 0x7f3064fe0b80> (for post_execute):


ValueError: cannot convert float NaN to integer

  fig.canvas.print_figure(bytes_io, **kw)
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values


ValueError: cannot convert float NaN to integer

<Figure size 750x0 with 4 Axes>

In [27]:
function_sets('STB-b', transporters, 'transporters', mode='both')
function_sets('STB-c', transporters, 'transporters', mode='both')



  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')




  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')


In [26]:
function_sets('STB-b', tnfa, 'tnfa', mode='common')
function_sets('STB-c', tnfa, 'tnfa', mode='common')
function_sets('STB-b', hypoxia, 'hypoxia', mode='common')
function_sets('STB-c', hypoxia, 'hypoxia', mode='common')



  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')




  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')




  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')




  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')


In [28]:
function_sets('STB-b', secretome, 'secretome', mode='both')
function_sets('STB-c', secretome, 'secretome', mode='both')



  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')




  dot_ax.scatter(x, y, **kwds)
  pl.savefig(filename, dpi=dpi, bbox_inches='tight')


... as `zero_center=True`, sparse input is densified and may lead to large memory consumption


In [40]:
ad_zscore

AnnData object with n_obs × n_vars = 37408 × 26913
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'doublet_score', 'predicted_doublet', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts', 'n_genes', 'sample', 'group', 'batch', 'sex', 'mother', 'Hemoglobins', 'S_score', 'G2M_score', 'phase', 'leiden', 'cell type', 'subset', 'C_scANVI', 'maturation_inhouse_three', 'final_celltypes'
    var: 'means', 'variances', 'residual_variances', 'highly_variable_rank', 'highly_variable_nbatches', 'highly_variable_intersection', 'highly_variable', 'mean', 'std'
    uns: 'group_colors', 'hvg', 'leiden', 'neighbors', 'pca', 'pearson_residuals_normalization', 'subset_colors', 'umap'
    obsm: 'X_pca', 'X_umap', 'latent_gene_encoding'
    varm: 'PCs'
    layers: 'log_norm', 'raw', 'sqrtCPMedian', 'sqrt_norm'
    obsp: 'connectivities', 'distances'