In [None]:
import os
import warnings

import pandas as pd

warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

import schist
import scanpy as sc
import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import altair as alt
import pertpy as pt

# set a working directory
# NOTE: this is a machine-specific absolute path; every folder below is
# resolved relative to it, so adjust it when running on another host.
# wdir = "/ceph/project/tendonhca/akurjan/analysis/"
wdir = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/notebooks"
os.chdir( wdir )

# folder structures (relative to wdir)
HARMONY_FOLDERNAME = "foetal/results/Harmony/"
RESULTS_FOLDERNAME = "foetal/results/tascCODA/"
FIGURES_FOLDERNAME = "foetal/figures/tascCODA/"

# Create the output folders. exist_ok=True replaces the separate
# "exists?" check, so the cell is safe to re-run and has no
# check-then-create race.
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)
os.makedirs(FIGURES_FOLDERNAME, exist_ok=True)

# Set folder for saving figures into
sc.settings.figdir = FIGURES_FOLDERNAME

def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
    """
    Write a figure to ``folder``/``fname`` in SVG (vector) format.

    ``fig`` only needs to expose a ``savefig(path, format=...)`` method;
    the cells in this notebook pass the ``plt`` name for it.
    """
    target_path = os.path.join(folder, fname)
    fig.savefig(target_path, format='svg')

# Set other settings
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the versions of the loaded packages (useful for reproducing the run).
sc.logging.print_versions()
# Figure defaults: dpi=150 and dpi_save=600, font size 10.
sc.set_figure_params(dpi=150, fontsize=10, dpi_save=600)

In [None]:
# Load the 'dev_harmony.h5ad' object from the Harmony results folder.
adata = sc.read_h5ad(os.path.join(HARMONY_FOLDERNAME, 'dev_harmony.h5ad'))
adata  # display the object summary

In [None]:
# Remove the .raw attribute from adata (reason not stated here — presumably not needed downstream; confirm).
del adata.raw

In [None]:
# Number of observations per 'C_scANVI' label.
adata.obs['C_scANVI'].value_counts()

In [None]:
# Compute the neighbour graph on the "X_msdiff" representation
# (30 neighbours, Euclidean metric).
sc.pp.neighbors(adata, n_neighbors=30, use_rep="X_msdiff", metric='euclidean')

In [None]:
# Create the scCODA model and load the cell-level data into it,
# using 'C_scANVI' as cell type label and 'sampletype' as sample identifier.
sccoda_model = pt.tl.Sccoda()
sccoda_load_kwargs = {
    "type": "cell_level",
    "generate_sample_level": True,
    "cell_type_identifier": "C_scANVI",
    "sample_identifier": "sampletype",
    "covariate_obs": ["age", "libbatch", "sample"],
}
sccoda_data = sccoda_model.load(adata, **sccoda_load_kwargs)
sccoda_data

In [None]:
# Boxplots of the "coda" modality grouped by age, with red dots overlaid.
boxplot_opts = dict(
    modality_key="coda",
    feature_name="age",
    figsize=(12, 5),
    add_dots=True,
    args_swarmplot={"palette": ["red"]},
)
pt.pl.coda.boxplots(sccoda_data, **boxplot_opts)
# Save before plt.show() so the current figure is still available.
savesvg('sccoda_cell_proportionsByAge.svg', plt)
plt.show()

In [None]:
# Boxplots of the "coda" modality grouped by sample, with red dots overlaid.
boxplot_opts = dict(
    modality_key="coda",
    feature_name="sample",
    figsize=(12, 5),
    add_dots=True,
    args_swarmplot={"palette": ["red"]},
)
pt.pl.coda.boxplots(sccoda_data, **boxplot_opts)
# Save before plt.show() so the current figure is still available.
savesvg('sccoda_cell_proportionsPerSample.svg', plt)
plt.show()

In [None]:
# Boxplots of the "coda" modality grouped by sample type, with red dots overlaid.
boxplot_opts = dict(
    modality_key="coda",
    feature_name="sampletype",
    figsize=(12, 5),
    add_dots=True,
    args_swarmplot={"palette": ["red"]},
)
pt.pl.coda.boxplots(sccoda_data, **boxplot_opts)
# Save before plt.show() so the current figure is still available.
savesvg('sccoda_cell_proportionsPerSampleType.svg', plt)
plt.show()

In [None]:
# Boxplots of the "coda" modality grouped by library batch (shown only, not saved).
boxplot_opts = dict(
    modality_key="coda",
    feature_name="libbatch",
    figsize=(12, 5),
    add_dots=True,
    args_swarmplot={"palette": ["red"]},
)
pt.pl.coda.boxplots(sccoda_data, **boxplot_opts)
plt.show()

In [None]:
# Stacked barplot of the "coda" modality per sample.
barplot_opts = dict(modality_key="coda", feature_name="sample", figsize=(6, 4))
pt.pl.coda.stacked_barplot(sccoda_data, **barplot_opts)
savesvg('sccoda_cell_proportionsPerSample_barplot.svg', plt)
plt.show()

In [None]:
# Stacked barplot of the "coda" modality per sample type.
barplot_opts = dict(modality_key="coda", feature_name="sampletype", figsize=(6, 4))
pt.pl.coda.stacked_barplot(sccoda_data, **barplot_opts)
savesvg('sccoda_cell_proportionsPerSampleType_barplot.svg', plt)
plt.show()

In [None]:
# Stacked barplot of the "coda" modality per age group.
barplot_opts = dict(modality_key="coda", feature_name="age", figsize=(6, 4))
pt.pl.coda.stacked_barplot(sccoda_data, **barplot_opts)
savesvg('sccoda_cell_proportionsByAge_barplot.svg', plt)
plt.show()

In [None]:
# Stacked barplot of the "coda" modality per library batch (shown only, not saved).
barplot_opts = dict(modality_key="coda", feature_name="libbatch", figsize=(6, 4))
pt.pl.coda.stacked_barplot(sccoda_data, **barplot_opts)
plt.show()

In [None]:
# Prepare the scCODA model (covariate formula + reference cell type),
# then sample with NUTS using a fixed rng key.
sccoda_formula = "sampletype + libbatch + age"
sccoda_reference = "Nervous System Cells"
sccoda_data = sccoda_model.prepare(
    sccoda_data,
    modality_key="coda",
    formula=sccoda_formula,
    reference_cell_type=sccoda_reference,
)
sccoda_model.run_nuts(sccoda_data, modality_key="coda", rng_key=1234)

In [None]:
# Display the "coda" modality of the scCODA data object.
sccoda_data["coda"]

In [None]:
# Display the .varm entry stored under 'effect_df_age[T.17w]' in the "coda" modality.
sccoda_data["coda"].varm['effect_df_age[T.17w]']

In [None]:
# Set the FDR level of the scCODA results to 0.2.
sccoda_model.set_fdr(sccoda_data, 0.2)

In [None]:
# Show the credible effects reported for the "coda" modality.
sccoda_model.credible_effects(sccoda_data, modality_key="coda")

In [None]:
# Print the scCODA model summary for the "coda" modality.
sccoda_model.summary(sccoda_data, modality_key="coda")

In [None]:
# saving: write the scCODA data object to the results folder.
# NOTE(review): the file name carries no ".h5mu" extension.
sccoda_data.write_h5mu(os.path.join(RESULTS_FOLDERNAME, 'sccoda_dev_msdiff'))

# loading (kept for reference only).
# NOTE(review): `mu` is not imported anywhere in this notebook and `path` is
# undefined — both must be provided before these lines can be re-enabled.
#sccoda_data_2 = mu.read_h5mu(path)
#sccoda_model.summary(sccoda_data_2, modality_key="coda")

# TASCCODA

The neighbours are not changed in this section: the analysis runs on the existing connectivities (from draw_graph_fa).

In [None]:
# Plot the draw_graph layout coloured by the 'C_scANVI' labels.
sc.pl.draw_graph(adata,color="C_scANVI")

In [None]:
# from sklearn.model_selection import StratifiedShuffleSplit

# n_cells_total = 20000  # total number of cells to keep
# cell_type_names = adata.obs['C_scANVI'].unique()  # get the unique cell type names
# cell_type_counts = adata.obs['C_scANVI'].value_counts()  # get the number of cells in each cell type

# indices_to_keep = []  # initialize a list to store the indices of the cells to keep
# for cell_type in cell_type_names:
#     n_cells_to_keep = int(n_cells_total * cell_type_counts[cell_type] / adata.shape[0])  # calculate the number of cells to keep for this cell type
#     cell_type_indices = adata.obs.index[adata.obs['C_scANVI'] == cell_type]  # get the indices of the cells in this cell type
#     if len(cell_type_indices) > n_cells_to_keep:
#         sss = StratifiedShuffleSplit(n_splits=1, test_size=n_cells_to_keep, random_state=0)  # create a stratified shuffle split object
#         _, cell_type_indices_to_keep = next(sss.split(X=adata.obsm['X_pca'], y=adata.obs['C_scANVI']))  # use the generator to get the indices
#     else:
#         cell_type_indices_to_keep = cell_type_indices
#     indices_to_keep.extend(cell_type_indices_to_keep)

# adata_subsampled = adata[indices_to_keep, :]  # create a new AnnData object containing only the subsampled cells


In [None]:
# adata_subsampled

In [None]:
#sc.pl.draw_graph(adata_subsampled,color="C_scANVI")

In [None]:
# Largest value in adata.X (quick look at the scale of the matrix).
adata.X.max()

In [None]:
# List the entries stored in adata.obsm.
adata.obsm

In [None]:
# Fit the schist nested model on adata with a fixed random seed.
# `schist` is already imported in the first cell, so the duplicate
# in-cell import is dropped.
# The 'nsbm_level_*' columns used by the cells below are expected to be
# added to adata.obs by this call (verify in the displayed table).
schist.inference.nested_model(
    adata,
    n_jobs=20,
    n_sweep=5,
    refine_model=False,
    directed=False,
    random_seed=111,
)
adata.obs

In [None]:
# Checkpoint: write adata to the results folder.
adata.write(os.path.join(RESULTS_FOLDERNAME, 'dev_harmony_tasccoda.h5ad'))

In [None]:
# Reload adata from the checkpoint file 'dev_harmony_tasccoda.h5ad' in the results folder.
adata=sc.read_h5ad(os.path.join(RESULTS_FOLDERNAME, 'dev_harmony_tasccoda.h5ad'))

In [None]:
# nsbm levels 4 and 5 next to the C_scANVI labels on the 'force_directed_array' embedding.
panel_keys = ["nsbm_level_4", "nsbm_level_5", "C_scANVI"]
sc.pl.embedding(
    adata,
    basis='force_directed_array',
    color=panel_keys,
    ncols=3,
    wspace=0.5,
    frameon=False,
)

In [None]:
# nsbm levels 3 and 4 next to the C_scANVI labels on the 'force_directed_array' embedding.
panel_keys = ["nsbm_level_3", "nsbm_level_4", "C_scANVI"]
sc.pl.embedding(
    adata,
    basis='force_directed_array',
    color=panel_keys,
    ncols=3,
    wspace=0.5,
    frameon=False,
)

In [None]:
# nsbm levels 3 and 4 next to the C_scANVI labels on the UMAP.
panel_keys = ["nsbm_level_3", "nsbm_level_4", "C_scANVI"]
sc.pl.umap(adata, color=panel_keys, ncols=3, wspace=0.5, frameon=False)

In [None]:
# nsbm levels 3-5 and the C_scANVI labels on the 'tsne' embedding.
panel_keys = ["nsbm_level_3", "nsbm_level_4", "nsbm_level_5", "C_scANVI"]
sc.pl.embedding(
    adata,
    basis='tsne',
    color=panel_keys,
    ncols=3,
    wspace=0.5,
    frameon=False,
)

In [None]:
# nsbm levels 3-7 and the C_scANVI labels on the 'draw_graph_fa' embedding; the figure is saved.
panel_keys = [f"nsbm_level_{level}" for level in range(3, 8)] + ["C_scANVI"]
sc.pl.embedding(
    adata,
    basis='draw_graph_fa',
    color=panel_keys,
    ncols=3,
    wspace=0.5,
    frameon=False,
    save='_fa_nsbm_levels.svg',
)

In [None]:
# nsbm level 3 and the C_scANVI labels on 'draw_graph_fa', with the labels drawn on the data.
panel_keys = ["nsbm_level_3", "C_scANVI"]
sc.pl.embedding(
    adata,
    basis='draw_graph_fa',
    color=panel_keys,
    ncols=3,
    wspace=0.5,
    frameon=False,
    legend_loc='on data',
)

In [None]:
# Display the per-observation annotation table.
adata.obs

In [None]:
tasccoda_model = pt.tl.Tasccoda()
tasccoda_data = tasccoda_model.load(
    adata,
    type="cell_level",
    cell_type_identifier="nsbm_level_3",
    sample_identifier="sampletype",
    covariate_obs=["age", "libbatch", "sample"],
    levels_orig=["nsbm_level_8", "nsbm_level_7", "nsbm_level_6", "nsbm_level_5", "nsbm_level_4", "nsbm_level_3"],
    add_level_name=True,
)
tasccoda_data

In [None]:
# 20 most frequent (nsbm_level_4, C_scANVI) combinations in the 'rna' modality.
tasccoda_data['rna'].obs[['nsbm_level_4', 'C_scANVI']].value_counts().head(20)

In [None]:
# Draw the tascCODA tree and write it to an SVG in the figures folder.
tree_svg_path = os.path.join(FIGURES_FOLDERNAME, "foetal_tasccoda_tree.svg")
pt.pl.coda.draw_tree(tasccoda_data, file_name=tree_svg_path)

In [None]:
# nsbm level 4 and the C_scANVI labels on 'draw_graph_fa', with the labels drawn on the data.
panel_keys = ["nsbm_level_4", "C_scANVI"]
sc.pl.embedding(
    adata,
    basis='draw_graph_fa',
    color=panel_keys,
    ncols=3,
    wspace=0.5,
    frameon=False,
    legend_loc='on data',
)

In [None]:
# Prepare the tascCODA model with "age" as the only covariate.
tasccoda_penalty = {"phi": 0, "lambda_1": 3.5}
tasccoda_reference = "38"  # corresponding to nervous system cells
tasccoda_model.prepare(
    tasccoda_data,
    modality_key="coda",
    reference_cell_type=tasccoda_reference,
    formula="age",
    pen_args=tasccoda_penalty,
    tree_key="tree",
)

In [None]:
# Sample the tascCODA model with NUTS: fixed rng key, 1000 warm-up and 10000 samples.
nuts_settings = dict(rng_key=1234, num_samples=10000, num_warmup=1000)
tasccoda_model.run_nuts(tasccoda_data, modality_key="coda", **nuts_settings)

In [None]:
# Print the tascCODA model summary for the "coda" modality.
tasccoda_model.summary(tasccoda_data, modality_key="coda")

In [None]:
# Draw the effects on the tree for the age[T.20w] covariate and write them to an SVG.
# The result is bound to its own name rather than `plt`: `plt` is the
# matplotlib.pyplot module imported in the first cell, and overwriting it
# breaks every earlier `plt.show()` / `savesvg(..., plt)` cell on re-run.
effects_tree_20w = pt.pl.coda.draw_effects(
    tasccoda_data,
    modality_key="coda",
    tree="tree",
    covariate="age[T.20w]",
    show_leaf_effects=True,
    show_legend=False,
    file_name=os.path.join(FIGURES_FOLDERNAME, "foetal_tasccoda_20wVs12w.svg")
)

In [None]:
# Draw the effects on the tree for the age[T.17w] covariate and write them to an SVG.
# The result is bound to its own name rather than `plt`: `plt` is the
# matplotlib.pyplot module imported in the first cell, and overwriting it
# breaks every earlier `plt.show()` / `savesvg(..., plt)` cell on re-run.
effects_tree_17w = pt.pl.coda.draw_effects(
    tasccoda_data,
    modality_key="coda",
    tree="tree",
    covariate="age[T.17w]",
    show_leaf_effects=True,
    show_legend=False,
    file_name=os.path.join(FIGURES_FOLDERNAME, "foetal_tasccoda_17wVs12w.svg")
)



In [None]:
# Barplot of the tascCODA effects for the "age" covariate.
pt.pl.coda.effects_barplot(tasccoda_data, modality_key="coda", covariates="age")

In [None]:
# Plot options shared by the effect UMAPs: "RdBu" colour map on a
# scale of -1 to 1 centred at 0; the figure is saved by scanpy.
# (Alternative colour maps tried earlier: sns.cubehelix_palette(dark=0,
# light=.9, as_cmap=True) and a copy of mpl.cm.Reds.)
kwargs = dict(
    ncols=3,
    wspace=0.25,
    cmap="RdBu",
    frameon=False,
    vmin=-1,
    vcenter=0,
    vmax=1,
    sort_order=False,
    save='_tasccoda_umaps_effects.svg',
)
age_effect_names = ["effect_df_age[T.17w]", "effect_df_age[T.20w]"]
pt.pl.coda.effects_umap(
    tasccoda_data,
    effect_name=age_effect_names,
    cluster_key="nsbm_level_3",
    **kwargs
)

# Reference UMAPs of the 'rna' modality: C_scANVI labels and nsbm level 3 clusters.
sc.pl.umap(
    tasccoda_data["rna"],
    color=["C_scANVI", "nsbm_level_3"],
    palette=None,
    frameon=False,
    ncols=2,
    wspace=0.5,
)

In [None]:
# Create a color palette for each cluster: 34 entries, light grey by
# default, with only the highlighted positions given a colour.
# The plots using this palette are saved as 'significant_at_20w*'.
# NOTE(review): positions are list indices; they presumably follow the
# category order of 'nsbm_level_3' — confirm.
highlighted_positions = {10: 'blue', 22: 'red', 26: 'red'}
cluster_palette = [
    highlighted_positions.get(position, '#d9d9d9')
    for position in range(34)
]

In [None]:
# nsbm level 3 clusters on 'force_directed_array', coloured with cluster_palette; the figure is saved.
rna_modality = tasccoda_data["rna"]
sc.pl.embedding(
    rna_modality,
    basis='force_directed_array',
    color=["nsbm_level_3"],
    palette=cluster_palette,
    frameon=False,
    legend_loc='right margin',
    save='significant_at_20w_harmonyforcelayout.svg',
)

In [None]:
# nsbm level 3 clusters on 'draw_graph_fa', coloured with cluster_palette; the figure is saved.
rna_modality = tasccoda_data["rna"]
sc.pl.embedding(
    rna_modality,
    basis='draw_graph_fa',
    color=["nsbm_level_3"],
    palette=cluster_palette,
    frameon=False,
    legend_loc='right margin',
    save='significant_at_20w_fa.svg',
)

In [None]:
# Create a color palette for each cluster: 34 entries, light grey by
# default, with the highlighted positions in red and one in pink.
# The plots using this palette are saved as 'significant_at_17w*'.
# NOTE(review): positions are list indices; they presumably follow the
# category order of 'nsbm_level_3' — confirm.
red_positions = {2, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 19, 21, 22, 25, 26}
pink_positions = {4}
cluster_palette_17w = []
for position in range(34):
    if position in red_positions:
        cluster_palette_17w.append('red')
    elif position in pink_positions:
        cluster_palette_17w.append('pink')
    else:
        cluster_palette_17w.append('#d9d9d9')

In [None]:
# nsbm level 3 clusters on 'force_directed_array', coloured with cluster_palette_17w; the figure is saved.
rna_modality = tasccoda_data["rna"]
sc.pl.embedding(
    rna_modality,
    basis='force_directed_array',
    color=["nsbm_level_3"],
    palette=cluster_palette_17w,
    frameon=False,
    legend_loc='right margin',
    save='significant_at_17w_harmonyforcelayout.svg',
)

In [None]:
# nsbm level 3 clusters on 'draw_graph_fa', coloured with cluster_palette_17w; the figure is saved.
rna_modality = tasccoda_data["rna"]
sc.pl.embedding(
    rna_modality,
    basis='draw_graph_fa',
    color=["nsbm_level_3"],
    palette=cluster_palette_17w,
    frameon=False,
    legend_loc='right margin',
    save='significant_at_17w_fa.svg',
)

In [None]:
# UMAP of the 'rna' modality coloured by the C_scANVI labels; the figure is saved.
rna_modality = tasccoda_data["rna"]
sc.pl.umap(
    rna_modality,
    color=["C_scANVI"],
    frameon=False,
    legend_loc='right margin',
    save='scANVI.svg',
)

In [None]:
# NOTE: this import must stay — the two plotting cells that follow also
# use the `mpl` alias (the first cell imports matplotlib without an alias).
import matplotlib as mpl

# nsbm level 3 clusters on 'draw_graph_fa', using matplotlib's current
# "axes.prop_cycle" as the palette instead of the grey highlight palettes.
sc.pl.embedding( 
    tasccoda_data["rna"], color=["nsbm_level_3"], 
    basis='draw_graph_fa',
    palette=mpl.rcParams["axes.prop_cycle"],
    frameon=False, legend_loc='right margin', save = '_nsbm_level3_fa.svg'
)

In [None]:
# nsbm level 3 clusters on 'force_directed_array', using matplotlib's current "axes.prop_cycle" as the palette.
default_cycle = mpl.rcParams["axes.prop_cycle"]
sc.pl.embedding(
    tasccoda_data["rna"],
    basis='force_directed_array',
    color=["nsbm_level_3"],
    palette=default_cycle,
    frameon=False,
    legend_loc='right margin',
    save='_nsbm_level3_harmonyforcelayout.svg',
)

In [None]:
# nsbm level 3 clusters on the UMAP, using matplotlib's current "axes.prop_cycle" as the palette.
default_cycle = mpl.rcParams["axes.prop_cycle"]
sc.pl.umap(
    tasccoda_data["rna"],
    color=["nsbm_level_3"],
    palette=default_cycle,
    frameon=False,
    legend_loc='right margin',
    save='_nsbm_level3.svg',
)

In [None]:
# Wilcoxon ranking of genes per nsbm level 3 cluster, computed on the
# 'log1p_norm' layer and stored under adata.uns['wilcoxon_nsbm3'].
sc.tl.rank_genes_groups(
    adata,
    groupby='nsbm_level_3',
    method='wilcoxon',
    key_added='wilcoxon_nsbm3',
    use_raw=False,
    layer='log1p_norm',
)

In [None]:
# Flatten the ranking results into one wide table with a
# '<group>_<statistic>' column per cluster and statistic.
result = adata.uns['wilcoxon_nsbm3']
groups = result['names'].dtype.names
stat_keys = ['names','scores','logfoldchanges', 'pvals', 'pvals_adj']
dge_columns = {}
for group in groups:
    for key in stat_keys:
        dge_columns[group + '_' + key] = result[key][group]
df = pd.DataFrame(dge_columns)
#df.to_csv(os.path.join(RESULTS_FOLDERNAME, 'wilcoxon_DGE_nbsm3.csv'))
df.head(5)

In [None]:
# Map every group to the list of its top-ranked names.
# The original slice was [:11], which returned 11 names although the
# comments asked for the top 10; the count is now a named constant.
N_TOP_NAMES = 10
top_names_dict = {
    group: list(result['names'][group][:N_TOP_NAMES])
    for group in groups
}

In [None]:
# Top-ranked names for group '29'.
top_names_dict['29'] # significantly increased proportionally in 20w

In [None]:
# Top-ranked names for group '7'.
top_names_dict['7'] # significantly increased at 20w

In [None]:
# Top-ranked names for group '2'.
top_names_dict['2'] # significantly decreased at 20w

In [None]:
# Top-ranked names for group '13'.
top_names_dict['13']  # significantly decreased at 20w

In [None]:
# Top-ranked names for group '0'.
top_names_dict['0'] # significantly decreased at 20w