# 02 LIANA tumor vs normal core atlas 
### Load DESeq2 results to obtain significant interactions

## 00. Libraries

In [1]:
import contextlib
import os

import decoupler as dc
import liana as li
import numpy as np
import pandas as pd
import sc_atlas_helpers as ah
import scanpy as sc
import scipy.sparse
import statsmodels.stats.multitest
from anndata import AnnData
from liana.method import (
    cellchat,
    cellphonedb,
    connectome,
    geometric_mean,
    logfc,
    natmi,
    singlecellsignalr,
)
from tqdm.auto import tqdm

## 01. Define variables, paths and comparison tumor vs normal

In [2]:
# Directory with the LIANA cell-cell interaction h5ad files.
# Plain string: the original f-string had no placeholders.
dataDir = "/data/projects/2022/CRCA/results/v1/final/liana_cell2cell/h5ads/updated"
# Figures are written below the same directory (see the saving cell at the end).
resDir = dataDir

## 03. Functions

In [3]:
def fdr_correction(df, pvalue_col="pvalue", *, key_added="fdr", inplace=False):
    """Adjust p-values in a data frame with test results using FDR correction.

    The correction is computed with ``statsmodels.stats.multitest.fdrcorrection``
    using its default settings. Rows whose p-value is missing are left out of
    the correction and keep a missing value in the result column.

    Parameters
    ----------
    df
        Data frame holding one test result per row.
    pvalue_col
        Name of the column with the raw p-values.
    key_added
        Name of the column the adjusted p-values are written to.
    inplace
        If True, modify `df` and return None. Otherwise work on a copy and
        return it.

    Returns
    -------
    The adjusted copy of `df`, or None if `inplace` is True.
    """
    if not inplace:
        df = df.copy()

    pvalues = df[pvalue_col].to_numpy(dtype=float, na_value=np.nan)
    tested = ~np.isnan(pvalues)

    # Only correct the p-values that are actually present: NaNs handed to
    # fdrcorrection would spread to the other adjusted values.
    adjusted = np.full(pvalues.shape, np.nan)
    if tested.any():
        adjusted[tested] = statsmodels.stats.multitest.fdrcorrection(
            pvalues[tested]
        )[1]
    df[key_added] = adjusted

    if not inplace:
        return df

In [4]:
"""Plotting functions for group comparisons"""

import altair as alt
import numpy as np
import pandas as pd


def plot_lm_result_altair(
    df,
    p_cutoff=0.1,
    p_col="fdr",
    x="variable",
    y="group",
    color="coef",
    title="heatmap",
    cluster=False,
    value_max=None,
    configure=lambda x: x.configure_mark(opacity=1),
    cmap="redblue",
    reverse=True,
    domain=lambda x: [-x, x],
    order=None,
):
    """
    Plot a results data frame of a comparison as a heatmap.

    Only `x` entries with at least one row below `p_cutoff` are shown. Cells are
    colored by `color`; a point whose size encodes the significance level
    (< 0.001, < 0.01, < 0.1) is drawn on top of cells that reach one of them.

    Parameters
    ----------
    df
        Results data frame. It is not modified.
    p_cutoff
        Keep an `x` entry if any of its rows has `p_col` below this value.
    p_col
        Column with the (adjusted) p-values.
    x, y
        Columns mapped to the x and y axis of the heatmap.
    color
        Column mapped to the fill color.
    title
        Chart title.
    cluster
        If True and `order` is None, order the x axis by average-linkage
        hierarchical clustering of the `color` values.
    value_max
        Absolute value used to build the color domain. Defaults to the largest
        absolute value found in `color`.
    configure
        Callback applied to the finished layered chart.
    cmap, reverse
        Color scheme name and whether to reverse it.
    domain
        Callback that turns `value_max` into the `[min, max]` color domain.
    order
        Explicit x axis order. Defaults to "ascending", or to the clustering
        result if `cluster` is True.

    Returns
    -------
    The result of `configure(chart)`, or None if no rows are left to plot.
    """
    significant_x = df.loc[df[p_col] < p_cutoff, x].unique()
    # Explicit copy: columns are added/overwritten below and must neither touch
    # the caller's frame nor trigger a SettingWithCopyWarning.
    df_subset = df.loc[
        df[x].isin(significant_x) & df[y].isin(df[y].unique())
    ].copy()
    if not df_subset.shape[0]:
        print("No values to plot")
        return

    if order is None:
        order = "ascending"
        if cluster:
            from scipy.cluster.hierarchy import leaves_list, linkage

            values_df = df_subset.pivot(index=y, columns=x, values=color)
            order = values_df.columns.values[
                leaves_list(
                    linkage(values_df.values.T, method="average", metric="euclidean")
                )
            ]

    def _significance_label(fdr):
        """Map a p-value to its significance bin, NaN if it is in none."""
        if fdr < 0.001:
            return "< 0.001"
        if fdr < 0.01:
            return "< 0.01"
        if fdr < 0.1:
            return "< 0.1"
        return np.nan

    df_subset["FDR"] = pd.Categorical(
        [_significance_label(p) for p in df_subset[p_col]]
    )

    if value_max is None:
        value_max = max(
            abs(np.nanmin(df_subset[color])), abs(np.nanmax(df_subset[color]))
        )
    color_domain = domain(value_max)
    # just setting the domain in altair will lead to "black" fields.
    # Therefore, we constrain the values themselves.
    df_subset[color] = np.clip(df_subset[color], *color_domain)

    heatmap_layer = (
        alt.Chart(df_subset, title=title)
        .mark_rect()
        .encode(
            x=alt.X(x, sort=order),
            y=y,
            color=alt.Color(
                color,
                scale=alt.Scale(scheme=cmap, reverse=reverse, domain=color_domain),
            ),
        )
    )
    # Points are only drawn for cells that fall into a significance bin.
    significance_layer = (
        alt.Chart(df_subset.loc[df_subset["FDR"].notnull()])
        .mark_point(color="white", filled=True, stroke="black", strokeWidth=0)
        .encode(
            x=alt.X(x, sort=order),
            y=y,
            size=alt.Size(
                "FDR:N",
                scale=alt.Scale(
                    domain=["< 0.001", "< 0.01", "< 0.1"],
                    range=4 ** np.array([3, 2, 1]),
                ),
            ),
        )
    )
    return configure(heatmap_layer + significance_layer)

In [5]:
import warnings
from functools import reduce
from operator import and_
from typing import Sequence, Union

import numpy as np
import pandas as pd
from anndata import AnnData, ImplicitModificationWarning


def pseudobulk(
    adata,
    *,
    groupby: Union[str, Sequence[str]],
    aggr_fun=np.sum,
    min_obs=10,
) -> AnnData:
    """
    Calculate Pseudobulk of groups

    Parameters
    ----------
    adata
        annotated data matrix
    groupby
        One or multiple columns to group by
    aggr_fun
        Callback function to calculate pseudobulk. It is called as
        `aggr_fun(X_subset, axis=0)`, e.g. a numpy reduction such as `np.sum`.
    min_obs
        Exclude groups with less than `min_obs` observations

    Returns
    -------
    New anndata object with same vars as input, but reduced number of obs.
    `obs` holds one column per `groupby` entry plus `n_obs`, the number of
    observations aggregated into the row.

    Raises
    ------
    ValueError
        If no group has at least `min_obs` observations.
    """
    groupby = [groupby] if isinstance(groupby, str) else list(groupby)

    combinations = adata.obs.loc[:, groupby].drop_duplicates()

    if adata.is_view:
        # for whatever reason, the pseudobulk function is terribly slow when operating on a view.
        adata = adata.copy()

    # precompute one boolean mask per (column, value); they are combined per group below
    masks = {
        col: {
            val: (adata.obs[col] == val).to_numpy()
            for val in combinations[col].unique()
        }
        for col in groupby
    }

    expr_agg = []
    obs = []

    # name=None yields plain tuples. Named tuples (and `_asdict()`) replace column
    # names that are not valid identifiers with positional names, which would
    # end up as wrong column names in `obs`.
    for comb in combinations.itertuples(index=False, name=None):
        mask = reduce(and_, (masks[col][val] for col, val in zip(groupby, comb)))
        n_obs = np.sum(mask)
        if n_obs < min_obs:
            continue
        expr_row = aggr_fun(adata.X[mask, :], axis=0)
        # convert matrix to array if required (happens when aggregating sparse matrix)
        try:
            expr_row = expr_row.A1
        except AttributeError:
            pass
        obs_row = dict(zip(groupby, comb))
        obs_row["n_obs"] = n_obs
        obs.append(obs_row)
        expr_agg.append(expr_row)

    if not expr_agg:
        # np.vstack would fail on the empty list with a message that does not name the cause
        raise ValueError(
            f"No group defined by {groupby} has at least min_obs={min_obs} observations."
        )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ImplicitModificationWarning)
        return AnnData(
            X=np.vstack(expr_agg),
            var=adata.var,
            obs=pd.DataFrame.from_records(obs),
        )

In [6]:
"""Helper functions for cellphonedb analysis

Focuses on differential cellphonedb analysis between conditions.
"""

from typing import List, Literal

import altair as alt

# from .pseudobulk import pseudobulk
import numpy as np
import pandas as pd
import scanpy as sc

# from .compare_groups.pl import plot_lm_result_altair
# from .util import fdr_correction


class CpdbAnalysis:
    """Comparative cellphonedb analysis between conditions.

    Holds an interaction table together with per-cell-type expression summaries
    and offers helpers to select and plot interactions whose ligand (or
    receptor) is differentially expressed.
    """

    def __init__(
        self, cpdb, adata, *, pseudobulk_group_by: List[str], cell_type_column: str
    ):
        """
        Class that handles comparative cellphonedb analysis.

        Parameters
        ----------
        cpdb
            pandas data frame with cellphonedb interactions.
            Required columns: `source_genesymbol`, `target_genesymbol`.
            You can get this from omnipathdb:
            https://omnipathdb.org/interactions/?fields=sources,references&genesymbols=1&databases=CellPhoneDB
        adata
            Anndata object with the target cells. Will use this to derive mean fraction of expressed cells.
            Should contain counts in X.
        pseudobulk_group_by
            Columns passed to `pseudobulk` (in addition to `cell_type_column`). Pseudobulk is used to
            compute the mean fraction of expressed cells by patient
        cell_type_column
            Column in anndata that contains the cell-type annotation.
        """
        self.cpdb = cpdb
        self.cell_type_column = cell_type_column
        self._find_expressed_genes(adata, pseudobulk_group_by)

    @staticmethod
    def _cpdb_columns(de_genes_mode):
        """Return `(de_column, expressed_column)` of the interaction table for a mode."""
        if de_genes_mode == "ligand":
            return "source_genesymbol", "target_genesymbol"
        if de_genes_mode == "receptor":
            return "target_genesymbol", "source_genesymbol"
        raise ValueError("Invalud value for de_genes_mode!")

    @staticmethod
    def _to_long(adata_per_cell_type, value_name):
        """Melt a cell-type x gene AnnData into long format with a `value_name` column."""
        return (
            adata_per_cell_type.to_df()
            .melt(ignore_index=False, value_name=value_name)
            .reset_index()
        )

    def _find_expressed_genes(self, adata, pseudobulk_group_by):
        """Compute the mean expression and fraction of expressed cells per cell-type.

        This is performed on the pseudobulk level, i.e. the mean of means per patient is calculated.
        The result is stored in `self.expressed_genes`, a long data frame with the columns
        `fraction_expressed` and `expr_mean` for every cell-type/gene pair.
        """
        groupby = [*pseudobulk_group_by, self.cell_type_column]

        # fraction of cells with non-zero counts, first per group, then averaged per cell type
        pb_fracs = pseudobulk(
            adata,
            groupby=groupby,
            aggr_fun=lambda x, axis: np.sum(x > 0, axis) / x.shape[axis],  # type: ignore
        )
        fractions_expressed = pseudobulk(
            pb_fracs, groupby=self.cell_type_column, aggr_fun=np.mean
        )
        fractions_expressed.obs.set_index(self.cell_type_column, inplace=True)

        # summed counts per group, normalized to 1e6 and log1p-transformed, then averaged per cell type
        pb = pseudobulk(adata, groupby=groupby)
        sc.pp.normalize_total(pb, target_sum=1e6)
        sc.pp.log1p(pb)
        pb_mean_cell_type = pseudobulk(
            pb, groupby=self.cell_type_column, aggr_fun=np.mean
        )
        pb_mean_cell_type.obs.set_index(self.cell_type_column, inplace=True)

        self.expressed_genes = self._to_long(
            fractions_expressed, "fraction_expressed"
        ).merge(
            self._to_long(pb_mean_cell_type, "expr_mean"),
            on=[self.cell_type_column, "variable"],
        )

    def significant_interactions(
        self,
        de_res: pd.DataFrame,
        *,
        pvalue_col="pvalue",
        fc_col="log2FoldChange",
        gene_symbol_col="gene_id",
        max_pvalue=0.1,
        min_abs_fc=1,
        adjust_fdr=True,
        min_frac_expressed=0.1,
        de_genes_mode: Literal["ligand", "receptor"] = "ligand",
    ) -> pd.DataFrame:
        """
        Generates a data frame of differential cellphonedb interactions.

        This function will extract all known ligands (or receptors, respectively) from a list of differentially
        expressed genes and find all receptors (or ligands, respectively) that are expressed above a certain
        cutoff in all cell-types.

        Parameters
        ----------
        de_res
            List of differentially expressed genes. It is not modified.
        pvalue_col
            column in de_res that contains the pvalue or false discovery rate
        fc_col
            column in de_res that contains the fold change
        gene_symbol_col
            column in de_res that contains the gene symbol
        max_pvalue
            Genes need a value below this cutoff in `pvalue_col` (or in the computed `fdr` column if
            `adjust_fdr` is True) to be considered significant
        min_abs_fc
            Minimum absolute value in `fc_col` for a gene to be considered significant
        adjust_fdr
            If True, calculate false discovery rate on the pvalue, after filtering for genes that are contained
            in the cellphonedb.
        min_frac_expressed
            Minimum fraction cells that need to express the receptor (or ligand) to be considered a potential interaction
        de_genes_mode
            If the list of de genes provided are ligands (default) or receptors. In case of `ligand`, cell-types
            that express corresponding receptors above the threshold will be identified. In case of `receptor`,
            cell-types that express corresponding ligands above the threshold will be identified.

        Returns
        -------
        One row per combination of expressing cell type, interaction and matching row of `de_res`.

        Raises
        ------
        ValueError
            If `de_genes_mode` is neither "ligand" nor "receptor".
        """
        cpdb_de_col, cpdb_expr_col = self._cpdb_columns(de_genes_mode)

        de_res = de_res.loc[de_res[gene_symbol_col].isin(self.cpdb[cpdb_de_col])]
        if adjust_fdr:
            de_res = fdr_correction(de_res, pvalue_col=pvalue_col, key_added="fdr")
            pvalue_col = "fdr"

        is_significant = (de_res[pvalue_col] < max_pvalue) & (
            np.abs(de_res[fc_col]) >= min_abs_fc
        )
        significant_genes = de_res.loc[is_significant, gene_symbol_col].unique()  # type: ignore
        significant_interactions = self.cpdb.loc[
            self.cpdb[cpdb_de_col].isin(significant_genes)
        ]

        expressed = self.expressed_genes.loc[
            self.expressed_genes["fraction_expressed"] >= min_frac_expressed
        ]
        res_df = (
            expressed.merge(  # type: ignore
                significant_interactions,
                left_on="variable",
                right_on=cpdb_expr_col,
            )
            .drop(columns=["variable"])
            .merge(de_res, left_on=cpdb_de_col, right_on=gene_symbol_col)
            .drop(columns=[gene_symbol_col])
        )

        return res_df

    def plot_result(
        self,
        cpdb_res,
        *,
        pvalue_col="fdr",
        group_col="group",
        fc_col="log2FoldChange",
        title="CPDB analysis",
        aggregate=True,
        clip_fc_at=(-5, 5),
        label_limit=100,
        cluster: Literal["heatmap", "dotplot", None] = "dotplot",
        de_genes_mode: Literal["ligand", "receptor"] = "ligand",
    ):
        """
        Plot cpdb results as heatmap

        Parameters
        ----------
        cpdb_res
            result of `significant_interactions`. May be further filtered or modified.
            The data frame passed in is not modified.
        pvalue_col
            column with the (adjusted) p-values shown in the heatmap
        group_col
            column to be used for the y axis of the heatmap
        fc_col
            column with the fold change that colors the heatmap
        title
            title of the combined chart
        aggregate
            whether to merge multiple targets of the same ligand into a single column
        clip_fc_at
            `(min, max)` the fold change is clipped to before plotting
        label_limit
            maximum length of the axis labels of the dotplot
        cluster
            Order the interactions by hierarchical clustering of the dotplot values (`"dotplot"`)
            or of the heatmap values (`"heatmap"`). `None` keeps ascending order.
        de_genes_mode
            If the list of de genes provided are ligands (default) or receptors. If receptor, will show the dotplot
            at the top (source are expressed ligands) and the de heatmap at the bottom (target are the DE receptors).
            Otherwise the other way round.

        Returns
        -------
        Vertically concatenated altair chart of the DE heatmap and the expression dotplot.
        """
        cpdb_de_col, cpdb_expr_col = self._cpdb_columns(de_genes_mode)

        # Work on a copy: the clipping and the `interaction` column below must not
        # leak into the data frame owned by the caller.
        cpdb_res = cpdb_res.copy()
        cpdb_res[fc_col] = np.clip(cpdb_res[fc_col], *clip_fc_at)

        # aggregate if there are multiple receptors per ligand
        if aggregate:
            group_keys = [
                self.cell_type_column,
                cpdb_de_col,
                fc_col,
                pvalue_col,
                group_col,
            ]
            # all partners of a DE gene, joined into a single label
            partner_labels = cpdb_res.groupby(cpdb_de_col).agg(
                **{
                    cpdb_expr_col: (
                        cpdb_expr_col,
                        lambda x: "|".join(np.unique(x)),
                    )
                }
            )
            cpdb_res = (
                cpdb_res.groupby(group_keys)
                .agg(
                    n=(cpdb_expr_col, len),
                    fraction_expressed=("fraction_expressed", "max"),
                    expr_mean=("expr_mean", "max"),
                )
                .reset_index()
                .merge(partner_labels, on=cpdb_de_col)
            )

        # "<de gene>_<partner>"; the axis label expressions below split on "_"
        cpdb_res["interaction"] = [
            f"{s}_{t}" for s, t in zip(cpdb_res[cpdb_de_col], cpdb_res[cpdb_expr_col])
        ]

        # cluster heatmap
        if cluster is not None:
            from scipy.cluster.hierarchy import leaves_list, linkage

            if cluster == "dotplot":
                _idx, _values = self.cell_type_column, "fraction_expressed"
            else:
                _idx, _values = group_col, fc_col
            values_df = (
                cpdb_res.loc[:, [_idx, _values, "interaction"]]
                .drop_duplicates()
                .pivot(index=_idx, columns="interaction", values=_values)
                .fillna(0)
            )
            order = values_df.columns.values[
                leaves_list(
                    linkage(values_df.values.T, method="average", metric="euclidean")
                )
            ]
        else:
            order = "ascending"

        receptor_mode = de_genes_mode == "receptor"

        # heatmap of the differential expression; labelled with the first part of the interaction
        p1 = plot_lm_result_altair(
            cpdb_res,
            color=fc_col,
            p_col=pvalue_col,
            x="interaction",
            configure=lambda x: x,
            title="",
            order=order,
            p_cutoff=1,
        ).encode(
            x=alt.X(
                title=None,
                axis=alt.Axis(
                    labelExpr="split(datum.label, '_')[0]",
                    orient="top" if receptor_mode else "bottom",
                ),
            )
        )

        # dotplot of the expression of the partner; labelled with the second part of the interaction
        p2 = (
            alt.Chart(cpdb_res)
            .mark_circle()
            .encode(
                x=alt.X(
                    "interaction",
                    axis=alt.Axis(
                        grid=True,
                        orient="bottom" if receptor_mode else "top",
                        title=None,
                        labelExpr="split(datum.label, '_')[1]",
                        labelLimit=label_limit,
                    ),
                    sort=order,
                ),
                y=alt.Y(self.cell_type_column, axis=alt.Axis(grid=True), title=None),
                size=alt.Size("fraction_expressed"),
                color=alt.Color("expr_mean", scale=alt.Scale(scheme="cividis")),
            )
        )

        if receptor_mode:
            p1, p2 = p2, p1

        return (
            alt.vconcat(p1, p2, title=title)
            .resolve_scale(size="independent", color="independent", x="independent")
            .configure_mark(opacity=1)
            .configure_concat(spacing=label_limit - 130)
        )

## 04. Load adata with LIANA ranked information 

In [None]:
# Load the AnnData object from the input directory.
adata = sc.read_h5ad(os.path.join(dataDir, "adata_rank_agregate.h5ad"))

In [None]:
# Show the input directory as cell output.
dataDir

In [None]:
# Show the summary of the loaded AnnData object as cell output.
adata

In [10]:
# Number of cells per sample type.
adata.obs["sample_type"].value_counts()

sample_type
tumor    1557400
Name: count, dtype: int64

In [12]:
# Number of cells per coarse cell type.
adata.obs["cell_type_coarse"].value_counts()

cell_type_coarse
Cancer cell         386113
T cell CD8          246021
T cell CD4          229632
Plasma cell         192909
B cell               98174
Fibroblast           86281
Macrophage           72972
Endothelial cell     59723
Monocyte             39388
Pericyte             33936
Neutrophil           33055
Dendritic cell       21755
Mast cell            18083
NK                   11491
gamma-delta          11191
Schwann cell          6551
Eosinophil            5562
Tuft                  1916
ILC                   1538
Enteroendocrine        817
NKT                    292
Name: count, dtype: int64

### Load DESeq2 results from Valentin

In [None]:
import os
import pandas as pd

# Directory with one DESeq2 result table per cell type,
# named "<cell type>-DESeq2_result.tsv".
folder_path = "/data/projects/2022/CRCA/results/v1/downstream_analyses/de_analysis/de_analysis_paired_tumor_normal_coarse/deseq2_dgea_adhoc"
result_suffix = "-DESeq2_result.tsv"

# One data frame per result file
dataframes = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the combined table reproducible between runs.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(result_suffix):
        continue

    # The cell type is the file name without the suffix. Only the trailing
    # suffix is cut off, so a cell type containing the same text stays intact.
    cell_type = file_name[: -len(result_suffix)]

    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path, sep="\t")

    # Remember which cell type the rows belong to
    df["cell_type_coarse"] = cell_type
    dataframes.append(df)

# Combine the results of all cell types into one table
de_res = pd.concat(dataframes, ignore_index=True)




In [None]:
# The downstream code expects the gene symbol in a column called "gene_id".
de_res = de_res.rename(columns={"symbol": "gene_id"})

In [None]:
# Show the combined DESeq2 results as cell output.
de_res

In [22]:
# Cell types taken from the file names use "_" where a space is wanted.
de_res["cell_type_coarse"] = de_res["cell_type_coarse"].str.replace(
    "_", " ", regex=False
)

In [23]:
# Number of DESeq2 result rows per cell type.
de_res["cell_type_coarse"].value_counts()

cell_type_coarse
Cancer stem like        13493
Cancer non stem like    13003
Endothelial cell        10756
Fibroblast              10657
Pericyte                 9933
Macrophage               9780
T cell CD8               9368
T cell CD4               9246
Dendritic cell           9017
Treg                     8721
B cell                   8152
Plasma cell              7850
Monocyte                 7663
Schwann cell             7220
gamma delta              4712
NK                       4020
Tuft                     3544
Eosinophil               3072
Neutrophil               3027
Enteroendocrine          1541
Name: count, dtype: int64

In [24]:
# Save the combined table to the current working directory.
# The row index is written as an unnamed first column; pass `index_col=0` when
# reading the file back to avoid an extra "Unnamed: 0" column.
de_res.to_csv("combined_DESeq2_results_coarse.tsv", sep="\t")

In [15]:
#de_res = pd.read_csv("combined_DESeq2_results_coarse.tsv", sep="\t")

## CPDB 

In [25]:
# Keep the interactions with a specificity rank of at most 0.01.
cpdb_res = adata.uns["rank_aggregate"].query("specificity_rank <= 0.01")

In [26]:
# Use the column names CpdbAnalysis requires for ligands and receptors.
cpdb_res = cpdb_res.rename(
    columns=dict(
        ligand_complex="source_genesymbol",
        receptor_complex="target_genesymbol",
    )
)

In [27]:
# Computes the per-cell-type expression summaries (cpdba.expressed_genes) on construction.
cpdba = CpdbAnalysis(
    cpdb=cpdb_res,
    adata=adata,
    pseudobulk_group_by=["patient_id"],
    cell_type_column="cell_type_coarse",
)

In [28]:
# Settings for the step-by-step selection of significant interactions below.
# The DE genes are treated as ligands, the expressed partners as receptors.
de_genes_mode = "ligand"
cpdb_de_col, cpdb_expr_col = "source_genesymbol", "target_genesymbol"

# Columns of the DESeq2 table
gene_symbol_col, fc_col, pvalue_col = "gene_id", "log2FoldChange", "pvalue"

# Cutoffs
adjust_fdr = True
max_pvalue, min_abs_fc, min_frac_expressed = 0.1, 1, 0.1

In [29]:
# Keep only the DE results of genes that occur as ligand in the interaction table.
de_res = de_res[de_res[gene_symbol_col].isin(cpdba.cpdb[cpdb_de_col])]

In [30]:
# The correction runs over the rows of all cell types at once, because de_res
# is the concatenation of the per-cell-type tables.
if adjust_fdr:
    de_res = de_res.pipe(fdr_correction, pvalue_col=pvalue_col, key_added="fdr")
    # from here on the cutoff is applied to the adjusted values
    pvalue_col = "fdr"

In [31]:
# Genes passing both the significance and the fold-change cutoff in at least one row.
significant_genes = de_res.loc[
    (de_res[pvalue_col] < max_pvalue) & (de_res[fc_col].abs() >= min_abs_fc),
    gene_symbol_col,
].unique()  # type: ignore

In [32]:
# Interactions whose ligand is one of the significant genes.
significant_interactions = cpdba.cpdb[
    cpdba.cpdb[cpdb_de_col].isin(significant_genes)
]

In [33]:
# 1. cell types expressing a receptor in at least `min_frac_expressed` of cells
# 2. joined with the significant interactions via the receptor
# 3. joined with every DE row of the ligand
# The gene symbol column of de_res is kept in the result.
res_df = pd.merge(
    pd.merge(
        cpdba.expressed_genes[
            cpdba.expressed_genes["fraction_expressed"] >= min_frac_expressed
        ],  # type: ignore
        significant_interactions,
        left_on="variable",
        right_on=cpdb_expr_col,
    ).drop(columns=["variable"]),
    de_res,
    left_on=cpdb_de_col,
    right_on=gene_symbol_col,
)

In [35]:
# Both merge inputs carry a `cell_type_coarse` column, so the merge suffixed them:
# `_x` comes from the expressed genes, `_y` from the DE results.
# NOTE(review): dropping `_y` discards the cell type each DE row was computed for,
# so rows of the same interaction that differ only in their DE statistics can no
# longer be told apart - confirm this is intended, or keep the column under
# another name.
res_df = res_df.drop(columns=["cell_type_coarse_y"]).rename(columns={"cell_type_coarse_x": "cell_type_coarse"})


In [36]:
# Show the merged result table as cell output.
res_df

Unnamed: 0,cell_type_coarse,fraction_expressed,expr_mean,source,target,source_genesymbol,target_genesymbol,lr_means,cellphone_pvals,expr_prod,...,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,weight,comparison,fdr
0,Cancer cell,0.372095,4.412338,Schwann cell,Cancer cell,L1CAM,EPHB2,1.505405,0.0,2.053308,...,L1CAM,100.486810,-0.760285,0.185366,-4.101527,4.104327e-05,1.030519e-03,1.255705,tumor_vs_normal,9.466332e-05
1,Cancer cell,0.372095,4.412338,Schwann cell,Cancer cell,L1CAM,EPHB2,1.505405,0.0,2.053308,...,L1CAM,5.280643,-2.389735,0.182927,-13.063857,5.297408e-39,2.143526e-37,1.277975,tumor_vs_normal,8.028516e-38
2,Cancer cell,0.372095,4.412338,Schwann cell,Cancer cell,L1CAM,EPHB2,1.505405,0.0,2.053308,...,L1CAM,3.934522,0.158226,0.806703,0.196139,8.445017e-01,7.965974e-01,1.113618,tumor_vs_normal,9.018009e-01
3,Cancer cell,0.372095,4.412338,Schwann cell,Cancer cell,L1CAM,EPHB2,1.505405,0.0,2.053308,...,L1CAM,4.894431,1.327248,2.140158,0.620163,5.351503e-01,6.530793e-01,1.000000,tumor_vs_normal,6.260249e-01
4,Cancer cell,0.372095,4.412338,Schwann cell,Enteroendocrine,L1CAM,EPHB2,1.481128,0.0,1.957806,...,L1CAM,100.486810,-0.760285,0.185366,-4.101527,4.104327e-05,1.030519e-03,1.255705,tumor_vs_normal,9.466332e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46630,Mast cell,0.100744,3.336087,Macrophage,Mast cell,IL18,IL1RAPL1,1.095489,0.0,0.774486,...,IL18,23.626388,-0.243703,0.125868,-1.936179,5.284573e-02,1.328068e-01,0.891324,tumor_vs_normal,8.080894e-02
46631,Mast cell,0.100744,3.336087,Macrophage,Mast cell,IL18,IL1RAPL1,1.095489,0.0,0.774486,...,IL18,102.181132,0.013617,0.088214,0.154368,8.773200e-01,8.637789e-01,1.051364,tumor_vs_normal,9.271105e-01
46632,Mast cell,0.100744,3.336087,Macrophage,Mast cell,IL18,IL1RAPL1,1.095489,0.0,0.774486,...,IL18,43.501649,-0.243871,0.088581,-2.753091,5.903552e-03,2.887998e-02,0.714152,tumor_vs_normal,1.045772e-02
46633,Mast cell,0.100744,3.336087,Macrophage,Mast cell,IL18,IL1RAPL1,1.095489,0.0,0.774486,...,IL18,13.347163,-2.329451,0.190255,-12.243861,1.812031e-34,4.126243e-33,1.000000,tumor_vs_normal,2.288528e-33


In [37]:
# Save the merged result table to the current working directory (row index included).
res_df.to_csv("res_df_coarse.csv")

In [None]:
# NOTE(review): this binds a second name to the same DataFrame, it does not copy it.
# Every in-place change made through `res_df` below is also visible through
# `res_df_ss` (and vice versa) - confirm whether `res_df.copy()` was intended.
res_df_ss = res_df

In [197]:
# Collapse fine-grained cell type names of the sending cell into broader groups.
# The result is assigned back to the column: calling `replace(..., inplace=True)`
# on `res_df["source"]` is chained assignment and no longer updates the data
# frame once pandas Copy-on-Write is active.
res_df["source"] = res_df["source"].replace(
    {
        "B cell activated": "B cell",
        "GC B cell": "B cell",
        "B cell memory": "B cell",
        "B cell naive": "B cell",
        "Fibroblast S1": "Fibroblast",
        "Fibroblast S2": "Fibroblast",
        "Fibroblast S3": "Fibroblast",
        "Plasma IgA": "Plasma",
        "Plasma IgG": "Plasma",
        "Plasma IgM": "Plasma",
        "Plasmablast": "Plasma",
        "cDC1": "DC",
        "cDC2": "DC",
        "DC mature": "DC",
        "DC3": "DC",
        "pDC": "DC",
        "Macrophage cycling": "Macrophage",
        "Monocyte classical": "Monocyte",
        "Monocyte non classical": "Monocyte",
        "Monocyte non-classical": "Monocyte",
        "NKT": "NK",
        "CD4 cycling": "CD4",
        "CD8 cycling": "CD8",
        "Colonocyte BEST4": "Colonocyte",
        "gamma-delta": "gamma delta",
        "Endothelial arterial": "Endothelial",
        "Endothelial lymphatic": "Endothelial",
        "Endothelial venous": "Endothelial",
    }
)

In [198]:
# Collapse fine-grained cell type names of the receiving cell into broader groups.
# The result is assigned back to the column: calling `replace(..., inplace=True)`
# on `res_df["target"]` is chained assignment and no longer updates the data
# frame once pandas Copy-on-Write is active.
res_df["target"] = res_df["target"].replace(
    {
        "B cell activated": "B cell",
        "GC B cell": "B cell",
        "B cell memory": "B cell",
        "B cell naive": "B cell",
        "Fibroblast S1": "Fibroblast",
        "Fibroblast S2": "Fibroblast",
        "Fibroblast S3": "Fibroblast",
        "Plasma IgA": "Plasma",
        "Plasma IgG": "Plasma",
        "Plasma IgM": "Plasma",
        "Plasmablast": "Plasma",
        "cDC1": "DC",
        "cDC2": "DC",
        "DC mature": "DC",
        "DC3": "DC",
        "pDC": "DC",
        "Macrophage cycling": "Macrophage",
        "Monocyte classical": "Monocyte",
        "Monocyte non classical": "Monocyte",
        "Monocyte non-classical": "Monocyte",
        "NKT": "NK",
        "CD4 cycling": "CD4",
        "CD8 cycling": "CD8",
        "Colonocyte BEST4": "Colonocyte",
        "gamma-delta": "gamma delta",
        "Endothelial arterial": "Endothelial",
        "Endothelial lymphatic": "Endothelial",
        "Endothelial venous": "Endothelial",
    }
)

In [146]:
# Replace "Granulocyte progenitor" in the 'target' column with the value from 'cell_type_fine'
# of the same row (the assignment aligns on the row index).
# NOTE(review): no cell visible above creates a 'cell_type_fine' column in res_df - confirm it exists.
res_df.loc[res_df['target'] == "Granulocyte progenitor", 'target'] = res_df['cell_type_fine']

In [147]:
# Replace "Granulocyte progenitor" in the 'source' column with the value from 'cell_type_fine'
# of the same row (the assignment aligns on the row index).
# NOTE(review): no cell visible above creates a 'cell_type_fine' column in res_df - confirm it exists.
res_df.loc[res_df['source'] == "Granulocyte progenitor", 'source'] = res_df['cell_type_fine']

In [166]:
# Save the table with grouped cell type names to the current working directory (row index included).
res_df.to_csv("res_df_lastest_25nov_grouped.csv")

In [196]:
# Collapse fine-grained cell type names into broader groups.
# The result is assigned back to the column: calling `replace(..., inplace=True)`
# on `res_df_ss["cell_type_fine"]` is chained assignment and no longer updates
# the data frame once pandas Copy-on-Write is active.
# NOTE(review): unlike the 'source'/'target' mappings, this one has no entries
# for "Monocyte non-classical" and "gamma-delta" - confirm this is intended.
res_df_ss["cell_type_fine"] = res_df_ss["cell_type_fine"].replace(
    {
        "B cell activated": "B cell",
        "GC B cell": "B cell",
        "B cell memory": "B cell",
        "B cell naive": "B cell",
        "Fibroblast S1": "Fibroblast",
        "Fibroblast S2": "Fibroblast",
        "Fibroblast S3": "Fibroblast",
        "Plasma IgA": "Plasma",
        "Plasma IgG": "Plasma",
        "Plasma IgM": "Plasma",
        "Plasmablast": "Plasma",
        "cDC1": "DC",
        "cDC2": "DC",
        "DC mature": "DC",
        "DC3": "DC",
        "pDC": "DC",
        "Macrophage cycling": "Macrophage",
        "Monocyte classical": "Monocyte",
        "Monocyte non classical": "Monocyte",
        "NKT": "NK",
        "CD4 cycling": "CD4",
        "CD8 cycling": "CD8",
        "Colonocyte BEST4": "Colonocyte",
        "Endothelial arterial": "Endothelial",
        "Endothelial lymphatic": "Endothelial",
        "Endothelial venous": "Endothelial",
    }
)

In [197]:
# Collapse fine-grained cell type names of the sending cell into broader groups.
# The result is assigned back to the column: calling `replace(..., inplace=True)`
# on `res_df_ss["source"]` is chained assignment and no longer updates the data
# frame once pandas Copy-on-Write is active.
res_df_ss["source"] = res_df_ss["source"].replace(
    {
        "B cell activated": "B cell",
        "GC B cell": "B cell",
        "B cell memory": "B cell",
        "B cell naive": "B cell",
        "Fibroblast S1": "Fibroblast",
        "Fibroblast S2": "Fibroblast",
        "Fibroblast S3": "Fibroblast",
        "Plasma IgA": "Plasma",
        "Plasma IgG": "Plasma",
        "Plasma IgM": "Plasma",
        "Plasmablast": "Plasma",
        "cDC1": "DC",
        "cDC2": "DC",
        "DC mature": "DC",
        "DC3": "DC",
        "pDC": "DC",
        "Macrophage cycling": "Macrophage",
        "Monocyte classical": "Monocyte",
        "Monocyte non classical": "Monocyte",
        "Monocyte non-classical": "Monocyte",
        "NKT": "NK",
        "CD4 cycling": "CD4",
        "CD8 cycling": "CD8",
        "Colonocyte BEST4": "Colonocyte",
        "gamma-delta": "gamma delta",
        "Endothelial arterial": "Endothelial",
        "Endothelial lymphatic": "Endothelial",
        "Endothelial venous": "Endothelial",
    }
)

In [198]:
# Collapse fine-grained cell type names of the receiving cell into broader groups.
# The result is assigned back to the column: calling `replace(..., inplace=True)`
# on `res_df_ss["target"]` is chained assignment and no longer updates the data
# frame once pandas Copy-on-Write is active.
res_df_ss["target"] = res_df_ss["target"].replace(
    {
        "B cell activated": "B cell",
        "GC B cell": "B cell",
        "B cell memory": "B cell",
        "B cell naive": "B cell",
        "Fibroblast S1": "Fibroblast",
        "Fibroblast S2": "Fibroblast",
        "Fibroblast S3": "Fibroblast",
        "Plasma IgA": "Plasma",
        "Plasma IgG": "Plasma",
        "Plasma IgM": "Plasma",
        "Plasmablast": "Plasma",
        "cDC1": "DC",
        "cDC2": "DC",
        "DC mature": "DC",
        "DC3": "DC",
        "pDC": "DC",
        "Macrophage cycling": "Macrophage",
        "Monocyte classical": "Monocyte",
        "Monocyte non classical": "Monocyte",
        "Monocyte non-classical": "Monocyte",
        "NKT": "NK",
        "CD4 cycling": "CD4",
        "CD8 cycling": "CD8",
        "Colonocyte BEST4": "Colonocyte",
        "gamma-delta": "gamma delta",
        "Endothelial arterial": "Endothelial",
        "Endothelial lymphatic": "Endothelial",
        "Endothelial venous": "Endothelial",
    }
)

In [146]:
# Replace "Granulocyte progenitor" in the 'target' column with the value from 'cell_type_fine'
# of the same row (the assignment aligns on the row index).
res_df_ss.loc[res_df_ss['target'] == "Granulocyte progenitor", 'target'] = res_df_ss['cell_type_fine']

In [147]:
# Replace "Granulocyte progenitor" in the 'source' column with the value from 'cell_type_fine'
# of the same row (the assignment aligns on the row index).
res_df_ss.loc[res_df_ss['source'] == "Granulocyte progenitor", 'source'] = res_df_ss['cell_type_fine']

In [200]:
# Save the table after grouping the cell type names, including the endothelial
# subtypes, to the current working directory (row index included).
res_df_ss.to_csv("res_df_ss_lastest_25nov_grouped.csv") # grouped the endothelial

In [208]:
# Cell types that are excluded from the subset
values_to_remove = [
    "Plasma",
    "Schwann cell",
    "TA progenitor",
    "Tuft",
    "Goblet",
    "ILC",
    "Crypt cell",
    "Enteroendocrine",
    "gamma delta",
]

# Keep the rows whose 'cell_type_fine' is not in the list
res_df_ss = res_df_ss.loc[~res_df_ss["cell_type_fine"].isin(values_to_remove)]


In [210]:
# Save the subset without the cell types listed in `values_to_remove`
# to the current working directory (row index included).
res_df_ss.to_csv("res_df_ss_lastest_25nov_grouped_subset.csv") # subset  'Plasma', 'Schwann cell', 'TA progenitor', 'Tuft', 'Goblet','ILC', 'Crypt cell', 'Enteroendocrine','gamma delta'

## 07. Cell-cell interactions heatmap 

In [None]:
# Cell types kept for the heatmap; used below to filter on 'cell_type_fine'.
# NOTE(review): despite the name, the list also contains "Cancer cell" and
# "Endothelial" - confirm the selection and whether the names match the
# grouped labels used above (e.g. "DC", "Plasma").
immune_cells = [
    "Cancer cell",
    "B cell",
    "Endothelial",
    "DC mature",
    "Macrophage",
    "Mast cell",
    "Monocyte",
    "NK",
    "Neutrophil",
    "Plasma cell",
    "T cell CD4",
    "T cell CD8",
    "T cell regulatory",
    "cDC",
    "pDC",
]

In [None]:
# Keep the interactions whose 'cell_type_fine' is one of the selected cell types.
# NOTE(review): `cpdb_sig_int` is not assigned in any cell visible above - confirm where it
# comes from (presumably the result of `CpdbAnalysis.significant_interactions`).
cpdb_sig_int = cpdb_sig_int.loc[lambda x: x["cell_type_fine"].isin(immune_cells)]

In [None]:
# Ligands of the 30 (gene, fdr) pairs with the smallest fdr. A gene with several
# distinct fdr values can occur more than once in the list.
top_genes = (
    cpdb_sig_int[["source_genesymbol", "fdr"]]
    .drop_duplicates()
    .sort_values("fdr")
    .loc[:, "source_genesymbol"]
    .head(30)
    .tolist()
)

In [None]:
# Title of the heatmap.
# NOTE(review): `perturbation`, `baseline` and `cell_type_oi` are not assigned in any cell
# visible above - define them before running this cell.
title_plot = f"{perturbation} vs {baseline}: {cell_type_oi}, FDR<0.1"

In [None]:
# File name stem used when saving the heatmap.
# NOTE(review): `perturbation`, `baseline` and `cell_type_oi` are not assigned in any cell
# visible above - define them before running this cell.
save_name_plot = f"{perturbation}_vs_{baseline}_{cell_type_oi}_fdr_0.1"

In [None]:
# Plot the interactions of the top ligands: one column per ligand/receptor pair
# (aggregate=False), ordered by clustering the fold changes (cluster="heatmap").
# NOTE(review): plot_result groups the heatmap rows by its `group_col` (default "group");
# confirm that `cpdb_sig_int` has such a column.
heatmap = cpdba.plot_result(
    cpdb_sig_int.loc[lambda x: x["source_genesymbol"].isin(top_genes)],
    title=title_plot,
    aggregate=False,
    cluster="heatmap",
    label_limit=110,
)
# Show the chart as cell output.
heatmap

In [None]:
# Save the heatmap as png, svg and pdf below "<resDir>/figures".
figure_dir = f"{resDir}/figures"
# The directory is not created anywhere else; without it the first save fails.
os.makedirs(figure_dir, exist_ok=True)
for extension in ("png", "svg", "pdf"):
    heatmap.save(f"{figure_dir}/{save_name_plot}.{extension}")