In [None]:
import pandas as pd
import numpy as np
import scanpy as sc

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import pegasus as pg
import ddqc

In [None]:
from statannotations.Annotator import Annotator

In [None]:
def pretty_ax(ax):
    """Apply the notebook's minimal axis styling to ``ax`` in place.

    The top and right spines are hidden, the bottom and left spines are drawn
    with a line width of 1.5, and tick marks are kept on the bottom edge only
    while tick labels stay on both the bottom and left axes.
    """
    for hidden_side in ("right", "top"):
        ax.spines[hidden_side].set_visible(False)

    tick_options = {
        "axis": "both",
        "which": "both",
        "bottom": True,
        "top": False,
        "left": False,
        "labelbottom": True,
        "labelleft": True,
    }
    ax.tick_params(**tick_options)

    for kept_side in ("bottom", "left"):
        ax.spines[kept_side].set_linewidth(1.5)

# Run DDQC

Method described here: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02820-w

In [None]:
# Load the dataset with Pegasus for the DDQC branch of the comparison.
# The path is a placeholder and must be replaced; the same file is read again
# with Scanpy further down for the threshold-based branch.
data1 = pg.read_input("/add/path/here/Pancreas_Steele_10X.h5ad", genome = 'hg38')

In [None]:
# Run ddqc on the loaded data and keep what it returns (requested via
# return_df_qc=True) as df_qc.
# NOTE(review): df_qc is not used again in this notebook — presumably kept for inspection.
df_qc = ddqc.ddqc_metrics(data1, return_df_qc=True)

In [None]:
# Apply the filtering to data1 in place with Pegasus.
# NOTE(review): presumably this removes the cells ddqc marked as failing QC in
# the previous cell — confirm against the ddqc documentation.
pg.filter_data(data1)

In [None]:
# Save the DDQC-filtered data; this file is read back as adata_DDQC in the
# comparison section.
pg.write_output(data1, "/add/path/here/DDQC_data.h5ad")

# Run regular Scanpy filtering

In [None]:
# Re-read the same input file with Scanpy so the threshold-based filtering
# starts from the unfiltered data (placeholder path, replace before running).
adata = sc.read_h5ad("/add/path/here/Pancreas_Steele_10X.h5ad")

In [None]:
# De-duplicate gene names in place so that name-based selection below is unambiguous.
adata.var_names_make_unique()

In [None]:
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# ribosomal genes
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
adata.var["hb"] = adata.var_names.str.contains("^HB[^(P)]")

In [None]:
# Compute QC metrics in place (inplace=True), including metrics for the
# mt / ribo / hb gene groups flagged above. The columns n_genes_by_counts,
# total_counts and pct_counts_mt used in the following cells come from this call.
sc.pp.calculate_qc_metrics(
    adata, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True
)

In [None]:
# Visual check of the three QC metrics before filtering: one violin panel per
# metric (multi_panel=True), with jittered points overlaid.
sc.pl.violin(
    adata,
    ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Total counts vs. number of detected genes per cell, coloured by the
# mitochondrial count percentage.
sc.pl.scatter(adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

In [None]:
# Fixed-threshold filtering: cells need at least 100 detected genes, then
# genes need to be detected in at least 3 cells. Both calls modify adata in place.
sc.pp.filter_cells(adata, min_genes=100)
sc.pp.filter_genes(adata, min_cells=3)

In [None]:
# Keep cells whose mitochondrial count percentage is below 15.
# .copy() turns the subset into an independent object instead of a view.
adata = adata[adata.obs.pct_counts_mt < 15, :].copy()

In [None]:
# Doublet detection with Scrublet, using the obs column "sample" as batch key.
# The predicted_doublet column used in the next cell is produced by this call.
sc.pp.scrublet(adata, batch_key="sample")

In [None]:
# Drop the cells flagged as predicted doublets; .copy() materialises the subset.
adata = adata[~adata.obs.predicted_doublet].copy()

In [None]:
# Save the threshold-filtered data; this file is read back as adata_scanpy in
# the comparison section.
adata.write_h5ad("/add/path/here/Scanpy_data.h5ad")

# Compare filtering

## Load downloaded signatures

In [None]:
# Parse the KEGG gene-set library into {term: [gene, ...]}.
# Each line is tab-separated: the term name, a second field that is skipped,
# then the member genes.
kegg = {}
with open("/add/path/here/KEGG_2021_Human.txt", "r") as f:
    for line in f:
        # Strip only the line ending; a fixed [2:-1] slice would drop the last
        # gene whenever a line has no trailing tab before the newline.
        vals = line.rstrip("\r\n").split("\t")
        if not vals[0]:
            # Blank line: nothing to register.
            continue
        # Empty fields (e.g. from a trailing tab) are not gene names.
        kegg[vals[0]] = [gene for gene in vals[2:] if gene]

In [None]:
# Marker table for the PDAC states; the columns "cluster" and "gene" are used below.
state_sig_df = pd.read_csv("/add/path/here/auxiliary_data/PDAC_states_markers.txt", sep="\t")

# Map every cluster label to the array of genes listed for it, keeping the
# order in which the labels first appear in the table.
state_sigs = {
    label: state_sig_df.loc[state_sig_df["cluster"] == label, "gene"].to_numpy()
    for label in state_sig_df["cluster"].unique()
}

## Compute signature scores

In [None]:
def normalize_adatas(adata):
    """Normalise and log-transform ``adata`` in place.

    A copy of the incoming ``adata.X`` is stored in ``adata.layers["counts"]``
    first; ``adata`` is then passed to ``sc.pp.normalize_total`` with a target
    sum of 10000 and to ``sc.pp.log1p``.
    """
    untouched_matrix = adata.X.copy()
    adata.layers["counts"] = untouched_matrix

    sc.pp.normalize_total(adata, target_sum=10000)
    sc.pp.log1p(adata)

In [None]:
def score_xenobiotic_metabolism(adata):
    """Score four xenobiotic-metabolism gene sets on ``adata`` with ``sc.tl.score_genes``.

    Three sets are built from gene-name prefixes found in ``adata.var_names``
    ("CYP"; "UGT" or "B3GAT"; "GST"); the fourth is the 'ABC transporters'
    entry of the module-level ``kegg`` dictionary. Scores are requested under
    the names "CYP genes", "UGT genes", "GST genes" and "ABC transporters".
    """
    gene_names = adata.var_names

    # (score name, prefix or tuple of prefixes), scored in this order.
    prefix_sets = (
        ("CYP genes", "CYP"),
        ("UGT genes", ("UGT", "B3GAT")),
        ("GST genes", "GST"),
    )
    for label, prefix in prefix_sets:
        members = gene_names[gene_names.str.startswith(prefix)]
        sc.tl.score_genes(adata, gene_list=members, score_name=label)

    # The transporter set comes from the KEGG library rather than a prefix match.
    sc.tl.score_genes(adata, gene_list=kegg['ABC transporters'],
                      score_name="ABC transporters")

In [None]:
def score_states(adata, state_sigs):
    """Score every signature in ``state_sigs`` on ``adata``.

    ``state_sigs`` maps a state name to its gene list; each entry is passed to
    ``sc.tl.score_genes`` with the state name as the score name.
    """
    for state_name, signature_genes in state_sigs.items():
        sc.tl.score_genes(adata, gene_list=signature_genes, score_name=state_name)


In [None]:
# Load the three filtered versions of the dataset to compare (placeholder paths).
# Scanpy_data.h5ad and DDQC_data.h5ad are written earlier in this notebook.
# NOTE(review): Ours_adata.h5ad is not produced here — presumably it comes from a separate pipeline.
adata_ours = sc.read_h5ad("/add/path/here/Ours_adata.h5ad")
adata_scanpy = sc.read_h5ad("/add/path/here/Scanpy_data.h5ad")
adata_DDQC = sc.read_h5ad("/add/path/here/DDQC_data.h5ad")

In [None]:
# De-duplicate gene names in each object (in place) before any name-based gene selection.
adata_ours.var_names_make_unique()
adata_scanpy.var_names_make_unique()
adata_DDQC.var_names_make_unique()

In [None]:
# Put the stored "counts" layer back into X so that normalize_adatas below
# starts from it.
# NOTE(review): assumes this layer holds raw counts and that X had already been transformed — confirm.
adata_ours.X = adata_ours.layers["counts"].copy()

In [None]:
# Shorten every cell name to its first two "-"-separated fields.
# NOTE(review): presumably this aligns the names with the other two objects;
# it assumes the shortened names are still unique — confirm.
list_idx = ["-".join(cell_name.split("-")[:2]) for cell_name in adata_ours.obs_names]

adata_ours.obs_names = list_idx

In [None]:
# Give the three filtered datasets the same normalisation, in place.
for filtered_adata in (adata_ours, adata_scanpy, adata_DDQC):
    normalize_adatas(filtered_adata)

In [None]:
# Score the xenobiotic-metabolism gene sets on each filtered dataset.
for filtered_adata in (adata_ours, adata_scanpy, adata_DDQC):
    score_xenobiotic_metabolism(filtered_adata)

In [None]:
# Gather the xenobiotic-metabolism scores of the malignant cells from each
# filtered dataset, tagged with the filtering strategy they came from.
score_columns = ["CYP genes", "UGT genes", "GST genes", "ABC transporters"]

# .assign() returns a new frame; writing the column into the .loc slice of
# adata.obs directly can raise pandas' SettingWithCopyWarning.
df1 = adata_ours.obs.loc[adata_ours.obs.cell_type == "Malignant", score_columns].assign(Filtering="Ours")
df2 = adata_scanpy.obs.loc[adata_scanpy.obs.cell_type == "Malignant", score_columns].assign(Filtering="Thresholds")
df3 = adata_DDQC.obs.loc[adata_DDQC.obs.cell_type == "Malignant", score_columns].assign(Filtering="DDQC")

# Stack the three groups and drop cells with a missing value in any column.
CYP_comparison = pd.concat([df1, df2, df3]).dropna()

In [None]:
# Box plots of two xenobiotic-metabolism scores per filtering strategy, with a
# Mann-Whitney test (no multiple-comparison correction) for every pair of
# strategies, shown as significance stars.
pairs = [("Ours", "Thresholds"), ("Ours", "DDQC"), ("Thresholds", "DDQC")]
list_show = ["UGT genes", "ABC transporters"]

fig, axes = plt.subplots(1, 2, figsize=(4, 3))

# Same annotation settings for every panel.
annotator_settings = dict(
    test="Mann-Whitney",
    loc="inside",
    text_format="star",
    show_test_name=False,
    verbose=2,
    comparisons_correction=None,
    fontsize=10,
)

for panel, score_name in zip(axes.flatten(), list_show):
    sns.boxplot(data=CYP_comparison, x="Filtering", y=score_name, ax=panel)
    pretty_ax(panel)
    # Re-apply the current ticks and labels, rotated so they do not overlap.
    panel.set_xticks(panel.get_xticks(), panel.get_xticklabels(), rotation=45, ha="right")
    panel.set_xlabel("")

    annot = Annotator(panel, pairs=pairs, data=CYP_comparison, x="Filtering", y=score_name)
    annot.configure(**annotator_settings)
    annot.apply_test()
    _, test_results = annot.annotate()

fig.tight_layout()

In [None]:
# Score every PDAC state signature on each filtered dataset.
for filtered_adata in (adata_ours, adata_scanpy, adata_DDQC):
    score_states(filtered_adata, state_sigs)

In [None]:
# State names, used below to select the matching score columns from obs
# (score_states uses each state name as its score name).
list_states = list(state_sigs.keys())

In [None]:
# Gather the state scores of the malignant cells from each filtered dataset,
# tagged with the filtering strategy they came from.
# .assign() returns a new frame; writing the column into the .loc slice of
# adata.obs directly can raise pandas' SettingWithCopyWarning.
df1 = adata_ours.obs.loc[adata_ours.obs.cell_type == "Malignant", list_states].assign(Filtering="Ours")
df2 = adata_scanpy.obs.loc[adata_scanpy.obs.cell_type == "Malignant", list_states].assign(Filtering="Thresholds")
df3 = adata_DDQC.obs.loc[adata_DDQC.obs.cell_type == "Malignant", list_states].assign(Filtering="DDQC")

# Stack the three groups and drop cells with a missing value in any column.
state_comparison = pd.concat([df1, df2, df3]).dropna()

In [None]:
# Box plots of two state scores per filtering strategy, with a Mann-Whitney
# test (no multiple-comparison correction) for every pair of strategies,
# shown as significance stars.
pairs = [("Ours", "Thresholds"), ("Ours", "DDQC"), ("Thresholds", "DDQC")]
sublist = ["Ductal cell TFF1", "Ductal cell MALAT1"]

fig, axes = plt.subplots(1, 2, figsize=(4, 3))

# Same annotation settings for every panel.
annotator_settings = dict(
    test="Mann-Whitney",
    loc="inside",
    text_format="star",
    show_test_name=False,
    verbose=2,
    comparisons_correction=None,
    fontsize=10,
)

for panel, state_name in zip(axes.flatten(), sublist):
    sns.boxplot(data=state_comparison, x="Filtering", y=state_name, ax=panel)
    pretty_ax(panel)
    # Re-apply the current ticks and labels, rotated so they do not overlap.
    panel.set_xticks(panel.get_xticks(), panel.get_xticklabels(), rotation=45, ha="right")
    panel.set_xlabel("")

    annot = Annotator(panel, pairs=pairs, data=state_comparison, x="Filtering", y=state_name)
    annot.configure(**annotator_settings)
    annot.apply_test()
    _, test_results = annot.annotate()

fig.tight_layout()