In [None]:
import pandas as pd
import scanpy as sc
import seaborn as sns

In [None]:
#from statannot import add_stat_annotation
#from statannotations.Annotator import Annotator

In [None]:
import matplotlib.pyplot as plt

In [None]:
import decoupler as dc

### Data loading

In [None]:
# Location of the mapped TIL/colon AnnData file that the rest of the notebook works on.
mapped_h5ad_path = "/data/projects/2021/MicrobialMetabolites/single-cell-sorted-cd8/results/40_gex_surface_prot/13_january_2025/mapped_data/mapped_til_colon.h5ad"
adata = sc.read_h5ad(mapped_h5ad_path)

In [None]:
# Relabel the 'origin' column: "colon" -> "normal", "til" -> "tumor".
# The mapping is applied on plain object values because Series.replace that
# introduces new labels on a categorical column is deprecated in pandas 2.2.
# If the column was categorical, the dtype is restored afterwards (categories
# are rebuilt from the labels that are actually present).
origin_labels = {"colon": "normal", "til": "tumor"}
origin = adata.obs["origin"]
origin_was_categorical = isinstance(origin.dtype, pd.CategoricalDtype)
origin = origin.astype(object).replace(origin_labels)
adata.obs["origin"] = origin.astype("category") if origin_was_categorical else origin


In [None]:
# Cells without a cell-type label are treated as "CD8_NaiveLike". Both the
# literal string "nan" and true missing values (NaN) are covered.
#
# The result is assigned back to adata.obs explicitly: calling
# `adata.obs["cell_type"].fillna(..., inplace=True)` acts on a temporary
# selection and is not guaranteed to update adata.obs (chained assignment,
# a no-op under pandas copy-on-write). Working on object values also avoids
# the error a categorical column raises when the fill label is not yet one
# of its categories; the categorical dtype is restored afterwards.
cell_type = adata.obs["cell_type"]
cell_type_was_categorical = isinstance(cell_type.dtype, pd.CategoricalDtype)
cell_type = (
    cell_type.astype(object)
    .replace("nan", "CD8_NaiveLike")
    .fillna("CD8_NaiveLike")
)
adata.obs["cell_type"] = (
    cell_type.astype("category") if cell_type_was_categorical else cell_type
)

In [None]:
# Flag Ifng-positive cells and derive a cell-type column that singles them out.
# The gene is looked up in adata.var_names by its Ensembl ID.
IFNG_GENE_ID = 'ENSMUSG00000055170'
# A cell counts as Ifng+ when its value in adata.X is strictly above this cut-off.
IFNG_POSITIVE_THRESHOLD = 6

if IFNG_GENE_ID in adata.var_names:
    # Slice the gene column once; densify it when X exposes `toarray`.
    ifng_x = adata[:, IFNG_GENE_ID].X
    if hasattr(ifng_x, "toarray"):
        ifng_x = ifng_x.toarray()
    ifng_expression = ifng_x.flatten()

    # Boolean array in the same row order as adata.obs.
    is_ifng_pos = ifng_expression > IFNG_POSITIVE_THRESHOLD

    # 1 = Ifng positive, 0 = Ifng negative
    adata.obs['Ifng_expression'] = is_ifng_pos.astype(int)

    # 'cell_type_2' is 'Ifng+' for positive cells and the original cell type
    # otherwise. A vectorised mask replaces the row-wise apply; the cast to
    # object lets 'Ifng+' be written even when 'cell_type' is categorical.
    adata.obs['cell_type_2'] = (
        adata.obs['cell_type'].astype(object).mask(is_ifng_pos, 'Ifng+')
    )
else:
    # NOTE(review): the cells below read 'Ifng_expression' and 'cell_type_2'
    # and will fail with a KeyError if this branch is taken.
    print("Gene 'Ifng' not found in adata.var_names")

In [None]:
# String version of the 0/1 Ifng flag: 1 -> "pos", 0 -> "neg".
ifng_flag_labels = {1: "pos", 0: "neg"}
adata.obs["Ifng_expression_str"] = adata.obs["Ifng_expression"].map(ifng_flag_labels)


In [None]:
# Number of cells per Ifng status label.
ifng_status_counts = adata.obs["Ifng_expression_str"].value_counts()
ifng_status_counts

In [None]:
# Number of cells per 'cell_type_2' label.
cell_type_2_counts = adata.obs["cell_type_2"].value_counts()
cell_type_2_counts

In [None]:
# Number of cells per condition.
adata.obs["condition"].value_counts()

In [None]:
# Number of cells per cell type.
adata.obs["cell_type"].value_counts()

In [None]:
# Restrict the data to the cells labelled 'Ifng+' in 'cell_type_2'.
is_ifng_cell = adata.obs["cell_type_2"] == "Ifng+"
adata_ifng = adata[is_ifng_cell]

In [None]:
import scanpy as sc
import decoupler as dc

# Only needed for processing
import numpy as np
import pandas as pd

# Needed for some plotting
import matplotlib.pyplot as plt

In [None]:
# Build the filtered pseudo-bulk profile of the Ifng+ cells: values from the
# 'counts' layer are summed per sample_id / cell_type combination, with
# min_cells and min_counts passed to decoupler as the filtering thresholds.
pseudobulk_options = {
    "sample_col": 'sample_id',
    "groups_col": 'cell_type',
    "layer": 'counts',
    "mode": 'sum',
    "min_cells": 10,
    "min_counts": 1000,
}
pdata = dc.get_pseudobulk(adata_ifng, **pseudobulk_options)
pdata

In [None]:
# Keep only the pseudobulk samples from the 10mix and 11mix conditions.
# `.copy()` turns the subset into an independent AnnData object instead of a
# view of the full pseudobulk, so later steps that store results on it (the
# DESeq2 dataset is built from pdata) do not operate on a view.
pdata = pdata[pdata.obs["condition"].isin(["10mix", "11mix"])].copy()

In [None]:
#pdata.var_names = pdata.var.gene_name

In [None]:
# Display the sample-level metadata of the condition-filtered pseudobulk object.
pdata.obs

In [None]:
# Import DESeq2
from pydeseq2.dds import DeseqDataSet, DefaultInference
from pydeseq2.ds import DeseqStats

In [None]:
# Set up the DESeq2 dataset on the pseudobulk counts.
# The design uses 'condition' and 'origin'; '10mix' is the reference level
# of 'condition'.
deseq_n_cpus = 8
inference = DefaultInference(n_cpus=deseq_n_cpus)
dds_options = {
    "design_factors": ['condition', 'origin'],
    "ref_level": ['condition', '10mix'],
    "refit_cooks": True,
}
dds = DeseqDataSet(adata=pdata, inference=inference, **dds_options)

In [None]:
# Compute LFCs (log fold changes) by running the DESeq2 fit on the dataset
# built above. The fitted `dds` is what the contrast below is computed from.
dds.deseq2()

In [None]:
# Extract the contrast on the 'condition' factor: 11mix vs 10mix
# (10mix is the reference level set when building the dataset).
stat_res = DeseqStats(
    dds,
    contrast=["condition", '11mix', '10mix'],
    inference=inference,
)

In [None]:
# Compute Wald test for the contrast defined above. The per-gene results are
# read from stat_res.results_df in the next cell.
stat_res.summary()

In [None]:
# Extract the per-gene results table of the 11mix vs 10mix contrast and display it.
# Later cells read its 'log2FoldChange' and 'padj' columns.
results_df = stat_res.results_df
results_df

In [None]:
# Attach the gene names from pdata.var to the DESeq2 results (left join on the
# index, so every result row is kept). The table is derived from
# stat_res.results_df rather than from results_df itself, so re-running this
# cell after the index has been changed still gives the same result.
results_df = stat_res.results_df.join(pdata.var[["gene_name"]], how="left")

In [None]:
# Use the 'gene_name' column as the index of the results table.
results_df = results_df.set_index("gene_name")


In [None]:
# Volcano plot of the contrast results: log2 fold change on x, adjusted
# p-value on y, using a significance threshold of 0.05 and a fold-change
# threshold of 0.5.
volcano_options = {
    "x": 'log2FoldChange',
    "y": 'padj',
    "sign_thr": 0.05,
    "lFCs_thr": 0.5,
    "top": 50,
    "figsize": (4, 4),
}
dc.plot_volcano_df(results_df, **volcano_options)

In [None]:
# Thresholds for calling a gene significant.
sign_thr = 0.05  # upper bound (exclusive) on the adjusted p-value
lFCs_thr = 0.5   # lower bound (exclusive) on the absolute log2 fold change

# Keep the rows of results_df that satisfy both criteria.
passes_padj = results_df["padj"] < sign_thr
passes_lfc = results_df["log2FoldChange"].abs() > lFCs_thr
significant_genes_df = results_df.loc[passes_padj & passes_lfc]


In [None]:
# Display the genes that passed both the adjusted p-value and fold-change thresholds.
significant_genes_df

In [None]:
# Index labels of the significant genes as a plain Python list.
significant_genes_df.index.tolist()

In [None]:
import pandas as pd

# Requires the pseudobulk object `pdata` from the cells above.
# Sample annotations (metadata) of the pseudobulk object.
metadata_df = pdata.obs

# Read counts as a DataFrame with one row per pdata observation and one
# column per pdata variable; X is densified first when it exposes `toarray`.
counts_matrix = pdata.X
if hasattr(counts_matrix, "toarray"):
    counts_matrix = counts_matrix.toarray()
read_counts_df = pd.DataFrame(
    counts_matrix,
    index=pdata.obs_names,
    columns=pdata.var_names,
)

# Write the sample annotations to CSV, index column included.
metadata_df.to_csv("sample_annotations.csv", index=True)
# The read counts are written to TSV further down, after being reshaped.




In [None]:
# Build the genes x samples count table directly from pdata instead of
# transposing read_counts_df in place: an in-place transpose flips the table
# back on every re-run of this cell, after which the insert below fails.
counts_matrix = pdata.X.toarray() if hasattr(pdata.X, "toarray") else pdata.X
read_counts_df = pd.DataFrame(
    counts_matrix.T,
    index=pdata.var_names,
    columns=pdata.obs_names,
)
# Put the gene names from pdata.var["gene_name"] in the first column.
read_counts_df.insert(0, "gene_name", pdata.var["gene_name"].values)

In [None]:
# Label the row index of the count table as 'gene_id'.
read_counts_df.index.name = "gene_id"

In [None]:
# Display the count table (gene_name column first) before it is written to disk.
read_counts_df

In [None]:

# Write the read counts as a tab-separated file; the index (gene IDs) is
# written as the first column, which is the to_csv default.
read_counts_df.to_csv("read_counts.tsv", sep="\t")