In [None]:
import scanpy as sc
import pandas as pd
import seaborn as sns

In [None]:
from statannot import add_stat_annotation
from statannotations.Annotator import Annotator

In [None]:
adata = sc.read_h5ad("/data/projects/2021/MicrobialMetabolites/single-cell-sorted-cd8/results/40_gex_surface_prot/13_january_2025/mapped_data/mapped_til_colon.h5ad")

In [None]:
# Cells whose label is the literal string "nan" are relabelled as CD8_NaiveLike.
adata.obs["cell_type"] = adata.obs["cell_type"].replace("nan", "CD8_NaiveLike")

In [None]:
# Rename the "colon" origin to "normal".
adata.obs["origin"] = adata.obs["origin"].replace("colon", "normal")

In [None]:
# Rename the "til" origin to "tumor".
adata.obs["origin"] = adata.obs["origin"].replace("til", "tumor")

In [None]:
# Cells of normal origin. Taken as an independent copy rather than a view,
# because a later cell assigns to adata_normal.obs and writing into a view
# forces AnnData to copy it implicitly (with a warning).
adata_normal = adata[adata.obs["origin"] == "normal"].copy()

In [None]:
# Cells of tumor origin. Taken as an independent copy rather than a view,
# because a later cell assigns to adata_tumor.obs and writing into a view
# forces AnnData to copy it implicitly (with a warning).
adata_tumor = adata[adata.obs["origin"] == "tumor"].copy()

In [None]:
# Identifier under which Ifng is looked up in adata.var_names.
IFNG_GENE_ID = 'ENSMUSG00000055170'
# A cell is called Ifng-positive when its expression value is strictly above this cutoff.
IFNG_POS_THRESHOLD = 7

if IFNG_GENE_ID in adata.var_names:
    # Slice the gene column once; densify only when the matrix exposes `toarray`.
    ifng_matrix = adata[:, IFNG_GENE_ID].X
    ifng_expression = (
        ifng_matrix.toarray().flatten()
        if hasattr(ifng_matrix, "toarray")
        else ifng_matrix.flatten()
    )

    # 1 = above the cutoff, 0 = at or below it.
    adata.obs['Ifng_expression'] = (ifng_expression > IFNG_POS_THRESHOLD).astype(int)

    # Human-readable version of the same flag ('pos' / 'neg').
    adata.obs['ifng'] = adata.obs['Ifng_expression'].map({1: 'pos', 0: 'neg'})
else:
    # NOTE: later cells read adata.obs['ifng'] and will fail if this branch is taken.
    print("Gene 'Ifng' not found in adata.var_names")


In [None]:
adata.obs.sample_id.value_counts()

In [None]:
adata.obs.origin.value_counts()

In [None]:
#adata = adata[adata.obs["sample_id"].isin(["10mix1","10mix2","11mix1","11mix2","10mix_ICI1","10mix_ICI2","11mix_ICI1","11mix_ICI2"])]

In [None]:
#adata = adata[adata.obs["sample_id"].isin(["GF1","GF2","11mix1","11mix2","GF_ICI1","GF_ICI2","GF_ICI1_plus","GF_ICI2_plus"])]

In [None]:
#adata_normal = adata[adata.obs['sample_id'].isin(["10mix1", "11mix1", "10mix2", "11mix2","GF1","GF2"])]
#adata_tumor = adata[adata.obs['sample_id'].isin(["10mix_ICI1", "11mix_ICI1", "10mix_ICI2", "11mix_ICI2","GF_ICI2_plus","GF_ICI1_plus","GF_ICI1","GF_ICI2"])]


In [None]:
# Combined label "<cell_type>_<origin>" for every cell.
cell_type_labels = adata.obs['cell_type'].astype(str)
origin_labels = adata.obs['origin'].astype(str)
adata.obs['cell_type_2'] = cell_type_labels.str.cat(origin_labels, sep="_")

In [None]:
adata.obs['cell_type_2'].value_counts()

In [None]:
# Harmonise the combined "<cell_type>_<origin>" labels: in normal tissue the
# Tex and EarlyActiv labels are merged into CD8_EffectorMemory_normal, all
# other listed labels are kept as they are.
annotation_dict = {
    "CD8_Tex_tumor": "CD8_Tex_tumor",
    "CD8_NaiveLike_tumor": "CD8_NaiveLike_tumor",
    "CD8_EffectorMemory_tumor": "CD8_EffectorMemory_tumor",
    "CD8_NaiveLike_normal": "CD8_NaiveLike_normal",
    "CD8_Tex_normal": "CD8_EffectorMemory_normal",
    "CD8_EarlyActiv_normal": "CD8_EffectorMemory_normal",
    "CD8_EarlyActiv_tumor": "CD8_EarlyActiv_tumor",
    "CD8_Tpex_tumor": "CD8_Tpex_tumor",
    # Identity entry for the merged label so that re-running this cell is a no-op.
    "CD8_EffectorMemory_normal": "CD8_EffectorMemory_normal",
}

## Add cell type column based on annotation
# Labels missing from the dictionary are reported and kept unchanged instead
# of aborting the cell with a KeyError.
current_labels = adata.obs["cell_type_2"].astype(str)
remapped_labels = current_labels.map(annotation_dict)
unmapped_labels = sorted(current_labels[remapped_labels.isna()].unique())
if unmapped_labels:
    print(f"Labels without an entry in annotation_dict were left unchanged: {unmapped_labels}")
adata.obs["cell_type_2"] = remapped_labels.fillna(current_labels)



In [None]:
adata_normal.obs['cell_type'].value_counts()

In [None]:
adata_tumor.obs['cell_type'].value_counts()

In [None]:
adata.obs['sample_id'].value_counts()

In [None]:
adata_normal.obs['sample_id'].value_counts()

In [None]:
adata_tumor.obs['sample_id'].value_counts()

In [None]:
# Count the number of cells for every (sample_id, ifng status) combination.
# observed=False is spelled out so that combinations without cells are kept
# as zero-count rows regardless of the pandas default.
cell_counts = (
    adata.obs.groupby(["sample_id", "ifng"], observed=False)
    .size()
    .reset_index(name='counts')
)

# Total number of cells per sample_id, aligned row-by-row with cell_counts.
total_counts_per_sample = (
    cell_counts.groupby('sample_id', observed=False)['counts'].transform('sum')
)

# Share of each ifng status within its sample, as a percentage (0-100) to
# match the "%" axis labels used by the plots that read this column.
cell_counts['percent'] = 100 * cell_counts['counts'] / total_counts_per_sample

# Separate frame for the downstream cells, which add columns to it.
cell_counts_df = cell_counts.copy()

In [None]:
cell_counts_df 

In [None]:
data=cell_counts_df

In [None]:
def _gf_tag(sample_name):
    """Return 'GF' when the sample name contains 'GF', otherwise ''."""
    if 'GF' in str(sample_name):
        return 'GF'
    return ''


data['condition1'] = data['sample_id'].apply(_gf_tag)

In [None]:
def _mix10_tag(sample_name):
    """Return '10mix' when the sample name contains '10', otherwise ''."""
    if '10' in str(sample_name):
        return '10mix'
    return ''


data['condition2'] = data['sample_id'].apply(_mix10_tag)

In [None]:
def _mix11_tag(sample_name):
    """Return '11mix' when the sample name contains '11', otherwise ''."""
    if '11' in str(sample_name):
        return '11mix'
    return ''


data['condition3'] = data['sample_id'].apply(_mix11_tag)

In [None]:
# Concatenate the three partial tags into a single condition label per row.
combined_condition = data["condition1"].astype("str")
for tag_column in ("condition2", "condition3"):
    combined_condition = combined_condition + data[tag_column].astype("str")
data["condition"] = combined_condition

In [None]:
data

In [None]:
x="condition"
y='percent'
hue="condition"

In [None]:
# Keep only the Ifng-positive rows whose condition label does not contain
# "undefined", sorted by condition. The result is an explicit copy so that
# later column assignments do not write into a slice of cell_counts_df.
has_defined_condition = ~cell_counts_df["condition"].astype(str).str.contains("undefined", regex=False)
is_ifng_positive = cell_counts_df["ifng"] == "pos"
data = (
    cell_counts_df.loc[has_defined_condition & is_ifng_positive]
    .sort_values(by="condition")
    .copy()
)

In [None]:
data

In [None]:
# Left-to-right order of the conditions on the x axis.
specific_order = ["GF", "10mix", "11mix"]
# One colour per condition; with only two colours the palette would wrap
# around and GF and 11mix would be drawn in the same colour.
condition_palette = ["orange", "lightblue", "lightgreen"]

# Rebind `data` to a new frame instead of writing the column into what may be
# a slice of cell_counts_df. Labels outside specific_order become NaN.
data = data.assign(
    condition=pd.Categorical(data['condition'], categories=specific_order, ordered=True)
)

g = sns.boxplot(y=y, x=x, data=data, palette=condition_palette)
sns.stripplot(y=y, x=x, data=data, palette=condition_palette, dodge=True, alpha=0.6,
              edgecolor="black", linewidth=0.5)

# Pairwise Mann-Whitney tests against 11mix, Bonferroni-corrected.
add_stat_annotation(g, data=data, x=x, y=y,
                    box_pairs=[("10mix", "11mix"), ("GF", "11mix")],
                    test='Mann-Whitney', text_format='simple', loc='inside', verbose=0,
                    comparisons_correction='bonferroni')

g.figure.set_figwidth(5)
g.figure.set_figheight(4)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_ylabel("% IFNG+ cells")
g.set_xlabel("")

In [None]:
x="condition"
y='percent'
hue="condition"

In [None]:
# Box + strip plot of `percent` per condition with a 10mix vs 11mix Mann-Whitney test.
# NOTE(review): the title says "Tumor", but `data` is not filtered by origin
# in the cells visible here - confirm the intended subset.
box_colors = ["orange", "lightblue"]
g = sns.boxplot(data=data, x=x, y=y, palette=box_colors)
sns.stripplot(
    data=data, x=x, y=y, palette=box_colors,
    dodge=True, alpha=0.6, edgecolor="black", linewidth=0.5,
)

add_stat_annotation(
    g, data=data, x=x, y=y,
    box_pairs=[("10mix", "11mix")],
    test='Mann-Whitney', text_format='simple', loc='inside', verbose=0,
)

figure = g.figure
figure.set_figwidth(5)
figure.set_figheight(4)
g.set_title("Tumor: Condition differences")
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_ylabel("Percent Ifng+ cells")
g.set_xlabel("")

In [None]:
x="sample_id"
y='percent'
hue="sample_id"

In [None]:
# Box + strip plot of `percent` for each individual sample.
# NOTE(review): the title says "Tumor", but `data` is not filtered by origin
# in the cells visible here - confirm the intended subset.
box_colors = ["orange", "lightblue"]
g = sns.boxplot(data=data, x=x, y=y, palette=box_colors)
sns.stripplot(
    data=data, x=x, y=y, palette=box_colors,
    dodge=True, alpha=0.6, edgecolor="black", linewidth=0.5,
)

figure = g.figure
figure.set_figwidth(5)
figure.set_figheight(4)
g.set_title("Tumor: Sample differences")
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_ylabel("Percent Ifng+ cells")
g.set_xlabel("")

In [None]:
import matplotlib.pyplot as plt

# Define a function to extract Ifng expression levels for a given dataset
def extract_ifng_expression(adata, gene='ENSMUSG00000055170', threshold=0, obs_key='condition'):
    """Return the expression values of one gene for the cells above a threshold.

    Parameters
    ----------
    adata
        Object exposing ``var_names``, ``obs`` and ``adata[:, gene].X``.
    gene
        Name looked up in ``adata.var_names``.
    threshold
        Only cells with an expression value strictly greater than this are kept.
    obs_key
        Column of ``adata.obs`` returned alongside the values. The column must
        already exist; it is not created by this function.

    Returns
    -------
    tuple
        ``(values, labels)``: the flattened expression values of the retained
        cells and the matching ``adata.obs[obs_key]`` entries, or
        ``(None, None)`` when ``gene`` is not in ``adata.var_names``.

    Raises
    ------
    KeyError
        If ``obs_key`` is not a column of ``adata.obs``.
    """
    if gene not in adata.var_names:
        return None, None

    # Slice the gene column once; densify only when the matrix exposes `toarray`.
    gene_matrix = adata[:, gene].X
    if hasattr(gene_matrix, "toarray"):
        gene_matrix = gene_matrix.toarray()
    expression = gene_matrix.flatten()

    # One boolean mask shared by the values and their labels keeps them aligned.
    is_expressing = expression > threshold
    return expression[is_expressing], adata.obs[obs_key][is_expressing]

# Sample groups used for the second and third panels.
normal_sample_ids = ["10mix1", "11mix1", "10mix2", "11mix2"]
tumor_sample_ids = ["10mix_ICI1", "11mix_ICI1", "10mix_ICI2", "11mix_ICI2"]

# Expression values and condition labels for all cells and for the two sample groups.
ifng_expression_adata, condition_adata = extract_ifng_expression(adata)
ifng_expression_normal, condition_normal = extract_ifng_expression(
    adata[adata.obs['sample_id'].isin(normal_sample_ids)]
)
ifng_expression_tumor, condition_tumor = extract_ifng_expression(
    adata[adata.obs['sample_id'].isin(tumor_sample_ids)]
)

# (title, expression values, condition labels) for each of the three panels.
histogram_panels = [
    ('Ifng Expression (Tumor & Normal)', ifng_expression_adata, condition_adata),
    ('Ifng Expression (Normal)', ifng_expression_normal, condition_normal),
    ('Ifng Expression (Tumor)', ifng_expression_tumor, condition_tumor),
]
# Conditions overlaid in every panel, with their histogram colour.
condition_colors = [('10mix', 'blue'), ('11mix', 'orange')]

plt.figure(figsize=(18, 6))

for panel_index, (panel_title, expression, conditions) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 3, panel_index)
    for condition_name, hist_color in condition_colors:
        plt.hist(
            expression[conditions == condition_name],
            bins=30, edgecolor='black', alpha=0.7,
            label=condition_name, color=hist_color,
        )
    plt.title(panel_title)
    plt.xlabel('Ifng Expression Level')
    plt.ylabel('Number of Cells')
    plt.legend(title="Condition")

# Keep the panels from overlapping, then render the figure.
plt.tight_layout()
plt.show()



In [None]:
adata.obs

In [None]:
### annotation

In [None]:
# Short marker-gene panel per CD8 T cell state (used for the dot/matrix plots).
marker_dict_small = {
    "CD8_NaiveLike": ["Ccr7", "Il7r", "Sell", "Tcf7", "Txk", "S1pr1", "Lef1", "Satb1"],
    "CD8_EarlyActiv": ["Fosb", "Gzmm", "Samd3", "Cxcr4", "Btg2"],
    "CD8_EffectorMemory": ["Gzma", "Gzmk", "Gzmb", "Ccl5"],
    "CD8_Tpex": ["Xcl1", "Crtam", "Ifng", "Ccl4", "Dusp4", "Zeb2", "Nr4a2", "Sla"],
    "CD8_Tex": ["Pdcd1", "Prf1", "Havcr2"],
}

In [None]:
# Short marker-gene panel for the three states plotted for the normal subset.
marker_dict_small_normal = {
    "CD8_NaiveLike": ["Ccr7", "Il7r", "Sell", "Tcf7", "Txk", "S1pr1", "Lef1", "Satb1"],
    "CD8_EarlyActiv": ["Fosb", "Gzmm", "Samd3", "Cxcr4", "Btg2"],
    "CD8_Tex": ["Pdcd1", "Prf1", "Havcr2", "Nkg7", "Cd8b1", "Ctsd"],
}

In [None]:
adata.obs.cell_type_2.value_counts()

In [None]:
# Preferred display order of the combined labels. "CD8_EffectorMemory_normal"
# is listed because the annotation mapping above merges the normal Tex and
# EarlyActiv labels into it; without it those cells would silently become NaN.
desired_order = [
    "CD8_NaiveLike_normal",
    "CD8_EarlyActiv_normal",
    "CD8_EffectorMemory_normal",
    "CD8_Tex_normal",
    "CD8_NaiveLike_tumor",
    "CD8_EarlyActiv_tumor",
    "CD8_EffectorMemory_tumor",
    "CD8_Tpex_tumor",
    "CD8_Tex_tumor",
]

# Labels actually present in the data; anything not covered by desired_order
# is reported and appended so that no cell loses its label.
observed_labels = set(adata.obs["cell_type_2"].dropna().astype(str))
unlisted_labels = sorted(observed_labels - set(desired_order))
if unlisted_labels:
    print(f"Labels missing from desired_order were appended at the end: {unlisted_labels}")
category_order = [label for label in desired_order if label in observed_labels] + unlisted_labels

# Reorder the categories in adata.obs['cell_type_2']
adata.obs["cell_type_2"] = pd.Categorical(
    adata.obs["cell_type_2"],
    categories=category_order,
    ordered=True
)

In [None]:
sc.pl.dotplot(adata, marker_dict_small, groupby='cell_type_2',gene_symbols="gene_name",dendrogram=False,  standard_scale ='var', cmap = "viridis")#, save ="mapped_annotation_tumor_normal.svg")

In [None]:
sc.pl.matrixplot(adata, marker_dict_small, groupby='cell_type',gene_symbols="gene_name",dendrogram=False,  standard_scale ='var', cmap = "viridis",  )#, save ="mapped_annotation_tumor_normal.svg")

In [None]:
sc.pl.matrixplot(adata, marker_dict_small, groupby='cell_type',gene_symbols="gene_name",dendrogram=False,  standard_scale ='var', cmap = "viridis" )#, save ="mapped_annotation_tumor_normal.svg")

In [None]:
# Display order of the cell types in the normal subset.
desired_order = ["CD8_NaiveLike", "CD8_EarlyActiv", "CD8_Tex"]

# Convert adata_normal.obs['cell_type'] to an ordered categorical with exactly
# these categories; labels outside the list become NaN.
ordered_cell_type = pd.CategoricalDtype(categories=desired_order, ordered=True)
adata_normal.obs["cell_type"] = adata_normal.obs["cell_type"].astype(ordered_cell_type)

In [None]:
sc.pl.dotplot(adata_normal, marker_dict_small_normal, groupby='cell_type',gene_symbols="gene_name",dendrogram=False,  standard_scale ='var', cmap='viridis' )#, save ="mapped_annotation_tumor_normal.svg")

In [None]:
# Display order of the cell types in the tumor subset.
desired_order = ["CD8_NaiveLike", "CD8_EarlyActiv", "CD8_EffectorMemory", "CD8_Tpex", "CD8_Tex"]

# Convert adata_tumor.obs['cell_type'] to an ordered categorical with exactly
# these categories; labels outside the list become NaN.
ordered_cell_type = pd.CategoricalDtype(categories=desired_order, ordered=True)
adata_tumor.obs["cell_type"] = adata_tumor.obs["cell_type"].astype(ordered_cell_type)

In [None]:
sc.pl.matrixplot(adata, marker_dict_small, groupby='cell_type_2',gene_symbols="gene_name",dendrogram=False,  standard_scale ='var', cmap='viridis')#, save ="mapped_annotation_tumor_normal.svg")

In [None]:
sc.pl.matrixplot(adata, "Ifng", groupby='condition',gene_symbols="gene_name",cmap='viridis')#, save ="mapped_annotation_tumor_normal.svg")

In [None]:
sc.pl.umap(adata, color = "cell_type_2")

In [None]:
adata.obs.cell_type.value_counts()

In [None]:
adata.obs.leiden_res0_5.value_counts()

In [None]:
sc.pl.umap(adata, color = ["cell_type"])

In [None]:
sc.pl.umap(adata, color = ["leiden"], groups=["2"])

In [None]:
# Mapping from leiden cluster id (as a string) to a cell-type name.
# Only clusters "0" and "1" are named; clusters "2"-"14" map to an empty string.
# NOTE(review): this rebinds `annotation_dict` and overwrites the
# adata.obs["cell_type_2"] column built earlier in the notebook, and the names
# here end in "_Tumor" while the earlier labels end in "_tumor" - this looks
# like an unfinished scaffold; confirm whether the cell should still be run.
annotation_dict = {
    "0": "CD8_Tex_Tumor",
    "1": "CD8_EffectorMemory_Tumor",
    "2": "",
    "3": "",
    "4": "",
    "5": "",
    "6": "",
    "7": "",
    "8": "",
    "9": "",
    "10": "",
    "11": "",
    "12": "",
    "13": "",
    "14": "",


}
## Add cell type column based on annotation
# Raises KeyError if adata.obs["leiden"] contains a value that is not a key above.
adata.obs["cell_type_2"] = [
    annotation_dict[clust] for clust in adata.obs["leiden"]
]


In [None]:
# Extended marker-gene lists per CD8 T cell state.
marker_dict = {
    "CD8_NaiveLike": [
        "Ccr7", "Il7r", "Sell", "Tcf7", "Txk", "S1pr1", "Lef1", "Satb1",
    ],
    "CD8_EarlyActiv": [
        "Gzmk", "Fos", "Cd69", "Zfp36", "Fosb",
        "Ccl5", "Gzmm", "Dusp2", "Lyar", "Samd3",
        "Cxcr4", "Ctsw", "Cd8a", "Anxa1", "Klrg1",
        "Cd8b1", "Aoah", "Tagap", "Klrd1", "Ier2",
        "Gzma", "Cst7", "Itm2c", "Parp8", "Btg2",
    ],
    "CD8_EffectorMemory": [
        "Gzma", "Gzmk", "Nkg7", "Cd8a", "Cd8b1",
        "Ctsw", "Gzmb", "Ccl5", "Cst7", "Prf1",
        "Abi3", "Fasl", "Itm2c", "1500009L16Rik", "Eomes",
        "Chst12", "Ccr5", "Hcst", "Aoah", "Hopx",
        "Slamf7", "Cxcr3", "Oasl1", "F2r", "Cxcr6",
    ],
    "CD8_Tpex": [
        "Lag3", "Xcl1", "Crtam", "Ifng", "Ccl4",
        "Pdcd1", "Dusp4", "Cd8a", "Zeb2", "Nr4a2",
        "Sla", "Nkg7", "Tigit", "Ctsw", "Tnfrsf9",
        "Tox", "Lyst", "Tnfsf4", "Ccl3", "Gzmb",
        "Rab27a", "Prf1", "Cd70", "Plscr1",
    ],
    "CD8_Tex": [
        "Lag3", "Prf1", "Cd8a", "Havcr2", "Gzmb",
        "Nkg7", "Cd8b1", "Ctsd", "Klrd1", "Id2",
        "Cst7", "Pdcd1", "Tnfrsf9", "Tigit", "Ctsw",
        "Ccl4", "Ccl3", "Ifng", "Cxcr6", "Fasl",
        "Rbpj", "Chst12", "Fam3c", "Csf1",
    ],
}

In [None]:
sc.pl.clustermap(adata_tumor)

In [None]:
# One panel per Ifng status, showing the number of cells of each cell type.
facet_options = dict(
    col="ifng",
    hue='cell_type',
    sharey=False,
    sharex=True,
    height=2,
    aspect=2,
    margin_titles=True,
    col_wrap=4,
)
u = sns.FacetGrid(adata.obs, **facet_options)
u.map_dataframe(sns.histplot, x="cell_type", common_norm=True, stat='count')
u.set_xticklabels(rotation="vertical")

In [None]:
sc.pp.highly_variable_genes(adata, min_disp=0.1, min_mean=0.01, max_mean=50)

In [None]:
sc.pl.highly_variable_genes(adata)

In [None]:
adata.var["variance"] = adata.to_df().var()

In [None]:
adata.var["variance"]

In [None]:
adata.var

In [None]:
sns.scatterplot(data=adata.var, x="means", y="variance")

In [None]:
# Variance of the four listed genes, restricted to genes with variance > 0.1.
genes_to_show = ["Cxcr6", "Chst12", "Fam3c", "Csf1"]
variable_genes = adata.var.loc[adata.var["variance"] > 0.1]
f = sns.pointplot(
    data=variable_genes,
    x="variance",
    y="gene_name",
    color="orange",
    order=genes_to_show,
)
#f.figure.savefig(resultsDir+"/variance_genes_CD8_MSI.png",  bbox_inches='tight')

In [None]:
adata.var

In [None]:
sc.pl.highest_expr_genes(adata,gene_symbols="gene_name")#, save="boxplot_CD8_NormMSI.png")

In [None]:
sc.pl.highest_expr_genes(adata_tumor,gene_symbols="gene_name")#, save="boxplot_CD8_NormMSI.png")

In [None]:
sc.pl.highest_expr_genes(adata_normal,gene_symbols="gene_name")#, save="boxplot_CD8_NormMSI.png")

In [None]:
adata.obs

In [None]:
adata.to_df()

In [None]:
# Density of the Ifng expression values for all cells (dotted orange), tumor
# cells (dotted blue) and normal cells (solid orange).
# Only the Ifng column is converted to a DataFrame; calling to_df() on the
# full object would densify the whole expression matrix for a single gene.
ifng_gene_id = "ENSMUSG00000055170"
f = sns.kdeplot(data=adata[:, ifng_gene_id].to_df(), x=ifng_gene_id, common_norm=False, color="tab:orange", linestyle=":")
sns.kdeplot(data=adata_tumor[:, ifng_gene_id].to_df(), x=ifng_gene_id, common_norm=False, color="tab:blue", linestyle=":")
sns.kdeplot(data=adata_normal[:, ifng_gene_id].to_df(), x=ifng_gene_id, common_norm=False, color="tab:orange")
# Vertical reference line at an expression value of 0.2.
f.axvline(x=0.2, color="black", linestyle=":")
f.grid(False)
#f.figure.savefig(resultsDir+"/kdeplot_CD8_IFNGdist_BOTHmsimsswithNorm.png")
#f.figure.savefig(resultsDir+"/kdeplot_CD8_IFNGdist_BOTHmsimsswithNorm.svg")

In [None]:
# Same density plot as the previous cell: all cells (dotted orange), tumor
# cells (dotted blue) and normal cells (solid orange).
# Each dataset is sliced to the Ifng column before to_df() so that the full
# expression matrix is not densified just to plot one gene.
ifng_gene_id = "ENSMUSG00000055170"
kde_curves = [
    (adata, {"color": "tab:orange", "linestyle": ":"}),
    (adata_tumor, {"color": "tab:blue", "linestyle": ":"}),
    (adata_normal, {"color": "tab:orange"}),
]
for dataset, line_style in kde_curves:
    # Every call draws on the current axes, so `f` ends up as the shared axes.
    f = sns.kdeplot(data=dataset[:, ifng_gene_id].to_df(), x=ifng_gene_id, common_norm=False, **line_style)
# Vertical reference line at an expression value of 0.2.
f.axvline(x=0.2, color="black", linestyle=":")
f.grid(False)
#f.figure.savefig(resultsDir+"/kdeplot_CD8_IFNGdist_BOTHmsimsswithNorm.png")
#f.figure.savefig(resultsDir+"/kdeplot_CD8_IFNGdist_BOTHmsimsswithNorm.svg")

In [None]:
sc.pl.clustermap(adata)