# Integrate annotated TIL & COLON

## Load annotated adata

In [None]:
# Libraries
import anndata as ad
import matplotlib as plt
import numpy as np
import pandas as pd
import sc_atlas_helpers as ah
import scanpy as sc
from matplotlib.pyplot import rc_context
from scipy.stats import median_abs_deviation

In [None]:
from functools import partial

import altair as alt

In [None]:
import seaborn as sns

In [None]:
import decoupler as dc

In [None]:
resDir = "/data/projects/2021/MicrobialMetabolites/single-cell-sorted-cd8/results/40_gex_surface_prot/"
inputDir = "/data/projects/2021/MicrobialMetabolites/single-cell-sorted-cd8/results/40_gex_surface_prot/"

In [None]:
adata = sc.read_h5ad(
    "/data/projects/2021/MicrobialMetabolites/single-cell-sorted-cd8/results/adata_merged_til_slec_mpec_annotation_tryneighbors.h5ad"
)

In [None]:
path = "/data/projects/2021/MicrobialMetabolites/single-cell-sorted-cd8/results/solo"

In [None]:
adata_solo = sc.read_h5ad(f"{path}/adata_nodoublet.h5ad")

In [None]:
# NOTE(review): this rebinds `adata`, replacing the object that an earlier cell
# read from the merged til/slec/mpec annotation file. The path is relative, so
# it is resolved against the current working directory.
adata = sc.read_h5ad("adata_solo_annotated_all.h5ad")

In [None]:
adata

### Functions

In [None]:
def update_columns_origin(row):
    """Set ``row["origin"]`` from the sample id and return the row.

    A sample id containing the substring ``"ICI"`` is labelled ``"til"``;
    any other sample id is labelled ``"colon"``. The passed row is written
    to directly and then returned, which is the shape expected by
    ``DataFrame.apply(..., axis=1)``.
    """
    is_til = "ICI" in row["sample_id"]
    row["origin"] = "til" if is_til else "colon"
    return row

In [None]:
def update_columns_batch(row):
    """Set ``row["batch_id"]`` from the sample id and return the row.

    Sample ids containing ``"ICI1"`` or ``"ICI2"`` get that tag as batch id
    (``"ICI1"`` is checked first). Otherwise the sample ids ``"10mix1"`` and
    ``"10mix2"`` get batch ``"1"`` and ``"2"``. For any other sample id the
    row is returned without touching ``batch_id``.
    """
    sample_id = row["sample_id"]

    # Substring matches take precedence over the exact matches below.
    for ici_tag in ("ICI1", "ICI2"):
        if ici_tag in sample_id:
            row["batch_id"] = ici_tag
            return row

    # NOTE(review): only the two 10mix samples are mapped here; other non-ICI
    # sample ids end up without a batch_id -- confirm this is intended.
    for exact_id, batch in (("10mix1", "1"), ("10mix2", "2")):
        if sample_id == exact_id:
            row["batch_id"] = batch
            break

    return row

In [None]:
# Exact sample id -> condition label used by update_columns_condition.
_CONDITION_BY_SAMPLE_ID = {
    "10mix1": "10mix",
    "10mix2": "10mix",
    "11mix1": "11mix",
    "11mix2": "11mix",
    "GF1": "GF",
    "GF2": "GF",
    "GF_ICI2_plus": "GF-plus",
    "GF_ICI1_plus": "GF-plus",
    "GF_ICI2": "GF",
    "GF_ICI1": "GF",
    "10mix_ICI1": "10mix",
    "10mix_ICI2": "10mix",
    "11mix_ICI1": "11mix",
    "11mix_ICI2": "11mix",
}


def update_columns_condition(row):
    """Set ``row["condition"]`` from the sample id and return the row.

    The sample id is matched exactly against the known sample ids. When it
    is not one of them, the row is returned without touching ``condition``.
    """
    condition = _CONDITION_BY_SAMPLE_ID.get(row["sample_id"])
    if condition is not None:
        row["condition"] = condition
    return row

In [None]:
# %%
from typing import Dict

import numpy as np
import pandas as pd
from anndata import AnnData


def score_seeds(
    adata: "AnnData",
    seed_marker_genes: Dict[str, Dict[str, list]],
    layer: str = "log1p_norm",
    cutoff_sum_pos_marker_expression: float = 0.1,
    cutoff_sum_neg_marker_expression: float = 0,
) -> pd.Series:
    """Label seed cell types based on positive/negative marker gene expression.

    For every cell type, a cell qualifies when

    * for each gene list under ``"positive"``, the summed expression of those
      genes is greater than ``cutoff_sum_pos_marker_expression``, and
    * for each gene list under ``"negative"``, the summed expression of those
      genes is NOT greater than ``cutoff_sum_neg_marker_expression``.

    Genes that are not in ``adata.var_names`` contribute nothing to a sum.
    When a cell qualifies for several cell types, the one that comes first in
    ``seed_marker_genes`` wins.

    Parameters
    ----------
    adata
        Annotated data object; expression is read from ``adata.layers[layer]``.
    seed_marker_genes
        Maps a cell-type label to a dict with ``"positive"`` and ``"negative"``
        entries, each a list of gene lists. A missing entry is treated as an
        empty list, i.e. it places no constraint.
    layer
        Name of the layer holding the expression values.
    cutoff_sum_pos_marker_expression
        Threshold that each positive gene list's summed expression must exceed.
    cutoff_sum_neg_marker_expression
        Threshold that each negative gene list's summed expression must not
        exceed.

    Returns
    -------
    pd.Series
        One label per cell, indexed by ``adata.obs_names``; cells matching no
        cell type are labelled ``"unknown"``.
    """

    def _summed_expression(genes):
        # Per-cell sum over the requested genes; np.ravel flattens the
        # (n_cells, 1) result that matrix-like layers return from sum(1).
        gene_mask = adata.var_names.isin(genes)
        return np.ravel(adata[:, gene_mask].layers[layer].sum(1))

    n_cells = len(adata.obs_names)
    labels = np.full(n_cells, "unknown", dtype=object)
    unassigned = np.ones(n_cells, dtype=bool)

    for cell_type, markers in seed_marker_genes.items():
        matches = np.ones(n_cells, dtype=bool)

        for positive_genes in markers.get("positive", []):
            matches &= _summed_expression(positive_genes) > cutoff_sum_pos_marker_expression

        for negative_genes in markers.get("negative", []):
            matches &= ~(_summed_expression(negative_genes) > cutoff_sum_neg_marker_expression)

        # Earlier cell types keep priority: only label cells not yet assigned.
        labels[matches & unassigned] = cell_type
        unassigned &= ~matches

    return pd.Series(labels, index=adata.obs_names)

In [None]:
set(adata.obs.sample_id)

In [None]:
def is_outlier(adata, metric: str, nmads: int):
    """Flag cells whose ``metric`` lies more than ``nmads`` MADs from the median.

    Reads ``adata.obs[metric]`` and returns a boolean mask that is True where
    the value is strictly below ``median - nmads * MAD`` or strictly above
    ``median + nmads * MAD``.
    """
    values = adata.obs[metric]
    center = np.median(values)
    spread = nmads * median_abs_deviation(values)

    below = values < center - spread
    above = values > center + spread
    return below | above

### Map adata cell type column to adata_solo based on cell barcodes 

In [None]:
adata_solo.obs["sample_id"] = adata_solo.obs["sample_id"].str.replace("-", "_")

In [None]:
# Step 1: Replace the first "-" and everything after it in each obs name with "-1"
new_obs_names = adata_solo.obs_names.str.replace(r"-.+", "-1", regex=True)

# Step 2: Concatenate with "_" and adata_solo.obs["sample_id"]
new_obs_names += "_" + adata_solo.obs["sample_id"].astype(str)

# Assign the new values back to adata_solo.obs_names
adata_solo.obs_names = new_obs_names

In [None]:
# Transfer the annotation from adata to adata_solo: each adata_solo obs name is
# looked up in adata.obs["cell_type"] (indexed by adata's obs names).
# NOTE(review): obs names without a match presumably end up as missing values -- confirm.
cell_type_data = adata.obs["cell_type"]
adata_solo.obs["cell_type"] = adata_solo.obs.index.map(cell_type_data)

In [None]:
sc.pl.umap(adata_solo, color=["cell_type"], vmax="p99", cmap="Reds")

In [None]:
# Apply the function to each row in the DataFrame
adata_solo.obs = adata_solo.obs.apply(update_columns_origin, axis=1)

In [None]:
# Apply the function to each row in the DataFrame
adata_solo.obs = adata_solo.obs.apply(update_columns_batch, axis=1)

In [None]:
# Apply the function to each row in the DataFrame
adata_solo.obs = adata_solo.obs.apply(update_columns_condition, axis=1)

In [None]:
sc.pl.umap(adata_solo, color=["cell_type"])

## QC 

In [None]:
# mitochondrial genes
adata_solo.var["mt"] = adata_solo.var_names.str.startswith("mt-")
# ribosomal genes
adata_solo.var["ribo"] = adata_solo.var_names.str.startswith(("Rps", "Rpl"))

In [None]:
sc.pp.calculate_qc_metrics(
    adata_solo, qc_vars=["mt", "ribo"], inplace=True, percent_top=[20], log1p=True
)
adata_solo

In [None]:
adata_solo.var_names_make_unique()

In [None]:
#adata_solo.var

In [None]:
#sc.pl.violin(adata_solo, "pct_counts_mt", use_raw=False)

In [None]:
#sc.pl.violin(adata_solo, "pct_counts_ribo", use_raw=False)

In [None]:
# A cell is marked as an outlier when is_outlier (5 MADs) flags it on any of
# the three QC metrics below; the last line shows how many cells are flagged.
adata_solo.obs["outlier"] = (
    is_outlier(adata_solo, "log1p_total_counts", 5)
    | is_outlier(adata_solo, "log1p_n_genes_by_counts", 5)
    | is_outlier(adata_solo, "pct_counts_in_top_20_genes", 5)
)
adata_solo.obs.outlier.value_counts()

In [None]:
adata_solo.obs["mt_outlier"] = is_outlier(adata_solo, "pct_counts_mt", 3) | (
    adata_solo.obs["pct_counts_mt"] > 10
)
adata_solo.obs.mt_outlier.value_counts()

In [None]:
# Ribosomal outliers: flagged by is_outlier (3 MADs) on pct_counts_ribo, or
# with pct_counts_ribo above 15.
adata_solo.obs["ribo_outlier"] = is_outlier(adata_solo, "pct_counts_ribo", 3) | (
    adata_solo.obs["pct_counts_ribo"] > 15
)
# Report the column that was just computed (this previously printed mt_outlier).
adata_solo.obs.ribo_outlier.value_counts()

In [None]:
print(f"Total number of cells: {adata_solo.n_obs}")
adata_solo = adata_solo[(~adata_solo.obs.outlier) & (~adata_solo.obs.mt_outlier)].copy()

print(f"Number of cells after filtering of low quality cells: {adata_solo.n_obs}")

In [None]:
print(f"Total number of cells: {adata_solo.n_obs}")
adata_solo = adata_solo[
    (~adata_solo.obs.outlier) & (~adata_solo.obs.ribo_outlier)
].copy()

print(f"Number of cells after filtering of low quality cells: {adata_solo.n_obs}")

In [None]:
#sc.pl.violin(adata_solo, "pct_counts_mt", use_raw=False)

In [None]:
#sc.pl.violin(adata_solo, "pct_counts_ribo", use_raw=False)

In [None]:
adata_solo.var["ribo"] = adata_solo.var_names.str.startswith(("Rps", "Rpl"))

In [None]:
# adata_solo.var.ribo.value_counts()

In [None]:
# Keep only the genes that are not flagged as ribosomal.
adata_solo = adata_solo[:, ~adata_solo.var["ribo"]].copy()

In [None]:
# Flag mitochondrial genes by their "mt-" name prefix.
adata_solo.var["mt"] = adata_solo.var_names.str.startswith("mt-")

In [None]:
# adata_solo.var.mt.value_counts()

In [None]:
# Keep only the genes that are not flagged as mitochondrial.
adata_solo = adata_solo[:, ~adata_solo.var["mt"]].copy()

In [None]:
# Visualize
# adata_solo
#sc.pl.umap(
#    adata_solo,
#    color=["pct_counts_mt", "total_counts", "n_genes_by_counts"],
#    vmax="p99",
#    cmap="inferno",
#)

In [None]:
#sc.tl.rank_genes_groups(adata_solo, "leiden", method="wilcoxon")

In [None]:
#pd.DataFrame(adata_solo.uns["rank_genes_groups"]["names"]).head(20)

In [None]:
#sc.pl.highest_expr_genes(adata_solo, n_top=40, gene_symbols="gene_name")

### Split adata

In [None]:
# Re-index .var by the values of its "ensembl_id" column. From here on,
# var_names hold those values, so any lookup or prefix match on var_names
# operates on them rather than on the previous index.
adata_solo.var.index = adata_solo.var["ensembl_id"]

In [None]:
adata_solo_til = adata_solo[adata_solo.obs["origin"] == "til"]

In [None]:
#sc.tl.rank_genes_groups(adata_solo_til, "cell_type", method="wilcoxon")

In [None]:
#pd.DataFrame(adata_solo_til.uns["rank_genes_groups"]["names"]).head(20)

In [None]:
#sc.pl.highest_expr_genes(adata_solo_til, n_top=20, gene_symbols="gene_name")

In [None]:
# Take a real copy instead of a view: later cells add .var columns and compute
# neighbors/leiden on this object, and writing to a view forces an implicit
# copy (with a warning).
adata_solo_colon = adata_solo[adata_solo.obs["origin"] == "colon"].copy()

### Annotate colon

In [None]:
sc.pl.umap(adata_solo_colon, color=["condition", "sample_id", "leiden"])

In [None]:
#sc.tl.rank_genes_groups(adata_solo_colon, "leiden", method="wilcoxon")

In [None]:
#pd.DataFrame(adata_solo_colon.uns["rank_genes_groups"]["names"]).head(20)

In [None]:
# NOTE(review): an earlier cell replaced the var index with the "ensembl_id"
# column; if those values are not gene symbols, this "Rps"/"Rpl" prefix match
# flags nothing -- confirm.
adata_solo_colon.var["ribo"] = adata_solo_colon.var_names.str.startswith(("Rps", "Rpl"))

In [None]:
adata_solo_colon = adata_solo_colon[:, adata_solo_colon.var["ribo"] == False].copy()

In [None]:
sc.pp.neighbors(adata_solo_colon, n_neighbors=10)
sc.tl.leiden(adata_solo_colon, resolution=0.5, n_iterations=-1)

In [None]:
#sc.pl.umap(adata_solo_colon, color=["leiden"], vmax="p99", cmap="Reds")

In [None]:
markers = {
    "Naive": ["Lef1", "Sell", "Ccr7", "Tcf7","Il7r"],
    "Effector M.": ["Gzmk", "Cxcr4", "Cxcr3", "Cd44"], # Predysfunctional
    "Exhausted": ["Havcr2", "Pdcd1", "Ifng", "Itgae","Lag3","Ctla4"], # Dysfunctional
    "Cytotoxic": ["Cx3cr1", "Klrg1", "Prf1"],
}

In [None]:
sc.pl.dotplot(
    adata_solo_colon,
    groupby="leiden",
    use_raw=False,
    var_names=markers,
    dendrogram=False,
    log=True,
    gene_symbols="gene_name",
)


In [None]:
# The genes are given as symbols, so resolve them through the "gene_name"
# column of .var (as the dotplot of this object does) rather than through
# var_names, which were re-indexed by "ensembl_id".
sc.pl.umap(
    adata_solo_colon,
    color=[
        "Lef1",
        "Sell",
        "Ccr7",
        "Tcf7",
        "Gzmk",
        "Cxcr4",
        "Cxcr3",
        "Cd44",
        "Havcr2",
        "Pdcd1",
        "Ifng",
        "Itgae",
        "Cx3cr1",
        "Klrg1",
        "Prf1",
        "Mki67",
        "Ccl5",
        "H2-K1",
        "H2-Q4",
        "H2-Q7",
    ],
    vmax="p99",
    cmap="Reds",
    add_outline=True,
    gene_symbols="gene_name",
)

In [None]:
sc.pl.umap(
    adata_solo_colon, color=["leiden"], vmax="p99", cmap="Reds", legend_loc="on data"
)

In [None]:
annotation_dict = {
    "0": "COLON_Exhausted",
    "1": "COLON_Exhausted",
    "2": "COLON_Naive",
    "3": "COLON_Naive",
    "4": "COLON_Naive",
    "5": "COLON_Exhausted",
    "6": "COLON_Intermediate",
    "7": "COLON_Exhausted",
    "8": "COLON_Naive",
    "9": "COLON_Exhausted",
}



In [None]:
## Add cell type column based on annotation
adata_solo_colon.obs["cell_type"] = [
    annotation_dict[clust] for clust in adata_solo_colon.obs["leiden"]
]
# Visualize
sc.pl.umap(
    adata_solo_colon,
    color=["cell_type", "leiden"],

    legend_fontsize=8,
)

In [None]:
#sc.tl.rank_genes_groups(adata_solo_colon, "cell_type", method="wilcoxon")

In [None]:
#pd.DataFrame(adata_solo_colon.uns["rank_genes_groups"]["names"]).head(20)

## Exhausted subtypes

In [None]:
# Subset the cells annotated as COLON_Exhausted. A copy is taken because the
# following cells compute neighbors/leiden on this object and overwrite its
# "cell_type" column, which should not act on a view of adata_solo_colon.
adata_solo_colon_ex = adata_solo_colon[
    adata_solo_colon.obs["cell_type"] == "COLON_Exhausted"
].copy()

In [None]:
sc.pp.neighbors(adata_solo_colon_ex, n_neighbors=10)
sc.tl.leiden(adata_solo_colon_ex, resolution=0.3, n_iterations=-1)

In [None]:
# The genes are given as symbols, so resolve them through the "gene_name"
# column of .var rather than through var_names.
sc.pl.umap(
    adata_solo_colon_ex,
    color=["Il7r", "Ccr7", "Ifng", "Tbx21", "Lag3", "Cxcr6", "Gzmb", "Gzmk"],
    vmax="p99",
    cmap="Reds",
    add_outline=True,
    gene_symbols="gene_name",
)

In [None]:
sc.pl.umap(adata_solo_colon_ex, color=["leiden"], vmax="p99")

In [None]:
#sc.tl.rank_genes_groups(adata_solo_colon_ex, "leiden", method="wilcoxon")

In [None]:
#pd.DataFrame(adata_solo_colon_ex.uns["rank_genes_groups"]["names"]).head(20)

In [None]:
annotation_dict = {
    "0": "COLON_Exhausted",
    "1": "COLON_Infg",  # Ifng
    "2": "COLON_Exhausted",
    "3": "COLON_Exhausted",
    "4": "COLON_Exhausted",
}


In [None]:
## Add cell type column based on annotation
adata_solo_colon_ex.obs["cell_type"] = [
    annotation_dict[clust] for clust in adata_solo_colon_ex.obs["leiden"]
]
# Visualize
sc.pl.umap(
    adata_solo_colon_ex,
    color=["cell_type", "leiden"],

    legend_fontsize=8,
)

### Reintegrate annotated subsets

In [None]:
adata_solo_colon_rest = adata_solo_colon[
    adata_solo_colon.obs["cell_type"].isin(["COLON_Naive","COLON_Intermediate"])
]

In [None]:
# Visualize
sc.pl.umap(
    adata_solo_colon_rest,
    color=["cell_type", "leiden"],

    legend_fontsize=8,
)

In [None]:
# Write the refined labels of the exhausted subset back into adata_solo_colon.
# The column is cast to str first so that labels not already present in it can
# be assigned; rows are matched by obs index.
adata_solo_colon.obs["cell_type"] = adata_solo_colon.obs["cell_type"].astype("str")#
adata_solo_colon.obs.loc[adata_solo_colon_ex.obs.index, "cell_type"] = adata_solo_colon_ex.obs["cell_type"].astype("str")

In [None]:
adata_solo_colon.obs["cell_type"] = adata_solo_colon.obs["cell_type"].astype("str")#
adata_solo_colon.obs.loc[adata_solo_colon_rest.obs.index, "cell_type"] = adata_solo_colon_rest.obs["cell_type"].astype("str")

In [None]:
# Visualize
sc.pl.umap(
    adata_solo_colon,
    color=["cell_type", "leiden"],

    legend_fontsize=8,
)

In [None]:
markers = {
    "Naive": ["Lef1", "Sell", "Ccr7"],
    "Effector M.": ["Gzmk", "Cxcr4", "Cxcr3", "Cd44"], # Predysfunctional
    "Exhausted": ["Pdcd1", "Ifng", "Itgae","Lag3","Ctla4"], # Dysfunctional
}

In [None]:
sc.pl.dotplot(
    adata_solo_colon,
    groupby="cell_type",
    use_raw=False,
    var_names=markers,
    dendrogram=False,
    categories_order = [ 'COLON_Naive',"COLON_Intermediate","COLON_Infg","COLON_Exhausted"],
    log=True,
    gene_symbols="gene_name",
)


## Reintegrate with adata_solo til & colon

In [None]:
# Write the colon annotation back into the full object, matching rows by obs
# index. Casting to str turns the column into plain strings before assignment.
# NOTE(review): missing values presumably become the string "nan" here -- confirm.
adata_solo.obs["cell_type"] = adata_solo.obs["cell_type"].astype("str")#
adata_solo.obs.loc[adata_solo_colon.obs.index, "cell_type"] = adata_solo_colon.obs["cell_type"].astype("str")

In [None]:
set(adata_solo.obs["cell_type"])

In [None]:
# Visualize
sc.pl.umap(
    adata_solo,
    color=["cell_type"], legend_loc="on data", legend_fontsize=7,

)

In [None]:
# Visualize
sc.pl.umap(
    adata_solo,
    color=["cell_type"],groups=['COLON_Naive',"COLON_Intermediate","COLON_Infg","COLON_Exhausted"],
legend_loc="on data", legend_fontsize=7,

)

In [None]:
# Visualize
sc.pl.umap(
    adata_solo,
    color=["cell_type"],groups=['MPEC_Effector',
 'MPEC_Intermediate',
 'MPEC_Progenitor',
 'SLEC_Effector',
 'SLEC_Inf',
 'SLEC_Intermediate',
 'SLEC_Plastic',
 'SLEC_Progenitor',
 'SLEC_Terminal'],legend_loc="on data", legend_fontsize=7,

)

In [None]:
## Annotate unknown cells

In [None]:
# Apply the function to each row in the DataFrame
adata.obs = adata.obs.apply(update_columns_origin, axis=1)

In [None]:
# Apply the function to each row in the DataFrame
adata.obs = adata.obs.apply(update_columns_batch, axis=1)

In [None]:
# Apply the function to each row in the DataFrame
adata.obs = adata.obs.apply(update_columns_condition, axis=1)

In [None]:
adata.obs

In [None]:
# Cells whose cell_type is the literal string "nan", i.e. not yet annotated.
# A copy is taken because later cells recompute neighbors/leiden/UMAP on this
# subset and assign a new "cell_type" column, which should not act on a view.
adata_u = adata[adata.obs["cell_type"] == "nan"].copy()

In [None]:
adata_u

In [None]:
sc.pl.umap(adata_u, color=["cell_type","leiden","origin"])

In [None]:
sc.pl.umap(adata, color=["cell_type","origin","leiden"], legend_loc="on data", legend_fontsize=7)

In [None]:
markers = {
    "Early A.": ["Cd69"],
    "Late A.": ["Il2ra"],
        "Naive": ["Sell"],
        "E.Memory": ["Cd44"],

       "MPEC": ["Il7r"],#Memory Precursor Effector Cells
       "SLEC": ["Klrg1","Tbx21"], #Short Lived Effector Cells
 
    "Exhaustion":["Havcr2","Entpd1","Tox"],
    "Cytotoxic":["Gzmb","Gzmk","Ifng"],
    "Sel-renewal":["Cxcr3","Casp3"],
    }

In [None]:
sc.pl.dotplot(adata_u, groupby="leiden",var_names=markers, dendrogram=False,log=True, gene_symbols="gene_name")

In [None]:
marker_list = ["Cd69","Il2ra","Sell","Cd44","Il7r","Klrg1","Tbx21","Havcr2","Entpd1","Tox","Gzmb","Gzmk","Ifng","Cxcr3","Casp3"]

In [None]:
sc.pl.umap(
    adata_u,
    color=marker_list,
    cmap="Reds",
    frameon=False,add_outline=True, vmax="p99", gene_symbols="gene_name"
)

In [None]:
sc.pl.umap(
    adata_u,
    color=["Ifng","Cxcr3"],
    cmap="Reds",
    frameon=False,add_outline=True, vmax="p99", gene_symbols="gene_name"
)

In [None]:
sc.pl.umap(
    adata_u,
    color=["Tox","Casp3"],
    cmap="Reds",
    frameon=False,add_outline=True, vmax="p99", gene_symbols="gene_name"
)

In [None]:
sc.pl.umap(
    adata,
    color=["cell_type"],groups=['MPEC_Effector',
 'MPEC_Intermediate',
 'MPEC_Progenitor',
],
    cmap="Reds",
    frameon=False,add_outline=True, vmax="p99", gene_symbols="gene_name"
)

In [None]:
sc.pl.umap(
    adata,
    color=["cell_type"],groups=[ 'SLEC_Effector',
 'SLEC_Inf',
 'SLEC_Intermediate',
 'SLEC_Plastic',
 'SLEC_Progenitor',
 'SLEC_Terminal'
],
    cmap="Reds",
    frameon=False,add_outline=True, vmax="p99", gene_symbols="gene_name"
)

In [None]:
sc.pl.umap(adata_u, color=["leiden"])

In [None]:
sc.pp.neighbors(adata_u, n_neighbors=10)
sc.tl.leiden(adata_u, resolution=0.5,  n_iterations=-1)
sc.tl.umap(adata_u)

In [None]:
sc.pl.umap(adata_u, color=["leiden"])

In [None]:
sc.pl.dotplot(adata_u, groupby="leiden",var_names=markers, dendrogram=False,log=True, gene_symbols="gene_name")

In [None]:
marker_list = ["Cd69","Il2ra","Sell","Cd44","Il7r","Klrg1","Tbx21","Havcr2","Entpd1","Tox","Gzmb","Gzmk","Ifng","Cxcr3","Casp3"]

In [None]:
sc.pl.umap(
    adata_u,
    color=marker_list,
    cmap="Reds",
    frameon=False,add_outline=True, vmax="p99", gene_symbols="gene_name"
)

In [None]:
sc.pl.umap(
    adata_u,
    color=["Ifng","Cxcr3"],
    cmap="Reds",
    frameon=False,add_outline=True, vmax="p99", gene_symbols="gene_name"
)

In [None]:
sc.pl.umap(
    adata_u,
    color=["Tox","Casp3"],
    cmap="Reds",
    frameon=False,add_outline=True, vmax="p99", gene_symbols="gene_name"
)

In [None]:
annotation_dict = {
    '0':'MPEC_Effector',
    '1':'SLEC_Terminal',
    '2':'MPEC_Effector',
    '3':'SLEC_Effector',
    '4':'SLEC_Terminal',
    '5':'SLEC_Effector',
    '6':"MPEC_Effector",
    "7":"SLEC_Terminal",
    "8":"SLEC_Plastic"
  }

In [None]:
## Add cell type column based on annotation
adata_u.obs['cell_type'] = [annotation_dict[clust] for clust in adata_u.obs['leiden']]

# Visualize
sc.pl.umap(adata_u, color='cell_type')

In [None]:
sc.pl.dotplot(adata_u, groupby="cell_type",var_names=markers, dendrogram=False,log=True, gene_symbols="gene_name")

In [None]:
adata.obs["cell_type"] = adata.obs["cell_type"].astype("str")
adata.obs.loc[adata_u.obs.index, "cell_type"] = adata_u.obs["cell_type"].astype("str")

In [None]:
sc.pl.umap(
    adata,
    color=["cell_type"],groups=[ 'SLEC_Effector',
 'SLEC_Inf',
 'SLEC_Intermediate',
 'SLEC_Plastic',
 'SLEC_Progenitor',
 'SLEC_Terminal','MPEC_Effector',
 'MPEC_Intermediate',
 'MPEC_Progenitor',
],
    cmap="Reds",
    frameon=False,add_outline=True, vmax="p99", gene_symbols="gene_name"
)

In [None]:
adata.obs.cell_type.value_counts()

In [None]:
set(adata.obs.cell_type)

In [None]:
#adata.write_h5ad("adata_solo_annotated_all.h5ad")

## Create gene counts and samplesheet for DS analysis

In [None]:
adata.var

In [None]:
# Expose the gene symbols under "gene_id" as well, but keep "gene_name":
# later cells still pass gene_symbols="gene_name", which a rename would break.
adata.var["gene_id"] = adata.var["gene_name"]

In [None]:
adata.raw = None

In [None]:
path = "/data/scratch/kvalem/projects/2021/honda_microbial_metabolites_2021/40_tables/40_single-cell-sorted-cd8/40_gex_surface_prot"

In [None]:
### Samplesheet
samplesheet = adata.obs.copy()
#samplesheet["sample_id"]=samplesheet.index
samplesheet.reset_index(inplace=True)

In [None]:
samplesheet.to_csv(f"{path}/samplesheet.csv", index=False)
samplesheet.to_csv(f"{path}/samplesheet.tsv", index=False, sep = "\t")

In [None]:
bulk_df = adata.to_df().T

In [None]:
### Raw counts
bulk_df = adata.to_df().T
bulk_df["gene_id"]=bulk_df.index
bulk_df.columns = bulk_df.columns.str.replace("[.-]", "_").str.replace(" ", "_")

In [None]:
bulk_df.to_csv(f"{path}/counts.csv")

In [None]:
# Normalize total counts per cell without modifying adata (inplace=False
# returns the result; its "X" entry holds the normalized matrix).
scales_counts = sc.pp.normalize_total(adata, target_sum=None, inplace=False)
# log1p transform of the normalized matrix, stored as its own layer
adata.layers["log1p_norm"] = sc.pp.log1p(scales_counts["X"], copy=True)

In [None]:
# Disabled: sc.pp.log1p() without a data argument raises a TypeError. The
# log1p-transformed values are already stored in adata.layers["log1p_norm"].
# sc.pp.log1p()

In [None]:
### log1p_norm counts
adata_log1p_norm = pd.DataFrame(adata.layers["log1p_norm"], 
    adata.to_df().index, 
    adata.to_df().columns)
bulk_df = adata_log1p_norm.T
bulk_df["gene_id"]=bulk_df.index
bulk_df.columns = bulk_df.columns.str.replace("[.-]", "_").str.replace(" ", "_")
bulk_df.to_csv(f"{path}/log1p_norm_counts.csv")   

## Cells expressing Ifng or Cxcr3

In [None]:
# Seed labels from single marker genes: a cell is labelled "Interferon" when
# its Ifng value in the default layer exceeds 0.25, otherwise "Cxcr3" when its
# Cxcr3 value does, otherwise "unknown". The negative lists hold only an empty
# gene name, so they do not exclude any cell unless a gene with that name exists.
adata.obs["cell_type_seed"] = score_seeds(
    adata,
    {
        "Interferon": {
            "positive": [['Ifng']],
            "negative": [[""]],
        },
        "Cxcr3": {
            "positive": [['Cxcr3']],
            "negative": [[""]],
        }
   },
    cutoff_sum_pos_marker_expression=0.25,
)

In [None]:
adata.obs.cell_type_seed

In [None]:
adata.obs.cell_type_seed.value_counts()

In [None]:
sc.pl.umap(adata, color="cell_type_seed", groups="Interferon")

In [None]:
sc.pl.umap(
    adata,
    color=["cell_type_seed"],
    cmap="Reds",
    frameon=False,add_outline=True, vmax="p99", gene_symbols="gene_id"
)

## Highest expressed genes

In [None]:
adata

In [None]:
sc.tl.rank_genes_groups(
    adata, groupby="leiden", method="wilcoxon", key_added="dea_leiden"
)

In [None]:
sc.pl.rank_genes_groups_dotplot(
    adata, groupby="leiden", standard_scale="var", n_genes=5, key="dea_leiden",gene_symbols="gene_name"
)

In [None]:
sc.tl.rank_genes_groups(
    adata, groupby="cell_type", method="wilcoxon", key_added="dea_cell_type"
)

In [None]:
sc.pl.rank_genes_groups_dotplot(
    adata, groupby="cell_type", standard_scale="var", n_genes=5, key="dea_cell_type",gene_symbols="gene_name", cmap="seismic")#, save="rank_genes_groups_dotplot")


In [None]:
# NOTE(review): this reads the results stored under key "dea_cell_type" but
# groups the cells by "leiden" -- confirm the key/groupby mismatch is intended.
sc.pl.rank_genes_groups_dotplot(
    adata, groupby="leiden", standard_scale="var", n_genes=5, key="dea_cell_type",gene_symbols="gene_name", cmap="seismic")#, save="rank_genes_groups_dotplot")


In [None]:
adata.obs.cell_type

In [None]:
# Intentionally left without effect. Assigning a string to
# `adata.var.set_index` does not change the index; it only overwrites the
# DataFrame method on that object. The cells below resolve gene symbols
# through their gene_symbols argument, so re-indexing .var is not needed.
# adata.var.set_index = "gene_name"

In [None]:
# Wilcoxon ranking per leiden cluster, stored under the default key.
# NOTE(review): gene_symbols does not look like a documented parameter of
# sc.tl.rank_genes_groups and is presumably ignored for this method -- confirm.
sc.tl.rank_genes_groups(adata, "leiden",  method="wilcoxon", gene_symbols="gene_name")

In [None]:
adata_colon = adata[adata.obs["origin"]=="colon"]

In [None]:
adata_til = adata[adata.obs["origin"]=="til"]

In [None]:
sc.pl.highest_expr_genes(adata_colon, n_top=20,gene_symbols="gene_name")

In [None]:
sc.pl.highest_expr_genes(adata_til, n_top=20,gene_symbols="gene_name")

In [None]:
adata_11mix = adata[adata.obs["condition"]=="11mix"]

In [None]:
adata10mix = adata[adata.obs["condition"]=="10mix"]

In [None]:
adataGF= adata[adata.obs["condition"]=="GF"]

In [None]:
adata.obs["condition"].value_counts()

In [None]:
adataGFplus= adata[adata.obs["condition"]=="GF-plus"]

In [None]:
sc.pl.highest_expr_genes(adata_11mix, n_top=20,gene_symbols="gene_name")

In [None]:
sc.pl.highest_expr_genes(adata10mix, n_top=20,gene_symbols="gene_name")

In [None]:
sc.pl.highest_expr_genes(adataGF, n_top=20,gene_symbols="gene_name")

In [None]:
sc.pl.highest_expr_genes(adataGFplus, n_top=20,gene_symbols="gene_name")
