# Filtering low-quality cells and genes

In [None]:
import scanpy as sc
import seaborn as sns
import anndata as ad
from anndata import AnnData
from typing import Optional
import numpy as np
import pandas as pd
import scvi
from scipy.stats import median_abs_deviation

In [None]:
# Input: merged colon + tumor AnnData file (per its file name, the denoised data).
adata_denoised_path= "/data/projects/2021/MicrobialMetabolites/single-cell-sorted-cd8/results/merged/colon_and_tumor_denoised_data.h5ad"

In [None]:
# Load the whole object into memory.
adata_denoised = sc.read_h5ad(adata_denoised_path)

In [None]:
# Overview of the loaded object.
adata_denoised

In [None]:
# Number of cells per sample.
adata_denoised.obs["sample_id"].value_counts()

In [None]:
# Gene-level annotation; the `gene_name` column is used below to flag mt/ribo genes.
adata_denoised.var

In [None]:
# Cell-level annotation.
adata_denoised.obs

In [None]:
# Mitochondrial genes: boolean flag for gene symbols that start with the lowercase prefix "mt-".
# NOTE(review): the lowercase prefix presumably matches the naming used in this dataset — confirm
# with the value counts in the next cell.
adata_denoised.var["mt"] = adata_denoised.var.gene_name.str.startswith("mt-")

In [None]:
# Sanity check: how many genes were flagged as mitochondrial.
adata_denoised.var["mt"].value_counts()

In [None]:
# Ribosomal genes: boolean flag for gene symbols that start with "Rps" or "Rpl".
# The two prefixes are tested separately and OR-ed because the second positional
# argument of Series.str.startswith is `na` (the fill value for missing names),
# not an additional prefix. `na=False` keeps the result strictly boolean.
adata_denoised.var["ribo"] = adata_denoised.var.gene_name.str.startswith(
    "Rps", na=False
) | adata_denoised.var.gene_name.str.startswith("Rpl", na=False)

In [None]:
# Sanity check: how many genes were flagged as ribosomal.
adata_denoised.var["ribo"].value_counts()

In [None]:
adata_denoised.var

In [None]:
# Compute QC metrics in place (results are written to .obs and .var).
# The columns used further down are total_counts, n_genes_by_counts, pct_counts_mt,
# their log1p_* variants, and pct_counts_in_top_20_genes (from percent_top=[20]).
sc.pp.calculate_qc_metrics(adata_denoised, qc_vars=["mt","ribo"], inplace=True, percent_top=[20], log1p=True)

In [None]:
# Inspect the per-gene metrics that were just added.
adata_denoised.var

In [None]:
# Inspect the per-cell metrics that were just added.
adata_denoised.obs

## PRE FILTER

In [None]:
# QC overview before any filtering:
# histogram of total counts per cell, ...
p1 = sns.displot(adata_denoised.obs["total_counts"], bins=100, kde=False)
# ... violin plot of total counts, ...
sc.pl.violin(adata_denoised, 'total_counts')
# ... violin plot of the mitochondrial percentage, ...
p2 = sc.pl.violin(adata_denoised, "pct_counts_mt")
# ... and total counts vs. detected genes, coloured by mitochondrial percentage.
p3 = sc.pl.scatter(adata_denoised, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

In [None]:
# Disabled: the same three metrics as violin plots split by sample.
#sc.pl.violin(adata_denoised, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
#             jitter=0.4, groupby = 'sample_id', rotation = 45);

## POST FILTER

In [None]:
# Hard thresholds, applied in place one criterion at a time:
# cell-level cutoffs first, gene-level cutoffs afterwards.
for cell_cutoff in ({"min_counts": 200}, {"min_genes": 200}, {"max_counts": 20000}):
    sc.pp.filter_cells(adata_denoised, **cell_cutoff)
for gene_cutoff in ({"min_cells": 10}, {"min_counts": 10}):
    sc.pp.filter_genes(adata_denoised, **gene_cutoff)

In [None]:
# Same QC overview as before, now on the threshold-filtered object:
# histogram of total counts per cell, ...
p1 = sns.displot(adata_denoised.obs["total_counts"], bins=100, kde=False)
# ... violin plot of total counts, ...
sc.pl.violin(adata_denoised, 'total_counts')
# ... violin plot of the mitochondrial percentage, ...
p2 = sc.pl.violin(adata_denoised, "pct_counts_mt")
# ... and total counts vs. detected genes, coloured by mitochondrial percentage.
p3 = sc.pl.scatter(adata_denoised, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

In [None]:
# Disabled: the same three metrics as violin plots split by sample.
#sc.pl.violin(adata_denoised, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
#             jitter=0.4, groupby = 'sample_id', rotation = 45)

In [None]:
# Make gene identifiers (var_names) unique.
adata_denoised.var_names_make_unique()

In [None]:
# Make cell identifiers (obs_names) unique; is_outlier() below aligns its result on them.
adata_denoised.obs_names_make_unique()

In [None]:
def is_outlier(adata: "AnnData", metric_col: str, *, groupby: Optional[str] = None, n_mads: float = 5) -> pd.Series:
    """Detect outliers by median absolute deviation (MAD).

    Adapted from https://www.sc-best-practices.org/preprocessing_visualization/quality_control.html#motivation

    Parameters
    ----------
    adata
        AnnData object
    metric_col
        column in adata.obs to consider
    groupby
        grouping variable. If specified, outliers will be determined for each value of this grouping variable separately.
        E.g. `dataset`.
    n_mads
        label points that are outside of median +/- nmads * MAD.

    Returns
    -------
    Boolean Series with the same index and row order as ``adata.obs``; ``True`` marks
    an outlier. Missing metric values (and rows whose group is missing) are never
    flagged, and they do not affect the median/MAD of the remaining cells.
    """
    metric_values = adata.obs[metric_col]

    def _mad(values):
        """Return the (unscaled) MAD of ``values``, ignoring NaNs."""
        return median_abs_deviation(values, nan_policy="omit")

    if groupby is None:
        center = metric_values.median()
        spread = _mad(metric_values)
    else:
        # `transform` broadcasts each group's statistic back onto the original rows,
        # so the result stays aligned with adata.obs without any index juggling
        # (works with duplicated obs names and does not depend on how a given pandas
        # version indexes the output of `groupby.apply`).
        per_group = metric_values.groupby(adata.obs[groupby], observed=True)
        center = per_group.transform("median")
        spread = per_group.transform(_mad)

    return (metric_values - center).abs() > n_mads * spread

In [None]:
# Per-sample MAD outlier flags (5 MADs) for library size, detected genes and
# the share of counts in the 20 most abundant genes.
for flag_col, metric in (
    ("is_outlier_counts", "log1p_total_counts"),
    ("is_outlier_genes", "log1p_n_genes_by_counts"),
    ("is_outlier_top_20", "pct_counts_in_top_20_genes"),
):
    adata_denoised.obs[flag_col] = is_outlier(adata_denoised, metric, n_mads=5, groupby="sample_id")

In [None]:
# Per-sample MAD outlier flag for the mitochondrial percentage.
# NOTE(review): this uses n_mads=1, far stricter than the 5 MADs used for the other
# metrics — confirm this cutoff is intended.
adata_denoised.obs["is_outlier_mito"] = is_outlier(adata_denoised, "pct_counts_mt", n_mads=1, groupby="sample_id")

In [None]:
# A cell is marked as an outlier only when at least two of the four per-metric flags are set.
outlier_flag_cols = [
    "is_outlier_counts",
    "is_outlier_genes",
    "is_outlier_top_20",
    "is_outlier_mito",
]
n_flags_per_cell = adata_denoised.obs[outlier_flag_cols].sum(axis=1)
adata_denoised.obs["is_outlier"] = n_flags_per_cell >= 2

In [None]:
# Overview of the object with the new outlier columns.
adata_denoised

In [None]:
# Number of cells flagged / not flagged by the combined criterion.
adata_denoised.obs["is_outlier"].value_counts()

In [None]:
# Keep only the cells that are not flagged; .copy() yields an independent object.
adata_filtered = adata_denoised[~adata_denoised.obs["is_outlier"]].copy()

In [None]:
# Same QC overview as before, now on the outlier-filtered object:
# histogram of total counts per cell, ...
p1 = sns.displot(adata_filtered.obs["total_counts"], bins=100, kde=False)
# ... violin plot of total counts, ...
sc.pl.violin(adata_filtered, 'total_counts')
# ... violin plot of the mitochondrial percentage, ...
p2 = sc.pl.violin(adata_filtered, "pct_counts_mt")
# ... and total counts vs. detected genes, coloured by mitochondrial percentage.
p3 = sc.pl.scatter(adata_filtered, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

In [None]:
# Use gene symbols as var_names.
# `gene_name` is a Categorical column; using it directly as the index makes
# var_names_make_unique() fail ("Cannot setitem on a Categorical with a new category"),
# so the symbols are converted to plain strings first. Passing a bare array also keeps
# the index unnamed, so it does not clash with the existing `gene_name` column.
adata_filtered.var_names = adata_filtered.var["gene_name"].astype(str).to_numpy()
# Duplicate gene symbols get a numeric suffix so every var_name is unique.
adata_filtered.var_names_make_unique()

## **ERROR adata_filtered.var_names_make_unique()**


TypeError: Cannot setitem on a Categorical with a new category (Ptp4a1-1), set the categories first

In [None]:
# Plot the 20 genes with the highest expression across the filtered cells.
sc.pl.highest_expr_genes(adata_filtered,n_top=20)

In [None]:
# Overview of the filtered object.
adata_filtered

## **Perform normalization on RAW counts or on denoised?**

In [None]:
# Replace X with a copy of the "denoised" layer; all steps below operate on these values.
adata_filtered.X = adata_filtered.layers["denoised"].copy()

In [None]:
adata_filtered.X.sum(axis=1) # each row is a cell: the row sum is the total count of that cell

In [None]:
# Normalize the counts in each cell so that the total counts add up to the same value (target_sum = 1e6).
sc.pp.normalize_total(adata_filtered, target_sum = 1e6)

In [None]:
adata_filtered.X.sum(axis=1) # every row now sums to the same value: 1,000,000 (the target_sum above)

In [None]:
# Convert the normalized values to log counts (log1p, applied in place to X)
sc.pp.log1p(adata_filtered) 
# and keep a copy of the log-normalized matrix as its own layer.
adata_filtered.layers["logcounts"] = adata_filtered.X.copy()

In [None]:
adata_filtered.X.sum(axis=1) # after the log transformation (not a linear transformation) the row sums are no longer identical.
# They are still comparable

In [None]:
# Freeze the data as it is now (log-normalized X, all remaining genes) in .raw,
# before filtering on variable genes, regressing etc.
adata_filtered.raw = adata_filtered

## **STILL NEED TO GET RID OF SOME MT GENES**

In [None]:
# Output directory for the filtered object.
out_dir = "/data/projects/2021/MicrobialMetabolites/single-cell-sorted-cd8/results/filtered"

In [None]:
# Experiment label used as the file-name prefix.
experiment = "colon_and_tumor"

In [None]:
# Write the final filtered object. X, .raw and layers["logcounts"] all hold the
# log-normalized values computed above.
# NOTE(review): the original comment said the file "contains raw and log counts" —
# confirm that an untouched raw-count layer is actually present in the object.
adata_filtered.write_h5ad(f"{out_dir}/{experiment}_adata_final_filtered.h5ad") 