In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc

from sys import path
from os.path import abspath
path.append(abspath("/home/ng136/nico"))
import ng_functions as ng

from glob import glob
from pathlib import Path
import warnings
from time import time
from tqdm import tqdm

# Load annotated untreated data

In [2]:
# Backup snapshot of the annotated, untreated dataset.
untreated_backup_h5ad = "/n/groups/klein/nico/neutrophils/backups/totalseq_exp2_neutrophils_untreated_annotated_embedding_5258x13126_backup_220422_15h46.h5ad"
ndata = sc.read(untreated_backup_h5ad)

# Keep a copy of the matrix that was loaded into X under the 'umi' layer
# (presumably raw UMI counts, judging by the layer name -- confirm), ...
ndata.layers['umi'] = ndata.X.copy()
# ... then make the stored 'log1p' layer the working matrix. Copies are used so
# that X and the layers never alias the same array.
ndata.X = ndata.layers['log1p'].copy()

# Filter out non-neutrophils

In [3]:
# State labels kept for the marker analysis (N1a through N6).
n_states = [
    'N1a', 'N1b',
    'N2', 'N3', 'N4', 'N5', 'N6',
]

In [4]:
# Boolean mask over cells: True where the 'smoothed_Zilionis' label is one of n_states.
in_selected_states = ndata.obs['smoothed_Zilionis'].isin(n_states)
# Subset to those cells; .copy() materialises the subset instead of keeping a view.
ndata = ndata[in_selected_states].copy()

# Find markers that best differentiate N1a~N6 populations
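* Filter markers:
    * Fold change > 2 (`min_fold_change=2`)
    * Expressed in ≥ 10% of the cells within the group (`min_in_group_fraction=0.1`)
    * Not expressed in ≥ 50% of the cells outside the group (`max_out_group_fraction=0.5`)
    * Appear in the top 500 ranked genes (in terms of p_adj) for exactly one state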
        
        

In [5]:
# Rank genes per 'smoothed_Zilionis' state with a Wilcoxon test on the 'log1p'
# layer (not .raw); results are stored in ndata.uns['wilcoxon_untreated_markers'].
sc.tl.rank_genes_groups(
    ndata,
    groupby='smoothed_Zilionis',
    groups=n_states,
    method='wilcoxon',
    use_raw=False,
    layer='log1p',
    key_added='wilcoxon_untreated_markers',
)

In [6]:
# Select up to MARKERS_PER_STATE marker genes per state:
#   1. apply the fold-change / expression-fraction filters to the Wilcoxon ranking,
#   2. keep the first TOP_N_RANKED rank positions of every state,
#   3. discard genes that show up for more than one state within those positions,
#   4. take the best-ranked survivors for each state.
TOP_N_RANKED = 500       # rank positions per state inspected for uniqueness
MARKERS_PER_STATE = 5    # maximum number of markers reported per state

# filter markers
# (scanpy keeps the table shape and sets entries that fail a threshold to NaN)
sc.tl.filter_rank_genes_groups(
    ndata,
    key='wilcoxon_untreated_markers',
    key_added='wilcoxon_untreated_markers_filtered',
    min_fold_change=2,
    min_in_group_fraction=0.1,
    max_out_group_fraction=0.5,
)

# top 500 markers from each label (one column per state, rows in rank order)
markers_raw = pd.DataFrame(ndata.uns['wilcoxon_untreated_markers_filtered']['names']).head(TOP_N_RANKED)

# filter markers found in top 500 of other subsets:
# every entry occurring more than once anywhere in the table is flagged
# (keep=False marks all occurrences, including the first one).
flat_names = markers_raw.values.reshape(-1)
is_repeated = pd.Series(flat_names).duplicated(keep=False).to_numpy()
duplicated_markers = flat_names[is_repeated]

# Hash-based lookup restricted to real gene names; the non-string placeholders
# left by the filter are rejected by the isinstance check below anyway.
shared_gene_names = {name for name in duplicated_markers if isinstance(name, str)}

# top 5 non duplicated markers that fulfill all previous filters for each label
# (a state ends up with fewer than 5 genes when not enough candidates survive)
untreated_markers = {
    state: [
        gene for gene in ranked_genes
        if isinstance(gene, str) and gene not in shared_gene_names
    ][:MARKERS_PER_STATE]
    for state, ranked_genes in markers_raw.to_dict('list').items()
}

# Flat list of all selected markers, in state order then rank order.
untreated_markers_all = [gene for genes in untreated_markers.values() for gene in genes]

In [7]:
# Display the selected markers: a dict mapping each state to at most 5 gene names.
untreated_markers

{'N1a': ['Wfdc21', 'Lcn2', 'Tmcc1', 'Mmp8'],
 'N1b': ['S100a6', 'Dusp1', 'Klf2', 'Taldo1', 'Lsp1'],
 'N2': ['Ifitm3', 'Isg15', 'Rsad2', 'Mxd1', 'Trim30a'],
 'N3': ['Ier3', 'Cxcl2', 'G0s2', 'Wfdc17', 'Tgm2'],
 'N4': ['Mrpl52', 'Rps29', 'Rps21', 'Rps26', 'AA467197'],
 'N5': ['Cstb', 'Ccl3', 'Atp6v0c', 'Cd63', 'Psap'],
 'N6': ['Lyz2', 'mt-Atp6', 'mt-Co2', 'mt-Co3', 'mt-Co1']}