In [27]:
import anndata
import pickle
import pandas as pd
import numpy as np
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors

from copy import copy
from scipy.stats import norm
from scipy import sparse
from scipy import stats
from sklearn.neighbors import KernelDensity
from ete3 import Tree
from typing import Tuple

In [15]:
# Silence scanpy logging and point its saved figures at the integrated-analysis sandbox
sc.settings.figdir = '/Genomics/chanlab/blaw/TLS/sandbox/scRNA/TLS_integrated/'
sc.settings.verbosity = 0

# Cluster colors are stored as a pickled object
# NOTE(review): pickle.load can execute code embedded in the file - only load pickles from a trusted source
clusterColorsFile = "/Genomics/chanlab/mchan/Adriano/TLS/TLS_TLSCL/20211102_clusterColorsTLSCL.p"
with open(clusterColorsFile,'rb') as fp:
    colorDict = pickle.load(fp)

# Barcode IDs of the TLS and TLSCL samples (used later to locate the per-barcode lineage trees)
TLS_barcodes = ['Bar{}'.format(number) for number in (1, 2, 4, 5, 7, 8, 10, 11, 13, 16, 19, 22)]
TLSCL_barcodes = ['Bar{}'.format(number) for number in (3, 6, 9, 12, 14, 15, 20, 21, 23, 24)]

# Read in the tables of the cell states for each cell, indexed by the 'cellBC' column
TLS1_cell_state_table = pd.read_csv('/Genomics/chanlab/blaw/TLS/LineageTracer/scRNAseq/TLS_120h_1_cellBC_cellState.tsv', sep='\t').set_index('cellBC')
TLS2_cell_state_table = pd.read_csv('/Genomics/chanlab/blaw/TLS/LineageTracer/scRNAseq/TLS_120h_2_cellBC_cellState.tsv', sep='\t').set_index('cellBC')
TLS_M_cell_state_table = pd.read_csv('/Genomics/chanlab/blaw/TLS/LineageTracer/scRNAseq/TLS_TLSCL_cellBC_cellState.tsv', sep='\t').set_index('cellBC')

# Load in the data and add metadata

In [3]:
# Load the integrated object
TLS_integrated = sc.read_h5ad('/Genomics/chanlab/blaw/TLS/raw_data/scRNA/Integrated/TLS_TLSCL_1_2_Time_integrated.h5ad')

# Load the multiseq barcodes metadata
multiseq_barcodes = pd.read_csv('/Genomics/chanlab/blaw/TLS/metadata/multiseq_barcodes.txt', sep='\t')

# Add the multiseq group numbers and whether the sample is a TLS or TLSCL (in the multiseq experiment)
# The TLS replicates from TLS1, TLS2, TIME ones are counted as NA in both of these categories to separate them from the multiseq

# Barcode sequence -> group number as a string: the 'ID' value with its first 3 characters
# removed and the remainder normalised through int() (presumably 'Bar12' -> '12')
multiseq_barcode_dict = {
    sequence: str(int(barcode_id[3:]))
    for sequence, barcode_id in zip(multiseq_barcodes['Sequence'], multiseq_barcodes['ID'])
}

# Third metadata column -> 'TLS' / 'TLSCL'. Every row starts as 'TLS' and the rows whose
# 'TLS ID' starts with 'TLSCL' are then overwritten.
# The column is selected by position with .iloc, which keeps the original positional meaning
# without relying on the deprecated integer fallback of Series.__getitem__.
# NOTE(review): presumably the third column is 'Sequence' - confirm and select it by name
third_column = multiseq_barcodes.iloc[:, 2]
is_TLSCL_row = multiseq_barcodes['TLS ID'].str.startswith('TLSCL')
TLS_TLSCL_dict = dict.fromkeys(third_column, 'TLS')
TLS_TLSCL_dict.update(dict.fromkeys(third_column[is_TLSCL_row], 'TLSCL'))

# Doublet and Negative calls map to themselves in both lookups
for unassigned_call in ('Doublet', 'Negative'):
    multiseq_barcode_dict[unassigned_call] = unassigned_call
    TLS_TLSCL_dict[unassigned_call] = unassigned_call

# Merge the multiseq group metadata with the entire integrated dataset
TLS_integrated.obs['MultiSeqGroup'] = TLS_integrated.obs['MultiSeqBCseq'].map(multiseq_barcode_dict)
TLS_integrated.obs['TLSCL'] = TLS_integrated.obs['MultiSeqBCseq'].map(TLS_TLSCL_dict)

# Load the monocle position values and rename the value column 'x' to 'pseudotime'
pseudotime = pd.read_csv('/Genomics/chanlab/blaw/TLS/raw_data/monocle_results/TLS_TLSCL_1_2_Time_integrated_Monocle_Pos.txt', sep = '\t', index_col = 0)
pseudotime = pseudotime.rename(columns = {'x': 'pseudotime'})

# Merge the pseudotime values into the integrated object (inner join on the cell index)
obs_with_pseudotime = pd.merge(TLS_integrated.obs, pseudotime, left_index = True, right_index = True)

# An inner join silently drops cells that have no pseudotime value, so fail loudly instead
cells_without_pseudotime = TLS_integrated.obs_names.difference(obs_with_pseudotime.index)
if len(cells_without_pseudotime) > 0:
    raise ValueError('{} cells in the integrated object have no pseudotime value'.format(len(cells_without_pseudotime)))

# NOTE(review): realigning to obs_names guards against the merge returning the rows in a
# different order than the cells of the integrated object
TLS_integrated.obs = obs_with_pseudotime.loc[TLS_integrated.obs_names]

# Recenter the pseudotime value to put the NMP median in the center since both trajectories originate from NMPs
is_NMP = TLS_integrated.obs['cell_state'] == 'NMPs'
NMP_median = np.median(TLS_integrated.obs.loc[is_NMP, 'pseudotime'].values)
TLS_integrated.obs['NT-Somite_Traj'] = TLS_integrated.obs['pseudotime'] - NMP_median

# Identify NMPs that are in the lineage datasets

In [12]:
# Newick files of the TLS1 and TLS2 lineage trees
TLS1_loc = '/Genomics/chanlab/blaw/TLS/data/AM-DNA-097/lineage/2_lineage_reconstruction/AM-DNA-097_hybrid_newick_noMutationlessEdges_Labeled.nwk'
TLS2_loc = '/Genomics/chanlab/blaw/TLS/data/AM-DNA-098/lineage/2_lineage_reconstruction/AM-DNA-098_hybrid_newick_noMutationlessEdges_Labeled.nwk'

# Open both trees
t1 = Tree(TLS1_loc, format=1)
t2 = Tree(TLS2_loc, format=1)

# Names of all the leaves (cells) on the TLS1 and TLS2 trees
TLS1_leaves = [cell.name for cell in t1.get_leaves()]
TLS2_leaves = [cell.name for cell in t2.get_leaves()]

# The TLS M and TLSCL samples have one tree per barcode; open each tree and pool its
# leaves into the list for its sample type
TLS_m_leaves = []
TLSCL_leaves = []

for barcode_group, pooled_leaves in ((TLS_barcodes, TLS_m_leaves), (TLSCL_barcodes, TLSCL_leaves)):
    for barcode in barcode_group:
        treeFile = "/Genomics/chanlab/blaw/TLS/data/AM-DNA-258/lineage/3_lineage_reconstruction/{}/{}_newick_noMutationlessEdges_Labeled.nwk".format(barcode, barcode)
        t = Tree(treeFile, format = 1)
        pooled_leaves.extend(cell.name for cell in t.get_leaves())

# Prefix the leaf names with their dataset name so that they can be used to index the integrated object
tree_cells = []
for dataset_prefix, leaf_names in (('TLS_120h_rep1_', TLS1_leaves),
                                   ('TLS_120h_rep2_', TLS2_leaves),
                                   ('TLS_TLSCL_', TLS_m_leaves),
                                   ('TLS_TLSCL_', TLSCL_leaves)):
    tree_cells.extend(dataset_prefix + leaf_name for leaf_name in leaf_names)

In [13]:
# Subset just the NMPs from the full object
NMPs = TLS_integrated[TLS_integrated.obs['cell_state'] == 'NMPs'].copy()

# Make a list of all the NMPs in the integrated object
NMP_index = list(NMPs.obs.index)

# Make a list of all the NMPs that appear on a tree, in the order of tree_cells.
# Membership is checked against a set: testing every tree cell against the NMP list
# rescans the whole list for each cell
NMP_lookup = set(NMP_index)
NMP_tree_cells = [cell for cell in tree_cells if cell in NMP_lookup]

# Copy out the subset of the NMP object whose cells are on a tree
tree_NMPs = NMPs[NMPs.obs.index.isin(NMP_tree_cells)].copy()

  df_sub[k].cat.remove_unused_categories(inplace=True)


# Label the NMPs by their neighboring cells on the tree

- Dropped <- NMPs that are neighbors with a cell type outside the neural and somitic lineages ('PGC', 'Endoderm', 'Endothelial'). Unassigned neighbors are +/-: their presence does not change the label
- Proliferating (labeled 'NMPs-Renewing' in the code) <- NMPs that are neighbors with only other NMPs (or nothing)
- Bipotent <- NMPs that are neighbors with both neural and somitic cells
- Somitic <- NMPs that are neighbors with only somitic cells (other NMPs are allowed)
- Neural <- NMPs that are neighbors with only neural cells (other NMPs are allowed)

Also capture the fraction of somitic and the fraction of neural neighbors so that we can score the NMP 'commitment'
- Commitment is scored as (# of somitic cells + 1) / (# of neural cells + 1); the +1 pseudocount keeps the ratio defined when there are no neural neighbors

In [16]:
# Cell states that make a sibling count as somitic, neural, or a reason to drop the NMP.
# Siblings with any other cell state are ignored.
SOMITIC_STATES = ('pPSM', 'aPSM', 'Somite', 'Somite0', 'Somite1', 'Somite-1', 'SomiteSclero', 'SomiteDermo')
NEURAL_STATES = ('NeuralTube1', 'NeuralTube2')
# NOTE(review): 'PCGLC' may be a misspelling of 'PGCLC' - confirm against the labels in the cell state tables
DROP_STATES = ('Endothelial', 'Endoderm', 'PCGLC')

# NMP type for each combination of sibling groups; every other combination is 'Dropped'
NMP_TYPE_BY_SIBLING_GROUPS = {
    frozenset(['Neural', 'NMPs']): 'NMPs-Neural',
    frozenset(['Somitic', 'NMPs']): 'NMPs-Somitic',
    frozenset(['Neural', 'Somitic', 'NMPs']): 'NMPs-Bipotent',
    frozenset(['NMPs']): 'NMPs-Renewing',
}


def sibling_group(cell_state):
    '''Return the group ('Somitic', 'Neural', 'NMPs', 'Drop') a cell state belongs to, or None if it is ignored.'''
    if cell_state in SOMITIC_STATES:
        return 'Somitic'
    if cell_state in NEURAL_STATES:
        return 'Neural'
    if cell_state == 'NMPs':
        return 'NMPs'
    if cell_state in DROP_STATES:
        return 'Drop'
    return None


def classify_tree_NMPs(tree, cell_state_table, cell_prefix, adata):
    '''
    Label every NMP leaf of a tree by the cell states of its sibling leaves.
    input:
        tree - an ete3 Tree whose leaf names are in the index of cell_state_table
        cell_state_table - a table indexed by leaf name with a 'cell_state' column
        cell_prefix - the string put in front of a leaf name to get its label in adata.obs
        adata - the object whose obs table receives the results
    output:
        nothing is returned; for each NMP leaf the 'frac_Somitic', 'frac_Neural',
        'Somite_Neural_ratio' and 'NMP Type' columns of adata.obs are filled in
    '''
    for node in tree.traverse():
        # only internal nodes have children to compare
        if node.is_leaf():
            continue

        # Siblings are the leaves that are direct children of the same parent node.
        # The summary is the same for every NMP under this node, so it is computed once.
        leaves = [child.name for child in node.children if child.is_leaf()]
        leaf_states = [cell_state_table.loc[leaf, 'cell_state'] for leaf in leaves]
        leaf_groups = [sibling_group(state) for state in leaf_states]
        if 'NMPs' not in leaf_groups:
            continue

        somitic_count = leaf_groups.count('Somitic')
        neural_count = leaf_groups.count('Neural')
        states = set(leaf_groups) - {None}
        nmp_type = NMP_TYPE_BY_SIBLING_GROUPS.get(frozenset(states), 'Dropped')

        for leaf, group in zip(leaves, leaf_groups):
            if group != 'NMPs':
                continue
            # NOTE(review): if this cell is not in adata.obs, .loc assignment may add a row
            # instead of failing - confirm every tree NMP is present in the object
            cell = cell_prefix + leaf
            # the fractions are taken over all sibling leaves, the NMPs themselves included
            adata.obs.loc[cell, 'frac_Somitic'] = somitic_count / len(leaves)
            adata.obs.loc[cell, 'frac_Neural'] = neural_count / len(leaves)
            # +1 on both counts keeps the ratio defined when there are no neural siblings
            adata.obs.loc[cell, 'Somite_Neural_ratio'] = (somitic_count + 1) / (neural_count + 1)
            adata.obs.loc[cell, 'NMP Type'] = nmp_type


# Add the columns in the tree_NMPs table that classify the NMPs.
# The numeric columns start as floats because fractions and ratios are written into them
tree_NMPs.obs['NMP Type'] = ''
tree_NMPs.obs['frac_Somitic'] = 0.0
tree_NMPs.obs['frac_Neural'] = 0.0
tree_NMPs.obs['Somite_Neural_ratio'] = 0.0

# Populate the new columns from the TLS1 and TLS2 trees
classify_tree_NMPs(t1, TLS1_cell_state_table, 'TLS_120h_rep1_', tree_NMPs)
classify_tree_NMPs(t2, TLS2_cell_state_table, 'TLS_120h_rep2_', tree_NMPs)
                    

In [17]:
# Cell states that make a sibling count as somitic, neural, or a reason to drop the NMP;
# siblings with any other cell state are ignored
somitic_labels = ('pPSM', 'aPSM', 'Somite', 'Somite0', 'Somite1', 'Somite-1', 'SomiteSclero', 'SomiteDermo')
neural_labels = ('NeuralTube1', 'NeuralTube2')
# NOTE(review): 'PCGLC' may be a misspelling of 'PGCLC' - confirm against the labels in the cell state table
dropped_labels = ('Endoderm', 'Endothelial', 'PCGLC')

# NMP type for each combination of sibling groups; every other combination is 'Dropped'
nmp_type_lookup = {
    frozenset(['Neural', 'NMPs']): 'NMPs-Neural',
    frozenset(['Somitic', 'NMPs']): 'NMPs-Somitic',
    frozenset(['Neural', 'Somitic', 'NMPs']): 'NMPs-Bipotent',
    frozenset(['NMPs']): 'NMPs-Renewing',
}

# Repeat the NMP classification on the tree of every TLS and TLSCL barcode
for barcode in TLS_barcodes + TLSCL_barcodes:
    treeFile = "/Genomics/chanlab/blaw/TLS/data/AM-DNA-258/lineage/3_lineage_reconstruction/{}/{}_newick_noMutationlessEdges_Labeled.nwk".format(barcode, barcode)
    t = Tree(treeFile, format = 1)

    for node in t.traverse():
        # leaves have no children to classify
        if node.is_leaf():
            continue

        # Siblings are the leaves that are direct children of this node
        sibling_names = [sub.name for sub in node.children if sub.is_leaf()]
        sibling_states = [TLS_M_cell_state_table.loc[name, 'cell_state'] for name in sibling_names]

        # Nothing to label unless at least one of the sibling leaves is an NMP
        nmp_siblings = [name for name, state in zip(sibling_names, sibling_states) if state == 'NMPs']
        if not nmp_siblings:
            continue

        n_somitic = sum(1 for state in sibling_states if state in somitic_labels)
        n_neural = sum(1 for state in sibling_states if state in neural_labels)

        # The set of groups seen among the siblings decides the NMP type
        groups_seen = {'NMPs'}
        if n_somitic > 0:
            groups_seen.add('Somitic')
        if n_neural > 0:
            groups_seen.add('Neural')
        if any(state in dropped_labels for state in sibling_states):
            groups_seen.add('Drop')
        nmp_type = nmp_type_lookup.get(frozenset(groups_seen), 'Dropped')

        # Every NMP under this node shares the same siblings, so they all get the same values
        for name in nmp_siblings:
            cell = 'TLS_TLSCL_' + name
            tree_NMPs.obs.loc[cell, 'frac_Somitic'] = n_somitic / len(sibling_names)
            tree_NMPs.obs.loc[cell, 'frac_Neural'] = n_neural / len(sibling_names)
            tree_NMPs.obs.loc[cell, 'Somite_Neural_ratio'] = (n_somitic + 1) / (n_neural + 1)
            tree_NMPs.obs.loc[cell, 'NMP Type'] = nmp_type

In [21]:
# Print how many tree NMPs of each type come from each dataset
observed_types = tree_NMPs.obs['NMP Type'].unique()
for j in ['TLS_120h_rep1', 'TLS_120h_rep2', 'TLS_TLSCL']:
    print(j)
    in_dataset = tree_NMPs.obs['orig.ident'] == j
    for i in observed_types:
        has_type = tree_NMPs.obs['NMP Type'] == i
        print('\t', i, int((in_dataset & has_type).sum()))

TLS_120h_rep1
	 NMPs-Renewing 29
	 NMPs-Neural 13
	 NMPs-Somitic 24
	 NMPs-Bipotent 24
	 Dropped 32
TLS_120h_rep2
	 NMPs-Renewing 1
	 NMPs-Neural 6
	 NMPs-Somitic 3
	 NMPs-Bipotent 0
	 Dropped 3
TLS_TLSCL
	 NMPs-Renewing 1032
	 NMPs-Neural 299
	 NMPs-Somitic 330
	 NMPs-Bipotent 52
	 Dropped 7


# DE between NMP types

In [22]:
def add_names(index, adata):
    '''
    Look up the value in the first column of adata.var for one row.
    input:
        index - a label in the index of adata.var
        adata - an object with a .var table
    output:
        the value stored in the first column of the adata.var row labelled index
        (NOTE(review): presumably the gene symbol column - confirm the column order of .var)
    '''
    # .iloc states the positional lookup explicitly; a bare [0] on a label-indexed row
    # relies on the deprecated integer-position fallback of Series.__getitem__
    return adata.var.loc[index].iloc[0]

def add_avg_expr(index, scipy_sparse):
    '''
    Average one column of a sparse matrix over all of its rows.
    input:
        index - the position of the column; anything that int() accepts
        scipy_sparse - a 2D scipy sparse matrix
    output:
        the mean of the selected column
    '''
    column_position = int(index)
    selected_column = scipy_sparse[:, column_position]
    return selected_column.mean()

def add_frac_expressed(index, scipy_sparse):
    '''
    Compute the fraction of rows that have a nonzero value in one column of a sparse matrix.
    input:
        index - the position of the column; anything that int() accepts
        scipy_sparse - a 2D scipy sparse matrix
    output:
        the number of nonzero entries in the column divided by the number of rows
    '''
    selected_column = scipy_sparse[:, int(index)]
    return selected_column.count_nonzero() / scipy_sparse.shape[0]

In [24]:
def _plot_volcano(de_table, logfc_column, expressed, title, out_file, logfc_cutoff, neglog10_pval_cutoff, label_genes):
    '''Draw one volcano plot (logfc_column vs -log10 of 'pvals') from a DE table and save it to out_file.'''
    # A dedicated figure keeps the plot from landing on whatever figure happens to be current
    fig, ax = plt.subplots()

    # Background: every gene with a nonzero average expression in both groups
    x = de_table.loc[expressed, logfc_column].values
    y = -np.log10(de_table.loc[expressed, 'pvals']).values
    ax.plot(x, y, '.', alpha = 0.3, color = 'lightgray')

    # Highlight the genes beyond the fold change cutoff (in either direction) and the p-value cutoff
    highlighted = (np.abs(x) > logfc_cutoff) & (y > neglog10_pval_cutoff)
    ax.plot(x[highlighted], y[highlighted], '.')

    # Label the requested genes; genes that are missing from the table or that have a
    # non-finite coordinate are skipped instead of raising
    for gene in label_genes:
        gene_rows = de_table[de_table['names'] == gene]
        for gene_x, gene_y in zip(gene_rows[logfc_column], -np.log10(gene_rows['pvals'])):
            if np.isfinite(gene_x) and np.isfinite(gene_y):
                ax.annotate(text = gene, xy = (gene_x, gene_y))

    ax.set_xlabel('LogFC')
    ax.set_ylabel('-log10(pval)')
    ax.set_title(title)
    fig.savefig(out_file, dpi = 300)
    plt.close(fig)


def calculateDE(adata, column, comparisons, output_loc, sampleName, logfc_cutoff = 1, neglog10_pval_cutoff = 1.2, label_genes = ('Rspo3', 'Fgf8', 'T')):
    '''
    Run a wilcoxon differential expression test between 2 groups of cells and save the plots and tables.
    input:
        adata - a scanpy object to run the analysis on; its .raw values are used
        column - a column in the adata.obs table that will be used to separate the data into 2 categories
        comparisons - a tuple that contains the 2 categories from the column to separate the data. The second is the ref name
        output_loc - a string that points to a valid folder to save the files; it must end with '/' and
                     contain the 'plots/' and 'gene_lists/' folders that the files are written to
        sampleName - a name for the dataset, used in the file names and plot titles
        logfc_cutoff - the absolute log fold change a gene must exceed to be highlighted / saved as up or downregulated
        neglog10_pval_cutoff - the -log10(pval) a gene must exceed to be highlighted / saved as up or downregulated
        label_genes - the gene names to annotate on the volcano plots
    output:
        a volcano plot of the resulting gene expression data (logfc vs -log10pval), once with the
            scanpy log fold changes and once with the manually calculated ones
        a plot of the scanpy outputs from the differential expression (top genes by score)
        tables that store the differential expression values for each gene (all genes, upregulated, downregulated)
    Nothing is returned. The global sc.settings.figdir and sc.settings.verbosity are changed.
    '''
    # NOTE(review): a -log10(pval) cutoff of 1.2 is p < ~0.063 (p < 0.05 would be ~1.3) and it is
    # applied to the unadjusted p-values - confirm that this is intended

    # set the scanpy figure directory to output_loc
    sc.settings.figdir = output_loc + 'plots/'
    sc.settings.verbosity = 0

    upName = comparisons[0]
    downName = comparisons[1]

    # keep only the cells that belong to one of the 2 categories being compared
    temp_adata = adata[(adata.obs[column].isin(comparisons))].copy()

    # rebuild the object from the raw values, then normalize each cell to 1e4 counts and log transform
    # NOTE(review): scanpy documents normalize_per_cell as deprecated in favor of normalize_total -
    # confirm that the outputs match before switching
    temp_adata = temp_adata.raw.to_adata()
    sc.pp.normalize_per_cell(temp_adata, counts_per_cell_after=1e4)
    sc.pp.log1p(temp_adata)

    sc.tl.rank_genes_groups(temp_adata, column, method='wilcoxon', groups = [upName], reference = downName, key_added = "wilcoxon", gene_symbols = '_index')
    sc.pl.rank_genes_groups(temp_adata, n_genes=25, sharey=False, key="wilcoxon", gene_symbols = '_index', save = '_{}_vs_{}_{}'.format(upName, downName, sampleName), show = False)

    # record the information from the logfc table; only one group was tested, so the
    # first entry of every record is the value for upName
    de_results = temp_adata.uns['wilcoxon']

    def first_group(field):
        return [record[0] for record in de_results[field].tolist()]

    # save it to a df
    temp_df = pd.DataFrame({
        'name_index': first_group('names'),
        'scores': first_group('scores'),
        'pvals': first_group('pvals'),
        'pvals_adj': first_group('pvals_adj'),
        'logfoldchanges': first_group('logfoldchanges'),
    })
    temp_df['names'] = temp_df['name_index'].apply(lambda x: add_names(x, temp_adata))

    # record the avg and frac expr of each gene in the up group and then in the reference group
    # NOTE(review): add_avg_expr / add_frac_expressed use int(name_index) as a column position -
    # this assumes that the var names are the positions of the genes; confirm
    for groupName in (upName, downName):
        group_scipy_sparse = sparse.csr_matrix(temp_adata[temp_adata.obs[column] == groupName].X)
        temp_df[groupName + '_avg_expr'] = temp_df['name_index'].apply(lambda x: add_avg_expr(x, group_scipy_sparse))
        temp_df[groupName + '_frac_expr'] = temp_df['name_index'].apply(lambda x: add_frac_expressed(x, group_scipy_sparse))

    temp_df['frac_diff'] = abs(temp_df[upName + '_frac_expr'] - temp_df[downName + '_frac_expr'])
    temp_df['-log10pval'] = -np.log10(temp_df['pvals'])
    # the 0.01 offset keeps the log defined for genes with an average expression of 0
    # NOTE(review): the averages are taken after log1p, so this is a fold change of log values - confirm intended
    temp_df['manual_LogFC'] = np.log2(temp_df[upName + '_avg_expr'] + 0.01) - np.log2(temp_df[downName + '_avg_expr'] + 0.01)

    # save the df, plus the subsets that pass the cutoffs in each direction
    passes_pval = temp_df['-log10pval'] > neglog10_pval_cutoff
    temp_df.to_csv(output_loc + 'gene_lists/{}_{}_{}_wilcox_DE_full.csv'.format(upName, downName, sampleName), index = False)
    temp_df[(temp_df['logfoldchanges'] > logfc_cutoff) & passes_pval].to_csv(output_loc + 'gene_lists/{}_{}_{}_wilcox_DE_upregulated.csv'.format(upName, downName, sampleName), index = False)
    temp_df[(temp_df['logfoldchanges'] < -logfc_cutoff) & passes_pval].to_csv(output_loc + 'gene_lists/{}_{}_{}_wilcox_DE_downregulated.csv'.format(upName, downName, sampleName), index = False)

    # only genes with a nonzero average expression in both groups are drawn on the volcano plots
    expressed = (temp_df[upName + '_avg_expr'] != 0) & (temp_df[downName + '_avg_expr'] != 0)

    # Plot the volcano plot of the differential expression
    _plot_volcano(temp_df,
                  'logfoldchanges',
                  expressed,
                  '{} vs {} - {} Differential Expression'.format(upName, downName, sampleName),
                  output_loc + 'plots/{}_vs_{}_{}_volcano.pdf'.format(upName, downName, sampleName),
                  logfc_cutoff,
                  neglog10_pval_cutoff,
                  label_genes)

    # Plot the volcano plot of the differential expression using manual LogFC
    _plot_volcano(temp_df,
                  'manual_LogFC',
                  expressed,
                  '{} vs {} - {} Differential Expression - Manual LogFC'.format(upName, downName, sampleName),
                  output_loc + 'plots/{}_vs_{}_{}_volcano_man_LogFC.pdf'.format(upName, downName, sampleName),
                  logfc_cutoff,
                  neglog10_pval_cutoff,
                  label_genes)

In [29]:
# Keep only the tree NMPs that come from the TLS1 and TLS2 datasets
from_TLS12 = tree_NMPs.obs['orig.ident'].isin(['TLS_120h_rep1', 'TLS_120h_rep2'])
tree_NMPs_TLS12 = tree_NMPs[from_TLS12].copy()

# Run DE between each pair of NMP types in TLS1 and TLS2; the second type of a pair is the reference
NMP_type_pairs_TLS12 = [
    ('NMPs-Somitic', 'NMPs-Neural'),
    ('NMPs-Bipotent', 'NMPs-Neural'),
    ('NMPs-Renewing', 'NMPs-Neural'),
    ('NMPs-Somitic', 'NMPs-Bipotent'),
    ('NMPs-Renewing', 'NMPs-Bipotent'),
    ('NMPs-Somitic', 'NMPs-Renewing'),
]
for type_pair in NMP_type_pairs_TLS12:
    calculateDE(adata = tree_NMPs_TLS12,
                column = 'NMP Type',
                comparisons = type_pair,
                output_loc = '/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/1_DE/TLS1_2_NMP_type/',
                sampleName = 'TLS12')

  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
... storing 'NMP Type' as categorical
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
... storing 'NMP Type' as categorical
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
... storing 'NMP Type' as categorical
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
... storing 'NMP Type' as categorical
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_

In [30]:
# Keep only the tree NMPs from the multiseq dataset that are marked as TLS
from_multiseq = tree_NMPs.obs['orig.ident'] == 'TLS_TLSCL'
marked_TLS = tree_NMPs.obs['TLSCL'] == 'TLS'
tree_NMPs_M = tree_NMPs[from_multiseq & marked_TLS].copy()

# Run DE between each pair of NMP types in TLS M; the second type of a pair is the reference
NMP_type_pairs_M = [
    ('NMPs-Somitic', 'NMPs-Neural'),
    ('NMPs-Bipotent', 'NMPs-Neural'),
    ('NMPs-Renewing', 'NMPs-Neural'),
    ('NMPs-Somitic', 'NMPs-Bipotent'),
    ('NMPs-Renewing', 'NMPs-Bipotent'),
    ('NMPs-Somitic', 'NMPs-Renewing'),
]
for type_pair in NMP_type_pairs_M:
    calculateDE(adata = tree_NMPs_M,
                column = 'NMP Type',
                comparisons = type_pair,
                output_loc = '/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/1_DE/TLS1_2_NMP_type/',
                sampleName = 'TLS_M')

  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
... storing 'NMP Type' as categorical
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
... storing 'NMP Type' as categorical
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
... storing 'NMP Type' as categorical
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
... storing 'NMP Type' as categorical
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_

In [31]:
# Keep only the tree NMPs from the multiseq dataset that are marked as TLSCL and
# that were assigned one of the four NMP types
from_multiseq = tree_NMPs.obs['orig.ident'] == 'TLS_TLSCL'
marked_TLSCL = tree_NMPs.obs['TLSCL'] == 'TLSCL'
has_NMP_type = tree_NMPs.obs['NMP Type'].isin(['NMPs-Renewing', 'NMPs-Neural', 'NMPs-Somitic', 'NMPs-Bipotent'])
tree_NMPs_TLSCL = tree_NMPs[from_multiseq & marked_TLSCL & has_NMP_type].copy()

# Run DE between each pair of NMP types in TLSCL; the second type of a pair is the reference
NMP_type_pairs_TLSCL = [
    ('NMPs-Somitic', 'NMPs-Neural'),
    ('NMPs-Bipotent', 'NMPs-Neural'),
    ('NMPs-Renewing', 'NMPs-Neural'),
    ('NMPs-Somitic', 'NMPs-Bipotent'),
    ('NMPs-Renewing', 'NMPs-Bipotent'),
    ('NMPs-Somitic', 'NMPs-Renewing'),
]
for type_pair in NMP_type_pairs_TLSCL:
    calculateDE(adata = tree_NMPs_TLSCL,
                column = 'NMP Type',
                comparisons = type_pair,
                output_loc = '/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/1_DE/TLS1_2_NMP_type/',
                sampleName = 'TLSCL')

  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
... storing 'NMP Type' as categorical
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
... storing 'NMP Type' as categorical
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
... storing 'NMP Type' as categorical
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_unused_categories(inplace=True)
... storing 'NMP Type' as categorical
  df_sub[k].cat.remove_unused_categories(inplace=True)
  df_sub[k].cat.remove_