In [1]:
import anndata
import pickle
import pandas as pd
import numpy as np
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors

from copy import copy
from matplotlib import rc_context
from scipy.stats import norm
from scipy import sparse
from scipy import stats
from sklearn.neighbors import KernelDensity
from ete3 import Tree
from typing import Tuple

In [2]:
# scanpy configuration: figure output directory for this analysis and
# logging verbosity set to 0.
sc.settings.figdir = '/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/3_maturity/'
sc.settings.verbosity = 0

# Load the pickled colour lookup (presumably cluster name -> colour, judging by
# the file name; it is not used in the cells visible here - TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code, so this file must come
# from a trusted source.
clusterColorsFile = "/Genomics/chanlab/mchan/Adriano/TLS/TLS_TLSCL/20211102_clusterColorsTLSCL.p"
with open(clusterColorsFile, 'rb') as color_handle:
    colorDict = pickle.load(color_handle)

In [3]:
# Load the integrated object
TLS_integrated = sc.read_h5ad('/Genomics/chanlab/blaw/TLS/raw_data/scRNA/Integrated/TLS_TLSCL_1_2_Time_integrated.h5ad')

# Load the multiseq barcodes metadata
multiseq_barcodes = pd.read_csv('/Genomics/chanlab/blaw/TLS/metadata/multiseq_barcodes.txt', sep = '\t')

# Add the multiseq group numbers and whether the sample is a TLS or TLSCL (in the multiseq experiment)
# Barcodes with no entry in the lookups below (per the original note: the TLS replicates from
# TLS1, TLS2 and TIME) map to NaN in both columns, which separates them from the multiseq samples

# Barcode sequence -> group number as a string. The number is parsed from everything after the
# first three characters of the ID; the int round-trip strips any leading zeros.
multiseq_barcode_dict = {
    sequence: str(int(barcode_id[3:]))
    for sequence, barcode_id in zip(multiseq_barcodes['Sequence'], multiseq_barcodes['ID'])
}

# Lookup key -> 'TLS' / 'TLSCL'. The key is the third column of the metadata table
# (presumably the barcode sequence, since it is mapped against 'MultiSeqBCseq' - TODO confirm).
# It is selected by position with .iloc: an integer key on a label-indexed row (`row[1][2]`)
# relies on a positional fallback that pandas has deprecated.
barcode_keys = multiseq_barcodes.iloc[:, 2]
is_tlscl = multiseq_barcodes['TLS ID'].str.startswith('TLSCL')
TLS_TLSCL_dict = dict.fromkeys(barcode_keys, 'TLS')
TLS_TLSCL_dict.update(dict.fromkeys(barcode_keys[is_tlscl], 'TLSCL'))

# Doublet / Negative calls are passed through unchanged in both lookups
multiseq_barcode_dict['Doublet'] = 'Doublet'
multiseq_barcode_dict['Negative'] = 'Negative'
TLS_TLSCL_dict['Doublet'] = 'Doublet'
TLS_TLSCL_dict['Negative'] = 'Negative'

# Merge the multiseq group metadata with the entire integrated dataset
TLS_integrated.obs['MultiSeqGroup'] = TLS_integrated.obs['MultiSeqBCseq'].map(multiseq_barcode_dict)
TLS_integrated.obs['TLSCL'] = TLS_integrated.obs['MultiSeqBCseq'].map(TLS_TLSCL_dict)

# Load the monocle position values (indexed by the first column of the file) and
# rename the value column 'x' to 'pseudotime'
pseudotime = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/raw_data/monocle_results/TLS_TLSCL_1_2_Time_integrated_Monocle_Pos.txt',
    sep = '\t',
    index_col = 0,
).rename(columns = {'x': 'pseudotime'})

# pd.merge defaults to an inner join, which would drop every cell lacking a pseudotime value
# from obs. Fail early with a clear message instead of producing a truncated obs table.
cells_without_pseudotime = TLS_integrated.obs.index.difference(pseudotime.index)
if len(cells_without_pseudotime) > 0:
    raise ValueError(
        '{} cells in the integrated object have no Monocle pseudotime value'.format(len(cells_without_pseudotime))
    )

# Merge the pseudotime values into the integrated object (joined on the obs index)
TLS_integrated.obs = pd.merge(TLS_integrated.obs, pseudotime, left_index = True, right_index = True)

# Recenter the pseudotime value to put the NMP median in the center since both trajectories originate from NMPs
is_nmp_cell = TLS_integrated.obs['cell_state'] == 'NMPs'
NMP_median = np.median(TLS_integrated.obs.loc[is_nmp_cell, 'pseudotime'].values)
TLS_integrated.obs['NT-Somite_Traj'] = TLS_integrated.obs['pseudotime'] - NMP_median

# Investigate how the NMP density along the pseudotime axis changes over the time course

In [4]:
# Keep only the cells annotated as NMPs; .copy() is called on the subset so that
# NMPs is its own object rather than a slice of TLS_integrated.
nmp_cell_mask = TLS_integrated.obs['cell_state'] == 'NMPs'
NMPs = TLS_integrated[nmp_cell_mask].copy()

In [7]:
# NMP specific density plots
# Recentered pseudotime positions of the NMPs from each time point
TLS_96_NMP_pseudotime_values = NMPs.obs.loc[NMPs.obs['orig.ident'] == 'mGast_96h', 'NT-Somite_Traj']
TLS_108_NMP_pseudotime_values = NMPs.obs.loc[NMPs.obs['orig.ident'] == 'mGast_108h', 'NT-Somite_Traj']
TLS_120_NMP_pseudotime_values = NMPs.obs.loc[NMPs.obs['orig.ident'] == 'mGast_120h', 'NT-Somite_Traj']
fig, ax = plt.subplots(figsize = (10, 5))

# Attach the label to each curve as it is drawn. A positional ax.legend([...]) pairs labels
# with whatever lines exist on the axes, so if no curve is drawn for one time point
# (e.g. it has too few cells) the remaining curves would be mislabelled.
density_curves = (
    (TLS_96_NMP_pseudotime_values, 'red', 'TLS_96h'),
    (TLS_108_NMP_pseudotime_values, 'green', 'TLS_108h'),
    (TLS_120_NMP_pseudotime_values, 'blue', 'TLS_120h'),
)
for pseudotime_values, curve_color, curve_label in density_curves:
    sns.kdeplot(pseudotime_values, color = curve_color, label = curve_label, ax = ax)
ax.legend()
ax.set_xlim(-20, 20)
ax.set_ylim(0, 0.5)

plt.tight_layout()
plt.savefig('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/3_maturity/pseudotime_timecourse_NMP_distributions.pdf', dpi = 300)
# Close this specific figure so it is not kept open (or displayed inline) after saving
plt.close(fig)

In [8]:
# Plot known genes related to NMP maturity over the pseudotime

gene_list = ['Sox2', 'Sox11', 'Hes7', 'Rspo3', 'Nkx1-2', 'Cyp26a1', 'Cdx1', 'Cdx2']

# Edges of the unit-wide pseudotime bins used for the binned mean.
# np.arange(-10, 10, 1) ends at 9, so consecutive pairs give the bins [-10, -9) ... [8, 9).
avg_range = np.arange(-10, 10, 1)
nmp_pseudotime = NMPs.obs['NT-Somite_Traj'].to_numpy()

# For each gene: scatter the per-cell expression against the recentered pseudotime
# and overlay the mean expression of each pseudotime bin (drawn at the bin's left edge)
for gene in gene_list:
    fig, ax = plt.subplots(figsize = (10, 5))

    # Extract the gene's expression once as a flat dense vector (X may be sparse or dense)
    gene_expr = NMPs[:, gene].X
    gene_expr = gene_expr.toarray() if sparse.issparse(gene_expr) else np.asarray(gene_expr)
    gene_expr = gene_expr.ravel()

    # Pair every bin edge with the next one. This replaces indexing avg_range[i + 1] inside a
    # bare try/except, which ran off the end on the last edge and hid any other error too.
    # A bin containing no cells yields NaN, which leaves a gap in the line.
    avg_expr = [
        gene_expr[(nmp_pseudotime >= lower_edge) & (nmp_pseudotime < upper_edge)].mean()
        for lower_edge, upper_edge in zip(avg_range[:-1], avg_range[1:])
    ]

    ax.plot(avg_range[:-1], avg_expr, color = 'black')
    ax.plot(nmp_pseudotime, gene_expr, '.', markersize = 3, alpha = 0.1)
    ax.set_title('{} - All Cells'.format(gene))
    ax.set_ylabel('{} Expression'.format(gene))
    ax.set_xlabel('NT-Somite_Traj')
    ax.set_xlim(-10, 10)

    plt.tight_layout()
    plt.savefig('/Genomics/chanlab/blaw/TLS/data/integrated_scRNA/3_maturity/{}_expr_timecourse_NMPs.pdf'.format(gene), dpi = 300)
    # Close each figure explicitly so the loop does not accumulate open figures
    plt.close(fig)