In [1]:
import sys
import os
import pandas as pd
import numpy as np
import cassiopeia as cas
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Sample label for this run and the preprocessing directory that holds the
# input tables read below (later cells also write their outputs here).
name = 'PZ-2594_Tracer_Outgrowth_1'
output_dir = '/Genomics/chanlab/blaw/TLS/data/explant/lineage/1_preprocessing/PZ-2594_tracer_outgrowth_1/'

# Both tables are tab-separated files that live inside output_dir. Building the
# paths from output_dir (instead of repeating the full literal) resolves to the
# same files and keeps the sample location defined in exactly one place.
umi_table = pd.read_csv(os.path.join(output_dir, 'umi_table_error_correct.txt'), sep='\t')

allele_table = pd.read_csv(os.path.join(output_dir, 'allele_table.txt'), sep='\t')

In [5]:
# Total readCount for every (cellBC, UMI) pair.
reads_per_umi = umi_table.groupby(['cellBC', 'UMI'])['readCount'].sum()

# Number of umi_table rows per cellBC and per (cellBC, intBC) pair, as plain
# arrays; sort=False keeps the groups in order of first appearance.
umis_per_cellBC = umi_table.groupby("cellBC", sort=False).size().to_numpy()
umis_per_intBC = umi_table.groupby(["cellBC", "intBC"], sort=False).size().to_numpy()

# np.histogram returns a (counts, bin_edges) tuple; the plotting cells index
# into these tuples.
reads_per_umi_hist = np.histogram(reads_per_umi, bins=100)
umis_per_cellBC_hist = np.histogram(umis_per_cellBC, bins=50)
umis_per_intBC_hist = np.histogram(umis_per_intBC, bins=50)

In [12]:
# Bar chart of the precomputed reads-per-UMI histogram, with log-scaled counts.
read_counts, read_bin_edges = reads_per_umi_hist

# plt.subplots returns a (figure, axes) pair; unpack it so `fig` is the Figure.
fig, ax = plt.subplots(dpi=100)

# Draw every bar from its left bin edge using the bin's real width. The default
# (width 0.8, centred on x) does not line up with the histogram bins.
ax.bar(read_bin_edges[:-1], read_counts, width=np.diff(read_bin_edges), align='edge')
ax.set_title(f'{name} - Reads per UMI')
ax.set_ylabel('UMI Count')
ax.set_xlabel('Number of Reads')
ax.set_yscale('log')

# savefig does not create missing directories, so make sure QC/ exists first.
qc_dir = os.path.join(output_dir, 'QC')
os.makedirs(qc_dir, exist_ok=True)
fig.savefig(os.path.join(qc_dir, 'QC_reads_per_UMI.png'))
plt.close(fig)

In [13]:
# Rank plot: UMI counts per cellBC sorted from highest to lowest, log-log axes.
# plt.subplots returns a (figure, axes) pair; unpack it so `fig` is the Figure.
fig, ax = plt.subplots(dpi=100)

# Ranks start at 1: a rank of 0 has no position on a log-scaled x axis, so the
# highest-count cellBC would otherwise be lost from the curve.
x_values = np.arange(1, len(umis_per_cellBC) + 1)

ax.plot(x_values, np.sort(umis_per_cellBC)[::-1], '-')
ax.set_title(f'{name} - UMI per CellBC')
ax.set_ylabel('Number of UMI')
ax.set_xlabel('Rank Order')
ax.set_xscale('log')
ax.set_yscale('log')

# savefig does not create missing directories, so make sure QC/ exists first.
qc_dir = os.path.join(output_dir, 'QC')
os.makedirs(qc_dir, exist_ok=True)
fig.savefig(os.path.join(qc_dir, 'QC_UMI_per_cellBC.png'))
plt.close(fig)

In [14]:
# Bar chart of the precomputed UMIs-per-intBC histogram, with log-scaled counts.
intBC_counts, intBC_bin_edges = umis_per_intBC_hist

# plt.subplots returns a (figure, axes) pair; unpack it so `fig` is the Figure.
fig, ax = plt.subplots(dpi=100)

# Use the actual bin widths and left-edge alignment instead of a fixed width of
# 10, which only matches the bins when the data range happens to be 500 wide.
ax.bar(intBC_bin_edges[:-1], intBC_counts, width=np.diff(intBC_bin_edges), align='edge')
ax.set_title(f'{name} - UMI per intBC')
ax.set_ylabel('intBC Count')
ax.set_xlabel('Number of UMIs')
ax.set_yscale('log')

# savefig does not create missing directories, so make sure QC/ exists first.
qc_dir = os.path.join(output_dir, 'QC')
os.makedirs(qc_dir, exist_ok=True)
fig.savefig(os.path.join(qc_dir, 'QC_UMI_per_intBC.png'))
plt.close(fig)

In [10]:
# Long-form table with one row per (cellBC, intBC) pair and a 'count' column
# holding the number of umi_table rows in that pair.
umis_per_intBC_df = (
    umi_table
    .groupby(["cellBC", "intBC"])
    .size()
    .reset_index(name='count')
)

In [15]:
# Violin plot of the per-cell UMI count distribution, one violin per intBC.
fig, ax = plt.subplots(figsize=(10, 4), dpi=300)

# scale='count' sizes each violin by its number of observations; cut=0 stops
# the density at the observed min/max instead of extending past the data.
# NOTE(review): newer seaborn releases appear to rename `scale` to
# `density_norm` - confirm against the installed version before upgrading.
sns.violinplot(ax=ax, data=umis_per_intBC_df, x='intBC', y='count',
               scale='count', cut=0, color='skyblue')

# The title previously repeated the sample prefix ("PZ-2594_PZ-2594_...");
# build it from `name` so it matches the other QC figures.
ax.set_title(f'{name} - UMIs per intBC')
# Rotate the existing tick labels in place rather than re-setting their text.
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel('UMIs per intBC')

# savefig does not create missing directories, so make sure QC/ exists first.
qc_dir = os.path.join(output_dir, 'QC')
os.makedirs(qc_dir, exist_ok=True)
fig.savefig(os.path.join(qc_dir, 'QC_UMI_per_Target_Site.png'), bbox_inches='tight')
plt.close(fig)

# Filter the UMI and allele tables to the cellBCs that are present in the single-cell object

In [16]:
# Tab-separated metadata table; only its 'cellBC' column is used here.
test = pd.read_csv('/Genomics/chanlab/blaw/TLS/metadata/AM-RNA-930_cellBC_cellState.tsv', sep = '\t')

# Keep each metadata barcode without its first 19 and last 2 characters.
# NOTE(review): presumably this strips a sample prefix and a trailing suffix so
# the values match the cellBC format of the lineage tables - confirm.
testCellBC = [barcode[19:-2] for barcode in test['cellBC']]

In [17]:
# Keep only allele-table rows whose cellBC appears in the trimmed metadata
# barcodes; .copy() makes the result independent of allele_table.
allele_table_filtered = allele_table.loc[allele_table['cellBC'].isin(testCellBC)].copy()

In [18]:
# Shape of the array of distinct cellBC values that survived the filter.
pd.unique(allele_table_filtered['cellBC']).shape

(13498,)

In [19]:
# Write the filtered allele table next to the input tables as a tab-separated
# file. to_csv's default index=True applies, so the row index is written as the
# first column.
# NOTE(review): confirm downstream readers expect that extra index column.
allele_table_filtered.to_csv(f"{output_dir}{'allele_table_filtered.txt'}", sep = '\t')

In [20]:
# Keep only UMI-table rows whose cellBC appears in the trimmed metadata
# barcodes; .copy() makes the result independent of umi_table.
umi_table_filtered = umi_table.loc[umi_table['cellBC'].isin(testCellBC)].copy()

In [21]:
# Total readCount for every (cellBC, UMI) pair in the filtered table.
reads_per_umi_filtered = umi_table_filtered.groupby(['cellBC', 'UMI'])['readCount'].sum()

# Number of filtered rows per cellBC and per (cellBC, intBC) pair, as plain
# arrays; sort=False keeps the groups in order of first appearance.
umis_per_cellBC_filtered = umi_table_filtered.groupby("cellBC", sort=False).size().to_numpy()
umis_per_intBC_filtered = umi_table_filtered.groupby(["cellBC", "intBC"], sort=False).size().to_numpy()

# np.histogram returns a (counts, bin_edges) tuple; the plotting cells index
# into these tuples.
reads_per_umi_hist_filtered = np.histogram(reads_per_umi_filtered, bins=100)
umis_per_cellBC_hist_filtered = np.histogram(umis_per_cellBC_filtered, bins=50)
umis_per_intBC_hist_filtered = np.histogram(umis_per_intBC_filtered, bins=50)

In [27]:
# Bar chart of the filtered reads-per-UMI histogram, with log-scaled counts.
read_counts_filtered, read_bin_edges_filtered = reads_per_umi_hist_filtered

# plt.subplots returns a (figure, axes) pair; unpack it so `fig` is the Figure.
fig, ax = plt.subplots(dpi=100)

# Draw every bar from its left bin edge using the bin's real width. The default
# (width 0.8, centred on x) does not line up with the histogram bins.
ax.bar(read_bin_edges_filtered[:-1], read_counts_filtered,
       width=np.diff(read_bin_edges_filtered), align='edge')
ax.set_title(f'{name} - Filtered - Reads per UMI')
ax.set_ylabel('UMI Count')
ax.set_xlabel('Number of Reads')
ax.set_yscale('log')

# os.path.join avoids the doubled '/' that output_dir + '/...' produced; the
# file is still written directly into output_dir.
# NOTE(review): the unfiltered QC figures are saved under QC/ - confirm that
# writing the filtered ones to output_dir itself is intended.
fig.savefig(os.path.join(output_dir, 'QC_reads_per_UMI_filtered.png'))
plt.close(fig)

In [28]:
# Rank plot for the filtered table: UMI counts per cellBC sorted from highest
# to lowest, on log-log axes.
# plt.subplots returns a (figure, axes) pair; unpack it so `fig` is the Figure.
fig, ax = plt.subplots(dpi=100)

# Ranks start at 1: a rank of 0 has no position on a log-scaled x axis, so the
# highest-count cellBC would otherwise be lost from the curve.
x_values = np.arange(1, len(umis_per_cellBC_filtered) + 1)

ax.plot(x_values, np.sort(umis_per_cellBC_filtered)[::-1], '-')
ax.set_title(f'{name} - Filtered - UMI per CellBC')
ax.set_ylabel('Number of UMI')
ax.set_xlabel('Rank Order')
ax.set_xscale('log')
ax.set_yscale('log')

# os.path.join avoids the doubled '/' that output_dir + '/...' produced; the
# file is still written directly into output_dir.
# NOTE(review): the unfiltered QC figures are saved under QC/ - confirm that
# writing the filtered ones to output_dir itself is intended.
fig.savefig(os.path.join(output_dir, 'QC_UMI_per_cellBC_filtered.png'))
plt.close(fig)

In [29]:
# Bar chart of the filtered UMIs-per-intBC histogram, with log-scaled counts.
intBC_counts_filtered, intBC_bin_edges_filtered = umis_per_intBC_hist_filtered

# plt.subplots returns a (figure, axes) pair; unpack it so `fig` is the Figure.
fig, ax = plt.subplots(dpi=100)

# Use the actual bin widths and left-edge alignment instead of a fixed width of
# 10, which only matches the bins when the data range happens to be 500 wide.
ax.bar(intBC_bin_edges_filtered[:-1], intBC_counts_filtered,
       width=np.diff(intBC_bin_edges_filtered), align='edge')
ax.set_title(f'{name} - Filtered - UMI per intBC')
ax.set_ylabel('intBC Count')
ax.set_xlabel('Number of UMIs')
ax.set_yscale('log')

# os.path.join avoids the doubled '/' that output_dir + '/...' produced; the
# file is still written directly into output_dir.
# NOTE(review): the unfiltered QC figures are saved under QC/ - confirm that
# writing the filtered ones to output_dir itself is intended.
fig.savefig(os.path.join(output_dir, 'QC_UMI_per_intBC_filtered.png'))
plt.close(fig)

In [25]:
# Long-form table with one row per (cellBC, intBC) pair of the filtered UMI
# table and a 'count' column holding the number of rows in that pair.
umis_per_intBC_df_filtered = (
    umi_table_filtered
    .groupby(["cellBC", "intBC"])
    .size()
    .reset_index(name='count')
)

In [30]:
# Violin plot of the per-cell UMI count distribution for the filtered table,
# one violin per intBC.
fig, ax = plt.subplots(figsize=(10, 4), dpi=300)

# scale='count' sizes each violin by its number of observations; cut=0 stops
# the density at the observed min/max instead of extending past the data.
sns.violinplot(
    data=umis_per_intBC_df_filtered,
    x='intBC',
    y='count',
    scale='count',
    cut=0,
    color='skyblue',
    ax=ax,
)

ax.set_title('PZ-2594_Tracer_Outgrowth_1 - filtered - UMIs per intBC')
# Turn the intBC tick labels vertical so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_ylabel('UMI per intBC Count')

# Saved directly into output_dir with a tight bounding box, then released.
fig.savefig(output_dir + '/QC_UMI_per_Target_Site_filtered.png', bbox_inches='tight')
plt.close(fig)