# Info
In this script I will use the MULTI-seq information to combine the allele tables and add metadata, in preparation for the tree reconstruction steps.

I will do the following steps:
- combine the allele tables for the 3 datasets (changing the cellBC ID to incorporate the dataset name)
- add the multiseq metadata
- add timepoint metadata
- Look at QC for the multiseq / timepoints

This will allow me to split the merged allele table into the following subsets when doing lineage reconstruction. For each barcode I will reconstruct the following trees:
- 120h only
- 144h only
- both

Across 6 barcodes, this will result in 18 trees to look at

In [27]:
import sys
import os
import pandas as pd
import numpy as np
import cassiopeia as cas
import matplotlib.pyplot as plt
import scipy
import pickle

In [2]:
# Directory that receives every file written by this notebook (plot, merged
# allele table, pickles). The trailing slash is required: later cells build
# paths with plain string concatenation (output_dir + 'file_name').
output_dir = '/Genomics/chanlab/blaw/TLS/data/explant/lineage/2_add_metadata/'

In [3]:
# Columns kept from each filtered allele table; identical for all three samples.
allele_table_columns = ['cellBC', 'intBC', 'r1', 'r2', 'r3', 'allele', 'lineageGrp', 'readCount', 'UMI']

# Filtered allele tables produced by the preprocessing step, one per sample.
explant_allele_table = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/data/explant/lineage/1_preprocessing/PZ-2594_tracer_explant/allele_table_filtered.txt',
    sep='\t',
    usecols=allele_table_columns,
)
outgrowth_1_allele_table = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/data/explant/lineage/1_preprocessing/PZ-2594_tracer_outgrowth_1/allele_table_filtered.txt',
    sep='\t',
    usecols=allele_table_columns,
)
outgrowth_2_allele_table = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/data/explant/lineage/1_preprocessing/PZ-2594_tracer_outgrowth_2/allele_table_filtered.txt',
    sep='\t',
    usecols=allele_table_columns,
)

# MULTI-seq barcode tables, one per sample. They are joined onto the allele
# tables by 'cellBC' further down and provide the 'final.calls.rescued' column.
explant_multiSeq_BC = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/metadata/AM-DNA-341_MultiSeqBC.tsv',
    sep='\t',
)
outgrowth_1_multiSeq_BC = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/metadata/AM-DNA-342_MultiSeqBC.tsv',
    sep='\t',
)
outgrowth_2_multiSeq_BC = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/metadata/AM-DNA-343_MultiSeqBC.tsv',
    sep='\t',
)

In [4]:
# Build one cell-state table covering all three samples of the explant
# experiment and write it next to the per-sample inputs.
explant_cell_state = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/metadata/AM-RNA-929_cellBC_cellState.tsv', sep='\t'
)
outgrowth_1_cell_state = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/metadata/AM-RNA-930_cellBC_cellState.tsv', sep='\t'
)
outgrowth_2_cell_state = pd.read_csv(
    '/Genomics/chanlab/blaw/TLS/metadata/AM-RNA-931_cellBC_cellState.tsv', sep='\t'
)

# Explant sample: drop the last two characters of every cellBC and prepend
# the dataset name.
explant_cell_state['timepoint'] = '120'
explant_cell_state['cellBC'] = [
    'Tracer_Explant_' + barcode[:-2] for barcode in explant_cell_state['cellBC']
]

# Outgrowth samples: only the last two characters are dropped, no prefix is added.
# NOTE(review): the allele tables later get 'Tracer_Outgrowth_1_' /
# 'Tracer_Outgrowth_2_' prefixes and 'Timepoint' values '120h' / '144h';
# confirm that the unprefixed outgrowth cellBCs and the '120' / '144' labels
# used here are what downstream consumers of this table expect.
for outgrowth_state in (outgrowth_1_cell_state, outgrowth_2_cell_state):
    outgrowth_state['timepoint'] = '144'
    outgrowth_state['cellBC'] = [barcode[:-2] for barcode in outgrowth_state['cellBC']]

per_sample_cell_states = [explant_cell_state, outgrowth_1_cell_state, outgrowth_2_cell_state]
total_cell_state = pd.concat(per_sample_cell_states)

# NOTE(review): the '.tst' extension looks like a typo for '.tsv'; it is kept
# because other scripts may already read this exact file name.
total_cell_state.to_csv(
    '/Genomics/chanlab/blaw/TLS/metadata/TLS_Explant_Total_cellBC_cellState.tst',
    sep='\t',
    index=False,
)

# Merge the allele tables with the barcode tables

In [5]:
# Attach the MULTI-seq columns to every allele-table row of the same cell.
# No `how` is passed, so pandas' default inner join applies: rows whose cellBC
# is absent from the MULTI-seq table are dropped.
explant_allele_table = explant_allele_table.merge(explant_multiSeq_BC, on='cellBC')
outgrowth_1_allele_table = outgrowth_1_allele_table.merge(outgrowth_1_multiSeq_BC, on='cellBC')
outgrowth_2_allele_table = outgrowth_2_allele_table.merge(outgrowth_2_multiSeq_BC, on='cellBC')

In [9]:
# Record counts of edited and unedited cells per experiment

# Allele string of a row in which none of the three target sites carries an edit.
UNEDITED_ALLELE = '[None][None][None]'


def count_edited_cells(allele_table, multiseq_call):
    """Count edited and unedited cells for one MULTI-seq call.

    A cell is "edited" when at least one of its allele-table rows has an
    allele other than '[None][None][None]'; otherwise it is "unedited".

    Parameters
    ----------
    allele_table : pandas.DataFrame
        Allele table with 'cellBC', 'allele' and 'final.calls.rescued' columns.
    multiseq_call : str
        Value of 'final.calls.rescued' to select (e.g. 'Bar1', 'Doublet').

    Returns
    -------
    tuple of (int, int)
        Number of edited cells and number of unedited cells.
    """
    selected = allele_table[allele_table['final.calls.rescued'] == multiseq_call]
    # One pass over the rows: flag edited rows, then reduce per cell.
    # This replaces re-filtering the whole table once for every cell.
    cell_is_edited = (selected['allele'] != UNEDITED_ALLELE).groupby(selected['cellBC']).any()
    n_edited = int(cell_is_edited.sum())
    n_unedited = int(len(cell_is_edited)) - n_edited
    return n_edited, n_unedited


multiseq_bar_counts_df = pd.DataFrame(
    index=['explant_edited', 'explant_unedited', 'out_1_edited', 'out_1_unedited', 'out_2_edited', 'out_2_unedited'],
    columns=['Bar1', 'Bar2', 'Bar3', 'Bar4', 'Bar5', 'Bar6', 'Doublet', 'Negative'],
)

# (row-label prefix, allele table) for the explant (120h) and the two
# outgrowth samples (144h).
samples = [
    ('explant', explant_allele_table),
    ('out_1', outgrowth_1_allele_table),
    ('out_2', outgrowth_2_allele_table),
]

for sample_label, sample_table in samples:
    for multiseq_call in multiseq_bar_counts_df.columns:
        n_edited, n_unedited = count_edited_cells(sample_table, multiseq_call)
        multiseq_bar_counts_df.loc[sample_label + '_edited', multiseq_call] = n_edited
        multiseq_bar_counts_df.loc[sample_label + '_unedited', multiseq_call] = n_unedited

In [12]:
# Grouped bar chart: one group per MULTI-seq call (the table is transposed so
# the calls end up on the x axis), one bar per sample/edit-status row.
counts_ax = multiseq_bar_counts_df.T.plot.bar()
counts_ax.legend(bbox_to_anchor=(1, 1.05))
counts_ax.set_title('Tree Cell Counts per Barcode')
counts_ax.set_ylabel('# of Cells')

counts_fig = counts_ax.figure
counts_fig.tight_layout()
counts_fig.savefig(output_dir + 'cell_counts_per_barcode.pdf', dpi=300)
# The figure is only written to disk; close it so it is not displayed or kept
# in memory.
plt.close(counts_fig)

# Change the cellBCs for each dataset so that they can be merged

In [13]:
# Prepend the dataset name to every cellBC so barcodes from different samples
# stay distinct once the three tables are concatenated.
for sample_table, dataset_prefix in (
    (explant_allele_table, 'Tracer_Explant_'),
    (outgrowth_1_allele_table, 'Tracer_Outgrowth_1_'),
    (outgrowth_2_allele_table, 'Tracer_Outgrowth_2_'),
):
    sample_table['cellBC'] = [dataset_prefix + barcode for barcode in sample_table['cellBC']]

In [14]:
# Label every row with its sample's timepoint: the explant is 120h, both
# outgrowth samples are 144h.
for timepoint_label, timepoint_tables in (
    ('120h', (explant_allele_table,)),
    ('144h', (outgrowth_1_allele_table, outgrowth_2_allele_table)),
):
    for sample_table in timepoint_tables:
        sample_table['Timepoint'] = timepoint_label

In [15]:
# Record which sample each row came from.
for sample_table, sample_name in (
    (explant_allele_table, 'explant'),
    (outgrowth_1_allele_table, 'outgrowth_1'),
    (outgrowth_2_allele_table, 'outgrowth_2'),
):
    sample_table['orig.ident'] = sample_name

In [17]:
# Stack the three annotated allele tables into one.
# The per-sample row labels are kept as they are, so index values repeat
# across samples.
per_sample_allele_tables = [explant_allele_table, outgrowth_1_allele_table, outgrowth_2_allele_table]
merged_allele_table = pd.concat(per_sample_allele_tables)

# Duplicate the MULTI-seq call under the name 'finalCalls'.
merged_allele_table = merged_allele_table.assign(
    finalCalls=merged_allele_table['final.calls.rescued']
)

merged_table_path = output_dir + 'merged_allele_table_with_metadata.txt'
merged_allele_table.to_csv(merged_table_path, sep='\t')

In [18]:
# Reload the merged table from disk. The first column of the file is the row
# index that was written together with the data.
merged_table_file = output_dir + 'merged_allele_table_with_metadata.txt'
merged_allele_table = pd.read_csv(merged_table_file, sep='\t', index_col=0)

# Count the number of unique indels and alleles per experiment

In [20]:
# Allele table used for the indel statistics below.
# NOTE(review): this file lives under 3_lineage_reconstruction, so it is
# presumably produced by a later pipeline step than this notebook - confirm
# that step has been run before executing the cells below.
filtered_table_file = '/Genomics/chanlab/blaw/TLS/data/explant/lineage/3_lineage_reconstruction/allele_table_filtered.txt'
tableFiltered = pd.read_csv(filtered_table_file, sep='\t', index_col=0)

In [21]:
# Per-group indel statistics, keyed by '<barcode>_explant' or '<barcode>_outgrowth':
#   indels[name]        -> list with one entry per cell: how many of the cell's
#                          r1/r2/r3 entries (over all of its rows) are not '[None]'
#   unique_indels[name] -> set of every distinct value seen in r1/r2/r3
#                          (the '[None]' placeholder itself is included)
indels = {}
unique_indels = {}

target_site_columns = ['r1', 'r2', 'r3']

for barcode in ['Bar1', 'Bar2', 'Bar3', 'Bar4', 'Bar5', 'Bar6']:
    for sample_id in tableFiltered['orig.ident'].unique():
        # Every non-explant sample (outgrowth_1, outgrowth_2) shares one key.
        if sample_id == 'explant':
            name = '{}_{}'.format(barcode, sample_id)
        else:
            name = '{}_outgrowth'.format(barcode)

        temp = tableFiltered[
            (tableFiltered['finalCalls'] == barcode) & (tableFiltered['orig.ident'] == sample_id)
        ]

        # setdefault instead of plain assignment: several samples map to the
        # same '<barcode>_outgrowth' key, and re-initialising here made each
        # outgrowth sample wipe the results of the one processed before it.
        cell_counts = indels.setdefault(name, [])
        seen_indels = unique_indels.setdefault(name, set())

        for col in target_site_columns:
            seen_indels.update(temp[col].unique())

        # Count edited sites per row, then sum per cell in one grouped pass
        # (sort=False keeps cells in order of first appearance) rather than
        # re-filtering the table once for every cell.
        edited_sites_per_row = (temp[target_site_columns] != '[None]').sum(axis=1)
        edited_sites_per_cell = edited_sites_per_row.groupby(temp['cellBC'], sort=False).sum()
        cell_counts.extend(edited_sites_per_cell.tolist())

In [28]:
# Persist both indel summaries as pickles in the output directory.
for pickle_name, payload in (
    ('indels_per_barcode.pickle', indels),
    ('unique_indels_per_barcode.pickle', unique_indels),
):
    with open(output_dir + pickle_name, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)