# Find best lineage barcode

The goal here is to generate a barcode data frame with columns "cell.id", "barcode.count", "barcode", and "site1" through "site10", where cell.id corresponds to the 10X barcode, barcode.count is the number of unique lineage barcodes found in the cell, barcode is the full 10-site string of the edited barcode, and site1-site10 is the barcode separated into its 10 sites. The key here is that we only want to accept the top-ranked barcodes, i.e. those whose proportion lies above a noise threshold. This threshold is determined by what we see in the unedited data. We can pull ranked barcodes from the .allReadCounts file associated with the sample. An explanation of each step in this process is included only in the first example. 

A cleaner version of this script with user friendly functions will come later. 

### inj_heat1

In [1]:
from collections import Counter

In [2]:
# Root folder of the 10X-GESTALT output. Sample paths are built by plain string
# concatenation (e.g. data_folder + "inj_heat1/inj_heat1.stats"), so the trailing
# slash is required.
data_folder = "./data/10X_GESTALT_OUTPUT/"

#### Take the first twelve barcodes. 
##### 12 was picked because .007 is the proportion of the top ranked edited barcode in the unedited control. The fourteenth barcode has a proportion below this noise threshold. 

In [4]:
# Load the ranked lineage barcodes for inj_heat1 and keep only the first twelve.
# The barcode is the first tab-separated field of every row after the header line;
# rows are kept in file order, so the slice below takes the twelve top rows.
with open(data_folder + "inj_heat1/inj_heat1.allReadCounts", "r") as barcodes_inj_heat1:
    # skip the header line
    barcodes_inj_heat1.readline()
    barcode_list_inj_heat1 = [row.split("\t")[0] for row in barcodes_inj_heat1]
barcode_list_inj_heat1 = barcode_list_inj_heat1[:12]
print(len(barcode_list_inj_heat1))
print(barcode_list_inj_heat1)

12
['84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE', '3I+45+ATC_1D+73_5D+93&2I+101+AA_11I+125+TGTTTTTCTTT_1D+151_3I+180+TTC_10I+204+TTTTTAGATG&13D+210_6D+232_1D+259&1D+262_22D+268', '84D+42_84D+42_84D+42_84D+42_NONE_NONE_NONE_NONE_NONE_NONE', '1I+45+C_15D+66_NONE_9D+123_NONE_NONE_NONE_NONE_NONE_NONE', '3I+45+ATC_1D+73_30D+76_28D+114_3D+150&1I+155+A_20D+157_41D+206_41D+206_44D+257_44D+257', '1I+45+C_NONE_NONE_1D+126_NONE_NONE_NONE_NONE_NONE_NONE', '1I+45+C_26D+73_26D+73_NONE_NONE_NONE_NONE_NONE_NONE_NONE', '3I+45+ATC_1D+73_30D+76_28D+114_3D+150&1I+155+A_20D+157_23D+206_66D+234_66D+234_66D+234', '3I+45+ATC_1D+73_30D+76_28D+114_3D+150&1I+155+A_20D+157_23D+206_66D+233_66D+233_66D+233', '3I+45+ATC_1D+73_30D+76_28D+114_3D+150&1I+155+A_20D+157_41D+206_41D+206_46D+255_46D+255', '5D+42_16D+68_9D+91_NONE_13D+152_1D+179_3I+204+CAG_5I+231+GATTA&6I+236+TGATTA_1I+261+A_NONE', '46D+38_46D+38_7D+90_36D+105_2I+155+GA_1D+182_10I+204+CTAACGCACC&13D+208_5D+227&1I+235+A_3D+263_1D+287&5D+290']


#### Pull the 10X barcode, umi, and lineage barcode (site by site) from a .stats output file from a 10X-GESTALT run. Let's write that information to a new file and read it back in. 

In [5]:
# Pull the cell id, UMI, PASS flag and the ten barcode sites out of every read in the
# .stats file and write the reads that did not FAIL to a tab-separated table.
header = ['cell.id', 'umi', 'PASS'] + ['site' + str(i) for i in range(1, 11)]
with open('data/tables/inj_heat1_ID_umi_barcode.tsv', 'w') as output:
    # write the header for the table; no trailing tab, so the header has the same
    # 13 columns as every data row written below
    output.write('\t'.join(header) + '\n')
    with open(data_folder + "inj_heat1/inj_heat1.stats", "r") as stats:
        # skip the header line of the stats file
        stats.readline()
        # for each line in the stats file after the header
        for line in stats:
            # split by underscore
            read = line.split("_")
            # the first 16 characters of the second piece are the cell_id
            cell_id = read[1][:16]
            # and the next 10 characters are the umi
            umi = read[1][16:26]
            # the seventh underscore-separated piece (index 6) holds the
            # tab-separated fields with the PASS flag and the per-site calls
            barcode = read[6].split("\t")
            # the second of those fields is whether the read was a pass or fail
            PASS = barcode[1]
            # fields 22..31 are site1..site10 of the lineage barcode
            sites = [barcode[i] for i in range(22, 32)]
            # only reads that passed in the stats file are written
            if PASS != 'FAIL':
                output.write('\t'.join([cell_id, umi, PASS] + sites) + '\n')

In [6]:
# make a dictionary to hold barcode information for each cell
# Make a dictionary mapping each cell (10X barcode) to the list of lineage-barcode
# reads seen in that cell, keeping only reads whose barcode is one of the top
# twelve barcodes stored in barcode_list_inj_heat1.
top_barcodes = set(barcode_list_inj_heat1)  # built once for fast exact-match lookup
inj_heat1_dict = {}
with open("data/tables/inj_heat1_ID_umi_barcode.tsv", "r") as stats_brief:
    # skip the header line of the table written previously
    stats_brief.readline()
    for line in stats_brief:
        # split the line by tab character
        line = line.split("\t")
        # the first field is the cell (10X barcode)
        cell = line[0]
        # fields 3..12 are site1..site10; join them with underscores and drop the
        # newline that trails the last site
        barcode = '_'.join(line[3:13]).strip("\n")
        if barcode in top_barcodes:
            # start a new list for an unseen cell, otherwise extend the existing one
            inj_heat1_dict.setdefault(cell, []).append(barcode)

In [7]:
# Show every lineage-barcode read stored for one example cell, one per line.
print("These are all the barcode reads associated with cell 'ATGTCCCCATTCACAG':")
example_reads = inj_heat1_dict['ATGTCCCCATTCACAG']
for read in example_reads:
    print(read)

These are all the barcode reads associated with cell 'ATGTCCCCATTCACAG':
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE
84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE

In [8]:
# For every cell, append two summary values to the end of its list of barcode reads:
#   [-2] the number of unique lineage barcodes seen in the cell
#   [-1] the most abundant lineage barcode in the cell
for cell, reads in inj_heat1_dict.items():
    # tally the reads before anything is appended, so the integer count never
    # takes part in the "most abundant" comparison
    data = Counter(reads)
    count = len(data)
    # on a tie, max() keeps the barcode that appears first in the list of reads
    abundant = max(reads, key=data.get)
    # add count to the end of the cell list
    reads.append(count)
    # repeat most abundant barcode at the end of the cell list
    reads.append(abundant)

Now if we look at the list associated with inj_heat1_dict['ATGTCCCCATTCACAG'], it will have two more values. The second to last is a number. This is the number of unique barcodes associated with that cell. The last value is the barcode that was most abundant in that cell. 

In [9]:
# Peek at the last two entries of this cell's list: the unique-barcode count
# followed by the most abundant barcode.
inj_heat1_dict['ATGTCCCCATTCACAG'][-2:]

[2, '84D+45_84D+45_84D+45_84D+45_NONE_NONE_NONE_NONE_NONE_NONE']

#### Now write output files for merging with Seurat object

In [10]:
# Write one row per cell: the cell id, the number of unique barcodes in the cell, the
# most abundant ("top") barcode, and that same barcode split into its ten sites.
header = ['cell.id', 'barcode.count', 'top.barcode'] + ['site' + str(i) for i in range(1, 11)]
with open('data/tables/inj_heat1_bestlineagebarcode.tsv', 'w') as output:
    # write the header for this file
    output.write('\t'.join(header) + '\n')
    # for each cell and barcode list in the dictionary
    for cell, barcode in inj_heat1_dict.items():
        # barcode[-2] is the unique-barcode count, barcode[-1] the most abundant barcode
        top_barcode = barcode[-1]
        # save the top barcode as sites
        sites = top_barcode.split("_")
        # the top.barcode column and the site columns both come from the most
        # abundant barcode, so they always describe the same barcode
        output.write('\t'.join([cell, str(barcode[-2]), top_barcode] + sites[:10]) + '\n')

In [11]:
# Instead of just the most abundant lineage barcode (top barcode or "best" barcode),
# write all barcode reads for each cell to a file. The first row of a cell carries the
# cell id and the unique-barcode count; the following rows of that cell leave those
# two columns as a single space.
# NOTE(review): every read is written, so a barcode seen in several reads appears on
# several rows - confirm whether only unique barcodes were intended.
header = ['cell.id', 'barcode.count', 'barcodes'] + ['site' + str(i) for i in range(1, 11)]
with open('data/tables/inj_heat1_all_lineagebarcodes_filtered.tsv', 'w') as output:
    output.write('\t'.join(header) + '\n')
    for cell, barcode in inj_heat1_dict.items():
        # the last two entries are the unique count and the top barcode, not reads
        for i, read in enumerate(barcode[:-2]):
            sites = read.split("_")
            lead = [cell, str(barcode[-2])] if i == 0 else [' ', ' ']
            output.write('\t'.join(lead + [str(read)] + sites[:10]) + '\n')

### inj_heat2

In [12]:
# Load the ranked lineage barcodes for inj_heat2 and keep only the first thirteen.
# The barcode is the first tab-separated field of every row after the header line.
with open(data_folder + "inj_heat2/inj_heat2.allReadCounts", "r") as barcodes_inj_heat2:
    # skip the header line
    barcodes_inj_heat2.readline()
    barcode_list_inj_heat2 = [row.split("\t")[0] for row in barcodes_inj_heat2]
barcode_list_inj_heat2 = barcode_list_inj_heat2[:13]

# Pull the cell id, UMI, PASS flag and the ten barcode sites out of every read in the
# .stats file and write the reads that did not FAIL to a tab-separated table.
header = ['cell.id', 'umi', 'PASS'] + ['site' + str(i) for i in range(1, 11)]
with open('data/tables/inj_heat2_ID_umi_barcode.tsv', 'w') as output:
    # no trailing tab, so the header has the same 13 columns as every data row
    output.write('\t'.join(header) + '\n')
    with open(data_folder + "inj_heat2/inj_heat2.stats", "r") as stats:
        # skip the header line of the stats file
        stats.readline()
        for line in stats:
            read = line.split("_")
            # first 16 characters of the second piece: cell id; next 10: umi
            cell_id = read[1][:16]
            umi = read[1][16:26]
            # the seventh underscore-separated piece (index 6) holds the
            # tab-separated fields with the PASS flag and the per-site calls
            barcode = read[6].split("\t")
            PASS = barcode[1]
            # fields 22..31 are site1..site10 of the lineage barcode
            sites = [barcode[i] for i in range(22, 32)]
            if PASS != 'FAIL':
                output.write('\t'.join([cell_id, umi, PASS] + sites) + '\n')

# Map each cell (10X barcode) to the list of lineage-barcode reads seen in that cell,
# keeping only reads whose barcode is one of the top thirteen barcodes stored in
# barcode_list_inj_heat2.
top_barcodes = set(barcode_list_inj_heat2)  # built once for fast exact-match lookup
inj_heat2_dict = {}
with open("data/tables/inj_heat2_ID_umi_barcode.tsv", "r") as stats_brief:
    # skip the header line
    stats_brief.readline()
    for line in stats_brief:
        line = line.split("\t")
        cell = line[0]
        # fields 3..12 are site1..site10; join them with underscores and drop the
        # newline that trails the last site
        barcode = '_'.join(line[3:13]).strip("\n")
        if barcode in top_barcodes:
            # start a new list for an unseen cell, otherwise extend the existing one
            inj_heat2_dict.setdefault(cell, []).append(barcode)

# For every cell, append two summary values to the end of its list of barcode reads:
#   [-2] the number of unique lineage barcodes seen in the cell
#   [-1] the most abundant lineage barcode in the cell
for cell, reads in inj_heat2_dict.items():
    # tally the reads before anything is appended, so the integer count never
    # takes part in the "most abundant" comparison
    data = Counter(reads)
    count = len(data)
    # on a tie, max() keeps the barcode that appears first in the list of reads
    abundant = max(reads, key=data.get)
    # add count to the end of the cell list
    reads.append(count)
    # repeat most abundant barcode at the end of the cell list
    reads.append(abundant)

# Write one row per cell: the cell id, the number of unique barcodes in the cell, the
# most abundant ("top") barcode, and that same barcode split into its ten sites.
header = ['cell.id', 'barcode.count', 'top.barcode'] + ['site' + str(i) for i in range(1, 11)]
with open('data/tables/inj_heat2_bestlineagebarcode.tsv', 'w') as output:
    output.write('\t'.join(header) + '\n')
    for cell, barcode in inj_heat2_dict.items():
        # barcode[-2] is the unique-barcode count, barcode[-1] the most abundant barcode
        top_barcode = barcode[-1]
        sites = top_barcode.split("_")
        # the top.barcode column and the site columns both come from the most
        # abundant barcode, so they always describe the same barcode
        output.write('\t'.join([cell, str(barcode[-2]), top_barcode] + sites[:10]) + '\n')

# Write all barcode reads kept for each cell. The first row of a cell carries the cell
# id and the unique-barcode count; the following rows of that cell leave those two
# columns as a single space.
# NOTE(review): every read is written, so a barcode seen in several reads appears on
# several rows - confirm whether only unique barcodes were intended.
header = ['cell.id', 'barcode.count', 'barcodes'] + ['site' + str(i) for i in range(1, 11)]
with open('data/tables/inj_heat2_all_lineagebarcodes_filtered.tsv', 'w') as output:
    output.write('\t'.join(header) + '\n')
    for cell, barcode in inj_heat2_dict.items():
        # the last two entries are the unique count and the top barcode, not reads
        for i, read in enumerate(barcode[:-2]):
            sites = read.split("_")
            lead = [cell, str(barcode[-2])] if i == 0 else [' ', ' ']
            output.write('\t'.join(lead + [str(read)] + sites[:10]) + '\n')

###  unedited1

In [13]:
# Load the ranked lineage barcodes for unedited1. The barcode is the first
# tab-separated field of every row after the header line.
with open(data_folder + "unedited1/unedited1.allReadCounts", "r") as barcodes_unedited1:
    # skip the header line
    barcodes_unedited1.readline()
    barcode_list_unedited1 = [row.split("\t")[0] for row in barcodes_unedited1]
# NOTE: here we're just taking the top barcode since these are control samples
# anything other than the unedited barcode is noise
# if the top ranked barcode in your unedited control is not the unedited barcode, you should not trust this control
# After this line the name holds a single barcode string, not a list.
barcode_list_unedited1 = barcode_list_unedited1[0]
# Pull the cell id, UMI, PASS flag and the ten barcode sites out of every read in the
# .stats file and write the reads that did not FAIL to a tab-separated table.
header = ['cell.id', 'umi', 'PASS'] + ['site' + str(i) for i in range(1, 11)]
with open('data/tables/unedited1_ID_umi_barcode.tsv', 'w') as output:
    # no trailing tab, so the header has the same 13 columns as every data row
    output.write('\t'.join(header) + '\n')
    with open(data_folder + "unedited1/unedited1.stats", "r") as stats:
        # skip the header line of the stats file
        stats.readline()
        for line in stats:
            read = line.split("_")
            # first 16 characters of the second piece: cell id; next 10: umi
            cell_id = read[1][:16]
            umi = read[1][16:26]
            # the seventh underscore-separated piece (index 6) holds the
            # tab-separated fields with the PASS flag and the per-site calls
            barcode = read[6].split("\t")
            PASS = barcode[1]
            # fields 22..31 are site1..site10 of the lineage barcode
            sites = [barcode[i] for i in range(22, 32)]
            if PASS != 'FAIL':
                output.write('\t'.join([cell_id, umi, PASS] + sites) + '\n')

# Map each cell (10X barcode) to the list of lineage-barcode reads seen in that cell,
# keeping only reads that equal the top-ranked barcode of this control sample.
# barcode_list_unedited1 may be a bare string (a single barcode); `in` on a string is
# a substring test, so normalise to a set to make the comparison an exact match.
if isinstance(barcode_list_unedited1, str):
    accepted_barcodes = {barcode_list_unedited1}
else:
    accepted_barcodes = set(barcode_list_unedited1)
unedited1_dict = {}
with open("data/tables/unedited1_ID_umi_barcode.tsv", "r") as stats_brief:
    # skip the header line
    stats_brief.readline()
    for line in stats_brief:
        line = line.split("\t")
        cell = line[0]
        # fields 3..12 are site1..site10; join them with underscores and drop the
        # newline that trails the last site
        barcode = '_'.join(line[3:13]).strip("\n")
        if barcode in accepted_barcodes:
            # start a new list for an unseen cell, otherwise extend the existing one
            unedited1_dict.setdefault(cell, []).append(barcode)

# For every cell, append two summary values to the end of its list of barcode reads:
#   [-2] the number of unique lineage barcodes seen in the cell
#   [-1] the most abundant lineage barcode in the cell
for cell, reads in unedited1_dict.items():
    # tally the reads before anything is appended, so the integer count never
    # takes part in the "most abundant" comparison
    data = Counter(reads)
    count = len(data)
    # on a tie, max() keeps the barcode that appears first in the list of reads
    abundant = max(reads, key=data.get)
    # add count to the end of the cell list
    reads.append(count)
    # repeat most abundant barcode at the end of the cell list
    reads.append(abundant)

# Write one row per cell: the cell id, the number of unique barcodes in the cell, the
# most abundant ("top") barcode, and that same barcode split into its ten sites.
header = ['cell.id', 'barcode.count', 'top.barcode'] + ['site' + str(i) for i in range(1, 11)]
with open('data/tables/unedited1_bestlineagebarcode.tsv', 'w') as output:
    output.write('\t'.join(header) + '\n')
    for cell, barcode in unedited1_dict.items():
        # barcode[-2] is the unique-barcode count, barcode[-1] the most abundant barcode
        top_barcode = barcode[-1]
        sites = top_barcode.split("_")
        # the top.barcode column and the site columns both come from the most
        # abundant barcode, so they always describe the same barcode
        output.write('\t'.join([cell, str(barcode[-2]), top_barcode] + sites[:10]) + '\n')

# Write all barcode reads kept for each cell. The first row of a cell carries the cell
# id and the unique-barcode count; the following rows of that cell leave those two
# columns as a single space.
# NOTE(review): every read is written, so a barcode seen in several reads appears on
# several rows - confirm whether only unique barcodes were intended.
header = ['cell.id', 'barcode.count', 'barcodes'] + ['site' + str(i) for i in range(1, 11)]
with open('data/tables/unedited1_all_lineagebarcodes_filtered.tsv', 'w') as output:
    output.write('\t'.join(header) + '\n')
    for cell, barcode in unedited1_dict.items():
        # the last two entries are the unique count and the top barcode, not reads
        for i, read in enumerate(barcode[:-2]):
            sites = read.split("_")
            lead = [cell, str(barcode[-2])] if i == 0 else [' ', ' ']
            output.write('\t'.join(lead + [str(read)] + sites[:10]) + '\n')

### unedited2

In [14]:
# Load the ranked lineage barcodes for unedited2. The barcode is the first
# tab-separated field of every row after the header line.
with open(data_folder + "unedited2/unedited2.allReadCounts", "r") as barcodes_unedited2:
    # skip the header line
    barcodes_unedited2.readline()
    barcode_list_unedited2 = [row.split("\t")[0] for row in barcodes_unedited2]
# notice here we're just taking the top barcode since these are control samples
# After this line the name holds a single barcode string, not a list.
barcode_list_unedited2 = barcode_list_unedited2[0]
# Pull the cell id, UMI, PASS flag and the ten barcode sites out of every read in the
# .stats file and write the reads that did not FAIL to a tab-separated table.
header = ['cell.id', 'umi', 'PASS'] + ['site' + str(i) for i in range(1, 11)]
with open('data/tables/unedited2_ID_umi_barcode.tsv', 'w') as output:
    # no trailing tab, so the header has the same 13 columns as every data row
    output.write('\t'.join(header) + '\n')
    with open(data_folder + "unedited2/unedited2.stats", "r") as stats:
        # skip the header line of the stats file
        stats.readline()
        for line in stats:
            read = line.split("_")
            # first 16 characters of the second piece: cell id; next 10: umi
            cell_id = read[1][:16]
            umi = read[1][16:26]
            # the seventh underscore-separated piece (index 6) holds the
            # tab-separated fields with the PASS flag and the per-site calls
            barcode = read[6].split("\t")
            PASS = barcode[1]
            # fields 22..31 are site1..site10 of the lineage barcode
            sites = [barcode[i] for i in range(22, 32)]
            if PASS != 'FAIL':
                output.write('\t'.join([cell_id, umi, PASS] + sites) + '\n')

# Map each cell (10X barcode) to the list of lineage-barcode reads seen in that cell,
# keeping only reads that equal the top-ranked barcode of this control sample.
# barcode_list_unedited2 may be a bare string (a single barcode); `in` on a string is
# a substring test, so normalise to a set to make the comparison an exact match.
if isinstance(barcode_list_unedited2, str):
    accepted_barcodes = {barcode_list_unedited2}
else:
    accepted_barcodes = set(barcode_list_unedited2)
unedited2_dict = {}
with open("data/tables/unedited2_ID_umi_barcode.tsv", "r") as stats_brief:
    # skip the header line
    stats_brief.readline()
    for line in stats_brief:
        line = line.split("\t")
        cell = line[0]
        # fields 3..12 are site1..site10; join them with underscores and drop the
        # newline that trails the last site
        barcode = '_'.join(line[3:13]).strip("\n")
        if barcode in accepted_barcodes:
            # start a new list for an unseen cell, otherwise extend the existing one
            unedited2_dict.setdefault(cell, []).append(barcode)

# For every cell, append two summary values to the end of its list of barcode reads:
#   [-2] the number of unique lineage barcodes seen in the cell
#   [-1] the most abundant lineage barcode in the cell
for cell, reads in unedited2_dict.items():
    # tally the reads before anything is appended, so the integer count never
    # takes part in the "most abundant" comparison
    data = Counter(reads)
    count = len(data)
    # on a tie, max() keeps the barcode that appears first in the list of reads
    abundant = max(reads, key=data.get)
    # add count to the end of the cell list
    reads.append(count)
    # repeat most abundant barcode at the end of the cell list
    reads.append(abundant)

# Write one row per cell: the cell id, the number of unique barcodes in the cell, the
# most abundant ("top") barcode, and that same barcode split into its ten sites.
header = ['cell.id', 'barcode.count', 'top.barcode'] + ['site' + str(i) for i in range(1, 11)]
with open('data/tables/unedited2_bestlineagebarcode.tsv', 'w') as output:
    output.write('\t'.join(header) + '\n')
    for cell, barcode in unedited2_dict.items():
        # barcode[-2] is the unique-barcode count, barcode[-1] the most abundant barcode
        top_barcode = barcode[-1]
        sites = top_barcode.split("_")
        # the top.barcode column and the site columns both come from the most
        # abundant barcode, so they always describe the same barcode
        output.write('\t'.join([cell, str(barcode[-2]), top_barcode] + sites[:10]) + '\n')

# Write all barcode reads kept for each cell. The first row of a cell carries the cell
# id and the unique-barcode count; the following rows of that cell leave those two
# columns as a single space.
# NOTE(review): every read is written, so a barcode seen in several reads appears on
# several rows - confirm whether only unique barcodes were intended.
header = ['cell.id', 'barcode.count', 'barcodes'] + ['site' + str(i) for i in range(1, 11)]
with open('data/tables/unedited2_all_lineagebarcodes_filtered.tsv', 'w') as output:
    output.write('\t'.join(header) + '\n')
    for cell, barcode in unedited2_dict.items():
        # the last two entries are the unique count and the top barcode, not reads
        for i, read in enumerate(barcode[:-2]):
            sites = read.split("_")
            lead = [cell, str(barcode[-2])] if i == 0 else [' ', ' ']
            output.write('\t'.join(lead + [str(read)] + sites[:10]) + '\n')