#### This notebook uses precomputed `barcode_stat` files. For the command that generates them, see `Notebook_Test_IndividualBarcode.ipynb`.

In [4]:
import pandas as pd
import numpy as np
from Bio.Seq import reverse_complement

import numpy as np

def seq_index_map(table):
    """Return a dict mapping every value of ``table['seq']`` to the
    ``table['Index']`` value found on the same row."""
    mapping = {}
    for row in table.index:
        mapping[table['seq'][row]] = table['Index'][row]
    return mapping


# Barcode table: header-less, space-separated file whose three columns are
# named 'skip', 'Index' and 'seq' here.
barcode_list = pd.read_csv(
    '../data/MGI_index_list.txt',
    sep=' ',
    header=None,
    names=('skip', 'Index', 'seq'),
)
barcode_list_dict = seq_index_map(barcode_list)

# Same table, but every barcode sequence is replaced by its reverse complement.
barcode_list_rev = barcode_list.copy()
barcode_list_rev['seq'] = list(map(reverse_complement, barcode_list['seq']))
barcode_list_rev_dict = seq_index_map(barcode_list_rev)

In [5]:


# Read orientations; each one selects a stat-file suffix and a barcode lookup.
FORWARD = 'forward'
REVERSE = 'reverse'
# Barcode numbers 1..128 as strings, because they are used in the file names.
barcodes = list(map(str, range(1,129)))

folder = '../data/barcode_stats/'

# Subfolders of `folder` that are scanned, in this order.
subfolders = [
    'wgs_metagenomes',
    'bacteria',
    'bats',
    'molodtsov'
]

# The annotated tables of this barcode are stored in `saved_df`.
barcode_to_save = '104'

# Barcode sequence -> barcode index. Forward reads are matched against the
# listed sequences, reverse reads against their reverse complements.
seq_to_index = {
    FORWARD: barcode_list_dict,
    REVERSE: barcode_list_rev_dict,
}

# Percentage of reads carrying a non-target barcode; one entry is appended
# for every stat file that yields a rate.
rates = {
    FORWARD: [],
    REVERSE: [],
}

# read_type -> annotated table of `barcode_to_save` (last subfolder wins).
saved_df = {}


def annotate_known_barcodes(stat, lookup):
    """Keep the rows of ``stat`` whose sequence is a known barcode.

    Parameters
    ----------
    stat : pandas.DataFrame
        Table with the columns ``Count`` and ``seq``.
    lookup : dict
        Maps barcode sequence -> barcode index.

    Returns
    -------
    pandas.DataFrame
        A new table (``stat`` is not modified) holding only the rows whose
        ``seq`` is a key of ``lookup``, with an added ``Index`` column and
        the row index renumbered from 0.
    """
    known = stat[stat['seq'].isin(list(lookup))]
    # `assign` works on a copy, so no SettingWithCopyWarning is triggered.
    return known.assign(Index=known['seq'].map(lookup)).reset_index(drop=True)


for subfolder in subfolders:

    print(subfolder)
    for barcode in barcodes:

        rate = {}
        total_bc = {}

        for read_type in (FORWARD, REVERSE):

            raw = pd.read_csv(
                f'{folder}/{subfolder}/{barcode}_stat_{read_type}.txt',
                sep=' ',
                skipinitialspace = True,
                header = None,
                names = ('Count', 'seq')
            )

            # Empty stat file: nothing to measure for this read type.
            if len(raw) == 0:
                continue

            data = annotate_known_barcodes(raw, seq_to_index[read_type])

            if barcode == barcode_to_save:
                saved_df[read_type] = data

            total = data['Count'].sum()
            # No known barcode in the file: the rate would be 0/0, so skip it.
            if total == 0:
                continue

            # Reads assigned to the barcode this file belongs to. Summing the
            # selection gives 0 when the target barcode is absent, instead of
            # silently reusing the count left over from a previous file.
            target_count = data.loc[data['Index'] == int(barcode), 'Count'].sum()

            percent = (total - target_count) * 100/total

            rate[read_type] = percent
            total_bc[read_type] = total

            rates[read_type].append(percent)

        # Report only barcodes for which both read types produced a rate.
        # Columns: barcode, forward %, reverse %, forward total, reverse total.
        if FORWARD in rate and REVERSE in rate:
            print(f'{barcode}', f'{round(rate[FORWARD], 2)}%', f'{round(rate[REVERSE], 2)}%', total_bc[FORWARD], total_bc[REVERSE], sep='\t')

    print()

wgs_metagenomes
42	1.51%	0.31%	4799393	1575615
43	0.08%	0.01%	721719	401868
44	2.15%	0.32%	2049531	764418
45	3.71%	0.79%	1933276	651929
65	2.76%	0.49%	2222598	754232
66	1.92%	0.22%	2078544	702973
67	1.27%	0.22%	3757871	1428055
68	0.91%	0.1%	3095884	1200687

bacteria
41	7.04%	2.76%	20282	6298
46	1.71%	0.49%	192786	60333
47	3.52%	1.15%	75240	18791
48	2.78%	0.92%	220146	69937

bats
73	1.42%	0.23%	1006645	500189
74	1.21%	0.1%	1274957	605771
75	2.29%	0.27%	785660	345108
76	1.61%	0.22%	871336	341681
77	1.43%	0.21%	998777	412973
78	1.82%	0.26%	1462596	661921
79	3.03%	0.71%	434316	164141
80	8.41%	1.99%	259018	85300
81	2.48%	0.39%	514137	235614
82	2.37%	0.27%	512193	225202
83	2.1%	0.29%	608697	251354
84	1.73%	0.21%	874144	388697
85	1.57%	0.2%	1116461	527364
86	1.81%	0.24%	866633	385674
87	1.86%	0.27%	742732	365479
88	2.14%	0.34%	1389366	643877
89	1.66%	0.19%	776627	335098
90	1.59%	0.24%	986116	455138
91	1.43%	0.2%	1165070	517730
92	1.46%	0.18%	953817	385509
94	3.52%	0.6%	641937	359088

molodtso