In [1]:
import os
from Bio.SeqIO import QualityIO
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.cm as cm
import gzip
import glob
import re
from DMS_utils import dna_rev_comp, translate_dna2aa
import pandas as pd
import seaborn as sns
import pickle as pkl
import matplotlib.colors as mcolors
from scipy import stats
import os.path
from matplotlib.lines import Line2D
import json
import shutil
#from evaluation_functions import *
from functions_ import *
from plotting import *
from Bio import SeqIO
import matplotlib.patches as patches
from collections import Counter
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
### Run-wide constants: working directory, flanking sequences, barcodes, primers

base_dir = os.getcwd()

# Flanking sequences used to locate the retron sequence inside a read
catch_left = 'ACTGTCTGTTTTCCTGAATTC'  # region before retron seq
catch_right = 'GTACGAATTCAGGAAA'  # region after retron seq

# One forward and one reverse barcode per barcode name (BC1..BC3)
Barcodes = dict(
    BC1_fwd="AAGG",
    BC1_rev="ATCA",
    BC2_fwd="AGTC",
    BC2_rev="GCCG",
    BC3_fwd="AGGA",
    BC3_rev="AATT",
)

# Primer sequences, keyed by the read-direction suffix
Primer_seq = dict(
    _fwd="AGCGAGAGGTTTATCATTAAGGTCAAC",
    _rev="AGGCTTTTGACTTGGCTGAGGAG",
)

# Zero for both directions.
# NOTE(review): presumably the number of primer bases lying outside the codon frame - TODO confirm
Primer_out_of_triplets = dict.fromkeys(Primer_seq, 0)


# Codon -> one-letter amino acid ('*' marks a stop codon).
# Each row lists four codons and, position by position, the residue each one encodes.
# The row order fixes the insertion order of the dict and therefore of `codons`.
genetic_code = {
    codon: amino_acid
    for codon_row, residue_row in (
        ("ATA ATC ATT ATG", "IIIM"),
        ("ACA ACC ACG ACT", "TTTT"),
        ("AAC AAT AAA AAG", "NNKK"),
        ("AGC AGT AGA AGG", "SSRR"),
        ("CTA CTC CTG CTT", "LLLL"),
        ("CCA CCC CCG CCT", "PPPP"),
        ("CAC CAT CAA CAG", "HHQQ"),
        ("CGA CGC CGG CGT", "RRRR"),
        ("GTA GTC GTG GTT", "VVVV"),
        ("GCA GCC GCG GCT", "AAAA"),
        ("GAC GAT GAA GAG", "DDEE"),
        ("GGA GGC GGG GGT", "GGGG"),
        ("TCA TCC TCG TCT", "SSSS"),
        ("TTC TTT TTA TTG", "FFLL"),
        ("TAC TAT TAA TAG", "YY**"),
        ("TGC TGT TGA TGG", "CC*W"),
    )
    for codon, amino_acid in zip(codon_row.split(), residue_row)
}

# All 64 codons, in the order they were inserted above
codons = list(genetic_code)


# FASTQ quality character -> Phred score, for scores 0 ('!') through 40 ('I').
# The character for score q is chr(33 + q), i.e. the Phred+33 encoding.
quality_score = {chr(33 + phred): phred for phred in range(41)}


# One codon per amino acid (one-letter code): the codons used for
# retron library (RL8) construction.
ecoli_pref = dict(
    A='GCG', R='CGT', N='AAC', D='GAT', C='TGC',
    Q='CAG', E='GAA', G='GGC', H='CAT', I='ATT',
    L='CTG', K='AAA', M='ATG', F='TTT', P='CCG',
    S='AGC', T='ACC', W='TGG', Y='TAT', V='GTG',
)

In [16]:

### Load the paired reads and split them by barcode.
### Run the cell defining Barcodes / Primer_seq first: the primer dict could have been
### updated during analysis of the repeated seq run of S2, S3.

# Quality characters handed to read_sequences: every character from '!' up to and including '5'
# (Phred 0-20 in the quality_score table). Cut-offs tried earlier ended at '-' (Phred 12)
# and at '/' (Phred 14).
remove_read_qualities = [chr(code) for code in range(ord('!'), ord('5') + 1)]

used_BCs = ["BC1", "BC2", "BC3"]
variant = "RetronLib"
filter_for_n_mut = False
read_len_treshold = None

## variant 4
a_seq, b_seq, _, _, a_ids, b_ids = read_sequences(
    variant=variant,
    arbitrary_cutoff_a=False,
    arbitrary_cutoff_b=False,
    catch_left=catch_left,
    catch_right=catch_right,
    return_qualities_ids=True,
    quality_score=remove_read_qualities,
    base_dir=f"{os.getcwd()}/data/fastq/P0111_Retrons",
)

# No reference gene is supplied; barcode sequences are cut, primer starts are kept.
all_reads, all_ids = demultiplex_reads(
    a_seq,
    b_seq,
    ref_gene=None,
    Barcodes=Barcodes,
    Primer_seq=Primer_seq,
    used_Barcodes=used_BCs,
    Sections=[""],
    max_mismatch_primerseq=5,
    filter_for_n_mut=filter_for_n_mut,
    n_mut_treshold=10,
    a_ids=a_ids,
    b_ids=b_ids,
    read_len_treshold=read_len_treshold,
    Primer_out_of_triplets=Primer_out_of_triplets,
    cut_primer_start=False,
    cut_BC_seq=True,
)


total reads 92086
[0, 2, 6, 8, 11, 12, 17, 18, 24, 25]
33237
6941 b reads are empty
3377 a reads are empty
24860 forward reads with matching BC and primer seq
18871 reads with index swapping
BC1  24860 reads before filtering
################# Done: BC1 
################# Done: BC1
[3, 5, 7, 9, 16, 19, 20, 29, 31, 34]
26794
5306 b reads are empty
2637 a reads are empty
19470 forward reads with matching BC and primer seq
15921 reads with index swapping
BC2  19470 reads before filtering
################# Done: BC2 
################# Done: BC2
[1, 13, 14, 15, 21, 22, 23, 28, 32, 36]
13558
4851 b reads are empty
1180 a reads are empty
9280 forward reads with matching BC and primer seq
12687 reads with index swapping
BC3  9280 reads before filtering
################# Done: BC3 
################# Done: BC3


In [17]:
## cut all reads to region of interest:
# NOTE: this cell overwrites `all_reads` in place. The flanks are removed by the trimming, so
# running it a second time without re-running the demultiplexing cell will typically blank the reads.

def _extract_between(read, left_flank, right_flank):
    """Return the part of `read` between `left_flank` and `right_flank`.

    The closing flank is searched for only after the end of the opening flank,
    so a copy of it that happens to occur earlier in the read does not produce
    an empty slice. Returns "" when either flank is missing.
    """
    left_pos = read.find(left_flank)
    if left_pos == -1:
        return ""
    region_start = left_pos + len(left_flank)
    region_end = read.find(right_flank, region_start)
    if region_end == -1:
        return ""
    return read[region_start:region_end]


# Reverse complements of the flanks, computed once instead of for every read
catch_left_rc = dna_rev_comp(catch_left)
catch_right_rc = dna_rev_comp(catch_right)

# Only existing keys are reassigned, so iterating over items() while updating is safe
for key, reads in all_reads.items():
    if "R1" in key:
        all_reads[key] = [_extract_between(read, catch_left, catch_right) for read in reads]
    elif "R2" in key:
        # In R2 reads the flanks appear reverse-complemented and in swapped order
        all_reads[key] = [_extract_between(read, catch_right_rc, catch_left_rc) for read in reads]


In [22]:
# Preview the first 15 entries of the BC3 R1 read list rather than displaying all of them
all_reads["BC3__R1"][:15]

['',
 'TACGAAGAGCGGAAAAAAATTCTGGAATTTATTTCACTTGCAGATGATGCGTGACCAGAAGGGCGATGTCCAGTATTTCATTGGCGTTCA',
 'TATTTTTGCGTCCGATAGCTTCCTGCAGTTAACCGAATACAGCCGCGAAGAAATTCT',
 'CGAGCATGTTCGCGATGCTGCGGAGCGTGAAGGTGTAATGTTAATTAAAAAGACTGCTGAAAACATTGTGAGGCGGCCAAAGGGAGCCTGCATCCGCC',
 'TCACTTGCAGCCTATGCGTGACCAGAAGCTGGATGTCCAGTATTTCATTGGCGTTCAGCTTGATGGTACCGAGCA',
 '',
 '',
 'GCTGCTGCTGCGCCGCATGGAAGCGATTAACGAAAGCAGCGGTTGCGCCACAACGCTGGAACGCATTGAAAAGAATTTCGTAATCACAGACCC',
 'CAGCCGCGAAGAAATTCTGGGTCGTAATTGTCGCTTCCTTCAGGCGCCAGAGACTGACCGTGCTACGGTACGCAAAATCCGCGACGCAAT',
 'GCTGCTGCGCCGCATGGAAGCGATTAACGAAAGCAGCGGTTTATGGACAACGCTGGAACGCATTGAAAAGAATTTCGTAATCACAGACCC',
 '',
 'GCTGCTGCTGCGCCGCATGGAAGCGATTAACGAAAGCAGCGGTTGGGCCACAACGCTGGAACGCATTGAAAAGAATTTCGTAATCACAGACCCGCGCCTTCCCGACAATCCAATTATTTT',
 'AATTATTTTTGCGTCCGATAGCTTCTGTGCTTAACCGAATACAGCCGCGAAGAAATTCTGGGTCGTAATTGTCGCTTCCTTCTGGGGCCGAGACTGACCGTGCTACGGTACGCAAAATCCGCGACGC',
 'CTTCGCGATGCTGCGGAGCGTGAAGGTGTAAT',
 '',
 'TAACGAAAGCAGCGGTTTAGCCACAACGCTGGAACGCATTGAAGCGAATTT

In [92]:
# Export the reads of one barcode / read direction as a FASTA file.
Bc = "BC3"

Read_dir = "R2"

read_key = f"{Bc}__{Read_dir}"

# R2 reads are reverse-complemented before saving; R1 reads are written as they are
# (presumably so both files share the same strand orientation - TODO confirm)
reads = all_reads[read_key] if Read_dir == "R1" else [dna_rev_comp(r) for r in all_reads[read_key]]

output_file = f"data/fastq/P0111_Retrons/{variant}_{Bc}_Nt_filt_{Read_dir}_001.fasta"

# One record per read; the i-th id in all_ids belongs to the i-th read
sequences = [
    SeqRecord(Seq(read), id=all_ids[read_key][i], description=f"{variant} {Bc} DNA sequence")
    for i, read in enumerate(reads)
]

# Write the file exactly once (it was previously written twice in a row) and keep the record count
with open(output_file, "w") as output_handle:
    count = SeqIO.write(sequences, output_handle, "fasta")
print("Saved %i records to %s" % (count, output_file))

Saved 9280 records to data/fastq/P0111_Retrons/RetronLib_BC3_Nt_filt_R2_001.fasta


In [74]:
# Upper-case a hard-coded DNA sequence; the result is only displayed as the cell output and
# is not assigned to any name.
# NOTE(review): presumably the reference/amplicon sequence of the library - TODO confirm
"cagctgctgctgcgccgcatggaagcgattaacgaaagcagcggtttagccacaacgctggaacgcattgaaaagaatttcgtaatcacagacccgcgccttcccgacaatccaattatttttgcgtccgatagcttcctgcaattaaccgaatacagccgcgaagaaattctgggtcgtaattgtcgcttccttcaggggccagagactgaccgtgctacggtacgcaaaatccgcgacgcaatcgacaatcaaacggaagtcacggttcagttgattaactatacgaagagcggaaaaaaattctggaatttatttcacttgcagcctatgcgtgaccagaagggcgatgtccagtatttcattggcgttcagcttgatggtaccgagcatgttcgcgatgctgcggagcgtgaaggtgtaatgttaattaaaaagactgctgaaaacattgatgaggcggccaaagggagcctgcatccgccgatggataaccgcgtgcgcgaagcgtgc".upper()

'CAGCTGCTGCTGCGCCGCATGGAAGCGATTAACGAAAGCAGCGGTTTAGCCACAACGCTGGAACGCATTGAAAAGAATTTCGTAATCACAGACCCGCGCCTTCCCGACAATCCAATTATTTTTGCGTCCGATAGCTTCCTGCAATTAACCGAATACAGCCGCGAAGAAATTCTGGGTCGTAATTGTCGCTTCCTTCAGGGGCCAGAGACTGACCGTGCTACGGTACGCAAAATCCGCGACGCAATCGACAATCAAACGGAAGTCACGGTTCAGTTGATTAACTATACGAAGAGCGGAAAAAAATTCTGGAATTTATTTCACTTGCAGCCTATGCGTGACCAGAAGGGCGATGTCCAGTATTTCATTGGCGTTCAGCTTGATGGTACCGAGCATGTTCGCGATGCTGCGGAGCGTGAAGGTGTAATGTTAATTAAAAAGACTGCTGAAAACATTGATGAGGCGGCCAAAGGGAGCCTGCATCCGCCGATGGATAACCGCGTGCGCGAAGCGTGC'

In [19]:
# Count how often the reverse read carries the barcode that belongs to the forward read's barcode.
same_bc = 0
diff_bc = 0

bc_len = 4  # barcodes are the first 4 bases of each read

# Expected reverse barcode for every forward barcode. Derived from Barcodes / used_BCs
# (the same inputs given to demultiplex_reads) so the pairing cannot drift from the config.
BC_corresp = {Barcodes[f"{bc}_fwd"]: Barcodes[f"{bc}_rev"] for bc in used_BCs}

for idx, a_read in enumerate(a_seq):
    expected_rev_bc = BC_corresp.get(a_read[:bc_len])
    # NOTE(review): reads whose forward barcode is not a known barcode are counted as
    # "swapped" too, so diff_bc also contains unrecognised barcodes, not only true swaps.
    if expected_rev_bc is not None and b_seq[idx][:bc_len] == expected_rev_bc:
        same_bc += 1
    else:
        diff_bc += 1

total_bc = same_bc + diff_bc

print("same BC: ", same_bc)
print("idx swapping BC: ", diff_bc)
if total_bc:
    print(diff_bc/total_bc, " of reads have swapped BCs") # roughly 40% !!!!!
else:
    # Avoid a ZeroDivisionError when no reads were loaded
    print("no reads to compare")

same BC:  53614
idx swapping BC:  38472
0.417783376409009  of reads have swapped BCs
