In [2]:
import os
from Bio.SeqIO import QualityIO
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as mpatches

import gzip
import glob
import re
from DMS_utils import dna_rev_comp, translate_dna2aa
import pysam
import pandas as pd
import seaborn as sns
import pickle as pkl
import matplotlib.colors as mcolors
from scipy import stats
import os.path
from matplotlib.lines import Line2D
import json
import matplotlib.gridspec as gridspec
import shutil

from evaluation_functions import *
from importlib import reload

In [3]:
# Working directory at the time this cell runs; the CSV exports further down
# are written relative to it.
base_dir = os.getcwd()
# Full reference DNA sequence. It is passed as `ref_gene` to the demultiplexing
# step and to find_reference_seq when a section reference is extracted.
R5_Pool_seq = "GGCGCAACTATCGGTATCAAGCTGTTTAAGAAATTCACCTCGAAAGCAAGTTGATAAACTGATACAATTAAAGGCTCCTTTTGGAGCCTTTTTTTTTGGAGTAAGGAGGAAAAATGTCCGAGGAAGCGCAGAACGATCCGCTGCTGCCGGGCTATAGCTTTAACGCGCATCTGGTGGCGGGCCTGACCCCGATTGAAGCGAACGGCTATCTGGATTTTTTTATTGATCGCCCGCTGGGCATGAAAGGCTATATTCTGAACCTGACCATTCGCGGCCAGGGCGTGGTGAAAAACCAGGGCCGCGAATTTGTGTGCCGCCCGGGCGATATTCTGCTGTTTCCGCCGGGCGAAATTCATCATTATGGCCGCCATCCGGAAGCGCGCGAATGGTATCATCAGTGGGTGTATTTTCGCCCGCGCGCGTATTGGCATGAATGGCTGAACTGGCCGAGCATTTTTGCGAACACCGGCTTTTTTCGCCCGGATGAAGCGCATCAGCCGCATTTTAGCGATCTGTTTGGCCAGATTATTAACGCGGGCCAGGGCGAAGGCCGCTATAGCGAACTGCTGGCGATTAACCTGATAGAACAGCTGCTGCTGCGCCGCATGGAAGCGATTAACGAAAGCAGCGGTTTAGCCACAACGCTGGAACGCATTGAAAAGAATTTCGTAATCACAGACCCGCGCCTTCCCGACAATCCAATTATTTTTGCGTCCGATAGCTTCCTGCAATTAACCGAATACAGCCGCGAAGAAATTCTGGGTCGTAATTGTCGCTTCCTTCAGGGGCCAGAGACTGACCGTGCTACGGTACGCAAAATCCGCGACGCAATCGACAATCAAACGGAAGTCACGGTTCAGTTGATTAACTATACGAAGAGCGGAAAAAAATTCTGGAATTTATTTCACTTGCAGCCTATGCGTGACCAGAAGGGCGATGTCCAGTATTTCATTGGCGTTCAGCTTGATGGTACCGAGCATGTTCGCGATGCTGCGGAGCGTGAAGGTGTAATGTTAATTAAAAAGACTGCTGAAAACATTGATGAGGCGGCCAAAGGGAGCCTGCATCCGCCGATGGATAACCGCGTGCGCGAAGCGTGCCAGTATATTAGCGATCATCTGGCGGATAGCAACTTTGATATTGCGAGCGTGGCGCAGCATGTGTGCCTGAGCCCGAGCCGCCTGAGCCATCTGTTTCGCCAGCAGCTGGGCATTAGCGTGCTGAGCTGGCGCGAAGATCAGCGCATTAGCCAGGCGAAACTGCTGCTGAGCACCACCCGCATGCCGATTGCGACCGTGGGCCACAACGTGGGCTTTGATGATCAGCTGTATTTTAGCCGCGTGTTTAAAAAATGCACCGGCGCGAGCCCGAGCGAATTTCGCGCGGGCTGCGAAGAAAAAGTGAACGATGTGGCGGTGAAACTGAGCGGGTAAGGCTAATGGAGATTTTCAACATGGGCTAGCACAGCCCTAGGTATTATGCTAGCGTGGTGTCTGCGTAATAAGGAGTCTTAATCATGCCAGTTC"

# Adapter sequences; catch_left is the forward adapter.
catch_left = "ACACTCTTTCCCTACACGACGCTCTTCCGATCT"
catch_right = "GACTGGAGTTCAGACGTGTGCTCTTCCGATCT"

# Four-nucleotide barcodes, keyed "<barcode>_Fwd" / "<barcode>_Rev".
Barcodes = dict(
    BC1_Fwd="AAGG",
    BC1_Rev="ATCA",
    BC2_Fwd="AGTC",
    BC2_Rev="GCCG",
    BC3_Fwd="AGGA",
    BC3_Rev="AATT",
    BC4_Fwd="GCGA",
    BC4_Rev="TGGT",
)

# Forward / reverse primer of every section (S1..S4).
S1_fwd_primer = "GGCGCAACTATCGGTATCAAGC"
S1_rev_primer = "TCGGCCAGTTCAGCCATTCA"
S2_fwd_primer = "ATTATGGCCGCCATCCGGAAGCG"
S2_rev_primer = "TAGCACGGTCAGTCTCTGGC"
S3_fwd_primer = "GTCCGATAGCTTCCTGCAATTAACC"
S3_rev_primer = "ACATGCTGCGCCACGCTC"
S4_fwd_primer = "GATAACCGCGTGCGCGAA"
S4_rev_primer = "GAACTGGCATGATTAAGACTCCTTATTACG"

# Each reverse primer passed through dna_rev_comp.
S1_rev_primer_compseq = dna_rev_comp(S1_rev_primer)
S2_rev_primer_compseq = dna_rev_comp(S2_rev_primer)
S3_rev_primer_compseq = dna_rev_comp(S3_rev_primer)
S4_rev_primer_compseq = dna_rev_comp(S4_rev_primer)

# Same primers as a lookup table, keyed by the variable names used above.
Primer_seq = {
    "S1_fwd_primer": S1_fwd_primer,
    "S1_rev_primer": S1_rev_primer,
    "S2_fwd_primer": S2_fwd_primer,
    "S2_rev_primer": S2_rev_primer,
    "S3_fwd_primer": S3_fwd_primer,
    "S3_rev_primer": S3_rev_primer,
    "S4_fwd_primer": S4_fwd_primer,
    "S4_rev_primer": S4_rev_primer,
}

# Codon -> one-letter amino acid; '*' marks the stop codons.
genetic_code = {
  'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
  'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
  'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
  'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
  'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
  'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
  'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
  'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
  'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
  'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
  'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
  'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
  'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
  'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
  'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
  'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W',
}

# All 64 codons, in the order they appear in genetic_code.
codons = [codon for codon in genetic_code]

# Quality character -> score: the ASCII code minus 33, for scores 0..40
# ('!' .. 'I').
quality_score = {chr(33 + score): score for score in range(41)}

# Per-primer count of leading primer bases that are sliced off the primer and
# appended to the barcode when reads are trimmed for translation.
# NOTE(review): presumably the number of primer bases lying outside the codon
# frame -- confirm. The S4_rev_primer value was marked as uncertain ("?") by
# the original author.
Primer_out_of_triplets = dict(
    S1_fwd_primer=2,
    S1_rev_primer=1,
    S2_fwd_primer=2,
    S2_rev_primer=1,
    S3_fwd_primer=1,
    S3_rev_primer=2,
    S4_fwd_primer=0,
    S4_rev_primer=3,
)



In [4]:
def build_genotype_table_from_AAseqs(AAseqs_dict, ref_AAseq):
    """
    Count how often each amino-acid genotype occurs at each timepoint.

    A genotype is the space-separated list of substitutions of a read relative
    to the reference, each written as <reference AA><0-based position><read AA>
    (e.g. "K1R A7T"). A read without any substitution is counted as "WT".

    Only positions present in both the read and the reference are compared, so
    a read that is shorter or longer than the reference is judged on the
    overlapping prefix only.

    Parameters
    ----------
    AAseqs_dict : dict
        {timepoint: list of AA sequences}
    ref_AAseq : str
        reference AA sequence

    Returns
    -------
    dict
        {genotype: {timepoint: count}}; every genotype carries an entry for
        every timepoint of AAseqs_dict (0 where it was not observed).
    """
    timepoints = list(AAseqs_dict)
    genotypes = {}

    for timepoint, reads in AAseqs_dict.items():
        for read in reads:
            # zip() stops at the shorter sequence, which avoids the IndexError
            # that indexing the reference by read position raised for reads
            # longer than the reference.
            substitutions = [
                f"{ref_aa}{position}{read_aa}"
                for position, (ref_aa, read_aa) in enumerate(zip(ref_AAseq, read))
                if ref_aa != read_aa
            ]
            variant = " ".join(substitutions) or "WT"

            if variant not in genotypes:
                # first sighting: start all timepoints at zero
                genotypes[variant] = dict.fromkeys(timepoints, 0)
            genotypes[variant][timepoint] += 1

    return genotypes
        

In [5]:
# Quality characters handed to read_sequences through its `quality_score`
# argument ('.' and '/' are deliberately left out of the list).
# NOTE(review): presumably reads containing one of these characters are
# discarded -- confirm in evaluation_functions.read_sequences.
remove_read_qualities = [
    '!', '"', '#', '$', '%', '&', "'",
    '(', ')', '*', '+', ',', '-',
]

ref_gene = R5_Pool_seq

# Keyword arguments that are identical for both sequencing runs.
read_options = dict(
    arbitrary_cutoff_a=False,
    arbitrary_cutoff_b=False,
    catch_left="",
    catch_right="",
    return_qualities_ids=True,
    quality_score=remove_read_qualities,
)
demultiplex_options = dict(
    Barcodes=Barcodes,
    Primer_seq=Primer_seq,
    max_mismatch_primerseq=3,
    filter_for_n_mut=True,
    n_mut_treshold=10,
)

# --- Mutagenesis run: barcodes BC1 and BC2 ---
a_seq, b_seq, _, _, a_ids, b_ids = read_sequences(variant="Mutagenesis", **read_options)
Mutagenesis_reads, Mut_ids_Dict = demultiplex_reads(
    a_seq,
    b_seq,
    ref_gene,
    used_Barcodes=["BC1", "BC2"],
    Sections=["S1", "S2", "S3", "S4"],
    a_ids=a_ids,
    b_ids=b_ids,
    **demultiplex_options,
)

# --- NegPosSelection run: barcodes BC1 to BC4 ---
a_seq, b_seq, _, _, a_ids, b_ids = read_sequences(variant="NegPosSelection", **read_options)
NegPosSelection_reads, NegPos_ids = demultiplex_reads(
    a_seq,
    b_seq,
    ref_gene,
    used_Barcodes=["BC1", "BC2", "BC3", "BC4"],
    Sections=["S1", "S2", "S3", "S4"],
    a_ids=a_ids,
    b_ids=b_ids,
    **demultiplex_options,
)

# Merge both runs into one lookup each; the run name is prefixed to every key
# so the two runs cannot collide.
all_reads = {"Mutagenesis_" + key: value for key, value in Mutagenesis_reads.items()}
all_reads.update({"NegPosSelection_" + key: value for key, value in NegPosSelection_reads.items()})

all_ids = {"Mutagenesis_" + key: value for key, value in Mut_ids_Dict.items()}
all_ids.update({"NegPosSelection_" + key: value for key, value in NegPos_ids.items()})

total reads 32117
BC1
total forward reads before filtering 3907
total reverse reads before filtering 3907
total forward reads after filtering 3079
total reverse reads after filtering 3186
3907 reads
total forward reads before filtering 4347
total reverse reads before filtering 4347
total forward reads after filtering 3395
total reverse reads after filtering 3312
4347 reads
total forward reads before filtering 5561
total reverse reads before filtering 5561
total forward reads after filtering 4529
total reverse reads after filtering 4236
5561 reads
total forward reads before filtering 6086
total reverse reads before filtering 6086
total forward reads after filtering 4956
total reverse reads after filtering 5151
6086 reads
BC2
total forward reads before filtering 4193
total reverse reads before filtering 4193
total forward reads after filtering 3093
total reverse reads after filtering 3542
4193 reads
total forward reads before filtering 3735
total reverse reads before filtering 3735
total

In [9]:
# Every read-set name in all_reads that contains both "S4" and "R1".
keys = [read_set for read_set in all_reads if "S4" in read_set and "R1" in read_set]
print(keys)

['Mutagenesis_BC1_S4_R1', 'Mutagenesis_BC2_S4_R1', 'NegPosSelection_BC1_S4_R1', 'NegPosSelection_BC2_S4_R1', 'NegPosSelection_BC3_S4_R1', 'NegPosSelection_BC4_S4_R1']


In [14]:
## build genotype table:
# Section analysed in this cell. The read-set keys, the primer lookups and the
# reference sequence below are all derived from it, so they cannot drift apart.
Section = "S4"

# (run, barcode) of every read set, in the order that defines the timepoint
# label: list position i is stored under AA_seqs_dict[str(i)].
samples = [
    ("Mutagenesis", "BC1"),
    ("NegPosSelection", "BC1"),
    ("NegPosSelection", "BC2"),
    ("Mutagenesis", "BC2"),
    ("NegPosSelection", "BC3"),
    ("NegPosSelection", "BC4"),
]
keys = [f"{run}_{barcode}_{Section}_R1" for run, barcode in samples]
Barcode_order = [barcode for _run, barcode in samples]
AA_seqs_dict = {}

# Leading bases of the section's forward primer that are cut off together with
# the barcode before translation.
fwd_primer_name = Section + "_fwd_primer"
primer_head = Primer_seq[fwd_primer_name][:Primer_out_of_triplets[fwd_primer_name]]

for timepoint, (step, Barcode) in enumerate(zip(keys, Barcode_order)):
    # Named read_prefix (not catch_left) so the global adapter constant
    # catch_left is not overwritten by this cell.
    read_prefix = Barcodes[Barcode + "_Fwd"] + primer_head

    # Keep only reads that contain the prefix and drop everything up to and
    # including its first occurrence.
    reads = [
        read[read.index(read_prefix) + len(read_prefix):]
        for read in all_reads[step]
        if read_prefix in read
    ]
    AAseqs = [translate_dna2aa(read) for read in reads]
    AA_seqs_dict[str(timepoint)] = AAseqs

# Reference of the same section, translated with the same helper as the reads.
ref_seq_Section = find_reference_seq(ref_gene=R5_Pool_seq, Primer_seq=Primer_seq, Section=Section, Primer_out_of_triplets=Primer_out_of_triplets)
ref_AAseq = translate_dna2aa(ref_seq_Section)

In [28]:
# A genotype is kept only if its percentages, summed over all timepoints,
# exceed this value (percentage points).
MIN_TOTAL_PERCENT = 5
# Small constant added to every kept cell so that no exported value is exactly 0.
# NOTE(review): presumably required by the downstream tool -- confirm.
PSEUDOCOUNT = 1e-10

genotype_counts = pd.DataFrame.from_dict(
    build_genotype_table_from_AAseqs(AA_seqs_dict, ref_AAseq),
    orient='index',
)

## calculate as percentage (each timepoint column sums to 100)
genotype_percent = genotype_counts / genotype_counts.sum() * 100

## drop rows whose percentages summed over all timepoints are not above MIN_TOTAL_PERCENT
genotypes = genotype_percent.loc[genotype_percent.sum(axis=1) > MIN_TOTAL_PERCENT]
genotypes = genotypes + PSEUDOCOUNT

# Every genotype descends directly from WT and WT itself has no parent. The
# parent is chosen by name rather than by row position, because the row order
# depends on which genotype was read first and WT is not guaranteed to be the
# first row.
adjacency_matrix = pd.DataFrame()
adjacency_matrix["names"] = genotypes.index
adjacency_matrix["parents"] = [np.nan if name == "WT" else "WT" for name in genotypes.index]


adjacency_matrix.to_csv(f"{base_dir}/adjacency_matrix.csv", index=False)

genotypes.to_csv(f"{base_dir}/genotype_table.csv", index = True)
