In [397]:
import regex
import pandas as pd
import numpy as np
import random

from Bio import Seq
from Bio import SeqIO

import cfg

## Load restrictions sites and codon tools


In [398]:
def load_restriction_sites():

    re_seqs = []
    for record in SeqIO.parse(cfg.RESTRICTSITES_FN, 'fasta'):
        re_seqs.append(record.seq)
        re_seqs.append(record.seq.reverse_complement())

    return(set(re_seqs))

def load_hs_codon_freq():
    '''
    Generate aa to codon dictionary.
    '''

    aa_codon_dt = pd.read_csv(cfg.DATA_DIR+'hs_codon_freq.csv')
    aa_to_codon = dict(aa_codon_dt.loc[
            aa_codon_dt.groupby('aa')['freq'].idxmax()][
                    ['aa','codon']].to_dict('split')['data'])

    return(aa_codon_dt, aa_to_codon)

AA_CODON_DT, AA_TO_CODON = load_hs_codon_freq()

def rand_codon(aa, orig=None):
    '''
    select random codon
    based on hs genome freq
    '''
    codon_rows = AA_CODON_DT[AA_CODON_DT['aa'] == aa]
    if orig:
        codon_rows = codon_rows[codon_rows.codon != orig]
        if not len(codon_rows):
            #no other codons, can't use orig
            return rand_codon(aa)
    return(np.random.choice(
            codon_rows.codon,
            p = codon_rows.freq/sum(codon_rows.freq)))

RE_SITES = load_restriction_sites()
RE_REGEX = regex.compile('('+'|'.join([str(re).replace('N','.') for re in RE_SITES])+')')
LIBRARY = pd.read_csv('out/results.csv')

## RE Check

Load the dictionary of mutated sequences generated by the TCR_library program from a saved CSV file. I then iterate through
the dataframe and extract the mutant DNA and AA sequence. These sequences are then fed to the remove_re_sites function.
remove_re_sites searches for instances of the specified site (including overlapping sequences) using the finditer function
provided in the regex module.

In [399]:
def remove_re_sites(
        nt_seq, aa_seq,
        pre=cfg.OLIGO_CHECK_PRE,
        post=cfg.OLIGO_CHECK_POST,
        re_regex= RE_REGEX,
        v= False):

    #Reformat input sequences to uppercase
    nt_seq = nt_seq.upper()
    pre = pre.upper()
    post = post.upper()

    assert(Seq.Seq(nt_seq).translate() == aa_seq)

    re_matches = re_regex.finditer(pre+nt_seq+post, overlapped = 1)
    replacements = 0
    codon_indices_replaced = []
    aa_replaced = []
    old_codons = []
    new_codons = []

    while re_matches:

        try:
            match = next(re_matches)
        except StopIteration:
            break


        if v:
            print('replacements:{}'.format(replacements))
            print(pre.lower() + nt_seq + post.lower())
            print('global match:'+str(match))

        if replacements > 100:
            raise Exception('too many codon replacement attempts') # something is up

        # get span of nucleotide indices

        nt_span = (
                max([0, match.span()[0]- len(pre)]),
                min([len(nt_seq), match.span()[1] - len(pre)]))

        # expand to corresponding codon indices
        aa_span = [i // 3 for i in nt_span]

        # convert codon indices back to nucleotides
        nt_rep_span = (aa_span[0]*3, aa_span[1]*3+3)

        if v: print('spans:'+str(nt_span)+str(aa_span)+str(nt_rep_span))

        # try replacing one codon at a time:
        for codon_i in random.sample(
                range(aa_span[0], aa_span[1]),
                aa_span[1]-aa_span[0]):

            old_codon = nt_seq[(codon_i*3):((codon_i*3)+3)]
            if v: print('AA codon index: ' + str(codon_i))
            new_codon = rand_codon(aa_seq[codon_i], orig=old_codon)
            if v: print(new_codon)

            if v:
                print('changing codon {}:{}({}) to {}({})'.format(
                        codon_i,
                        old_codon,
                        aa_seq[codon_i],
                        new_codon,
                        Seq.Seq(new_codon).translate()))

            new_nt_seq = (
                    nt_seq[:(codon_i*3)] +
                    new_codon +
                    nt_seq[((codon_i*3)+3):])

            # print('new seq: '+pre+new_nt_seq+post)
            # test for same match as found originally
            local_re_match = re_regex.search((pre+new_nt_seq+post)[
                    match.span()[0]:match.span()[1]])
            if v: print('local match: '+str(local_re_match))

            # if no more match, we're done, accept new seq
            if not local_re_match:
                nt_seq = new_nt_seq
                codon_indices_replaced.append(codon_i)
                aa_replaced.append(aa_seq[codon_i])
                old_codons.append(old_codon)
                new_codons.append(new_codon)
                break

        replacements += 1
        #redo search to see if we still have restriction sites

        re_matches = re_regex.finditer(pre+nt_seq+post, overlapped = 1)

    assert(Seq.Seq(nt_seq).translate() == aa_seq), '{} ne {}'.format(
        Seq.Seq(nt_seq).translate(),
        aa_seq)

    return(nt_seq, zip(
            codon_indices_replaced,
            aa_replaced,
            old_codons,
            new_codons))

In [400]:
def screen_library_for_re_sites(library):
    '''

    :param library: CSV file of unscreened sequences
    :return: CSV file of screened sequences
    '''

    screened_sequences = []
    re_sites_removed = 0

    for sequence in library.iterrows(): #Iterate through CSV and convert back into sequence dictionary

        screened_sequence = {
            'position': sequence[1][1],
            'original_aa': sequence[1][2],
            'new_aa': sequence[1][3],
            'new_codon': sequence[1][4],
            'new_dna_seq': sequence[1][5],
            'new_aa_seq': sequence[1][6]
        }

        dna_sequence_pre = screened_sequence['new_dna_seq']
        aa_sequence_pre = screened_sequence['new_aa_seq']

        dna_sequence_post, re_info = remove_re_sites(dna_sequence_pre,aa_sequence_pre, v= True) #Check for RE sites

        re_info = list(re_info)

        if len(re_info) > 0: #Update the DNA sequence if a site if found
            re_sites_removed += len(re_info[0])
            screened_sequence ['new_dna_seq'] = dna_sequence_post

        screened_sequences += [screened_sequence]


    return pd.DataFrame(screened_sequences), re_sites_removed

screened_library, re_sites = screen_library_for_re_sites(LIBRARY)

screened_library.to_csv('out/screened_library.csv')
print (screened_library)

replacements:0
cactgcCTGGAGGATCTGAAAAACGTGTTCCCTCCTGAAGTGGCTGTCTTTGAAGACTCCGAGGCCGAGATTTCCCATACCCAGAAAGCAACTCTGGTCTGTCTGGCCACTGGATTCTACCCCGATCACGTGGAACTGTCTTGGTGGGTGAACGGCAAGGAAGTCCATTCCGGAGTCTCTACCGACCCTCAGCCCCTCAAGGAGCAGCCTGCTCTCAACGATTCTCGGTACTGCCTGTCATCTCGACTGAGAGTGTCTGCCACCTTCTGGCAGAACCCTAGAAACCACTTTCGGTGTCAGGTCCAGTTTTACGGCCTGAGCGAGAACGATGAGTGGACACAGGATAGAGCCAAACCTGTGACACAGATTGTGAGCGCCGAGGCTTGGGGACGAGCCGATTGTGGCTTCACATCCGAGTCTTACCAGCAGGGAGTGCTGTCTGCTACAATCCTCTACGAAagaggg
global match:<regex.Match object; span=(51, 57), match='GAAGAC'>
spans:(45, 51)[15, 17](45, 54)
AA codon index: 16
GAT
changing codon 16:GAC(D) to GAT(D)
local match: None
replacements:0
cactgcCTGGAGGATCTGAAAAACGTGTTCCCTCCTGAAGTGGCTGTCTTTGAACCATCCGAGACCGAGATTTCCCATACCCAGAAAGCAACTCTGGTCTGTCTGGCCACTGGATTCTACCCCGATCACGTGGAACTGTCTTGGTGGGTGAACGGCAAGGAAGTCCATTCCGGAGTCTCTACCGACCCTCAGCCCCTCAAGGAGCAGCCTGCTCTCAACGATTCTCGGTACTGCCTGTCATCTCGACTGAGAGTGTCTGCCACCTTCTGGCAGAACCCTAGAAACCACTTTCGGTGTCAGGTCCAGTTTTACGGCCTGAGCGAGAACGATGA