# TCR Library Design

In [417]:
import pandas as pd
import numpy as np

from Bio import Seq
from Bio import SeqIO


In [418]:
#Import reference sequence as DNA and AA seq objects

# The FASTA record's sequence is read first and translated to obtain the
# matching amino-acid sequence.
REF_DNA = SeqIO.read("data/TRBC_Extracellular.fasta", "fasta").seq
REF_AA = Seq.translate(REF_DNA)

# Upper-case the DNA after translating; presumably so that the per-base codon
# comparisons done by the mutation generators are not affected by the case
# used in the FASTA file -- TODO confirm.
REF_DNA = REF_DNA.upper()

# Indexed by amino-acid letter (column name) in generate_subs to obtain the
# list of substitute residues for that amino acid.
BLOSSOM_SUB_TABLE = pd.read_csv('data/blosum80_subs_plus_a.csv') # Dataframe with list of substitutions for a given amino acid based on blossom matrix

In [419]:
def homologous_sub_freq():
    '''
    Generate a dictionary with a ranked list of substitutions for each position.

    Reads 'data/homologous_subs.csv'. The "AA" column names the amino acid of
    each row; every column after the first is treated as one sequence position
    whose header is converted with int().

    :return: dict mapping int(position) to a list of (aa, frequency) tuples,
        sorted by descending frequency (ties keep the CSV row order)
    '''
    homology_dt = pd.read_csv('data/homologous_subs.csv') #Sub frequencies from alignment
    row_aas = list(homology_dt["AA"]) #Amino acid represented by each row

    ranked_subs = {}
    #First column holds the AA labels, so only the remaining columns are positions
    for column in homology_dt.columns[1:]:
        aa_freq_pairs = zip(row_aas, homology_dt[column])
        ranked_subs[int(column)] = sorted(
            aa_freq_pairs, key=lambda pair: pair[1], reverse=True)

    return ranked_subs

def load_top_codons():
    '''
    Generate a dictionary of the top 2 codons for each AA.

    Reads 'data/hs_codon_freq.csv', which must provide 'aa', 'codon' and
    'freq' columns.

    :return: tuple (top_codons, aa_codon_dt). top_codons maps each amino acid
        to a list of up to two codons ordered from most to least frequent;
        aa_codon_dt is the full codon frequency DataFrame as read from disk.
    '''
    aa_codon_dt = pd.read_csv('data/hs_codon_freq.csv') #Dataframe of codon frequencies

    #Walk the groups directly. Indexing the GroupBy with a bare 'codon','freq'
    #tuple was deprecated and is rejected by pandas >= 2.0, and selecting the
    #codon by column name avoids positional lookups on the row Series.
    top_codons = {}
    for aa, codon_rows in aa_codon_dt.groupby('aa'):
        best_rows = codon_rows.nlargest(2, columns=['freq'])
        top_codons[aa] = list(best_rows['codon'])

    return top_codons, aa_codon_dt

# Module-level lookup tables, built once from the data files and read by the
# codon-selection and mutation-generation functions.
TOP_2_CODONS, AA_CODON_DT = load_top_codons()
HOMOLOGY_SUB_TABLE = homologous_sub_freq()

def rand_codon(aa, orig=None):
    '''
    Draw a random codon for an amino acid, weighted by the 'freq' column of
    AA_CODON_DT (hs genome codon frequencies).

    :param aa: amino acid whose codons are sampled
    :param orig: optional codon to exclude from the draw; when it is the only
        codon available for aa, the exclusion is dropped and it may be returned
    :return: the sampled codon
    '''
    candidates = AA_CODON_DT[AA_CODON_DT['aa'] == aa]

    if orig:
        alternatives = candidates[candidates.codon != orig]
        if len(alternatives) == 0:
            #no other codons, can't use orig
            return rand_codon(aa)
        candidates = alternatives

    #Normalise so the remaining frequencies sum to 1
    weights = candidates.freq/sum(candidates.freq)
    return np.random.choice(candidates.codon, p = weights)

  top_2_codons_dt = aa_codon_dt.groupby(['aa'])['codon','freq'].apply(lambda x: x.nlargest(2, columns=['freq']))


## Mutation Generation:

Mutations are generated by iterating through the amino acid sequence of the protein. The first 4 substitutions for a
given position are alanine and the top 3 results from the BLOSUM table. I then plan to look for the most frequent
mutation seen at that position in homologous sequences that is not included in the initial set of substitutions.

This list of AA substitutions will be fed to Dan's code to pick the top two codons for each AA, and formatted into a
dictionary that includes the position mutated, the original AA at that position, the new AA at that position, the
new codon used, the new DNA sequence, and the new AA sequence.

These dictionaries

In [420]:
def delete_position(ref_dna, ref_aa, aa_i):
    '''
    Build the library record for deleting a single residue.

    :param ref_dna: reference DNA sequence (three bases per residue of ref_aa)
    :param ref_aa: reference amino-acid sequence
    :param aa_i: zero-based index of the residue to delete
    :return: single-element list holding the deletion record
    '''

    codon_start = 3 * aa_i
    codon_end = codon_start + 3

    #Keyword order fixes the key order of the record
    record = dict(
        position=aa_i,
        original_aa=ref_aa[aa_i],
        original_codon=ref_dna[codon_start:codon_end],
        new_aa="DEL",
        new_codon="",
        new_dna_seq=ref_dna[:codon_start] + ref_dna[codon_end:],
        new_aa_seq=ref_aa[:aa_i] + ref_aa[aa_i + 1:],
        dna_change_s=codon_start,
        dna_change_e=codon_end - 1,
    )

    return [record]

def insert_stop(ref_dna, ref_aa, aa_i):
    """
    Generates a stop codon at a given position in the input ref_dna sequence

    The stop codon used is the first entry of TOP_2_CODONS["*"].

    :param ref_dna: DNA sequence being mutated
    :param ref_aa: AA sequence being mutated
    :param aa_i: Position at which the stop codon is introduced
    :return: Single-element list holding the stop record, or an empty list
        when the codon at aa_i already equals the stop codon (no base changes)
    """
    dna_i = aa_i * 3
    original_aa = ref_aa[aa_i] #AA being replaced
    original_codon = ref_dna[dna_i:dna_i + 3]
    new_codon = TOP_2_CODONS["*"][0]

    #Identify location of changed DNA indexes for tiling
    changed_dna_i = [dna_i + i for i in range(len(new_codon)) if original_codon[i] != new_codon[i]]

    #Nothing to record if no base differs; min()/max() would also raise on an empty list
    if not changed_dna_i:
        return []

    sub_sequence = {
        "position": aa_i,
        "original_aa": original_aa,
        "original_codon": original_codon,
        "new_aa": "STOP",
        "new_codon": new_codon,
        "new_dna_seq": ref_dna[:dna_i] + new_codon + ref_dna[dna_i + 3:],
        "new_aa_seq": ref_aa[:aa_i] + '*' + ref_aa[aa_i+1:],
        "dna_change_s": int(min(changed_dna_i)),
        "dna_change_e": int(max(changed_dna_i))
    }

    return [sub_sequence]

def _substitution_record(ref_dna, ref_aa, aa_i, new_aa, new_codon, new_aa_seq):
    '''
    Build one mutation record for replacing the codon at aa_i with new_codon.

    :return: the record dictionary, or None when new_codon does not differ from
        the codon already present (there would be no changed base to report)
    '''
    dna_i = aa_i * 3
    original_codon = ref_dna[dna_i:dna_i + 3]

    #Identify location of changed DNA indexes for tiling
    changed_dna_i = [dna_i + i for i in range(len(new_codon)) if original_codon[i] != new_codon[i]]

    if not changed_dna_i:
        return None

    return {
        "position": aa_i,
        "original_aa": ref_aa[aa_i],
        "original_codon": original_codon,
        "new_aa": new_aa,
        "new_codon": new_codon,
        "new_dna_seq": ref_dna[:dna_i] + new_codon + ref_dna[dna_i + 3:],
        "new_aa_seq": new_aa_seq,
        "dna_change_s": int(min(changed_dna_i)),
        "dna_change_e": int(max(changed_dna_i))
    }


def generate_subs(ref_dna, ref_aa, aa_i):
    '''
    Generates substitute sequence dictionary entries for a given position: the substitutions listed for
    the original AA in BLOSSOM_SUB_TABLE, any additional substitutions found in homologous sequences,
    and a synonymous mutation.

    Each substitute AA yields one record per codon in TOP_2_CODONS. Candidate codons identical to the
    original codon are skipped, since they change no base.

    :param ref_dna: DNA sequence being mutated
    :param ref_aa:  AA sequence being mutated
    :param aa_i: Position which is being mutated
    :return: Tuple of (list of dictionary entries containing details of mutations,
        number of substitutions contributed only by the homology table)
    '''

    dna_i = aa_i * 3
    aa = ref_aa[aa_i] #AA being substituted
    original_codon = ref_dna[dna_i:dna_i + 3]

    sub_aas = list(BLOSSOM_SUB_TABLE[aa]) #Substitutions listed for this AA in the blossom table

    #Add all homologous substitutions to list of subs
    unique_subs_count = 0
    #NOTE(review): the guard skips indices 0 and 1 although the original comment says
    #"after start codon" -- confirm against the position numbering of HOMOLOGY_SUB_TABLE
    if aa_i >= 2: #Only perform after start codon
        for sub, freq in HOMOLOGY_SUB_TABLE[aa_i]:

            if sub not in sub_aas and freq > 0 and sub != aa:
                sub_aas.append(sub)
                unique_subs_count += 1

    sub_sequences = [] #List of mutation dictionaries

    #Iterate through list of sub AAs and generate 2 codons per sub
    for sub_aa in sub_aas:
        new_aa_seq = ref_aa[:aa_i] + sub_aa + ref_aa[aa_i+1:]

        for codon in TOP_2_CODONS[sub_aa]:
            record = _substitution_record(ref_dna, ref_aa, aa_i, sub_aa, codon, new_aa_seq)

            if record is not None:
                sub_sequences.append(record)

    #Attempt to generate a synonymous mutation at the position
    silent_sub = rand_codon(aa, orig=original_codon)

    #Create a record if a synonymous mutation was able to be generated
    #(rand_codon hands back the original codon when no alternative exists)
    silent_record = _substitution_record(ref_dna, ref_aa, aa_i, aa, silent_sub, ref_aa)

    if silent_record is not None:
        sub_sequences.append(silent_record)

    return sub_sequences, unique_subs_count

def generate_library(ref_dna, ref_aa):
    '''
    Build the full mutation library for a reference sequence.

    For every position of ref_aa the library receives the deletion record and
    the substitution records; every tenth position (0, 10, 20, ...) also
    receives a stop-codon record.

    :param ref_dna: DNA sequence being mutated
    :param ref_aa: AA sequence being mutated
    :return: DataFrame with one row per mutation record (empty when ref_aa is empty)
    '''

    mutated_sequences = []

    #Iterate over the sequence that was passed in, not the module-level reference
    for i in range(len(ref_aa)):
        mutated_sequences.extend(delete_position(ref_dna, ref_aa, i))

        #generate_subs also returns a homology-only substitution count; it is not part of the library
        mutated_sequences.extend(generate_subs(ref_dna, ref_aa, i)[0])

        if i % 10 == 0:
            mutated_sequences.extend(insert_stop(ref_dna, ref_aa, i))

    return pd.DataFrame(mutated_sequences)

# Build the mutation library for the reference sequence and export every
# generated record, duplicates included, to CSV.
MUTATED_SEQUENCES = generate_library(REF_DNA, REF_AA)
MUTATED_SEQUENCES.to_csv('out/results.csv')


## Testing:
Basic sanity checks for a given mutation:

1.) Does the translated new DNA sequence match the new AA sequence?

2.) Is the new DNA sequence different from the reference DNA sequence?

3.) Does the new codon match the new AA?

4.) Does the new AA match the

In [421]:
def trim_duplicates(library):
    '''
    Drop library rows whose mutated DNA sequence repeats an earlier row.

    Sequences are compared by their string form, taken from the "new_dna_seq"
    column; the first row carrying a given sequence is kept. A short summary
    of the trimming is printed.

    :param library: DataFrame of mutation records with a "new_dna_seq" column
    :return: DataFrame containing only the first occurrence of each sequence
    '''
    library_size = len(library)

    if library_size:
        #Select the sequence by column name instead of by position within each
        #row, so the result does not depend on column order or on positional
        #integer lookups into a Series
        is_repeat = library["new_dna_seq"].astype(str).duplicated(keep="first")
        unique_sequence_info = library[~is_repeat]
    else:
        #An empty library may have no columns at all; nothing to trim
        unique_sequence_info = library.copy()

    unique_count = len(unique_sequence_info)
    removed_count = library_size - unique_count

    print("Trimming duplicates...")
    print("Size of initial library: " + str(library_size))
    print("Size of final library: " + str(unique_count))
    print("Number of sequences removed: " + str(removed_count))
    print("Trimming complete!")

    return unique_sequence_info

# Remove rows with repeated DNA sequences from the generated library.
TRIMMED_INITIAL_LIBRARY = trim_duplicates(MUTATED_SEQUENCES)

# NOTE: same path as the untrimmed export written earlier, so the file ends up
# holding the trimmed library.
TRIMMED_INITIAL_LIBRARY.to_csv('out/results.csv')


def check_translation(mutation_dict):
    '''
    Sanity check: does the translated new DNA sequence match the new AA sequence?

    :param mutation_dict: mutation record providing "new_dna_seq" and "new_aa_seq"
    :return: True when the translation equals the recorded AA sequence, else False
    '''
    translated = Seq.translate(mutation_dict["new_dna_seq"])
    expected = mutation_dict["new_aa_seq"]

    return not (translated != expected)

def check_dna_change(mutation_dict):
    '''
    Sanity check: is the new DNA sequence different from the reference DNA sequence?

    :param mutation_dict: mutation record providing "new_dna_seq"
    :return: False when the new sequence equals REF_DNA, True otherwise
    '''
    unchanged = mutation_dict["new_dna_seq"] == REF_DNA
    return not unchanged

Trimming duplicates...
Size of initial library: 1906
Size of final library: 1903
Number of sequences removed: 3
Trimming complete!
