# TCR Library Design

In [21]:
import pandas as pd
import numpy as np

#import cfg

from Bio import Seq
from Bio import SeqIO


In [22]:
# Load the reference sequence from FASTA as a DNA Seq and translate it to an amino-acid sequence.

REF_DNA = SeqIO.read("TRBC_Extracellular.fasta", "fasta").seq
REF_AA = Seq.translate(REF_DNA)

# DataFrame of substitutions derived from the BLOSUM matrix. generate_subs() looks up the
# column named after the reference amino acid and uses every value in it as a substitution.
BLOSSOM_SUB_TABLE = pd.read_csv('blosum80_subs_plus_a.csv')

# List of lists containing dictionary entries for each mutated sequence.
# Index of first set of lists = position in AA sequence.
# NOTE(review): not referenced by any code visible in this file -- confirm it is still needed.
MUTATED_SEQUENCES = []

In [23]:
def homologous_sub_freq():
    '''
    Rank the substitutions observed at each position of the alignment.

    Reads 'homologous_subs.csv'. Its "AA" column names the amino acid for each
    row; every column after the first is headed by a position (converted with
    int()) and holds the frequency of each amino acid at that position.

    returns:
        dict mapping position (int) to a list of (amino acid, frequency)
        tuples sorted by frequency, highest first. Entries with equal
        frequency keep their row order from the file.
    '''
    table = pd.read_csv('homologous_subs.csv')
    residues = list(table["AA"])  # row i of every position column belongs to residues[i]

    ranked_by_position = {}
    for column in table.keys()[1:]:  # the first column is skipped, not treated as a position
        pairs = list(zip(residues, table[column]))
        pairs.sort(key=lambda pair: pair[1], reverse=True)  # stable sort, ties keep file order
        ranked_by_position[int(column)] = pairs

    return ranked_by_position

def load_hs_codon_freq():
    '''
    Generate dictionary of the top 2 codons for each amino acid.

    Reads 'hs_codon_freq.csv', which must provide the columns 'aa', 'codon'
    and 'freq'.

    returns:
        dict mapping each amino acid to a list of its (at most two) codons
        with the largest 'freq', most frequent first.
    '''
    aa_codon_dt = pd.read_csv('hs_codon_freq.csv') #Dataframe of codon frequencies
    aa_to_codon = {} #Dictionary with list of top two codons for a given AA

    # Group by the scalar column name so each group key is the amino acid itself,
    # and pick the top rows per group directly. This avoids selecting columns with
    # a tuple after groupby (an error in pandas >= 2.0) and avoids positional
    # indexing into label-indexed rows.
    for aa, codons_for_aa in aa_codon_dt.groupby('aa'):
        top_2 = codons_for_aa.nlargest(2, columns='freq')
        aa_to_codon[aa] = top_2['codon'].tolist()

    return aa_to_codon

# Build both lookup tables once, at module level; generate_subs() reads them.
AA_TO_CODON = load_hs_codon_freq()  # amino acid -> list of its top codons
HOMOLOGY_SUB_TABLE = homologous_sub_freq()  # position -> [(amino acid, frequency), ...], highest frequency first


## Mutation Generation:

Mutations are generated by iterating through the amino acid sequence of the protein. The first 4 substitutions for a
given position are alanine and the top 3 results from the BLOSUM table. I then plan to look for the most frequent
mutation seen at that position in homologous sequences that is not included in the initial set of substitutions.

This list of AA substitutions will be fed to Dan's code to pick the top two codons for that AA, formatted into a
dictionary that includes the position mutated, the original AA at that position, the new AA at that position, the
new codon used, the new DNA sequence, and the new AA sequence.

These dictionaries are collected into a single list and exported as a CSV table by `generate_library`.

In [24]:
def delete_position(aa_i, ref_aa, ref_dna):
    '''
    Build the mutation entry for deleting a single residue.

    inputs:
        aa_i, index of the residue to delete in ref_aa
        ref_aa, reference amino acid sequence
        ref_dna, reference DNA sequence; the codon for residue aa_i is taken
                 to be ref_dna[aa_i*3 : aa_i*3 + 3]

    returns:
        a one-element list holding the mutation dictionary, so the result can
        be concatenated directly onto a list of mutations. "new_aa" is "*" and
        "new_codon" is empty for a deletion.
    '''
    codon_start = 3 * aa_i
    codon_end = codon_start + 3

    removed_residue = ref_aa[aa_i]
    dna_without_codon = ref_dna[:codon_start] + ref_dna[codon_end:]
    aa_without_residue = ref_aa[:aa_i] + ref_aa[aa_i + 1:]

    return [{
        "position": aa_i,
        "original_aa": removed_residue,
        "new_aa": "*",
        "new_codon": "",
        "new_dna_seq": dna_without_codon,
        "new_aa_seq": aa_without_residue,
    }]

def generate_subs(aa_i, ref_aa, ref_dna):
    '''
    Build every substitution mutation for one position of the reference.

    Candidate amino acids are all values in the BLOSSOM_SUB_TABLE column named
    after the reference residue, followed (for aa_i >= 2) by each amino acid
    in HOMOLOGY_SUB_TABLE[aa_i] that has a frequency above 0 and is not
    already a candidate. One mutation dictionary is emitted per candidate per
    codon listed for it in AA_TO_CODON.

    inputs:
        aa_i, index of the residue to substitute in ref_aa
        ref_aa, reference amino acid sequence
        ref_dna, reference DNA sequence; the codon for residue aa_i is taken
                 to be ref_dna[aa_i*3 : aa_i*3 + 3]

    returns:
        (list of mutation dictionaries,
         number of candidates that came only from the homology table)
    '''
    codon_start = aa_i * 3
    original = ref_aa[aa_i] #AA being substituted

    candidates = list(BLOSSOM_SUB_TABLE[original]) #Every entry of the table column for this AA

    homology_only = 0
    # NOTE(review): positions 0 and 1 never consult the homology table --
    # presumably it has no columns for them; confirm against homologous_subs.csv.
    if aa_i >= 2:
        for residue, frequency in HOMOLOGY_SUB_TABLE[aa_i]:
            # NOTE(review): the reference residue itself is not explicitly excluded
            # here, so a same-residue "substitution" can be added -- confirm intended.
            if residue not in candidates and frequency > 0:
                candidates.append(residue)
                homology_only += 1

    mutations = [
        {
            "position": aa_i,
            "original_aa": ref_aa[aa_i],
            "new_aa": residue,
            "new_codon": codon,
            "new_dna_seq": ref_dna[:codon_start] + codon + ref_dna[codon_start + 3:],
            "new_aa_seq": ref_aa[:aa_i] + residue + ref_aa[aa_i + 1:],
        }
        for residue in candidates
        for codon in AA_TO_CODON[residue]
    ]

    return mutations, homology_only

## Testing:
Basic sanity checks for a given mutation:

1.) Does the translated new DNA sequence match the new AA sequence?

2.) Is the new DNA sequence different from the reference DNA sequence?

3.) Does the new codon match the new AA?

4.) Does the new AA match the

In [25]:
def check_translation(mutation_dict):
    '''
    Sanity check: does translating the mutated DNA give the mutated AA sequence?

    inputs:
        mutation_dict, a mutation dictionary with "new_dna_seq" and "new_aa_seq"

    returns:
        True when the translation of "new_dna_seq" equals "new_aa_seq", else False
    '''
    translated = Seq.translate(mutation_dict["new_dna_seq"])
    expected = mutation_dict["new_aa_seq"]
    return translated == expected

def check_dna_change(mutation_dict, ref_dna=None):
    '''
    Sanity check: is the mutated DNA sequence different from the reference?

    inputs:
        mutation_dict, a mutation dictionary with a "new_dna_seq" entry
        ref_dna, reference DNA sequence to compare against. Defaults to the
                 module-level REF_DNA, so existing one-argument calls behave
                 as before; pass it explicitly when the library was generated
                 from a different reference.

    returns:
        True when "new_dna_seq" differs from the reference, else False
    '''
    if ref_dna is None:
        # Resolved at call time so the default follows the current REF_DNA.
        ref_dna = REF_DNA

    return mutation_dict["new_dna_seq"] != ref_dna


def generate_library(ref_aa, ref_dna, output_file='out/aa_mut_table.csv'):
    '''
    Main function, generates library given ref amino acid sequence.

    For every position of ref_aa this collects the deletion mutation from
    delete_position() and all substitution mutations from generate_subs().

    inputs:
        ref_aa, a Bio.Sequence object of amino acids to mutate
        ref_dna, the DNA sequence encoding ref_aa
        output_file, path of the csv to write, containing these cols:
             position
             original_aa
             new_aa
             new_codon
             new_dna_seq
             new_aa_seq

    returns:
        pandas DataFrame with one row per mutated sequence (the same table
        that is written to output_file)
    '''
    mutated_sequences = []
    count = 0
    for aa_i in range(len(ref_aa)):
        mutated_sequences += delete_position(aa_i, ref_aa, ref_dna)

        # Call generate_subs once per position and use both of its results.
        sub_sequences, unique_subs_count = generate_subs(aa_i, ref_aa, ref_dna)
        mutated_sequences += sub_sequences
        count += unique_subs_count

    print("Number of amino acid mutated: " + str(count))
    print("Number of mutated sequences: " + str(len(mutated_sequences)))

    library = pd.DataFrame(mutated_sequences)
    # Write to the path the caller asked for; this was previously hard-coded
    # to 'out/results.csv' and output_file was ignored.
    library.to_csv(output_file)

    return library

In [27]:
# Generate the mutation library for the reference sequences loaded at the top of the file.
generate_library(REF_AA, REF_DNA, output_file='out/aa_mut_table.csv')

KeyError: 'c'