In [11]:
import pandas as pd
import math


def get_codons_for_amino_acid(aa):
    """Look up the DNA codons that encode an amino acid.

    Args:
    - aa (str): One-letter amino acid code, or '*' for the stop codons.

    Returns:
    - List of codon strings for ``aa``, or None when ``aa`` is not in the table.
    """
    # (symbol, codons) pairs; all 64 DNA triplets appear exactly once
    entries = [
        ('*', ['TAA', 'TAG', 'TGA']),
        ('A', ['GCT', 'GCC', 'GCA', 'GCG']),
        ('C', ['TGT', 'TGC']),
        ('D', ['GAT', 'GAC']),
        ('E', ['GAA', 'GAG']),
        ('F', ['TTT', 'TTC']),
        ('G', ['GGT', 'GGC', 'GGA', 'GGG']),
        ('H', ['CAT', 'CAC']),
        ('I', ['ATT', 'ATC', 'ATA']),
        ('K', ['AAA', 'AAG']),
        ('L', ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG']),
        ('M', ['ATG']),
        ('N', ['AAT', 'AAC']),
        ('P', ['CCT', 'CCC', 'CCA', 'CCG']),
        ('Q', ['CAA', 'CAG']),
        ('R', ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']),
        ('S', ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC']),
        ('T', ['ACT', 'ACC', 'ACA', 'ACG']),
        ('V', ['GTT', 'GTC', 'GTA', 'GTG']),
        ('W', ['TGG']),
        ('Y', ['TAT', 'TAC']),
    ]
    codons_by_symbol = dict(entries)

    # Unknown symbols yield None rather than raising
    if aa in codons_by_symbol:
        return codons_by_symbol[aa]
    return None


def get_alternative_amino_acids(aa):
    """List every symbol in the amino-acid alphabet except the one supplied.

    Args:
    - aa (str): One-letter amino acid code, or '*'.

    Returns:
    - List of the 20 other symbols, in the fixed order of the alphabet below.

    Raises:
    - ValueError: if ``aa`` is not one of the 21 symbols in the alphabet.
    """
    # 20 amino acids followed by '*'
    alphabet = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H',
        'I', 'K', 'L', 'M', 'N', 'P', 'Q',
        'R', 'S', 'T', 'V', 'W', 'Y', '*',
    ]

    if aa in alphabet:
        return [symbol for symbol in alphabet if symbol != aa]

    # Anything outside the alphabet is rejected
    raise ValueError(f"'{aa}' is not a standard amino acid.")


def is_point_mutation(current_codon, alt_codons):
    """
    Select the alternative codons that are exactly one nucleotide away from current_codon.

    Args:
    - current_codon (str): The reference codon.
    - alt_codons (list): Candidate codons to compare against the reference.

    Returns:
    - List of the candidates (in input order) that differ from current_codon
      at exactly one aligned position.
    """
    def mismatch_count(candidate):
        # zip() pairs positions up to the shorter sequence only
        return sum(ref != alt for ref, alt in zip(current_codon, candidate))

    return [candidate for candidate in alt_codons if mismatch_count(candidate) == 1]


def is_tandem_double_mutation(current_codon, alt_codons):
    """
    Check if any codons in alt_codons differ from current_codon by a tandem double nucleotide mutation.

    Args:
    - current_codon (str): The current codon.
    - alt_codons (list): List of alternative codons.

    Returns:
    - List of codons from alt_codons that differ by a tandem double nucleotide mutation from current_codon.
    """
    double_mutation_codons = []

    for alt_codon in alt_codons:

        # Identify positions where the codons differ
        diff_positions = [i for i, (a, b) in enumerate(zip(current_codon, alt_codon)) if a != b]

        # Check for tandem double mutations
        if len(diff_positions) == 2 and diff_positions[1] - diff_positions[0] == 1:
            double_mutation_codons.append(alt_codon)

    return double_mutation_codons


def calculate_fitness_average(SA_df, amber_pos, mutant_AA):
    """Average the non-NaN fitness values recorded for one substitution.

    Rows are selected where the 'Ambler Position' column equals ``amber_pos``
    and the 'Mutant AA' column equals ``mutant_AA``; NaN fitness entries are
    discarded before averaging.

    Returns:
    - The arithmetic mean of the remaining values, or the string "unknown"
      when nothing remains.
    """
    position_col = ('Ambler Position', 'Unnamed: 1_level_1')
    variant_col = ('Mutant AA', 'Unnamed: 5_level_1')
    fitness_col = ('Fitness', 'Unnamed: 21_level_1')

    at_position = SA_df[position_col] == amber_pos
    is_variant = SA_df[variant_col] == mutant_AA
    matching_rows = SA_df[at_position & is_variant]

    measured = [
        value
        for value in matching_rows[fitness_col].tolist()
        if not math.isnan(value)
    ]
    if not measured:
        return "unknown"
    return sum(measured) / len(measured)

def main():
    """Build the per-codon amino-acid substitution table and save it as CSV.

    Steps:
    - Read '../../Data_TEM1/14.xlsx' (two header rows) into a DataFrame.
    - Record the wild-type codon and amino acid of every row whose Ambler
      position is below 291, storing position p at list index p - 3.
    - For every stored position, emit one row per alternative amino acid with
      its average fitness and a label saying whether some codon for that
      amino acid is one nucleotide away ("single"), two adjacent nucleotides
      away ("double"), or neither ("NA").
    - Write the rows to '../../Data_TEM1/data_TEM1_Codon_Substitutions.csv'.
    """
    SA_df = pd.read_excel('../../Data_TEM1/14.xlsx', header=[0, 1])

    position_col = ('Ambler Position', 'Unnamed: 1_level_1')
    codon_col = ('WT codon', 'Unnamed: 2_level_1')
    residue_col = ('WT AA', 'Unnamed: 4_level_1')

    slot_count = 290 - 2
    wt_codons = [None] * slot_count
    wt_residues = [None] * slot_count

    for _, record in SA_df.iterrows():
        amber_pos = record[position_col]
        if amber_pos < 291:
            wt_codons[amber_pos - 3] = record[codon_col]
            wt_residues[amber_pos - 3] = record[residue_col]

    # Key order here fixes the column order of the output CSV
    mutation_data = {
        'Allele': [], 'AA_wt': [], 'AA_variant': [], 'Position': [],
        'fitness': [], 'mutation': [], 'codon': []
    }

    # List index 0 corresponds to Ambler position 3
    for position, (wt_codon, wt_aa) in enumerate(zip(wt_codons, wt_residues), start=3):
        if not wt_aa:
            continue

        for mutant_AA in get_alternative_amino_acids(wt_aa):
            mutation_data['Allele'].append(f"{wt_aa}{position}{mutant_AA}")
            mutation_data['AA_wt'].append(wt_aa)
            mutation_data['AA_variant'].append(mutant_AA)
            mutation_data['Position'].append(position)
            mutation_data['fitness'].append(calculate_fitness_average(SA_df, position, mutant_AA))
            mutation_data['codon'].append(wt_codon)

            # A single-nucleotide route takes precedence over a tandem double
            candidate_codons = get_codons_for_amino_acid(mutant_AA)
            if is_point_mutation(wt_codon, candidate_codons):
                mutation_type = "single"
            elif is_tandem_double_mutation(wt_codon, candidate_codons):
                mutation_type = "double"
            else:
                mutation_type = "NA"
            mutation_data['mutation'].append(mutation_type)

    pd.DataFrame(mutation_data).to_csv(
        '../../Data_TEM1/data_TEM1_Codon_Substitutions.csv', index=False
    )


def translate_codon(codon):
    """Map a DNA codon to its one-letter amino acid symbol.

    Returns '*' for the three stop codons and None when ``codon`` is not one
    of the 64 triplets listed below.
    """
    # (symbol, codons) pairs, scanned in this order
    residue_codons = (
        ('*', ['TAA', 'TAG', 'TGA']),
        ('A', ['GCT', 'GCC', 'GCA', 'GCG']),
        ('C', ['TGT', 'TGC']),
        ('D', ['GAT', 'GAC']),
        ('E', ['GAA', 'GAG']),
        ('F', ['TTT', 'TTC']),
        ('G', ['GGT', 'GGC', 'GGA', 'GGG']),
        ('H', ['CAT', 'CAC']),
        ('I', ['ATT', 'ATC', 'ATA']),
        ('K', ['AAA', 'AAG']),
        ('L', ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG']),
        ('M', ['ATG']),
        ('N', ['AAT', 'AAC']),
        ('P', ['CCT', 'CCC', 'CCA', 'CCG']),
        ('Q', ['CAA', 'CAG']),
        ('R', ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']),
        ('S', ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC']),
        ('T', ['ACT', 'ACC', 'ACA', 'ACG']),
        ('V', ['GTT', 'GTC', 'GTA', 'GTG']),
        ('W', ['TGG']),
        ('Y', ['TAT', 'TAC']),
    )
    # First symbol whose codon list contains the query; None if there is none
    return next(
        (residue for residue, codons in residue_codons if codon in codons),
        None,
    )


def get_alternative_amino_acids_combos(NT1, NT2):
    """Get amino acid combinations attainable by cross-codon tandem double mutations.

    A cross-codon tandem double mutation here replaces the last nucleotide of
    NT1 and the first nucleotide of NT2 at the same time. A resulting pair is
    kept only when both mutated codons translate and BOTH amino acids differ
    from the originals.

    Args:
    - NT1 (str): The upstream codon.
    - NT2 (str): The adjacent downstream codon.

    Returns:
    - Sorted list of unique (aa1, aa2) tuples. The list is sorted because
      plain set iteration order depends on per-process string hashing, which
      would make the caller's output row order change from run to run.
    """
    nucleotides = ['A', 'T', 'G', 'C']

    # The two nucleotides that flank the codon boundary
    last_NT1, first_NT2 = NT1[-1], NT2[0]

    # Amino acids encoded before any mutation
    original_aa1 = translate_codon(NT1)
    original_aa2 = translate_codon(NT2)

    aa_combinations = set()
    for nuc1 in nucleotides:
        if nuc1 == last_NT1:
            continue  # not a change at the last position of NT1
        aa1 = translate_codon(NT1[:-1] + nuc1)
        if not aa1 or aa1 == original_aa1:
            continue  # untranslatable or synonymous in the first codon

        for nuc2 in nucleotides:
            if nuc2 == first_NT2:
                continue  # not a change at the first position of NT2
            aa2 = translate_codon(nuc2 + NT2[1:])
            if aa2 and aa2 != original_aa2:
                aa_combinations.add((aa1, aa2))

    return sorted(aa_combinations)


def calculate_fitness_average_crosscodon(DA_df, amber_pos, mutant_AA1, mutant_AA2):
    """Calculate and return the average fitness of a double mutant.

    Rows are selected where 'Ambler Position' equals ``amber_pos``,
    'Mut AA 1' equals ``mutant_AA1`` and 'Mut AA 2' equals ``mutant_AA2``.

    Args:
    - DA_df (DataFrame): Table with the columns 'Ambler Position', 'Mut AA 1',
      'Mut AA 2' and 'Double Mutant Fitness'.
    - amber_pos: Position value to match.
    - mutant_AA1, mutant_AA2: Amino acid symbols to match.

    Returns:
    - Mean of the matching non-missing 'Double Mutant Fitness' values, or the
      string "unknown" when there are none.
    """
    matches = (
        (DA_df['Ambler Position'] == amber_pos)
        & (DA_df['Mut AA 1'] == mutant_AA1)
        & (DA_df['Mut AA 2'] == mutant_AA2)
    )

    # Drop missing measurements: a single NaN would otherwise turn the mean
    # into NaN, and an all-NaN selection would not be reported as "unknown".
    values = DA_df[matches]['Double Mutant Fitness'].dropna().tolist()

    if not values:
        return "unknown"
    return sum(values) / len(values)


def main2():
    """Build the cross-codon tandem double-mutation table and save it as CSV.

    Steps:
    - Read '../../Data_TEM1/14.xlsx' (two header rows) and sheet index 1 of
      '../../Data_TEM1/19.xlsx'.
    - Record the wild-type codon and amino acid of every row whose Ambler
      position is below 291, storing position p at list index p - 3.
    - For every pair of neighbouring list slots that both hold a codon, emit
      one row per amino-acid pair returned by
      get_alternative_amino_acids_combos, with its average double-mutant
      fitness. Every row is labelled "double".
    - Write the rows to '../../Data_TEM1/data_TEM1_CrossCodonsDoubles.csv'.
    """
    SA_df = pd.read_excel('../../Data_TEM1/14.xlsx', header=[0, 1])
    DA_df = pd.read_excel(r'../../Data_TEM1/19.xlsx', sheet_name=1)

    WT_NT_seq_SA = [None] * (290 - 2)
    WT_AA_seq_SA = [None] * (290 - 2)

    for _, row in SA_df.iterrows():
        amber_pos = row[('Ambler Position', 'Unnamed: 1_level_1')]
        if amber_pos < 291:
            WT_NT_seq_SA[amber_pos - 3] = row[('WT codon', 'Unnamed: 2_level_1')]
            WT_AA_seq_SA[amber_pos - 3] = row[('WT AA', 'Unnamed: 4_level_1')]

    # Key order here fixes the column order of the output CSV
    mutation_data = {
        'Allele': [], 'AA_wt': [], 'AA_variant': [], 'Position': [],
        'fitness': [], 'mutation': [], 'codon': []
    }

    for c in range(len(WT_AA_seq_SA) - 1):
        NT1_t, NT2_t = WT_NT_seq_SA[c], WT_NT_seq_SA[c + 1]

        # Slots never filled from the spreadsheet stay None; skip such pairs
        if NT1_t is None or NT2_t is None:
            continue

        AA1_t, AA2_t = WT_AA_seq_SA[c], WT_AA_seq_SA[c + 1]
        position = c + 3  # list index 0 corresponds to Ambler position 3

        for mutant_AA1, mutant_AA2 in get_alternative_amino_acids_combos(NT1_t, NT2_t):
            mutation_data['Allele'].append(f"{AA1_t}{AA2_t}{position}{mutant_AA1}{mutant_AA2}")
            mutation_data['AA_wt'].append(f"{AA1_t}{AA2_t}")
            mutation_data['AA_variant'].append(f"{mutant_AA1}{mutant_AA2}")
            mutation_data['Position'].append(position)
            mutation_data['fitness'].append(
                calculate_fitness_average_crosscodon(DA_df, position, mutant_AA1, mutant_AA2)
            )
            mutation_data['codon'].append(f"{NT1_t}{NT2_t}")
            mutation_data['mutation'].append("double")

    dfoutput = pd.DataFrame(mutation_data)
    dfoutput.to_csv('../../Data_TEM1/data_TEM1_CrossCodonsDoubles.csv', index=False)


# Script entry point. Only main() runs here; main2(), which writes
# data_TEM1_CrossCodonsDoubles.csv, is not invoked by this guard.
if __name__ == "__main__":
    main()


  for idx, row in parser.parse():


In [54]:
import pandas as pd
import numpy as np

def _known_fitness(table, mutation_type):
    """Return the 'fitness' column for rows of one mutation type, skipping 'unknown' entries."""
    keep = (table['mutation'] == mutation_type) & (table['fitness'] != 'unknown')
    return table[keep]['fitness']


# --- Create singles_TEM1_new.npy for WF simulations ---

file_path = '../../Data_TEM1/data_TEM1_Codon_Substitutions.csv'
df = pd.read_csv(file_path)
filtered_fitness = _known_fitness(df, 'single')
# 'unknown' rows are already excluded, so every remaining entry is converted to float
singles_fitness_arrays = np.array([float(item) for item in filtered_fitness])
np.save('../WFModel/WF_Simulations/singles_TEM1_new.npy', singles_fitness_arrays)


# --- Create doubles_TEM1_new.npy for WF simulations ---
# Doubles come from two tables: within-codon tandem doubles (file 1) and
# cross-codon doubles (file 2); the two arrays are concatenated in that order.

file_path1 = '../../Data_TEM1/data_TEM1_Codon_Substitutions.csv'
file_path2 = '../../Data_TEM1/data_TEM1_CrossCodonsDoubles.csv'

df1 = pd.read_csv(file_path1)
filtered_fitness1 = _known_fitness(df1, 'double')
doubles_fitness_arrays1 = np.array([float(item) for item in filtered_fitness1])
print("len_1", len(doubles_fitness_arrays1))

df2 = pd.read_csv(file_path2)
filtered_fitness2 = _known_fitness(df2, 'double')
doubles_fitness_arrays2 = np.array([float(item) for item in filtered_fitness2])
print("len_2", len(doubles_fitness_arrays2))

concatenated_doubles_fitness_arrays = np.concatenate((doubles_fitness_arrays1, doubles_fitness_arrays2))
np.save('../WFModel/WF_Simulations/doubles_TEM1_new.npy', concatenated_doubles_fitness_arrays)


len_1 2341
len_1 50
