In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
import os, sys, itertools, csv
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from mutil.alemutdf import get_all_sample_mut_df, get_multi_exp_max_freq_mut_df
from mutil.mut import is_coding_mut
from mutil.metadata import get_condition_val_dict, get_condition_field_val_set
from mutil.genome import get_K12_pos_from_BOP27

In [2]:
# Build the combined mutation table from the files under ./data/muts/.
# How files are discovered and parsed is decided by the project helper in
# mutil.alemutdf; the printed progress output comes from that helper.
muts_df = get_all_sample_mut_df("./data/muts/")
# Show the distinct experiment names present in the loaded table.
display(set(muts_df["exp"]))

 51%|█████▏    | 19/37 [00:00<00:00, 182.51it/s]

./data/muts//ndh-cydB-appC_mut.csv


100%|██████████| 37/37 [00:00<00:00, 178.35it/s]
100%|██████████| 29/29 [00:00<00:00, 216.92it/s]
  0%|          | 0/17 [00:00<?, ?it/s]

./data/muts//SSW GLU XYL.csv
./data/muts//ndh-cyoB_mut.csv


100%|██████████| 17/17 [00:00<00:00, 172.06it/s]
100%|██████████| 20/20 [00:00<00:00, 195.48it/s]
  0%|          | 0/48 [00:00<?, ?it/s]

./data/muts//TOL hexamethylenediamine_mut.csv
./data/muts//GLU.csv


100%|██████████| 48/48 [00:00<00:00, 174.93it/s]


{'GLU', 'SSW_GLU_XYL', 'TOL_hexamethylenediamine', 'ndh-cydB-appC', 'ndh-cyoB'}

The number of samples is taken from the raw dataset here because, by the end of the notebook, the samples are flattened into ALEs by the "ALE-unique mutations" logic.

In [3]:
def get_exp_sample_name(exp_name, ale, flask, isolate, tech_rep):
    """Return the space-separated sample identifier, prefixed by the experiment.

    The four numeric identifiers are coerced to int before being rendered, so
    values such as 412.0 appear as "412" in the resulting name.
    """
    numeric_ids = [str(int(part)) for part in (ale, flask, isolate, tech_rep)]
    return " ".join([exp_name] + numeric_ids)

# Label every row with its full sample identifier
# (experiment + ALE + flask + isolate + technical replicate).
sample_id_fields = zip(
    muts_df["exp"], muts_df["ale"], muts_df["flask"], muts_df["isolate"], muts_df["tech_rep"]
)
muts_df["sample"] = [get_exp_sample_name(*fields) for fields in sample_id_fields]

In [4]:
# Number of distinct samples in the raw table.
muts_df["sample"].nunique(dropna=False)

151

In [5]:
# Number of mutation rows in the raw table.
muts_df.shape[0]

3264

## remove problem and starting strain mutations

In [6]:
# Experiments disagree on how Position is written (some use thousands
# separators such as "1,299,499", some do not), so strip the commas and store
# every position as an integer.
muts_df["Position"] = (
    muts_df["Position"]
    .map(lambda raw_position: int(str(raw_position).replace(",", "")))
    .astype(int)
)

In [7]:
# Problem / starting-strain calls are identified by genome position and
# sequence change rather than by gene name: for whatever reason, the gene
# annotations can change between versions of breseq or runs (e.g. "gatC" vs
# "gatC, gatC"). The gene names are kept as trailing comments for reference.
position = muts_df["Position"]
seq_change = muts_df["Sequence Change"]

# gatC: any of three neighbouring positions carrying a 1 bp or 2 bp deletion.
drop_mask = (
    position.isin([2173361, 2173363, 2173364])
    & seq_change.isin(["Δ2 bp", "Δ1 bp"])
)

# Calls matched on an exact (position, sequence change) pair.
exact_calls = [
    (257908, "Δ776 bp"),         # [crl]
    (1978503, "Δ776 bp"),        # insB1, insA
    (1299499, "Δ1,199 bp"),      # ychE, oppA
    (3560455, "+G"),             # glpR
    (4296381, "+GC"),            # gltP, yjcO
    (4296380, "+CG"),            # gltP, yjcO
    (1979486, "IS5 (+) +4 bp"),  # insA/uspC
]
for call_position, call_change in exact_calls:
    drop_mask |= (position == call_position) & (seq_change == call_change)

# The corA and wcaA calls are removed everywhere except in GLU. GLU is the ALE
# that originated the corA mutation found in all CCK samples, so it should be
# tracked there; the wcaA call is exempted for GLU in the same way.
outside_glu = muts_df["exp"] != "GLU"
drop_mask |= outside_glu & (position == 4001645) & (seq_change == "Δ5 bp")  # corA
drop_mask |= (
    outside_glu
    & (position == 2132787)
    & (muts_df["Details"] == "I204S (ATC→AGC)")  # wcaA
)

# gltP, yjcO: any listed position combined with any listed sequence change.
p = {4293212, 4293403, 4293430, 4296060, 4296154, 4296189,
 4296190, 4296191, 4296267, 4296268, 4296286, 4296380, 4296382}
ch = {'+C', '+G', 'A→C', 'A→G', 'A→T', 'C→A', 'C→T', 'T→C'}
drop_mask |= position.isin(p) & seq_change.isin(ch)

muts_df = muts_df[~drop_mask]

display(len(muts_df), muts_df.head())

2528

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,Gene,Reference Seq,Mut ID,sample
50,ndh-cydB-appC,0,0,1,1,1.0,4640551,MOB,IS1 (–) +9 bp,intergenic (+9/‑383),"yjjY, yjtD",NC_000913,1656594.0,ndh-cydB-appC 0 0 1 1
61,ndh-cydB-appC,0,0,1,1,1.0,2375828,MOB,IS186 (+) +4 bp,coding (132‑135/963 nt),menC,NC_000913,1566872.0,ndh-cydB-appC 0 0 1 1
68,ndh-cydB-appC,0,0,1,1,1.0,2019460,SNP,G→A,V359I (GTC→ATC),fliK,NC_000913,669363.0,ndh-cydB-appC 0 0 1 1
73,ndh-cydB-appC,0,0,1,1,1.0,4406129,SNP,T→C,intergenic (+144/‑61),"purA, nsrR",NC_000913,1102531.0,ndh-cydB-appC 0 0 1 1
75,ndh-cydB-appC,0,0,1,1,1.0,222965,DEL,(A)8→7,coding (133/576 nt),gmhB,NC_000913,1656776.0,ndh-cydB-appC 0 0 1 1


In [8]:
# Experiment names as they appear in the "exp" column.
reduced_ecoli_ale = 'application_of_ALE_to_re-optimize_growth_performance_of_genome-reduced_strain_MS56-M9-MG1655gbk'
MG1655_pBGT_untreated = "MG1655-pBGT_untreated"
MG1655_untreated = "MG1655_untreated"
MG1655_blaTEM1_untreated = "MG1655-blaTEM1_untreated"

# (experiment, position, sequence change) triples to filter out. Kept as
# compact tuples and expanded below into the dict records used for filtering.
_untreated_calls = [
    (MG1655_blaTEM1_untreated, 1397381, "Δ13,756 bp"),
    (MG1655_untreated, 1397381, "Δ13,756 bp"),
    (MG1655_pBGT_untreated, 1397381, "Δ13,756 bp"),
]

# (position, sequence change) pairs, all belonging to the genome-reduced strain ALE.
_reduced_strain_calls = [
    (389898, "Δ9,908 bp"),
    (1388889, "Δ9,734 bp"),
    (2466545, "Δ9,634 bp"),
    (729365, "Δ9,598 bp"),
    (3815859, "Δ82 bp"),
    (1400325, "Δ81,931 bp"),
    (380112, "Δ8,537 bp"),
    (2509631, "Δ8,319 bp"),
    (3761993, "Δ8,250 bp"),
    (257908, "Δ776 bp"),
    (1978503, "Δ776 bp"),
    (1040502, "Δ73,763 bp"),
    (3090347, "Δ708 bp"),
    (675552, "Δ7,819 bp"),
    (4400289, "Δ7 bp"),
    (819776, "Δ62,977 bp"),
    (263514, "Δ61,897 bp"),
    (3184782, "Δ6,916 bp"),
    (997868, "Δ6,818 bp"),
    (2558699, "Δ6,790 bp"),
    (3618992, "Δ6,688 bp"),
    (4496676, "Δ53,036 bp"),
    (167484, "Δ5,964 bp"),
    (603467, "Δ5,885 bp"),
    (3362162, "Δ5,481 bp"),
    (765208, "Δ5,405 bp"),
    (15389, "Δ5,175 bp"),
    (1509562, "Δ46,201 bp"),
    (332366, "Δ44,951 bp"),
    (4555491, "Δ41,523 bp"),
    (508873, "Δ40,744 bp"),
    (1223090, "Δ4 bp"),
    (2101397, "Δ36,321 bp"),
    (2756160, "Δ35,091 bp"),
    (2204350, "Δ32,368 bp"),
    (2994992, "Δ3,877 bp"),
    (2286401, "Δ3,780 bp"),
    (1997112, "Δ26,567 bp"),
    (1197137, "Δ25,940 bp"),
    (4299628, "Δ25,538 bp"),
    (1627518, "Δ25,244 bp"),
    (565055, "Δ21,054 bp"),
    (2846301, "Δ20,968 bp"),
    (3651293, "Δ2,422 bp"),
    (2173363, "Δ2 bp"),
    (3888432, "Δ18,167 bp"),
    (1962566, "Δ16,764 bp"),
    (2320144, "Δ16,489 bp"),
    (3453928, "Δ15,926 bp"),
    (2601201, "Δ13,595 bp"),
    (2165151, "Δ12,060 bp"),
    (1129397, "Δ11,591 bp"),
    (3593299, "Δ10 bp"),
    (3720634, "Δ1,442 bp"),
    (1872030, "Δ1,437 bp"),
    (1299499, "Δ1,199 bp"),
    (687860, "Δ1,185 bp"),
    (3581139, "3915 bp→6 bp"),
    (1197137, "25956 bp→12 bp"),
    (3110680, "25702 bp→GA"),
]

muts_to_remove = [
    {"experiment": exp_name, "position": call_position, "sequence change": call_change}
    for exp_name, call_position, call_change in (
        _untreated_calls
        + [(reduced_ecoli_ale, pos, change) for pos, change in _reduced_strain_calls]
    )
]

In [9]:
# Flag every row that matches one of the muts_to_remove records; experiment,
# position and sequence change must all agree for a row to be dropped.
listed_for_removal = pd.Series(False, index=muts_df.index)
for record in muts_to_remove:
    same_experiment = muts_df["exp"] == record["experiment"]
    same_position = muts_df["Position"] == record["position"]
    same_change = muts_df["Sequence Change"] == record["sequence change"]
    listed_for_removal |= same_experiment & same_position & same_change
muts_df = muts_df[~listed_for_removal]
display(len(muts_df), muts_df.head())

2528

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,Gene,Reference Seq,Mut ID,sample
50,ndh-cydB-appC,0,0,1,1,1.0,4640551,MOB,IS1 (–) +9 bp,intergenic (+9/‑383),"yjjY, yjtD",NC_000913,1656594.0,ndh-cydB-appC 0 0 1 1
61,ndh-cydB-appC,0,0,1,1,1.0,2375828,MOB,IS186 (+) +4 bp,coding (132‑135/963 nt),menC,NC_000913,1566872.0,ndh-cydB-appC 0 0 1 1
68,ndh-cydB-appC,0,0,1,1,1.0,2019460,SNP,G→A,V359I (GTC→ATC),fliK,NC_000913,669363.0,ndh-cydB-appC 0 0 1 1
73,ndh-cydB-appC,0,0,1,1,1.0,4406129,SNP,T→C,intergenic (+144/‑61),"purA, nsrR",NC_000913,1102531.0,ndh-cydB-appC 0 0 1 1
75,ndh-cydB-appC,0,0,1,1,1.0,222965,DEL,(A)8→7,coding (133/576 nt),gmhB,NC_000913,1656776.0,ndh-cydB-appC 0 0 1 1


## Remove experiment specific starting strain mutations

In [10]:
# Experiments still present after the generic filters.
muts_df["exp"].unique()

array(['ndh-cydB-appC', 'SSW_GLU_XYL', 'ndh-cyoB',
       'TOL_hexamethylenediamine', 'GLU'], dtype=object)

In [11]:
# Filter the following mutations out if they come from specific experiments.
# Each entry is (gene, details); GYD shares the last three SER entries.
ser_starting_muts = [
    ("cyoB", "W190* (TGG→TAG)"),
    ("eno", "L193Q (CTG→CAG)"),
    ("chbF", "V133E (GTA→GAA)"),
    ("fumC", "G354G (GGT→GGC)"),
    ("yegE", "S683Y (TCC→TAC)"),
    ("rcsC", "L594F (TTG→TTC)"),
]
starting_muts_by_exp = {
    "SER": ser_starting_muts,
    "GYD": ser_starting_muts[3:],
}
for exp_name, gene_detail_pairs in starting_muts_by_exp.items():
    for gene_name, mut_details in gene_detail_pairs:
        is_starting_mut = (
            (muts_df["exp"] == exp_name)
            & (muts_df["Gene"] == gene_name)
            & (muts_df["Details"] == mut_details)
        )
        muts_df = muts_df[~is_starting_mut]


cck_exp_list = [
    "tpi",
    "pts",
    "pgi",
    "sdh",
    "gnd",
    "evo"
]

# The details of the mutations to filter out should probably include the
# sequence change; for now every wcaA / corA row of a CCK experiment is dropped.
in_cck_experiment = muts_df["exp"].isin(cck_exp_list)
hits_wcaA_or_corA = muts_df["Gene"].isin(["wcaA", "corA"])
muts_df = muts_df[~(in_cck_experiment & hits_wcaA_or_corA)]
print(len(muts_df))

2528


### Removing MG1655_anchestor mutations

In [12]:
# Renumber the rows 0..n-1 (discarding the old labels) so that every row has a
# unique index label; the MG1655_anchestor removal below selects rows by index.
muts_df = muts_df.reset_index(drop=True)

In [13]:
# Not many. Going to manually remove them (see cell below).
# This is for the Ecoli_resistance_caz.
# Sebastian often uses the name "MG1655_anchestor", hence the spelling.
is_ancestor_row = muts_df["exp"] == "MG1655_anchestor"
MG1655_anchestor = muts_df.loc[is_ancestor_row].copy()
MG1655_anchestor

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,Gene,Reference Seq,Mut ID,sample


In [14]:
# Row count before and after removing the rows collected in MG1655_anchestor
# (matched by index label).
print(len(muts_df))
is_ancestor_index = muts_df.index.isin(MG1655_anchestor.index)
muts_df = muts_df.loc[~is_ancestor_index]
print(len(muts_df))

2528
2528


In [15]:
# The same three (position, details) calls are removed from each of the three
# MG1655 antibiotic experiments.
antibiotic_exps = [
    "MG1655_chloramphenicol",
    "MG1655_doxycycline",
    "MG1655_trimethoprim",
]
shared_starting_muts = [
    (4390754, "A252A (GCC→GCA)"),
    (803662, "L54I (CTT→ATT)"),
    (1905761, "G25D (GGT→GAT)"),
]
for exp_name, (mut_position, mut_details) in itertools.product(antibiotic_exps, shared_starting_muts):
    is_shared_starting_mut = (
        (muts_df["exp"] == exp_name)
        & (muts_df["Position"] == mut_position)
        & (muts_df["Details"] == mut_details)
    )
    muts_df = muts_df[~is_shared_starting_mut]
print(len(muts_df))

2528


### Starting strain mutations according to samples within experiments with starting strain AFIRs

In [16]:
# Renumber the rows 0..n-1 (discarding the old labels) so that row labels are
# unique before the starting-strain filtering that follows.
muts_df = muts_df.reset_index(drop=True)

In [17]:
# Remove starting strain mutations from all experiments according to each experiment's starting strain

def _get_sample_name(ale, flask, isolate, tech_rep):  # Doesn't include the experiment name
    sample_name = str(int(ale)) + " " + str(int(flask)) + " " + str(int(isolate)) + " " + str(int(tech_rep))
    return sample_name

muts_df["sample"] = muts_df.apply(lambda r: _get_sample_name(r.ale, r.flask, r.isolate, r.tech_rep), axis=1)

# A mutation counts as a starting-strain mutation only when ONE row of that
# experiment's "0 0 1 1" sample carries the same experiment, position, sequence
# change and details. Testing the four columns independently (Position in any
# starting row AND Sequence Change in any starting row AND ...) would also
# discard evolved mutations that merely combine values taken from different
# starting-strain rows, so the comparison is done on the whole key at once.
starting_strain_key = ["exp", "Position", "Sequence Change", "Details"]

is_starting_sample = muts_df["sample"] == "0 0 1 1"  # Need an exact match
starting_strains = muts_df[is_starting_sample]
muts_df = muts_df[~is_starting_sample]  # remove starting strain samples

# Left-join the evolved rows against the de-duplicated starting-strain keys.
# Because the right side has unique keys, the result has exactly one row per
# evolved row, in the same order; "_merge" says whether a match was found.
match_origin = muts_df[starting_strain_key].merge(
    starting_strains[starting_strain_key].drop_duplicates(),
    on=starting_strain_key,
    how="left",
    indicator=True,
)["_merge"]

# merge() renumbers its output, so apply the flags positionally rather than by
# index label; this keeps the original row labels of muts_df intact.
muts_df = muts_df[(match_origin == "left_only").to_numpy()]
print(len(muts_df))

2231


### Check that no unexpected reference sequences are included in the dataset, since mutations on them won't match RegulonDB's data

In [18]:
# For each reference sequence, list the experiments that have mutations on it.
for ref_seq, exps_on_ref in muts_df.groupby("Reference Seq")["exp"]:
    print(ref_seq, exps_on_ref.unique())

NC_000913 ['ndh-cydB-appC' 'ndh-cyoB' 'TOL_hexamethylenediamine' 'GLU']


According to the previous results, TOL_isobutyric_acid has one weird mutation. Going to remove it.

In [19]:
# Drop mutations called against the CP009273 reference; rows with a missing
# reference sequence are kept.
on_cp009273 = muts_df["Reference Seq"] == "CP009273"
muts_df = muts_df[~on_cp009273]
muts_df["Reference Seq"].unique()

array(['NC_000913', nan], dtype=object)

In [20]:
# removing samples not included in GYD MS for AVA MS
is_excluded_gyd_ale = (muts_df["exp"] == "GYD") & muts_df["ale"].isin([34, 35])
muts_df = muts_df[~is_excluded_gyd_ale]

# Number of distinct (experiment, ALE) combinations left.
ale_labels = muts_df.apply(lambda row: row.exp + " " + str(row.ale), axis=1)
print(len(ale_labels.unique()))

28


In [21]:
# Amount of ALEs before removal
ale_labels_before = muts_df.apply(lambda row: row.exp + " " + str(row.ale), axis=1)
print(len(ale_labels_before.unique()))

# removing hypermutators since they will affect associations
# experiment name -> ALE numbers to discard
hypermutator_ales = {
    "GLU": [5, 7],
    "TOL_2,3-butanediol": [3],
    "TOL_adipic_acid": [5],
    "TOL_hexamethylenediamine": [6],
    "TOL_octanoic_acid": [5],
    "TOL_propanediol": [1, 2, 3, 4, 5, 7, 8],
}
for exp_name, ale_numbers in hypermutator_ales.items():
    is_hypermutator = (muts_df["exp"] == exp_name) & muts_df["ale"].isin(ale_numbers)
    muts_df = muts_df[~is_hypermutator]

# Amount of ALEs after removal
ale_labels_after = muts_df.apply(lambda row: row.exp + " " + str(row.ale), axis=1)
print(len(ale_labels_after.unique()))

28
25


In [22]:
# Keep only mutations present at a frequency of at least 0.5; print the row
# count before and after.
print(len(muts_df))
has_majority_presence = muts_df["presence"] >= 0.5
muts_df = muts_df.loc[has_majority_presence]
print(len(muts_df))

1202
1086


In [23]:
# This work is also currently duplicated in NB4.
# Collapse the two spellings of the doubled rph annotation into plain "rph".
rph_annotation_variants = ["[rph], [rph]", "[rph],[rph]"]
muts_df["Gene"] = muts_df["Gene"].mask(muts_df["Gene"].isin(rph_annotation_variants), "rph")
muts_df.head()

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,Gene,Reference Seq,Mut ID,sample
12,ndh-cydB-appC,10,13,0,1,1.0,222958,DEL,Δ1 bp,coding (126/576 nt),gmhB,NC_000913,1656779.0,10 13 0 1
14,ndh-cydB-appC,10,22,0,1,0.89,4633634,SNP,C→T,E38K (GAG→AAG),yjjX,NC_000913,1656864.0,10 22 0 1
20,ndh-cydB-appC,10,22,0,1,1.0,222958,DEL,Δ1 bp,coding (126/576 nt),gmhB,NC_000913,1656779.0,10 22 0 1
22,ndh-cydB-appC,10,32,0,1,1.0,4633634,SNP,C→T,E38K (GAG→AAG),yjjX,NC_000913,1656864.0,10 32 0 1
23,ndh-cydB-appC,10,32,0,1,0.85,4183399,DEL,Δ3 bp,coding (2155‑2157/4029 nt),rpoB,NC_000913,1656884.0,10 32 0 1


In [24]:
# Flag each mutation as coding / non-coding based on its Details text.
muts_df["coding"] = [is_coding_mut(details) for details in muts_df["Details"]]

# For non-coding mutations the neighbouring genes are written "a/b" instead of
# "a, b"; coding mutations keep their Gene value unchanged.
muts_df["Gene"] = [
    gene if is_coding else gene.replace(", ", "/")
    for gene, is_coding in zip(muts_df["Gene"], muts_df["coding"])
]

muts_df.head()

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,Gene,Reference Seq,Mut ID,sample,coding
12,ndh-cydB-appC,10,13,0,1,1.0,222958,DEL,Δ1 bp,coding (126/576 nt),gmhB,NC_000913,1656779.0,10 13 0 1,True
14,ndh-cydB-appC,10,22,0,1,0.89,4633634,SNP,C→T,E38K (GAG→AAG),yjjX,NC_000913,1656864.0,10 22 0 1,True
20,ndh-cydB-appC,10,22,0,1,1.0,222958,DEL,Δ1 bp,coding (126/576 nt),gmhB,NC_000913,1656779.0,10 22 0 1,True
22,ndh-cydB-appC,10,32,0,1,1.0,4633634,SNP,C→T,E38K (GAG→AAG),yjjX,NC_000913,1656864.0,10 32 0 1,True
23,ndh-cydB-appC,10,32,0,1,0.85,4183399,DEL,Δ3 bp,coding (2155‑2157/4029 nt),rpoB,NC_000913,1656884.0,10 32 0 1,True


In [25]:
# ensure all mutations are using NC_000913 positions.
# Positions from the experiments listed here are passed through
# get_K12_pos_from_BOP27; every other position is left as it is.
genome_pos_transform_exp_l = [
    "ERS",
    'EEPcyoBndh',
    'EEPcydBappC',
    'EEPndhcyoB',
    'EEPndh',
    'EEPcydBappCndh',
    'EEPndhcydBappC'

]
needs_position_transform = muts_df["exp"].isin(genome_pos_transform_exp_l)
muts_df["Position"] = [
    get_K12_pos_from_BOP27(original_pos) if transform else original_pos
    for original_pos, transform in zip(muts_df["Position"], needs_position_transform)
]

### Just get ALE-unique mutations 
!!! Filter out as many inappropriate mutations as possible before this step, to cut down the runtime of finding ALE-unique mutations !!!

In [26]:
# Row count before the reduction.
print(len(muts_df))
# Reduce the table with the project helper from mutil.alemutdf. Going by the
# helper's name and this section's heading it keeps one (max-frequency) row per
# ALE-unique mutation -- NOTE(review): confirm against the helper's source.
# endpoint_flask_only=False is passed explicitly; presumably this means all
# flasks are considered rather than only the final one -- TODO confirm.
muts_df = get_multi_exp_max_freq_mut_df(muts_df, endpoint_flask_only=False)
# Row count after the reduction.
print(len(muts_df))
muts_df.head()

1086
585


Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,Gene,Reference Seq,Mut ID,sample,coding
1392,GLU,3,412,2,1,1.0,13957,SNP,A→T,M599L (ATG→TTG),dnaK,NC_000913,1034671.0,3 412 2 1,True
1428,GLU,3,412,2,1,1.0,28175,SNP,T→A,W295R (TGG→AGG),rihC,NC_000913,1035623.0,3 412 2 1,True
1393,GLU,3,412,2,1,1.0,101342,SNP,C→T,T193I (ACC→ATC),murC,NC_000913,1034672.0,3 412 2 1,True
1394,GLU,3,412,2,1,1.0,145691,SNP,C→T,A204V (GCC→GTC),yadE,NC_000913,1034673.0,3 412 2 1,True
1429,GLU,3,412,2,1,1.0,171072,SNP,A→G,K166K (AAA→AAG),fhuD,NC_000913,1035624.0,3 412 2 1,True


In [27]:
# The "Mut ID" column is not needed past this point.
muts_df = muts_df.drop("Mut ID", axis="columns")

In [28]:
# Persist the filtered mutation table as a pickle under ./data/
# (presumably the input of the next notebook in the series -- verify).
muts_df.to_pickle("./data/1_df.pkl")

In [29]:
# Number of rows written to the pickle.
muts_df.shape[0]

585