In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import display
import os, sys, itertools, csv

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from mutil.alemutdf import get_all_sample_mut_df, get_gene_mut_count_mat, get_multi_exp_max_freq_mut_df, get_mut_type_avg_frac_across_class_df
from mutil.mut import is_coding_mut, get_original_nuc_mut_range
from mutil.metadata import get_condition_val_dict, get_condition_field_val_set
from mutil.genome import get_feature_hit_set, is_overlap, get_promoter_range_from_RegulonDB_df_row, NON_K12_EXP_L

In [2]:
# Let wide frames show up to 100 columns when displayed in the notebook.
pd.set_option("display.max_columns", 100)

In [3]:
# Load the mutation table from its pickle file and report its (rows, columns) shape.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
muts_pickle_path = "./data/1_5_df.pkl"
all_muts_df = pd.read_pickle(muts_pickle_path)
display(all_muts_df.shape)

(585, 17)

In [4]:
# Distinct values of the "exp" column in the mutation table.
all_muts_df["exp"].unique()

array(['GLU', 'SSW_GLU_XYL', 'TOL_hexamethylenediamine', 'ndh-cydB-appC',
       'ndh-cyoB'], dtype=object)

In [5]:
# got range from https://ecocyc.org/ECOLI/NEW-IMAGE?type=NIL&object=G0-10506
origin_of_replication_range = (3925744, 3925975)


def _overlaps_oriC(mut_row):
    """Return True when the mutation's range overlaps the origin-of-replication range.

    Rows whose "exp" is listed in NON_K12_EXP_L are always False
    (presumably their coordinates do not map onto this reference — TODO confirm).
    """
    if mut_row["exp"] in NON_K12_EXP_L:
        return False
    return is_overlap(mut_row["range"], origin_of_replication_range)


all_muts_df["oriC"] = all_muts_df.apply(_overlaps_oriC, axis=1)
# Show which distinct flag values occur.
set(all_muts_df["oriC"])

{False}

In [6]:
# Derive the "coding" flag by running is_coding_mut on every "Details" value.
coding_flags = all_muts_df["Details"].apply(is_coding_mut)
all_muts_df["coding"] = coding_flags
all_muts_df.head()

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,sample,coding,range,gene RegulonDB ID,genetic features,oriC
1392,GLU,3,412,2,1,1.0,13957,SNP,A→T,M599L (ATG→TTG),dnaK,NC_000913,3 412 2 1,True,"(13957, 13957)",{ECK120000235},"[{'name': 'dnaK', 'RegulonDB ID': 'ECK12000023...",False
1428,GLU,3,412,2,1,1.0,28175,SNP,T→A,W295R (TGG→AGG),rihC,NC_000913,3 412 2 1,True,"(28175, 28175)",{ECK120001070},"[{'name': 'rihC', 'RegulonDB ID': 'ECK12000107...",False
1393,GLU,3,412,2,1,1.0,101342,SNP,C→T,T193I (ACC→ATC),murC,NC_000913,3 412 2 1,True,"(101342, 101342)",{ECK120000612},"[{'name': 'murC', 'RegulonDB ID': 'ECK12000061...",False
1394,GLU,3,412,2,1,1.0,145691,SNP,C→T,A204V (GCC→GTC),yadE,NC_000913,3 412 2 1,True,"(145691, 145691)",{ECK120001687},"[{'name': 'yadE', 'RegulonDB ID': 'ECK12000168...",False
1429,GLU,3,412,2,1,1.0,171072,SNP,A→G,K166K (AAA→AAG),fhuD,NC_000913,3 412 2 1,True,"(171072, 171072)",{ECK120000299},"[{'name': 'fhuD', 'RegulonDB ID': 'ECK12000029...",False


In [7]:
# Flag mutations whose "Details" annotation mentions a pseudogene.
# The vectorised .str.contains is used with regex=False (plain substring match, the same
# test as `"pseudogene" in x`) and na=False, so a row with a missing "Details" value is
# flagged False instead of raising "TypeError: argument of type 'float' is not iterable".
all_muts_df["pseudogene"] = (
    all_muts_df["Details"]
    .str.contains("pseudogene", regex=False, na=False)
    .astype(bool)  # keep a plain boolean column so it can be used directly as a row mask
)
all_muts_df[all_muts_df["pseudogene"]].head()

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,sample,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene
1594,GLU,6,406,1,1,1.0,2330404,SNP,A→C,pseudogene (3999/4605 nt),yfaS,NC_000913,6 406 1 1,False,"(2330404, 2330404)",{ECK120003732},"[{'name': 'yfaS_1', 'RegulonDB ID': 'ECK120003...",False,True
647,SSW_GLU_XYL,1,163,7,1,1.0,291529,SNP,G→A,pseudogene (126/828 nt),yagJ,,1 163 7 1,False,"(291529, 291529)",{ECK120002767},"[{'name': 'yagJ', 'RegulonDB ID': 'ECK12000276...",False,True
604,SSW_GLU_XYL,1,163,6,1,1.0,1466733,SNP,A→G,pseudogene (1342/2513 nt),ydbA,,1 163 6 1,False,"(1466733, 1466733)",{ECK120003316},"[{'name': 'ydbA_1', 'RegulonDB ID': 'ECK120003...",False,True
426,SSW_GLU_XYL,1,163,0,1,1.0,1529218,SNP,C→T,pseudogene (1317/2037 nt),rhsE,,1 163 0 1,False,"(1529218, 1529218)",{ECK120000008},"[{'name': 'rhsE', 'RegulonDB ID': 'ECK12000000...",False,True
693,SSW_GLU_XYL,1,163,8,1,1.0,1594132,DEL,(C)8→7,pseudogene (3856/3861 nt),yneO,,1 163 8 1,False,"(1594132, 1594132)",{ECK120001714},"[{'name': 'ydeK', 'RegulonDB ID': 'ECK12000171...",False,True


### Genes

In [8]:
# Column names assigned to the header-less, tab-separated gene table.
gene_table_columns = [
    "GENE_ID",
    "GENE_NAME",
    "GENE_POSLEFT",
    "GENE_POSRIGHT",
    "GENE_STRAND",
    "GENE_SEQUENCE",
    "GC_CONTENT",
    "CRI_SCORE",
    "GENE_NOTE",
    "GENE_INTERNAL_COMMENT",
    "KEY_ID_ORG",
    "GENE_TYPE",
]

# Lines starting with '#' are treated as comments; the file has no header row.
genes_df = pd.read_csv(
    "./data/RegulonDB10/gene.txt",
    sep="\t",
    comment="#",
    header=None,
)
genes_df.columns = gene_table_columns

def get_gene_range(row):
    """Build the (left, right) position tuple for one gene row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide "GENE_POSLEFT" and "GENE_POSRIGHT".

    Returns
    -------
    tuple
        ``(int(left), int(right))``, or the empty tuple ``()`` when either
        position is missing (as judged by ``pd.isna``).
    """
    # Guard clause: a gene without both coordinates gets an empty range.
    if pd.isna(row["GENE_POSLEFT"]) or pd.isna(row["GENE_POSRIGHT"]):
        return ()
    return int(row["GENE_POSLEFT"]), int(row["GENE_POSRIGHT"])

# Attach a per-gene position tuple computed row by row with get_gene_range.
gene_ranges = genes_df.apply(get_gene_range, axis=1)
genes_df["range"] = gene_ranges
genes_df.head()

Unnamed: 0,GENE_ID,GENE_NAME,GENE_POSLEFT,GENE_POSRIGHT,GENE_STRAND,GENE_SEQUENCE,GC_CONTENT,CRI_SCORE,GENE_NOTE,GENE_INTERNAL_COMMENT,KEY_ID_ORG,GENE_TYPE,range
0,ECK120000001,alr,4265782.0,4266861.0,forward,ATGCAAGCGGCAACTGTTGTGATTAACCGCCGCGCTCTGCGACACA...,55.93,,,,ECK12,,"(4265782, 4266861)"
1,ECK120000002,modB,795862.0,796551.0,forward,ATGATACTGACCGATCCAGAATGGCAGGCAGTTTTATTAAGCCTGA...,54.06,,,,ECK12,,"(795862, 796551)"
2,ECK120000003,cysZ,2531463.0,2532224.0,forward,ATGGTTTCATCATTCACATCTGCCCCACGCAGCGGTTTTTACTATT...,50.13,,,,ECK12,,"(2531463, 2532224)"
3,ECK120000004,dfp,3812731.0,3813951.0,forward,ATGAGCCTGGCCGGTAAAAAAATCGTTCTCGGCGTTAGCGGCGGTA...,53.64,,,,ECK12,,"(3812731, 3813951)"
4,ECK120000005,dcuB,4347404.0,4348744.0,reverse,ATGTTATTTACTATCCAACTTATCATAATACTGATATGTCTGTTTT...,52.27,,,,ECK12,,"(4347404, 4348744)"


### TF binding sites

In [9]:
# Transcription-factor binding-site table: tab-separated, '#' lines are comments, no header row,
# so columns are addressed by integer position.
tf_df = pd.read_csv(
    "./data/RegulonDBwebsite10/BindingSiteSet.txt",
    sep="\t",
    comment="#",
    header=None,
)

def get_TF_binding_site_range(tf_df_row):
    """Build the position tuple for one TF binding-site row.

    Parameters
    ----------
    tf_df_row : mapping-like (e.g. a DataFrame row)
        Must provide entries 3 and 4 (used here as the two ends of the range).

    Returns
    -------
    tuple
        ``(int(tf_df_row[3]), int(tf_df_row[4]))``, or the empty tuple ``()``
        when either entry is missing (as judged by ``pd.isna``).
    """
    # Guard clause: sites without both coordinates get an empty range.
    if pd.isna(tf_df_row[3]) or pd.isna(tf_df_row[4]):
        return ()
    return int(tf_df_row[3]), int(tf_df_row[4])

# Attach a per-site position tuple, then show the table size and a preview.
tf_site_ranges = tf_df.apply(get_TF_binding_site_range, axis=1)
tf_df["range"] = tf_site_ranges
display(tf_df.shape, tf_df.head())

(3562, 15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,range
0,ECK120015994,AcrR,ECK125258528,485709,485732,reverse,ECK120033472,acrAB,-,acrAp,-22.5,gcgttagattTACATACATTTGTGAATGTATGTAccatagcacg,"[BCE|W|Binding of cellular extracts],[GEA|W|Ge...",Strong,"(485709, 485732)"
1,ECK120015994,AcrR,ECK125258528,485709,485732,forward,ECK125134945,acrR,-,acrRp,22.5,cgtgctatggTACATACATTCACAAATGTATGTAaatctaacgc,"[BCE|W|Binding of cellular extracts],[GEA|W|Ge...",Strong,"(485709, 485732)"
2,ECK120015994,AcrR,ECK125202663,1619048,1619058,forward,ECK125202664,marRAB,-,marRp,-40.5,catcggtcaaTTCATTCATTtgacttatac,"[GEA|W|Gene expression analysis],[BPP|S|Bindin...",Strong,"(1619048, 1619058)"
3,ECK120015994,AcrR,ECK125242724,1978422,1978432,reverse,ECK125242725,flhDC,-,flhDp,-31.5,tcactacacgCACATACAACggaggggggc,"[GEA|W|Gene expression analysis],[HIBSCS|W|Hum...",Weak,"(1978422, 1978432)"
4,ECK120015994,AcrR,ECK120035040,2313112,2313135,forward,ECK120035041,micF,-,micFp,41.0,atttattaccGTCATTCATTTCTGAATGTCTGTTtacccctatt,[AIBSCS|W|Automated inference based on similar...,Weak,"(2313112, 2313135)"


In [10]:
def _tfbs_hits_for_mutation(mut_row):
    """Return the set produced by get_feature_hit_set for this mutation against tf_df.

    Rows whose "exp" is listed in NON_K12_EXP_L get an empty set without any lookup.
    The trailing argument 2 is presumably the tf_df column whose values identify
    the hit features — TODO confirm against get_feature_hit_set.
    """
    if mut_row["exp"] in NON_K12_EXP_L:
        return set()
    return get_feature_hit_set(mut_row["range"], tf_df, "range", 2)


all_muts_df["TFBS"] = all_muts_df.apply(_tfbs_hits_for_mutation, axis=1)
# Keep only the mutations that hit at least one binding site, for inspection.
df = all_muts_df[all_muts_df["TFBS"] != set()]
display(df.shape, df.head())

(13, 20)

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,sample,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS
1379,GLU,3,244,1,1,1.0,1293036,MOB,IS2 (+) +5 bp,intergenic (‑114/‑487),hns/tdk,NC_000913,3 244 1 1,False,"(1293036, 1293036)",{},"[{'RegulonDB ID': 'ECK120000450/ECK120000983',...",False,False,{ECK120011932}
1507,GLU,4,149,1,1,1.0,1293032,MOB,IS1 (–) +8 bp,intergenic (‑110/‑488),hns/tdk,NC_000913,4 149 1 1,False,"(1293032, 1293032)",{},"[{'RegulonDB ID': 'ECK120000450/ECK120000983',...",False,False,{ECK120011932}
1613,GLU,6,406,3,1,1.0,1292991,MOB,+T :: IS1 (–) +7 bp,intergenic (‑69/‑530),hns/tdk,NC_000913,6 406 3 1,False,"(1292991, 1292991)",{},"[{'RegulonDB ID': 'ECK120000450/ECK120000983',...",False,False,{ECK120013147}
1324,GLU,10,247,1,1,1.0,29630,SNP,G→T,intergenic (+435/‑21),dapB/carA,NC_000913,10 247 1 1,False,"(29630, 29630)",{},"[{'RegulonDB ID': 'ECK120000200/ECK120000130',...",False,False,{ECK120012654}
1325,GLU,10,247,1,1,1.0,1293196,MOB,IS5 (+) +4 bp,intergenic (‑274/‑328),hns/tdk,NC_000913,10 247 1 1,False,"(1293196, 1293196)",{},"[{'RegulonDB ID': 'ECK120000450/ECK120000983',...",False,False,{ECK120011926}


### Promoter

In [11]:
# Promoter table: tab-separated, '#' lines are comments, no header row.
promoter_df = pd.read_csv(
    "./data/RegulonDB10/promoter.txt",
    sep="\t",
    comment="#",
    header=None,
)

# The range of each promoter is computed by the project helper, one row at a time.
promoter_ranges = promoter_df.apply(get_promoter_range_from_RegulonDB_df_row, axis=1)
promoter_df["range"] = promoter_ranges
promoter_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,range
0,ECK120009842,galRp,forward,2976569.0,Sigma70,,,,,tcccacgatgaaaacacgccaccccttgaaccaacgggcgttttcc...,ECK12,,,"(2976509, 2976589)"
1,ECK120009843,lpxLp,reverse,1116709.0,,,,,,gcggcatgatatagcaattatcgataattaacatccacacatttta...,ECK12,,,"(1116689, 1116769)"
2,ECK120009844,yceAp,forward,1116772.0,,,,,,gcaaatgtagcgtaaaatgtgtggatgttaattatcgataattgct...,ECK12,,,"(1116712, 1116792)"
3,ECK120009845,mraZp,forward,89596.0,Sigma70,,,,,tatgccttgtgactggcttgacaagcttttcctcagctccgtaaac...,ECK12,The contribution of the mraZp promoter to the ...,,"(89536, 89616)"
4,ECK120009846,sohBp1,forward,1329284.0,"Sigma70, Sigma38",,,,,aaatggatactttgtcatactttcgctgcaataacatctctgcgag...,ECK12,We assigned a putative transcription start sit...,,"(1329224, 1329304)"


In [12]:
# Drop the promoters whose name (column 1) contains "TSS_"; the original author
# describes these as predicted promoters that are not meaningful for this analysis.
display(len(promoter_df))
# Missing values become '' so that the string match below never sees NaN.
promoter_df = promoter_df.fillna('')
is_predicted_promoter = promoter_df[1].str.contains("TSS_")
promoter_df = promoter_df[~is_predicted_promoter]
display(len(promoter_df), promoter_df.head())

8617

3859

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,range
0,ECK120009842,galRp,forward,2976569.0,Sigma70,,,,,tcccacgatgaaaacacgccaccccttgaaccaacgggcgttttcc...,ECK12,,,"(2976509, 2976589)"
1,ECK120009843,lpxLp,reverse,1116709.0,,,,,,gcggcatgatatagcaattatcgataattaacatccacacatttta...,ECK12,,,"(1116689, 1116769)"
2,ECK120009844,yceAp,forward,1116772.0,,,,,,gcaaatgtagcgtaaaatgtgtggatgttaattatcgataattgct...,ECK12,,,"(1116712, 1116792)"
3,ECK120009845,mraZp,forward,89596.0,Sigma70,,,,,tatgccttgtgactggcttgacaagcttttcctcagctccgtaaac...,ECK12,The contribution of the mraZp promoter to the ...,,"(89536, 89616)"
4,ECK120009846,sohBp1,forward,1329284.0,"Sigma70, Sigma38",,,,,aaatggatactttgtcatactttcgctgcaataacatctctgcgag...,ECK12,We assigned a putative transcription start sit...,,"(1329224, 1329304)"


In [13]:
def _promoter_hits_for_mutation(mut_row):
    """Return the set produced by get_feature_hit_set for this mutation against promoter_df.

    Rows whose "exp" is listed in NON_K12_EXP_L get an empty set without any lookup.
    The trailing argument 0 is presumably the promoter_df column whose values identify
    the hit features — TODO confirm against get_feature_hit_set.
    """
    if mut_row["exp"] in NON_K12_EXP_L:
        return set()
    return get_feature_hit_set(mut_row["range"], promoter_df, "range", 0)


all_muts_df["promoter"] = all_muts_df.apply(_promoter_hits_for_mutation, axis=1)
# Show every mutation that hits at least one promoter.
df = all_muts_df[all_muts_df["promoter"] != set()]
display(df.shape, df)

(38, 21)

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,sample,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter
1448,GLU,3,412,2,1,1.0,255855,SNP,A→T,intergenic (‑139/‑122),pepD/gpt,NC_000913,3 412 2 1,False,"(255855, 255855)",{},"[{'RegulonDB ID': 'ECK120000687/ECK120000407',...",False,False,{},"{ECK120034547, ECK120010290}"
1408,GLU,3,412,2,1,1.0,4136037,SNP,C→T,intergenic (+22/‑71),katG/yijE,NC_000913,3 412 2 1,False,"(4136037, 4136037)",{},"[{'RegulonDB ID': 'ECK120000504/ECK120001831',...",False,False,{},{ECK125137977}
1613,GLU,6,406,3,1,1.0,1292991,MOB,+T :: IS1 (–) +7 bp,intergenic (‑69/‑530),hns/tdk,NC_000913,6 406 3 1,False,"(1292991, 1292991)",{},"[{'RegulonDB ID': 'ECK120000450/ECK120000983',...",False,False,{ECK120013147},{ECK120010201}
1559,GLU,6,238,1,1,1.0,1293008,MOB,IS1 (+) +9 bp,intergenic (‑86/‑511),hns/tdk,NC_000913,6 238 1 1,False,"(1293008, 1293008)",{},"[{'RegulonDB ID': 'ECK120000450/ECK120000983',...",False,False,{},{ECK120010201}
1590,GLU,6,406,1,1,1.0,1995398,MOB,IS5 (–) +4 bp,intergenic (‑39/‑417),uvrY/yecF,NC_000913,6 406 1 1,False,"(1995398, 1995398)",{},"[{'RegulonDB ID': 'ECK120001124/ECK120002407',...",False,False,{},{ECK120010344}
2457,GLU,8,380,0,1,1.0,1292989,MOB,IS1 (+) +8 bp,intergenic (‑67/‑531),hns/tdk,NC_000913,8 380 0 1,False,"(1292989, 1292989)",{},"[{'RegulonDB ID': 'ECK120000450/ECK120000983',...",False,False,{},{ECK120010201}
2490,GLU,9,262,1,1,1.0,1293015,MOB,IS1 (–) +8 bp,intergenic (‑93/‑505),hns/tdk,NC_000913,9 262 1 1,False,"(1293015, 1293015)",{},"[{'RegulonDB ID': 'ECK120000450/ECK120000983',...",False,False,{},{ECK120010201}
1324,GLU,10,247,1,1,1.0,29630,SNP,G→T,intergenic (+435/‑21),dapB/carA,NC_000913,10 247 1 1,False,"(29630, 29630)",{},"[{'RegulonDB ID': 'ECK120000200/ECK120000130',...",False,False,{ECK120012654},{ECK120010111}
1376,GLU,10,75,1,1,1.0,711516,SNP,A→T,intergenic (‑51/+89),fldA/ybfE,NC_000913,10 75 1 1,False,"(711516, 711516)",{},"[{'RegulonDB ID': 'ECK120000312/ECK120001709',...",False,False,{},{ECK120009880}
1335,GLU,10,320,1,1,1.0,1755429,DEL,(T)5→4,intergenic (‑288/‑269),ydhZ/pykF,NC_000913,10 320 1 1,False,"(1755429, 1755429)",{},"[{'RegulonDB ID': 'ECK120003492/ECK120000795',...",False,False,{},{ECK120010529}


### TSS

Uses the promoter data structures (`promoter_df`) loaded in the promoter section above.

In [14]:
# NOTE(review): the whole TSS annotation cell is disabled (commented out), so no "TSS"
# column is added to all_muts_df. Things to check before re-enabling it:
#   * the doubly commented-out alternative below references `TSS_adf`, which is not
#     defined anywhere in view (likely a typo for `TSS_df`);
#   * promoter_df has already been passed through `fillna('')`, so `fillna(-1)` would
#     presumably no longer catch missing positions and `int(x)` could fail on '' —
#     TODO confirm.
# TSS_df = promoter_df.copy()
# TSS_df[3] = TSS_df[3].fillna(-1)  # -1 will ensure that no mutations ever accidentally get put on TSS with NaN pos.
# TSS_df[3] = TSS_df[3].apply(lambda x: int(x))
# TSS_df["range"] = TSS_df[3].apply(lambda x: (x, x))
# # all_muts_df["TSS"] = all_muts_df["range"].apply(get_feature_hit_set, args=[TSS_adf, "range", 0])
# all_muts_df["TSS"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], TSS_df, "range", 0) if r.exp not in NON_K12_EXP_L else set(), axis=1)
# df = all_muts_df[all_muts_df["TSS"] != set()]
# display(df.shape, df.head())

### RBS

In [15]:
# Load the ribosome-binding-site table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
rbs_pickle_path = "./data/RBS_df.pkl"
RBS_df = pd.read_pickle(rbs_pickle_path)
RBS_df.head()

Unnamed: 0,SHINE_DALGARNO_ID,GENE_ID,SHINE_DALGARNO_DIST_GENE,SHINE_DALGARNO_POSLEFT,SHINE_DALGARNO_POSRIGHT,SHINE_DALGARNO_SEQUENCE,SHINE_DALGARNO_NOTE,SD_INTERNAL_COMMENT,KEY_ID_ORG,range
0,ECK120014181,ECK120000266,-11,3151252,3151257,aaattacgcgCAGGATaatatccGAT,,,ECK12,"(3151252, 3151257)"
1,ECK120014182,ECK120000265,-9,3151991,3151996,acttgcgtccTGGAGAtacacAGT,,,ECK12,"(3151991, 3151996)"
2,ECK120014183,ECK120000496,-11,3957829,3957834,acgtcaacatCGAGGGctgtcccTGT,,,ECK12,"(3957829, 3957834)"
3,ECK120014184,ECK120000488,-10,3957957,3957962,cacaacatcaCGAGGAatcaccATG,,,ECK12,"(3957957, 3957962)"
4,ECK120014185,ECK120001215,-8,3469859,3469864,tttacgtcacAAGGGAttatAAT,,,ECK12,"(3469859, 3469864)"


In [16]:
def _rbs_hits_for_mutation(mut_row):
    """Return the set produced by get_feature_hit_set for this mutation against RBS_df.

    Rows whose "exp" is listed in NON_K12_EXP_L get an empty set without any lookup.
    "SHINE_DALGARNO_ID" is presumably the RBS_df column whose values identify the
    hit features — TODO confirm against get_feature_hit_set.
    """
    if mut_row["exp"] in NON_K12_EXP_L:
        return set()
    return get_feature_hit_set(mut_row["range"], RBS_df, "range", "SHINE_DALGARNO_ID")


all_muts_df["RBS"] = all_muts_df.apply(_rbs_hits_for_mutation, axis=1)
# Keep only the mutations that hit at least one RBS, for inspection.
df = all_muts_df[all_muts_df["RBS"] != set()]
display(df.shape, df.head())

(0, 22)

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,sample,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter,RBS


### Terminator

In [17]:
terminator_df = pd.read_csv("./data/RegulonDB10/terminator.txt", sep="\t", comment='#', header=None)
terminator_df["range"] = terminator_df.apply(lambda row: (row[2], row[3]), axis=1)
terminator_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,range
0,ECK120010779,,2738912,2738940,rho-independent,ctgatgaaaaGGTGCCGGATGATGTGAATCATCCGGCACtggattatta,,,ECK12,"(2738912, 2738940)"
1,ECK120010780,,2684075,2684093,rho-independent,taacgtagaaAGGCTTCCCGAAGGAAGCCttgatgatca,,,ECK12,"(2684075, 2684093)"
2,ECK120010781,,2311610,2311624,rho-independent,caatgaaaaaAGGGCCCGCAGGCCCtttgttcgat,,,ECK12,"(2311610, 2311624)"
3,ECK120010782,,1159325,1159346,rho-independent,tggggagactAAGGCAGCCAGATGGCTGCCTTttttacaggt,,,ECK12,"(1159325, 1159346)"
4,ECK120010783,,1113532,1113560,rho-independent,acgagccaatAAAAATACCGGCGTTATGCCGGTATTTTTttacgaaaga,,,ECK12,"(1113532, 1113560)"


In [18]:
def _terminator_hits_for_mutation(mut_row):
    """Return the set produced by get_feature_hit_set for this mutation against terminator_df.

    Rows whose "exp" is listed in NON_K12_EXP_L get an empty set without any lookup.
    The trailing argument 0 is presumably the terminator_df column whose values identify
    the hit features — TODO confirm against get_feature_hit_set.
    """
    if mut_row["exp"] in NON_K12_EXP_L:
        return set()
    return get_feature_hit_set(mut_row["range"], terminator_df, "range", 0)


all_muts_df["terminator"] = all_muts_df.apply(_terminator_hits_for_mutation, axis=1)
# Keep only the mutations that hit at least one terminator, for inspection.
df = all_muts_df[all_muts_df["terminator"] != set()]
display(df.shape, df.head())

(14, 23)

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,sample,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter,RBS,terminator
1398,GLU,3,412,2,1,1.0,2090136,INS,(G)5→6,intergenic (+90/‑56),hisL/hisG,NC_000913,3 412 2 1,False,"(2090136, 2090136)",{},"[{'RegulonDB ID': 'ECK120001243/ECK120000442',...",False,False,{},{},{},{ECK120033736}
2480,GLU,8,380,3,1,1.0,3950469,SNP,T→G,intergenic (+49/‑38),ilvL/ilvX,NC_000913,8 380 3 1,False,"(3950469, 3950469)",{},"[{'RegulonDB ID': 'ECK120001244/ECK120048853',...",False,False,{},{},{},{ECK120033263}
473,SSW_GLU_XYL,1,163,1,1,1.0,191723,SNP,C→T,intergenic (+15/‑132),tsf/pyrH,,1 163 1 1,False,"(191723, 191723)",{},"[{'RegulonDB ID': 'ECK120001022/ECK120001493',...",False,False,{},{},{},{ECK125095454}
443,SSW_GLU_XYL,1,163,0,2,1.0,3815810,DEL,Δ1 bp,intergenic (‑42/+24),pyrE/rph,,1 163 0 2,False,"(3815810, 3815810)",{},"[{'RegulonDB ID': 'ECK120000799/ECK120000854',...",False,False,{},{},{},{ECK120035133}
710,SSW_GLU_XYL,1,163,8,1,1.0,3853127,DEL,Δ1 bp,noncoding (131/140 nt),istR,,1 163 8 1,False,"(3853127, 3853127)","{ECK120015142, ECK120015143}","[{'name': 'istR-1', 'RegulonDB ID': 'ECK120015...",False,False,{},{},{},{ECK120035137}


In [19]:
# Write the annotated mutation table to a pickle file and report its final shape.
annotated_pickle_path = "./data/2_df.pkl"
all_muts_df.to_pickle(annotated_pickle_path)
display(all_muts_df.shape)

(585, 23)