# TODO: merge this NB and NB "*annotate_genomic_feats"

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import display
import os, sys, itertools, csv

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from util.alemutdf import get_all_sample_mut_df, get_gene_mut_count_mat, get_multi_exp_max_freq_mut_df, get_mut_type_avg_frac_across_class_df
from util.mut import is_coding_mut, get_original_nuc_mut_range
from util.metadata import get_condition_val_dict, get_condition_field_val_set
from util.genome import get_feature_hit_set, is_overlap, get_promoter_range_from_RegulonDB_df_row, NON_K12_EXP_L

In [2]:
pd.options.display.max_columns = 100

In [3]:
all_muts_df = pd.read_pickle("./data/1_5_df.pkl")
display(all_muts_df.shape)

(2641, 16)

In [4]:
all_muts_df.exp.unique()

array(['42C', 'C13', 'Central_carbon_knockout_PTS',
       'Central_carbon_knockout_glucose_evolution',
       'Central_carbon_knockout_gnd', 'Central_carbon_knockout_pgi',
       'Central_carbon_knockout_sdh', 'Central_carbon_knockout_tpiA',
       'GLU', 'GYD', 'Iron-ALE',
       'MG1655-M9-NC_000913_3gb-stationary-37-D-2-deoxyribose2',
       'MG1655-M9-NC_000913_3gb-stationary-37-D-arabinose2',
       'MG1655-M9-NC_000913_3gb-stationary-37-D-lyxose2',
       'MG1655-M9-NC_000913_3gb-stationary-37-m-tartrate2',
       'MG1655-M9-NC_000913_3gb-stationary-37-monomethyl_succinate2',
       'PGI', 'SER', 'SSW_AC', 'SSW_GLU_AC', 'SSW_GLU_GLY', 'SSW_GLU_XYL',
       'SSW_GLY', 'SSW_XYL', 'TOL_2,3-butanediol', 'TOL_adipic_acid',
       'TOL_coumaric_acid', 'TOL_glutaric_acid',
       'TOL_hexamethylenediamine', 'TOL_hexanoic_acid',
       'TOL_isobutyric_acid', 'TOL_n-butanol', 'TOL_octanoic_acid',
       'TOL_propanediol', 'TOL_putrescine'], dtype=object)

In [5]:
# debug
# all_muts_df = all_muts_df[all_muts_df.Position==1619080]

In [6]:
# got range from https://ecocyc.org/ECOLI/NEW-IMAGE?type=NIL&object=G0-10506
origin_of_replication_range=(3925744, 3925975)
all_muts_df["oriC"] = all_muts_df.apply(lambda r: is_overlap(r["range"], origin_of_replication_range) if r.exp not in NON_K12_EXP_L else False, axis=1)
set(all_muts_df["oriC"])

{False}

In [7]:
all_muts_df["coding"] = all_muts_df.Details.apply(is_coding_mut)
all_muts_df.head()

Unnamed: 0,Details,mutation target annotation,Mutation Type,Position,Reference Seq,Sequence Change,ale,exp,flask,isolate,presence,tech_rep,coding,range,gene RegulonDB ID,genetic features,oriC
2,R110G (CGT→GGT),clsA,SNP,1308318,,G→C,1,42C,124,1,1.0,1,True,"(1308318, 1308318)",{ECK120001556},"[{'name': 'clsA', 'RegulonDB ID': 'ECK12000155...",False
6,,rph,DEL,3815859,,Δ82 bp,1,42C,124,1,1.0,1,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False
7,A734V (GCG→GTG),rpoC,SNP,4187550,,C→T,1,42C,124,1,1.0,1,True,"(4187550, 4187550)",{ECK120000886},"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088...",False
8,D9A (GAT→GCT),hfq,SNP,4400313,,A→C,1,42C,124,1,1.0,1,True,"(4400313, 4400313)",{ECK120000431},"[{'name': 'hfq', 'RegulonDB ID': 'ECK120000431...",False
0,coding (380‑400/1149 nt),nagA,DEL,702352,,Δ21 bp,1,42C,124,1,1.0,1,True,"(702352, 702372)",{ECK120000625},"[{'name': 'nagA', 'RegulonDB ID': 'ECK12000062...",False


In [8]:
all_muts_df["pseudogene"] = all_muts_df.Details.apply(lambda x: "pseudogene" in x )
all_muts_df[all_muts_df["pseudogene"]].head()

Unnamed: 0,Details,mutation target annotation,Mutation Type,Position,Reference Seq,Sequence Change,ale,exp,flask,isolate,presence,tech_rep,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene
65,pseudogene (1115/1503 nt),yjiT,SNP,4573528,,A→G,3,42C,120,1,1.0,1,False,"(4573528, 4573528)",{ECK120004433},"[{'name': 'yjiT', 'RegulonDB ID': 'ECK12000443...",False,True
99,pseudogene (1712/2001 nt),yehQ,SNP,2210811,,C→T,6,42C,164,1,1.0,1,False,"(2210811, 2210811)",{ECK120001922},"[{'name': 'yehQ', 'RegulonDB ID': 'ECK12000192...",False,True
109,pseudogene (6/261 nt)pseudogene (346/399 nt),ygeN;ygeO,INS,2996015,,(T)8→9,6,42C,164,1,1.0,1,False,"(2996015, 2996015)","{ECK120004041, ECK120004042}","[{'name': 'ygeN', 'RegulonDB ID': 'ECK12000404...",False,True
80,pseudogene (26/491 nt),ybbD,SNP,528665,,C→T,6,42C,164,1,1.0,1,False,"(528665, 528665)",{ECK120001706},"[{'name': 'ybbD', 'RegulonDB ID': 'ECK12000170...",False,True
119,pseudogene (41/63 nt),ykfN,SNP,263172,,A→C,7,42C,135,1,1.0,1,False,"(263172, 263172)",{},"[{'RegulonDB ID': 'ECK120002546/ECK120002740',...",False,True


### Genes

In [9]:
genes_df = pd.read_csv(
    "./data/RegulonDB10/gene.txt", sep="\t", comment='#', header=None)
genes_df.columns = [
    "GENE_ID",
    "GENE_NAME",
    "GENE_POSLEFT",
    "GENE_POSRIGHT",
    "GENE_STRAND",
    "GENE_SEQUENCE",
    "GC_CONTENT",
    "CRI_SCORE",
    "GENE_NOTE",
    "GENE_INTERNAL_COMMENT",
    "KEY_ID_ORG",
    "GENE_TYPE"
]

def get_gene_range(row):
    r = ()
    if not pd.isna(row["GENE_POSLEFT"]) and not pd.isna(row["GENE_POSRIGHT"]):
        r = (int(row["GENE_POSLEFT"]), int(row["GENE_POSRIGHT"])) 
    return r

genes_df["range"] = genes_df.apply(lambda r: get_gene_range(r), axis=1)
genes_df.head()

Unnamed: 0,GENE_ID,GENE_NAME,GENE_POSLEFT,GENE_POSRIGHT,GENE_STRAND,GENE_SEQUENCE,GC_CONTENT,CRI_SCORE,GENE_NOTE,GENE_INTERNAL_COMMENT,KEY_ID_ORG,GENE_TYPE,range
0,ECK120000001,alr,4265782.0,4266861.0,forward,ATGCAAGCGGCAACTGTTGTGATTAACCGCCGCGCTCTGCGACACA...,55.93,,,,ECK12,,"(4265782, 4266861)"
1,ECK120000002,modB,795862.0,796551.0,forward,ATGATACTGACCGATCCAGAATGGCAGGCAGTTTTATTAAGCCTGA...,54.06,,,,ECK12,,"(795862, 796551)"
2,ECK120000003,cysZ,2531463.0,2532224.0,forward,ATGGTTTCATCATTCACATCTGCCCCACGCAGCGGTTTTTACTATT...,50.13,,,,ECK12,,"(2531463, 2532224)"
3,ECK120000004,dfp,3812731.0,3813951.0,forward,ATGAGCCTGGCCGGTAAAAAAATCGTTCTCGGCGTTAGCGGCGGTA...,53.64,,,,ECK12,,"(3812731, 3813951)"
4,ECK120000005,dcuB,4347404.0,4348744.0,reverse,ATGTTATTTACTATCCAACTTATCATAATACTGATATGTCTGTTTT...,52.27,,,,ECK12,,"(4347404, 4348744)"


### TF binding sites

In [10]:
tf_df = pd.read_csv("./data/RegulonDBwebsite10/BindingSiteSet.txt", sep="\t", comment='#', header=None)

def get_TF_binding_site_range(tf_df_row):
    r = ()
    if not pd.isna(tf_df_row[3]) and not pd.isna(tf_df_row[4]):
        r = (int(tf_df_row[3]), int(tf_df_row[4]))
    return r

tf_df["range"] = tf_df.apply(get_TF_binding_site_range, axis=1)
display(tf_df.shape, tf_df.head())

(3562, 15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,range
0,ECK120015994,AcrR,ECK125258528,485709,485732,reverse,ECK120033472,acrAB,-,acrAp,-22.5,gcgttagattTACATACATTTGTGAATGTATGTAccatagcacg,"[BCE|W|Binding of cellular extracts],[GEA|W|Ge...",Strong,"(485709, 485732)"
1,ECK120015994,AcrR,ECK125258528,485709,485732,forward,ECK125134945,acrR,-,acrRp,22.5,cgtgctatggTACATACATTCACAAATGTATGTAaatctaacgc,"[BCE|W|Binding of cellular extracts],[GEA|W|Ge...",Strong,"(485709, 485732)"
2,ECK120015994,AcrR,ECK125202663,1619048,1619058,forward,ECK125202664,marRAB,-,marRp,-40.5,catcggtcaaTTCATTCATTtgacttatac,"[GEA|W|Gene expression analysis],[BPP|S|Bindin...",Strong,"(1619048, 1619058)"
3,ECK120015994,AcrR,ECK125242724,1978422,1978432,reverse,ECK125242725,flhDC,-,flhDp,-31.5,tcactacacgCACATACAACggaggggggc,"[GEA|W|Gene expression analysis],[HIBSCS|W|Hum...",Weak,"(1978422, 1978432)"
4,ECK120015994,AcrR,ECK120035040,2313112,2313135,forward,ECK120035041,micF,-,micFp,41.0,atttattaccGTCATTCATTTCTGAATGTCTGTTtacccctatt,[AIBSCS|W|Automated inference based on similar...,Weak,"(2313112, 2313135)"


In [11]:
# all_muts_df["TFBS"] = all_muts_df["range"].apply(get_feature_hit_set, args=[tf_df, "range", 2])
# all_muts_df["in TF binding site"] = all_muts_df["TF binding site"].apply(lambda x: bool(x))
all_muts_df["TFBS"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], tf_df, "range", 2) if r.exp not in NON_K12_EXP_L else set(), axis=1)
df = all_muts_df[all_muts_df["TFBS"] != set()]
display(df.shape, df.head())

(147, 19)

Unnamed: 0,Details,mutation target annotation,Mutation Type,Position,Reference Seq,Sequence Change,ale,exp,flask,isolate,presence,tech_rep,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS
83,intergenic (‑314/‑441),csgD/csgB,SNP,1103510,,G→A,6,42C,164,1,1.0,1,False,"(1103510, 1103510)",{},"[{'RegulonDB ID': 'ECK120003142/ECK120003143',...",False,False,"{ECK125110223, ECK125110237, ECK125135183, ECK..."
97,noncoding (5/195 nt),isrC,SNP,2071321,,A→G,6,42C,164,1,1.0,1,False,"(2071321, 2071321)",{ECK120002616},"[{'name': 'isrC', 'RegulonDB ID': 'ECK12000261...",False,False,{ECK125108973}
132,intergenic (+39/‑63),ycdY/ycdZ,SNP,1100233,,G→A,8,42C,164,1,1.0,1,False,"(1100233, 1100233)",{},"[{'RegulonDB ID': 'ECK120003137/ECK120003138',...",False,False,{ECK125158213}
97,noncoding (5/195 nt),isrC,SNP,2071321,,A→G,8,42C,164,1,1.0,1,False,"(2071321, 2071321)",{ECK120002616},"[{'name': 'isrC', 'RegulonDB ID': 'ECK12000261...",False,False,{ECK125108973}
156,intergenic (‑111/‑486),hns/tdk,MOB,1293033,,IS1 (+) +9 bp,9,42C,131,1,1.0,1,False,"(1293033, 1293033)",{},"[{'RegulonDB ID': 'ECK120000450/ECK120000983',...",False,False,{ECK120011932}


### promoter

In [12]:
promoter_df = pd.read_csv("./data/RegulonDB10/promoter.txt", sep="\t", comment='#', header=None)


promoter_df["range"] = promoter_df.apply(get_promoter_range_from_RegulonDB_df_row, axis=1)
promoter_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,range
0,ECK120009842,galRp,forward,2976569.0,Sigma70,,,,,tcccacgatgaaaacacgccaccccttgaaccaacgggcgttttcc...,ECK12,,,"(2976509, 2976589)"
1,ECK120009843,lpxLp,reverse,1116709.0,,,,,,gcggcatgatatagcaattatcgataattaacatccacacatttta...,ECK12,,,"(1116689, 1116769)"
2,ECK120009844,yceAp,forward,1116772.0,,,,,,gcaaatgtagcgtaaaatgtgtggatgttaattatcgataattgct...,ECK12,,,"(1116712, 1116792)"
3,ECK120009845,mraZp,forward,89596.0,Sigma70,,,,,tatgccttgtgactggcttgacaagcttttcctcagctccgtaaac...,ECK12,The contribution of the mraZp promoter to the ...,,"(89536, 89616)"
4,ECK120009846,sohBp1,forward,1329284.0,"Sigma70, Sigma38",,,,,aaatggatactttgtcatactttcgctgcaataacatctctgcgag...,ECK12,We assigned a putative transcription start sit...,,"(1329224, 1329304)"


In [13]:
# ignoring meaningless predicted promoters
display(len(promoter_df))
promoter_df = promoter_df.fillna('')
promoter_df = promoter_df[~(promoter_df[1].str.contains("TSS_"))]
display(len(promoter_df), promoter_df.head())

8617

3859

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,range
0,ECK120009842,galRp,forward,2976570.0,Sigma70,,,,,tcccacgatgaaaacacgccaccccttgaaccaacgggcgttttcc...,ECK12,,,"(2976509, 2976589)"
1,ECK120009843,lpxLp,reverse,1116710.0,,,,,,gcggcatgatatagcaattatcgataattaacatccacacatttta...,ECK12,,,"(1116689, 1116769)"
2,ECK120009844,yceAp,forward,1116770.0,,,,,,gcaaatgtagcgtaaaatgtgtggatgttaattatcgataattgct...,ECK12,,,"(1116712, 1116792)"
3,ECK120009845,mraZp,forward,89596.0,Sigma70,,,,,tatgccttgtgactggcttgacaagcttttcctcagctccgtaaac...,ECK12,The contribution of the mraZp promoter to the ...,,"(89536, 89616)"
4,ECK120009846,sohBp1,forward,1329280.0,"Sigma70, Sigma38",,,,,aaatggatactttgtcatactttcgctgcaataacatctctgcgag...,ECK12,We assigned a putative transcription start sit...,,"(1329224, 1329304)"


In [14]:
all_muts_df["promoter"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], promoter_df, "range", 0) if r.exp not in NON_K12_EXP_L else set(), axis=1)
df = all_muts_df[all_muts_df["promoter"] != set()]
display(df.shape, df)

(270, 20)

Unnamed: 0,Details,mutation target annotation,Mutation Type,Position,Reference Seq,Sequence Change,ale,exp,flask,isolate,presence,tech_rep,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter
39,intergenic (‑90/‑133),ravA/kup,INS,3931183,,(G)7→8,2,42C,163,1,1.0,1,False,"(3931183, 3931183)",{},"[{'RegulonDB ID': 'ECK120001670/ECK120001495',...",False,False,{},"{ECK120034340, ECK120034342}"
21,A15T (GCC→ACC),ycaN,SNP,949526,,C→T,2,42C,163,1,1.0,1,True,"(949526, 949526)",{ECK120003067},"[{'name': 'ycaN', 'RegulonDB ID': 'ECK12000306...",False,False,{},"{ECK125136681, ECK125136682}"
48,intergenic (‑498/‑59),ydhZ/pykF,SNP,1755639,,G→A,3,42C,120,1,1.0,1,False,"(1755639, 1755639)",{},"[{'RegulonDB ID': 'ECK120003492/ECK120000795',...",False,False,{},{ECK120034238}
54,intergenic (‑155/+288),rrsG/clpB,SNP,2731312,,G→A,3,42C,120,1,1.0,1,False,"(2731312, 2731312)",{},"[{'RegulonDB ID': 'ECK120002531/ECK120000153',...",False,False,{},{ECK120009869}
70,intergenic (+81/‑10),ygaH/mprA,SNP,2810760,,T→C,4,42C,161,1,1.0,1,False,"(2810760, 2810760)",{},"[{'RegulonDB ID': 'ECK120003973/ECK120000596',...",False,False,{},{ECK120010556}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,intergenic (‑66/‑507),nmpC/essD,SNP,576891,NC_000913,C→T,5,TOL_putrescine,50,2,1.0,1,False,"(576891, 576891)",{},"[{'RegulonDB ID': 'ECK120000652/ECK120002917',...",False,False,{ECK120013822},"{ECK120016711, ECK120010699}"
29,,"insB1, insA",DEL,1978503,NC_000913,Δ776 bp,5,TOL_putrescine,50,1,1.0,1,True,"(1978503, 1979278)","{ECK120023915, ECK120003617, ECK120003616}","[{'name': 'insB-5', 'RegulonDB ID': 'ECK120003...",False,False,"{ECK120017150, ECK120013472}","{ECK120034252, ECK125095446}"
31,intergenic (‑102/‑559),stpA/alaE,MOB,2798597,NC_000913,IS1 (+) +9 bp,5,TOL_putrescine,50,1,1.0,1,False,"(2798597, 2798597)",{},"[{'RegulonDB ID': 'ECK120001507/ECK120003966',...",False,False,{},"{ECK120010482, ECK120010483}"
29,,"insB1, insA",DEL,1978503,NC_000913,Δ776 bp,6,TOL_putrescine,50,1,1.0,1,True,"(1978503, 1979278)","{ECK120023915, ECK120003617, ECK120003616}","[{'name': 'insB-5', 'RegulonDB ID': 'ECK120003...",False,False,"{ECK120017150, ECK120013472}","{ECK120034252, ECK125095446}"


### TSS

Uses promoter data structures

In [15]:
# TSS_df = promoter_df.copy()
# TSS_df[3] = TSS_df[3].fillna(-1)  # -1 will ensure that no mutations ever accidentally get put on TSS with NaN pos.
# TSS_df[3] = TSS_df[3].apply(lambda x: int(x))
# TSS_df["range"] = TSS_df[3].apply(lambda x: (x, x))
# # all_muts_df["TSS"] = all_muts_df["range"].apply(get_feature_hit_set, args=[TSS_adf, "range", 0])
# all_muts_df["TSS"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], TSS_df, "range", 0) if r.exp not in NON_K12_EXP_L else set(), axis=1)
# df = all_muts_df[all_muts_df["TSS"] != set()]
# display(df.shape, df.head())

### RBS

In [16]:
RBS_df = pd.read_pickle("./data/RBS_df.pkl")
RBS_df.head()

Unnamed: 0,SHINE_DALGARNO_ID,GENE_ID,SHINE_DALGARNO_DIST_GENE,SHINE_DALGARNO_POSLEFT,SHINE_DALGARNO_POSRIGHT,SHINE_DALGARNO_SEQUENCE,SHINE_DALGARNO_NOTE,SD_INTERNAL_COMMENT,KEY_ID_ORG,range
0,ECK120014181,ECK120000266,-11,3151252,3151257,aaattacgcgCAGGATaatatccGAT,,,ECK12,"(3151252, 3151257)"
1,ECK120014182,ECK120000265,-9,3151991,3151996,acttgcgtccTGGAGAtacacAGT,,,ECK12,"(3151991, 3151996)"
2,ECK120014183,ECK120000496,-11,3957829,3957834,acgtcaacatCGAGGGctgtcccTGT,,,ECK12,"(3957829, 3957834)"
3,ECK120014184,ECK120000488,-10,3957957,3957962,cacaacatcaCGAGGAatcaccATG,,,ECK12,"(3957957, 3957962)"
4,ECK120014185,ECK120001215,-8,3469859,3469864,tttacgtcacAAGGGAttatAAT,,,ECK12,"(3469859, 3469864)"


In [17]:
all_muts_df["RBS"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], RBS_df, "range", "SHINE_DALGARNO_ID") if r.exp not in NON_K12_EXP_L else set(), axis=1)
df = all_muts_df[all_muts_df["RBS"] != set()]
display(df.shape, df.head())

(6, 21)

Unnamed: 0,Details,mutation target annotation,Mutation Type,Position,Reference Seq,Sequence Change,ale,exp,flask,isolate,presence,tech_rep,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter,RBS
8,intergenic (‑10/‑514),pntA/ydgH,SNP,1677913,,T→C,17,Central_carbon_knockout_gnd,170,0,1.0,1,False,"(1677913, 1677913)",{},"[{'RegulonDB ID': 'ECK120000735/ECK120003449',...",False,False,{},{},{ECK120014306}
8,intergenic (‑10/‑514),pntA/ydgH,SNP,1677913,,T→C,18,Central_carbon_knockout_gnd,169,2,1.0,1,False,"(1677913, 1677913)",{},"[{'RegulonDB ID': 'ECK120000735/ECK120003449',...",False,False,{},{},{ECK120014306}
159,intergenic (‑172/‑6),marC/marR,DEL,1619080,,Δ35 bp,3,Central_carbon_knockout_pgi,108,1,1.0,1,False,"(1619080, 1619114)",{},"[{'RegulonDB ID': 'ECK120001584/ECK120001399',...",False,False,"{ECK120017169, ECK120030315}",{ECK120010245},{ECK120014209}
26,,"[nrdE], nrdF, proV, proW, proX, ygaY, ygaY, yg...",DEL,2801966,NC_000913,"Δ11,843 bp",8,TOL_coumaric_acid,50,1,1.0,1,True,"(2801966, 2813808)","{ECK120000764, ECK120000763, ECK120003971, ECK...","[{'name': 'proV', 'RegulonDB ID': 'ECK12000076...",False,False,"{ECK120012300, ECK125228813, ECK120012284, ECK...","{ECK120010370, ECK120010556, ECK120010368, ECK...","{ECK120014245, ECK120014246, ECK120014247}"
74,IS5‑mediated,"[pepD], gpt, frsA, crl, crl, phoE, proB, proA,...",DEL,255591,NC_000913,"Δ18,364 bp",1,TOL_isobutyric_acid,50,2,1.0,1,True,"(255591, 273954)","{ECK125256958, ECK120002752, ECK120002746, ECK...","[{'name': 'ykfG', 'RegulonDB ID': 'ECK12000274...",False,False,"{ECK120013598, ECK120012953, ECK120026344, ECK...","{ECK125136414, ECK120010289, ECK125138134, ECK...",{ECK120014270}


### Cis-regulatory RNA sequences
Not going to include these for now.

In [18]:
# cis_regulatory_RNA_df = pd.read_csv("./data/RegulonDB10/rfam.txt", sep="\t", comment='#', header=None)
# cis_regulatory_RNA_df["range"] = cis_regulatory_RNA_df.apply(lambda row: (row[6], row[7]), axis=1)
# cis_regulatory_RNA_df.head()

In [19]:
# # all_muts_df["cis-regulatory RNA"] = all_muts_df["range"].apply(get_feature_hit_set, args=[cis_regulatory_RNA_df, "range", 1])
# all_muts_df["cis-regulatory RNA"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], cis_regulatory_RNA_df, "range", 1) if r.exp not in NON_K12_EXP_L else set(), axis=1)
# df = all_muts_df[all_muts_df["cis-regulatory RNA"] != set()]
# display(df.shape, df.head())

### Terminator/Attenuator

In [20]:
att_term_df = pd.read_pickle("./data/att_term.pkl")
att_term_df.head()

Unnamed: 0,RegulonDB ID,range
0,ECK125143526,"(200, 311)"
1,ECK125143530,"(4979, 5078)"
2,ECK125143534,"(14134, 14155)"
3,ECK125143536,"(21166, 21255)"
4,ECK125143540,"(20912, 20982)"


In [21]:
all_muts_df["attenuator terminator"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], att_term_df, "range", "RegulonDB ID") if r.exp not in NON_K12_EXP_L else set(), axis=1)
df = all_muts_df[all_muts_df["attenuator terminator"] != set()]
display(df.shape, df.head())

(169, 22)

Unnamed: 0,Details,mutation target annotation,Mutation Type,Position,Reference Seq,Sequence Change,ale,exp,flask,isolate,presence,tech_rep,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter,RBS,attenuator terminator
6,,rph,DEL,3815859,,Δ82 bp,1,42C,124,1,1.0,1,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791}
6,,rph,DEL,3815859,,Δ82 bp,2,42C,163,1,1.0,1,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791}
6,,rph,DEL,3815859,,Δ82 bp,3,42C,120,1,1.0,1,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791}
44,V222A (GTA→GCA),focA,SNP,953802,,A→G,3,42C,120,1,1.0,1,True,"(953802, 953802)",{ECK120001233},"[{'name': 'focA', 'RegulonDB ID': 'ECK12000123...",False,False,{},{},{},{ECK125143834}
6,,rph,DEL,3815859,,Δ82 bp,4,42C,161,1,1.0,1,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791}


### Attenuator

Assuming all attenuators are also annotated with "attenuator terminator" entries. Will use the "attenuator terminator" entries to get associated genes.

In [22]:
# attenuator_df = pd.read_csv("./data/RegulonDB10/attenuator.txt", sep="\t", comment='#', header=None)
# #                             column=["ATTENUATOR_ID", "GENE_ID", "ATTENUATOR_TYPE", "ATTENUATOR_STRAND"])
# attenuator_df.head()

In [23]:
# def get_attenuator_id_set(terminator_attenuator_id_set):
#     attenuator_id_set = set()
#     if bool(terminator_attenuator_id_set):
#         for terminator_attenuator_id in terminator_attenuator_id_set:
#             attenuator_subset_df = attenuator_df[attenuator_df[0]==terminator_attenuator_id]
#             attenuator_id_set = attenuator_id_set | set(attenuator_subset_df[0])
#     return attenuator_id_set


# all_muts_df["attenuator"] = all_muts_df["attenuator terminator"].apply(get_attenuator_id_set)
# df = all_muts_df[all_muts_df["attenuator"] != set()]
# # !!! ATTENUATOR_TERMINATOR_ID = ATTENUATOR_ID
# display(df.shape, df.head())

### Terminator

In [24]:
terminator_df = pd.read_csv("./data/RegulonDB10/terminator.txt", sep="\t", comment='#', header=None)
terminator_df["range"] = terminator_df.apply(lambda row: (row[2], row[3]), axis=1)
terminator_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,range
0,ECK120010779,,2738912,2738940,rho-independent,ctgatgaaaaGGTGCCGGATGATGTGAATCATCCGGCACtggattatta,,,ECK12,"(2738912, 2738940)"
1,ECK120010780,,2684075,2684093,rho-independent,taacgtagaaAGGCTTCCCGAAGGAAGCCttgatgatca,,,ECK12,"(2684075, 2684093)"
2,ECK120010781,,2311610,2311624,rho-independent,caatgaaaaaAGGGCCCGCAGGCCCtttgttcgat,,,ECK12,"(2311610, 2311624)"
3,ECK120010782,,1159325,1159346,rho-independent,tggggagactAAGGCAGCCAGATGGCTGCCTTttttacaggt,,,ECK12,"(1159325, 1159346)"
4,ECK120010783,,1113532,1113560,rho-independent,acgagccaatAAAAATACCGGCGTTATGCCGGTATTTTTttacgaaaga,,,ECK12,"(1113532, 1113560)"


In [25]:
# all_muts_df["terminator"] = all_muts_df["range"].apply(get_feature_hit_set, args=[terminator_df, "range", 0])
all_muts_df["terminator"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], terminator_df, "range", 0) if r.exp not in NON_K12_EXP_L else set(), axis=1)
df = all_muts_df[all_muts_df["terminator"] != set()]
display(df.shape, df.head())

(83, 23)

Unnamed: 0,Details,mutation target annotation,Mutation Type,Position,Reference Seq,Sequence Change,ale,exp,flask,isolate,presence,tech_rep,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter,RBS,attenuator terminator,terminator
60,noncoding (82/84 nt),agrA,DEL,3648144,,(T)7→6,3,42C,120,1,1.0,1,False,"(3648144, 3648144)",{ECK125158033},"[{'name': 'agrA', 'RegulonDB ID': 'ECK12515803...",False,False,{},{},{},{},{ECK125160656}
113,intergenic (+47/‑39),ilvL/ilvX,SUB,3950467,,2 bp→TT,6,42C,164,1,1.0,1,False,"(3950467, 3950468)",{},"[{'RegulonDB ID': 'ECK120001244/ECK120048853',...",False,False,{},{},{},{},{ECK120033263}
149,intergenic (+48/‑39),ilvL/ilvX,SNP,3950468,,G→T,8,42C,164,1,1.0,1,False,"(3950468, 3950468)",{},"[{'RegulonDB ID': 'ECK120001244/ECK120048853',...",False,False,{},{},{},{},{ECK120033263}
15,intergenic (‑42/+24),pyrE/rph,DEL,3815810,,Δ1 bp,1,C13,134,1,1.0,1,False,"(3815810, 3815810)",{},"[{'RegulonDB ID': 'ECK120000799/ECK120000854',...",False,False,{},{},{},{ECK125144791},{ECK120035133}
15,intergenic (‑42/+24),pyrE/rph,DEL,3815810,,Δ1 bp,5,C13,135,1,1.0,1,False,"(3815810, 3815810)",{},"[{'RegulonDB ID': 'ECK120000799/ECK120000854',...",False,False,{},{},{},{ECK125144791},{ECK120035133}


In [26]:
all_muts_df.to_pickle("./data/2_df.pkl")
display(all_muts_df.shape)

(2641, 23)