# TODO: merge this NB and NB "*annotate_genomic_feats"

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import display
import os, sys, itertools, csv

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from util.alemutdf import get_all_sample_mut_df, get_gene_mut_count_mat, get_multi_exp_max_freq_mut_df, get_mut_type_avg_frac_across_class_df
from util.mut import is_coding_mut, get_original_nuc_mut_range
from util.metadata import get_condition_val_dict, get_condition_field_val_set
from util.genome import get_feature_hit_set, is_overlap, get_promoter_range_from_RegulonDB_df_row, NON_K12_EXP_L

In [2]:
pd.options.display.max_columns = 100

In [3]:
all_muts_df = pd.read_pickle("./data/1_5_df.pkl")
display(all_muts_df.shape)

(2244, 16)

In [4]:
all_muts_df.exp.unique()

array(['42C', 'C13', 'GLU', 'GYD', 'Iron-ALE',
       'MG1655-M9-NC_000913_3gb-stationary-37-D-2-deoxyribose2',
       'MG1655-M9-NC_000913_3gb-stationary-37-D-arabinose2',
       'MG1655-M9-NC_000913_3gb-stationary-37-D-lyxose2',
       'MG1655-M9-NC_000913_3gb-stationary-37-m-tartrate2',
       'MG1655-M9-NC_000913_3gb-stationary-37-monomethyl_succinate2',
       'PGI', 'SER', 'SSW_AC', 'SSW_GLU_AC', 'SSW_GLU_GLY', 'SSW_GLU_XYL',
       'SSW_GLY', 'SSW_XYL', 'TOL_2,3-butanediol', 'TOL_adipic_acid',
       'TOL_coumaric_acid', 'TOL_glutaric_acid',
       'TOL_hexamethylenediamine', 'TOL_hexanoic_acid',
       'TOL_isobutyric_acid', 'TOL_n-butanol', 'TOL_octanoic_acid',
       'TOL_propanediol', 'TOL_putrescine', 'evo', 'gnd', 'pgi', 'pts',
       'sdh', 'tpi'], dtype=object)

In [5]:
# debug
# all_muts_df = all_muts_df[all_muts_df.Position==1619080]

In [6]:
# got range from https://ecocyc.org/ECOLI/NEW-IMAGE?type=NIL&object=G0-10506
origin_of_replication_range=(3925744, 3925975)
all_muts_df["oriC"] = all_muts_df.apply(lambda r: is_overlap(r["range"], origin_of_replication_range) if r.exp not in NON_K12_EXP_L else False, axis=1)
set(all_muts_df["oriC"])

{False}

In [7]:
all_muts_df["coding"] = all_muts_df.Details.apply(is_coding_mut)
all_muts_df.head()

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,coding,range,gene RegulonDB ID,genetic features,oriC
0,42C,1,124,1,1,1.0,702352,DEL,Δ21 bp,coding (380‑400/1149 nt),nagA,,True,"(702352, 702372)",{ECK120000625},"[{'name': 'nagA', 'RegulonDB ID': 'ECK12000062...",False
2,42C,1,124,1,1,1.0,1308318,SNP,G→C,R110G (CGT→GGT),clsA,,True,"(1308318, 1308318)",{ECK120001556},"[{'name': 'clsA', 'RegulonDB ID': 'ECK12000155...",False
6,42C,1,124,1,1,1.0,3815859,DEL,Δ82 bp,,rph,,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False
7,42C,1,124,1,1,1.0,4187550,SNP,C→T,A734V (GCG→GTG),rpoC,,True,"(4187550, 4187550)",{ECK120000886},"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088...",False
8,42C,1,124,1,1,1.0,4400313,SNP,A→C,D9A (GAT→GCT),hfq,,True,"(4400313, 4400313)",{ECK120000431},"[{'name': 'hfq', 'RegulonDB ID': 'ECK120000431...",False


In [8]:
all_muts_df["pseudogene"] = all_muts_df.Details.apply(lambda x: "pseudogene" in x )
all_muts_df[all_muts_df["pseudogene"]].head()

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene
65,42C,3,120,1,1,1.0,4573528,SNP,A→G,pseudogene (1115/1503 nt),yjiT,,False,"(4573528, 4573528)",{ECK120004433},"[{'name': 'yjiT', 'RegulonDB ID': 'ECK12000443...",False,True
80,42C,6,164,1,1,1.0,528665,SNP,C→T,pseudogene (26/491 nt),ybbD,,False,"(528665, 528665)",{ECK120001706},"[{'name': 'ybbD', 'RegulonDB ID': 'ECK12000170...",False,True
99,42C,6,164,1,1,1.0,2210811,SNP,C→T,pseudogene (1712/2001 nt),yehQ,,False,"(2210811, 2210811)",{ECK120001922},"[{'name': 'yehQ', 'RegulonDB ID': 'ECK12000192...",False,True
109,42C,6,164,1,1,1.0,2996015,INS,(T)8→9,pseudogene (6/261 nt)pseudogene (346/399 nt),ygeN;ygeO,,False,"(2996015, 2996015)","{ECK120004042, ECK120004041}","[{'name': 'ygeN', 'RegulonDB ID': 'ECK12000404...",False,True
119,42C,7,135,1,1,1.0,263172,SNP,A→C,pseudogene (41/63 nt),ykfN,,False,"(263172, 263172)",{},"[{'RegulonDB ID': 'ECK120002546/ECK120002740',...",False,True


### Genes

In [9]:
genes_df = pd.read_csv(
    "./data/RegulonDB10/gene.txt", sep="\t", comment='#', header=None)
genes_df.columns = [
    "GENE_ID",
    "GENE_NAME",
    "GENE_POSLEFT",
    "GENE_POSRIGHT",
    "GENE_STRAND",
    "GENE_SEQUENCE",
    "GC_CONTENT",
    "CRI_SCORE",
    "GENE_NOTE",
    "GENE_INTERNAL_COMMENT",
    "KEY_ID_ORG",
    "GENE_TYPE"
]

def get_gene_range(row):
    r = ()
    if not pd.isna(row["GENE_POSLEFT"]) and not pd.isna(row["GENE_POSRIGHT"]):
        r = (int(row["GENE_POSLEFT"]), int(row["GENE_POSRIGHT"])) 
    return r

genes_df["range"] = genes_df.apply(lambda r: get_gene_range(r), axis=1)
genes_df.head()

Unnamed: 0,GENE_ID,GENE_NAME,GENE_POSLEFT,GENE_POSRIGHT,GENE_STRAND,GENE_SEQUENCE,GC_CONTENT,CRI_SCORE,GENE_NOTE,GENE_INTERNAL_COMMENT,KEY_ID_ORG,GENE_TYPE,range
0,ECK120000001,alr,4265782.0,4266861.0,forward,ATGCAAGCGGCAACTGTTGTGATTAACCGCCGCGCTCTGCGACACA...,55.93,,,,ECK12,,"(4265782, 4266861)"
1,ECK120000002,modB,795862.0,796551.0,forward,ATGATACTGACCGATCCAGAATGGCAGGCAGTTTTATTAAGCCTGA...,54.06,,,,ECK12,,"(795862, 796551)"
2,ECK120000003,cysZ,2531463.0,2532224.0,forward,ATGGTTTCATCATTCACATCTGCCCCACGCAGCGGTTTTTACTATT...,50.13,,,,ECK12,,"(2531463, 2532224)"
3,ECK120000004,dfp,3812731.0,3813951.0,forward,ATGAGCCTGGCCGGTAAAAAAATCGTTCTCGGCGTTAGCGGCGGTA...,53.64,,,,ECK12,,"(3812731, 3813951)"
4,ECK120000005,dcuB,4347404.0,4348744.0,reverse,ATGTTATTTACTATCCAACTTATCATAATACTGATATGTCTGTTTT...,52.27,,,,ECK12,,"(4347404, 4348744)"


### TF binding sites

In [10]:
tf_df = pd.read_csv("./data/RegulonDBwebsite10/BindingSiteSet.txt", sep="\t", comment='#', header=None)

def get_TF_binding_site_range(tf_df_row):
    r = ()
    if not pd.isna(tf_df_row[3]) and not pd.isna(tf_df_row[4]):
        r = (int(tf_df_row[3]), int(tf_df_row[4]))
    return r

tf_df["range"] = tf_df.apply(get_TF_binding_site_range, axis=1)
display(tf_df.shape, tf_df.head())

(3562, 15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,range
0,ECK120015994,AcrR,ECK125258528,485709,485732,reverse,ECK120033472,acrAB,-,acrAp,-22.5,gcgttagattTACATACATTTGTGAATGTATGTAccatagcacg,"[BCE|W|Binding of cellular extracts],[GEA|W|Ge...",Strong,"(485709, 485732)"
1,ECK120015994,AcrR,ECK125258528,485709,485732,forward,ECK125134945,acrR,-,acrRp,22.5,cgtgctatggTACATACATTCACAAATGTATGTAaatctaacgc,"[BCE|W|Binding of cellular extracts],[GEA|W|Ge...",Strong,"(485709, 485732)"
2,ECK120015994,AcrR,ECK125202663,1619048,1619058,forward,ECK125202664,marRAB,-,marRp,-40.5,catcggtcaaTTCATTCATTtgacttatac,"[GEA|W|Gene expression analysis],[BPP|S|Bindin...",Strong,"(1619048, 1619058)"
3,ECK120015994,AcrR,ECK125242724,1978422,1978432,reverse,ECK125242725,flhDC,-,flhDp,-31.5,tcactacacgCACATACAACggaggggggc,"[GEA|W|Gene expression analysis],[HIBSCS|W|Hum...",Weak,"(1978422, 1978432)"
4,ECK120015994,AcrR,ECK120035040,2313112,2313135,forward,ECK120035041,micF,-,micFp,41.0,atttattaccGTCATTCATTTCTGAATGTCTGTTtacccctatt,[AIBSCS|W|Automated inference based on similar...,Weak,"(2313112, 2313135)"


In [11]:
# all_muts_df["TFBS"] = all_muts_df["range"].apply(get_feature_hit_set, args=[tf_df, "range", 2])
# all_muts_df["in TF binding site"] = all_muts_df["TF binding site"].apply(lambda x: bool(x))
all_muts_df["TFBS"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], tf_df, "range", 2) if r.exp not in NON_K12_EXP_L else set(), axis=1)
df = all_muts_df[all_muts_df["TFBS"] != set()]
display(df.shape, df.head())

(61, 19)

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS
83,42C,6,164,1,1,1.0,1103510,SNP,G→A,intergenic (‑314/‑441),csgD/csgB,,False,"(1103510, 1103510)",{},"[{'RegulonDB ID': 'ECK120003142/ECK120003143',...",False,False,"{ECK125110223, ECK125110237, ECK125135183, ECK..."
97,42C,6,164,1,1,1.0,2071321,SNP,A→G,noncoding (5/195 nt),isrC,,False,"(2071321, 2071321)",{ECK120002616},"[{'name': 'isrC', 'RegulonDB ID': 'ECK12000261...",False,False,{ECK125108973}
132,42C,8,164,1,1,1.0,1100233,SNP,G→A,intergenic (+39/‑63),ycdY/ycdZ,,False,"(1100233, 1100233)",{},"[{'RegulonDB ID': 'ECK120003137/ECK120003138',...",False,False,{ECK125158213}
97,42C,8,164,1,1,1.0,2071321,SNP,A→G,noncoding (5/195 nt),isrC,,False,"(2071321, 2071321)",{ECK120002616},"[{'name': 'isrC', 'RegulonDB ID': 'ECK12000261...",False,False,{ECK125108973}
156,42C,9,131,1,1,1.0,1293033,MOB,IS1 (+) +9 bp,intergenic (‑111/‑486),hns/tdk,,False,"(1293033, 1293033)",{},"[{'RegulonDB ID': 'ECK120000450/ECK120000983',...",False,False,{ECK120011932}


### promoter

In [12]:
promoter_df = pd.read_csv("./data/RegulonDB10/promoter.txt", sep="\t", comment='#', header=None)


promoter_df["range"] = promoter_df.apply(get_promoter_range_from_RegulonDB_df_row, axis=1)
promoter_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,range
0,ECK120009842,galRp,forward,2976569.0,Sigma70,,,,,tcccacgatgaaaacacgccaccccttgaaccaacgggcgttttcc...,ECK12,,,"(2976509, 2976589)"
1,ECK120009843,lpxLp,reverse,1116709.0,,,,,,gcggcatgatatagcaattatcgataattaacatccacacatttta...,ECK12,,,"(1116689, 1116769)"
2,ECK120009844,yceAp,forward,1116772.0,,,,,,gcaaatgtagcgtaaaatgtgtggatgttaattatcgataattgct...,ECK12,,,"(1116712, 1116792)"
3,ECK120009845,mraZp,forward,89596.0,Sigma70,,,,,tatgccttgtgactggcttgacaagcttttcctcagctccgtaaac...,ECK12,The contribution of the mraZp promoter to the ...,,"(89536, 89616)"
4,ECK120009846,sohBp1,forward,1329284.0,"Sigma70, Sigma38",,,,,aaatggatactttgtcatactttcgctgcaataacatctctgcgag...,ECK12,We assigned a putative transcription start sit...,,"(1329224, 1329304)"


In [13]:
# ignoring meaningless predicted promoters
display(len(promoter_df))
promoter_df = promoter_df.fillna('')
promoter_df = promoter_df[~(promoter_df[1].str.contains("TSS_"))]
display(len(promoter_df), promoter_df.head())

8617

3859

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,range
0,ECK120009842,galRp,forward,2976570.0,Sigma70,,,,,tcccacgatgaaaacacgccaccccttgaaccaacgggcgttttcc...,ECK12,,,"(2976509, 2976589)"
1,ECK120009843,lpxLp,reverse,1116710.0,,,,,,gcggcatgatatagcaattatcgataattaacatccacacatttta...,ECK12,,,"(1116689, 1116769)"
2,ECK120009844,yceAp,forward,1116770.0,,,,,,gcaaatgtagcgtaaaatgtgtggatgttaattatcgataattgct...,ECK12,,,"(1116712, 1116792)"
3,ECK120009845,mraZp,forward,89596.0,Sigma70,,,,,tatgccttgtgactggcttgacaagcttttcctcagctccgtaaac...,ECK12,The contribution of the mraZp promoter to the ...,,"(89536, 89616)"
4,ECK120009846,sohBp1,forward,1329280.0,"Sigma70, Sigma38",,,,,aaatggatactttgtcatactttcgctgcaataacatctctgcgag...,ECK12,We assigned a putative transcription start sit...,,"(1329224, 1329304)"


In [14]:
all_muts_df["promoter"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], promoter_df, "range", 0) if r.exp not in NON_K12_EXP_L else set(), axis=1)
df = all_muts_df[all_muts_df["promoter"] != set()]
display(df.shape, df)

(202, 20)

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter
21,42C,2,163,1,1,1.0,949526,SNP,C→T,A15T (GCC→ACC),ycaN,,True,"(949526, 949526)",{ECK120003067},"[{'name': 'ycaN', 'RegulonDB ID': 'ECK12000306...",False,False,{},"{ECK125136682, ECK125136681}"
39,42C,2,163,1,1,1.0,3931183,INS,(G)7→8,intergenic (‑90/‑133),ravA/kup,,False,"(3931183, 3931183)",{},"[{'RegulonDB ID': 'ECK120001670/ECK120001495',...",False,False,{},"{ECK120034342, ECK120034340}"
48,42C,3,120,1,1,1.0,1755639,SNP,G→A,intergenic (‑498/‑59),ydhZ/pykF,,False,"(1755639, 1755639)",{},"[{'RegulonDB ID': 'ECK120003492/ECK120000795',...",False,False,{},{ECK120034238}
54,42C,3,120,1,1,1.0,2731312,SNP,G→A,intergenic (‑155/+288),rrsG/clpB,,False,"(2731312, 2731312)",{},"[{'RegulonDB ID': 'ECK120002531/ECK120000153',...",False,False,{},{ECK120009869}
70,42C,4,161,1,1,1.0,2810760,SNP,T→C,intergenic (+81/‑10),ygaH/mprA,,False,"(2810760, 2810760)",{},"[{'RegulonDB ID': 'ECK120003973/ECK120000596',...",False,False,{},{ECK120010556}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,sdh,23,176,1,1,1.0,2876426,SNP,G→A,intergenic (‑97/‑155),cysD/iap,NC_000913,False,"(2876426, 2876426)",{},"[{'RegulonDB ID': 'ECK120000181/ECK120000481',...",False,False,{},{ECK120010206}
40,tpi,13,89,1,1,1.0,702066,SNP,G→C,P229R (CCT→CGT),nagA,NC_000913,True,"(702066, 702066)",{ECK120000625},"[{'name': 'nagA', 'RegulonDB ID': 'ECK12000062...",False,False,{},{ECK120033607}
24,tpi,13,89,1,1,1.0,2509479,SNP,C→T,intergenic (‑53/‑151),glk/yfeO,NC_000913,False,"(2509479, 2509479)",{},"[{'RegulonDB ID': 'ECK120002426/ECK120003826',...",False,False,{},"{ECK120015382, ECK120034272}"
46,tpi,15,88,1,1,1.0,417075,DEL,(TCATAAATCTG)2→1,intergenic (‑123/‑57),sbcD/phoB,NC_000913,False,"(417075, 417075)",{},"[{'RegulonDB ID': 'ECK120001082/ECK120000719',...",False,False,{ECK120012852},{ECK120011011}


### TSS

Uses promoter data structures

In [15]:
# TSS_df = promoter_df.copy()
# TSS_df[3] = TSS_df[3].fillna(-1)  # -1 will ensure that no mutations ever accidentally get put on TSS with NaN pos.
# TSS_df[3] = TSS_df[3].apply(lambda x: int(x))
# TSS_df["range"] = TSS_df[3].apply(lambda x: (x, x))
# # all_muts_df["TSS"] = all_muts_df["range"].apply(get_feature_hit_set, args=[TSS_adf, "range", 0])
# all_muts_df["TSS"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], TSS_df, "range", 0) if r.exp not in NON_K12_EXP_L else set(), axis=1)
# df = all_muts_df[all_muts_df["TSS"] != set()]
# display(df.shape, df.head())

### RBS

In [16]:
RBS_df = pd.read_pickle("./data/RBS_df.pkl")
RBS_df.head()

Unnamed: 0,SHINE_DALGARNO_ID,GENE_ID,SHINE_DALGARNO_DIST_GENE,SHINE_DALGARNO_POSLEFT,SHINE_DALGARNO_POSRIGHT,SHINE_DALGARNO_SEQUENCE,SHINE_DALGARNO_NOTE,SD_INTERNAL_COMMENT,KEY_ID_ORG,range
0,ECK120014181,ECK120000266,-11,3151252,3151257,aaattacgcgCAGGATaatatccGAT,,,ECK12,"(3151252, 3151257)"
1,ECK120014182,ECK120000265,-9,3151991,3151996,acttgcgtccTGGAGAtacacAGT,,,ECK12,"(3151991, 3151996)"
2,ECK120014183,ECK120000496,-11,3957829,3957834,acgtcaacatCGAGGGctgtcccTGT,,,ECK12,"(3957829, 3957834)"
3,ECK120014184,ECK120000488,-10,3957957,3957962,cacaacatcaCGAGGAatcaccATG,,,ECK12,"(3957957, 3957962)"
4,ECK120014185,ECK120001215,-8,3469859,3469864,tttacgtcacAAGGGAttatAAT,,,ECK12,"(3469859, 3469864)"


In [17]:
all_muts_df["RBS"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], RBS_df, "range", "SHINE_DALGARNO_ID") if r.exp not in NON_K12_EXP_L else set(), axis=1)
df = all_muts_df[all_muts_df["RBS"] != set()]
display(df.shape, df.head())

(6, 21)

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter,RBS
26,TOL_coumaric_acid,8,50,1,1,1.0,2801966,DEL,"Δ11,843 bp",,"[nrdE], nrdF, proV, proW, proX, ygaY, ygaY, yg...",NC_000913,True,"(2801966, 2813808)","{ECK120001323, ECK120003971, ECK120000764, ECK...","[{'name': 'emrA', 'RegulonDB ID': 'ECK12000132...",False,False,"{ECK120012300, ECK120012302, ECK120012298, ECK...","{ECK120010556, ECK125137465, ECK120010370, ECK...","{ECK120014245, ECK120014246, ECK120014247}"
74,TOL_isobutyric_acid,1,50,2,1,1.0,255591,DEL,"Δ18,364 bp",IS5‑mediated,"[pepD], gpt, frsA, crl, crl, phoE, proB, proA,...",NC_000913,True,"(255591, 273954)","{ECK120016284, ECK120002746, ECK120001079, ECK...","[{'name': 'ykfC', 'RegulonDB ID': 'ECK12000275...",False,False,"{ECK120012951, ECK120012953, ECK120012955, ECK...","{ECK125136415, ECK120034547, ECK125136413, ECK...",{ECK120014270}
19,TOL_isobutyric_acid,8,50,1,1,1.0,1995819,DEL,"Δ40,006 bp",,"[yecF], sdiA, yecC, yecS, dcyD, fliY, fliZ, fl...",NC_000913,True,"(1995819, 2035824)","{ECK120001317, ECK120003630, ECK120029424, ECK...","[{'name': 'fliC', 'RegulonDB ID': 'ECK12000031...",False,False,"{ECK125165912, ECK125258861, ECK120016614, ECK...","{ECK125137173, ECK125137169, ECK125140840, ECK...","{ECK120014308, ECK120014309, ECK120014356, ECK..."
25,gnd,17,170,0,1,1.0,1677913,SNP,T→C,intergenic (‑10/‑514),pntA/ydgH,NC_000913,False,"(1677913, 1677913)",{},"[{'RegulonDB ID': 'ECK120000735/ECK120003449',...",False,False,{},{},{ECK120014306}
25,gnd,18,169,2,1,1.0,1677913,SNP,T→C,intergenic (‑10/‑514),pntA/ydgH,NC_000913,False,"(1677913, 1677913)",{},"[{'RegulonDB ID': 'ECK120000735/ECK120003449',...",False,False,{},{},{ECK120014306}


### Cis-regulatory RNA sequences
Not going to include these for now.

In [18]:
# cis_regulatory_RNA_df = pd.read_csv("./data/RegulonDB10/rfam.txt", sep="\t", comment='#', header=None)
# cis_regulatory_RNA_df["range"] = cis_regulatory_RNA_df.apply(lambda row: (row[6], row[7]), axis=1)
# cis_regulatory_RNA_df.head()

In [19]:
# # all_muts_df["cis-regulatory RNA"] = all_muts_df["range"].apply(get_feature_hit_set, args=[cis_regulatory_RNA_df, "range", 1])
# all_muts_df["cis-regulatory RNA"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], cis_regulatory_RNA_df, "range", 1) if r.exp not in NON_K12_EXP_L else set(), axis=1)
# df = all_muts_df[all_muts_df["cis-regulatory RNA"] != set()]
# display(df.shape, df.head())

### Terminator/Attenuator

In [20]:
att_term_df = pd.read_pickle("./data/att_term.pkl")
att_term_df.head()

Unnamed: 0,RegulonDB ID,range
0,ECK125143526,"(200, 311)"
1,ECK125143530,"(4979, 5078)"
2,ECK125143534,"(14134, 14155)"
3,ECK125143536,"(21166, 21255)"
4,ECK125143540,"(20912, 20982)"


In [21]:
all_muts_df["attenuator terminator"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], att_term_df, "range", "RegulonDB ID") if r.exp not in NON_K12_EXP_L else set(), axis=1)
df = all_muts_df[all_muts_df["attenuator terminator"] != set()]
display(df.shape, df.head())

(172, 22)

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter,RBS,attenuator terminator
6,42C,1,124,1,1,1.0,3815859,DEL,Δ82 bp,,rph,,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791}
6,42C,2,163,1,1,1.0,3815859,DEL,Δ82 bp,,rph,,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791}
44,42C,3,120,1,1,1.0,953802,SNP,A→G,V222A (GTA→GCA),focA,,True,"(953802, 953802)",{ECK120001233},"[{'name': 'focA', 'RegulonDB ID': 'ECK12000123...",False,False,{},{},{},{ECK125143834}
6,42C,3,120,1,1,1.0,3815859,DEL,Δ82 bp,,rph,,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791}
6,42C,4,161,1,1,1.0,3815859,DEL,Δ82 bp,,rph,,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791}


### Attenuator

Assuming all attenuators are also annotated with "attenuator terminator" entries. Will use the "attenuator terminator" entries to get associated genes.

In [22]:
# attenuator_df = pd.read_csv("./data/RegulonDB10/attenuator.txt", sep="\t", comment='#', header=None)
# #                             column=["ATTENUATOR_ID", "GENE_ID", "ATTENUATOR_TYPE", "ATTENUATOR_STRAND"])
# attenuator_df.head()

In [23]:
# def get_attenuator_id_set(terminator_attenuator_id_set):
#     attenuator_id_set = set()
#     if bool(terminator_attenuator_id_set):
#         for terminator_attenuator_id in terminator_attenuator_id_set:
#             attenuator_subset_df = attenuator_df[attenuator_df[0]==terminator_attenuator_id]
#             attenuator_id_set = attenuator_id_set | set(attenuator_subset_df[0])
#     return attenuator_id_set


# all_muts_df["attenuator"] = all_muts_df["attenuator terminator"].apply(get_attenuator_id_set)
# df = all_muts_df[all_muts_df["attenuator"] != set()]
# # !!! ATTENUATOR_TERMINATOR_ID = ATTENUATOR_ID
# display(df.shape, df.head())

### Terminator

In [24]:
terminator_df = pd.read_csv("./data/RegulonDB10/terminator.txt", sep="\t", comment='#', header=None)
terminator_df["range"] = terminator_df.apply(lambda row: (row[2], row[3]), axis=1)
terminator_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,range
0,ECK120010779,,2738912,2738940,rho-independent,ctgatgaaaaGGTGCCGGATGATGTGAATCATCCGGCACtggattatta,,,ECK12,"(2738912, 2738940)"
1,ECK120010780,,2684075,2684093,rho-independent,taacgtagaaAGGCTTCCCGAAGGAAGCCttgatgatca,,,ECK12,"(2684075, 2684093)"
2,ECK120010781,,2311610,2311624,rho-independent,caatgaaaaaAGGGCCCGCAGGCCCtttgttcgat,,,ECK12,"(2311610, 2311624)"
3,ECK120010782,,1159325,1159346,rho-independent,tggggagactAAGGCAGCCAGATGGCTGCCTTttttacaggt,,,ECK12,"(1159325, 1159346)"
4,ECK120010783,,1113532,1113560,rho-independent,acgagccaatAAAAATACCGGCGTTATGCCGGTATTTTTttacgaaaga,,,ECK12,"(1113532, 1113560)"


In [25]:
# all_muts_df["terminator"] = all_muts_df["range"].apply(get_feature_hit_set, args=[terminator_df, "range", 0])
all_muts_df["terminator"] = all_muts_df.apply(lambda r: get_feature_hit_set(r["range"], terminator_df, "range", 0) if r.exp not in NON_K12_EXP_L else set(), axis=1)
df = all_muts_df[all_muts_df["terminator"] != set()]
display(df.shape, df.head())

(84, 23)

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter,RBS,attenuator terminator,terminator
60,42C,3,120,1,1,1.0,3648144,DEL,(T)7→6,noncoding (82/84 nt),agrA,,False,"(3648144, 3648144)",{ECK125158033},"[{'name': 'agrA', 'RegulonDB ID': 'ECK12515803...",False,False,{},{},{},{},{ECK125160656}
113,42C,6,164,1,1,1.0,3950467,SUB,2 bp→TT,intergenic (+47/‑39),ilvL/ilvX,,False,"(3950467, 3950468)",{},"[{'RegulonDB ID': 'ECK120001244/ECK120048853',...",False,False,{},{},{},{},{ECK120033263}
149,42C,8,164,1,1,1.0,3950468,SNP,G→T,intergenic (+48/‑39),ilvL/ilvX,,False,"(3950468, 3950468)",{},"[{'RegulonDB ID': 'ECK120001244/ECK120048853',...",False,False,{},{},{},{},{ECK120033263}
15,C13,1,134,1,1,1.0,3815810,DEL,Δ1 bp,intergenic (‑42/+24),pyrE/rph,,False,"(3815810, 3815810)",{},"[{'RegulonDB ID': 'ECK120000799/ECK120000854',...",False,False,{},{},{},{ECK125144791},{ECK120035133}
15,C13,4,59,1,1,1.0,3815810,DEL,Δ1 bp,intergenic (‑42/+24),pyrE/rph,,False,"(3815810, 3815810)",{},"[{'RegulonDB ID': 'ECK120000799/ECK120000854',...",False,False,{},{},{},{ECK125144791},{ECK120035133}


In [26]:
all_muts_df.to_pickle("./data/2_df.pkl")
display(all_muts_df.shape)

(2244, 23)