In [1]:
import pandas as pd

import os, sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from util.gene import get_gene_bnum
pd.options.display.max_columns = 100

Assuming that the genes coding for the TFs controlling the imodulons aren't included in the file that describes which imodulons genes are involved in. Past versions of the imodulons files had shown this assumption to be true. Therefore, need to pull imodulon controlling genes into the same datastructure used to map genes to imodulons.

In [2]:
genes_df = pd.read_csv("./data/RegulonDB10/gene.txt", sep="\t", comment='#', header=None)
genes_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,ECK120000001,alr,4265782.0,4266861.0,forward,ATGCAAGCGGCAACTGTTGTGATTAACCGCCGCGCTCTGCGACACA...,55.93,,,,ECK12,
1,ECK120000002,modB,795862.0,796551.0,forward,ATGATACTGACCGATCCAGAATGGCAGGCAGTTTTATTAAGCCTGA...,54.06,,,,ECK12,
2,ECK120000003,cysZ,2531463.0,2532224.0,forward,ATGGTTTCATCATTCACATCTGCCCCACGCAGCGGTTTTTACTATT...,50.13,,,,ECK12,
3,ECK120000004,dfp,3812731.0,3813951.0,forward,ATGAGCCTGGCCGGTAAAAAAATCGTTCTCGGCGTTAGCGGCGGTA...,53.64,,,,ECK12,
4,ECK120000005,dcuB,4347404.0,4348744.0,reverse,ATGTTATTTACTATCCAACTTATCATAATACTGATATGTCTGTTTT...,52.27,,,,ECK12,


In [3]:
def get_imodulon_gene_set(imod_row):
    imod_gene_set = set()
    TF_str = str(imod_row["regulator"])
    if TF_str != "nan":
        if '/' in TF_str:
            imod_gene_set = set(TF_str.split('/'))
        elif ';' in imod_row["regulator"]:
            imod_gene_set = set(TF_str.split(';'))
        elif '+' in imod_row["regulator"]:
            imod_gene_set = set(TF_str.split('+'))
        else:
            imod_gene_set = set([TF_str])
    return imod_gene_set


imodulon_df = pd.read_csv("./data/imodulons/curated_enrichments.csv", sep=",")
imodulon_df = imodulon_df.drop(columns=['f1score', 'pvalue', 'precision', 'recall', 'n_tf', 'n_genes', 'threshold', 'TP'])
imodulon_df["genes"] = imodulon_df.apply(get_imodulon_gene_set, axis=1)
imodulon_df.head()

Unnamed: 0,name,regulator,Category,genes
0,AllR/AraC/FucR,allR/araC/fucR,Carbon Source Utilization,"{allR, araC, fucR}"
1,ArcA-1,arcA,Energy Metabolism,{arcA}
2,ArcA-2,arcA,Energy Metabolism,{arcA}
3,ArgR,argR,Amino Acid and Nucleotide Biosynthesis,{argR}
4,AtoC,atoC,Miscellaneous Metabolism,{atoC}


In [4]:
gene_synonym_df = pd.read_csv(
    "./data/RegulonDB10/object_synonym.txt",
    sep="\t",
    comment='#',
    header=None,
    quoting=3
)
gene_synonym_df.columns = ["OBJECT_ID", "OBJECT_SYNONYM_NAME", "OS_INTERNAL_COMMENT", "KEY_ID_ORG"]
gene_synonym_df.head()

Unnamed: 0,OBJECT_ID,OBJECT_SYNONYM_NAME,OS_INTERNAL_COMMENT,KEY_ID_ORG
0,ECK120000001,EG10001,,ECK12
1,ECK120000001,ECK4045,,ECK12
2,ECK120000001,b4053,,ECK12
3,ECK120000001,alr5,,ECK12
4,ECK120000002,b0764,,ECK12


In [5]:
object_external_db_id_df = pd.read_csv("./data/RegulonDB10/object_external_db_link.txt", sep="\t", comment='#', header=None)


def get_regulondb_gene_id_set(row):
    regulondb_gene_id_set = set()
    gene_name_set = row["genes"]
    for gene_name in gene_name_set:
        df = genes_df[genes_df[1]==gene_name]
        if len(df)>0:
            gene_id = df.iloc[0][0]
            regulondb_gene_id_set.add(gene_id)
    return regulondb_gene_id_set

imodulon_df["gene ID"] = imodulon_df.apply(get_regulondb_gene_id_set, axis=1)

In [6]:
REGULON_DB_GENE_ID_COL_IDX = 0
BLATTNER_NUMBER_COL_IDX = 2
def get_bnum_set(mut_row, gene_ID_mut_col):
    bnum_set = set()
    gene_ID_set = mut_row[gene_ID_mut_col]
    if gene_ID_set != set():
        for gene_ID in gene_ID_set:
            bnum_set.add(get_gene_bnum(gene_ID, gene_synonym_df))
    return bnum_set

imodulon_df["b number"] = imodulon_df.apply(get_bnum_set, args=("gene ID",), axis=1)
imodulon_df.head()

Unnamed: 0,name,regulator,Category,genes,gene ID,b number
0,AllR/AraC/FucR,allR/araC/fucR,Carbon Source Utilization,"{allR, araC, fucR}","{ECK120000347, ECK120002884, ECK120000050}","{b2805, b0506, b0064}"
1,ArcA-1,arcA,Energy Metabolism,{arcA},{ECK120000057},{b4401}
2,ArcA-2,arcA,Energy Metabolism,{arcA},{ECK120000057},{b4401}
3,ArgR,argR,Amino Acid and Nucleotide Biosynthesis,{argR},{ECK120000066},{b3237}
4,AtoC,atoC,Miscellaneous Metabolism,{atoC},{ECK120001611},{b2220}


In [8]:
from csv import reader
 
# open file in read mode
imod_gene_d = dict()
with open('./data/imodulons/imodulon_gene_bnumbers.txt', 'r') as read_obj:
    csv_reader = reader(read_obj)
    for line in csv_reader:
        imod = line[0]
        for bnum in line[1:]:
            if bnum not in imod_gene_d.keys():
                imod_gene_d[bnum] = set()
            imod_gene_d[bnum].add(imod)

In [9]:
# Add i-modulon regulator gene to the imod gene list to get TUs for in the later cells.
for _, r in imodulon_df.iterrows():
    imod_name = r["name"]
    for bnum in r["b number"]:
        if bnum not in imod_gene_d.keys():
            imod_gene_d[bnum] = set()
        imod_gene_d[bnum].add(imod_name)

In [12]:
bnum_imod_df = pd.DataFrame()
for k, v in imod_gene_d.items():
    df = pd.DataFrame([[v, k]], columns=["imodulon", 'bnum'])
    bnum_imod_df = bnum_imod_df.append(df)
bnum_imod_df

Unnamed: 0,imodulon,bnum
0,"{AllR/AraC/FucR, YgbI}",b0509
0,"{AllR/AraC/FucR, deletion-1}",b0508
0,{AllR/AraC/FucR},b0507
0,{AllR/AraC/FucR},b0512
0,"{BW25113, AllR/AraC/FucR}",b0063
...,...,...
0,{RpoS},b2741
0,{SrlR+GutM},b2707
0,{Tryptophan},b4393
0,{Zinc},b4046


In [13]:
# # Some of the bnums have _1 or _2. I'm assuming these may be for pseudogenes.
# # Going to ignore these for now since currently not intend to publish with imodulons.
# # Opened a Jira ticket to keep track of this: ASW-1078
# bnum_imod_df = bnum_imod_df[~(bnum_imod_df["bnum"].str.contains("_"))]
# display(len(bnum_imod_df), bnum_imod_df.head())

In [14]:
bnum_imod_df = bnum_imod_df[~(bnum_imod_df["bnum"]=="")]
display(len(bnum_imod_df), bnum_imod_df.head())

1459

Unnamed: 0,imodulon,bnum
0,"{AllR/AraC/FucR, YgbI}",b0509
0,"{AllR/AraC/FucR, deletion-1}",b0508
0,{AllR/AraC/FucR},b0507
0,{AllR/AraC/FucR},b0512
0,"{BW25113, AllR/AraC/FucR}",b0063


In [15]:
def get_gene_RegulonDB_ID(gene_artifact):  # gene_artifact is anything that is found within gene_synonym_df
    ret_val = ""
    # some b numbers aren't found within the object_synonym.txt
    df = gene_synonym_df[gene_synonym_df["OBJECT_SYNONYM_NAME"]==gene_artifact]
    if len(df) > 0: 
        ret_val = df["OBJECT_ID"].iloc[0]
    return ret_val

assert(get_gene_RegulonDB_ID("b0001")=='ECK120001251')


bnum_imod_df["RegulonDB ID"] = bnum_imod_df["bnum"].apply(get_gene_RegulonDB_ID)
bnum_imod_df.head()

Unnamed: 0,imodulon,bnum,RegulonDB ID
0,"{AllR/AraC/FucR, YgbI}",b0509,ECK120002886
0,"{AllR/AraC/FucR, deletion-1}",b0508,ECK120002885
0,{AllR/AraC/FucR},b0507,ECK120001534
0,{AllR/AraC/FucR},b0512,ECK120002889
0,"{BW25113, AllR/AraC/FucR}",b0063,ECK120000049


In [16]:
TU_objects_df = pd.read_csv("./data/RegulonDB10/tu_objects_tmp.txt",
                            sep="\t",
                            comment='#',
                            header=None,
                            )
TU_objects_df.columns = [
    "TRANSCRIPTION_UNIT_ID",
    "NUMTU",
    "TU_POSLEFT",
    "TU_POSRIGHT",
    "TU_TYPE",
     "TU_OBJECT_CLASS",
    "TU_OBJECT_ID",
    "TU_OBJECT_NAME",
    "TU_OBJECT_POSLEFT",
    "TU_OBJECT_POSRIGHT",
    "TU_OBJECT_STRAND",
    "TU_OBJECT_COLORCLASS",
    "TU_OBJECT_DESCRIPTION",
    "TU_OBJECT_SIGMA",
    "TU_OBJECT_EVIDENCE",
    "TU_OBJECT_RI_TYPE",
    "TU_OBJECT_TYPE",
    "EVIDENCE"]
TU_objects_df.head()

Unnamed: 0,TRANSCRIPTION_UNIT_ID,NUMTU,TU_POSLEFT,TU_POSRIGHT,TU_TYPE,TU_OBJECT_CLASS,TU_OBJECT_ID,TU_OBJECT_NAME,TU_OBJECT_POSLEFT,TU_OBJECT_POSRIGHT,TU_OBJECT_STRAND,TU_OBJECT_COLORCLASS,TU_OBJECT_DESCRIPTION,TU_OBJECT_SIGMA,TU_OBJECT_EVIDENCE,TU_OBJECT_RI_TYPE,TU_OBJECT_TYPE,EVIDENCE
0,ECK120008913,3,1825955,1832013,H,PM,ECK120009851,astCp1,1832013,1832013,R,,,Sigma70,Human inference of promoter position,,predicted,
1,ECK120008913,3,1825955,1832013,H,GN,ECK120003528,astE,1825955,1826923,R,51153255.0,amino acids,,,,predicted,
2,ECK120008913,3,1825955,1832013,H,GN,ECK120003529,astB,1826916,1828259,R,51153255.0,amino acids,,,,predicted,
3,ECK120008913,3,1825955,1832013,H,GN,ECK120003532,astC,1830762,1831982,R,255.0,nitrogen metabolism,,,,predicted,
4,ECK120008913,3,1825955,1832013,H,GN,ECK120003530,astD,1828256,1829734,R,51153255.0,amino acids,,,,predicted,


In [17]:
def get_TU_ID_set(feat_ID):
    return set(TU_objects_df[TU_objects_df["TU_OBJECT_ID"]==feat_ID]["TRANSCRIPTION_UNIT_ID"])

bnum_imod_df["TU"] = bnum_imod_df["RegulonDB ID"].apply(get_TU_ID_set)
bnum_imod_df.head()

Unnamed: 0,imodulon,bnum,RegulonDB ID,TU
0,"{AllR/AraC/FucR, YgbI}",b0509,ECK120002886,{ECK120015331}
0,"{AllR/AraC/FucR, deletion-1}",b0508,ECK120002885,{ECK120015331}
0,{AllR/AraC/FucR},b0507,ECK120001534,{ECK120015331}
0,{AllR/AraC/FucR},b0512,ECK120002889,{ECK120015331}
0,"{BW25113, AllR/AraC/FucR}",b0063,ECK120000049,{ECK120009405}


In [18]:
TU_objects_df["TU range"] = TU_objects_df.apply(lambda row: (row["TU_POSLEFT"], row["TU_POSRIGHT"]), axis=1)
TU_range_df = TU_objects_df.copy()
TU_range_df = TU_range_df.drop(
    columns=['NUMTU',
             'TU_TYPE',
             'TU_OBJECT_ID',
             'TU_OBJECT_POSLEFT',
             'TU_OBJECT_POSRIGHT',
             'TU_OBJECT_STRAND',
             'TU_OBJECT_COLORCLASS',
             'TU_OBJECT_DESCRIPTION',
             'TU_OBJECT_SIGMA',
             'TU_OBJECT_EVIDENCE',
             'TU_OBJECT_RI_TYPE',
             'TU_OBJECT_TYPE',
             'TU_OBJECT_CLASS',
             'TU_OBJECT_NAME',
             'EVIDENCE'
            ]
)
TU_range_df = TU_range_df.drop_duplicates()
TU_range_df.head()

Unnamed: 0,TRANSCRIPTION_UNIT_ID,TU_POSLEFT,TU_POSRIGHT,TU range
0,ECK120008913,1825955,1832013,"(1825955, 1832013)"
12,ECK120008914,1825955,1832327,"(1825955, 1832327)"
38,ECK120008915,1825955,1832039,"(1825955, 1832039)"
50,ECK120008916,2800586,2804461,"(2800586, 2804461)"
74,ECK120008917,4105726,4106387,"(4105726, 4106387)"


In [19]:
imod_TU_range_df = pd.DataFrame()
for _, r in bnum_imod_df.iterrows():
    for imod in r["imodulon"]:
        for TU in r["TU"]:
            imod_TU_range_df = imod_TU_range_df.append({
                "imodulon": imod,
                "TRANSCRIPTION_UNIT_ID": TU,
                "TU range": TU_range_df[TU_range_df["TRANSCRIPTION_UNIT_ID"]==TU]["TU range"].iloc[0]
            }, ignore_index=True)
imod_TU_range_df = imod_TU_range_df.drop_duplicates()
imod_TU_range_df.head()

Unnamed: 0,TRANSCRIPTION_UNIT_ID,TU range,imodulon
0,ECK120015331,"(533879, 543033)",AllR/AraC/FucR
1,ECK120015331,"(533879, 543033)",YgbI
3,ECK120015331,"(533879, 543033)",deletion-1
6,ECK120009405,"(65800, 70358)",BW25113
7,ECK120009405,"(65800, 70358)",AllR/AraC/FucR


In [20]:
imod_TU_range_df.to_pickle("./data/imod_TU_df.pkl")