In [1]:
import pandas as pd

The same feature tables are repeatedly rebuilt throughout this pipeline, and some of them take a substantial amount of time to build and annotate. These tables should only have to be built once (as DataFrames) and then simply be imported wherever they are needed.  

In [2]:
# Load the RegulonDB Shine-Dalgarno table. The raw file has no header row and
# lines starting with '#' are skipped as comments, so the column names are
# assigned by hand below.
rbs_column_names = [
    "SHINE_DALGARNO_ID",
    "GENE_ID",
    "SHINE_DALGARNO_DIST_GENE",
    "SHINE_DALGARNO_POSLEFT",
    "SHINE_DALGARNO_POSRIGHT",
    "SHINE_DALGARNO_SEQUENCE",
    "SHINE_DALGARNO_NOTE",
    "SD_INTERNAL_COMMENT",
    "KEY_ID_ORG",
]
RBS_df = pd.read_csv(
    "./data/RegulonDB10/shine_dalgarno.txt",
    sep="\t",
    comment='#',
    header=None,
)
RBS_df.columns = rbs_column_names


def _rbs_range(rbs_row):
    """Return the (left, right) position pair of one Shine-Dalgarno row."""
    return (rbs_row["SHINE_DALGARNO_POSLEFT"], rbs_row["SHINE_DALGARNO_POSRIGHT"])


# Store the left/right positions together as a single tuple per row, then
# cache the annotated table for the rest of the pipeline.
RBS_df["range"] = RBS_df.apply(_rbs_range, axis=1)
RBS_df.to_pickle("./data/RBS_df.pkl")
RBS_df.head()

Unnamed: 0,SHINE_DALGARNO_ID,GENE_ID,SHINE_DALGARNO_DIST_GENE,SHINE_DALGARNO_POSLEFT,SHINE_DALGARNO_POSRIGHT,SHINE_DALGARNO_SEQUENCE,SHINE_DALGARNO_NOTE,SD_INTERNAL_COMMENT,KEY_ID_ORG,range
0,ECK120014181,ECK120000266,-11,3151252,3151257,aaattacgcgCAGGATaatatccGAT,,,ECK12,"(3151252, 3151257)"
1,ECK120014182,ECK120000265,-9,3151991,3151996,acttgcgtccTGGAGAtacacAGT,,,ECK12,"(3151991, 3151996)"
2,ECK120014183,ECK120000496,-11,3957829,3957834,acgtcaacatCGAGGGctgtcccTGT,,,ECK12,"(3957829, 3957834)"
3,ECK120014184,ECK120000488,-10,3957957,3957962,cacaacatcaCGAGGAatcaccATG,,,ECK12,"(3957957, 3957962)"
4,ECK120014185,ECK120001215,-8,3469859,3469864,tttacgtcacAAGGGAttatAAT,,,ECK12,"(3469859, 3469864)"


In [3]:
# Load the RegulonDB operon table (header-less, '#' lines are comments) and
# name its columns by hand.
operon_column_names = [
    "OPERON_ID",
    "OPERON_NAME",
    "FIRSTGENEPOSLEFT",
    "LASTGENEPOSRIGHT",
    "REGULATIONPOSLEFT",
    "REGULATIONPOSRIGHT",
    "OPERON_STRAND",
    "OPERON_INTERNAL_COMMENT",
    "KEY_ID_ORG",
]
operon_df = pd.read_csv(
    "./data/RegulonDB10/operon.txt",
    sep="\t",
    comment='#',
    header=None,
)
operon_df.columns = operon_column_names


def _operon_regulation_range(operon_row):
    """Return the (left, right) regulation positions of one operon row."""
    return (operon_row["REGULATIONPOSLEFT"], operon_row["REGULATIONPOSRIGHT"])


# The "range" of an operon is its regulation span, not the first/last gene span.
operon_df["range"] = operon_df.apply(_operon_regulation_range, axis=1)
operon_df.to_pickle("./data/operon_df.pkl")
operon_df.head()

Unnamed: 0,OPERON_ID,OPERON_NAME,FIRSTGENEPOSLEFT,LASTGENEPOSRIGHT,REGULATIONPOSLEFT,REGULATIONPOSRIGHT,OPERON_STRAND,OPERON_INTERNAL_COMMENT,KEY_ID_ORG,range
0,ECK120011191,rob,4634441,4635310,4634441,4635382,reverse,,ECK12,"(4634441, 4635382)"
1,ECK120011203,argP,3059753,3060646,3059684,3060680,forward,,ECK12,"(3059684, 3060680)"
2,ECK120011329,lrhA,2405703,2406641,2405703,2406866,reverse,,ECK12,"(2405703, 2406866)"
3,ECK120011490,rtcR,3558268,3559866,3558268,3559866,forward,,ECK12,"(3558268, 3559866)"
4,ECK120011629,lrp,932595,933089,932265,933089,forward,,ECK12,"(932265, 933089)"


In [4]:
# Collect every position covered by at least one operon regulation span, so
# that the total "unknown" operon category length can be derived from it.
# This follows the same approach as the total "unknown" COG length
# (get_COG_lengths NB).
all_op_pos_set = set()
for reg_left, reg_right in zip(operon_df["REGULATIONPOSLEFT"],
                               operon_df["REGULATIONPOSRIGHT"]):
    # Both ends are inclusive, hence the + 1. The set is updated in place:
    # rebinding the result of set.union() would copy the whole accumulated
    # set once per operon.
    all_op_pos_set.update(range(reg_left, reg_right + 1))
display(len(all_op_pos_set))

4375748

In [5]:
# Every position in 1..4641653 that no operon regulation span covers counts
# towards the "unknown" operon category.
# NOTE(review): 4641653 is presumably the reference genome length; confirm it
# against the sequence in use, since an off-by-one here shifts the reported
# length by one.
genome_pos_set = set(range(1, 4641653 + 1))
unknown_op_pos_set = genome_pos_set - all_op_pos_set
unknown_op_nuc_len = len(unknown_op_pos_set)

# Single-row table holding the total length of the "unknown" category.
op_len_columns = ["operon description", "length"]
op_len_df = pd.DataFrame(
    data={
        "operon description": ["unknown"],
        "length": [unknown_op_nuc_len],
    },
    columns=op_len_columns,
)
op_len_df

Unnamed: 0,operon description,length
0,unknown,265905


In [6]:
# Cache the operon length table so other notebooks can load it directly.
operon_length_pickle_path = "./data/operon_length_df.pkl"
op_len_df.to_pickle(operon_length_pickle_path)

# Attenuator terminator
The point of this notebook is to build those tables (ASW-1216).

In [7]:
import pandas as pd

In [8]:
# Load the RegulonDB attenuator terminator table (header-less, '#' lines are
# comments) and name its columns by hand.
att_term_column_names = [
    "A_TERMINATOR_ID",
    "A_TERMINATOR_TYPE",
    "A_TERMINATOR_POSLEFT",
    "A_TERMINATOR_POSRIGHT",
    "A_TERMINATOR_ENERGY",
    "A_TERMINATOR_SEQUENCE",
    "A_TERMINATOR_ATTENUATOR_ID",
]
att_term_df = pd.read_csv(
    "./data/RegulonDB10/attenuator_terminator.txt",
    sep="\t",
    comment='#',
    header=None,
)
att_term_df.columns = att_term_column_names
# A per-row range is intentionally not computed here (kept disabled):
# att_term_df["range"] = att_term_df.apply(lambda row: (row.A_TERMINATOR_POSLEFT, row.A_TERMINATOR_POSRIGHT), axis=1)
att_term_df.head()

Unnamed: 0,A_TERMINATOR_ID,A_TERMINATOR_TYPE,A_TERMINATOR_POSLEFT,A_TERMINATOR_POSRIGHT,A_TERMINATOR_ENERGY,A_TERMINATOR_SEQUENCE,A_TERMINATOR_ATTENUATOR_ID
0,ECK125143527,terminator,276,311,-20.0,aacacagaaaAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCg...,ECK125143526
1,ECK125143528,anti-terminator,244,288,-19.41,acaggtaacgGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAG...,ECK125143526
2,ECK125143529,anti-anti-terminator,200,255,-14.5,tgaaacgcatTAGCACCACCATTACCACCACCATCACCATTACCAC...,ECK125143526
3,ECK125143531,terminator,5042,5078,-19.3,atctcaatcaGGCCGGGTTTGCTTTTATGCAGCCCGGCTTTTTTAT...,ECK125143530
4,ECK125143532,anti-terminator,4979,5051,-11.9,ctgcccgccgATTTTGCTGCGTTGCGTAAATTGATGATGAATCATC...,ECK125143530


In [9]:
# Collapse the terminator rows of each attenuator into one row whose range
# spans from the smallest to the largest position of any of its terminators.
#
# Records are collected in a list and the frame is built once at the end:
# growing a DataFrame row by row is quadratic, and DataFrame.append no longer
# exists in pandas >= 2.0. Grouping by a scalar key (rather than a one-element
# list) keeps the group key a plain ID string on every pandas version.
att_term_records = []
for attenuator_id, terminator_rows in att_term_df.groupby("A_TERMINATOR_ATTENUATOR_ID"):
    terminator_positions = (
        list(terminator_rows["A_TERMINATOR_POSLEFT"])
        + list(terminator_rows["A_TERMINATOR_POSRIGHT"])
    )
    att_term_records.append({
        "RegulonDB ID": attenuator_id,
        "range": (min(terminator_positions), max(terminator_positions)),
    })
# Explicit columns keep the schema stable even if there are no groups at all.
new_att_term_df = pd.DataFrame(att_term_records, columns=["RegulonDB ID", "range"])
new_att_term_df.head()

Unnamed: 0,RegulonDB ID,range
0,ECK125143526,"(200, 311)"
1,ECK125143530,"(4979, 5078)"
2,ECK125143534,"(14134, 14155)"
3,ECK125143536,"(21166, 21255)"
4,ECK125143540,"(20912, 20982)"


In [10]:
# Cache the per-attenuator range table so other notebooks can load it directly.
att_term_pickle_path = "./data/att_term_df.pkl"
new_att_term_df.to_pickle(att_term_pickle_path)

In [11]:
# Cache the raw RegulonDB terminator table as-is; no column names are assigned,
# so the columns keep their default integer labels.
terminator_df = pd.read_csv(
    "./data/RegulonDB10/terminator.txt",
    sep="\t",
    comment='#',
    header=None,
)
terminator_df.to_pickle("./data/term_df.pkl")

In [12]:
# Load the RegulonDB transcription unit table (header-less, '#' lines are
# comments), name its columns by hand and cache it.
tu_column_names = [
    "TRANSCRIPTION_UNIT_ID",
    "PROMOTER_ID",
    "TRANSCRIPTION_UNIT_NAME",
    "OPERON_ID",
    "TRANSCRIPTION_UNIT_NOTE",
    "TU_INTERNAL_COMMENT",
    "KEY_ID_ORG",
]
TU_df = pd.read_csv(
    "./data/RegulonDB10/transcription_unit.txt",
    sep="\t",
    comment='#',
    header=None,
)
TU_df.columns = tu_column_names
TU_df.to_pickle("./data/TU_df.pkl")

In [13]:
# Load the RegulonDB gene table (header-less, '#' lines are comments) and name
# its columns by hand.
gene_column_names = [
    "GENE_ID",
    "GENE_NAME",
    "GENE_POSLEFT",
    "GENE_POSRIGHT",
    "GENE_STRAND",
    "GENE_SEQUENCE",
    "GC_CONTENT",
    "CRI_SCORE",
    "GENE_NOTE",
    "GENE_INTERNAL_COMMENT",
    "KEY_ID_ORG",
    "GENE_TYPE",
]
gene_df = pd.read_csv(
    "./data/RegulonDB10/gene.txt",
    sep="\t",
    comment='#',
    header=None,
)
gene_df.columns = gene_column_names

def get_gene_range(row):
    """Return a gene's (left, right) positions as ints.

    `row` is indexed with "GENE_POSLEFT" and "GENE_POSRIGHT". If either
    position is missing (NA), an empty tuple is returned instead.
    """
    left_pos = row["GENE_POSLEFT"]
    right_pos = row["GENE_POSRIGHT"]
    if pd.isna(left_pos) or pd.isna(right_pos):
        return ()
    return (int(left_pos), int(right_pos))

# Annotate every gene with its (left, right) range; genes lacking a position
# get an empty tuple. The annotated table is then cached.
gene_ranges = gene_df.apply(get_gene_range, axis=1)
gene_df["range"] = gene_ranges
gene_df.to_pickle("./data/gene_df.pkl")


# Build the table of intergenic regions: the stretches of sequence that lie
# between consecutive genes once the genes are ordered along the genome.
df = gene_df.copy()
df = df.sort_values(by=["GENE_POSLEFT"])  # Ordering the genes according to their left-most position.
df = df.reset_index()

# Sweep over the ordered genes while remembering the right-most position that
# any gene seen so far covers, together with the gene that reaches it. A gene
# lying completely inside an earlier one never moves that boundary, so:
#   * the gap after the enclosing gene is still reported, and
#   * no region is reported that actually lies inside the enclosing gene.
# Skipping the nested gene through the loop iterator got both cases wrong and
# raised StopIteration when the last gene was nested.
intergenic_records = []
covered_right = None   # right-most covered position so far
boundary_name = None   # GENE_NAME of the gene that ends at covered_right
boundary_id = None     # GENE_ID of the gene that ends at covered_right
for gene_name, gene_id, gene_left, gene_right in zip(
        df["GENE_NAME"], df["GENE_ID"], df["GENE_POSLEFT"], df["GENE_POSRIGHT"]):
    # Genes without coordinates cannot bound an intergenic region.
    if pd.isna(gene_left) or pd.isna(gene_right):
        continue
    # A gap of at least one position separates this gene from everything
    # before it; directly adjacent genes (difference of 1) have no gap.
    if covered_right is not None and (gene_left - covered_right) > 1:
        intergenic_records.append({
            "name": str(boundary_name) + '/' + str(gene_name),
            "RegulonDB ID": boundary_id + '/' + gene_id,
            "range": (int(covered_right + 1), int(gene_left - 1)),
        })
    # Only a gene that extends further right becomes the new boundary.
    if covered_right is None or gene_right > covered_right:
        covered_right = gene_right
        boundary_name = gene_name
        boundary_id = gene_id

# Build the frame once from the collected records; growing a DataFrame row by
# row is quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
intergenic_df = pd.DataFrame(intergenic_records, columns=["name", "RegulonDB ID", "range"])

intergenic_df.to_pickle("./data/intergenic_region_df.pkl")

In [14]:
# Load the RegulonDB transcription unit objects table (header-less, '#' lines
# are comments), name its columns by hand and cache it.
tu_objects_column_names = [
    "TRANSCRIPTION_UNIT_ID",
    "NUMTU",
    "TU_POSLEFT",
    "TU_POSRIGHT",
    "TU_TYPE",
    "TU_OBJECT_CLASS",
    "TU_OBJECT_ID",
    "TU_OBJECT_NAME",
    "TU_OBJECT_POSLEFT",
    "TU_OBJECT_POSRIGHT",
    "TU_OBJECT_STRAND",
    "TU_OBJECT_COLORCLASS",
    "TU_OBJECT_DESCRIPTION",
    "TU_OBJECT_SIGMA",
    "TU_OBJECT_EVIDENCE",
    "TU_OBJECT_RI_TYPE",
    "TU_OBJECT_TYPE",
    "EVIDENCE",
]

TU_objects_df = pd.read_csv(
    "./data/RegulonDB10/tu_objects_tmp.txt",
    sep="\t",
    comment='#',
    header=None,
)
TU_objects_df.columns = tu_objects_column_names

TU_objects_df.to_pickle("./data/TU_objects_df.pkl")

In [15]:
# Load the RegulonDB object synonym table (header-less, '#' lines are
# comments), name its columns by hand and cache it.
synonym_column_names = [
    "OBJECT_ID",
    "OBJECT_SYNONYM_NAME",
    "OS_INTERNAL_COMMENT",
    "KEY_ID_ORG",
]
gene_synonym_df = pd.read_csv(
    "./data/RegulonDB10/object_synonym.txt",
    sep="\t",
    comment='#',
    header=None,
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are read as plain text
)
gene_synonym_df.columns = synonym_column_names
gene_synonym_df.to_pickle("./data/gene_synonym_df.pkl")
gene_synonym_df.head()

Unnamed: 0,OBJECT_ID,OBJECT_SYNONYM_NAME,OS_INTERNAL_COMMENT,KEY_ID_ORG
0,ECK120000001,EG10001,,ECK12
1,ECK120000001,ECK4045,,ECK12
2,ECK120000001,b4053,,ECK12
3,ECK120000001,alr5,,ECK12
4,ECK120000002,b0764,,ECK12


In [16]:
# Load the PATRIC pathway table; this file carries its own header row.
gene_pathway_df = pd.read_csv("./data/pathways/511145.12.PATRIC.pathway.tab", sep="\t")

# Some genes have several rows that differ only in the ec_number and
# ec_description columns, which can therefore create duplicate entries when
# pathways are associated with genes. Those columns (and gene / product) are
# nevertheless kept; only the columns listed here are dropped.
pathway_columns_to_drop = [
    'genome_id',
    'genome_name',
    'patric_id',
    'alt_locus_tag',
    # 'gene',
    # 'product',
    # 'ec_number',
    # 'ec_description',
    'pathway_id',
]
gene_pathway_df = gene_pathway_df.drop(columns=pathway_columns_to_drop)
# De-duplication is currently disabled:
# gene_pathway_df = gene_pathway_df.drop_duplicates()
gene_pathway_df.to_pickle("./data/gene_pathway_df.pkl")
gene_pathway_df.head()

# display(gene_pathway_df[gene_pathway_df["refseq_locus_tag"]=="b3650"], gene_pathway_df[gene_pathway_df["gene"]=="spoT"])

Unnamed: 0,refseq_locus_tag,gene,product,ec_number,ec_description,pathway_name
0,b0002,thrA,Aspartokinase (EC 2.7.2.4) / Homoserine dehydr...,1.1.1.3,Homoserine dehydrogenase,"Glycine, serine and threonine metabolism"
1,b0002,thrA,Aspartokinase (EC 2.7.2.4) / Homoserine dehydr...,1.1.1.3,Homoserine dehydrogenase,Cysteine and methionine metabolism
2,b0002,thrA,Aspartokinase (EC 2.7.2.4) / Homoserine dehydr...,1.1.1.3,Homoserine dehydrogenase,Lysine biosynthesis
3,b0002,thrA,Aspartokinase (EC 2.7.2.4) / Homoserine dehydr...,2.7.2.4,Aspartate kinase,"Glycine, serine and threonine metabolism"
4,b0002,thrA,Aspartokinase (EC 2.7.2.4) / Homoserine dehydr...,2.7.2.4,Aspartate kinase,Cysteine and methionine metabolism


In [17]:
# Load the COG annotations, drop the identifier columns that are not needed
# and remove the duplicate rows this leaves behind.
COG_df = pd.read_csv("./data/cogs_ecoli_mg1655.csv", encoding = "ISO-8859-1")
cog_columns_to_drop = ["GI", "COG ID", "COG name", "COG category"]
COG_df = COG_df.drop(columns=cog_columns_to_drop).drop_duplicates()

# "Function unknown" COG entries are ignored, together with their mutations,
# in the statistical work and the mutflow visualizations, so drop them here.
has_known_function = COG_df["COG description"] != "Function unknown"
COG_df = COG_df[has_known_function]
COG_df.to_pickle("./data/COG_df.pkl")
display(len(COG_df), COG_df.head())

3441

Unnamed: 0,gene,locus,COG category primary,COG class,COG description
0,thrA,b0002,E,METABOLISM,Amino acid transport and metabolism
1,metL,b3940,E,METABOLISM,Amino acid transport and metabolism
4,lysC,b4024,E,METABOLISM,Amino acid transport and metabolism
5,thrB,b0003,E,METABOLISM,Amino acid transport and metabolism
6,thrC,b0004,E,METABOLISM,Amino acid transport and metabolism


In [18]:
# Load the RegulonDB binding site set. The file has no header row and no
# column names are assigned, so the columns keep their default integer labels.
tfbs_df = pd.read_csv(
    "./data/RegulonDBwebsite10/BindingSiteSet.txt",
    sep="\t",
    comment='#',
    header=None,
)

# NOTE(review): this helper is defined here but is not applied to tfbs_df
# anywhere in this cell; the table is cached without a "range" column.
def get_TFBS_range(tfbs_df_row):
    """Return the (left, right) positions of a binding-site row as ints.

    The positions are read from items 3 and 4 of `tfbs_df_row`. If either
    one is missing (NA), an empty tuple is returned instead.
    """
    left_pos = tfbs_df_row[3]
    right_pos = tfbs_df_row[4]
    if pd.isna(left_pos) or pd.isna(right_pos):
        return ()
    return (int(left_pos), int(right_pos))

# Cache the binding-site table, then show its shape and its first rows.
tfbs_pickle_path = "./data/TFBS_df.pkl"
tfbs_df.to_pickle(tfbs_pickle_path)
tfbs_preview = tfbs_df.head()
display(tfbs_df.shape, tfbs_preview)

(3562, 14)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,ECK120015994,AcrR,ECK125258528,485709,485732,reverse,ECK120033472,acrAB,-,acrAp,-22.5,gcgttagattTACATACATTTGTGAATGTATGTAccatagcacg,"[BCE|W|Binding of cellular extracts],[GEA|W|Ge...",Strong
1,ECK120015994,AcrR,ECK125258528,485709,485732,forward,ECK125134945,acrR,-,acrRp,22.5,cgtgctatggTACATACATTCACAAATGTATGTAaatctaacgc,"[BCE|W|Binding of cellular extracts],[GEA|W|Ge...",Strong
2,ECK120015994,AcrR,ECK125202663,1619048,1619058,forward,ECK125202664,marRAB,-,marRp,-40.5,catcggtcaaTTCATTCATTtgacttatac,"[GEA|W|Gene expression analysis],[BPP|S|Bindin...",Strong
3,ECK120015994,AcrR,ECK125242724,1978422,1978432,reverse,ECK125242725,flhDC,-,flhDp,-31.5,tcactacacgCACATACAACggaggggggc,"[GEA|W|Gene expression analysis],[HIBSCS|W|Hum...",Weak
4,ECK120015994,AcrR,ECK120035040,2313112,2313135,forward,ECK120035041,micF,-,micFp,41.0,atttattaccGTCATTCATTTCTGAATGTCTGTTtacccctatt,[AIBSCS|W|Automated inference based on similar...,Weak


In [19]:
# Cache the raw RegulonDB promoter table as-is; no column names are assigned,
# so the columns keep their default integer labels.
promoter_df = pd.read_csv(
    "./data/RegulonDB10/promoter.txt",
    sep="\t",
    comment='#',
    header=None,
)
promoter_df.to_pickle("./data/promoter_df.pkl")