In [1]:
import pandas as pd

The same feature tables are constantly being rebuilt throughout this pipeline, and some of them take a substantial amount of time to build and annotate. These tables should only have to be built once (as DataFrames), saved, and then simply loaded wherever they are needed.

# RBS

In [2]:
# Load the RegulonDB Shine-Dalgarno (RBS) table. It is read as tab-separated text
# with no header row and '#' as the comment character, so the column names are
# assigned explicitly afterwards.
rbs_column_names = [
    "SHINE_DALGARNO_ID",
    "GENE_ID",
    "SHINE_DALGARNO_DIST_GENE",
    "SHINE_DALGARNO_POSLEFT",
    "SHINE_DALGARNO_POSRIGHT",
    "SHINE_DALGARNO_SEQUENCE",
    "SHINE_DALGARNO_NOTE",
    "SD_INTERNAL_COMMENT",
    "KEY_ID_ORG",
]
RBS_df = pd.read_csv(
    "./data/RegulonDB10/shine_dalgarno.txt",
    sep="\t",
    comment="#",
    header=None,
)
RBS_df.columns = rbs_column_names

# "range" holds one (left position, right position) tuple per row.
RBS_df["range"] = list(
    zip(RBS_df["SHINE_DALGARNO_POSLEFT"], RBS_df["SHINE_DALGARNO_POSRIGHT"])
)
RBS_df.head()

Unnamed: 0,SHINE_DALGARNO_ID,GENE_ID,SHINE_DALGARNO_DIST_GENE,SHINE_DALGARNO_POSLEFT,SHINE_DALGARNO_POSRIGHT,SHINE_DALGARNO_SEQUENCE,SHINE_DALGARNO_NOTE,SD_INTERNAL_COMMENT,KEY_ID_ORG,range
0,ECK120014181,ECK120000266,-11,3151252,3151257,aaattacgcgCAGGATaatatccGAT,,,ECK12,"(3151252, 3151257)"
1,ECK120014182,ECK120000265,-9,3151991,3151996,acttgcgtccTGGAGAtacacAGT,,,ECK12,"(3151991, 3151996)"
2,ECK120014183,ECK120000496,-11,3957829,3957834,acgtcaacatCGAGGGctgtcccTGT,,,ECK12,"(3957829, 3957834)"
3,ECK120014184,ECK120000488,-10,3957957,3957962,cacaacatcaCGAGGAatcaccATG,,,ECK12,"(3957957, 3957962)"
4,ECK120014185,ECK120001215,-8,3469859,3469864,tttacgtcacAAGGGAttatAAT,,,ECK12,"(3469859, 3469864)"


In [3]:
# Persist the annotated RBS table so it can be loaded elsewhere instead of rebuilt.
pd.to_pickle(RBS_df, "./data/RBS_df.pkl")

# operons

In [4]:
# Load the RegulonDB operon table. It is read as tab-separated text with no header
# row and '#' as the comment character, so the column names are assigned explicitly.
operon_column_names = [
    "OPERON_ID",
    "OPERON_NAME",
    "FIRSTGENEPOSLEFT",
    "LASTGENEPOSRIGHT",
    "REGULATIONPOSLEFT",
    "REGULATIONPOSRIGHT",
    "OPERON_STRAND",
    "OPERON_INTERNAL_COMMENT",
    "KEY_ID_ORG",
]
operon_df = pd.read_csv(
    "./data/RegulonDB10/operon.txt",
    sep="\t",
    comment="#",
    header=None,
)
operon_df.columns = operon_column_names

# "range" is built from the REGULATIONPOS* columns (not the first/last gene
# positions): one (left, right) tuple per operon.
operon_df["range"] = list(
    zip(operon_df["REGULATIONPOSLEFT"], operon_df["REGULATIONPOSRIGHT"])
)

In [5]:
# Persist the annotated operon table so it can be loaded elsewhere instead of rebuilt.
pd.to_pickle(operon_df, "./data/operon_df.pkl")

In [6]:
# Collect every position that falls inside at least one operon's
# REGULATIONPOSLEFT..REGULATIONPOSRIGHT span. The size of this set is needed to
# derive the total "unknown" operon category length, following the same approach
# as the total "unknown" COG length (get_COG_lengths NB).
#
# The set is grown in place with update(): rebinding it to the result of
# set.union() on every iteration copies the whole accumulated set each time.
# Iterating the two columns directly also avoids building a Series per row
# the way iterrows() does.
all_op_pos_set = set()
for op_left, op_right in zip(
    operon_df["REGULATIONPOSLEFT"], operon_df["REGULATIONPOSRIGHT"]
):
    # The right endpoint is part of the span, hence the + 1.
    all_op_pos_set.update(range(op_left, op_right + 1))
display(len(all_op_pos_set))

4375748

In [7]:
# Positions 1..last_position that are not covered by any operon span count as the
# "unknown" operon category.
# NOTE(review): 4641653 is presumably the reference genome length in bp -- confirm
# it matches the sequence version the operon coordinates refer to.
last_position = 4641653
unknown_op_pos_set = set(range(1, last_position + 1)) - all_op_pos_set
unknown_op_nuc_len = len(unknown_op_pos_set)

# Single-row table: the "unknown" category and its length in positions.
op_len_df = pd.DataFrame(
    [{"operon description": "unknown", "length": unknown_op_nuc_len}],
    columns=["operon description", "length"],
)
op_len_df

Unnamed: 0,operon description,length
0,unknown,265905


In [8]:
# Persist the "unknown" operon length table so it can be loaded elsewhere.
pd.to_pickle(op_len_df, "./data/operon_length_df.pkl")

# attenuator terminator
The point of this notebook is to build those tables (ASW-1216).

In [9]:
# Load the RegulonDB attenuator terminator table. It is read as tab-separated text
# with no header row and '#' as the comment character, so the column names are
# assigned explicitly afterwards.
att_term_column_names = [
    "A_TERMINATOR_ID",
    "A_TERMINATOR_TYPE",
    "A_TERMINATOR_POSLEFT",
    "A_TERMINATOR_POSRIGHT",
    "A_TERMINATOR_ENERGY",
    "A_TERMINATOR_SEQUENCE",
    "A_TERMINATOR_ATTENUATOR_ID",
]
att_term_df = pd.read_csv(
    "./data/RegulonDB10/attenuator_terminator.txt",
    sep="\t",
    comment="#",
    header=None,
)
att_term_df.columns = att_term_column_names

# No per-row "range" column is added to this table.
att_term_df.head()

Unnamed: 0,A_TERMINATOR_ID,A_TERMINATOR_TYPE,A_TERMINATOR_POSLEFT,A_TERMINATOR_POSRIGHT,A_TERMINATOR_ENERGY,A_TERMINATOR_SEQUENCE,A_TERMINATOR_ATTENUATOR_ID
0,ECK125143527,terminator,276,311,-20.0,aacacagaaaAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCg...,ECK125143526
1,ECK125143528,anti-terminator,244,288,-19.41,acaggtaacgGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAG...,ECK125143526
2,ECK125143529,anti-anti-terminator,200,255,-14.5,tgaaacgcatTAGCACCACCATTACCACCACCATCACCATTACCAC...,ECK125143526
3,ECK125143531,terminator,5042,5078,-19.3,atctcaatcaGGCCGGGTTTGCTTTTATGCAGCCCGGCTTTTTTAT...,ECK125143530
4,ECK125143532,anti-terminator,4979,5051,-11.9,ctgcccgccgATTTTGCTGCGTTGCGTAAATTGATGATGAATCATC...,ECK125143530


In [10]:
# Collapse the terminator rows into one row per attenuator
# (A_TERMINATOR_ATTENUATOR_ID). Each attenuator's "range" is the smallest and the
# largest position found in either position column across its rows.
#
# The rows are collected in a list and the DataFrame is built once at the end:
# DataFrame.append copied the whole frame on every call and no longer exists in
# pandas 2.x. The groupby key is passed as a plain string so the group label stays
# a scalar ID; a one-element list key yields 1-tuples in pandas 2.x.
att_term_records = []
for attenuator_id, terminators in att_term_df.groupby("A_TERMINATOR_ATTENUATOR_ID"):
    positions = list(terminators["A_TERMINATOR_POSLEFT"]) + list(
        terminators["A_TERMINATOR_POSRIGHT"]
    )
    att_term_records.append(
        {"RegulonDB ID": attenuator_id, "range": (min(positions), max(positions))}
    )
# Explicit columns keep the schema stable even if there are no groups at all.
new_att_term_df = pd.DataFrame(att_term_records, columns=["RegulonDB ID", "range"])
new_att_term_df.head()

Unnamed: 0,RegulonDB ID,range
0,ECK125143526,"(200, 311)"
1,ECK125143530,"(4979, 5078)"
2,ECK125143534,"(14134, 14155)"
3,ECK125143536,"(21166, 21255)"
4,ECK125143540,"(20912, 20982)"


In [11]:
# Persist the per-attenuator range table so it can be loaded elsewhere.
pd.to_pickle(new_att_term_df, "./data/att_term.pkl")