In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import pandas as pd
import seaborn as sns
from IPython.display import display
import os, sys, itertools, csv
# Show up to 100 columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = 100

# Put the parent directory on sys.path (once) so the project-local `util`
# package imported below can be resolved from this notebook's location.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from util.genome import get_feature_hit_set, is_overlap, NON_K12_EXP_L
from util.operon import get_operon_name_set, get_operon_ID_set
from util.feature import get_feat_d_from_ID

In [2]:
# Load the pickled mutation table and show its shape plus the first rows.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
all_muts_df = pd.read_pickle("./data/2_2_1_df.pkl")
display(all_muts_df.shape, all_muts_df.head())

(5676, 28)

Unnamed: 0,index,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,sample,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter,RBS,attenuator terminator,terminator,genetic,genomic features,genetic feature links
0,9,42C,1,124,1,1,1.0,702352,DEL,Δ21 bp,coding (380‑400/1149 nt),nagA,,1 124 1 1,True,"(702352, 702372)",{ECK120000625},"[{'name': 'nagA', 'RegulonDB ID': 'ECK12000062...",False,False,{},{},{},{},{},True,"[{'name': 'nagA', 'RegulonDB ID': 'ECK12000062...",{'ECK120000625': ['ECK120000625']}
1,10,42C,1,124,1,1,1.0,1308318,SNP,G→C,R110G (CGT→GGT),clsA,,1 124 1 1,True,"(1308318, 1308318)",{ECK120001556},"[{'name': 'clsA', 'RegulonDB ID': 'ECK12000155...",False,False,{},{},{},{},{},True,"[{'name': 'clsA', 'RegulonDB ID': 'ECK12000155...",{'ECK120001556': ['ECK120001556']}
2,11,42C,1,124,1,1,1.0,3815859,DEL,Δ82 bp,,rph,,1 124 1 1,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791},{},True,"[{'name': 'rph-pyrE attenuator terminator', 'R...","{'ECK120000854': ['ECK120000854'], 'ECK1200007..."
3,12,42C,1,124,1,1,1.0,4187550,SNP,C→T,A734V (GCG→GTG),rpoC,,1 124 1 1,True,"(4187550, 4187550)",{ECK120000886},"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088...",False,False,{},{},{},{},{},True,"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088...",{'ECK120000886': ['ECK120000886']}
4,13,42C,1,124,1,1,1.0,4400313,SNP,A→C,D9A (GAT→GCT),hfq,,1 124 1 1,True,"(4400313, 4400313)",{ECK120000431},"[{'name': 'hfq', 'RegulonDB ID': 'ECK120000431...",False,False,{},{},{},{},{},True,"[{'name': 'hfq', 'RegulonDB ID': 'ECK120000431...",{'ECK120000431': ['ECK120000431']}


In [3]:
# Load the pickled lookup tables used by the operon annotation functions below
# (presumably RegulonDB exports: operons, RBSs and TUs -- confirm provenance).
operon_df = pd.read_pickle("./data/operon_df.pkl")
RBS_df = pd.read_pickle("./data/RBS_df.pkl")
TU_df = pd.read_pickle("./data/TU_df.pkl")
TU_objects_df = pd.read_pickle("./data/TU_objects_df.pkl")
# Add a "TU range" column holding a (TU_POSLEFT, TU_POSRIGHT) tuple per row.
TU_objects_df["TU range"] = TU_objects_df.apply(
    lambda row: (row["TU_POSLEFT"], row["TU_POSRIGHT"]), axis=1)

In [4]:
def _operon_annot_d(op):
    """Build the annotation dict (name, RegulonDB ID, range) for one operon_df row."""
    return {
        "name": op["OPERON_NAME"],
        "RegulonDB ID": op["OPERON_ID"],
        "range": (op["REGULATIONPOSLEFT"], op["REGULATIONPOSRIGHT"])}


def get_operons(mut_row, operon_df):
    """Collect annotations for the operons hit by one mutation.

    Each entry of ``mut_row["genomic features"]`` is handled in one of two ways:

    * "RegulonDB ID" contains no '/': the operons are looked up with
      ``get_operon_ID_set``. For features whose "feature type" is "RBS" the
      gene ID found in ``RBS_df`` (matched on SHINE_DALGARNO_ID) is used for
      the lookup instead of the feature's own ID.
    * "RegulonDB ID" contains a '/' (per the original notes these are
      intergenic regions -- confirm): every operon whose "range" overlaps the
      feature's "range" according to ``is_overlap`` is annotated.

    Parameters
    ----------
    mut_row : row of the mutation table
        Must expose ``exp`` and the "genomic features" list of feature dicts
        (keys used here: "RegulonDB ID", "feature type", "range").
    operon_df : pandas.DataFrame
        Operon table with the columns OPERON_ID, OPERON_NAME,
        REGULATIONPOSLEFT, REGULATIONPOSRIGHT and "range". It is only read;
        no temporary column is written to it.

    Returns
    -------
    list of dict or None
        One ``{"name", "RegulonDB ID", "range"}`` dict per operon hit (an
        operon can appear more than once if several features map to it).
        ``None`` is returned when ``mut_row.exp`` is in ``NON_K12_EXP_L``.

    Raises
    ------
    IndexError
        If an RBS feature has no matching row in ``RBS_df``.
    """
    if mut_row.exp in NON_K12_EXP_L:
        # Experiments listed in NON_K12_EXP_L are not annotated; callers get
        # None (not an empty list) for these rows.
        return None

    annots = []
    # 0) for each genomic feature mutated by this individual mutation
    for geno_feat_d in mut_row["genomic features"]:
        if '/' not in geno_feat_d["RegulonDB ID"]:
            # 1) get the operons for the feature using the explicit
            #    relationship from RegulonDB.
            RegDB_id_to_use_for_op = geno_feat_d["RegulonDB ID"]
            if geno_feat_d["feature type"] == "RBS":
                rbs_match = RBS_df[RBS_df["SHINE_DALGARNO_ID"] == geno_feat_d["RegulonDB ID"]]
                RegDB_id_to_use_for_op = rbs_match["GENE_ID"].iloc[0]

            op_ID_set = get_operon_ID_set(RegDB_id_to_use_for_op, TU_objects_df, TU_df, operon_df)
            # 2) annotate each operon the feature is involved in individually.
            for op_ID in op_ID_set:
                op_df = operon_df[operon_df["OPERON_ID"] == op_ID]
                for _, op in op_df.iterrows():
                    annots.append(_operon_annot_d(op))
        else:
            # No explicit operon relationship is used for these features, so
            # fall back to a positional overlap test between the feature range
            # and every operon range. The mask is kept as a standalone Series
            # so the caller's operon_df is never modified.
            hit_mask = operon_df["range"].apply(is_overlap, args=(geno_feat_d["range"],))
            for _, op in operon_df[hit_mask].iterrows():
                annots.append(_operon_annot_d(op))

    return annots
    

# Run get_operons on every mutation row and store the result in a new
# "operons" column, then preview the first rows.
all_muts_df["operons"] = all_muts_df.apply(lambda r: get_operons(r, operon_df), axis=1)
all_muts_df.head()

Unnamed: 0,index,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,sample,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter,RBS,attenuator terminator,terminator,genetic,genomic features,genetic feature links,operons
0,9,42C,1,124,1,1,1.0,702352,DEL,Δ21 bp,coding (380‑400/1149 nt),nagA,,1 124 1 1,True,"(702352, 702372)",{ECK120000625},"[{'name': 'nagA', 'RegulonDB ID': 'ECK12000062...",False,False,{},{},{},{},{},True,"[{'name': 'nagA', 'RegulonDB ID': 'ECK12000062...",{'ECK120000625': ['ECK120000625']},"[{'name': 'nagBAC-umpH', 'RegulonDB ID': 'ECK1..."
1,10,42C,1,124,1,1,1.0,1308318,SNP,G→C,R110G (CGT→GGT),clsA,,1 124 1 1,True,"(1308318, 1308318)",{ECK120001556},"[{'name': 'clsA', 'RegulonDB ID': 'ECK12000155...",False,False,{},{},{},{},{},True,"[{'name': 'clsA', 'RegulonDB ID': 'ECK12000155...",{'ECK120001556': ['ECK120001556']},"[{'name': 'clsA-yciU', 'RegulonDB ID': 'ECK125..."
2,11,42C,1,124,1,1,1.0,3815859,DEL,Δ82 bp,,rph,,1 124 1 1,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791},{},True,"[{'name': 'rph-pyrE attenuator terminator', 'R...","{'ECK120000854': ['ECK120000854'], 'ECK1200007...","[{'name': 'rph-pyrE', 'RegulonDB ID': 'ECK1200..."
3,12,42C,1,124,1,1,1.0,4187550,SNP,C→T,A734V (GCG→GTG),rpoC,,1 124 1 1,True,"(4187550, 4187550)",{ECK120000886},"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088...",False,False,{},{},{},{},{},True,"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088...",{'ECK120000886': ['ECK120000886']},"[{'name': 'rplKAJL-rpoBC', 'RegulonDB ID': 'EC..."
4,13,42C,1,124,1,1,1.0,4400313,SNP,A→C,D9A (GAT→GCT),hfq,,1 124 1 1,True,"(4400313, 4400313)",{ECK120000431},"[{'name': 'hfq', 'RegulonDB ID': 'ECK120000431...",False,False,{},{},{},{},{},True,"[{'name': 'hfq', 'RegulonDB ID': 'ECK120000431...",{'ECK120000431': ['ECK120000431']},[{'name': 'yjeF-tsaE-amiB-mutL-miaA-hfq-hflXKC...


In [5]:
# TODO: combine this with the get_operons() function above

def _get_genetic_target_d_from_ID(gen_targ_ID, mut_row):
    gen_targ_d = dict()
    for d in mut_row["genetic features"]:
        if d["RegulonDB ID"] == gen_targ_ID:
            gen_targ_d = d
            break
    return gen_targ_d


def get_operon_genetic_link_d(mut_row, operon_df):
    """Map each operon hit by a mutation to the genetic features that link to it.

    Iterates ``mut_row["genetic feature links"]`` (genetic feature ID ->
    iterable of genomic feature IDs). Every genomic feature is resolved to its
    feature dict with ``get_feat_d_from_ID`` and then assigned to operons:

    * "RegulonDB ID" contains no '/': operons come from ``get_operon_ID_set``.
      For features whose "feature type" is "RBS" the gene ID found in
      ``RBS_df`` (matched on SHINE_DALGARNO_ID) is used for the lookup.
    * "RegulonDB ID" contains a '/' (per the original notes these are
      intergenic regions -- confirm): operons whose "range" overlaps the
      feature's "range" according to ``is_overlap`` are used.

    Parameters
    ----------
    mut_row : row of the mutation table
        Must expose ``exp`` and "genetic feature links", plus whatever
        ``get_feat_d_from_ID`` reads from the row.
    operon_df : pandas.DataFrame
        Operon table with at least the OPERON_ID and "range" columns. It is
        only read; no temporary column is written to it.

    Returns
    -------
    dict or None
        ``{operon ID: [genetic feature ID, ...]}``. A genetic feature ID is
        appended once per genomic feature that leads to the operon, so
        repeats are possible. ``None`` is returned when ``mut_row.exp`` is in
        ``NON_K12_EXP_L``.

    Raises
    ------
    IndexError
        If an RBS feature has no matching row in ``RBS_df``.
    """
    if mut_row.exp in NON_K12_EXP_L:
        # Experiments listed in NON_K12_EXP_L are not annotated; callers get
        # None (not an empty dict) for these rows.
        return None

    op_gen_targ_d = dict()
    # 0) for each genetic feature mutated by this individual mutation
    #    (assumes all genetic features of the mutation are defined here)
    for gen_targ_ID, feat_ID_set in mut_row["genetic feature links"].items():
        # 1) for each genomic feature linked to that genetic feature
        for feat_ID in feat_ID_set:
            # 2) get the dict for this genomic feature because we want its
            #    RegulonDB ID.
            feat_d = get_feat_d_from_ID(feat_ID, mut_row)

            if '/' not in feat_d["RegulonDB ID"]:
                RegDB_id_to_use_for_op = feat_d["RegulonDB ID"]
                if feat_d["feature type"] == "RBS":
                    rbs_match = RBS_df[RBS_df["SHINE_DALGARNO_ID"] == feat_d["RegulonDB ID"]]
                    RegDB_id_to_use_for_op = rbs_match["GENE_ID"].iloc[0]

                # 3) get the operons for the genomic feature using the
                #    explicit relationship from RegulonDB.
                op_ID_set = get_operon_ID_set(RegDB_id_to_use_for_op, TU_objects_df, TU_df, operon_df)
                # 4) record the genetic feature under every operon found.
                for op_ID in op_ID_set:
                    op_gen_targ_d.setdefault(op_ID, []).append(gen_targ_ID)
            else:
                # Positional fallback: link the genetic feature to every
                # operon whose range overlaps the feature range. The mask is
                # kept as a standalone Series so the caller's operon_df is
                # never modified.
                hit_mask = operon_df["range"].apply(is_overlap, args=(feat_d["range"],))
                for _, op in operon_df[hit_mask].iterrows():
                    op_gen_targ_d.setdefault(op["OPERON_ID"], []).append(gen_targ_ID)

    return op_gen_targ_d
    

# Run get_operon_genetic_link_d on every mutation row and store the result in
# a new "operon links" column, then preview the first rows.
all_muts_df["operon links"] = all_muts_df.apply(lambda r: get_operon_genetic_link_d(r, operon_df), axis=1)
all_muts_df.head()

Unnamed: 0,index,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,sample,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter,RBS,attenuator terminator,terminator,genetic,genomic features,genetic feature links,operons,operon links
0,9,42C,1,124,1,1,1.0,702352,DEL,Δ21 bp,coding (380‑400/1149 nt),nagA,,1 124 1 1,True,"(702352, 702372)",{ECK120000625},"[{'name': 'nagA', 'RegulonDB ID': 'ECK12000062...",False,False,{},{},{},{},{},True,"[{'name': 'nagA', 'RegulonDB ID': 'ECK12000062...",{'ECK120000625': ['ECK120000625']},"[{'name': 'nagBAC-umpH', 'RegulonDB ID': 'ECK1...",{'ECK125162049': ['ECK120000625']}
1,10,42C,1,124,1,1,1.0,1308318,SNP,G→C,R110G (CGT→GGT),clsA,,1 124 1 1,True,"(1308318, 1308318)",{ECK120001556},"[{'name': 'clsA', 'RegulonDB ID': 'ECK12000155...",False,False,{},{},{},{},{},True,"[{'name': 'clsA', 'RegulonDB ID': 'ECK12000155...",{'ECK120001556': ['ECK120001556']},"[{'name': 'clsA-yciU', 'RegulonDB ID': 'ECK125...",{'ECK125147415': ['ECK120001556']}
2,11,42C,1,124,1,1,1.0,3815859,DEL,Δ82 bp,,rph,,1 124 1 1,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791},{},True,"[{'name': 'rph-pyrE attenuator terminator', 'R...","{'ECK120000854': ['ECK120000854'], 'ECK1200007...","[{'name': 'rph-pyrE', 'RegulonDB ID': 'ECK1200...","{'ECK120014627': ['ECK120000854', 'ECK12000079..."
3,12,42C,1,124,1,1,1.0,4187550,SNP,C→T,A734V (GCG→GTG),rpoC,,1 124 1 1,True,"(4187550, 4187550)",{ECK120000886},"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088...",False,False,{},{},{},{},{},True,"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088...",{'ECK120000886': ['ECK120000886']},"[{'name': 'rplKAJL-rpoBC', 'RegulonDB ID': 'EC...",{'ECK120016992': ['ECK120000886']}
4,13,42C,1,124,1,1,1.0,4400313,SNP,A→C,D9A (GAT→GCT),hfq,,1 124 1 1,True,"(4400313, 4400313)",{ECK120000431},"[{'name': 'hfq', 'RegulonDB ID': 'ECK120000431...",False,False,{},{},{},{},{},True,"[{'name': 'hfq', 'RegulonDB ID': 'ECK120000431...",{'ECK120000431': ['ECK120000431']},[{'name': 'yjeF-tsaE-amiB-mutL-miaA-hfq-hflXKC...,{'ECK125147410': ['ECK120000431']}


In [6]:
# Persist the annotated mutation table (now including the "operons" and
# "operon links" columns) as a pickle file.
all_muts_df.to_pickle("./data/2_3_df.pkl")