The mapping between genetic and genomic features is computed separately from the annotation of the features themselves. Genetic features have to be annotated before genomic features, but the genomic features are needed to build the mapping between genomic and genetic features, so this step has to come after both annotation steps. In summary, the necessary order is:  
1) genetic features  
2) genomic features  
3) mapping between genomic and genetic features

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import pandas as pd
import seaborn as sns
import copy
from IPython.display import display
import os, sys, itertools, csv
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from util.gene import get_coding_genetic_target_len_d, get_intergenic_len_d
from util.genome import get_feature_hit_set
pd.options.display.max_columns = 100

In [2]:
# Load the pickled mutation table and show its shape and first rows.
all_muts_df = pd.read_pickle("./data/2_2_df.pkl")
display(all_muts_df.shape, all_muts_df.head())

(5676, 27)

Unnamed: 0,index,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,sample,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter,RBS,attenuator terminator,terminator,genetic,genomic features
0,9,42C,1,124,1,1,1.0,702352,DEL,Δ21 bp,coding (380‑400/1149 nt),nagA,,1 124 1 1,True,"(702352, 702372)",{ECK120000625},"[{'name': 'nagA', 'RegulonDB ID': 'ECK12000062...",False,False,{},{},{},{},{},True,"[{'name': 'nagA', 'RegulonDB ID': 'ECK12000062..."
1,10,42C,1,124,1,1,1.0,1308318,SNP,G→C,R110G (CGT→GGT),clsA,,1 124 1 1,True,"(1308318, 1308318)",{ECK120001556},"[{'name': 'clsA', 'RegulonDB ID': 'ECK12000155...",False,False,{},{},{},{},{},True,"[{'name': 'clsA', 'RegulonDB ID': 'ECK12000155..."
2,11,42C,1,124,1,1,1.0,3815859,DEL,Δ82 bp,,rph,,1 124 1 1,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791},{},True,"[{'name': 'rph-pyrE attenuator terminator', 'R..."
3,12,42C,1,124,1,1,1.0,4187550,SNP,C→T,A734V (GCG→GTG),rpoC,,1 124 1 1,True,"(4187550, 4187550)",{ECK120000886},"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088...",False,False,{},{},{},{},{},True,"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088..."
4,13,42C,1,124,1,1,1.0,4400313,SNP,A→C,D9A (GAT→GCT),hfq,,1 124 1 1,True,"(4400313, 4400313)",{ECK120000431},"[{'name': 'hfq', 'RegulonDB ID': 'ECK120000431...",False,False,{},{},{},{},{},True,"[{'name': 'hfq', 'RegulonDB ID': 'ECK120000431..."


In [3]:
# # # DEBUG
# all_muts_df = all_muts_df[all_muts_df.exp=="SSW_GLU_XYL"].copy()

In [4]:
def get_genetic_links_d(mut_row):
    """Link every genetic feature of one mutation row to genomic features.

    The row's "genomic features" are loaded into a DataFrame once; each
    genetic feature's "range" is then matched against that frame with
    ``get_feature_hit_set``. The linking is done by ranges only, unlike other
    mappings, since there is no explicit mapping between genomic and genetic
    features.

    Returns a dict keyed by the genetic feature's "RegulonDB ID"; each value
    is the list form of the hit set that ``get_feature_hit_set`` returns for
    the "RegulonDB ID" column.
    """
    genomic_features = pd.DataFrame(mut_row["genomic features"])
    return {
        genetic_feature["RegulonDB ID"]: list(
            get_feature_hit_set(
                genetic_feature["range"], genomic_features, "range", "RegulonDB ID"
            )
        )
        for genetic_feature in mut_row["genetic features"]
    }


# Compute the genetic -> genomic feature links row by row and store them in a
# new "genetic feature links" column.
all_muts_df["genetic feature links"] = all_muts_df.apply(get_genetic_links_d, axis=1)
all_muts_df.head()

Unnamed: 0,index,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,sample,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter,RBS,attenuator terminator,terminator,genetic,genomic features,genetic feature links
0,9,42C,1,124,1,1,1.0,702352,DEL,Δ21 bp,coding (380‑400/1149 nt),nagA,,1 124 1 1,True,"(702352, 702372)",{ECK120000625},"[{'name': 'nagA', 'RegulonDB ID': 'ECK12000062...",False,False,{},{},{},{},{},True,"[{'name': 'nagA', 'RegulonDB ID': 'ECK12000062...",{'ECK120000625': ['ECK120000625']}
1,10,42C,1,124,1,1,1.0,1308318,SNP,G→C,R110G (CGT→GGT),clsA,,1 124 1 1,True,"(1308318, 1308318)",{ECK120001556},"[{'name': 'clsA', 'RegulonDB ID': 'ECK12000155...",False,False,{},{},{},{},{},True,"[{'name': 'clsA', 'RegulonDB ID': 'ECK12000155...",{'ECK120001556': ['ECK120001556']}
2,11,42C,1,124,1,1,1.0,3815859,DEL,Δ82 bp,,rph,,1 124 1 1,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791},{},True,"[{'name': 'rph-pyrE attenuator terminator', 'R...","{'ECK120000854': ['ECK125144791', 'ECK12000085..."
3,12,42C,1,124,1,1,1.0,4187550,SNP,C→T,A734V (GCG→GTG),rpoC,,1 124 1 1,True,"(4187550, 4187550)",{ECK120000886},"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088...",False,False,{},{},{},{},{},True,"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088...",{'ECK120000886': ['ECK120000886']}
4,13,42C,1,124,1,1,1.0,4400313,SNP,A→C,D9A (GAT→GCT),hfq,,1 124 1 1,True,"(4400313, 4400313)",{ECK120000431},"[{'name': 'hfq', 'RegulonDB ID': 'ECK120000431...",False,False,{},{},{},{},{},True,"[{'name': 'hfq', 'RegulonDB ID': 'ECK120000431...",{'ECK120000431': ['ECK120000431']}


In [5]:
# Removing genetic features should only need to be implemented once, in the helper below.
def filter_feats(feat_json, feats_to_remove):
    """Return a copy of ``feat_json`` without the features listed for removal.

    Args:
        feat_json: list of feature dicts, each carrying a "RegulonDB ID" key.
        feats_to_remove: iterable of "RegulonDB ID" values to drop.

    Returns:
        A new list holding the remaining feature dicts in their original
        order. ``feat_json`` itself is never modified.
    """
    ids_to_drop = set(feats_to_remove)
    if not ids_to_drop:
        # Nothing to filter: hand back a shallow copy of the input.
        return feat_json.copy()
    # Single pass with O(1) membership tests. Unlike repeated ``list.remove``
    # calls, this cannot raise ValueError when an ID is listed more than once.
    return [feat for feat in feat_json if feat["RegulonDB ID"] not in ids_to_drop]


# Sanity check: removing ID '2' from two features must leave only g1.
# i = input, e = expected output, o = observed output.
i = [{
    'name': 'g1',
    'RegulonDB ID': '1',
    'range': (1, 2),
    'feature type': 'gene'},
    {'name': 'g2',
     'RegulonDB ID': '2',
     'range': (2, 3),
     'feature type': 'gene'}]
e = [{'name': 'g1', 'RegulonDB ID': '1', 'range': (1, 2), 'feature type': 'gene'}]
o = filter_feats(i, {'2'})
assert(o == e)

In [6]:
# Load the gene table. Its GENE_ID column is used by the filters below to tell
# genes apart from other linked features.
gene_df = pd.read_pickle("./data/gene_df.pkl")
gene_df.head()

Unnamed: 0,GENE_ID,GENE_NAME,GENE_POSLEFT,GENE_POSRIGHT,GENE_STRAND,GENE_SEQUENCE,GC_CONTENT,CRI_SCORE,GENE_NOTE,GENE_INTERNAL_COMMENT,KEY_ID_ORG,GENE_TYPE,range
0,ECK120000001,alr,4265782.0,4266861.0,forward,ATGCAAGCGGCAACTGTTGTGATTAACCGCCGCGCTCTGCGACACA...,55.93,,,,ECK12,,"(4265782, 4266861)"
1,ECK120000002,modB,795862.0,796551.0,forward,ATGATACTGACCGATCCAGAATGGCAGGCAGTTTTATTAAGCCTGA...,54.06,,,,ECK12,,"(795862, 796551)"
2,ECK120000003,cysZ,2531463.0,2532224.0,forward,ATGGTTTCATCATTCACATCTGCCCCACGCAGCGGTTTTTACTATT...,50.13,,,,ECK12,,"(2531463, 2532224)"
3,ECK120000004,dfp,3812731.0,3813951.0,forward,ATGAGCCTGGCCGGTAAAAAAATCGTTCTCGGCGTTAGCGGCGGTA...,53.64,,,,ECK12,,"(3812731, 3813951)"
4,ECK120000005,dcuB,4347404.0,4348744.0,reverse,ATGTTATTTACTATCCAACTTATCATAATACTGATATGTCTGTTTT...,52.27,,,,ECK12,,"(4347404, 4348744)"


In [7]:
# fix for ASW-1628
# remove links to coding regions for non-coding features that overlap in both coding and non-coding regions
# assuming that all overlapping features are non-coding.


# We want to filter non-coding features out of the coding links.
# The overlap of a cis-reg feature with both an intergenic and a coding region is real,
# though it's not the relationship we want to model.
def _filter_coding_noncoding_overlapping_links(gen_link_d):
    ret_d = copy.deepcopy(gen_link_d)
    gen_feats_to_remove = set()

    # using these for iteration
    coding_links = dict()
    noncoding_links = dict()
    for k, v in gen_link_d.items():
        if '/' in k:
            noncoding_links[k] = v
        else:
            coding_links[k] = v

    # changing the given gen_link_d to return
    for ncl_geno_feats in noncoding_links.values():
        for ncl_geno_feat in ncl_geno_feats:
            # There's the potential to filter for overlapping genes here as well
            for cl_gen_feat, cl_geno_feats in coding_links.items():
                # Check if non-coding feature linked with intergenic region (ncl_geno_feats) overlaps (linked) with coding region.
                if ncl_geno_feat not in set(gene_df["GENE_ID"]):  # Ensuring feat to filter is cis-reg, otherwise don't filter
                    if ncl_geno_feat in cl_geno_feats:                    
                        if len(cl_geno_feats) == 1:
                            gen_feats_to_remove.add(cl_gen_feat)
                            del ret_d[cl_gen_feat]
                        else:
                            # remove geno_feat link to the coding gen_feat (ncl_geno_feat == to feat in cl_geno_feats that want to remove)
                            if ncl_geno_feat in ret_d[cl_gen_feat]:
                                ret_d[cl_gen_feat].remove(ncl_geno_feat)

    return ret_d, gen_feats_to_remove


# Case 1: the only feature linked to the intergenic region is a gene
# (ECK120001080), so this filter is expected to leave everything unchanged.
i = {'ECK120001080': ['ECK125229290', 'ECK125229291', 'ECK120001080'],
     'ECK125229290': ['ECK125229290', 'ECK125229291', 'ECK120001080'],
     'ECK125229291': ['ECK125229290', 'ECK125229291', 'ECK120001080'],
     'ECK125229291/ECK125256962': ['ECK120001080']}  # This link should be removed since ECK120001080 == crl gene, though that is done in a filtering step below
e = {'ECK120001080': ['ECK125229290', 'ECK125229291', 'ECK120001080'],
     'ECK125229290': ['ECK125229290', 'ECK125229291', 'ECK120001080'],
     'ECK125229291': ['ECK125229290', 'ECK125229291', 'ECK120001080'],
     'ECK125229291/ECK125256962': ['ECK120001080']}
o, _ = _filter_coding_noncoding_overlapping_links(i)
assert(o == e)

# Case 2: a non-gene feature linked to both the coding and the intergenic
# region is expected to be removed from the coding link only.
i = {'ECK120000854': ['ECK120000854', 'ECK125144791'],
     'ECK120000799/ECK120000854': ['ECK125144791']}
e = {'ECK120000854': ['ECK120000854'],
     'ECK120000799/ECK120000854': ['ECK125144791']}
o, _ = _filter_coding_noncoding_overlapping_links(i)
assert(o == e)

# Case 3: remove the coding region entry entirely if its only link is removed.
i = {'ECK120000854': ['ECK125144791'],
     'ECK120000799/ECK120000854': ['ECK125144791']}
e = {'ECK120000799/ECK120000854': ['ECK125144791']}
o, _ = _filter_coding_noncoding_overlapping_links(i)
assert(o == e)

In [8]:
def _f(r):
    """Apply the coding/non-coding overlap filter to one row.

    Returns a two-element Series: the row's "genetic features" without the
    features the filter reported as removed, followed by the filtered
    "genetic feature links" dict.
    """
    filtered_links, dropped_feats = _filter_coding_noncoding_overlapping_links(
        r["genetic feature links"]
    )
    remaining_feats = filter_feats(r["genetic features"], dropped_feats)
    return pd.Series([remaining_feats, filtered_links])


# Replicates the necessary parts of a dataframe row (a plain dict is enough,
# since _f only indexes the row by column name).
# The IDs used here matter because of the gene_df condition in
# _filter_coding_noncoding_overlapping_links above: '3' has to be treated as a
# non-gene feature for the expected result to hold.
i = {
    "genetic features": [{'name': 'g1',
                          'RegulonDB ID': '1'},
                         {'name': 'g1/g2',
                          'RegulonDB ID': '1/2'},  # Intergenic region
                         {'name': 'f3',
                          'RegulonDB ID': '3'},
                         ],
    "genetic feature links": {'1': ['3'],
                              '1/2': ['3']}
}
# Expected: coding feature '1' loses its only link and is dropped from both
# the feature list and the link dict.
e = pd.Series([
    [{'name': 'g1/g2', 'RegulonDB ID': '1/2'}, {'name': 'f3', 'RegulonDB ID': '3'}],
    {'1/2': ['3']}
])
o = _f(i)
assert(o.equals(e))


# Run the filter over every row and overwrite both columns with the results.
all_muts_df[["genetic features", "genetic feature links"]] = all_muts_df.apply(_f, axis=1)

In [9]:
# fix for ASW-1628

# 1) Check if genomic feature is a gene
# 2) If genetic feature it links to isn't the same gene, remove the link
# 2.1) If the length of links is one, just remove the linked genetic feature altogether like I had with overlapping non-coding features.


# Filters out links that are due to overlapping genes,
# and links that appear when genes for some reason get linked with intergenic regions.
def _filter_gene_link_only_to_itself(link_d):
    ret_d = copy.deepcopy(link_d)
    gen_feats_to_remove = set()
    coding_links = dict()
    noncoding_links = dict()
    for k, v in link_d.items():
        if '/' in k:
            noncoding_links[k] = v
        else:
            coding_links[k] = v

    # filter out links due to overlapping genes
    for cl_gen_feat, cl_geno_feats in coding_links.items():
        for cl_geno_feat in cl_geno_feats:            
            # if both gen and geno feat are genes.
            if (cl_gen_feat in gene_df.GENE_ID.unique()) & (cl_geno_feat in gene_df.GENE_ID.unique()):
                # geno feat isn't gen feat, remove link.
                if cl_gen_feat != cl_geno_feat:
                    if len(cl_geno_feats) == 1:
                        gen_feats_to_remove.add(cl_gen_feat)
                        del ret_d[cl_gen_feat]
                    else:
                        ret_d[cl_gen_feat].remove(cl_geno_feat)
                        
    # filters for when genes for some reason get linked with integenic regions
    # the simplest thing to do is just remove any link of a gene to an intergenic region.
    for ncl_gen_feat, ncl_geno_feats in noncoding_links.items():
        for ncl_geno_feat in ncl_geno_feats:
            if (ncl_gen_feat not in gene_df.GENE_ID.unique()) & (ncl_geno_feat in gene_df.GENE_ID.unique()):
                if len(ncl_geno_feats) == 1:
                    gen_feats_to_remove.add(ncl_gen_feat)
                    del ret_d[ncl_gen_feat]
                else:
                    ret_d[ncl_gen_feat].remove(ncl_geno_feat)
    
    return ret_d, gen_feats_to_remove


# Case 1: three features all linked to each other; each is expected to end up
# linked only to itself.
i = {'ECK120001080': ['ECK120001080', 'ECK125229291', 'ECK125229290'],
     'ECK125229290': ['ECK120001080', 'ECK125229291', 'ECK125229290'],
     'ECK125229291': ['ECK120001080', 'ECK125229291', 'ECK125229290'],}
e = {'ECK120001080': ['ECK120001080'],
     'ECK125229290': ['ECK125229290'],
     'ECK125229291': ['ECK125229291']}
o, _ = _filter_gene_link_only_to_itself(i)
assert(o == e)


# Case 2: ECK120010816 is expected to stay linked (presumably because it is
# not a gene in gene_df), while the link to the other gene is dropped.
i = {'ECK120002224': ['ECK120010816', 'ECK120002224', 'ECK120000393'],
     'ECK120000393': ['ECK120002224', 'ECK120000393']}
e = {'ECK120002224': ['ECK120010816', 'ECK120002224'],
     'ECK120000393': ['ECK120000393']}
o, _ = _filter_gene_link_only_to_itself(i)
assert(o == e)


# Case 3: chained overlaps; every entry is expected to keep only its own ID.
i = {'ECK120003617': ['ECK120023915', 'ECK120003617'],
     'ECK120023915': ['ECK120003616', 'ECK120023915'],
     'ECK120003616': ['ECK120003616']}
e = {'ECK120003617': ['ECK120003617'],
     'ECK120023915': ['ECK120023915'],
     'ECK120003616': ['ECK120003616']}
o, _ = _filter_gene_link_only_to_itself(i)
assert(o == e)


# Case 4: an intergenic region whose only link is a gene is expected to be
# removed altogether and reported in the returned set; the gene keeps its
# link to itself.
i = {'ECK120001080': ['ECK120001080'],
     'ECK125229291/ECK125256962': ['ECK120001080']}
e = {'ECK120001080': ['ECK120001080']}
es = {'ECK125229291/ECK125256962'}
# The returned set is bound to `removed` rather than `os`, so the `os` module
# imported at the top of the notebook is not overwritten.
o, removed = _filter_gene_link_only_to_itself(i)
assert(o == e)
assert(removed == es)


def _f(r):
    """Apply the gene-links-only-to-itself filter to one row.

    Returns a two-element Series: the row's "genetic features" without the
    features the filter reported as removed, followed by the filtered
    "genetic feature links" dict.
    """
    filtered_links, dropped_feats = _filter_gene_link_only_to_itself(
        r["genetic feature links"]
    )
    remaining_feats = filter_feats(r["genetic features"], dropped_feats)
    return pd.Series([remaining_feats, filtered_links])


# Run the filter over every row and overwrite both columns with the results.
all_muts_df[["genetic features", "genetic feature links"]] = all_muts_df.apply(_f, axis=1)

In [10]:
# fix for ASW-1628
# remove links between cis-regulatory features and overlapping genes


def _filter_cis_reg_overlapping_genes(link_d):
    ret_d = copy.deepcopy(link_d)
    gen_feats_to_remove = set()
    # This issue likely is only happening with genes and not intergenic regions.
    coding_links = dict()
    for k, v in link_d.items():
        if '/' not in k:
            coding_links[k] = v

    assigned_geno_feats = set()
    for gen_feat, geno_feats in coding_links.items():
        for geno_feat in geno_feats:
            # ensuring that it's a cis-regulatory feature
            if (geno_feat not in gene_df.GENE_ID.unique()) & ("/" not in geno_feat):
                if geno_feat in assigned_geno_feats:  # If the geno feat has already been assigned, remove it
                    if len(geno_feats) == 1:
                        gen_feats_to_remove.add(gen_feats)
                        del ret_d[gen_feats]
                    else:
                        ret_d[gen_feat].remove(geno_feat)
                else:
                    assigned_geno_feats.add(geno_feat)

    return ret_d, gen_feats_to_remove


# Two coding features share the linked features ECK120013472 and ECK125095446.
# The first one encountered (ECK120023915) is expected to keep them, the later
# one (ECK120003616) is expected to lose them; intergenic entries are untouched.
i = {'ECK120003617': ['ECK120003617'],
     'ECK120023915': ['ECK120013472', 'ECK120023915', 'ECK125095446'],
     'ECK120003616': ['ECK120013472', 'ECK120003616', 'ECK125095446'],
     'ECK120000314/ECK120023915': ['ECK120017150'],
     'ECK120003617/ECK120003618': ['ECK120034252']}

e = {'ECK120003617': ['ECK120003617'],
     'ECK120023915': ['ECK120013472', 'ECK120023915', 'ECK125095446'],
     'ECK120003616': ['ECK120003616'],
     'ECK120000314/ECK120023915': ['ECK120017150'],
     'ECK120003617/ECK120003618': ['ECK120034252']}
o, _ = _filter_cis_reg_overlapping_genes(i)
assert(o == e)


def _f(r):
    """Apply the cis-regulatory/overlapping-gene filter to one row.

    Returns a two-element Series: the row's "genetic features" without the
    features the filter reported as removed, followed by the filtered
    "genetic feature links" dict.
    """
    row_links = r["genetic feature links"]
    filtered_links, dropped_feats = _filter_cis_reg_overlapping_genes(row_links)
    remaining_feats = filter_feats(r["genetic features"], dropped_feats)
    return pd.Series([remaining_feats, filtered_links])


# Run the filter over every row and overwrite both columns with the results.
all_muts_df[["genetic features", "genetic feature links"]] = all_muts_df.apply(_f, axis=1)

In [11]:
# Save the table, including the filtered features and links, to ./data/2_2_1_df.pkl.
all_muts_df.to_pickle("./data/2_2_1_df.pkl")