In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns
from IPython.display import display
import os, sys, itertools, csv

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from util.alemutdf import get_all_sample_mut_df, get_gene_mut_count_mat, get_multi_exp_max_freq_mut_df, get_mut_type_avg_frac_across_class_df
from util.mut import is_coding_mut, get_original_nuc_mut_range
from util.metadata import get_condition_val_dict, get_condition_field_val_set
from util.genome import get_feature_hit_set, is_overlap, get_promoter_range_from_RegulonDB_df_row, NON_K12_EXP_L 

In [2]:
# Render up to 100 columns when a DataFrame is displayed, so wide tables are not truncated.
pd.set_option("display.max_columns", 100)

In [3]:
# Load the pickled mutation table and show its (rows, columns) shape as a quick sanity check.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling; only load trusted files.
all_muts_df = pd.read_pickle("./data/1_df.pkl")
display(all_muts_df.shape)

(5676, 14)

In [4]:
# DEBUG: uncomment the next line to restrict the run to a single experiment.
# all_muts_df = all_muts_df[all_muts_df.exp=="SSW_GLU_XYL"].copy()

In [5]:
# Rename the "Gene" column to "mutation target annotation" because this is more accurate.
# NOTE(review): index=str additionally maps every index label through str(); confirm that
# converting the index labels to strings is intended.
all_muts_df = all_muts_df.rename(index=str, columns={"Gene": "mutation target annotation"})

# Build genetic feature annotation
Genetic feature annotations need to be built before genomic feature annotations, since the genetic annotations are used to fill gaps where the genomic target is unknown.

## K12 specific genetic annotation

In [6]:
# Load the gene table; the GENE_ID, GENE_NAME and "range" columns shown in the head are the ones
# the helper functions further down read.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling; only load trusted files.
gene_df = pd.read_pickle("./data/gene_df.pkl")
gene_df.head()

Unnamed: 0,GENE_ID,GENE_NAME,GENE_POSLEFT,GENE_POSRIGHT,GENE_STRAND,GENE_SEQUENCE,GC_CONTENT,CRI_SCORE,GENE_NOTE,GENE_INTERNAL_COMMENT,KEY_ID_ORG,GENE_TYPE,range
0,ECK120000001,alr,4265782.0,4266861.0,forward,ATGCAAGCGGCAACTGTTGTGATTAACCGCCGCGCTCTGCGACACA...,55.93,,,,ECK12,,"(4265782, 4266861)"
1,ECK120000002,modB,795862.0,796551.0,forward,ATGATACTGACCGATCCAGAATGGCAGGCAGTTTTATTAAGCCTGA...,54.06,,,,ECK12,,"(795862, 796551)"
2,ECK120000003,cysZ,2531463.0,2532224.0,forward,ATGGTTTCATCATTCACATCTGCCCCACGCAGCGGTTTTTACTATT...,50.13,,,,ECK12,,"(2531463, 2532224)"
3,ECK120000004,dfp,3812731.0,3813951.0,forward,ATGAGCCTGGCCGGTAAAAAAATCGTTCTCGGCGTTAGCGGCGGTA...,53.64,,,,ECK12,,"(3812731, 3813951)"
4,ECK120000005,dcuB,4347404.0,4348744.0,reverse,ATGTTATTTACTATCCAACTTATCATAATACTGATATGTCTGTTTT...,52.27,,,,ECK12,,"(4347404, 4348744)"


In [7]:
# Add a "range" column by applying get_original_nuc_mut_range to every mutation row.
# In the rows displayed below each range is a (start, end) tuple, with start == end for the SNPs.
all_muts_df["range"] = all_muts_df.apply(get_original_nuc_mut_range, axis=1)
all_muts_df.head()

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,sample,coding,range
9,42C,1,124,1,1,1.0,702352,DEL,Δ21 bp,coding (380‑400/1149 nt),nagA,,1 124 1 1,True,"(702352, 702372)"
10,42C,1,124,1,1,1.0,1308318,SNP,G→C,R110G (CGT→GGT),clsA,,1 124 1 1,True,"(1308318, 1308318)"
11,42C,1,124,1,1,1.0,3815859,DEL,Δ82 bp,,rph,,1 124 1 1,True,"(3815859, 3815940)"
12,42C,1,124,1,1,1.0,4187550,SNP,C→T,A734V (GCG→GTG),rpoC,,1 124 1 1,True,"(4187550, 4187550)"
13,42C,1,124,1,1,1.0,4400313,SNP,A→C,D9A (GAT→GCT),hfq,,1 124 1 1,True,"(4400313, 4400313)"


In [8]:
# Annotate each mutation with the set of GENE_ID values that get_feature_hit_set returns for its
# range against gene_df; rows whose experiment is listed in NON_K12_EXP_L get an empty set instead.
all_muts_df["gene RegulonDB ID"] = all_muts_df.apply(
    lambda r: get_feature_hit_set(
        r["range"], gene_df, "range", "GENE_ID") if r.exp not in NON_K12_EXP_L else set(), axis=1
)  # Maybe instead just have a list of K12 experiments

all_muts_df.head()

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,sample,coding,range,gene RegulonDB ID
9,42C,1,124,1,1,1.0,702352,DEL,Δ21 bp,coding (380‑400/1149 nt),nagA,,1 124 1 1,True,"(702352, 702372)",{ECK120000625}
10,42C,1,124,1,1,1.0,1308318,SNP,G→C,R110G (CGT→GGT),clsA,,1 124 1 1,True,"(1308318, 1308318)",{ECK120001556}
11,42C,1,124,1,1,1.0,3815859,DEL,Δ82 bp,,rph,,1 124 1 1,True,"(3815859, 3815940)",{ECK120000854}
12,42C,1,124,1,1,1.0,4187550,SNP,C→T,A734V (GCG→GTG),rpoC,,1 124 1 1,True,"(4187550, 4187550)",{ECK120000886}
13,42C,1,124,1,1,1.0,4400313,SNP,A→C,D9A (GAT→GCT),hfq,,1 124 1 1,True,"(4400313, 4400313)",{ECK120000431}


In [9]:
# Load the intergenic-region table; per the head below, each row holds a "geneA/geneB" style name,
# a pair of RegulonDB IDs joined by "/", and a "range" tuple.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling; only load trusted files.
intergenic_df = pd.read_pickle("./data/intergenic_region_df.pkl")
intergenic_df.head()

Unnamed: 0,name,RegulonDB ID,range
0,thrL/thrA,ECK120001251/ECK120000987,"(256, 336)"
1,thrA/thrB,ECK120000987/ECK120000988,"(2800, 2800)"
2,thrC/yaaX,ECK120000989/ECK120002701,"(5021, 5233)"
3,yaaX/yaaA,ECK120002701/ECK120000009,"(5531, 5682)"
4,yaaA/yaaJ,ECK120000009/ECK120001508,"(6460, 6528)"


In [10]:
def get_feature_range(feat_id, feat_type):
    """Return the genomic range of a gene as a tuple of ints.

    Looks ``feat_id`` up in the ``GENE_ID`` column of the notebook-level
    ``gene_df`` and converts every element of the first matching row's
    ``range`` value to ``int``.

    Args:
        feat_id: Gene identifier to search for in ``gene_df["GENE_ID"]``.
        feat_type: Unused by the lookup; kept so existing callers keep working.

    Returns:
        A tuple of ints built from the matching row's ``range``, or an empty
        tuple when no row matches ``feat_id``.
    """
    matches = gene_df[gene_df["GENE_ID"] == feat_id]
    if matches.empty:
        return ()
    # Assuming all rows describe the same range, so only the first match is read.
    return tuple(int(coord) for coord in matches.iloc[0]["range"])


# This will include all implicit intergenic regions, where get_genomic_features(...) does not.
# TODO: this may not handle wrapping around the genome from the last gene to the first gene. Write test for this.
def get_ordered_genetic_targets(genetic_targets):
    """Return a new list of the given targets sorted by ascending range start.

    Each target is indexed as ``target["range"][0]`` to obtain its sort key.
    The input collection itself is not modified.
    """
    def range_start(target):
        return target["range"][0]

    ordered_genetic_targets = list(genetic_targets)
    ordered_genetic_targets.sort(key=range_start)
    return ordered_genetic_targets


# Sanity check: targets supplied out of order must come back sorted by the start of their range.
test_genetic_targets = [
    {"RegulonDB ID": "ECK120000003", "range": (2531463, 2532224)},
    {"RegulonDB ID": "ECK120000001", "range": (4265782, 4266861)},
    {"RegulonDB ID": "ECK120000002", "range": (795862, 796551)},
]
assert get_ordered_genetic_targets(test_genetic_targets) == [
    {"RegulonDB ID": "ECK120000002", "range": (795862, 796551)},
    {"RegulonDB ID": "ECK120000003", "range": (2531463, 2532224)},
    {"RegulonDB ID": "ECK120000001", "range": (4265782, 4266861)},
]


def get_gene_name(RegulonDB_ID, debug=False):
    """Return the GENE_NAME of the first gene_df row whose GENE_ID equals RegulonDB_ID.

    When ``debug`` is true, the ID and the matching rows are displayed before
    the lookup. An IndexError is raised if no row matches, because the first
    row of an empty selection is requested.
    """
    matching_rows = gene_df[gene_df["GENE_ID"] == RegulonDB_ID]
    if debug:
        display(RegulonDB_ID, matching_rows)
    first_match = matching_rows.iloc[0]
    return first_match["GENE_NAME"]


def get_genetic_feat_JSON(mut_row):
    """Build the list of genetic-feature records associated with one mutation row.

    One record with feature type "gene" is produced per ID in
    ``mut_row["gene RegulonDB ID"]``. One record with feature type "unknown" is
    then appended per intergenic-region ID that get_feature_hit_set returns for
    ``mut_row["range"]`` against intergenic_df. An intergenic ID is split on "/"
    and the names of its first two parts are joined with "/" to form the name.
    """
    # Gene IDs are unique in gene_df, so each ID is assumed to yield exactly one record.
    features = [
        {
            "name": get_gene_name(gene_id),
            "RegulonDB ID": gene_id,
            "range": get_feature_range(gene_id, "gene RegulonDB ID"),
            "feature type": "gene",
        }
        for gene_id in mut_row["gene RegulonDB ID"]
    ]

    hit_region_ids = get_feature_hit_set(mut_row["range"], intergenic_df, "range", "RegulonDB ID")
    for region_id in hit_region_ids:
        id_parts = region_id.split('/')
        left_name = str(get_gene_name(id_parts[0]))
        right_name = str(get_gene_name(id_parts[1]))
        region_rows = intergenic_df[intergenic_df["RegulonDB ID"] == region_id]
        features.append({
            "RegulonDB ID": region_id,
            "name": left_name + '/' + right_name,
            "range": region_rows.iloc[0]["range"],
            "feature type": "unknown",
        })

    return features


# Build the "genetic features" list for every mutation row; rows whose experiment is listed in
# NON_K12_EXP_L get an empty list instead of being annotated.
all_muts_df["genetic features"] = all_muts_df.apply(lambda r: get_genetic_feat_JSON(r) if r.exp not in NON_K12_EXP_L else [], axis=1)
display(all_muts_df.head())

Unnamed: 0,exp,ale,flask,isolate,tech_rep,presence,Position,Mutation Type,Sequence Change,Details,mutation target annotation,Reference Seq,sample,coding,range,gene RegulonDB ID,genetic features
9,42C,1,124,1,1,1.0,702352,DEL,Δ21 bp,coding (380‑400/1149 nt),nagA,,1 124 1 1,True,"(702352, 702372)",{ECK120000625},"[{'name': 'nagA', 'RegulonDB ID': 'ECK12000062..."
10,42C,1,124,1,1,1.0,1308318,SNP,G→C,R110G (CGT→GGT),clsA,,1 124 1 1,True,"(1308318, 1308318)",{ECK120001556},"[{'name': 'clsA', 'RegulonDB ID': 'ECK12000155..."
11,42C,1,124,1,1,1.0,3815859,DEL,Δ82 bp,,rph,,1 124 1 1,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854..."
12,42C,1,124,1,1,1.0,4187550,SNP,C→T,A734V (GCG→GTG),rpoC,,1 124 1 1,True,"(4187550, 4187550)",{ECK120000886},"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088..."
13,42C,1,124,1,1,1.0,4400313,SNP,A→C,D9A (GAT→GCT),hfq,,1 124 1 1,True,"(4400313, 4400313)",{ECK120000431},"[{'name': 'hfq', 'RegulonDB ID': 'ECK120000431..."


In [11]:
# Persist the annotated mutation table, now including the "genetic features" column, as a pickle.
all_muts_df.to_pickle("./data/1_5_df.pkl")

In [12]:
# all_muts_df[all_muts_df["mutation target annotation"] == "yhcC/gltB" ]