In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import pandas as pd
import seaborn as sns

from IPython.display import display
import os, sys, itertools, csv
# Put the parent of the current working directory on sys.path (once) so that
# the project-local `util` package imported below can be resolved.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from util.metadata import get_condition_val_dict, get_condition_field_val_set
from util.gene import get_coding_genetic_target_len_d, get_intergenic_len_d
from util.genome import get_feature_hit_set, is_overlap, get_promoter_range_from_RegulonDB_df_row
# ENRICH_ALPHA is used below as the significance threshold and
# MULTI_HYP_CORR_METHOD as the multiple-testing correction method.
from util.params import ENRICH_ALPHA, MULTI_HYP_CORR_METHOD
# Show up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100

In [2]:
# Load the mutations table (presumably written by the previous pipeline step — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
assoc_muts_df = pd.read_pickle("./data/5_df.pkl")
# Show the table's dimensions followed by its first rows.
display(assoc_muts_df.shape, assoc_muts_df.head())

(46, 41)

Unnamed: 0,index,Details,mutation target annotation,Mutation Type,Position,Reference Seq,Sequence Change,ale,exp,flask,isolate,presence,tech_rep,coding,range,gene RegulonDB ID,genetic features,oriC,pseudogene,TFBS,promoter,RBS,attenuator terminator,terminator,genetic,genomic features,genetic feature links,operons,operon links,regulators,regulator links,temperature,carbon-source,supplement,strain-description,taxonomy-id,base-media,nitrogen-source,phosphorous-source,sulfur-source,calcium-source
1232,4,R121C (CGC→TGC),xylR,SNP,3735339,,C→T,1,SSW_XYL,116,1,1.0,1,True,"(3735339, 3735339)",{ECK120002449},"[{'name': 'xylR', 'RegulonDB ID': 'ECK12000244...",False,False,{},{},{},{},{},True,"[{'name': 'xylR', 'RegulonDB ID': 'ECK12000244...",{'ECK120002449': ['ECK120002449']},"[{'name': 'xylFGHR', 'RegulonDB ID': 'ECK12001...",{'ECK120014482': ['ECK120002449']},"[{'name': 'Fis', 'significantly associated con...","{'ArcA': ['xylFGHR'], 'Fis': ['xylFGHR'], 'CRP...",37 celsius,xylose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1)
1233,5,,rph,DEL,3815859,,Δ82 bp,1,SSW_XYL,116,1,1.0,1,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791},{},True,"[{'name': 'rph-pyrE attenuator terminator', 'R...","{'ECK120000854': ['ECK125144791', 'ECK12000085...","[{'name': 'rph-pyrE', 'RegulonDB ID': 'ECK1200...","{'ECK120014627': ['ECK120000854', 'ECK12000085...",[],{},37 celsius,xylose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1)
1234,6,L770R (CTC→CGC),rpoC,SNP,4187658,,T→G,1,SSW_XYL,116,1,1.0,1,True,"(4187658, 4187658)",{ECK120000886},"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088...",False,False,{},{},{},{},{},True,"[{'name': 'rpoC', 'RegulonDB ID': 'ECK12000088...",{'ECK120000886': ['ECK120000886']},"[{'name': 'rplKAJL-rpoBC', 'RegulonDB ID': 'EC...",{'ECK120016992': ['ECK120000886']},[],{},37 celsius,xylose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1)
1235,9,G164A (GGC→GCC),dnaG,SNP,3211597,,G→C,2,SSW_XYL,109,1,1.0,1,True,"(3211597, 3211597)",{ECK120000233},"[{'name': 'dnaG', 'RegulonDB ID': 'ECK12000023...",False,False,{},{},{},{},{},True,"[{'name': 'dnaG', 'RegulonDB ID': 'ECK12000023...",{'ECK120000233': ['ECK120000233']},"[{'name': 'rpsU-dnaG-rpoD', 'RegulonDB ID': 'E...",{'ECK120014515': ['ECK120000233']},"[{'name': 'LexA', 'significantly associated co...",{'LexA': ['rpsU-dnaG-rpoD']},37 celsius,xylose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1)
1236,5,,rph,DEL,3815859,,Δ82 bp,2,SSW_XYL,109,1,1.0,1,True,"(3815859, 3815940)",{ECK120000854},"[{'name': 'rph', 'RegulonDB ID': 'ECK120000854...",False,False,{},{},{},{ECK125144791},{},True,"[{'name': 'rph-pyrE attenuator terminator', 'R...","{'ECK120000854': ['ECK125144791', 'ECK12000085...","[{'name': 'rph-pyrE', 'RegulonDB ID': 'ECK1200...","{'ECK120014627': ['ECK120000854', 'ECK12000085...",[],{},37 celsius,xylose(4),NaCl(0.5g/L) trace elements,WT,511145,M9,NH4Cl(1),KH2PO4(3) Na2HPO4(6.8),MgSO4(0.24),CaCl2(0.1)


# Significance study (level specific)

## permutation test

In [3]:
# Genomic features can be counted straight from the "genomic features" column since
# there is no linking to them from a lower-level annotation (yet: sub-genetic components).
#
# Build one row per distinct mutated feature, indexed by RegulonDB ID:
#   "length"                  - feature length, computed from its inclusive "range"
#   "observed mutation count" - number of feature records seen with that ID
#   "name"                    - kept only for visual inspection
#
# The rows are collected first and the DataFrame is built once at the end.
# Growing the frame with DataFrame.append inside the loop is quadratic, and
# DataFrame.append no longer exists in pandas >= 2.0.
COMP_MUT_COLUMNS = ["length", "observed mutation count", "name"]
COUNT_COL_POS = COMP_MUT_COLUMNS.index("observed mutation count")

comp_id_l = []          # RegulonDB IDs in order of first appearance (row order of the result)
comp_row_d = dict()     # RegulonDB ID -> [length, observed mutation count, name]
for mut_feats_l in assoc_muts_df["genomic features"]:
    for mut_comp_d in mut_feats_l:
        comp_id = mut_comp_d["RegulonDB ID"]
        if comp_id in comp_row_d:
            comp_row_d[comp_id][COUNT_COL_POS] += 1
        else:
            # "range" is (start, end) with both ends included, hence the + 1.
            comp_len = mut_comp_d["range"][1] - mut_comp_d["range"][0] + 1
            comp_id_l.append(comp_id)
            comp_row_d[comp_id] = [comp_len, 1, mut_comp_d["name"]]

# Passing the columns explicitly keeps the expected schema even when no feature was found.
comp_mut_df = pd.DataFrame(
    [comp_row_d[comp_id] for comp_id in comp_id_l],
    columns=COMP_MUT_COLUMNS,
    index=comp_id_l)
display(comp_mut_df.head())

Unnamed: 0,length,observed mutation count,name
ECK120002449,1179,2,xylR
ECK125144791,113,4,rph-pyrE attenuator terminator
ECK120000854,687,4,rph
ECK120000886,4224,3,rpoC
ECK120000233,1746,1,dnaG


In [4]:
# TODO: This may be obsolete (no longer being used in the pipeline). Check and remove if so.
# Stamp every genomic-feature record (in place) with the number of mutations
# observed on that feature across the whole mutation set.
for feat_l in assoc_muts_df["genomic features"]:
    for feat_d in feat_l:
        feat_id = feat_d["RegulonDB ID"]
        feat_d["mutation set count"] = comp_mut_df.loc[feat_id]["observed mutation count"]

In [5]:
# Add a "genome remainder" row covering everything that is not a mutated feature,
# then express every row's length as a proportion of the whole genome. These
# proportions are the sampling probabilities of the significance test below.

# Drop a remainder row left over from a previous run of this cell so that
# re-running it neither duplicates the row nor counts it as mutated sequence.
comp_mut_df = comp_mut_df[comp_mut_df.index != "genome remainder"]

# Summed length of all mutated features (each feature is counted in full).
mutated_seq_len = np.sum(comp_mut_df["length"])

# Presumably the length in bp of the reference genome in use — TODO confirm.
TOTAL_GENOME_LEN = 4641652
genome_remainder_len = TOTAL_GENOME_LEN - mutated_seq_len
if genome_remainder_len < 0:
    # A negative length would become a negative sampling probability further down.
    raise ValueError(
        "Summed mutated feature length (%s) exceeds TOTAL_GENOME_LEN (%s)"
        % (mutated_seq_len, TOTAL_GENOME_LEN))

genome_remainder_df = pd.DataFrame(
    [[0, genome_remainder_len]],
    columns=["observed mutation count", "length"],
    index=["genome remainder"])
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
comp_mut_df = pd.concat([comp_mut_df, genome_remainder_df], sort=False)
total_seq_len = TOTAL_GENOME_LEN

# Vectorised division; the cast keeps the result float even if "length" is object dtype.
comp_mut_df["proportion"] = comp_mut_df["length"].astype(float) / total_seq_len
comp_mut_df.head()

Unnamed: 0,length,observed mutation count,name,proportion
ECK120002449,1179,2,xylR,0.000254
ECK125144791,113,4,rph-pyrE attenuator terminator,2.4e-05
ECK120000854,687,4,rph,0.000148
ECK120000886,4224,3,rpoC,0.00091
ECK120000233,1746,1,dnaG,0.000376


In [6]:
from collections import Counter  # no longer used by this cell; left in place in case other cells rely on the name

# Monte Carlo enrichment test.
# In every iteration the observed number of mutations is redistributed over the
# components at random, each component being hit with probability "proportion".
# A component's p value is the fraction of iterations in which it received at
# least as many mutations as were actually observed on it.

# Fixed seed so that Restart & Run All reproduces the same p values.
PERMUTATION_SEED = 0

# slides from UW speak about the relationship of permutations and precision.
# should read through them to get a sense of what is the least iterations that should execute.
num_itr = 10000

component_l = comp_mut_df.index.tolist()
proportion_a = comp_mut_df["proportion"].astype(float).values
proportion_l = proportion_a.tolist()
observed_count_a = comp_mut_df["observed mutation count"].astype(int).values
num_muts = int(observed_count_a.sum())

rng = np.random.RandomState(PERMUTATION_SEED)
equal_or_larger_count_a = np.zeros(len(component_l), dtype=int)
for _ in range(num_itr):
    # One multinomial draw yields the per-component hit counts of num_muts
    # independent draws with replacement, aligned with component_l.
    random_count_a = rng.multinomial(num_muts, proportion_a)
    # Components that were not drawn have a count of 0 and are compared as well,
    # so a component with 0 observed mutations is counted in every iteration.
    equal_or_larger_count_a += (random_count_a >= observed_count_a)

comp_mut_df["equal or larger count"] = equal_or_larger_count_a

# NOTE(review): this estimate is exactly 0.0 when no iteration reaches the observed
# count; (count + 1) / (num_itr + 1) is the usual way to avoid zero p values — confirm
# whether the downstream steps should switch to it.
comp_mut_df["p value"] = comp_mut_df["equal or larger count"] / num_itr

comp_mut_df["significant"] = comp_mut_df["p value"] < ENRICH_ALPHA

display(comp_mut_df.head())

Unnamed: 0,length,observed mutation count,name,proportion,equal or larger count,p value,significant
ECK120002449,1179,2,xylR,0.000254,2,0.0002,True
ECK125144791,113,4,rph-pyrE attenuator terminator,2.4e-05,0,0.0,True
ECK120000854,687,4,rph,0.000148,0,0.0,True
ECK120000886,4224,3,rpoC,0.00091,0,0.0,True
ECK120000233,1746,1,dnaG,0.000376,206,0.0206,True


In [7]:
from statsmodels.stats import multitest

# Raise the row display limit to 500.
pd.options.display.max_rows = 500

# Correct the per-feature p values for multiple testing with the project-wide
# method and alpha; the call returns the reject flags, the corrected p values
# and the Sidak / Bonferroni corrected alphas, in that order.
pvals = comp_mut_df["p value"]
correction_result = multitest.multipletests(
    pvals=pvals, alpha=ENRICH_ALPHA, method=MULTI_HYP_CORR_METHOD)
rejects, pvals_corrected, alphacSidak, alphacBonf = correction_result

# Store the corrected p value and the reject decision next to the raw p value.
comp_mut_df["corrected p value"] = pvals_corrected
comp_mut_df["corrected significance"] = rejects
comp_mut_df.head()

Unnamed: 0,length,observed mutation count,name,proportion,equal or larger count,p value,significant,corrected p value,corrected significance
ECK120002449,1179,2,xylR,0.000254,2,0.0002,True,0.008,True
ECK125144791,113,4,rph-pyrE attenuator terminator,2.4e-05,0,0.0,True,0.0,True
ECK120000854,687,4,rph,0.000148,0,0.0,True,0.0,True
ECK120000886,4224,3,rpoC,0.00091,0,0.0,True,0.0,True
ECK120000233,1746,1,dnaG,0.000376,206,0.0206,True,0.824,False


In [8]:
# # Do not consider features with only 1 observed mutation as being significant
# comp_mut_df["final significance"] = comp_mut_df.apply(lambda r: r["corrected significance"] if r["observed mutation count"] > 1 else False, axis=1)
# comp_mut_df.head()

In [9]:
# Copy each feature's corrected significance from comp_mut_df back into the
# feature records held in the mutations DF (the dicts are updated in place).
for feat_l in assoc_muts_df["genomic features"]:
    for feat_d in feat_l:
        feat_id = feat_d["RegulonDB ID"]
        if feat_id not in comp_mut_df.index:
            # No significance result for this feature; leave its record untouched.
            continue
        feat_d["significantly enriched"] = comp_mut_df.loc[feat_id]["corrected significance"]
#         feat_d["significant"] = comp_mut_df.loc[feat_id]["final significance"]

# Outputting dataframe for the next pipeline step

In [10]:
# Persist the mutations DF; its "genomic features" records were annotated in place
# above with "mutation set count" and "significantly enriched".
assoc_muts_df.to_pickle("./data/6_df.pkl")

# Plots

Currently commented out because some experiments have mutated features without names, which would crash this cell and the notebook during whole-pipeline execution. Uncomment when investigating individual experiments.

In [11]:
# import seaborn as sns
# import matplotlib
# import matplotlib.pyplot as plt
# %matplotlib inline
# plt.rcParams["figure.dpi"] = 300
# sns.set_context("talk")
# sns.set_style("ticks")

In [12]:
# # currently commented out because some of the experiments have mutated features without names
# # which will crash this cell & NB during whole pipeline execution.
# # Uncomment when investigating individual experiments.

# TODO: found in multiple post-assoc NBs. Should extract into common script module.
# def _get_name_signif_from_ID(RegulonDB_ID, json):
#     name = ""
#     is_signif = False
#     for d in json:
#         if d["RegulonDB ID"] == RegulonDB_ID:
#             name = d["name"]
#             is_signif = d["significant"]
#             break
#     return (name, is_signif)


# targ_cnt_d = dict()
# for _, r in assoc_muts_df.iterrows():
#     for feat_d in r["genomic features"]:
#         feat = feat_d["name"]
#         if "attenuator terminator" in feat:
#             feat = feat.replace("attenuator terminator", "att term")
#         if "terminator" in feat:
#             feat = feat.replace("terminator", "term")
        
#         t = str(feat)
#         if feat_d["significant"]:
#             t = '*' + t
#         if t not in targ_cnt_d.keys():
#             targ_cnt_d[t] = 0
#         targ_cnt_d[t] += 1
        
# df = pd.DataFrame.from_dict(targ_cnt_d, orient='index', columns=["mutated feature count"])
# df = df.sort_values("mutated feature count")
# df = df[-5:]
# ax = df.plot.barh(y='mutated feature count',
# #                   figsize=(10,25),
#                   figsize=(5,2),
#                   color="#4C72B0",
#                   width=1
#                  )
# for i, v in enumerate(df["mutated feature count"]):
#     ax.text(v, i - 0.3, str(v))
# sns.despine(ax=ax, top=True, right=True)
# ax.set_title("Top 5 genomic mutated feature count")
# ax.get_legend().remove()