In [2]:
# import the necessary packages
import pickle
import os
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import polars as pl
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from models.ecoli.analysis import variantAnalysisPlot
from wholecell.analysis.analysis_tools import (exportFigure,
	read_bulk_molecule_counts, read_stacked_bulk_molecules, read_stacked_columns)
from wholecell.io.tablereader import TableReader

In [3]:
# Per-variant CSV filenames are assembled as <prefix> + <variant number> + <suffix>.
# The suffix is shared by the filtered and unfiltered files (log data for the second sherlock run):
name_back = '_startGen_14.csv'
name_front = 'Filtered_AvgProteinCounts_Variant_'
name_front_unfiltered = 'AvgProteinCounts_Variant_'

# The unfiltered protein counts are accessed as well, so that the total
# protein-count (PC) divisor can be calculated:
unfiltered_pth = '~/wcEcoli/out/sherlock_data/saved_data_ng_internal_shift1/unfiltered_data/'

# Work with the filtered data from saved_data_ng_internal_shift1. The filtered
# CSVs are opened by bare filename, i.e. relative to this working directory:
os.chdir(os.path.expanduser('~/wcEcoli/out/sherlock_data/saved_data_ng_internal_shift1/filtered_data/'))

# Spreadsheet holding the per-variant information of the run:
variant_info = pd.read_excel('~/wcEcoli/out/sherlock_data/saved_data_ng_internal_shift1/sherlock_sim1_info.xlsx')

# build a string with an experimental variant's info, plus the raw values
# (the "sim1" refers to the mm1 simulation from sherlock)
def get_sim1_var_info(var_num):
    """Look up one variant in ``variant_info`` and format it for plot legends.

    Args:
        var_num: start of the one-row slice ``var_num:var_num+1`` taken from
            ``variant_info`` (presumably the variant number doubles as the
            row position - verify against the spreadsheet).

    Returns:
        Tuple ``(info_string, EI, TE, S, log10NGPCs)`` where the last four
        entries are the values of the "NG expression", "TE", "analysis sims"
        and "log(NG PCs +1)" columns, and ``info_string`` is the legend text
        built from them.
    """
    # a one-row slice, so that .item() can pull out each scalar
    row = variant_info[var_num:var_num+1]
    EI, TE, S, log10NGPCs = (
        row[column].item()
        for column in ("NG expression", "TE", "analysis sims", "log(NG PCs +1)")
    )

    # legend text; the doubled braces render as the literal "$log_{10}$"
    info_string = (
        f"V{var_num} (NG EI: {EI}, NG TE: {TE}, Sims: {S}, "
        f"$log_{{10}}$(NG PCs+1): {round(log10NGPCs, 2)})"
    )

    return info_string, EI, TE, S, log10NGPCs
    
# load one variant's filtered counts, normalize them and convert them to log10 values:
def log10_normalized_data(var_num):
    """Return the normalized log10 protein counts of one variant.

    Every filtered count is divided by the sum of the matching column of the
    unfiltered file. The division is carried out in log space, because
    log10(count) - log10(total) = log10(count / total).

    Args:
        var_num: variant number used to build both CSV filenames.

    Returns:
        pandas DataFrame holding the first column of the filtered CSV plus
        "log10_Control_Variant" and "log10_Experimental_Variant".
    """
    def _read_variant_csv(csv_path):
        # read with pandas, hand over to polars, and give the 2nd and 3rd
        # columns fixed names so both files can be addressed the same way
        frame = pl.DataFrame(pd.read_csv(csv_path))
        second_col, third_col = frame.columns[1], frame.columns[2]
        return frame.rename({second_col: "Control_Variant", third_col: "Experimental_Variant"})

    file_suffix = str(var_num) + name_back

    # filtered counts (file is resolved relative to the working directory)
    counts = _read_variant_csv(name_front + file_suffix)
    id_column = counts.columns[0]

    # column sums over all (unfiltered) protein counts: the normalization divisors
    totals = _read_variant_csv(unfiltered_pth + name_front_unfiltered + file_suffix).sum()

    # control column first, then the experimental column
    for variant_col in ("Control_Variant", "Experimental_Variant"):
        log_col = "log10_" + variant_col
        log_total = totals[variant_col].log10()
        counts = counts.with_columns(pl.col(variant_col).log10().alias(log_col))
        # subtracting the log of the total normalizes the log counts
        counts = counts.with_columns(pl.col(log_col) - log_total)

    selected = counts.select([id_column, "log10_Control_Variant", "log10_Experimental_Variant"])
    return selected.to_pandas()

# fit a straight line to a variant's data and report slope, y-intercept and x-intercept:
def get_linear_info_log10Filtered(df):
    """Least-squares line through experimental vs. control log10 values.

    Args:
        df: mapping/DataFrame with "log10_Control_Variant" (x) and
            "log10_Experimental_Variant" (y) entries.

    Returns:
        Tuple ``(m, b, x_intercept)`` for the fitted line ``y = m*x + b``.
    """
    slope, y_intercept = np.polyfit(
        df["log10_Control_Variant"],
        df["log10_Experimental_Variant"],
        1,
    )

    # the line y = slope*x + y_intercept crosses y = 0 at x = -y_intercept/slope
    return slope, y_intercept, -y_intercept/slope
    
    
# plot the data and the linear fit for each variant (static matplotlib version).
#
# note: the following might need to be adjusted depending on the number of
# variants and which ones are being plotted: the alpha values for data and
# lines, the minimum value for the x and y axis (last_val, -10 by default),
# and the number of columns in the legend.
def plot_linear_fit_log10Filtered(variant_nums, last_val=-10):
    """Scatter each variant's normalized log10 counts together with its linear fit.

    Args:
        variant_nums: sequence of variant numbers to plot.
        last_val: lower limit of both axes; the upper limit is fixed at -1.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 10))

    for var_num in variant_nums:
        df = log10_normalized_data(var_num)
        str_info, _, _, _, _ = get_sim1_var_info(var_num)

        # linear fit of this variant, and its legend entry
        m, b, x_intercept = get_linear_info_log10Filtered(df)
        fit_label = f"V{var_num}: y = {round(m, 2)}x + {round(b, 2)} (x-intercept: {round(x_intercept, 2)})"

        # the data points
        ax.scatter(df["log10_Control_Variant"], df["log10_Experimental_Variant"], label=str_info, alpha=0.6, s=3)

        # the fit, drawn between fixed x positions just inside the axis limits
        fit_x = np.array([last_val+0.3, -1.05])
        ax.plot(fit_x, m*fit_x + b, linestyle="--", label=fit_label)

    # y=x reference line
    diagonal = np.linspace(last_val, -1, 100)
    ax.plot(diagonal, diagonal, linewidth=.5, linestyle="dashed", color="k", alpha=0.5, label="y=x")

    # plotting specs
    ax.axis('square')
    ax.set_ylim([last_val, -1])
    ax.set_xlim([last_val, -1])
    ax.set_xlabel("log10(Control Variant Protein Counts)")
    ax.set_ylabel("log10(Experimental Variant Protein Counts)")
    ax.set_title("Comparison of Experimental Variant Protein Counts to Control Variant Protein Counts \n (Filtered PCs > 0)")
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.09), ncol=1)
    plt.show()
    

In [4]:
# create a function that plots the data and the linear fit for each variant (interactive plotly version):
def plot_mm1_with_plotly(var_nums, last_num=-10, legend_loc=-0.5):
    """Build and show an interactive scatter of each variant with its linear fit.

    Args:
        var_nums: iterable of variant numbers to plot.
        last_num: lower limit of both axes. The y=x reference line starts
            here too, so it always spans the visible range.
        legend_loc: ``y`` position of the legend (anchored at its bottom),
            used to place the legend outside of the plot.

    Returns:
        The plotly ``go.Figure`` (it is also displayed via ``fig.show()``).
    """
    axis_max = -0.5  # upper limit of both axes and end point of the y=x line

    fig = go.Figure()

    # add the data and the linear fit for each variant:
    for num in var_nums:
        df = log10_normalized_data(num)
        _, EI, TE, S, log10NGPCs = get_sim1_var_info(num)
        name = "V"+str(num)+" (EI="+str(EI)+", TE="+str(TE)+", S="+str(S)+", log10(NG PCs+1)="+str(round(log10NGPCs,2))+")"

        # data points; hovering over a point shows its monomer ID
        fig.add_trace(go.Scatter(x=df.log10_Control_Variant, y=df.log10_Experimental_Variant,
                    mode='markers', hovertext=df["Filtered Monomer ID"],
                    name=name))

        # linear fit, drawn between fixed x positions just inside the axis limits
        m, b, x_intercept = get_linear_info_log10Filtered(df)
        x_values = np.array([last_num+0.3, -1.05])
        fit_name = "V"+str(num)+": y = "+str(round(m, 2))+"x + "+str(round(b, 2))+" (x-intercept: "+str(round(x_intercept, 2))+")"
        fig.add_trace(go.Scatter(x=x_values, y=m*x_values + b, mode='lines', name=fit_name,
                    line=dict(width=1, dash='dash')))

    # add a y=x line; it starts at last_num (not a fixed -10) so that it
    # follows the axis range chosen by the caller:
    fig.add_trace(go.Scatter(x=[last_num, axis_max], y=[last_num, axis_max], mode='lines', name='y=x',
                line=dict(color='black', width=.5, backoff=True, dash='dot')))

    # Plot Specs:
    fig.update_layout(title="Comparison of Experimental Variant Protein Counts to <br>Control Variant Protein Counts (Filter: PCs > 0)")
    fig.update_xaxes(title_text="log10(Control Variant)")
    fig.update_yaxes(title_text="log10(Experimental Variant) (w/ New Gene)")
    fig.update_layout(
        autosize=False,
        width=700,
        height=850)

    # resize the plot as needed:
    fig.update_yaxes(range=[last_num, axis_max])
    fig.update_xaxes(range=[last_num, axis_max])

    # use one fixed marker size for all traces:
    fig.update_traces(marker=dict(size=3))

    # place the legend outside of the plot:
    fig.update_layout(legend=dict(
        orientation="v",
        yanchor="bottom",
        y=legend_loc,
        xanchor="center",
        x=0.5
    ))

    fig.show()
    return fig
    
def save_filtered_plotly(var_nums, sim_name, last_num=-10, legend_loc=-0.5):
    """Create the interactive variant plot and save it as a standalone HTML file.

    The file is written to the ``interactive_plots`` sub-directory of the
    filtered-data directory and opened afterwards (``auto_open=True``).

    Args:
        var_nums: variant numbers to plot; they also become part of the filename.
        sim_name: prefix of the output filename.
        last_num: lower axis limit, forwarded to ``plot_mm1_with_plotly``.
        legend_loc: legend position, forwarded to ``plot_mm1_with_plotly``.
    """
    # get the figure:
    fig = plot_mm1_with_plotly(var_nums, last_num, legend_loc)

    # name of the figure file:
    fig_name = sim_name + "_interactive_plot_with_filtered_variants_"+"_".join(str(x) for x in var_nums) + ".html"

    # save path; '~' has to be expanded explicitly, because the os functions
    # and the file writer would otherwise treat it as a literal directory
    # named '~' relative to the current working directory:
    pth = os.path.expanduser('~/wcEcoli/out/sherlock_data/saved_data_ng_internal_shift1/filtered_data/')
    save_pth = os.path.join(pth, 'interactive_plots')

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(save_pth, exist_ok=True)

    # save the figure:
    pio.write_html(fig, os.path.join(save_pth, fig_name), auto_open=True)

In [14]:
# add the scatter trace of a single variant to an existing plotly figure:
def residual_plotly(fig, df, var_num):
    """Append one variant's data points to ``fig`` as a markers-only trace.

    Args:
        fig: plotly figure that receives the trace (modified in place).
        df: DataFrame with "log10_Control_Variant", "log10_Experimental_Variant"
            and "Filtered Monomer ID" columns; the IDs become the hover text.
        var_num: variant number, used to look up the legend information.
    """
    # legend entry assembled from the variant's information
    _, EI, TE, S, log10NGPCs = get_sim1_var_info(var_num)
    trace_name = f"V{var_num} (EI={EI}, TE={TE}, S={S}, log10(NG PCs+1)={round(log10NGPCs, 2)})"

    variant_trace = go.Scatter(
        x=df.log10_Control_Variant,
        y=df.log10_Experimental_Variant,
        mode='markers',
        hovertext=df["Filtered Monomer ID"],
        name=trace_name,
    )
    fig.add_trace(variant_trace)
    
    
    
# sort out data points that are outliers from the linear fit:
def plot_filtered_outliers(variant_nums, residual_filter_value=0.5, last_num=-10, legend_loc=-0.5):
    """Plot only the proteins that lie far away from each variant's linear fit.

    For every variant a line is fitted to the normalized log10 data, and the
    proteins whose absolute residual from that line exceeds
    ``residual_filter_value`` are added to the figure.

    Args:
        variant_nums: iterable of variant numbers to process.
        residual_filter_value: threshold on the absolute residual.
        last_num: lower limit of both axes. The y=x reference line starts
            here too, so it always spans the visible range.
        legend_loc: ``y`` position of the legend (anchored at its bottom).

    Returns:
        The plotly ``go.Figure`` (not shown explicitly; the caller displays it).
    """
    axis_max = -0.5  # upper limit of both axes and end point of the y=x line

    fig = go.Figure()
    for var_num in variant_nums:
        df = log10_normalized_data(var_num)
        m, b, x_intercept = get_linear_info_log10Filtered(df)

        # residual = observed value minus the value predicted by the linear fit
        predicted = m*df["log10_Control_Variant"] + b
        df = df.assign(residuals=df["log10_Experimental_Variant"] - predicted)

        # keep only the proteins whose residual magnitude exceeds the threshold:
        df_filtered = df[df["residuals"].abs() > residual_filter_value]

        # add the outliers of this variant to the figure:
        residual_plotly(fig, df_filtered, var_num)

    # add a y=x line; it starts at last_num (not a fixed -10) so that it
    # follows the axis range chosen by the caller:
    fig.add_trace(go.Scatter(x=[last_num, axis_max], y=[last_num, axis_max], mode='lines', name='y=x',
                line=dict(color='black', width=.5, backoff=True, dash='dot')))

    # Plot Specs (the title states the absolute-value criterion that is actually applied):
    fig.update_layout(title="Filtered Outliers from Linear Fit (|Residuals| > "+str(residual_filter_value)+")")
    fig.update_xaxes(title_text="log10(Control Variant)")
    fig.update_yaxes(title_text="log10(Experimental Variant) (w/ New Gene)")
    fig.update_layout(
        autosize=False,
        width=700,
        height=850)

    # resize the plot as needed:
    fig.update_yaxes(range=[last_num, axis_max])
    fig.update_xaxes(range=[last_num, axis_max])

    # use one fixed marker size for all traces:
    fig.update_traces(marker=dict(size=3))

    # place the legend outside of the plot:
    fig.update_layout(legend=dict(
        orientation="v",
        yanchor="bottom",
        y=legend_loc,
        xanchor="center",
        x=0.5))

    return fig

In [16]:
# variants 16 through 20, keeping the proteins whose residual magnitude exceeds 0.5;
# the returned figure is the cell's last expression
hi = list(range(16, 21))
plot_filtered_outliers(hi, residual_filter_value=0.5)