In [None]:
# import the necessary packages
import pickle
import os
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import polars as pl
import plotly.graph_objects as go
from models.ecoli.analysis import variantAnalysisPlot
from wholecell.analysis.analysis_tools import (exportFigure,
	read_bulk_molecule_counts, read_stacked_bulk_molecules, read_stacked_columns)
from wholecell.io.tablereader import TableReader

In [None]:
# work with the filtered data (relative file names used below resolve against this directory):
filtered_data_dir = os.path.expanduser(
    '~/wcEcoli/out/sherlock_data/saved_data_ng_internal_shift1/filtered_data/')
os.chdir(filtered_data_dir)

# work specifically with the log data for the second sherlock run;
# the variant number is inserted between these two file-name pieces:
name_front, name_back = 'Filtered_AvgProteinCounts_Variant_', '_startGen_14.csv'

# spreadsheet holding the per-variant simulation info used to build the legend strings:
variant_info = pd.read_excel(
    '~/wcEcoli/out/sherlock_data/saved_data_ng_internal_shift1/sherlock_sim1_info.xlsx')

# define function that creates a string with the variant's info
def get_sim1_var_info(var_num):
    """
    Build the legend label for one variant.

    Reads the row at position ``var_num`` of the module-level ``variant_info``
    table and reports its "NG expression", "TE", "analysis sims" and
    "log(NG PCs +1)" entries.

    Args:
        var_num: row position of the variant in ``variant_info``.

    Returns:
        The label string, e.g. "V3 (NG EI: ..., NG TE: ..., Sims: ... $log_{10}$(NG PCs+1): ...)".
    """
    # one-row slice holding this variant's entry:
    row = variant_info.iloc[var_num:var_num + 1]

    # pull out the four reported values, in the order they appear in the label:
    ng_expression, ng_te, n_sims, log10_ng_pcs = (
        row[column].item()
        for column in ("NG expression", "TE", "analysis sims", "log(NG PCs +1)")
    )

    # assemble the string for the legends:
    pieces = [
        "V", str(var_num),
        " (NG EI: ", str(ng_expression),
        ", NG TE: ", str(ng_te),
        ", Sims: ", str(n_sims),
        " $log_{10}$(NG PCs+1): ", str(log10_ng_pcs),
        ")",
    ]
    return "".join(pieces)
    
# normalize the data and convert it to log10 values:
def log10_normalized_data(var_num):
    """
    Load one variant's CSV and express its counts as log10 fractions of the column totals.

    The file name is ``name_front + str(var_num) + name_back`` (module-level
    strings), resolved against the current working directory. The file's second
    column is treated as the control variant and its third column as the
    experimental variant.

    Uses the identity log10(count) - log10(sum of counts) = log10(count / sum of counts).

    Args:
        var_num: variant number used to build the CSV file name.

    Returns:
        A pandas DataFrame with the file's first column plus
        "log10_Control_Variant" and "log10_Experimental_Variant".
    """
    # load the CSV with pandas and hand it to polars:
    csv_name = name_front + str(var_num) + name_back
    counts = pl.DataFrame(pd.read_csv(csv_name))

    # give the two count columns fixed names:
    control_name = counts.columns[1]
    experimental_name = counts.columns[2]
    counts = counts.rename({control_name: "Control_Variant", experimental_name: "Experimental_Variant"})
    key_name = counts.columns[0]

    # log10 of the summed counts for the control and the experimental variant:
    totals = counts.sum()
    log10_control_total = totals["Control_Variant"].log10()
    log10_experimental_total = totals["Experimental_Variant"].log10()

    # log10 of every count minus log10 of its column total:
    normalized = counts.with_columns([
        (pl.col("Control_Variant").log10() - log10_control_total).alias("log10_Control_Variant"),
        (pl.col("Experimental_Variant").log10() - log10_experimental_total).alias("log10_Experimental_Variant"),
    ])

    # keep only the first column and the two normalized columns, as a pandas frame:
    return normalized.select(
        [key_name, "log10_Control_Variant", "log10_Experimental_Variant"]
    ).to_pandas()

# get the data and legend for each sim:
# v1 = pd.read_csv(name_front + '1' + name_back); str1 = get_sim1_var_info(1)
# v2 = pd.read_csv(name_front + '2' + name_back); str2 = get_sim1_var_info(2)
# v3 = pd.read_csv(name_front + '3' + name_back); str3 = get_sim1_var_info(3)
# v4 = pd.read_csv(name_front + '4' + name_back); str4 = get_sim1_var_info(4)
# v5 = pd.read_csv(name_front + '5' + name_back); str5 = get_sim1_var_info(5)
# v6 = pd.read_csv(name_front + '6' + name_back); str6 = get_sim1_var_info(6)
# v7 = pd.read_csv(name_front + '7' + name_back); str7 = get_sim1_var_info(7)
# v8 = pd.read_csv(name_front + '8' + name_back); str8 = get_sim1_var_info(8)
# v9 = pd.read_csv(name_front + '9' + name_back); str9 = get_sim1_var_info(9)
# v10 = pd.read_csv(name_front + '10' + name_back); str10 = get_sim1_var_info(10)
# v11 = pd.read_csv(name_front + '11' + name_back); str11 = get_sim1_var_info(11)
# v12 = pd.read_csv(name_front + '12' + name_back); str12 = get_sim1_var_info(12)
# v13 = pd.read_csv(name_front + '13' + name_back); str13 = get_sim1_var_info(13)
# v14 = pd.read_csv(name_front + '14' + name_back); str14 = get_sim1_var_info(14)
# v15 = pd.read_csv(name_front + '15' + name_back); str15 = get_sim1_var_info(15)
# v16 = pd.read_csv(name_front + '16' + name_back); str16 = get_sim1_var_info(16)
# v17 = pd.read_csv(name_front + '17' + name_back); str17 = get_sim1_var_info(17)
# v18 = pd.read_csv(name_front + '18' + name_back); str18 = get_sim1_var_info(18)
# v19 = pd.read_csv(name_front + '19' + name_back); str19 = get_sim1_var_info(19)
# v20 = pd.read_csv(name_front + '20' + name_back); str20 = get_sim1_var_info(20)

# v1_log10 = pd.read_csv(name_front + '1' + name_back); str1 = get_sim1_var_info(1)
# v2_log10 = pd.read_csv(name_front + '2' + name_back); str2 = get_sim1_var_info(2)
# v3_log10 = pd.read_csv(name_front + '3' + name_back); str3 = get_sim1_var_info(3)
# v4_log10 = pd.read_csv(name_front + '4' + name_back); str4 = get_sim1_var_info(4)
# v5_log10 = pd.read_csv(name_front + '5' + name_back); str5 = get_sim1_var_info(5)
# v6_log10 = pd.read_csv(name_front + '6' + name_back); str6 = get_sim1_var_info(6)
# v7_log10 = pd.read_csv(name_front + '7' + name_back); str7 = get_sim1_var_info(7)
# v8_log10 = pd.read_csv(name_front + '8' + name_back); str8 = get_sim1_var_info(8)
# v9_log10 = pd.read_csv(name_front + '9' + name_back); str9 = get_sim1_var_info(9)
# v10_log10 = pd.read_csv(name_front + '10' + name_back); str10 = get_sim1_var_info(10)
# v11_log10 = pd.read_csv(name_front + '11' + name_back); str11 = get_sim1_var_info(11)
# v12_log10 = pd.read_csv(name_front + '12' + name_back); str12 = get_sim1_var_info(12)
# v13_log10 = pd.read_csv(name_front + '13' + name_back); str13 = get_sim1_var_info(13)
# v14_log10 = pd.read_csv(name_front + '14' + name_back); str14 = get_sim1_var_info(14)
# v15_log10 = pd.read_csv(name_front + '15' + name_back); str15 = get_sim1_var_info(15)
# v16_log10 = pd.read_csv(name_front + '16' + name_back); str16 = get_sim1_var_info(16)
# v17_log10 = pd.read_csv(name_front + '17' + name_back); str17 = get_sim1_var_info(17)
# v18_log10 = pd.read_csv(name_front + '18' + name_back); str18 = get_sim1_var_info(18)
# v19_log10 = pd.read_csv(name_front + '19' + name_back); str19 = get_sim1_var_info(19)
# v20_log10 = pd.read_csv(name_front + '20' + name_back); str20 = get_sim1_var_info(20)

# normalized log10 data (vN_log10) and legend string (strN) for variants 1 through 20:
variant_numbers = range(1, 21)

(v1_log10, v2_log10, v3_log10, v4_log10, v5_log10,
 v6_log10, v7_log10, v8_log10, v9_log10, v10_log10,
 v11_log10, v12_log10, v13_log10, v14_log10, v15_log10,
 v16_log10, v17_log10, v18_log10, v19_log10, v20_log10) = [
    log10_normalized_data(var_num) for var_num in variant_numbers]

(str1, str2, str3, str4, str5,
 str6, str7, str8, str9, str10,
 str11, str12, str13, str14, str15,
 str16, str17, str18, str19, str20) = [
    get_sim1_var_info(var_num) for var_num in variant_numbers]

# dfs = [v1_log10, v2_log10, v3_log10, v4_log10, v5_log10, v6_log10, v7_log10, v8_log10, v9_log10, v10_log10, v11_log10, v12_log10, v13_log10, v14_log10, v15_log10, v16_log10, v17_log10, v18_log10, v19_log10, v20_log10]
# 
# strings = [str1, str2, str3, str4, str5, str6, str7, str8, str9, str10, str11, str12, str13, str14, str15, str16, str17, str18, str19, str20]

## Sort the data by the NG count

In [None]:
# order the variant info by its "log(NG PCs +1)" column, lowest value first:
var_info_sorted = variant_info.sort_values(by="log(NG PCs +1)", ascending=True)

## Manually sort into groups (after viewing the above): 

# NG count 0 vars: 0 (disregard), 4, 9, 14, 19 
# NG count 3.45 to 4.64: 3, 8, 2, 18, 13
# NG count 5.10 to 5.95: 1, 5, 7, 17, 12, 6
# NG count 6.10 to 6.29: 10, 16, 11, 15, 20 

## Plot variants with 0 NG counts

In [None]:
# plot the normalized log10 data for the variants with 0 NG counts (x = control, y = experimental):
plt.figure(figsize=(10, 10))

# plot the data; each series carries its own legend label, because a positional
# plt.legend([...]) list only matches labels to artists by the order matplotlib collects them in:
for var_df, var_str in [(v4_log10, str4), (v9_log10, str9), (v14_log10, str14), (v19_log10, str19)]:
    plt.scatter(var_df.log10_Control_Variant, var_df.log10_Experimental_Variant, s=.6, label=var_str)

# plot a dashed y=x reference line:
yxvals = np.linspace(-10, -1, 100)
yxstr = "y=x"
plt.plot(yxvals, yxvals, linewidth=.5, linestyle="dashed", color="#FF796C", label=yxstr)

plt.legend()
plt.axis('square')
plt.xlabel("$log_{10}$(Control Variant) (w/o New Gene)")
plt.ylabel("$log_{10}$(Experimental Variant) (w/ New Gene)")
plt.title("Normalized protein counts for variants with 0 New Gene counts");

## Plot variants with 3.45 to 4.64 NG counts: 3, 8, 2, 18, 13

In [None]:
# plot the normalized log10 data for variants 3, 8, 2, 18, 13 (x = control, y = experimental):
plt.figure(figsize=(10, 10))

# plot the data; each series carries its own legend label, because a positional
# plt.legend([...]) list only matches labels to artists by the order matplotlib collects them in:
for var_df, var_str in [(v3_log10, str3), (v8_log10, str8), (v2_log10, str2),
                        (v18_log10, str18), (v13_log10, str13)]:
    plt.scatter(var_df.log10_Control_Variant, var_df.log10_Experimental_Variant, s=.6, label=var_str)

# plot a dashed y=x reference line:
yxvals = np.linspace(-10, -1, 100)
yxstr = "y=x"
plt.plot(yxvals, yxvals, linewidth=.5, linestyle="dashed", color="#FF796C", label=yxstr)

plt.legend()
plt.axis('square')
plt.xlabel("$log_{10}$(Control Variant) (w/o New Gene)")
plt.ylabel("$log_{10}$(Experimental Variant) (w/ New Gene)")
plt.title("Normalized protein counts for variants with 3.45 to 4.64 New Gene counts");

## Plot variants with 5.10 to 5.95 NG counts: 1, 5, 7, 17, 12, 6

In [None]:
# plot the normalized log10 data for variants 1, 5, 7, 17, 12, 6 (x = control, y = experimental):
plt.figure(figsize=(10, 10))

# plot the data; each series carries its own legend label, because a positional
# plt.legend([...]) list only matches labels to artists by the order matplotlib collects them in:
for var_df, var_str in [(v1_log10, str1), (v5_log10, str5), (v7_log10, str7),
                        (v17_log10, str17), (v12_log10, str12), (v6_log10, str6)]:
    plt.scatter(var_df.log10_Control_Variant, var_df.log10_Experimental_Variant, s=.6, label=var_str)

# plot a dashed y=x reference line:
yxvals = np.linspace(-9.5, -1, 100)
yxstr = "y=x"
plt.plot(yxvals, yxvals, linewidth=.5, linestyle="dashed", color="#FF796C", label=yxstr)

plt.legend()
plt.axis('square')
plt.xlabel("$log_{10}$(Control Variant) (w/o New Gene)")
plt.ylabel("$log_{10}$(Experimental Variant) (w/ New Gene)")
plt.title("Normalized protein counts for variants with 5.10 to 5.95 New Gene counts");

## Plot variants with 6.10 to 6.29 NG counts: 10, 16, 11, 15, 20

In [None]:
# plot the normalized log10 data for variants 10, 16, 11, 15, 20 (x = control, y = experimental):
plt.figure(figsize=(10, 10))

# plot the data; each series carries its own legend label, because a positional
# plt.legend([...]) list only matches labels to artists by the order matplotlib collects them in:
for var_df, var_str in [(v10_log10, str10), (v16_log10, str16), (v11_log10, str11),
                        (v15_log10, str15), (v20_log10, str20)]:
    plt.scatter(var_df.log10_Control_Variant, var_df.log10_Experimental_Variant, s=.6, label=var_str)

# plot a dashed y=x reference line:
yxvals = np.linspace(-9.5, -1, 100)
yxstr = "y=x"
plt.plot(yxvals, yxvals, linewidth=.5, linestyle="dashed", color="#FF796C", label=yxstr)

plt.legend()
plt.axis('square')
plt.xlabel("$log_{10}$(Control Variant) (w/o New Gene)")
plt.ylabel("$log_{10}$(Experimental Variant) (w/ New Gene)")
plt.title("Normalized protein counts for variants with 6.10 to 6.29 New Gene counts");

In [None]:
# todo: figure out what that super low count is! 
# todo: recreate the full counts graph! 
# todo: consider doing normalization with mass instead of counts

# Average across the four groupings

In [None]:
# make a function that can average:
def average_across_vars(vars=()):
    """
    Average the third column of several variant data frames, row-matched on the first column.

    Each frame is expected to have three columns: a key column, a control
    column and an experimental column (presumably the output of
    log10_normalized_data -- confirm against the caller). The frames are
    inner-joined on the key column, so only keys present in every frame are
    kept. The control column is taken from the first frame only.

    Note that the result is the plain arithmetic mean of whatever values the
    third columns hold; no back-transformation is applied.

    Args:
        vars: non-empty sequence of pandas DataFrames. (The name shadows the
            builtin ``vars`` but is kept so existing keyword callers still work.)

    Returns:
        A 2-D numpy array with three columns: key, control value from the first
        frame, and the row-wise mean of the third columns of all frames. Callers
        index it positionally (e.g. ``result[:, 1]``), so it carries no column names.

    Raises:
        ValueError: if ``vars`` is empty.
    """
    frames = list(vars)
    if not frames:
        raise ValueError("average_across_vars needs at least one variant data frame")

    # start from the first frame, tagging its third column as "Var0":
    first = frames[0]
    key_col, control_col = first.columns[0], first.columns[1]
    merged = first.rename(columns={first.columns[2]: "Var0"})

    # attach the third column of every other frame as "Var<i>", keeping only shared keys:
    for i, frame in enumerate(frames[1:], start=1):
        frame_key = frame.columns[0]
        third_col = frame.columns[2]
        tagged = frame[[frame_key, third_col]].rename(columns={third_col: "Var" + str(i)})
        merged = merged.merge(tagged, on=frame_key, how="inner")

    # columns passed through unchanged:
    kept = merged[[key_col, control_col]]

    # take the average across rows of everything except the key and control columns:
    row_means = merged.drop(columns=[key_col, control_col]).mean(axis=1)

    # todo: check I am returning the correct average!
    return np.hstack((kept.to_numpy(), row_means.to_numpy().reshape(-1, 1)))


# average each of the four manually chosen groups (same membership as the per-group plots):
# NG count 0 vars: 4, 9, 14, 19
df0 = average_across_vars([v4_log10, v9_log10, v14_log10, v19_log10])
# NG count 3.45 to 4.64: 3, 8, 2, 18, 13
df4 = average_across_vars([v3_log10, v8_log10, v2_log10, v18_log10, v13_log10])
# NG count 5.10 to 5.95: 1, 5, 7, 17, 12, 6 (variant 12 was previously left out of this average)
df5 = average_across_vars([v1_log10, v5_log10, v7_log10, v17_log10, v12_log10, v6_log10])
# NG count 6.10 to 6.29: 10, 16, 11, 15, 20
df6 = average_across_vars([v10_log10, v16_log10, v11_log10, v15_log10, v20_log10])


In [None]:

# plot the group-averaged log10 data (x = control, y = experimental):
plt.figure(figsize=(10, 10))

# legend strings for the four groups:
str_avg0 = "Vars w/ 0 NG PCs"
str_avg4 = "Vars w/ 3.45 to 4.64 $log_{10}$(NG PCs+1)"
str_avg5 = "Vars w/ 5.10 to 5.95 $log_{10}$(NG PCs+1)"
str_avg6 = "Vars w/ 6.10 to 6.29 $log_{10}$(NG PCs+1)"

# plot the data (column 1 = control, column 2 = averaged experimental); each series carries
# its own legend label, because a positional plt.legend([...]) list only matches labels to
# artists by the order matplotlib collects them in:
for avg_data, avg_str in [(df0, str_avg0), (df4, str_avg4), (df5, str_avg5), (df6, str_avg6)]:
    plt.scatter(avg_data[:, 1], avg_data[:, 2], s=.6, label=avg_str)

# plot a dashed y=x reference line:
yxvals = np.linspace(-9.5, -1, 100)
yxstr = "y=x"
plt.plot(yxvals, yxvals, linewidth=.5, linestyle="dashed", color="#FF796C", label=yxstr)

plt.legend()
plt.axis('square')
plt.xlabel("$log_{10}$(Control Variant) (w/o New Gene)")
plt.ylabel("$log_{10}$(Experimental Variant) (w/ New Gene)")
plt.title("Normalized protein counts for variants with varying levels of New Gene counts");