In [None]:
import pickle
import os
import pandas as pd
from matplotlib import pyplot as plt
os.chdir(os.path.expanduser('~/wcEcoli/out/'))
# noinspection PyUnresolvedReferences
import numpy as np
import plotly.graph_objects as go
from models.ecoli.analysis import cohortAnalysisPlot
from wholecell.analysis.analysis_tools import (exportFigure,
	read_bulk_molecule_counts, read_stacked_bulk_molecules, read_stacked_columns)
from wholecell.io.tablereader import TableReader
from sklearn.metrics import r2_score

In [None]:
# Load the published protease-assignment table (Gupta et al. 2024, file "ST2") that is
# used to find the monomer IDs that are substrates of the proteases in the CLClim0NE model:
protease_data = pd.read_excel('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/supplementary_data/Gupta_et_al_2024_ST2_protease_assignment_data.xlsx')

# read in the unfiltered data:
# NOTE(review): the path points at the Clim1/CLClim1NE cohort although the comments and
# variable names say CLClim0NE -- confirm which model this file belongs to.
CLClimNE_log_data_with_proteases = pd.read_csv('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/Clim_sorting_combos/Clim1/CLClim1NE/CLCLim1NE_cohort_saved_protein_count_data/unfiltered_data/log_data/LogAvgProteinCounts_startGen_2.csv')

# add the gene symbols to the dataframe.
# NOTE(review): get_gene_symbols_for_monomer_ids is neither defined nor imported in the
# visible cells; it has to exist in the kernel before this cell runs.
# The mapping is built once and reused for every row instead of being rebuilt per row.
gene_symbol_by_monomer_id = get_gene_symbols_for_monomer_ids()
# x[:-3] drops the last three characters of the monomer ID before the lookup
# (presumably a compartment tag such as "[c]" -- TODO confirm).
CLClimNE_log_data_with_proteases['gene symbol'] = CLClimNE_log_data_with_proteases['Monomer ID'].apply(lambda x: gene_symbol_by_monomer_id[x[:-3]])
# every protein starts out unassigned; a later cell overwrites the matches
CLClimNE_log_data_with_proteases['protease assignement'] = "NA"
CLClimNE_log_data_with_proteases

In [None]:
# determine which monomer IDs are protein substrates of the proteases in the CLClim0NE model:
gene_ids = CLClimNE_log_data_with_proteases['gene symbol']

# One protease assignment per gene name. When a gene name is listed more than once in
# the protease table, the first row wins.
first_assignment_by_gene = (
    protease_data
    .drop_duplicates(subset='Gene name', keep='first')
    .set_index('Gene name')['Protease assignment']
)

# A protein is a substrate when its gene symbol appears in the protease table.
is_protease_substrate = gene_ids.isin(protease_data['Gene name'])
CLClim0NE_full_protease_substrate_gene_symbols = gene_ids[is_protease_substrate].tolist()

# Overwrite the assignment only for the substrates; all other rows keep their current value.
CLClimNE_log_data_with_proteases['protease assignement'] = (
    CLClimNE_log_data_with_proteases['protease assignement']
    .mask(is_protease_substrate, gene_ids.map(first_assignment_by_gene))
)

print("Total Protease Substrates in CLClim0NE (" +str(len(CLClim0NE_full_protease_substrate_gene_symbols))+"): ", CLClim0NE_full_protease_substrate_gene_symbols)
# Differences in the numbers (308 vs. 236) are likely due to the fact that the strain of E. coli in the paper might differ from the one in our model.
CLClimNE_log_data_with_proteases

In [None]:
# Split the annotated table into one dataframe per protease-assignment category.
# The labels below are the exact strings stored in the 'protease assignement' column.
assignment_labels = CLClimNE_log_data_with_proteases['protease assignement']

# proteins assigned to a single protease in the model (Lon, ClpP or HslV):
lon_proteins = CLClimNE_log_data_with_proteases.loc[assignment_labels == 'Lon only']
clp_proteins = CLClimNE_log_data_with_proteases.loc[assignment_labels == 'ClpP only']
hslv_proteins = CLClimNE_log_data_with_proteases.loc[assignment_labels == 'HslV only']

# proteins that are additive / redundant in the model:
additive_proteins = CLClimNE_log_data_with_proteases.loc[assignment_labels == 'Additive']
redundant_proteins = CLClimNE_log_data_with_proteases.loc[assignment_labels == 'Redundant']

# proteins assigned as "unexplained" in the model:
unexplained_proteins = CLClimNE_log_data_with_proteases.loc[assignment_labels == 'Actively degrading in Tripple KO']

# proteins that were not assigned to a protease in the model:
no_protease_proteins = CLClimNE_log_data_with_proteases.loc[assignment_labels == 'NA']

# todo: note that these numbers will not add up to those in the graph below because some
# proteins in the CLClim0NE model are not in the CLNE model
# 298 total
# Report the size and gene symbols of every category; the trailing comments are the
# counts recorded by the author.
for category_name, category_proteins in [
    ("Lon", lon_proteins),  # 14
    ("ClpP", clp_proteins),  # 63
    ("HslV", hslv_proteins),  # 1
    ("Additive", additive_proteins),  # 79
    ("Redundant", redundant_proteins),  # 39
    ("Unexplained", unexplained_proteins),  # 102
]:
    print(f"{category_name} Proteins({len(category_proteins)}): ", category_proteins['gene symbol'].tolist())
# The unassigned proteins (4012) are intentionally not printed:
#print("No Protease Proteins(" +str(len(no_protease_proteins))+"): ", list(no_protease_proteins['gene symbol'])) # 4012


In [None]:
## Create a scatter plot of CLNE counts against CLClim0NE counts, coloring the proteins by their protease assignment in the CLClim0NE model
# create the plot!
# TODO: do I need to normalize the data? How do I make the plot exactly square and center the figure title? (plotly hints: `title_x=0.5` in `update_layout` centers the title; `fig.update_yaxes(scaleanchor="x", scaleratio=1)` locks both axes to the same scale)
def CLNE_CLClimNE_plot_protease(fig, dataframe, protease=0):
    """Add one marker trace of CLNE vs. CLClim0NE log10 protein counts to ``fig``.

    Args:
        fig: figure the trace is appended to through ``fig.add_trace``.
        dataframe: table providing the columns "Log10 CLNE Protein Counts" (x),
            "Log10 CLClim0NE Protein Counts" (y) and "gene symbol" (hover text).
            Its number of rows is shown in parentheses in the legend entry.
        protease: integer code choosing the legend label and marker style:
            0 = no protease, 1 = unexplained, 2 = redundant, 3 = additive,
            4 = Lon, 5 = ClpP, 6 = HslV. Any other value adds no trace.

    Returns:
        None. ``fig`` is modified in place.
    """
    x = dataframe["Log10 CLNE Protein Counts"]
    y = dataframe["Log10 CLClim0NE Protein Counts"]
    hovertext = dataframe["gene symbol"]

    # protease code -> (legend label, marker color, marker size, marker opacity)
    trace_styles = {
        0: ("No Protease", 'orange', 3, .5),
        1: ("Unexplained", 'deeppink', 3, .5),
        2: ("Redundant", 'lightseagreen', 5, .9),
        3: ("Additive", 'blue', 5, .9),
        4: ("Lon", 'red', 5, .9),
        5: ("ClpP", 'green', 5, .9),
        6: ("Hslv", 'yellow', 5, .9),
    }
    if protease not in trace_styles:
        # unknown codes are ignored silently
        return

    label, color, size, opacity = trace_styles[protease]
    fig.add_trace(go.Scatter(
        x=x,
        y=y,
        hovertext=hovertext,
        mode='markers',
        name=f"{label} ({np.shape(dataframe)[0]})",
        marker=dict(color=color, size=size, opacity=opacity),
    ))



# Create figure
fig = go.Figure()

# Load the published protease assignments used to find the monomer IDs that are
# substrates of the proteases in the CLClim0NE model:
protease_data = pd.read_excel('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/supplementary_data/Gupta_et_al_2024_ST2_protease_assignment_data.xlsx')

# read in the unfiltered data (the names are simply rebound, no `del` of an earlier table is needed):
CLClimNE_log_data_with_proteases = pd.read_csv('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/Clim_sorting_combos/Clim1/CLClim1NE/CLCLim1NE_cohort_saved_protein_count_data/unfiltered_data/log_data/LogAvgProteinCounts_startGen_2.csv')

CLNE_log_data = pd.read_csv('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/previous_models/CLNE_files/cohort_saved_protein_count_data_10_09/unfiltered_data/log_data/LogAvgProteinCounts_startGen_2.csv')

# merge the two dataframes: give each count column a model-specific name, index both
# tables by "Monomer ID" and keep only the monomers present in both (inner join).
CLNE_log_data = (
    CLNE_log_data
    .rename(columns={"Log10 Average Protein Count": "Log10 CLNE Protein Counts"})
    [["Monomer ID", "Log10 CLNE Protein Counts"]]
    .set_index("Monomer ID")
)
CLClimNE_log_data_with_proteases = (
    CLClimNE_log_data_with_proteases
    .rename(columns={"Log10 Average Protein Count": "Log10 CLClim0NE Protein Counts"})
    [["Monomer ID", "Log10 CLClim0NE Protein Counts"]]
    .set_index("Monomer ID")
)
CLNE_CLClimNE_log_data_proteases = (
    CLNE_log_data
    .join(CLClimNE_log_data_with_proteases, how="inner")
    .reset_index()
)

# add the gene symbols to the dataframe.
# NOTE(review): get_gene_symbols_for_monomer_ids is neither defined nor imported in the
# visible cells; it has to exist in the kernel before this cell runs.
# The mapping is built once and reused for every row instead of being rebuilt per row.
gene_symbol_by_monomer_id = get_gene_symbols_for_monomer_ids()
# x[:-3] drops the last three characters of the monomer ID before the lookup
# (presumably a compartment tag such as "[c]" -- TODO confirm).
CLNE_CLClimNE_log_data_proteases['gene symbol'] = CLNE_CLClimNE_log_data_proteases['Monomer ID'].apply(lambda x: gene_symbol_by_monomer_id[x[:-3]])

# determine which monomer IDs are protein substrates of the proteases in the CLClim0NE model.
# One assignment per gene name; when a gene name is listed more than once the first row wins.
gene_ids = CLNE_CLClimNE_log_data_proteases['gene symbol']
first_assignment_by_gene = (
    protease_data
    .drop_duplicates(subset='Gene name', keep='first')
    .set_index('Gene name')['Protease assignment']
)
is_protease_substrate = gene_ids.isin(protease_data['Gene name'])
# genes missing from the protease table are labelled "NA"
CLNE_CLClimNE_log_data_proteases['protease assignement'] = (
    gene_ids.map(first_assignment_by_gene).where(is_protease_substrate, "NA")
)

# one independent table per protease-assignment category (labels are the exact strings
# stored in the 'protease assignement' column):
protease_assignment = CLNE_CLClimNE_log_data_proteases['protease assignement']
CLClimNE_log_data_with_Lon_proteases = CLNE_CLClimNE_log_data_proteases[protease_assignment == 'Lon only'].copy()
CLClimNE_log_data_with_Clp_proteases = CLNE_CLClimNE_log_data_proteases[protease_assignment == 'ClpP only'].copy()
CLClimNE_log_data_with_HslV_proteases = CLNE_CLClimNE_log_data_proteases[protease_assignment == 'HslV only'].copy()
CLClimNE_log_data_with_Additive_proteases = CLNE_CLClimNE_log_data_proteases[protease_assignment == 'Additive'].copy()
CLClimNE_log_data_with_Redundant_proteases = CLNE_CLClimNE_log_data_proteases[protease_assignment == 'Redundant'].copy()
CLClimNE_log_data_with_Unexplained_proteases = CLNE_CLClimNE_log_data_proteases[protease_assignment == 'Actively degrading in Tripple KO'].copy()
CLClimNE_log_data_with_no_protease_proteases = CLNE_CLClimNE_log_data_proteases[protease_assignment == 'NA'].copy()

# now plot!

# Add one scatter trace per category. Traces are added in this order, so the large
# "no protease" group goes first and the small single-protease groups last.
CLNE_CLClimNE_plot_protease(fig, CLClimNE_log_data_with_no_protease_proteases, 0)
CLNE_CLClimNE_plot_protease(fig, CLClimNE_log_data_with_Unexplained_proteases, 1)
CLNE_CLClimNE_plot_protease(fig, CLClimNE_log_data_with_Redundant_proteases, 2)
CLNE_CLClimNE_plot_protease(fig, CLClimNE_log_data_with_Additive_proteases, 3)
CLNE_CLClimNE_plot_protease(fig, CLClimNE_log_data_with_Lon_proteases, 4)
CLNE_CLClimNE_plot_protease(fig, CLClimNE_log_data_with_Clp_proteases, 5)
CLNE_CLClimNE_plot_protease(fig, CLClimNE_log_data_with_HslV_proteases, 6)

# add a y=x reference line (drawn from 0 to 6 on both log10 axes)
fig.add_trace(go.Scatter(x=[0, 6], y=[0, 6], mode="lines",
    line=go.scatter.Line(color="black", dash="dash"), opacity=0.2,
    name="y=x"))

# Update layout
fig.update_layout(
    title="Protein Counts from the New WCM plotted <br>against those from the Original WCM",
    xaxis_title="log10(Original WCM Protein Counts)",
    yaxis_title="log10(New WCM Protein Counts)",
    autosize=False,
    width=900,
    height=600,
    showlegend=True,
    legend_title="Protease Assignments")


# Show figure
fig.show()


In [None]:
## Create a scatter plot of CLNE counts against CLClim0NE counts, coloring the proteins by their protease assignment in the CLClim0NE model
# create the plot!
# TODO: do I need to normalize the data? How do I make the plot exactly square and center the figure title? (plotly hints: `title_x=0.5` in `update_layout` centers the title; `fig.update_yaxes(scaleanchor="x", scaleratio=1)` locks both axes to the same scale)
def CLNE_CLClimNE_plot_protease(fig, dataframe, protease=0):
    """Add one marker trace of CLNE vs. CLClim0NE log10 protein counts to ``fig``.

    NOTE: a function with this name is also defined in the previous plotting
    cell; running this cell shadows that earlier definition.

    Args:
        fig: figure the trace is appended to through ``fig.add_trace``.
        dataframe: table providing the columns "Log10 CLNE Protein Counts" (x),
            "Log10 CLClim0NE Protein Counts" (y) and "gene symbol" (hover text).
            Its number of rows is shown in parentheses in the legend entry.
        protease: integer code choosing the legend label and marker style:
            0 = no protease, 1 = unexplained, 2 = redundant, 3 = additive,
            4 = Lon, 5 = ClpP, 6 = HslV. Any other value adds no trace.

    Returns:
        None. ``fig`` is modified in place.
    """
    x = dataframe["Log10 CLNE Protein Counts"]
    y = dataframe["Log10 CLClim0NE Protein Counts"]
    hovertext = dataframe["gene symbol"]

    # protease code -> (legend label, marker color, marker size, marker opacity)
    trace_styles = {
        0: ("No Protease", 'orange', 3, .5),
        1: ("Unexplained", 'deeppink', 3, .5),
        2: ("Redundant", 'lightseagreen', 5, .9),
        3: ("Additive", 'blue', 5, .9),
        4: ("Lon", 'red', 5, .9),
        5: ("ClpP", 'green', 5, .9),
        6: ("Hslv", 'yellow', 5, .9),
    }
    if protease not in trace_styles:
        # unknown codes are ignored silently
        return

    label, color, size, opacity = trace_styles[protease]
    fig.add_trace(go.Scatter(
        x=x,
        y=y,
        hovertext=hovertext,
        mode='markers',
        name=f"{label} ({np.shape(dataframe)[0]})",
        marker=dict(color=color, size=size, opacity=opacity),
    ))



# Create figure
fig = go.Figure()

# Load the priority protease assignments used to find the monomer IDs that are
# substrates of the proteases in the CLClim0NE model.
# The path is written relative to the home directory, like every other load in this notebook.
# NOTE(review): the file has a .tsv extension but is parsed with the default comma
# separator; if it really is tab-delimited, pass sep="\t" -- confirm against the file.
priority_protease_data = pd.read_csv('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/supplementary_data/priority_protease_assignments_0.tsv')

# read in the unfiltered data (the names are simply rebound, no `del` of an earlier table is needed):
CLClimNE_log_data_with_proteases = pd.read_csv('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/Clim_sorting_combos/Clim1/CLClim1NE/CLCLim1NE_cohort_saved_protein_count_data/unfiltered_data/log_data/LogAvgProteinCounts_startGen_2.csv')

CLNE_log_data = pd.read_csv('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/previous_models/CLNE_files/cohort_saved_protein_count_data_10_09/unfiltered_data/log_data/LogAvgProteinCounts_startGen_2.csv')

# merge the two dataframes: give each count column a model-specific name, index both
# tables by "Monomer ID" and keep only the monomers present in both (inner join).
CLNE_log_data = (
    CLNE_log_data
    .rename(columns={"Log10 Average Protein Count": "Log10 CLNE Protein Counts"})
    [["Monomer ID", "Log10 CLNE Protein Counts"]]
    .set_index("Monomer ID")
)
CLClimNE_log_data_with_proteases = (
    CLClimNE_log_data_with_proteases
    .rename(columns={"Log10 Average Protein Count": "Log10 CLClim0NE Protein Counts"})
    [["Monomer ID", "Log10 CLClim0NE Protein Counts"]]
    .set_index("Monomer ID")
)
CLNE_CLClimNE_log_data_proteases = (
    CLNE_log_data
    .join(CLClimNE_log_data_with_proteases, how="inner")
    .reset_index()
)

# add the gene symbols to the dataframe.
# NOTE(review): get_gene_symbols_for_monomer_ids is neither defined nor imported in the
# visible cells; it has to exist in the kernel before this cell runs.
# The mapping is built once and reused for every row instead of being rebuilt per row.
gene_symbol_by_monomer_id = get_gene_symbols_for_monomer_ids()
# x[:-3] drops the last three characters of the monomer ID before the lookup
# (presumably a compartment tag such as "[c]" -- TODO confirm).
CLNE_CLClimNE_log_data_proteases['gene symbol'] = CLNE_CLClimNE_log_data_proteases['Monomer ID'].apply(lambda x: gene_symbol_by_monomer_id[x[:-3]])

# determine which monomer IDs are protein substrates of the proteases in the CLClim0NE model.
# One assignment per gene name; when a gene name is listed more than once the first row wins.
gene_ids = CLNE_CLClimNE_log_data_proteases['gene symbol']
first_assignment_by_gene = (
    priority_protease_data
    .drop_duplicates(subset='Gene name', keep='first')
    .set_index('Gene name')['Protease assignment']
)
is_protease_substrate = gene_ids.isin(priority_protease_data['Gene name'])
# genes missing from the priority table are labelled "NA"
CLNE_CLClimNE_log_data_proteases['protease assignement'] = (
    gene_ids.map(first_assignment_by_gene).where(is_protease_substrate, "NA")
)

# one independent table per protease-assignment category (labels are the exact strings
# stored in the 'protease assignement' column):
protease_assignment = CLNE_CLClimNE_log_data_proteases['protease assignement']
CLClimNE_log_data_with_Lon_proteases = CLNE_CLClimNE_log_data_proteases[protease_assignment == 'Lon only'].copy()
CLClimNE_log_data_with_Clp_proteases = CLNE_CLClimNE_log_data_proteases[protease_assignment == 'ClpP only'].copy()
CLClimNE_log_data_with_HslV_proteases = CLNE_CLClimNE_log_data_proteases[protease_assignment == 'HslV only'].copy()
CLClimNE_log_data_with_Additive_proteases = CLNE_CLClimNE_log_data_proteases[protease_assignment == 'Additive'].copy()
CLClimNE_log_data_with_Redundant_proteases = CLNE_CLClimNE_log_data_proteases[protease_assignment == 'Redundant'].copy()
CLClimNE_log_data_with_Unexplained_proteases = CLNE_CLClimNE_log_data_proteases[protease_assignment == 'Actively degrading in Tripple KO'].copy()
CLClimNE_log_data_with_no_protease_proteases = CLNE_CLClimNE_log_data_proteases[protease_assignment == 'NA'].copy()

# now plot!

# Add one scatter trace per category. Traces are added in this order, so the large
# "no protease" group goes first and the small single-protease groups last.
CLNE_CLClimNE_plot_protease(fig, CLClimNE_log_data_with_no_protease_proteases, 0)
CLNE_CLClimNE_plot_protease(fig, CLClimNE_log_data_with_Unexplained_proteases, 1)
CLNE_CLClimNE_plot_protease(fig, CLClimNE_log_data_with_Redundant_proteases, 2)
CLNE_CLClimNE_plot_protease(fig, CLClimNE_log_data_with_Additive_proteases, 3)
CLNE_CLClimNE_plot_protease(fig, CLClimNE_log_data_with_Lon_proteases, 4)
CLNE_CLClimNE_plot_protease(fig, CLClimNE_log_data_with_Clp_proteases, 5)
CLNE_CLClimNE_plot_protease(fig, CLClimNE_log_data_with_HslV_proteases, 6)

# add a y=x reference line (drawn from 0 to 6 on both log10 axes)
fig.add_trace(go.Scatter(x=[0, 6], y=[0, 6], mode="lines",
    line=go.scatter.Line(color="black", dash="dash"), opacity=0.2,
    name="y=x"))

# Update layout
fig.update_layout(
    title="Protein Counts from the New WCM plotted <br>against those from the Original WCM",
    xaxis_title="log10(Original WCM Protein Counts)",
    yaxis_title="log10(New WCM Protein Counts)",
    autosize=False,
    width=900,
    height=600,
    showlegend=True,
    legend_title="Protease Assignments")


# Show figure
fig.show()
