# CLClim0NE: Visualize the results of the CLClim0NE protein outputs compared to those in the original model

In [3]:
import pickle
import os
import pandas as pd
from matplotlib import pyplot as plt
os.chdir(os.path.expanduser('~/wcEcoli/out/'))
# noinspection PyUnresolvedReferences
import numpy as np
import plotly.graph_objects as go
from models.ecoli.analysis import cohortAnalysisPlot
from wholecell.analysis.analysis_tools import (exportFigure,
	read_bulk_molecule_counts, read_stacked_bulk_molecules, read_stacked_columns)
from wholecell.io.tablereader import TableReader
from sklearn.metrics import r2_score

# Load in the simulation data for the original model (CLNE) and the CLClim0NE model

In [20]:
# CLNE (original model).
# TODO: explain how this data was produced (running a simulation, then running the cohort save-data analysis).
# All paths are relative to the working directory set by os.chdir in the imports cell.
# VS_* tables are read from the Schmidt comparison files, VW_* tables from the Wisniewski comparison files.

VS_log_CLNE = pd.read_csv('PDR_CLNE/wildtype_000000/cohort_saved_protein_count_data/validation_data/log_data/Log10_Schmidt_Comparison_startGen_2.csv')
VW_log_CLNE = pd.read_csv('PDR_CLNE/wildtype_000000/cohort_saved_protein_count_data/validation_data/log_data/Log10_Wisniewski_Comparison_startGen_2.csv')

# CLClimNE output directory; these tables are plotted below under the label stored in `current_sequence`.
VS_log_CLClimNE = pd.read_csv('CLClimNE/wildtype_000000/cohort_saved_protein_count_data/validation_data/log_data/Log10_Schmidt_Comparison_startGen_2.csv')
VW_log_CLClimNE = pd.read_csv('CLClimNE/wildtype_000000/cohort_saved_protein_count_data/validation_data/log_data/Log10_Wisniewski_Comparison_startGen_2.csv')

In [27]:
# Specify the current sequence being used; this label is interpolated into the plot titles, axis labels and legend entries below.
current_sequence = 'CLClim0NE'
# NOTE(review): Clim_name is not referenced in any of the visible cells - confirm it is still needed.
Clim_name = 'Clim0'

# Schmidt Validation Data Comparisons


Plot #1: Compare the Schmidt Validation Protein Counts for the CLNE and CLClim0NE models

In [11]:
# Create scatter plot of Schmidt Validation Protein Counts vs CLNE and CLClimNE Counts
def add_scatter_and_trendline(fig, df, name):
    """Append a marker trace and its linear trendline to ``fig``.

    Args:
        fig: plotly figure; two traces are added to it in place.
        df: table providing the "Log10 Validation Schmidt Counts" (x),
            "Log10 Simulated Schmidt Counts" (y) and "Monomer ID"
            (hover text) columns.
        name: legend label of the marker trace; it also prefixes the
            legend label of the trendline.
    """
    validation_counts = df["Log10 Validation Schmidt Counts"]
    simulated_counts = df["Log10 Simulated Schmidt Counts"]

    # One marker per row, labelled on hover with the monomer ID
    points = go.Scatter(
        x=validation_counts,
        y=simulated_counts,
        hovertext=df["Monomer ID"],
        mode='markers',
        name=name,
    )
    fig.add_trace(points)

    # Degree-1 least-squares fit, evaluated at the observed x values
    fit = np.poly1d(np.polyfit(validation_counts, simulated_counts, 1))
    fig.add_trace(go.Scatter(
        x=validation_counts,
        y=fit(validation_counts),
        mode='lines',
        name=f'{name} Trendline: {fit}',
    ))

# Start from an empty figure
fig = go.Figure()

# One scatter + trendline pair per model
for model_frame, trace_label in (
        (VS_log_CLNE, 'CLNE vs Schmidt'),
        (VS_log_CLClimNE, f'{current_sequence} vs Schmidt')):
    add_scatter_and_trendline(fig, model_frame, trace_label)

# Shrink the markers of the traces added so far
fig.update_traces(marker_size=3)

# Titles first, then the fixed canvas size
fig.update_layout(
    title=f"Schmidt Validation Protein Counts vs CLNE and {current_sequence}",
    xaxis_title="log10(Schmidt Validation Protein Counts)",
    yaxis_title=f"log10(CLNE, {current_sequence})",
)
fig.update_layout(autosize=False, width=900, height=600)

# Faint dashed y=x reference line (added after the marker resize above)
identity_line = go.Scatter(
    x=[0, 6], y=[0, 6], mode="lines",
    line=go.scatter.Line(color="black", dash="dash"),
    opacity=0.2,
    name="y=x")
fig.add_trace(identity_line)

# Render the figure
fig.show()

Plot #2: Compare the Schmidt Validation Protein Counts for the CLNE and CLClim0NE models, keeping only proteins whose log10 counts are above 2 (i.e. more than 100 copies) in both the validation and the simulated data

In [13]:
# Plot the simulated protein counts against the Schmidt validation counts. The calling code below keeps only
# proteins whose log10 counts exceed 2 (lower-count proteins are likely to be subgenerational proteins).
# NOTE(review): this re-defines add_scatter_and_trendline and shadows the definition from the previous plot cell.
def add_scatter_and_trendline(fig, df, name):
    """Append a marker trace and its linear trendline to ``fig``.

    Args:
        fig: plotly figure; two traces are added to it in place.
        df: table providing the "Log10 Validation Schmidt Counts" (x),
            "Log10 Simulated Schmidt Counts" (y) and "Monomer ID"
            (hover text) columns.
        name: legend label of the marker trace; it also prefixes the
            legend label of the trendline.
    """
    validation_counts = df["Log10 Validation Schmidt Counts"]
    simulated_counts = df["Log10 Simulated Schmidt Counts"]

    # Markers, labelled on hover with the monomer ID
    points = go.Scatter(
        x=validation_counts,
        y=simulated_counts,
        hovertext=df["Monomer ID"],
        mode='markers',
        name=name,
    )
    fig.add_trace(points)

    # Straight-line fit through the points, evaluated at the observed x values
    fit = np.poly1d(np.polyfit(validation_counts, simulated_counts, 1))
    fig.add_trace(go.Scatter(
        x=validation_counts,
        y=fit(validation_counts),
        mode='lines',
        name=f'{name} Trendline: {fit}',
    ))

# Create figure
fig = go.Figure()

# Keep only proteins whose log10 count exceeds the threshold in BOTH the
# validation and the simulated data.
# NOTE(review): the cutoff is applied to the log10 values, i.e. it keeps
# counts > 10**2 = 100, not counts > log10(2) ~ 0.3 - confirm that this is the
# intended cutoff for excluding subgenerational proteins.
LOG10_COUNT_THRESHOLD = 2
VS_log_CLNE_filtered = VS_log_CLNE[
    (VS_log_CLNE["Log10 Validation Schmidt Counts"] > LOG10_COUNT_THRESHOLD)
    & (VS_log_CLNE["Log10 Simulated Schmidt Counts"] > LOG10_COUNT_THRESHOLD)]
VS_log_CLClimNE_filtered = VS_log_CLClimNE[
    (VS_log_CLClimNE["Log10 Validation Schmidt Counts"] > LOG10_COUNT_THRESHOLD)
    & (VS_log_CLClimNE["Log10 Simulated Schmidt Counts"] > LOG10_COUNT_THRESHOLD)]

# Add scatter plots and trendlines for each dataset
add_scatter_and_trendline(fig, VS_log_CLNE_filtered, 'CLNE vs Schmidt')
add_scatter_and_trendline(fig, VS_log_CLClimNE_filtered, (current_sequence + ' vs Schmidt'))

# Update layout; the title states the filter exactly as it is applied above
fig.update_traces(marker_size=3)
fig.update_layout(
    title=(f"Schmidt Validation Protein Counts vs CLNE and {current_sequence} "
           f"(filtered for log10(counts) > {LOG10_COUNT_THRESHOLD})"),
    xaxis_title="log10(Schmidt Validation Protein Counts)",
    yaxis_title=f"log10(CLNE, {current_sequence})",
    autosize=False,
    width=900,
    height=600
)

# Add a y=x reference line that starts at the filter threshold
fig.add_trace(go.Scatter(x=[LOG10_COUNT_THRESHOLD, 6], y=[LOG10_COUNT_THRESHOLD, 6], mode="lines",
        line=go.scatter.Line(color="black", dash="dash"), opacity=0.2,
        name="y=x"))

# Show figure
fig.show()

# Wisniewski Validation Data Comparisons

Plot #3: Compare the Wisniewski Validation Protein Counts for the CLNE and CLClim0NE models

In [14]:
# Create scatter plot of Wisniewski Validation Protein Counts vs CLNE and CLClimNE Counts
# NOTE(review): this re-defines add_scatter_and_trendline (now reading the Wisniewski columns) and shadows the Schmidt version defined above.
def add_scatter_and_trendline(fig, df, name):
    """Append a marker trace and its linear trendline to ``fig``.

    Args:
        fig: plotly figure; two traces are added to it in place.
        df: table providing the "Log10 Validation Wisniewski Counts" (x),
            "Log10 Simulated Wisniewski Counts" (y) and "Monomer ID"
            (hover text) columns.
        name: legend label of the marker trace; it also prefixes the
            legend label of the trendline.
    """
    validation_counts = df["Log10 Validation Wisniewski Counts"]
    simulated_counts = df["Log10 Simulated Wisniewski Counts"]

    # One marker per row, labelled on hover with the monomer ID
    points = go.Scatter(
        x=validation_counts,
        y=simulated_counts,
        hovertext=df["Monomer ID"],
        mode='markers',
        name=name,
    )
    fig.add_trace(points)

    # Degree-1 least-squares fit, evaluated at the observed x values
    fit = np.poly1d(np.polyfit(validation_counts, simulated_counts, 1))
    fig.add_trace(go.Scatter(
        x=validation_counts,
        y=fit(validation_counts),
        mode='lines',
        name=f'{name} Trendline: {fit}',
    ))

# Start from an empty figure
fig = go.Figure()

# One scatter + trendline pair per model
for model_frame, trace_label in (
        (VW_log_CLNE, 'CLNE vs Wisniewski'),
        (VW_log_CLClimNE, f'{current_sequence} vs Wisniewski')):
    add_scatter_and_trendline(fig, model_frame, trace_label)

# Shrink the markers of the traces added so far
fig.update_traces(marker_size=3)

# Titles first, then the fixed canvas size
fig.update_layout(
    title=f"Wisniewski Validation Protein Counts vs CLNE and {current_sequence}",
    xaxis_title="log10(Wisniewski Validation Protein Counts)",
    yaxis_title=f"log10(CLNE, {current_sequence})",
)
fig.update_layout(autosize=False, width=900, height=600)

# Faint dashed y=x reference line (added after the marker resize above)
identity_line = go.Scatter(
    x=[0, 6], y=[0, 6], mode="lines",
    line=go.scatter.Line(color="black", dash="dash"),
    opacity=0.2,
    name="y=x")
fig.add_trace(identity_line)

# Render the figure
fig.show()

Plot #4: Compare the Wisniewski Validation Protein Counts for the CLNE and CLClim0NE models, keeping only proteins whose log10 counts are above 2 (i.e. more than 100 copies) in both the validation and the simulated data

In [17]:
# Create scatter plot of Wisniewski Validation Protein Counts vs CLNE and CLClimNE Counts
# NOTE(review): this is another re-definition of add_scatter_and_trendline; it reads the Wisniewski columns.
def add_scatter_and_trendline(fig, df, name):
    """Append a marker trace and its linear trendline to ``fig``.

    Args:
        fig: plotly figure; two traces are added to it in place.
        df: table providing the "Log10 Validation Wisniewski Counts" (x),
            "Log10 Simulated Wisniewski Counts" (y) and "Monomer ID"
            (hover text) columns.
        name: legend label of the marker trace; it also prefixes the
            legend label of the trendline.
    """
    validation_counts = df["Log10 Validation Wisniewski Counts"]
    simulated_counts = df["Log10 Simulated Wisniewski Counts"]

    # Markers, labelled on hover with the monomer ID
    points = go.Scatter(
        x=validation_counts,
        y=simulated_counts,
        hovertext=df["Monomer ID"],
        mode='markers',
        name=name,
    )
    fig.add_trace(points)

    # Straight-line fit through the points, evaluated at the observed x values
    fit = np.poly1d(np.polyfit(validation_counts, simulated_counts, 1))
    fig.add_trace(go.Scatter(
        x=validation_counts,
        y=fit(validation_counts),
        mode='lines',
        name=f'{name} Trendline: {fit}',
    ))

# Create figure
fig = go.Figure()

# Keep only proteins whose log10 count exceeds the threshold in BOTH the
# validation and the simulated data. The filtered tables get their own names so
# that the unfiltered VW_log_CLNE / VW_log_CLClimNE stay intact: overwriting
# them here would make the unfiltered Wisniewski plot show filtered data
# whenever it is re-run after this cell.
# NOTE(review): the cutoff is applied to the log10 values, i.e. it keeps
# counts > 10**2 = 100, not counts > log10(2) ~ 0.3 - confirm that this is the
# intended cutoff.
LOG10_COUNT_THRESHOLD = 2
VW_log_CLNE_filtered = VW_log_CLNE[
    (VW_log_CLNE["Log10 Validation Wisniewski Counts"] > LOG10_COUNT_THRESHOLD)
    & (VW_log_CLNE["Log10 Simulated Wisniewski Counts"] > LOG10_COUNT_THRESHOLD)]
VW_log_CLClimNE_filtered = VW_log_CLClimNE[
    (VW_log_CLClimNE["Log10 Validation Wisniewski Counts"] > LOG10_COUNT_THRESHOLD)
    & (VW_log_CLClimNE["Log10 Simulated Wisniewski Counts"] > LOG10_COUNT_THRESHOLD)]

# Add scatter plots and trendlines for each dataset
add_scatter_and_trendline(fig, VW_log_CLNE_filtered, 'CLNE vs Wisniewski')
add_scatter_and_trendline(fig, VW_log_CLClimNE_filtered, (current_sequence + ' vs Wisniewski'))

# Update layout; the title states that (and how) the data were filtered
fig.update_traces(marker_size=3)
fig.update_layout(
    title=(f"Wisniewski Validation Protein Counts vs CLNE and {current_sequence} "
           f"(filtered for log10(counts) > {LOG10_COUNT_THRESHOLD})"),
    xaxis_title="log10(Wisniewski Validation Protein Counts)",
    yaxis_title=f"log10(CLNE, {current_sequence})",
    autosize=False,
    width=900,
    height=600
)

# Add a y=x reference line that starts at the filter threshold
fig.add_trace(go.Scatter(x=[LOG10_COUNT_THRESHOLD, 6], y=[LOG10_COUNT_THRESHOLD, 6], mode="lines",
        line=go.scatter.Line(color="black", dash="dash"), opacity=0.2,
        name="y=x"))

# Show figure
fig.show()

# Plot of the protease substrates in the CLClim0NE model

In [26]:
# FUNCTIONS

# get the gene IDs for each monomer IDs:
def get_gene_ids_for_monomer_ids():
    """Build a monomer ID -> gene ID lookup from the flat rnas.tsv file.

    Adapted from convert_to_flat.py.

    Returns:
        dict keyed by the first entry of each row's 'monomer_ids' field, with
        that row's 'gene_id' as the value. When several rows share a key, the
        last row wins.
    """
    import io
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    rnas_path = os.path.join(ROOT_PATH, 'reconstruction', 'ecoli', 'flat', 'rnas.tsv')
    with io.open(rnas_path, 'rb') as rnas_file:
        rows = tsv.reader(rnas_file, delimiter='\t')

        # Skip leading rows whose first cell starts with '#'; the next row is the header
        header = next(rows)
        while header[0].startswith('#'):
            header = next(rows)

        gene_col = header.index('gene_id')
        monomer_col = header.index('monomer_ids')

        # [2:-2] trims two characters from each end of the field before it is
        # split on '", "' (presumably the field looks like '["A", "B"]' - verify
        # against rnas.tsv); only the first entry is used as the key.
        return {
            row[monomer_col][2:-2].split('", "')[0]: row[gene_col]
            for row in rows
        }
    
def get_gene_symbols_for_monomer_ids():
    """Build a monomer ID -> gene symbol lookup from the flat rnas.tsv file.

    Adapted from convert_to_flat.py.

    Returns:
        dict keyed by the first entry of each row's 'monomer_ids' field, with
        that row's 'common_name' as the value. When several rows share a key,
        the last row wins.
    """
    import io
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    rnas_path = os.path.join(ROOT_PATH, 'reconstruction', 'ecoli', 'flat', 'rnas.tsv')
    with io.open(rnas_path, 'rb') as rnas_file:
        rows = tsv.reader(rnas_file, delimiter='\t')

        # Skip leading rows whose first cell starts with '#'; the next row is the header
        header = next(rows)
        while header[0].startswith('#'):
            header = next(rows)

        symbol_col = header.index('common_name')
        monomer_col = header.index('monomer_ids')

        # [2:-2] trims two characters from each end of the field before it is
        # split on '", "' (presumably the field looks like '["A", "B"]' - verify
        # against rnas.tsv); only the first entry is used as the key.
        return {
            row[monomer_col][2:-2].split('", "')[0]: row[symbol_col]
            for row in rows
        }
    
    
# convert gene IDs to monomer IDs:
def get_monomer_ids_for_gene_ids():
    """Build a gene ID -> monomer ID lookup from the flat rnas.tsv file.

    Adapted from convert_to_flat.py.

    Returns:
        dict keyed by each row's 'gene_id', with the first entry of that row's
        'monomer_ids' field as the value. When several rows share a gene ID,
        the last row wins.
    """
    import io
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    rnas_path = os.path.join(ROOT_PATH, 'reconstruction', 'ecoli', 'flat', 'rnas.tsv')
    with io.open(rnas_path, 'rb') as rnas_file:
        rows = tsv.reader(rnas_file, delimiter='\t')

        # Skip leading rows whose first cell starts with '#'; the next row is the header
        header = next(rows)
        while header[0].startswith('#'):
            header = next(rows)

        gene_col = header.index('gene_id')
        monomer_col = header.index('monomer_ids')

        # [2:-2] trims two characters from each end of the field before it is
        # split on '", "' (presumably the field looks like '["A", "B"]' - verify
        # against rnas.tsv); only the first entry is kept as the value.
        return {
            row[gene_col]: row[monomer_col][2:-2].split('", "')[0]
            for row in rows
        }
    
    
# recall the genes defined as "essential" in the model, and return them as a list of gene names:
def get_essential_genes():
    """List the 'FrameID' entries of the validation essential_genes.tsv file.

    Adapted from convert_to_flat.py.

    Returns:
        list with one 'FrameID' value per data row, in file order.
    """
    import io
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    genes_path = os.path.join(ROOT_PATH, 'validation', 'ecoli', 'flat', 'essential_genes.tsv')
    with io.open(genes_path, 'rb') as genes_file:
        rows = tsv.reader(genes_file, delimiter='\t')

        # Skip leading rows whose first cell starts with '#'; the next row is the header
        header = next(rows)
        while header[0].startswith('#'):
            header = next(rows)

        frame_id_col = header.index('FrameID')
        return [row[frame_id_col] for row in rows]
	
	
# convert the data from the ParCa translation.py raw HL saves: 
def convert_HL_data(dataframe):
    """Rename the rate-constant column and add the matching half-life column.

    The 'Rate Constant' column is renamed to 'rate constant (s^1)' and a
    'half life (min)' column equal to 1 / (rate * 60 / ln 2) is added.
    ``dataframe`` is modified in place and also returned.

    NOTE(review): the column label reads 's^1'; presumably 's^-1' was meant.
    It is left unchanged because it is a column name other code may rely on.
    """
    rate_col = 'rate constant (s^1)'  # rate constant, per second
    half_life_col = 'half life (min)'  # half life, in minutes

    dataframe.rename(columns={'Rate Constant': rate_col}, inplace=True)
    # Reciprocal of (rate * 60 / ln 2) gives the half life in minutes
    dataframe[half_life_col] = 1 / (dataframe[rate_col] * 60 * (1/np.log(2)))
    return dataframe


# convert each data source in the saved HL data file folders 
def convert_data_and_add_HL_source(dataframe, HL_source):
    """Rename the rate-constant column and add a half-life column, both prefixed with ``HL_source``.

    The 'Rate Constant' column is renamed to '<HL_source> rate constant (s^1)'
    and a '<HL_source> half life (min)' column equal to
    1 / (rate * 60 / ln 2) is added. ``dataframe`` is modified in place and
    also returned.

    NOTE(review): the column label reads 's^1'; presumably 's^-1' was meant.
    It is left unchanged because it is a column name other code may rely on.
    """
    rate_col = HL_source + ' rate constant (s^1)'  # rate constant, per second
    half_life_col = HL_source + ' half life (min)'  # half life, in minutes

    dataframe.rename(columns={'Rate Constant': rate_col}, inplace=True)
    # Reciprocal of (rate * 60 / ln 2) gives the half life in minutes
    dataframe[half_life_col] = 1 / (dataframe[rate_col] * 60 * (1/np.log(2)))
    return dataframe
    


In [23]:
# Load the protease-substrate assignment table (Gupta et al., 2024; supplementary table 2 according to the file name).
protease_data = pd.read_excel('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/supplementary_data/Gupta_et_al_2024_ST2_protease_assignment_data.xlsx')

# Build the lookups once: each helper re-reads and re-parses rnas.tsv on every
# call, so calling them inside the loop repeats that work for every gene.
gene_ids_to_monomer_ids = get_monomer_ids_for_gene_ids()
monomer_ids_to_gene_symbols = get_gene_symbols_for_monomer_ids()
protease_gene_names = set(protease_data['Gene name'].values)

# Determine which essential genes in the model encode a protein that is assigned to a protease.
essential_substrate_gene_symbols = []
for gene in get_essential_genes():
    gene_symbol = monomer_ids_to_gene_symbols[gene_ids_to_monomer_ids[gene]]
    if gene_symbol in protease_gene_names:
        essential_substrate_gene_symbols.append(gene_symbol)

print("There are " + str(len(protease_data['Gene name'].values)) + " proteins assigned to a protease in the Gupta et al., 2024 data.")
print("Essential Genes in Model that are Assigned to a Protease (" +str(len(essential_substrate_gene_symbols))+"): ", essential_substrate_gene_symbols)

There are 308 proteins assigned to a protease in the Gupta et al., 2024 data.
Essential Genes in Model that are Assigned to a Protease (57):  ['bioB', 'ilvE', 'argA', 'aroE', 'cysH', 'cysI', 'cysJ', 'cysN', 'leuA', 'leuC', 'leuD', 'lysA', 'thrA', 'trpB', 'tyrA', 'iscS', 'nadA', 'nadB', 'metR', 'ispH', 'ftsI', 'ftsQ', 'ftsZ', 'lpxC', 'erpA', 'frr', 'dxr', 'ispU', 'dnaE', 'thiL', 'dnaX', 'ftsK', 'rpsA', 'mukB', 'acpP', 'minE', 'hemA', 'folE', 'ligA', 'dapA', 'ispG', 'era', 'ffh', 'metK', 'parE', 'ribB', 'obgE', 'def', 'rplX', 'dnaA', 'birA', 'rpoB', 'rpoC', 'ubiA', 'lexA', 'dnaB', 'dnaC']


In [35]:
# Display the CLClimNE Schmidt comparison table to inspect its columns and the format of the monomer IDs.
VS_log_CLClimNE

Unnamed: 0,Monomer ID,Log10 Simulated Schmidt Counts,Log10 Validation Schmidt Counts
0,G6814-MONOMER[c],2.430598,1.986772
1,ALDHDEHYDROG-MONOMER[c],2.069846,2.068186
2,EG11838-MONOMER[c],1.552474,0.845098
3,EG11195-MONOMER[c],2.963099,2.709270
4,G7651-MONOMER[c],2.429409,0.845098
...,...,...,...
2136,G369-MONOMER[c],3.558480,1.230449
2137,EG11802-MONOMER[c],0.731940,1.204120
2138,NUOK-MONOMER[m],1.931831,2.361728
2139,EG10877-MONOMER[c],4.344847,4.520038


In [64]:
# Determine which monomer IDs are protein substrates of the proteases in the CLClim0NE model.
# The last three characters of each ID (the compartment tag, e.g. "[c]") are dropped so that
# the IDs match the keys of the rnas.tsv lookup.
monomer_ids = VS_log_CLClimNE['Monomer ID'].str.slice(0, -3)

# Build the lookups once: get_gene_symbols_for_monomer_ids() re-reads and re-parses
# rnas.tsv on every call, so it must not be called once per monomer.
monomer_ids_to_gene_symbols = get_gene_symbols_for_monomer_ids()
protease_gene_names = set(protease_data['Gene name'].values)

CLClim0NE_protease_substrate_gene_symbols = []
CLClim0NE_protease_substrate_monomer_ids = []
for monomer_id in monomer_ids:
    gene_symbol = monomer_ids_to_gene_symbols[monomer_id]  # KeyError if the monomer is missing from rnas.tsv
    if gene_symbol in protease_gene_names:
        CLClim0NE_protease_substrate_gene_symbols.append(gene_symbol)
        CLClim0NE_protease_substrate_monomer_ids.append(monomer_id)

# Print the list built above (the previous code referenced an undefined name here).
print("Total Protease Substrates in CLClimNE (" +str(len(CLClim0NE_protease_substrate_gene_symbols))+"): ", CLClim0NE_protease_substrate_gene_symbols)

# Add the gene symbols as a column so they can be used as hover text.
# Every ID was already resolved in the loop above, so the mapping leaves no missing values.
VS_log_CLClimNE['gene symbol'] = monomer_ids.map(monomer_ids_to_gene_symbols)

# Split the table into protease substrates and all remaining proteins.
CLClim0NE_protease_substrate_idxs = monomer_ids.isin(CLClim0NE_protease_substrate_monomer_ids)
CLClim0NE_protease_substrates = VS_log_CLClimNE[CLClim0NE_protease_substrate_idxs].copy()
CLClim0NE_protease_substrates['HL Source'] = f"{current_sequence} (protease substrate)"
CLClim0NE_VS_log_data_remaining = VS_log_CLClimNE[~CLClim0NE_protease_substrate_idxs].copy()

Total Protease Substrates in CLClimNE (236):  ['leuA', 'ubiA', 'ackA', 'acnB', 'fadE', 'glpD', 'cysJ', 'uxaA', 'astA', 'aroD', 'aroE', 'aroG', 'aroK', 'aroL', 'asnA', 'aspA', 'thrA', 'cysI', 'bioB', 'birA', 'ilvE', 'chbB', 'cfa', 'tyrA', 'ubiC', 'cycA', 'cysD', 'cysN', 'dadA', 'lysA', 'dapA', 'nfsB', 'ribB', 'cobT', 'dxr', 'yaaA', 'bisC', 'clpA', 'clpX', 'dedA', 'dedD', 'dksA', 'dnaB', 'dnaC', 'dnaE', 'dnaK', 'dnaQ', 'era', 'ffh', 'frr', 'ftsI', 'ftsZ', 'ispG', 'mnmG', 'greA', 'helD', 'hflX', 'hmp', 'hsdM', 'hypE', 'ligA', 'mazG', 'minE', 'mukB', 'mutS', 'nfo', 'parC', 'parE', 'pcnB', 'priC', 'pspA', 'recA', 'recD', 'recF', 'recJ', 'rhlB', 'rplX', 'rpsA', 'ruvC', 'sbcC', 'selD', 'prlF', 'srmB', 'tonB', 'topB', 'uvrB', 'uvrD', 'ispH', 'sbcD', 'ybaB', 'relE', 'yebC', 'yecA', 'uvrY', 'yibA', 'yidA', 'mioC', 'ybhA', 'rlmD', 'mazF', 'radA', 'rng', 'lipA', 'dinG', 'ahpC', 'ahpF', 'ychF', 'yjgA', 'dps', 'def', 'yigI', 'ycgB', 'ibpA', 'greB', 'grcA', 'yihD', 'yihI', 'yiiQ', 'rsuA', 'radD', 'cc

In [76]:
import numpy as np
import plotly.graph_objs as go

# Scatter plot of Schmidt validation protein counts vs simulated counts, with protease substrates highlighted
# NOTE(review): this re-defines add_scatter_and_trendline with an extra optional argument and shadows the earlier definitions.
def add_scatter_and_trendline(fig, df, name, is_protease_substrate=False):
    """Append a marker trace to ``fig`` and, for non-substrates, its linear trendline.

    Args:
        fig: plotly figure; traces are added to it in place.
        df: table providing the "Log10 Validation Schmidt Counts" (x),
            "Log10 Simulated Schmidt Counts" (y) and "gene symbol"
            (hover text) columns.
        name: legend label of the marker trace; it also prefixes the
            legend label of the trendline.
        is_protease_substrate: when true, the points are drawn as orange
            stars of size 7 and no trendline is added; otherwise they are
            drawn as light-sea-green markers of size 4 followed by a salmon
            trendline.
    """
    x = df["Log10 Validation Schmidt Counts"]
    y = df["Log10 Simulated Schmidt Counts"]
    hovertext = df["gene symbol"]

    if is_protease_substrate:
        # Highlighted points only; no fit is computed for the substrates
        fig.add_trace(go.Scatter(x=x, y=y, hovertext=hovertext, mode='markers',
                                 marker=dict(size=7, color="orange", symbol='star'),
                                 name=name))
        return

    fig.add_trace(go.Scatter(x=x, y=y, hovertext=hovertext, mode='markers',
                             marker=dict(size=4, color="lightseagreen"), name=name))

    # Degree-1 least-squares fit, evaluated at the observed x values
    p = np.poly1d(np.polyfit(x, y, 1))
    fig.add_trace(go.Scatter(x=x, y=p(x), mode='lines', line=dict(color="salmon"),
                             name=f'{name} Trendline: {p}'))

# Start from an empty figure
fig = go.Figure()

# Remaining proteins first, then the protease substrates
add_scatter_and_trendline(
    fig,
    CLClim0NE_VS_log_data_remaining,
    f'{current_sequence} vs Schmidt',
)
add_scatter_and_trendline(
    fig,
    CLClim0NE_protease_substrates,
    f'{current_sequence}(protease substrates) vs Schmidt',
    True,
)

# Titles first, then the fixed canvas size
fig.update_layout(
    title=f"Schmidt Validation Protein Counts vs {current_sequence}",
    xaxis_title="log10(Schmidt Validation Protein Counts)",
    yaxis_title=f"log10({current_sequence})",
)
fig.update_layout(autosize=False, width=1200, height=1000)

# Faint dashed y=x reference line
identity_line = go.Scatter(
    x=[0, 6], y=[0, 6], mode="lines",
    line=dict(color="black", dash="dash"),
    opacity=0.2,
    name="y=x")
fig.add_trace(identity_line)

# Render the figure
fig.show()
